Ejemplo n.º 1
0
 def __init__(self, feat_path, feature_maker, label_path=None, fold_in_cv=None):
     """Build the data set, materialising every instance in ``self.instances``.

     @param feat_path: path to the raw data file
     @param feature_maker: FeatureTransformer instance that converts raw data
         to features.  NOTE(review): not referenced in this constructor body;
         presumably ``_make_feature`` obtains the maker some other way - confirm.
     @param label_path: (optional) path to the label file
     @param fold_in_cv: (optional) tuple (x, y). Split the data into y folds and
         keep only the x-th fold if x>0, or everything except the x-th fold if x<0
     """
     self.instances = []
     raw_rows = util.open_csv(feat_path, fold_in_cv)
     # Without a label file each row is paired with None.  The endless
     # repeat() is safe because izip stops as soon as raw_rows runs out.
     raw_labels = (util.open_csv(label_path, fold_in_cv) if label_path
                   else itertools.repeat(None))
     self.instances.extend(
         self._make_feature(row, label)
         for row, label in itertools.izip(raw_rows, raw_labels)
     )
Ejemplo n.º 2
0
def main():
    """Command-line entry point: train and predict, or run cross validation.

    When ``args.cmd == 'test'`` the model is trained for ``args.R`` passes
    over the training data and one prediction row per label is written to
    ``args.prediction``.  Any other command runs ``args.nfold``-fold cross
    validation and prints, for each round, the loss accumulated over all
    folds divided by the instance count accumulated over all folds.

    Side effect: rebinds the module-level ``feature_maker``.
    """
    global feature_maker
    info('start')
    started_at = datetime.now()

    args = parse_args()
    feature_maker = feature_transformer.get_maker(args.D, args.transform)

    if args.cmd == 'test':
        # Train on the full training data, then predict on the testing data.
        feature_maker.init_per_train(util.open_csv(args.train))
        train_data = TrainData(args.train, feature_maker, args.train_label)
        model = new_model(feature_maker.dim)
        for round_idx in xrange(args.R):
            # The first pass uses the freshly constructed data; every later
            # pass has to rewind it first.
            if round_idx > 0:
                train_data.rewind()
            model = train_one(train_data, model)
        with open(args.prediction, 'w') as out:
            out.write('id_label,pred\n')
            for row_id, features in TestData(args.test, feature_maker):
                predictions = model.predict(features)
                for label_idx, value in zip(K, predictions):
                    out.write('%s_y%d,%.16f\n' % (row_id, label_idx + 1, value))
                    # Right after the y13 row a constant 0.0 row for y14 is
                    # emitted; presumably K skips index 13 - confirm.
                    if label_idx == 12:
                        out.write('%s_y14,0.0\n' % row_id)
    else:
        # Cross validation: per-round totals are accumulated across folds.
        nfold = args.nfold
        total_ins = [0] * args.R
        total_loss = [0.] * args.R
        for fold_idx in xrange(1, nfold + 1):
            # A negative fold index selects everything except that fold.
            feature_maker.init_per_train(
                util.open_csv(args.train, (-fold_idx, nfold)))
            train_data = TrainData(args.train, feature_maker,
                                   args.train_label, (-fold_idx, nfold))
            model = new_model(feature_maker.dim)
            valid_data = TestData(args.train, feature_maker,
                                  args.train_label, (fold_idx, nfold))
            for round_idx in xrange(args.R):
                if round_idx > 0:
                    train_data.rewind()
                model = train_one(train_data, model)
                if round_idx > 0:
                    valid_data.rewind()
                fold_ins, fold_loss = evaluate(valid_data, model)
                info("round validation: %f" % (fold_loss/fold_ins))
                total_ins[round_idx] += fold_ins
                total_loss[round_idx] += fold_loss
            # Drop this fold's objects before the next fold builds its own.
            del train_data, valid_data, model
        for round_idx, (loss, ins) in enumerate(zip(total_loss, total_ins)):
            # Single parenthesised argument: prints the same text as the
            # Python 2 print statement.
            print("%d round CV result: %f" % (round_idx, loss/ins))

    info('Done, elapsed time: %s' % str(datetime.now() - started_at))
Ejemplo n.º 3
0
 def rewind(self):
     """Replace the feature and label line sources with freshly opened ones.

     ``self.feat_lines`` is reopened from ``self.feat_path``.
     ``self.label_lines`` is reopened from ``self.label_path`` when one is
     set, otherwise it becomes an endless stream of None.  Both are opened
     with the same ``self.fold_in_cv`` selection.
     """
     self.feat_lines = util.open_csv(self.feat_path, self.fold_in_cv)
     if not self.label_path:
         # No label file: supply None for every feature line.
         self.label_lines = itertools.repeat(None)
         return
     self.label_lines = util.open_csv(self.label_path, self.fold_in_cv)