def __init__(self, feat_path, feature_maker, label_path=None, fold_in_cv=None):
    """
    Construct a data set, eagerly loading every instance into memory.

    Each raw feature row is paired with its label row (or with None when no
    label file is given) and converted through self._make_feature; the
    results are collected in self.instances.

    @param feat_path: path to raw data file
    @param feature_maker: instance of FeatureTransformer to convert raw data
                          to features (not referenced directly in this
                          constructor)
    @param label_path: (optional) path to label file
    @param fold_in_cv: (optional) tuple (x,y). Split data to y folds, and take
                       only x-th fold if x>0, or all except x-th fold if x<0
    """
    self.instances = []
    raw_rows = util.open_csv(feat_path, fold_in_cv)
    if not label_path:
        # Unlabeled data: an endless stream of None keeps the pairing below
        # uniform; izip stops as soon as the feature rows run out.
        raw_labels = itertools.repeat(None)
    else:
        raw_labels = util.open_csv(label_path, fold_in_cv)
    self.instances.extend(
        self._make_feature(row, label)
        for row, label in itertools.izip(raw_rows, raw_labels)
    )
def _train_and_predict(args, feature_maker):
    """Train on the whole training set, then write predictions for the test set.

    Runs args.R training passes over the training data (rewinding the data
    before every pass after the first) and writes one CSV row per
    (test instance, label) to args.prediction.

    @param args: parsed command-line arguments (reads train, train_label,
                 test, prediction and R)
    @param feature_maker: feature transformer shared by train and test data
    """
    feature_maker.init_per_train(util.open_csv(args.train))
    data = TrainData(args.train, feature_maker, args.train_label)
    model = new_model(feature_maker.dim)
    for r in xrange(args.R):
        if r > 0:
            data.rewind()
        model = train_one(data, model)

    with open(args.prediction, 'w') as outfile:
        outfile.write('id_label,pred\n')
        for ID, x in TestData(args.test, feature_maker):
            pred = model.predict(x)
            for (k, p) in zip(K, pred):
                outfile.write('%s_y%d,%.16f\n' % (ID, k + 1, p))
                if k == 12:
                    # Right after the y13 row, emit a constant 0.0 row for
                    # y14. NOTE(review): presumably y14 is never predicted by
                    # the model (K skips it) -- confirm against K's definition.
                    outfile.write('%s_y14,0.0\n' % ID)


def _cross_validate(args, feature_maker):
    """Run args.nfold-fold cross validation and print the per-round results.

    For every fold the model is trained on all data except that fold and
    evaluated on the held-out fold after each of the args.R training rounds.
    Instance counts and losses are accumulated per round across folds, and
    their ratio is printed for each round at the end.

    @param args: parsed command-line arguments (reads train, train_label,
                 nfold and R)
    @param feature_maker: feature transformer, re-initialised for every fold
    """
    nfold = args.nfold
    cnt_ins = [0] * args.R
    cnt_loss = [0.] * args.R
    for fold in xrange(1, nfold + 1):
        # A negative fold index selects "everything except this fold"; the
        # positive index selects only the held-out fold.
        feature_maker.init_per_train(util.open_csv(args.train, (-fold, nfold)))
        train_data = TrainData(args.train, feature_maker, args.train_label, (-fold, nfold))
        model = new_model(feature_maker.dim)
        valid_data = TestData(args.train, feature_maker, args.train_label, (fold, nfold))
        for r in xrange(args.R):
            if r > 0:
                train_data.rewind()
            model = train_one(train_data, model)
            if r > 0:
                valid_data.rewind()
            # NOTE(review): evaluate presumably returns (instance count,
            # summed loss) -- confirm against its definition.
            f_ins, f_loss = evaluate(valid_data, model)
            info("round validation: %f" % (f_loss / f_ins))
            cnt_ins[r] += f_ins
            cnt_loss[r] += f_loss
        # Drop this fold's objects before the next fold allocates its own.
        del train_data
        del valid_data
        del model
    for r in xrange(args.R):
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print("%d round CV result: %f" % (r, cnt_loss[r] / cnt_ins[r]))


def main():
    """Entry point: parse arguments, then either train-and-predict or cross-validate.

    When args.cmd is 'test' the model is trained on the full training set and
    predictions are written for the test set; any other command runs cross
    validation. The feature maker is published through the module-level
    global feature_maker, and the elapsed time is logged at the end.
    """
    global feature_maker
    info('start')
    start = datetime.now()
    args = parse_args()
    feature_maker = feature_transformer.get_maker(args.D, args.transform)
    if args.cmd == 'test':
        # train on training data and predict on testing data
        _train_and_predict(args, feature_maker)
    else:
        # do cross validation
        _cross_validate(args, feature_maker)
    info('Done, elapsed time: %s' % str(datetime.now() - start))
def rewind(self):
    """
    Restart iteration by re-opening the underlying data source(s).

    Re-opens the feature file (restricted to self.fold_in_cv) into
    self.feat_lines. When self.label_path is set, the label file is re-opened
    the same way into self.label_lines; otherwise self.label_lines becomes an
    endless stream of None.
    """
    fold = self.fold_in_cv
    self.feat_lines = util.open_csv(self.feat_path, fold)
    self.label_lines = (
        util.open_csv(self.label_path, fold)
        if self.label_path
        else itertools.repeat(None)  # endless None
    )