def main():
    """Train and evaluate the neural-network classifier from the command line.

    Parses the command-line options, has ``combine_data`` build one combined
    file from the training and test files, and extracts features from that
    combined file in a single pass. The first ``cutoff`` rows are used for
    training and the rest for testing (assumes ``combine_data`` returns the
    number of training rows -- TODO confirm against its definition).

    Two modes are supported:

    * ``--cv-splits`` given: run cross-validation on the training rows. With
      ``--threshold`` the given value is used as-is. Without it,
      ``crossval`` is first run to obtain one threshold per split, and
      cross-validation is then repeated with the mean and with the median
      of those thresholds.
    * otherwise: train one network on the training rows, optionally write
      its raw outputs for the test rows to ``--output`` (one value per
      line), and write the feature list, threshold and test results to
      stderr.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    data = scriptdir + '/../data/cwi_training/cwi_training.txt.lbl.conll'
    testdata = scriptdir + '/../data/cwi_testing/cwi_testing.gold.txt.lbl.conll'

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--threshold', '-t', type=float,
        help='Threshold for predicting 0/1. If not specified, the optimal '
             'threshold will first be computed as the median of all CV '
             'splits. May take a while.')
    parser.add_argument('--iterations', '-i', type=int, default=50,
                        help='Training iterations.')
    parser.add_argument('--hidden-layers', '-l', dest='layers', required=True,
                        type=int, nargs='+', help='List of layer sizes')
    parser.add_argument(
        '--cv-splits', '-c', dest='splits', type=int,
        help='No. of crossvalidation splits. If not specified, no CV will '
             'be performed.')
    parser.add_argument('--data', '-d', default=data,
                        help='Features and labels')
    parser.add_argument('--testdata', '-y', default=testdata,
                        help='Test data (not needed for crossval).')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='store_true',
                        help='Print average loss at every training iteration.')
    parser.add_argument('--output', '-o', help="Output file")
    parser.add_argument('--features', '-f', dest='features', default=[],
                        type=str, nargs='+', help='List of feature types')
    args = parser.parse_args()

    # Compare against None rather than relying on truthiness: an explicit
    # "--threshold 0" is a valid request and must not be mistaken for
    # "no threshold given".
    has_threshold = args.threshold is not None

    # The combined file is written relative to the current working directory.
    combined_data = 'X_y_all.txt'
    cutoff = combine_data(args.data, args.testdata, combined_data)
    X, y, _ = feats_and_classify.collect_features(combined_data, True,
                                                  args.features)
    X_tr, y_tr = X[:cutoff], y[:cutoff]
    X_te, y_te = X[cutoff:], y[cutoff:]

    # The configuration receives the full (train + test) matrices.
    conf = NeuralNetConfig(X=X, y=y, layers=args.layers,
                           iterations=args.iterations, verbose=args.verbose)

    if args.splits:
        if has_threshold:
            crossval(X_tr, y_tr, args.splits, conf, t=args.threshold)
        else:
            # First pass: let crossval report one threshold per split.
            # Single-argument print(...) with %s formatting emits the same
            # text under Python 2 and Python 3.
            print('### Computing optimal threshold... \n')
            ts = crossval(X_tr, y_tr, args.splits, conf)
            avg = np.average(ts)
            med = np.median(ts)
            print('\nThresholds for crossval splits: %s' % (ts,))
            print('Mean threshold %s' % (avg,))
            print('Median threshold %s' % (med,))
            print('Threshold st.dev. %s' % (np.std(ts),))
            # Second and third pass: re-run CV with a fixed threshold.
            print('\n\n### Running with avg. threshold... ')
            crossval(X_tr, y_tr, args.splits, conf, t=avg)
            print('\n\n### Running with med. threshold... ')
            crossval(X_tr, y_tr, args.splits, conf, t=med)
        return

    nn = NN(conf)
    nn.train(X_tr, y_tr, args.iterations)
    if args.testdata:
        pred = nn.get_output(X_te)
        if args.output:
            with open(args.output, 'w') as of:
                of.writelines('%f\n' % p for p in pred)
        # When no threshold was given, None is passed through to nn.test,
        # which returns the threshold it used together with the results.
        t, res = nn.test(X_te, y_te, args.threshold)
        resout = "G: %f, R: %f, A: %f, P: %f\n" % res
        sys.stderr.write('%s %f\n' % (' '.join(args.features), t))
        sys.stderr.write(resout)
def predictTestSet(
        trainfile='/home/natschluter/GroupAlgorithms/cwi2016/data/cwi_training/cwi_training_cat.lbl.conll',
        bothfiles='/home/natschluter/GroupAlgorithms/cwi2016/data/train_and_test1.conll'):
    """Fit an L2 logistic regression on the training rows and score the test rows.

    Features are extracted twice: once from ``trainfile`` alone, which is
    used only to learn how many rows belong to the training set, and once
    from ``bothfiles``, whose feature matrix is then split at that row
    count. This assumes ``bothfiles`` lists the training examples first, in
    the same order as ``trainfile`` -- TODO confirm.

    Args:
        trainfile: Path of the training file passed to ``collect_features``.
            Defaults to the originally hard-coded location.
        bothfiles: Path of the file holding training and test examples.
            Defaults to the originally hard-coded location.

    Returns:
        A ``(ypred_probs, thresholds_med)`` tuple: the output of
        ``predict_proba`` for the test rows, and the median of the
        hard-coded threshold list below.
    """
    # Generate training features and labels (only their row counts are used).
    trainfeatures, trainlabels, _ = feats_and_classify_py2.collect_features(trainfile)
    # Generate training + test features in one pass.
    bothfeatures, bothlabels, _ = feats_and_classify_py2.collect_features(bothfiles)

    # NOTE(review): these look like per-split thresholds from an earlier
    # cross-validation run -- confirm their origin before changing them.
    thresholds_med = np.median(np.array([
        0.145, 0.85, 0.12, 0.657, 0.71, 0.824, 0.506, 0.461, 0.662, 0.888]))

    # Slices select the same rows as index arrays built from range(), but
    # avoid materializing the indices and still work when a range is empty
    # (np.array(range(n, n)) is a float array, which NumPy rejects as an index).
    n_train = len(trainfeatures)
    TrainX = bothfeatures[:n_train]
    TrainY = bothlabels[:len(trainlabels)]
    TestX = bothfeatures[n_train:]

    maxent = LogisticRegression(penalty='l2')
    print('training...')
    maxent.fit(TrainX, TrainY)
    print('predicting...')
    ypred_probs = maxent.predict_proba(TestX)
    # Hand the results back instead of dropping them; callers that ignored
    # the previous None return value are unaffected.
    return ypred_probs, thresholds_med
def main():
    """Train and evaluate the neural-network classifier from the command line.

    Parses the command-line options, has ``combine_data`` build one combined
    file from the training and test files, and extracts features from that
    combined file in a single pass. The first ``cutoff`` rows are used for
    training and the rest for testing (assumes ``combine_data`` returns the
    number of training rows -- TODO confirm against its definition).

    Two modes are supported:

    * ``--cv-splits`` given: run cross-validation on the training rows. With
      ``--threshold`` the given value is used as-is. Without it,
      ``crossval`` is first run to obtain one threshold per split, and
      cross-validation is then repeated with the mean and with the median
      of those thresholds.
    * otherwise: train one network on the training rows, optionally write
      its raw outputs for the test rows to ``--output`` (one value per
      line), and write the feature list, threshold and test results to
      stderr.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    data = scriptdir + '/../data/cwi_training/cwi_training.txt.lbl.conll'
    testdata = scriptdir + '/../data/cwi_testing/cwi_testing.gold.txt.lbl.conll'

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--threshold', '-t', type=float,
        help='Threshold for predicting 0/1. If not specified, the optimal '
             'threshold will first be computed as the median of all CV '
             'splits. May take a while.')
    parser.add_argument('--iterations', '-i', type=int, default=50,
                        help='Training iterations.')
    parser.add_argument('--hidden-layers', '-l', dest='layers', required=True,
                        type=int, nargs='+', help='List of layer sizes')
    parser.add_argument(
        '--cv-splits', '-c', dest='splits', type=int,
        help='No. of crossvalidation splits. If not specified, no CV will '
             'be performed.')
    parser.add_argument('--data', '-d', default=data,
                        help='Features and labels')
    parser.add_argument('--testdata', '-y', default=testdata,
                        help='Test data (not needed for crossval).')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='store_true',
                        help='Print average loss at every training iteration.')
    parser.add_argument('--output', '-o', help="Output file")
    parser.add_argument('--features', '-f', dest='features', default=[],
                        type=str, nargs='+', help='List of feature types')
    args = parser.parse_args()

    # Compare against None rather than relying on truthiness: an explicit
    # "--threshold 0" is a valid request and must not be mistaken for
    # "no threshold given".
    has_threshold = args.threshold is not None

    # The combined file is written relative to the current working directory.
    combined_data = 'X_y_all.txt'
    cutoff = combine_data(args.data, args.testdata, combined_data)
    X, y, _ = feats_and_classify.collect_features(combined_data, True,
                                                  args.features)
    X_tr, y_tr = X[:cutoff], y[:cutoff]
    X_te, y_te = X[cutoff:], y[cutoff:]

    # The configuration receives the full (train + test) matrices.
    conf = NeuralNetConfig(X=X, y=y, layers=args.layers,
                           iterations=args.iterations, verbose=args.verbose)

    if args.splits:
        if has_threshold:
            crossval(X_tr, y_tr, args.splits, conf, t=args.threshold)
        else:
            # First pass: let crossval report one threshold per split.
            # Single-argument print(...) with %s formatting emits the same
            # text under Python 2 and Python 3.
            print('### Computing optimal threshold... \n')
            ts = crossval(X_tr, y_tr, args.splits, conf)
            avg = np.average(ts)
            med = np.median(ts)
            print('\nThresholds for crossval splits: %s' % (ts,))
            print('Mean threshold %s' % (avg,))
            print('Median threshold %s' % (med,))
            print('Threshold st.dev. %s' % (np.std(ts),))
            # Second and third pass: re-run CV with a fixed threshold.
            print('\n\n### Running with avg. threshold... ')
            crossval(X_tr, y_tr, args.splits, conf, t=avg)
            print('\n\n### Running with med. threshold... ')
            crossval(X_tr, y_tr, args.splits, conf, t=med)
        return

    nn = NN(conf)
    nn.train(X_tr, y_tr, args.iterations)
    if args.testdata:
        pred = nn.get_output(X_te)
        if args.output:
            with open(args.output, 'w') as of:
                of.writelines('%f\n' % p for p in pred)
        # When no threshold was given, None is passed through to nn.test,
        # which returns the threshold it used together with the results.
        t, res = nn.test(X_te, y_te, args.threshold)
        resout = "G: %f, R: %f, A: %f, P: %f\n" % res
        sys.stderr.write('%s %f\n' % (' '.join(args.features), t))
        sys.stderr.write(resout)