# Imports required by the entry points below (feats_and_classify_py2 and the
# NN/NeuralNetConfig helpers are project-local modules defined elsewhere).
import pickle
import sys

import numpy as np
from sklearn.ensemble import RandomForestClassifier

import feats_and_classify_py2


def main():
    args = get_args()

    # Load pre-extracted feature matrices for train and test.
    #X_tr, _, v = feats_and_classify_py2.collect_features(args.parsed_file)
    with open('X_train.pickle', 'rb') as pf:
        X_tr = pickle.load(pf)
    with open('X_test.pickle', 'rb') as pf:
        X_te = pickle.load(pf)

    # An example is labeled positive if at least one annotator voted positive.
    y_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, 1)
    with open(args.gold) as goldfile:
        y_te = np.array([int(l) for l in goldfile])  # gold labels; not used below

    #X_out, _, _ = feats_and_classify_py2.collect_features(args.predictfile)
    # filter for targets
    #X_out = [x for x in X_out if not x.label == '?']

    rf = RandomForestClassifier(n_estimators=11)
    rf.fit(X_tr, y_tr)
    preds = rf.predict_proba(X_te)

    # Write the probability of the positive class, one value per line.
    with open(args.output, 'w') as outfile:
        for p in preds:
            outfile.write(str(p[1]))
            outfile.write('\n')
    sys.exit(0)
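# Note: y_te above is loaded but never used by this script. A minimal sketch of
# how it could serve as a sanity check, assuming the written probabilities are
# binarized at an illustrative 0.5 cutoff (not a value from this codebase):
#
#     from sklearn.metrics import f1_score
#     y_hat = (preds[:, 1] >= 0.5).astype(int)
#     print("F1 vs. gold: {}".format(f1_score(y_te, y_hat)))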
def main():
    args = get_args()

    # Load pre-extracted feature matrices for train and test.
    #X_tr, _, v = feats_and_classify_py2.collect_features(args.parsed_file)
    with open('X_train.pickle', 'rb') as pf:
        X_tr = pickle.load(pf)
    with open('X_test.pickle', 'rb') as pf:
        X_te = pickle.load(pf)

    # An example is labeled positive if at least one annotator voted positive.
    y_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, 1)

    #X_out, _, _ = feats_and_classify_py2.collect_features(args.predictfile)
    # filter for targets
    #X_out = [x for x in X_out if not x.label == '?']

    conf = NeuralNetConfig(X=X_tr, y=y_tr, layers=args.layers,
                           iterations=args.iterations, verbose=args.verbose)
    nn = NN(conf)
    nn.train(X_tr, y_tr)

    # With an explicit decision threshold, write binarized predictions;
    # otherwise write the network's raw output scores.
    if args.threshold:
        preds = nn.predict_for_threshold(X_te, args.threshold)
    else:
        preds = nn.get_output(X_te)

    with open(args.output, 'w') as outfile:
        for p in preds:
            outfile.write(str(p))
            outfile.write('\n')
    sys.exit(0)
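# A hypothetical invocation of the script above; the script and flag names are
# inferred from the attributes read off args and depend on get_args(), which is
# defined elsewhere (see the sketch at the end of this file):
#
#     python nn_predict.py --all-annotations-file annotations.tsv \
#         --layers 40 40 --iterations 200 --threshold 0.5 --output preds.txt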
def main():
    args = get_args()

    # f1_matrix holds, for every training vote threshold, the list of tuples of
    # avg/med F1 rows based on avg/med thresholds; t_matrix holds the
    # corresponding avg/med decision thresholds.
    f1_matrix = []
    t_matrix = []
    current_label_list = []
    f1_final = []  # 4-tuples of avgs over (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) per training setting
    t_final = []   # 4-tuples of (t_avg_avg, t_avg_med, t_med_avg, t_med_med) per training setting

    #X, _, v = feats_and_classify_py2.collect_features(args.parsed_file)
    with open(args.train_features, 'rb') as pf:
        X = pickle.load(pf)

    # Train one model per vote threshold (currently only threshold 1).
    for vote_threshold in range(1, 2):
        y_current_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, vote_threshold)
        print("Training, setting positive labels for examples with at least {} positive votes.".format(vote_threshold))
        print("Training data has {} positive labels out of {}".format(sum(y_current_tr), len(y_current_tr)))

        f1_row = []  # 4-tuples of (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) per test annotator
        t_row = []   # 2-tuples of (t_avg, t_med) per test annotator
        f1_matrix.append(f1_row)
        t_matrix.append(t_row)

        conf = NeuralNetConfig(X=X, y=y_current_tr, layers=args.layers,
                               iterations=args.iterations, verbose=args.verbose)
        print("Using neural network models with {} hidden layers of sizes {}".format(len(args.layers), args.layers))

        # Optimize the decision threshold t for every annotator (except the
        # training annotator), yielding avg/med t.
        # 02, 09, 17 are the annotators with the least/average/most positive votes.
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
            print("  Testing on annotator " + idx)
            #current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"
            #y_current_te = feats_and_classify_py2.collect_labels(current_single_ann)
            y_current_te = feats_and_classify_py2.collect_labels(args.all_annotations_file, int(idx) - 1)
            current_label_list.append(y_current_te)

            thresholds, scores = getBestThresholds(X, y_current_tr, y_current_te, conf, args.splits)
            t_avg = np.average(thresholds)
            t_med = np.median(thresholds)
            t_row.append((t_avg, t_med))

            f1_avg = np.average([score[0] for score in scores])
            f1_std = np.std([score[0] for score in scores])
            print("Avg. F1 for test annotator {}: {} (+/- {})".format(idx, f1_avg, f1_std))

        # Aggregate: avg of avg t's, avg of med t's, etc. for the current training setting.
        t_avg_avg = np.average([t[0] for t in t_row])
        t_avg_med = np.average([t[1] for t in t_row])
        t_med_avg = np.median([t[0] for t in t_row])
        t_med_med = np.median([t[1] for t in t_row])
        t_final.append((t_avg_avg, t_avg_med, t_med_avg, t_med_med))

        print("Computed optimal t's... Now running a new phase of CV experiments with these t's on test annotators.")
        print("Optimal t's are {}, {}, {}, {}".format(t_avg_avg, t_avg_med, t_med_avg, t_med_med))

        # 02, 09, 17 are the annotators with the least/average/most positive votes.
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
            y_current_te = feats_and_classify_py2.collect_labels(args.all_annotations_file, int(idx) - 1)
            pos = sum(y_current_te)
            print("Testing globally optimal t's for annotator {} ({} positives)".format(idx, pos))

            f1_avg_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_avg_avg, args.splits)[0]
            f1_avg_med = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_avg_med, args.splits)[0]
            f1_med_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_med_avg, args.splits)[0]
            f1_med_med = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_med_med, args.splits)[0]
            print("F1 for test annotator {}: {}".format(idx, f1_avg_med))
            f1_row.append((f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med))

        f1_final.append(tuple(map(np.average, zip(*f1_row))))
        print(tuple(map(np.average, zip(*f1_row))))

    print(f1_final)
    # Get the index (NB: array index!) of the max avg/med F1 (i.e. computed on avg/med threshold).
    best_vote_threshold_avg_avg = np.argmax([f1[0] for f1 in f1_final])
    best_vote_threshold_avg_med = np.argmax([f1[1] for f1 in f1_final])
    best_vote_threshold_med_avg = np.argmax([f1[2] for f1 in f1_final])
    best_vote_threshold_med_med = np.argmax([f1[3] for f1 in f1_final])
    print(t_final)
    sys.exit(0)
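# getBestThresholds and cvWithThreshold are defined elsewhere in this codebase.
# A minimal sketch of the per-fold threshold search the first presumably
# performs (assumed semantics, not the actual implementation): for each CV
# fold, score the held-out examples with the trained model, then pick the
# cutoff that maximizes F1 against the test annotator's labels.

from sklearn.metrics import f1_score


def best_thresholds_sketch(scores_per_fold, y_te_per_fold,
                           candidates=np.linspace(0.05, 0.95, 19)):
    """For each fold, return the candidate cutoff with the highest F1.

    scores_per_fold: list of 1-d arrays of model output scores, one per fold.
    y_te_per_fold:   list of matching gold label arrays (test annotator).
    """
    thresholds = []
    for scores, y in zip(scores_per_fold, y_te_per_fold):
        f1s = [f1_score(y, (scores >= t).astype(int)) for t in candidates]
        thresholds.append(candidates[int(np.argmax(f1s))])
    return thresholds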
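# get_args() is defined elsewhere in this codebase. A minimal argparse sketch
# covering the attributes the entry points above read; flag names and the
# splits default are assumptions, not taken from the original:

import argparse


def get_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--all-annotations-file', dest='all_annotations_file')
    parser.add_argument('--train-features', dest='train_features')
    parser.add_argument('--gold')
    parser.add_argument('--output')
    parser.add_argument('--layers', nargs='+', type=int)
    parser.add_argument('--iterations', type=int)
    parser.add_argument('--threshold', type=float)
    parser.add_argument('--splits', type=int, default=10)
    parser.add_argument('--verbose', action='store_true')
    return parser.parse_args()


# Each main() above presumably lives in its own script file; entry-point guard:
if __name__ == '__main__':
    main()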