Code example #1
File: randomforest.py  Project: jbingel/cwi2016
# Module-level imports this excerpt depends on (not shown on the listing page);
# get_args() and feats_and_classify_py2 are project-local helpers in jbingel/cwi2016.
import pickle
import sys

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def main():
    args = get_args()
    # f1_matrix holds for every training annotator: the list of tuples of 
    # avg/med f1_row based on avg/med threshold
    f1_matrix = []
    # holds for every training annotator: the list of tuples of avg/med threshold
    t_matrix = []
    current_label_list = []
    
    f1_final = [] # holds 4-tuples of avgs over (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr 
    t_final  = [] # holds 4-tuples of (t_avg_avg, t_avg_med, t_med_avg, t_med_med) f.e. tr

    #X_tr, _, v = feats_and_classify_py2.collect_features(args.parsed_file)
    with open('X_train.pickle', 'rb') as pf:
        X_tr = pickle.load(pf)
    with open('X_test.pickle', 'rb') as pf:
        X_te = pickle.load(pf)
    y_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, 1)
    y_te = np.array([int(l) for l in open(args.gold)])
    #X_out, _, _ = feats_and_classify_py2.collect_features(args.predictfile)
    # filter for targets
    #X_out = [x for x in X_out if not x.label == '?']

    rf = RandomForestClassifier(11)  # 11 trees (the first positional argument is n_estimators)
    rf.fit(X_tr, y_tr)
    preds = rf.predict_proba(X_te)

    with open(args.output, 'w') as outfile:
        for p in preds:
            #print(p)
            outfile.write(str(p[1]))  # probability of the positive class
            outfile.write('\n')
    sys.exit(0)
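
This example writes the raw positive-class probability p[1] per line rather than a hard label. A minimal sketch of binarizing such an output file with a cut-off (the 0.5 threshold and the file names are illustrative, not taken from the project):

import numpy as np

# Illustrative paths and threshold; the project scripts take these from get_args().
probs = np.loadtxt('predictions.txt')       # one probability per line, as written above
labels = (probs >= 0.5).astype(int)         # binarize at an assumed cut-off of 0.5
np.savetxt('labels.txt', labels, fmt='%d')
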
Code example #2
File: nn-predict.py  Project: jbingel/cwi2016
def main():
    args = get_args()
    # f1_matrix holds for every training annotator: the list of tuples of 
    # avg/med f1_row based on avg/med threshold
    f1_matrix = []
    # holds for every training annotator: the list of tuples of avg/med threshold
    t_matrix = []
    current_label_list = []
    
    f1_final = [] # holds 4-tuples of avgs over (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr 
    t_final  = [] # holds 4-tuples of (t_avg_avg, t_avg_med, t_med_avg, t_med_med) f.e. tr

    #X_tr, _, v = feats_and_classify_py2.collect_features(args.parsed_file)
    with open('X_train.pickle', 'rb') as pf:
        X_tr = pickle.load(pf)
    with open('X_test.pickle', 'rb') as pf:
        X_te = pickle.load(pf)
    y_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, 1)

    #X_out, _, _ = feats_and_classify_py2.collect_features(args.predictfile)
    # filter for targets
    #X_out = [x for x in X_out if not x.label == '?']

    conf = NeuralNetConfig(X=X_tr, y=y_tr, layers=args.layers, iterations=args.iterations, verbose=args.verbose)
    
    nn = NN(conf)
    nn.train(X_tr, y_tr)
    if args.threshold:
        # a fixed decision threshold was given: output binarized predictions
        preds = nn.predict_for_threshold(X_te, args.threshold)
    else:
        # otherwise write the raw network outputs
        preds = nn.get_output(X_te)
    with open(args.output, 'w') as outfile:
        for p in preds:
            #print(p)
            outfile.write(str(p))
            outfile.write('\n')
    sys.exit(0)
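
Both prediction scripts call a project-local get_args() whose definition is not shown here, and NN / NeuralNetConfig are classes from the same project. A minimal argparse sketch covering only the options referenced in these excerpts (flag names, types, and defaults are assumptions, not the project's actual parser):

import argparse

def get_args():
    # Hypothetical reconstruction limited to the attributes used above.
    p = argparse.ArgumentParser()
    p.add_argument('--all_annotations_file', required=True)
    p.add_argument('--gold')
    p.add_argument('--output', required=True)
    p.add_argument('--threshold', type=float, default=None)
    p.add_argument('--layers', type=int, nargs='+', default=[40, 10])
    p.add_argument('--iterations', type=int, default=100)
    p.add_argument('--verbose', action='store_true')
    return p.parse_args()
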
Code example #3
def main():
    args = get_args()
    # f1_matrix holds for every training annotator: the list of tuples of 
    # avg/med f1_row based on avg/med threshold
    f1_matrix = []
    # holds for every training annotator: the list of tuples of avg/med threshold
    t_matrix = []
    current_label_list = []
    
    f1_final = [] # holds 4-tuples of avgs over (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr 
    t_final  = [] # holds 4-tuples of (t_avg_avg, t_avg_med, t_med_avg, t_med_med) f.e. tr

    #X, _, v = feats_and_classify_py2.collect_features(args.parsed_file)

    with open(args.train_features, 'rb') as pf:
        X = pickle.load(pf)

    # train one model per vote threshold (the range currently covers only threshold 1)
    for vote_threshold in range(1,2):
        y_current_tr = feats_and_classify_py2.collect_labels_positive_threshold(args.all_annotations_file, vote_threshold)
        print("Training, setting positive labels for examples with at least {} positive votes. ".format(vote_threshold))
        print("Training data has {} positive labels out of {}".format(sum(y_current_tr), len(y_current_tr)))
        f1_row = [] # holds 4-tuples of (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr/te
        t_row  = [] # holds 2-tuples of (t_avg, t_med) f.e. tr/te
        f1_matrix.append(f1_row)
        t_matrix.append(t_row)
        
        conf = NeuralNetConfig(X=X, y=y_current_tr, layers=args.layers, iterations=args.iterations, verbose=args.verbose)
        print("Using neural network models with {} hidden layers of sizes {}".format(len(args.layers), args.layers))
        # optimize t for every annotator (except training annotator), yields avg/med t 
        # 02, 09, 17 are the annotators with the least/average/most positive votes
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
            print("  Testing on annotator "+idx)
            #current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"

            #y_current_te = feats_and_classify_py2.collect_labels(current_single_ann)
            y_current_te = feats_and_classify_py2.collect_labels(args.all_annotations_file, int(idx)-1)
            current_label_list.append(y_current_te)
             
            thresholds, scores = getBestThresholds(X, y_current_tr, y_current_te, conf, args.splits)
            t_avg = np.average(thresholds)
            t_med = np.median(thresholds)
            t_row.append((t_avg, t_med))
            f1_avg = np.average([score[0] for score in scores])
            f1_std = np.std([score[0] for score in scores])
            print("Avg. F1 for test annotator {}: {} (+/- {})".format(idx, f1_avg, f1_std))
        # calculate avg of avg t's, avg of med t's, ... for the current training annotator
        t_avg_avg = np.average([t[0] for t in t_row]) 
        t_avg_med = np.average([t[1] for t in t_row]) 
        t_med_avg =  np.median([t[0] for t in t_row]) 
        t_med_med =  np.median([t[1] for t in t_row]) 
        t_final.append((t_avg_avg, t_avg_med, t_med_avg, t_med_med))

        print("Computed optimal t's... Now running a new phase of CV experiments with these t's on test annotators.")
        print("Optimal t's are {}, {}, {}, {}".format(t_avg_avg, t_avg_med, t_med_avg, t_med_med)) 
        # 02, 09, 17 are the annotators with the least/average/most positive votes
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
            f1_avg_avg = 0
            f1_med_avg = 0
            f1_avg_med = 0
            f1_med_med = 0
 
            y_current_te = feats_and_classify_py2.collect_labels(args.all_annotations_file, int(idx)-1)
            pos = sum(y_current_te)
            print("Testing globally optimal t's for annotator {} ({} positives)".format(idx, pos))
            
            f1_avg_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_avg_avg, args.splits)[0]
            f1_avg_med = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_avg_med, args.splits)[0]
            f1_med_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_med_avg, args.splits)[0]
            f1_med_med = cvWithThreshold(conf, X, y_current_tr, y_current_te, t_med_med, args.splits)[0]
            print("F1 for test annotator {}: {}".format(idx, f1_avg_med))
           
            f1_row.append((f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med))

        f1_final.append(tuple(map(np.average, zip(*f1_row))))
        print(tuple(map(np.average, zip(*f1_row))))
    print(f1_final)
    # get the index (NB: array index!) of the max avg/med F1 (i.e. computed on avg/med threshold)
    best_vote_threshold_avg_avg = np.argmax([f1[0] for f1 in f1_final])
    best_vote_threshold_avg_med = np.argmax([f1[1] for f1 in f1_final])
    best_vote_threshold_med_avg = np.argmax([f1[2] for f1 in f1_final])
    best_vote_threshold_med_med = np.argmax([f1[3] for f1 in f1_final])

    print(t_final)

    sys.exit(0)
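
The aggregation step above collapses the per-annotator (t_avg, t_med) pairs into four global threshold candidates. A small self-contained illustration of that computation, with made-up threshold values:

import numpy as np

# Hypothetical per-annotator (t_avg, t_med) pairs, as collected in t_row above.
t_row = [(0.42, 0.40), (0.55, 0.50), (0.48, 0.47)]

t_avg_avg = np.average([t[0] for t in t_row])  # mean of the per-annotator mean thresholds
t_avg_med = np.average([t[1] for t in t_row])  # mean of the per-annotator median thresholds
t_med_avg = np.median([t[0] for t in t_row])   # median of the per-annotator mean thresholds
t_med_med = np.median([t[1] for t in t_row])   # median of the per-annotator median thresholds
print(t_avg_avg, t_avg_med, t_med_avg, t_med_med)  # ~0.483, ~0.457, 0.48, 0.47
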
Code example #4
def main():
    args = get_args()
    # f1_matrix holds for every training annotator: the list of tuples of
    # avg/med f1_row based on avg/med threshold
    f1_matrix = []
    # holds for every training annotator: the list of tuples of avg/med threshold
    t_matrix = []
    current_label_list = []

    f1_final = []  # holds 4-tuples of avgs over (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr
    t_final = []  # holds 4-tuples of (t_avg_avg, t_avg_med, t_med_avg, t_med_med) f.e. tr

    #X, _, v = feats_and_classify_py2.collect_features(args.parsed_file)

    with open(args.train_features, 'rb') as pf:
        X = pickle.load(pf)

    # train one model per vote threshold (the range currently covers only threshold 1)
    for vote_threshold in range(1, 2):
        y_current_tr = feats_and_classify_py2.collect_labels_positive_threshold(
            args.all_annotations_file, vote_threshold)
        print(
            "Training, setting positive labels for examples with at least {} positive votes. "
            .format(vote_threshold))
        print("Training data has {} positive labels out of {}".format(
            sum(y_current_tr), len(y_current_tr)))
        f1_row = []  # holds 4-tuples of (f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med) f.e. tr/te
        t_row = []  # holds 2-tuples of (t_avg, t_med) f.e. tr/te
        f1_matrix.append(f1_row)
        t_matrix.append(t_row)

        conf = NeuralNetConfig(X=X,
                               y=y_current_tr,
                               layers=args.layers,
                               iterations=args.iterations,
                               verbose=args.verbose)
        print("Using neural network models with {} hidden layers of sizes {}".
              format(len(args.layers), args.layers))
        # optimize t for every annotator (except training annotator), yields avg/med t
        # 02, 09, 17 are the annotators with the least/average/most positive votes
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(
                " "):
            print("  Testing on annotator " + idx)
            #current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"

            #y_current_te = feats_and_classify_py2.collect_labels(current_single_ann)
            y_current_te = feats_and_classify_py2.collect_labels(
                args.all_annotations_file,
                int(idx) - 1)
            current_label_list.append(y_current_te)

            thresholds, scores = getBestThresholds(X, y_current_tr,
                                                   y_current_te, conf,
                                                   args.splits)
            t_avg = np.average(thresholds)
            t_med = np.median(thresholds)
            t_row.append((t_avg, t_med))
            f1_avg = np.average([score[0] for score in scores])
            f1_std = np.std([score[0] for score in scores])
            print("Avg. F1 for test annotator {}: {} (+/- {})".format(
                idx, f1_avg, f1_std))
        # calculate avg of avg t's, avg of med t's, ... for the current training annotator
        t_avg_avg = np.average([t[0] for t in t_row])
        t_avg_med = np.average([t[1] for t in t_row])
        t_med_avg = np.median([t[0] for t in t_row])
        t_med_med = np.median([t[1] for t in t_row])
        t_final.append((t_avg_avg, t_avg_med, t_med_avg, t_med_med))

        print(
            "Computed optimal t's... Now running a new phase of CV experiments with these t's on test annotators."
        )
        print("Optimal t's are {}, {}, {}, {}".format(t_avg_avg, t_avg_med,
                                                      t_med_avg, t_med_med))
        # 02, 09, 17 are the annotators with the least/average/most positive votes
        #for idx in "02 03 09 12 17".split(" "):
        for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(
                " "):
            f1_avg_avg = 0
            f1_med_avg = 0
            f1_avg_med = 0
            f1_med_med = 0

            y_current_te = feats_and_classify_py2.collect_labels(
                args.all_annotations_file,
                int(idx) - 1)
            pos = sum(y_current_te)
            print(
                "Testing globally optimal t's for annotator {} ({} positives)".
                format(idx, pos))

            f1_avg_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te,
                                         t_avg_avg, args.splits)[0]
            f1_avg_med = cvWithThreshold(conf, X, y_current_tr, y_current_te,
                                         t_avg_med, args.splits)[0]
            f1_med_avg = cvWithThreshold(conf, X, y_current_tr, y_current_te,
                                         t_med_avg, args.splits)[0]
            f1_med_med = cvWithThreshold(conf, X, y_current_tr, y_current_te,
                                         t_med_med, args.splits)[0]
            print("F1 for test annotator {}: {}".format(idx, f1_avg_med))

            f1_row.append((f1_avg_avg, f1_avg_med, f1_med_avg, f1_med_med))

        f1_final.append(tuple(map(np.average, zip(*f1_row))))
        print(tuple(map(np.average, zip(*f1_row))))
    print(f1_final)
    # get the index (NB: array index!) of the max avg/med F1 (i.e. computed on avg/med threshold)
    best_vote_threshold_avg_avg = np.argmax([f1[0] for f1 in f1_final])
    best_vote_threshold_avg_med = np.argmax([f1[1] for f1 in f1_final])
    best_vote_threshold_med_avg = np.argmax([f1[2] for f1 in f1_final])
    best_vote_threshold_med_med = np.argmax([f1[3] for f1 in f1_final])

    print(t_final)

    sys.exit(0)
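
Note that np.argmax over f1_final returns a position in the list, not a vote threshold itself: with the grid above being range(1, 2), index 0 corresponds to a positive-vote threshold of 1. A short sketch of mapping the index back to a threshold value, assuming a hypothetically wider grid and illustrative F1 scores:

import numpy as np

vote_thresholds = list(range(1, 6))                        # hypothetical wider grid of vote thresholds
f1_final = [(0.61,), (0.67,), (0.64,), (0.58,), (0.52,)]   # illustrative avg F1 per threshold
best_idx = np.argmax([f1[0] for f1 in f1_final])           # list index of the best score (here: 1)
best_vote_threshold = vote_thresholds[best_idx]            # corresponding threshold value (here: 2)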