# Inspect the distribution of training marginals produced by the generative model.
plt.hist(train_marginals, bins=20)
plt.show()

# Learned accuracies and other per-LF statistics from the generative model.
gen_model.learned_lf_stats()

# Persist the training marginals for the downstream discriminative model.
save_marginals(session, L_train, train_marginals)

# Load hand-labeled gold annotations for the dev split.
load_external_labels(session, BiomarkerCondition, 'Biomarker', 'Condition',
                     'articles/disease_gold_labels.tsv', dev_cands,
                     annotator_name='gold')
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

# Apply the existing labeling functions to the dev split and score the
# generative model against the gold labels.
L_dev = labeler.apply_existing(split=1)
_ = gen_model.score(session, L_dev, L_gold_dev)
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])

# Collect dev candidates that LF_markerDatabase labels positive and browse them.
labeled = []
for c in session.query(BiomarkerCondition).filter(
        BiomarkerCondition.split == 1).all():
    if LF_markerDatabase(c) == 1:
        labeled.append(c)
SentenceNgramViewer(labeled, session, n_per_page=3)

# Load dev labels and convert from {-1, 1} to the [0, 1] range.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
dev_labels = (np.ravel(L_gold_dev.todense()) + 1) / 2

# Feature extraction
featurizer = FeatureAnnotator()
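# A minimal sketch of how the featurizer above is typically applied in the
# Snorkel v0.6 annotator API; the split numbers and the F_train/F_dev names
# are illustrative assumptions, not taken from the source.
F_train = featurizer.apply(split=0)          # materialize feature keys on train
F_dev = featurizer.apply_existing(split=1)   # reuse the same feature keys on dev
print(F_train.shape, F_dev.shape)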
import csv
import logging

from snorkel.annotations import LabelAnnotator
from snorkel.learning import GenerativeModel


def score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=8):
    dump_file_path = ("./results/" + "lfs_1_" +
                      predicate_resume["predicate_name"] + date_time + ".csv")
    key_group = predicate_resume["label_group"]

    # Apply this predicate's labeling functions to the test candidates.
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism, cids_query=test_cids_query,
                           key_group=key_group, clear=True,
                           replace_key_set=False)

    # Per-LF statistics (coverage, overlaps, conflicts) on the test split.
    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(dump_file_path)

    # Train the generative model on the test label matrix and score it
    # against the gold labels.
    gen_model = GenerativeModel()
    gen_model.train(L_test, epochs=100, decay=0.95,
                    step_size=0.1 / L_test.shape[0], reg_param=1e-6)
    p, r, f1 = gen_model.score(L_test, L_gold_test)
    print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    logging.info(
        "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

    # Dump precision/recall/F1 to CSV. Binary mode ('w+b') follows the
    # Python 2 csv convention; on Python 3 use open(path, 'w', newline='').
    dump_file_path1 = ("./results/" + "test_gen_1_" +
                       predicate_resume["predicate_name"] + date_time + ".csv")
    with open(dump_file_path1, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Precision", "Recall", "F1"])
        writer.writerow(
            ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    test_marginals = gen_model.marginals(L_test)

    # Optionally plot the distribution of test marginals.
    dump_file_path2 = ("./results/" + "plt_1_" +
                       predicate_resume["predicate_name"] + date_time + ".csv")
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(dump_file_path2)
    #plt.show()

    # Learned LF accuracies.
    dump_file_path3 = ("./results/" + "gen_2_" +
                       predicate_resume["predicate_name"] + date_time + ".csv")
    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(dump_file_path3)

    # Confusion counts from the generative model's error analysis.
    dump_file_path4 = ("./results/" + "gen_3_" +
                       predicate_resume["predicate_name"] + date_time + ".csv")
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with open(dump_file_path4, 'w+b') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["TP", "FP", "TN", "FN"])
        writer.writerow(
            [str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))])

    # Per-LF empirical accuracies against the gold labels.
    dump_file_path5 = ("./results/" + "gen_4_" +
                       predicate_resume["predicate_name"] + date_time + ".csv")
    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(dump_file_path5)
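# A hypothetical invocation of score_lfs; the predicate_resume values mirror
# the keys the function reads above, but the example predicate name, label
# group, timestamp format, and L_gold_test variable are illustrative
# assumptions rather than values from the source.
from datetime import datetime

predicate_resume = {"predicate_name": "educated_at", "label_group": 1}
date_time = datetime.now().strftime("_%Y_%m_%d_%H_%M")
score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=4)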
# ####### Majority Vote ########
# mv_labels = np.sign(np.sum(L.T, 1))
# print('Coverage of Majority Vote on Train Set: ',
#       np.sum(np.sign(np.sum(np.abs(L.T), 1)) != 0) / float(loader.train_num))
# print('Accuracy of Majority Vote on Train Set: ',
#       np.sum(mv_labels == loader.train_ground) / float(loader.train_num))

########################
####### Snorkel ########
########################
print('\n\n\n####### Running Snorkel Generative Model ########')
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95,
                step_size=0.01 / L_train.shape[0], reg_param=1e-6)
print(gen_model.score(L_train_sparse, loader.train_ground))

######################
####### METAL ########
######################
# Remap labels from {-1, 1} to {1, 2}, the categorical convention MeTaL
# expects (0 is reserved for abstains).
def remap_labels(data):
    # np.int is deprecated in recent NumPy; plain int is equivalent here.
    transformed_data = np.zeros(data.shape, dtype=int)
    transformed_data[data == -1] = 1
    transformed_data[data == 1] = 2
    return transformed_data

train_ground = remap_labels(loader.train_ground)
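# A minimal sketch of feeding the remapped labels into Snorkel MeTaL's label
# model, assuming the MeTaL v0.x API (LabelModel, train_model, score) and a
# dense LF output matrix; the L_train_metal name, seed, and hyperparameters
# are illustrative assumptions.
from metal.label_model import LabelModel

L_train_metal = remap_labels(L_train)      # LF outputs in {0, 1, 2}; 0 = abstain
label_model = LabelModel(k=2, seed=123)
label_model.train_model(L_train_metal, n_epochs=200, print_every=50)
print(label_model.score((L_train_metal, train_ground)))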