# NOTE: these snippets come from a larger repo; they assume module-level imports
# such as numpy (as np / numpy), parse_summerscales, getopt, sys, gensim's
# Word2Vec, sklearn's KFold / RandomizedSearchCV / make_scorer, pycrfsuite,
# sklearn_crfsuite (and its metrics), scipy, collections.Counter, and the
# repo's own GroupNN / GroupCNN / crf helpers.

def get_X_y(wv, wv_dim):
    pmids, sentences, lbls, vectorizer = parse_summerscales.get_tokens_and_lbls()

    # see: https://github.com/fchollet/keras/issues/233
    # num_sentences x 1 x max_token_len x wv_dim, i.e. number of sequences x 1
    # x max number of tokens (padded to max len) x word vector size

    # one sequence per doc/abstract
    X_embedded, X_tokens = [], []
    y = []

    cur_pmid = pmids[0]
    cur_x_embedded, cur_x_tokens, cur_y = [], [], []
    unknown_words_to_vecs = {}

    for idx, s in enumerate(sentences):
        if cur_pmid != pmids[idx]:
            # a new abstract starts: flush the accumulated one
            X_embedded.append(np.vstack(cur_x_embedded))
            X_tokens.append(np.vstack(cur_x_tokens))
            y.append(np.array(cur_y))
            cur_x_embedded, cur_x_tokens, cur_y = [], [], []
            cur_pmid = pmids[idx]

        for j, t in enumerate(s):
            try:
                v = wv[t]
            except KeyError:
                # unseen token: draw a random vector once and reuse it
                # (zeros would be another option)
                if t not in unknown_words_to_vecs:
                    unknown_words_to_vecs[t] = np.random.uniform(-1, 1, wv_dim)
                v = unknown_words_to_vecs[t]
            cur_x_embedded.append(v)
            cur_x_tokens.append(vectorizer.vocabulary_[t])

        cur_y.extend(lbls[idx])

    # flush the final abstract
    X_embedded.append(np.vstack(cur_x_embedded))
    X_tokens.append(np.vstack(cur_x_tokens))
    y.append(np.array(cur_y))

    return X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs
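# --- Usage sketch (not from the original repo): driving get_X_y with the
# pretrained PubMed vectors that main() below loads. Those binaries are
# 200-dimensional; adjust wv_dim if you use a different model.
def _demo_get_X_y():
    from gensim.models import Word2Vec
    w2v = Word2Vec.load_word2vec_format('PubMed-w2v.bin', binary=True)
    X_embedded, X_tokens, y, vectorizer, unk = get_X_y(w2v, 200)
    # one matrix per abstract; rows are tokens
    print("abstracts: {}, tokens in first abstract: {}".format(
        len(X_embedded), X_embedded[0].shape[0]))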
def get_PMIDs_to_X_y(use_pickle, use_coref):
    pmids_dict, token_to_features = \
        parse_summerscales.get_tokens_and_lbls(use_pickle=use_pickle,
                                               use_coref=use_coref)

    pmids_to_X_y = {}
    for pmid in pmids_dict:
        pmid_sentences, pmid_lbls = pmids_dict[pmid]

        X_tokens, X_features = [], []
        y = []
        for sent_idx, s in enumerate(pmid_sentences):
            for j, token in enumerate(s):
                X_features.append(token_to_features[token])
                X_tokens.append(token)
            y.extend(pmid_lbls[sent_idx])

        pmids_to_X_y[pmid] = (np.vstack(X_tokens),
                              np.vstack(X_features),
                              np.hstack(y))

    # NOTE: X_tokens here is whatever the *last* pmid in the loop left behind;
    # returning it looks unintentional, but it is kept as written.
    return (pmids_to_X_y, X_tokens)
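# --- Usage sketch (assumption, not from the repo): iterating the per-abstract
# design matrices returned above; the shapes follow from the np.vstack /
# np.hstack calls, i.e. one row (and one label) per token.
def _demo_pmid_matrices(pmids_to_X_y):
    for pmid, (X_tokens, X_features, y) in pmids_to_X_y.items():
        assert X_features.shape[0] == y.shape[0]  # one label per token
        print("{}: {} tokens".format(pmid, y.shape[0]))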
def main():
    n_folds = 5

    try:
        # the original listed 'n_feature_maps=' twice; once is enough
        opts, args = getopt.getopt(sys.argv[1:], '',
                                   ['window_size=', 'wiki=', 'n_feature_maps=',
                                    'epochs=', 'undersample=', 'criterion=',
                                    'optimizer=', 'model='])
    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)

    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            if int(arg) == 0:  # was `arg == 0`, which never matched a string arg
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            if int(arg) == 1:
                undersample = True
        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        else:
            print("Option {} is not valid!".format(opt))

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')
    if wiki:
        print('Using wiki word2vec...')
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print('Using non-wiki word2vec...')
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print(train_pmids)

        print('loading data...')
        # the 'cnn' and 'nn' branches were identical, so the call is shared
        X_train, y_train = _prep_data(train_pmids, pmids_dict, w2v, window_size,
                                      model_type, binary_ce=binary_cross_entropy)
        X_test, y_test = _prep_data(test_pmids, pmids_dict, w2v, window_size,
                                    model_type, binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non-group tags at random... probably a bad idea...
            idx_undersample = numpy.where(y_train[:, 1] == 0)[0]
            idx_positive = numpy.where(y_train[:, 1] == 1)[0]
            random_negative_sample = numpy.random.choice(idx_undersample,
                                                         idx_positive.shape[0])

            X_train_positive = X_train[idx_positive, :, :, :]
            y_train_positive = y_train[idx_positive, :]
            X_train_negative = X_train[random_negative_sample, :, :, :]
            y_train_negative = y_train[random_negative_sample, :]

            X_train = numpy.vstack((X_train_positive, X_train_negative))
            y_train = numpy.vstack((y_train_positive, y_train_negative))
        print('loaded data...')

        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size,
                             n_feature_maps=n_feature_maps, k_output=k)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size, k=k)

        model.train(X_train, y_train, epochs, optim_algo=optimizer,
                    criterion=criterion)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)

        print("Accuracy: {}".format(accuracy))
        print("F1: {}".format(f1_score))
        print("Precision: {}".format(precision))
        print("AUC: {}".format(auc))
        print("Recall: {}".format(recall))

    sys.exit()
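# --- Minimal sketch of the 1:1 undersampling used above, on toy arrays
# (assumption: y is one-hot with the positive class in column 1, as in main()).
def _demo_undersample():
    import numpy
    y = numpy.array([[1, 0], [0, 1], [1, 0], [1, 0], [0, 1]])
    X = numpy.arange(5).reshape(5, 1)
    pos = numpy.where(y[:, 1] == 1)[0]
    # draw as many negatives as there are positives (with replacement,
    # matching numpy.random.choice's default)
    neg = numpy.random.choice(numpy.where(y[:, 1] == 0)[0], pos.shape[0])
    X_bal = numpy.vstack((X[pos], X[neg]))
    y_bal = numpy.vstack((y[pos], y[neg]))
    return X_bal, y_bal  # 2 positives + 2 sampled negatives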
def run_crf(w2v, l2, l1, iters, shallow_parse, words_before, words_after,
            grid_search, tacc, name, transfer_learning=False):
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True,
                                               use_genia=shallow_parse,
                                               using_tacc=tacc)
    model = pycrfsuite.Trainer(verbose=False)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    recall_scores, precision_scores, f1_scores = [], [], []

    # settings for the transfer-learning path
    model_type = 'nn'
    binary_cross_entropy = True

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print('loading data...')
        if transfer_learning:
            # use a pre-trained feed-forward net as a feature extractor
            nn_model = GroupNN.load_model(model_path='NNModel.hdf5',
                                          model_info_path='NNModel.hdf5.p')
            window_size = nn_model.model_info['window_size']

            train_x, train_y = GroupCNNExperiment._prep_data(
                train_pmids, pmids_dict, w2v, window_size, model_type,
                binary_ce=binary_cross_entropy, crf=True)
            test_x, test_y = GroupCNNExperiment._prep_data(
                test_pmids, pmids_dict, w2v, window_size, model_type,
                binary_ce=binary_cross_entropy, crf=True)

            train_x = transform_features(nn_model, train_x)
            test_x = transform_features(nn_model, test_x)

            train_y = _labels_to_str(train_y)
            test_y = _labels_to_str(test_y)
        else:
            train_x, train_y = abstracts2features(pmids_dict, train_pmids,
                                                  words_before, words_after,
                                                  w2v, shallow_parse)
            test_x, test_y = abstracts2features(pmids_dict, test_pmids,
                                                words_before, words_after,
                                                w2v, shallow_parse)
        print('loaded data...')

        for x, y in zip(train_x, train_y):
            model.append(x, y)

        # defined up front: it also names the per-fold results file below
        # (the original only set it in the non-grid-search branch)
        model_name = name + '_model {}'.format(fold_idx)

        if grid_search:
            crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                       c1=l1,
                                       c2=l2,
                                       max_iterations=iters,
                                       all_possible_transitions=False)
            params_space = {
                'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05),
            }
            # use the same metric for evaluation
            f1_scorer = make_scorer(metrics.flat_f1_score,
                                    average='weighted', labels=test_y)
            rs = RandomizedSearchCV(crf, params_space,
                                    cv=3, verbose=1, n_jobs=-1, n_iter=50,
                                    scoring=f1_scorer)
            rs.fit(train_x, train_y)

            tagger = rs.best_estimator_.tagger_
            info = tagger.info()
        else:
            model.set_params({
                'c1': l1,                 # coefficient for L1 penalty
                'c2': l2,                 # coefficient for L2 penalty
                'max_iterations': iters,  # stop earlier
                # include transitions that are possible, but not observed
                'feature.possible_transitions': True
            })

            print('training model...')
            model.train(model_name)
            print('done...')

            tagger = pycrfsuite.Tagger()
            tagger.open(model_name)
            info = tagger.info()

        def print_transitions(trans_features):
            for (label_from, label_to), weight in trans_features:
                print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

        print("Top likely transitions:")
        print_transitions(Counter(info.transitions).most_common(80))
        print("\nTop unlikely transitions:")
        print_transitions(Counter(info.transitions).most_common()[-80:])

        def print_state_features(state_features):
            for (attr, label), weight in state_features:
                print("%0.6f %-6s %s" % (weight, label, attr))

        print("Top positive:")
        print_state_features(Counter(info.state_features).most_common(80))
        print("\nTop negative:")
        print_state_features(Counter(info.state_features).most_common()[-80:])

        abstract_predicted_mentions, true_abstract_mentions = [], []
        for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)):
            print(pmid)
            abstract_words, abstract_labels, _, _, _ = pmids_dict[pmid]

            pred_labels = tagger.tag(x)
            pred_mentions = output2words(pred_labels, abstract_words)
            true_mentions = output2words(abstract_labels, abstract_words)

            print("Predicted: {}".format(pred_mentions))
            print("True: {}".format(true_mentions))
            print('\n')

            abstract_predicted_mentions.append(pred_mentions)
            true_abstract_mentions.append(true_mentions)

        # (sic) the repo's mention-level scorer is named `eveluate`
        fold_recall, fold_precision, fold_f1_score = eveluate(
            abstract_predicted_mentions, true_abstract_mentions)
        recall_scores.append(fold_recall)
        precision_scores.append(fold_precision)
        f1_scores.append(fold_f1_score)

        fold_recall_results = "Fold recall: {}".format(fold_recall)
        fold_precision_results = "Fold precision: {}".format(fold_precision)
        fold_f1_results = "Fold F1 Score: {}".format(fold_f1_score)

        print(fold_recall_results)
        print(fold_precision_results)
        print(fold_f1_results)

        with open(model_name + '_results.txt', 'w+') as results_file:
            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = _compute_average(recall_scores)
    precision_average = _compute_average(precision_scores)
    f1_average = _compute_average(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))
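# --- Usage sketch (assumption, not from the repo): tagging a single held-out
# abstract with one of the per-fold models trained above. The file name
# follows run_crf's `name + '_model {}'.format(fold_idx)` scheme; 'mymodel'
# is a placeholder.
def _demo_tag_one(x_seq):
    import pycrfsuite
    tagger = pycrfsuite.Tagger()
    tagger.open('mymodel_model 0')  # per-fold file written by model.train(...)
    labels = tagger.tag(x_seq)      # one string label per token
    tagger.close()
    return labels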
def main():
    n_folds = 5

    try:
        # the original listed 'n_feature_maps=' twice; once is enough
        opts, args = getopt.getopt(sys.argv[1:], '',
                                   ['window_size=', 'wiki=', 'n_feature_maps=',
                                    'epochs=', 'undersample=', 'criterion=',
                                    'optimizer=', 'model=', 'genia=', 'tacc=',
                                    'layers=', 'hyperopt=', 'model_name='])
    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)

    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2
    use_genia = False
    using_tacc = False
    layer_sizes = []
    hyperopt = False
    model_name = 'model'

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            if int(arg) == 0:  # was `arg == 0`, which never matched a string arg
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--layers':
            layer_sizes = arg.split(',')
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            if int(arg) == 1:
                undersample = True
        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        elif opt == '--genia':
            if int(arg) == 1:
                use_genia = True
        elif opt == '--tacc':
            if int(arg) == 1:
                using_tacc = True
        elif opt == '--hyperopt':
            if int(arg) == 1:
                hyperopt = True
        elif opt == '--model_name':
            model_name = arg
        else:
            print("Option {} is not valid!".format(opt))

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')
    if wiki:
        print('Using wiki word2vec...')
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print('Using non-wiki word2vec...')
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True,
                                               use_genia=use_genia,
                                               using_tacc=using_tacc)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    accuracies, recalls, precisions, f1_scores, aucs = [], [], [], [], []

    global model  # module-level handle (the hyperopt path appears to rely on it)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print(train_pmids)

        print('loading data...')
        # the 'cnn', 'nn', and 'ladder' branches were identical, so the
        # _prep_data call is shared here
        X_train, y_train = _prep_data(train_pmids, pmids_dict, w2v, window_size,
                                      model_type, binary_ce=binary_cross_entropy)
        X_test, y_test = _prep_data(test_pmids, pmids_dict, w2v, window_size,
                                    model_type, binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non-group tags at random... probably a bad idea...
            if binary_cross_entropy:
                idx_undersample = numpy.where(y_train == 0)[0]
                idx_positive = numpy.where(y_train == 1)[0]
            else:
                idx_undersample = numpy.where(y_train[:, 1] == 0)[0]
                idx_positive = numpy.where(y_train[:, 1] == 1)[0]
            random_negative_sample = numpy.random.choice(idx_undersample,
                                                         idx_positive.shape[0])

            if model_type == 'nn':
                X_train_positive = X_train[idx_positive, :]
                X_train_negative = X_train[random_negative_sample, :]
            else:
                X_train_positive = X_train[idx_positive, :, :, :]
                X_train_negative = X_train[random_negative_sample, :, :, :]

            if binary_cross_entropy:
                y_train_positive = y_train[idx_positive]
                y_train_negative = y_train[random_negative_sample]
            else:
                y_train_positive = y_train[idx_positive, :]
                y_train_negative = y_train[random_negative_sample, :]

            X_train = numpy.vstack((X_train_positive, X_train_negative))
            if binary_cross_entropy:
                y_train = numpy.hstack((y_train_positive, y_train_negative))
            else:
                y_train = numpy.vstack((y_train_positive, y_train_negative))
        print('loaded data...')

        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size,
                             n_feature_maps=n_feature_maps, k_output=k,
                             name=model_name)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size, k=k,
                            hyperparameter_search=hyperopt, name=model_name)

        if hyperopt:
            best_run, best_model = optim.minimize(model=_model,
                                                  data=_data,
                                                  algo=tpe.suggest,
                                                  max_evals=5,
                                                  trials=Trials())
            model.model = best_model
        else:
            model.train(X_train, y_train, epochs, optim_algo=optimizer,
                        criterion=criterion)

        words = []
        for pmid in test_pmids:
            words.extend(pmids_dict[pmid][0])

        predictions = model.predict_classes(X_test)
        predicted_words = crf.output2words(predictions, words)
        y_test_arg_max = numpy.argmax(y_test, axis=1)
        true_words = crf.output2words(y_test_arg_max, words)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)
        # mention-level scores from crf.eveluate (sic) overwrite the
        # token-level recall/precision/F1 from model.test; accuracy and AUC
        # stay token-level
        recall, precision, f1_score = crf.eveluate(predicted_words, true_words)

        print("Accuracy: {}".format(accuracy))
        print("F1: {}".format(f1_score))
        print("Precision: {}".format(precision))
        print("AUC: {}".format(auc))
        print("Recall: {}".format(recall))

        accuracies.append(accuracy)
        f1_scores.append(f1_score)
        precisions.append(precision)
        aucs.append(auc)
        recalls.append(recall)

    mean_accuracy = numpy.mean(accuracies)
    mean_f1_score = numpy.mean(f1_scores)
    mean_precision = numpy.mean(precisions)
    mean_auc_score = numpy.mean(aucs)
    mean_recall = numpy.mean(recalls)

    mean_accuracy_string = "Mean Accuracy: {}".format(mean_accuracy)
    mean_f1_score_string = "Mean F1: {}".format(mean_f1_score)
    mean_precision_string = "Mean Precision: {}".format(mean_precision)
    mean_auc_score_string = "Mean AUC: {}".format(mean_auc_score)
    mean_recall_string = "Mean Recall: {}".format(mean_recall)

    print(mean_accuracy_string)
    print(mean_f1_score_string)
    print(mean_precision_string)
    print(mean_auc_score_string)
    print(mean_recall_string)

    with open('{}_fold_results'.format(model.model_name), 'w+') as results:
        results.write(mean_accuracy_string + '\n')
        results.write(mean_f1_score_string + '\n')
        results.write(mean_precision_string + '\n')
        results.write(mean_auc_score_string + '\n')
        results.write(mean_recall_string + '\n')
# A later variant of get_X_y with optional distant supervision; it also
# returns per-token pmids and stacks everything into flat arrays.
def get_X_y(wv, wv_dim, vectorizer=None, distant=False, n=None):
    pmids, sentences, lbls, vectorizer = [None] * 4
    if distant:
        # NOTE: this branch never fills in `sentences`/`lbls`, so only the
        # non-distant path is usable as written.
        pmids, tagged_abstracts, tokens_and_lbls, intervention_texts = \
            distant_intervention_tag.distantly_annotate(n=n)
    else:
        pmids, sentences, lbls, vectorizer = \
            parse_summerscales.get_tokens_and_lbls()

    # see: https://github.com/fchollet/keras/issues/233
    # num_sentences x 1 x max_token_len x wv_dim, i.e. number of sequences x 1
    # x max number of tokens (padded to max len) x word vector size

    # one sequence per doc/abstract
    X_embedded, X_tokens = [], []
    y = []

    cur_pmid = pmids[0]
    cur_x_embedded, cur_x_tokens, cur_y, token_pmid_list = [], [], [], []
    unknown_words_to_vecs = {}

    for idx, s in enumerate(sentences):
        if cur_pmid != pmids[idx]:
            # a new abstract starts: flush the accumulated one
            X_embedded.append(np.vstack(cur_x_embedded))
            X_tokens.append(np.vstack(cur_x_tokens))
            y.append(np.array(cur_y))
            cur_x_embedded, cur_x_tokens, cur_y = [], [], []
            cur_pmid = pmids[idx]

        for j, t in enumerate(s):
            try:
                v = wv[t]
            except KeyError:
                print("%s not known!" % t)
                # unseen token: draw a random vector once and reuse it
                # (zeros would be another option)
                if t not in unknown_words_to_vecs:
                    unknown_words_to_vecs[t] = np.random.uniform(-1, 1, wv_dim)
                v = unknown_words_to_vecs[t]
            cur_x_embedded.append(v)
            cur_x_tokens.append(vectorizer.vocabulary_[t])
            token_pmid_list.append(cur_pmid)

        cur_y.extend(lbls[idx])

    # flush the final abstract, then stack everything into flat arrays
    X_embedded.append(np.vstack(cur_x_embedded))
    X_tokens.append(np.vstack(cur_x_tokens))
    y.append(np.array(cur_y))

    X_embedded = np.vstack(X_embedded)
    X_tokens = np.vstack(X_tokens)
    y = np.hstack(y)

    return (X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs,
            token_pmid_list)
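# --- Sketch (not in the repo): the OOV vectors above are random and differ
# across runs. Deriving the seed from a digest of the token makes them stable
# without keeping the unknown_words_to_vecs dict around.
def _stable_oov_vector(token, wv_dim):
    import hashlib
    import numpy as np
    seed = int(hashlib.md5(token.encode('utf-8')).hexdigest(), 16) % (2 ** 32)
    return np.random.RandomState(seed).uniform(-1, 1, wv_dim)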
# A pystruct (ChainCRF + FrankWolfeSSVM) variant of run_crf.
def run_crf(w2v, words_before, words_after, shallow_parse):
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True)

    # create the model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print('loading data...')
        # NOTE: these calls pass only words_before (train) / words_after (test)
        # and never the pmid splits; abstract2features is defined elsewhere,
        # so the calls are left as written even though they look suspect.
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v,
                                             shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v,
                                           shallow_parse)
        print('loaded data...')

        print('training...')
        ssvm.fit(train_x, train_y)
        print(ssvm.score(test_x, test_y))

        for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)):
            abstract_words, _, _ = pmids_dict[pmid]
            print(pmid)

            # predict() takes a list and returns another list
            prediction = ssvm.predict([x]).pop(0)

            # collect the words at positions tagged 1 (the original indexed
            # abstract_words[p] -- i.e., always index 1 -- which was a bug)
            predicted = ' '.join(abstract_words[j]
                                 for j, p in enumerate(prediction) if p == 1)
            if predicted:
                print('predicted: {}'.format(predicted))
            else:
                print('Predicted nothing!')
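# --- Shape sketch (assumption, not from the repo): what pystruct's ChainCRF /
# FrankWolfeSSVM expect, with tiny synthetic sequences standing in for the
# abstract2features output.
def _demo_chaincrf_shapes():
    import numpy as np
    from pystruct.models import ChainCRF
    from pystruct.learners import FrankWolfeSSVM
    X = [np.random.rand(6, 10), np.random.rand(4, 10)]  # (n_tokens, n_feats)
    Y = [np.array([0, 1, 1, 0, 0, 0]), np.array([0, 0, 1, 0])]  # per-token tags
    ssvm = FrankWolfeSSVM(model=ChainCRF(directed=False), C=.1, max_iter=30)
    ssvm.fit(X, Y)
    return ssvm.predict(X)  # list of per-token label arrays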
# Variant of get_PMIDs_to_X_y that embeds tokens and pads label sequences.
def get_PMIDs_to_X_y(wv, wv_dim, max_length=None, distant=False, n=200):
    unknown_words_to_vecs = {}
    tokens_DS = None
    if distant:
        # we pass tokens_DS -- the unique tokens in the distantly supervised
        # data -- to go into our vectorizer!
        tokens_and_lbls, X_DS_embedded, y_DS, tokens_DS, unknown_words_to_vecs = \
            _get_distantly_lbled_tokens(n=n, wv=wv, wv_dim=wv_dim)

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(make_pmids_dict=True, sen=True)

    # now loop through and get the X_tokens representation
    if distant:
        # really "token_indices" might be the more accurate name
        X_DS_tokens = []
        for abs_idx, abs_tokens_and_lbls in enumerate(tokens_and_lbls):
            for token_idx, (t, lbl) in enumerate(abs_tokens_and_lbls):
                X_DS_tokens.append(vectorizer.vocabulary_[t])

    pmids_to_X_y = {}
    for pmid in pmids_dict:
        abstract_tokens, abstract_output_labels, _ = pmids_dict[pmid]

        X_embedded, X_tokens = [], []
        for w_i, word_token in enumerate(abstract_tokens):
            try:
                v = wv[word_token]
            except KeyError:
                # unseen token: draw a random vector once and reuse it
                # (zeros would be another option)
                if word_token not in unknown_words_to_vecs:
                    print("word '%s' not known!" % word_token)
                    unknown_words_to_vecs[word_token] = \
                        np.random.uniform(-1, 1, wv_dim)
                v = unknown_words_to_vecs[word_token]
            X_embedded.append(v)
            X_tokens.append(vectorizer.vocabulary_[word_token])

        # keep the first max_length labels when too long; left-pad with 0s
        # when too short (max_length must be given; the None default is
        # unusable as written)
        if len(abstract_output_labels) > max_length:
            abstract_output_labels = abstract_output_labels[:max_length]
        elif len(abstract_output_labels) < max_length:
            padding = [0] * (max_length - len(abstract_output_labels))
            abstract_output_labels = padding + abstract_output_labels
        assert len(abstract_output_labels) == max_length, 'Must be same size'

        pmids_to_X_y[pmid] = (X_embedded, X_tokens, abstract_output_labels)

    if distant:
        return (pmids_to_X_y, vectorizer, unknown_words_to_vecs,
                X_DS_embedded, X_DS_tokens, y_DS)
    return pmids_to_X_y, vectorizer, unknown_words_to_vecs, groups_map, pmids_dict
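# --- The padding rule above, extracted for illustration (hypothetical helper,
# not in the repo): labels longer than max_length lose their tail; shorter
# ones are left-padded with 0 (the non-group tag).
def _pad_labels(labels, max_length):
    if len(labels) > max_length:
        return labels[:max_length]
    return [0] * (max_length - len(labels)) + labels

# _pad_labels([1, 0, 1], 5)          -> [0, 0, 1, 0, 1]
# _pad_labels([1, 1, 1, 1, 1, 1], 5) -> [1, 1, 1, 1, 1]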