def decode(self, sentence_tokens):
    tag_indexer = self.tag_ix
    feature_indexer = self.feature_ix
    # Build the emission feature matrix: one feature vector per (word, tag) pair
    all_features = []
    for word_idx in range(0, len(sentence_tokens)):
        features = []
        for tag_idx in range(0, len(tag_indexer)):
            features.append(
                extract_emission_features(sentence_tokens, word_idx, tag_indexer.get_object(tag_idx),
                                          feature_indexer, add_to_indexer=False))
        all_features.append(features)
    all_features = np.array(all_features)
    # Run the model to get the best-scoring tag sequence, then map tag indices back to BIO tags
    score, seq = self.forward(all_features)
    seq = flatten(seq)
    pred_tags = []
    for j in seq:
        pred_tags.append(self.tag_ix.ints_to_objs[j])
    return LabeledSentence(sentence_tokens, chunks_from_bio_tag_seq(pred_tags))
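# `flatten` is used throughout this module but not defined in this excerpt.
# A minimal sketch of what it is assumed to do (flatten one level of nesting
# into a flat list); the project's actual helper may differ.
def flatten(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]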
def train_model_based_binary_ner(ner_exs: List[PersonExample]):
    shuffle(ner_exs)
    """
    =======================================
    ========== Build Indexers =============
    """
    word_ix, pos_ix = create_index(ner_exs=ner_exs, stops=stops)
    ix2embedding = load_word_embedding(pretrained_embedding_filename=config.glove_file,
                                       word2index_vocab=word_ix.objs_to_ints)
    train_sent, POS, train_labels = index_data(ner_exs, word_ix, pos_ix)

    epochs = config.epochs
    batch_size = config.batch_size
    initial_lr = config.initial_lr
    no_of_classes = config.no_of_classes
    """
    ==================================
    ===== Network Definition ========
    ==================================
    """
    word_indicator_feat_dim = len(word_ix)
    pos_indicator_feat_dim = len(pos_ix)
    is_upper_feat_dim = 1
    all_caps_indicator_feat_dim = 1
    word_embedding_feat_dim = 300
    context_window_1 = 300
    context_window_2 = 300
    context_left_1 = 300
    context_left_2 = 300
    context_right_1 = 300

    # Uncomment the feature blocks that get_features actually produces;
    # feat_dim must match the length of the feature vectors or the first
    # Linear layer will receive the wrong input size.
    feat_dim = 0
    # feat_dim += word_indicator_feat_dim
    # feat_dim += pos_indicator_feat_dim
    # feat_dim += is_upper_feat_dim
    # feat_dim += all_caps_indicator_feat_dim
    # feat_dim += word_embedding_feat_dim
    # feat_dim += context_window_1
    # feat_dim += context_window_2
    # feat_dim += context_left_1
    # # feat_dim += context_left_2
    # feat_dim += context_right_1

    n_input_dim = feat_dim
    n_hidden1 = 16  # Number of hidden nodes
    n_hidden2 = 8
    n_output = 2  # Number of output nodes for the binary classifier

    # BCEWithLogitsLoss applies the sigmoid internally, so the network outputs raw logits.
    net = nn.Sequential(
        nn.Linear(n_input_dim, n_hidden1),
        nn.ELU(),
        nn.Linear(n_hidden1, n_hidden2),
        nn.ELU(),
        nn.Linear(n_hidden2, n_output))
    print(net)

    learning_rate = initial_lr
    for epoch in range(epochs):
        t = time.time()
        # Re-create the optimizer once per epoch so the current learning rate is applied
        # (re-creating it every batch would also reset Adam's running statistics).
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
        """
        ================= Create batch ===============
        """
        for i in range(0, len(train_sent), batch_size):
            if len(train_sent[i:]) <= batch_size:
                data_batch = train_sent[i:]
                pos_batch = POS[i:]
            else:
                data_batch = train_sent[i: i + batch_size]
                pos_batch = POS[i: i + batch_size]
            Y_train = flatten(train_labels[i: i + batch_size])
            """
            ========== scaling ==================
            """
            # Compute class weights: up-weight the positive class by the
            # negative-to-positive ratio in this batch.
            if Y_train.count(1) == 0:
                scaling = 1
            else:
                scaling = Y_train.count(0) / Y_train.count(1)
            pos_weight = torch.ones([no_of_classes])
            pos_weight[1] = scaling
            loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            # loss_func = nn.BCELoss()

            Y_train = np.asarray(Y_train)
            X_train = []
            for sent, pos in zip(data_batch, pos_batch):
                for idx in range(0, len(sent)):
                    X_train.append(get_features(sent, pos, word_ix, pos_ix, ix2embedding, idx))
            X_train = np.asarray(X_train)

            # One-hot encode the labels
            y_train_one_hot = np.zeros((Y_train.size, no_of_classes))
            for ix, n in enumerate(Y_train):
                if n == 0:
                    y_train_one_hot[ix, 0] = 1
                else:
                    y_train_one_hot[ix, 1] = 1
            Y_train = y_train_one_hot

            # Convert to tensors and take one gradient step
            X_train_t = torch.FloatTensor(X_train)
            Y_train_t = torch.FloatTensor(Y_train)
            y_hat = net(X_train_t)
            loss = loss_func(y_hat, Y_train_t)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print("Epoch : ", epoch)
        print("Time taken", time.time() - t)
        print("Learning Rate = ", learning_rate)
        # Halve the learning rate every epoch; every third epoch reset it to the initial value.
        if (epoch + 1) % 3 == 0:
            learning_rate = initial_lr * 2
        learning_rate = learning_rate / 2
        print("----------")
        print(" ")
    return BinaryPersonClassifier(model=net, word_ix=word_ix, pos_ix=pos_ix, ix2embed=ix2embedding)
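# Hedged illustration (not part of the training code above): how the per-batch
# positive-class weighting behaves with BCEWithLogitsLoss. The toy labels and
# toy logits below are made up for illustration only.
import torch
import torch.nn as nn

toy_labels = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]          # heavily skewed toward the negative class
scaling = toy_labels.count(0) / toy_labels.count(1)   # 8 / 2 = 4.0
pos_weight = torch.ones(2)
pos_weight[1] = scaling                               # up-weight the rare positive column

targets = torch.zeros(len(toy_labels), 2)
targets[torch.arange(len(toy_labels)), torch.tensor(toy_labels)] = 1.0  # one-hot, as in the loop above
logits = torch.zeros(len(toy_labels), 2)              # dummy logits

weighted = nn.BCEWithLogitsLoss(pos_weight=pos_weight)(logits, targets)
unweighted = nn.BCEWithLogitsLoss()(logits, targets)
print(weighted.item() > unweighted.item())            # True: positives now cost more to get wrong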
def main(docs, labels, supervision_rate):
    # Create dictionaries {label: label_id} and {label_id: label}.
    # `all_labels` is assumed to be available at module scope (see import_pubmed_data).
    label_to_id_dict = {v: n for n, v in enumerate(all_labels)}
    id_to_label_dict = {v: k for k, v in label_to_id_dict.items()}

    # Filter out docs that are shorter than 250 characters
    docs_labels = filter(lambda x: len(x[0]) > 250, zip(docs, labels))
    docs = zip(*docs_labels)[0]
    labels = zip(*docs_labels)[1]

    # Transform 'labels' from strings to indices
    label_ids = [[label_to_id_dict[label] for label in label_list]
                 for label_list in labels]

    # Subset the list of doc labels at the given supervision rate and exclude the example docs
    doc_ids = [29, 38, 13, 28, 41]  # example doc ids
    label_ids_sub = [
        label_id_list
        if (random.uniform(0, 1) < supervision_rate) and (doc_id not in doc_ids) else []
        for doc_id, label_id_list in enumerate(label_ids)
    ]

    # Calculate how many topics are covered by the retained labels
    known_topic_counter = Counter()
    for label_id in flatten(label_ids_sub):
        known_topic_counter[label_id] += 1
    topic_coverage = float(len(known_topic_counter)) / len(all_labels)  # percentage of topics covered
    print 'Topic coverage: %s' % topic_coverage

    # Vectorize
    tfidf_vectorizer = tfidf.Vectorizer(vocab_size=2000)
    tfidf_vectorizer.fit(docs)
    doc_term_matrix, terms = tfidf_vectorizer.transform(docs)

    # Factorize (weakly supervised)
    ws_nmf_model = ws_nmf.Model(doc_term_matrix, label_ids_sub, K=len(all_labels))
    ws_nmf_model.train(max_iter=30)
    doc_topic_matrix_ws = ws_nmf_model.W
    topic_term_matrix_ws = ws_nmf_model.H

    # Create useful dictionaries
    # {topic id: terms}
    topic_to_term_dict_ws = create_topic_to_term_dict(topic_term_matrix_ws, terms)
    # {doc id: [(topic id, strength)]}
    doc_to_topic_ws = create_doc_to_topic_dict(
        doc_topic_matrix_ws, cutoff=0.001)  # a higher cutoff reduces the dictionary size
    # {doc id: [(label id, strength)]}
    doc_to_label = defaultdict(list)
    for doc_ind, label_list in enumerate(label_ids):
        for label in label_list:
            doc_to_label[doc_ind].append((label, 1))

    # Compute the topic-to-label similarity matrix
    similarity_ws = compute_similarity_matrix(doc_to_topic_ws, doc_to_label)

    # Run the Hungarian algorithm
    score_ws, sorted_matches_ws, matched_similarity_ws = match_similarity_matrix(similarity_ws)

    # Print the assignment score
    print 'Average similarity: %s' % score_ws

    # Print the top 50 matched assignments
    matched_topic_terms_ws = [
        (round(score, 3), id_to_label_dict[label_ind], topic_to_term_dict_ws[topic_ind])
        for score, topic_ind, label_ind in sorted_matches_ws
    ]
    pprint.pprint(matched_topic_terms_ws[:50])

    # Determine the number of "resolved" topics (similarity > 0.1)
    n_resolved = len([
        score for score, topic_ind, label_ind in sorted_matches_ws if score > 0.1
    ])
    print 'Number of topics resolved: %s' % n_resolved

    # Print examples of documents
    print_examples(doc_ids, docs, doc_to_label, doc_to_topic_ws,
                   topic_to_term_dict_ws, id_to_label_dict)

    return topic_coverage, score_ws, n_resolved
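# Hedged sketch of what match_similarity_matrix is assumed to do: match each
# topic to one label by solving the assignment problem (Hungarian algorithm)
# over the topic-to-label similarity matrix. The real helper may differ, and
# the use of scipy here is an assumption for illustration only.
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_similarity_matrix_sketch(similarity):
    # linear_sum_assignment minimises cost, so negate the similarities
    topic_inds, label_inds = linear_sum_assignment(-similarity)
    matches = [(similarity[t, l], t, l) for t, l in zip(topic_inds, label_inds)]
    matches.sort(reverse=True)  # best-matched (score, topic, label) triples first
    avg_score = float(np.mean([m[0] for m in matches]))
    matched_similarity = similarity[np.ix_(topic_inds, label_inds)]
    return avg_score, matches, matched_similarity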
def train_mlp_ner(train_data: List[LabeledSentence], dev_data, test_data):
    shuffle(train_data)
    """
    =======================================
    ========== Build Indexers =============
    """
    tag_ix = Indexer()
    word_ix = Indexer()
    pos_ix = Indexer()
    word_counter = Counter()

    tag_ix.add_and_get_index(conf.PAD_TOKEN)  # padding
    word_ix.add_and_get_index(conf.PAD_TOKEN)
    tag_ix.add_and_get_index(conf.EOS_TOKEN)  # end of sentence
    word_ix.add_and_get_index(conf.EOS_TOKEN)
    tag_ix.add_and_get_index(conf.BOS_TOKEN)  # beginning of sentence
    word_ix.add_and_get_index(conf.BOS_TOKEN)
    tag_ix.add_and_get_index(conf.UNK_TOKEN)  # unknown words
    word_ix.add_and_get_index(conf.UNK_TOKEN)

    for sentence in train_data:
        for token in sentence.tokens:
            word_counter[token.word] += 1.0
    for sentence in train_data:
        for token in sentence.tokens:
            # Index words subject to the count threshold `th` (rare words fall back to UNK); th=0 here.
            get_word_index(word_indexer=word_ix, word_counter=word_counter, stops=stops,
                           word=token.word.lower(), th=0)
            pos_ix.add_and_get_index(token.pos)
        for tag in sentence.get_bio_tags():
            tag_ix.add_and_get_index(tag)

    ix2embedding = load_word_embedding(pretrained_embedding_filename=conf.glove_file,
                                       word2index_vocab=word_ix.objs_to_ints)

    # Convert sentences, POS tags, and BIO tags to index sequences
    train_sent = []
    POS = []
    train_labels = []
    for sentence in train_data:
        s = []
        pos = []
        labels = []
        for token in sentence.tokens:
            if token.word.lower() in word_ix.objs_to_ints:
                s.append(word_ix.objs_to_ints[token.word.lower()])
            else:
                s.append(word_ix.objs_to_ints[conf.UNK_TOKEN])
            if token.pos in pos_ix.objs_to_ints:
                pos.append(pos_ix.objs_to_ints[token.pos])
            else:
                pos.append(pos_ix.objs_to_ints[conf.UNK_TOKEN])
        for tag in sentence.get_bio_tags():
            labels.append(tag_ix.objs_to_ints[tag])
        train_sent.append(s)
        POS.append(pos)
        train_labels.append(labels)

    epochs = conf.epochs
    batch_size = conf.batch_size
    initial_lr = conf.initial_lr
    no_of_classes = len(tag_ix)
    """
    ==================================
    ===== Network Definition ========
    ==================================
    """
    word_indicator_feat_dim = len(word_ix)
    pos_indicator_feat_dim = len(pos_ix)
    is_upper_feat_dim = 1
    all_caps_indicator_feat_dim = 1
    word_embedding_feat_dim = 300
    context_window_1 = 300
    context_window_2 = 300
    context_left_1 = 300
    context_left_2 = 300
    context_right_1 = 300

    # feat_dim must match the length of the vectors returned by get_features.
    feat_dim = 0
    # feat_dim += word_indicator_feat_dim
    feat_dim += pos_indicator_feat_dim
    feat_dim += is_upper_feat_dim
    feat_dim += all_caps_indicator_feat_dim
    # feat_dim += word_embedding_feat_dim
    feat_dim += context_window_1
    # feat_dim += context_window_2
    # feat_dim += context_left_1
    # # feat_dim += context_left_2
    # feat_dim += context_right_1

    n_input_dim = feat_dim
    n_hidden1 = 64  # Number of hidden nodes
    n_hidden2 = 32
    n_hidden3 = 16
    n_output = no_of_classes  # Number of output nodes = number of BIO tags

    # Sigmoid outputs with BCELoss treat each tag as an independent binary decision
    # against the one-hot targets built in the batch loop below.
    net = nn.Sequential(
        nn.Linear(n_input_dim, n_hidden1),
        nn.ELU(),
        nn.Dropout(0.2),
        nn.Linear(n_hidden1, n_hidden2),
        nn.ELU(),
        nn.Dropout(0.2),
        nn.Linear(n_hidden2, n_hidden3),
        nn.ELU(),
        nn.Linear(n_hidden3, n_output),
        nn.Sigmoid())
    print(net)

    learning_rate = initial_lr
    best_f1 = 0
    # Create the optimizer and loss once; re-creating them every batch would reset Adam's running statistics.
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    loss_func = nn.BCELoss()
    for epoch in range(epochs):
        t = time.time()
        """
        ================= Create batch ===============
        """
        for i in range(0, len(train_sent), batch_size):
            if len(train_sent[i:]) <= batch_size:
                data_batch = train_sent[i:]
                pos_batch = POS[i:]
            else:
                data_batch = train_sent[i: i + batch_size]
                pos_batch = POS[i: i + batch_size]
            Y_train = flatten(train_labels[i: i + batch_size])
            Y_train = np.asarray(Y_train)
            X_train = []
            for sent, pos in zip(data_batch, pos_batch):
                for idx in range(0, len(sent)):
                    X_train.append(get_features(sent, pos, word_ix, pos_ix, ix2embedding, idx))
            X_train = np.asarray(X_train)

            # One-hot encode the tag indices
            y_train_one_hot = np.zeros((Y_train.size, no_of_classes))
            for ix, n in enumerate(Y_train):
                y_train_one_hot[ix, n] = 1
            Y_train = y_train_one_hot

            # Convert to tensors and take one gradient step
            X_train_t = torch.FloatTensor(X_train)
            Y_train_t = torch.FloatTensor(Y_train)
            y_hat = net(X_train_t)
            loss = loss_func(y_hat, Y_train_t)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Compute dev F1 after each epoch; keep the test output for the best dev F1 so far.
        model = MLPNerClassifier(model=net, word_ix=word_ix, pos_ix=pos_ix, tag_ix=tag_ix,
                                 ix2embed=ix2embedding)
        # if (epoch + 1) % 3 == 0:
        dev_decoded = [model.decode(test_ex.tokens) for test_ex in dev_data]
        f1 = print_evaluation_metric(dev_data, dev_decoded)
        if f1 > best_f1:
            best_f1 = f1
            test_decoded = [model.decode(test_ex.tokens) for test_ex in test_data]
            write_test_output(test_decoded, conf.output_path)

        print("-------------------------")
        print("Epoch: ", epoch)
        print("Time taken: ", time.time() - t)
        print(" ")
        print(" -------------------------")
    print("----------")
    print(" ")
    return model
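# Design note (illustration only): the tagger above treats each of the
# no_of_classes tags as an independent sigmoid/BCE decision against a one-hot
# target. A more conventional alternative for mutually exclusive BIO tags is
# CrossEntropyLoss over raw logits with integer class targets, sketched below
# on toy values; the toy shapes and numbers are made up.
import torch
import torch.nn as nn

toy_logits = torch.randn(4, 3)              # 4 tokens, 3 candidate tags (raw scores, no Sigmoid)
toy_targets = torch.tensor([0, 2, 1, 1])    # integer tag indices, no one-hot needed
ce_loss = nn.CrossEntropyLoss()(toy_logits, toy_targets)
print(ce_loss.item())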
def import_pubmed_data(pubmed_filename):
    print 'importing raw data...'
    e = ET.parse(pubmed_filename).getroot()

    # Initialize data dictionaries
    pubmed_dicts = []
    print 'parsing XML...'
    for article in e.findall('PubmedArticle'):
        # Get article ID
        article_id = article.find(".//ArticleId[@IdType='pubmed']").text

        # Get abstract text (pass if no text)
        find_abstracts = article.findall(".//AbstractText")
        if len(find_abstracts) > 0:
            cur_abstract = ' '.join([abstract.text for abstract in find_abstracts])
        else:
            continue

        # Get keywords (pass if no keywords)
        find_keywords = article.findall(".//MeshHeading/DescriptorName")
        if len(find_keywords) > 0:
            cur_keywords = [keyword.text for keyword in find_keywords]
        else:
            continue

        pubmed_dicts.append({
            'article_id': article_id,
            'abstract': cur_abstract,
            'keywords': cur_keywords
        })

    print 'filtering infrequent keywords...'
    # Create counter with keywords
    keywords_counter = Counter()
    for kw in flatten([d['keywords'] for d in pubmed_dicts]):
        keywords_counter[kw] += 1

    # Filter out keywords that occur fewer than 100 times
    keywords_set = [kw for kw in keywords_counter.keys() if keywords_counter[kw] >= 100]
    for d in pubmed_dicts:
        d['keywords'] = filter(lambda x: x in keywords_set, d['keywords'])

    # Now filter out documents that have no remaining keywords
    pubmed_dicts = filter(lambda x: len(x['keywords']) > 0, pubmed_dicts)

    print 'N unique keywords: %s' % len(keywords_set)
    print 'N docs: %s' % len(pubmed_dicts)

    # Create list of docs
    docs = [d['abstract'] for d in pubmed_dicts]
    # Create list of label lists
    labels = [d['keywords'] for d in pubmed_dicts]
    # Create list of all unique labels
    all_labels = list(set(flatten(labels)))

    return docs, labels, all_labels
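# Hedged end-to-end sketch of how the two functions above appear to fit
# together. The file name and supervision rate are placeholders, and `main`
# reads `all_labels` at module scope, so it is assigned globally here; this is
# an assumption about the surrounding script, not part of the original code.
if __name__ == '__main__':
    docs, labels, all_labels = import_pubmed_data('pubmed_sample.xml')  # placeholder path
    topic_coverage, avg_similarity, n_resolved = main(docs, labels, supervision_rate=0.5)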