def test(weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    test_data = read_data_from_json(TEST_DATA_PATH)
    if DEBUG:
        print_data_stats(test_data, "Test")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(test_data)
    test_data, test_labels, _ = preprocess_data(test_data, tokenizer, wf, "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "test",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_data["question_input"].shape[0]
    y = model.predict(test_data)
    assert (y.shape[0] == num_tests)

    correct = 0
    total = 0
    exp_acc = 0.0
    lin_acc = 0.0
    for i in range(0, num_tests):
        predicted = np.argmax(y[i])
        expected = np.argmax(test_labels[i])

        # Expected value (treat y[i] as a random variable).
        value = np.dot(y[i], [0, 1, 2, 3, 4, 5])
        expected_value = np.dot(test_labels[i], [0, 1, 2, 3, 4, 5])
        exp_acc += (np.exp(abs(value - expected_value)) - 1.0)
        lin_acc += abs(value - expected_value)

        if predicted == expected:
            correct += 1
        total += 1
    assert (total == num_tests)

    print("\nEvaluated on {} terms.".format(total))
    print("Accuracy: {0:.3f}%".format(100 * correct / total))
    print("Exp accuracy: {0:.3f}".format(exp_acc / total))
    print("Linear accuracy: {0:.3f}".format(lin_acc / total))

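# Worked example (a sketch, not part of the evaluation above): how the "expected
# value" metrics in test() behave for one term. Class indices 0..5 encode the
# essentiality levels, y[i] is the predicted distribution over them, and the
# numbers below are purely illustrative.
def _example_soft_metrics():
    y_i = np.array([0.05, 0.10, 0.15, 0.40, 0.20, 0.10])  # predicted distribution
    label_i = np.array([0, 0, 0, 0, 1, 0])                # one-hot gold label (class 4)
    value = np.dot(y_i, [0, 1, 2, 3, 4, 5])               # expected class = 2.9
    expected_value = np.dot(label_i, [0, 1, 2, 3, 4, 5])  # 4.0
    lin_err = abs(value - expected_value)                 # linear error = 1.1
    exp_err = np.exp(lin_err) - 1.0                       # exponential error ~ 2.0
    return lin_err, exp_err
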
def predict(entry, sort=False, weights_file=None, show_plot=True):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    assert ("question" in entry)
    question = entry["question"]

    if "terms" not in entry:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(question)
        entry["terms"] = {}
        for token in doc:
            entry["terms"][token.text] = 0

    data = [entry]

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data, _, words = preprocess_data(data, tokenizer, wf, "predict")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "predict",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(data)

    idx = 0
    essentiality = []
    for word in entry["terms"]:
        value = np.dot(y[idx], [0, 1, 2, 3, 4, 5])
        essentiality.append(value / 5.0)
        idx += 1

    if sort:
        zipped = list(zip(words, essentiality))
        zipped.sort(key=lambda x: x[1], reverse=True)
        words = [x[0] for x in zipped]
        essentiality = [x[1] for x in zipped]

    import matplotlib.pyplot as plt
    plt.bar(range(len(words)), essentiality, align='center')
    plt.title("Predicted values")
    plt.xticks(range(len(words)), words, rotation=45,
               horizontalalignment='right')
    plt.tight_layout()
    if show_plot:
        plt.show()

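# Example usage (a sketch: the question is illustrative and a trained weights file
# under models/ is assumed; "terms" is omitted on purpose, so predict() falls back
# to spaCy tokenization of the question).
def _example_predict():
    sample_entry = {"question": "Which gas do plants absorb during photosynthesis?"}
    predict(sample_entry, sort=True, show_plot=True)
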
def plot_pmi_values(dataset="val", index=None, sort=False):
    assert (dataset in ["train", "val", "test"])

    data = None
    if dataset == "train":
        data = read_data_from_json(TRAIN_DATA_PATH)
    elif dataset == "val":
        data = read_data_from_json(VALIDATION_DATA_PATH)
    elif dataset == "test":
        data = read_data_from_json(TEST_DATA_PATH)
    assert (data is not None)

    if index is None:
        entry = random.choice(data)
    else:
        entry = data[index]

    entry_copy = deepcopy(entry)
    predict(entry, sort=sort, show_plot=False)
    entry = entry_copy

    wf = WordFeatures()
    wf.train_PMI([entry])

    from pmi_utils import reduce_positive_avg

    idx = 0
    values = []
    words = []
    for word in entry["terms"]:
        pmi_values = wf.get_PMI(word, entry, use_question=True,
                                reduce_f=reduce_positive_avg)
        values.append(pmi_values[1])
        words.append(word)
        idx += 1

    if sort:
        zipped = list(zip(words, values))
        zipped.sort(key=lambda x: x[1], reverse=True)
        words = [x[0] for x in zipped]
        values = [x[1] for x in zipped]

    print("\nQuestion: {}\n".format(entry["question"]))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.bar(range(len(words)), values, align='center')
    plt.title("PMI values")
    plt.xticks(range(len(words)), words, rotation=45,
               horizontalalignment='right')
    plt.tight_layout()
    plt.show()

def predict_batch(data, weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data2, _, words = preprocess_data(data, tokenizer, wf, "predict_batch")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "predict_batch",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(data2, batch_size=128)

    idx = 0
    out = []
    for entry in data:
        out_set = {}
        for _ in entry["terms"]:
            value = np.dot(y[idx], [0, 1, 2, 3, 4, 5])
            word = words[idx]
            assert (word not in out_set)
            out_set[word] = value / 5.0
            idx += 1
        out.append(out_set)

    num_entries = 0
    for entry in data:
        num_entries += len(entry['terms'])
    assert (num_entries == idx)
    assert (len(data) == len(out))
    for i in range(0, len(data)):
        assert (len(out[i]) == len(data[i]["terms"]))
    for out_set in out:
        num_entries -= len(out_set)
    assert (num_entries == 0)

    return out

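# Example usage (a sketch; the entries are illustrative and follow the same schema
# the other functions here expect: a "question" string plus a "terms" dict keyed
# by candidate term, with a trained weights file assumed under models/).
def _example_predict_batch():
    batch = [
        {"question": "What force pulls objects toward Earth?",
         "terms": {"force": 0, "objects": 0, "Earth": 0}},
        {"question": "Which organ pumps blood through the body?",
         "terms": {"organ": 0, "blood": 0, "body": 0}},
    ]
    scores = predict_batch(batch)  # one {term: essentiality in [0, 1]} dict per entry
    for entry, out_set in zip(batch, scores):
        print(entry["question"], out_set)
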
class TestWordEmbeddings(unittest.TestCase):

    wf = WordFeatures()

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_is_science_term(self):
        self.assertFalse(DEBUG)
        science_terms = [
            "aardvarks",
            "ab initio",
            "center of curvature",
            "force",
            "gravity",
            "geo-science",
            "origins of the solar system",
            "atom",
            "protons",
            "seahorses",
            "newton's law of universal gravitation",
            "nucleus",
            "zwitterion",
            "zygomorphic",
            "zygomycetes",
            "zygospore",
        ]
        for word in science_terms:
            self.assertTrue(self.wf.is_science_term(word))

        not_science_terms = ["love", "beauty", "nice", "language", "glasses"]
        for word in not_science_terms:
            self.assertFalse(self.wf.is_science_term(word))

    def test_concreteness_ratings(self):
        self.assertFalse(DEBUG)
        to_check = {
            "roadsweeper": 4.85,
            "treeless": 4.24,
            "divisional": 2.04,
            "hopeful": 1.7,
            "essentialness": 1.04,
            "interpretively": 1.21,
            "traindriver": 4.54,
            "chocolaty": 3.45,
            "mathematical": 2.9,
            "baking soda": 5.0,
            "beach ball": 5.0,
            "birth certificate": 5.0,
            "adaptive": 1.97,
            "bucharest": 2.5,
            "soccer": 4.76,
            "sebi": 2.5,
            "fasole": 2.5
        }
        for word in to_check:
            r = to_check[word]
            self.assertAlmostEqual(self.wf.get_concretness_rating(word), r)

def plot_F1_scores(dataset, weights_file=None):
    assert (dataset in ["val", "test"])

    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    data = None
    if dataset == "val":
        data = read_data_from_json(VALIDATION_DATA_PATH)
    elif dataset == "test":
        data = read_data_from_json(TEST_DATA_PATH)
    assert (data is not None)

    if DEBUG:
        print_data_stats(data, "F1 scores data")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data, labels, words = preprocess_data(data, tokenizer, wf, "F1 scores")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "F1_scores_data",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = data["question_input"].shape[0]
    y = model.predict(data)
    assert (y.shape[0] == num_tests)

    threshold = 0.0
    f1 = []
    thresholds = []
    best_f1 = None
    best_threshold = None
    acc_at_max_f1 = None
    while threshold <= 1.0:
        correct = 0
        total = 0
        true_positive = 0
        false_positive = 0
        true_negative = 0
        false_negative = 0
        for i in range(0, num_tests):
            # Expected value (treat y[i] as a random variable).
            value = np.dot(y[i], [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
            if value >= threshold:
                predicted = 1
            else:
                predicted = 0

            expected_value = np.argmax(labels[i])
            if expected_value >= 2.5:
                expected = 1
            else:
                expected = 0

            if predicted == expected:
                correct += 1
            if predicted == 1:
                if expected == 1:
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                if expected == 0:
                    true_negative += 1
                else:
                    false_negative += 1
            total += 1
        assert (total == num_tests)
        assert (correct == true_positive + true_negative)

        if true_positive + false_positive == 0:
            threshold += 0.001
            continue
        if true_positive + false_negative == 0:
            threshold += 0.001
            continue

        precision = 1.0 * true_positive / (true_positive + false_positive)
        recall = 1.0 * true_positive / (true_positive + false_negative)
        f1_score = 2.0 * precision * recall / (precision + recall)
        if best_f1 is None or f1_score > best_f1:
            best_f1 = f1_score
            best_threshold = threshold
            acc_at_max_f1 = 1.0 * correct / max(total, 1.0)

        f1.append(f1_score)
        thresholds.append(threshold)
        threshold += 0.001

    print("Best F1 score: {}, at t = {}".format(round(best_f1, 3),
                                                round(best_threshold, 4)))
    print("Accuracy at max F1: {}".format(round(acc_at_max_f1, 3)))

    import matplotlib.pyplot as plt
    plt.title("F1 score")
    plt.xlabel("Threshold")
    plt.ylabel("F1")
    plt.plot(thresholds, f1)
    plt.show()

def binary_test(weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    test_data = read_data_from_json(TEST_DATA_PATH)
    # test_data = undersample_dataset(test_data, prob=0.84)
    if DEBUG:
        print_data_stats(test_data, "Test")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(test_data)
    test_data, test_labels, words = preprocess_data(test_data, tokenizer, wf,
                                                    "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "test",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_data["question_input"].shape[0]
    y = model.predict(test_data)
    assert (y.shape[0] == num_tests)

    correct = 0
    total = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    correct_confidence = 0.0
    wrong_confidence = 0.0
    false_positive_words = []
    for i in range(0, num_tests):
        # Expected value (treat y[i] as a random variable).
        value = np.dot(y[i], [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
        if value >= 0.5:
            predicted = 1
        else:
            predicted = 0

        confidence = None
        if predicted == 1:
            confidence = np.dot(y[i], [0, 0, 0, 1, 1, 1])
        else:
            confidence = np.dot(y[i], [1, 1, 1, 0, 0, 0])

        expected_value = np.argmax(test_labels[i])
        if expected_value >= 2.5:
            expected = 1
        else:
            expected = 0

        if predicted == expected:
            correct += 1
            correct_confidence += confidence
        else:
            wrong_confidence += confidence

        if predicted == 1:
            if expected == 1:
                true_positive += 1
            else:
                false_positive += 1
                false_positive_words.append(words[i])
        else:
            if expected == 0:
                true_negative += 1
            else:
                false_negative += 1
        total += 1
    assert (total == num_tests)
    assert (correct == true_positive + true_negative)

    precision = 100.0 * true_positive / (true_positive + false_positive)
    recall = 100.0 * true_positive / (true_positive + false_negative)
    f1 = 2.0 * precision * recall / (precision + recall)

    print("")
    print("            | Correct class |")
    print("            |     1 |     0 |")
    print("Predicted 1 |{} |{} |".format(
        str(true_positive).rjust(6), str(false_positive).rjust(6)))
    print("Predicted 0 |{} |{} |".format(
        str(false_negative).rjust(6), str(true_negative).rjust(6)))

    print("\nEvaluated on {} terms.".format(total))
    print("Binary accuracy: {0:.3f}%".format(100 * correct / total))
    print("Precision: {0:.3f}%".format(precision))
    print("Recall: {0:.3f}%".format(recall))
    print("F1: {0:.3f}".format(f1 / 100.0))
    if correct >= 1:
        print("Correct confidence {0:.3f}%".format(
            100.0 * correct_confidence / correct))
    if correct < total:
        print("Wrong confidence {0:.3f}%".format(
            100.0 * wrong_confidence / (total - correct)))
    print("")

    random.shuffle(false_positive_words)
    print("Some false positive words: ", str(false_positive_words[:10]))

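# Worked example (a sketch with illustrative counts): the precision/recall/F1
# computation used in binary_test() above, isolated so the formulas are easy to
# check by hand.
def _example_f1():
    true_positive, false_positive, false_negative = 80, 20, 40
    precision = 1.0 * true_positive / (true_positive + false_positive)  # 0.800
    recall = 1.0 * true_positive / (true_positive + false_negative)     # ~0.667
    f1 = 2.0 * precision * recall / (precision + recall)                # ~0.727
    return f1
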
def train():
    train_data = read_data_from_json(TRAIN_DATA_PATH)
    val_data = read_data_from_json(VALIDATION_DATA_PATH)
    test_data = read_data_from_json(TEST_DATA_PATH)

    # train_data = undersample_dataset(train_data, prob=0.68)
    # val_data = undersample_dataset(val_data, prob=0.68)
    # test_data = undersample_dataset(test_data, prob=0.68)

    # train_data = train_data[:2]
    # val_data = val_data[:2]
    # test_data = test_data[:1]

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Validation")
        print_data_stats(test_data, "Test")

    if False:
        print(dataset_similarity(val_data, train_data))   # 0.5714%
        print(dataset_similarity(test_data, train_data))  # 2.112%

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) + all_sentences(val_data) +
        all_sentences(test_data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(train_data + val_data + test_data)
    train_data, train_labels, _ = preprocess_data(train_data, tokenizer, wf,
                                                  "train")
    val_data, val_labels, _ = preprocess_data(val_data, tokenizer, wf,
                                              "validation")
    test_data, test_labels, _ = preprocess_data(test_data, tokenizer, wf,
                                                "test")

    # Equalize training data labels to the same frequency.
    if False:
        from utils import equalize
        train_data, train_labels = equalize(train_data, train_labels)
        if DEBUG:
            print("Train data has been equalized. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))

    if False:
        from utils import oversample_dataset
        train_data, train_labels = oversample_dataset(train_data, train_labels,
                                                      [6000, 8000])
        if DEBUG:
            print("Train data has been oversampled. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)

    model = define_model(num_words, embeddings_matrix, "train",
                         WORD_EMBEDDINGS_DIM)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)

    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 mode='max', save_best_only=True,
                                 save_weights_only=True)
    model.fit(train_data, train_labels,
              batch_size=4000,
              epochs=450,
              verbose=2,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint])

    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

class SentenceFeatures:

    # Feature Enabling
    enabled_concept_features = frozenset(["UMLS"])

    # Instantiate a SentenceFeatures object
    def __init__(self, data):

        # Word-level features module
        self.feat_word = WordFeatures()

        # Only run GENIA tagger if module is available
        if data and enabled['GENIA']:
            tagger = enabled['GENIA']
            self.feat_genia = GeniaFeatures(tagger, data)

        # Only create UMLS cache if module is available
        if enabled['UMLS']:
            self.feat_umls = UMLSFeatures()

        self.enabled_IOB_nonprose_sentence_features = []
        #self.enabled_IOB_nonprose_sentence_features.append('pos')
        #self.enabled_IOB_nonprose_sentence_features.append('pos_context')
        self.enabled_IOB_nonprose_sentence_features.append('prev')
        self.enabled_IOB_nonprose_sentence_features.append('next')
        self.enabled_IOB_nonprose_sentence_features.append('unigram_context')
        self.enabled_IOB_nonprose_sentence_features.append('UMLS')

        self.enabled_IOB_prose_sentence_features = []
        self.enabled_IOB_prose_sentence_features.append('unigram_context')
        self.enabled_IOB_prose_sentence_features.append('pos')
        self.enabled_IOB_prose_sentence_features.append('pos_context')
        self.enabled_IOB_prose_sentence_features.append('prev')
        self.enabled_IOB_prose_sentence_features.append('prev2')
        self.enabled_IOB_prose_sentence_features.append('next')
        self.enabled_IOB_prose_sentence_features.append('next2')
        self.enabled_IOB_prose_sentence_features.append('GENIA')
        self.enabled_IOB_prose_sentence_features.append('UMLS')

    def IOB_prose_features(self, sentence):
        """
        IOB_prose_features

        @param sentence. A list of strings
        @return          A list of dictionaries of features
        """
        features_list = []

        # Get a feature set for each word in the sentence
        for i, word in enumerate(sentence):
            features_list.append(self.feat_word.IOB_prose_features(sentence[i]))

        # Feature: Bag of Words unigram context (window=3)
        if 'unigram_context' in self.enabled_IOB_prose_sentence_features:
            window = 3
            n = len(sentence)

            # Previous unigrams
            for i in range(n):
                end = min(i, window)
                unigrams = sentence[i - end:i]
                for j, u in enumerate(unigrams):
                    features_list[i][('prev_unigrams-%d' % j, u)] = 1

            # Next unigrams
            for i in range(n):
                end = min(i + window, n - 1)
                unigrams = sentence[i + 1:end + 1]
                for j, u in enumerate(unigrams):
                    features_list[i][('next_unigrams-%d' % j, u)] = 1

        # Only POS tag once
        if 'pos' in self.enabled_IOB_prose_sentence_features:
            pos_tagged = nltk_tagger.tag(sentence)

        # Allow for particular features to be enabled
        for feature in self.enabled_IOB_prose_sentence_features:

            # Feature: Part of Speech
            if feature == 'pos':
                for (i, (_, pos)) in enumerate(pos_tagged):
                    features_list[i].update({('pos', pos): 1})

            # Feature: POS context
            if 'pos_context' in self.enabled_IOB_prose_sentence_features:
                window = 3
                n = len(sentence)

                # Previous POS
                for i in range(n):
                    end = min(i, window)
                    for j, p in enumerate(pos_tagged[i - end:i]):
                        pos = p[1]
                        features_list[i][('prev_pos_context-%d' % j, pos)] = 1

                # Next POS
                for i in range(n):
                    end = min(i + window, n - 1)
                    for j, p in enumerate(pos_tagged[i + 1:i + end + 1]):
                        pos = p[1]
                        features_list[i][('next_pos_context-%d' % j, pos)] = 1

            # GENIA features
            if (feature == 'GENIA') and enabled['GENIA']:

                # Get GENIA features
                genia_feat_list = self.feat_genia.features(sentence)

                '''
                print '\t', sentence
                print '\n\n'
                for gf in genia_feat_list:
                    print '\t', gf
                    print
                print '\n\n'
                '''

                for i, feat_dict in enumerate(genia_feat_list):
                    features_list[i].update(feat_dict)

            # Feature: UMLS Word Features (only use prose ones)
            if (feature == "UMLS") and enabled['UMLS']:
                umls_features = self.feat_umls.IOB_prose_features(sentence)
                for i in range(len(sentence)):
                    features_list[i].update(umls_features[i])

        # Used for 'prev' and 'next' features
        ngram_features = [{} for i in range(len(features_list))]

        if "prev" in self.enabled_IOB_prose_sentence_features:
            prev = lambda f: {("prev_" + k[0], k[1]): v for k, v in f.items()}
            prev_list = list(map(prev, features_list))
            for i in range(len(features_list)):
                if i == 0:
                    ngram_features[i][("prev", "*")] = 1
                else:
                    ngram_features[i].update(prev_list[i - 1])

        if "prev2" in self.enabled_IOB_prose_sentence_features:
            prev2 = lambda f: {("prev2_" + k[0], k[1]): v / 2.0
                               for k, v in f.items()}
            prev_list = list(map(prev2, features_list))
            for i in range(len(features_list)):
                if i == 0:
                    ngram_features[i][("prev2", "*")] = 1
                elif i == 1:
                    ngram_features[i][("prev2", "*")] = 1
                else:
                    ngram_features[i].update(prev_list[i - 2])

        if "next" in self.enabled_IOB_prose_sentence_features:
            next = lambda f: {("next_" + k[0], k[1]): v for k, v in f.items()}
            next_list = list(map(next, features_list))
            for i in range(len(features_list)):
                if i < len(features_list) - 1:
                    ngram_features[i].update(next_list[i + 1])
                else:
                    ngram_features[i][("next", "*")] = 1

        if "next2" in self.enabled_IOB_prose_sentence_features:
            next2 = lambda f: {("next2_" + k[0], k[1]): v / 2.0
                               for k, v in f.items()}
            next_list = list(map(next2, features_list))
            for i in range(len(features_list)):
                if i < len(features_list) - 2:
                    ngram_features[i].update(next_list[i + 2])
                elif i == len(features_list) - 2:
                    ngram_features[i][("next2", "**")] = 1
                else:
                    ngram_features[i][("next2", "*")] = 1

        merged = lambda d1, d2: dict(list(d1.items()) + list(d2.items()))
        features_list = [
            merged(features_list[i], ngram_features[i])
            for i in range(len(features_list))
        ]

        '''
        for f in features_list:
            print sorted(f.items())
            print
        print '\n\n\n'
        '''

        return features_list

    def IOB_nonprose_features(self, sentence):
        """
        IOB_nonprose_features

        @param sentence. A list of strings
        @return          A list of dictionaries of features
        """
        # Get a feature set for each word in the sentence
        features_list = []
        for i, word in enumerate(sentence):
            word_feats = self.feat_word.IOB_nonprose_features(sentence[i])
            features_list.append(word_feats)

        # Feature: Bag of Words unigram context (window=3)
        if 'unigram_context' in self.enabled_IOB_nonprose_sentence_features:
            window = 3
            n = len(sentence)

            # Previous unigrams
            for i in range(n):
                end = min(i, window)
                unigrams = sentence[i - end:i]
                for j, u in enumerate(unigrams):
                    features_list[i][('prev_unigrams-%d' % j, u)] = 1

            # Next unigrams
            for i in range(n):
                end = min(i + window, n - 1)
                unigrams = sentence[i + 1:end + 1]
                for j, u in enumerate(unigrams):
                    features_list[i][('next_unigrams-%d' % j, u)] = 1

        # Feature: UMLS Word Features (only use nonprose ones)
        if enabled['UMLS'] and \
           'UMLS' in self.enabled_IOB_nonprose_sentence_features:
            umls_features = self.feat_umls.IOB_nonprose_features(sentence)
            for i in range(len(sentence)):
                features_list[i].update(umls_features[i])

        #return features_list

        if 'pos' in self.enabled_IOB_nonprose_sentence_features:
            pos_tagged = nltk_tagger.tag(sentence)

        # Allow for particular features to be enabled
        for feature in self.enabled_IOB_nonprose_sentence_features:

            # Feature: Part of Speech
            if feature == 'pos':
                for (i, (_, pos)) in enumerate(pos_tagged):
                    features_list[i][('pos', pos)] = 1

            # Feature: POS context
            if 'pos_context' in self.enabled_IOB_nonprose_sentence_features:
                window = 3
                n = len(sentence)

                # Previous POS
                for i in range(n):
                    end = min(i, window)
                    for j, p in enumerate(pos_tagged[i - end:i]):
                        pos = p[1]
                        features_list[i][('prev_pos_context-%d' % j, pos)] = 1

                # Next POS
                for i in range(n):
                    end = min(i + window, n - 1)
                    for j, p in enumerate(pos_tagged[i + 1:i + end + 1]):
                        pos = p[1]
                        features_list[i][('next_pos_context-%d' % j, pos)] = 1

        ngram_features = [{} for _ in range(len(features_list))]

        if "prev" in self.enabled_IOB_nonprose_sentence_features:
            prev = lambda f: {("prev_" + k[0], k[1]): v for k, v in f.items()}
            prev_list = list(map(prev, features_list))
            for i in range(len(features_list)):
                if i == 0:
                    ngram_features[i][("prev", "*")] = 1
                else:
                    ngram_features[i].update(prev_list[i - 1])

        if "next" in self.enabled_IOB_nonprose_sentence_features:
            next = lambda f: {("next_" + k[0], k[1]): v for k, v in f.items()}
            next_list = list(map(next, features_list))
            for i in range(len(features_list)):
                if i == len(features_list) - 1:
                    ngram_features[i][("next", "*")] = 1
                else:
                    ngram_features[i].update(next_list[i + 1])

        merged = lambda d1, d2: dict(list(d1.items()) + list(d2.items()))
        features_list = [
            merged(features_list[i], ngram_features[i])
            for i in range(len(features_list))
        ]

        return features_list

    def concept_features_for_sentence(self, sentence, chunk_inds):
        """
        concept_features()

        @param sentence.   A sentence in list of chunk format
        @param chunk_inds. A list of indices for non-None-labeled chunks
        @return            A list of feature dictionaries
        """
        # Get a feature set for each word in the sentence
        features_list = []
        for ind in chunk_inds:
            features_list.append(
                self.feat_word.concept_features_for_chunk(sentence, ind))

        # Allow for particular features to be enabled
        for feature in self.enabled_concept_features:

            # Features: UMLS features
            if (feature == "UMLS") and enabled['UMLS']:
                umls_features = self.feat_umls.concept_features_for_chunks(
                    sentence, chunk_inds)
                for i in range(len(chunk_inds)):
                    features_list[i].update(umls_features[i])

        return features_list

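# Example usage (a sketch; it assumes the 'GENIA' and 'UMLS' entries of the enabled
# dict are falsy, so only the word-level, POS, unigram-context and n-gram features
# are produced, and the sentence is purely illustrative).
def _example_sentence_features():
    feats = SentenceFeatures(data=None)
    sentence = ["Patient", "denies", "chest", "pain", "."]
    prose = feats.IOB_prose_features(sentence)        # one feature dict per token
    nonprose = feats.IOB_nonprose_features(sentence)  # same, with the nonprose feature set
    return prose, nonprose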