def extract_features_sentence(sentence):
    """
    extract_features_sentence

    Compute a list of dict-based feature representation for a list of tokens.

    Every token starts from feat_word.IOB_prose_features(token). The result
    is then extended according to the names present in the module-level
    enabled_IOB_prose_sentence_features collection:

      'unigram_context'  up to 3 neighbouring tokens on each side
      'pos'              the tag nltk_tagger assigns to the token
      'pos_context'      the tags of up to 3 neighbouring tokens on each side
      'GENIA', 'UMLS'    per-token dicts from feat_genia / feat_umls, only
                         when the matching enabled[...] flag is also truthy
      'prev', 'next'     a copy of the neighbouring token's features with
                         the feature name prefixed by 'prev_' / 'next_'
      'prev2', 'next2'   same for the token two positions away, with the
                         values halved

    @param sentence. A list of tokens.
    @return          A list of feature dictionaries, one per token. The keys
                     added here are (feature name, feature value) tuples.
    """
    enabled_features = enabled_IOB_prose_sentence_features
    window = 3
    n = len(sentence)

    # Get a feature set for each word in the sentence
    features_list = [feat_word.IOB_prose_features(word) for word in sentence]

    # Feature: Bag of Words unigram context (window=3)
    if 'unigram_context' in enabled_features:
        for i in range(n):
            # Slices clamp at the end of the sentence on their own; only the
            # start index has to be kept from going negative.
            for j, unigram in enumerate(sentence[max(i - window, 0):i]):
                features_list[i][('prev_unigrams-%d' % j, unigram)] = 1
            for j, unigram in enumerate(sentence[i + 1:i + 1 + window]):
                features_list[i][('next_unigrams-%d' % j, unigram)] = 1

    # Only POS tag once. Both 'pos' and 'pos_context' read the tags, so tag
    # when either is enabled (otherwise 'pos_context' alone hits a NameError).
    if 'pos' in enabled_features or 'pos_context' in enabled_features:
        pos_tagged = nltk_tagger.tag(sentence)

    # Allow for particular features to be enabled
    for feature in enabled_features:

        # Feature: Part of Speech
        if feature == 'pos':
            for i, tagged in enumerate(pos_tagged):
                features_list[i][('pos', tagged[1])] = 1

        # Feature: POS context (window=3)
        elif feature == 'pos_context':
            for i in range(n):
                before = pos_tagged[max(i - window, 0):i]
                for j, tagged in enumerate(before):
                    features_list[i][('prev_pos_context-%d' % j,
                                      tagged[1])] = 1
                # Following tags get their own 'next_' key and the same
                # window as the unigram context above.
                after = pos_tagged[i + 1:i + 1 + window]
                for j, tagged in enumerate(after):
                    features_list[i][('next_pos_context-%d' % j,
                                      tagged[1])] = 1

        # GENIA features
        elif feature == 'GENIA':
            if enabled['GENIA']:
                genia_feat_list = feat_genia.features(sentence)
                for i, feat_dict in enumerate(genia_feat_list):
                    features_list[i].update(feat_dict)

        # Feature: UMLS Word Features (only use prose ones)
        elif feature == 'UMLS':
            if enabled['UMLS']:
                umls_features = feat_umls.extract_umls_features(sentence)
                for i in range(n):
                    features_list[i].update(umls_features[i])

    def shifted(prefix, feats, halve=False):
        # Copy a feature dict, prefixing every feature name; the two-away
        # neighbours contribute with half weight.
        if halve:
            return {(prefix + k[0], k[1]): v / 2.0 for k, v in feats.items()}
        return {(prefix + k[0], k[1]): v for k, v in feats.items()}

    # Used for 'prev' and 'next' features. They are collected separately so
    # that every copy is taken from the per-word features only, never from
    # another token's already-copied neighbour features.
    count = len(features_list)
    ngram_features = [{} for _ in range(count)]

    if "prev" in enabled_features:
        prev_list = [shifted("prev_", f) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i == 0:
                # No predecessor: mark the sentence start instead
                ngram[("prev", "*")] = 1
            else:
                ngram.update(prev_list[i - 1])

    if "prev2" in enabled_features:
        prev2_list = [shifted("prev2_", f, halve=True) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < 2:
                ngram[("prev2", "*")] = 1
            else:
                ngram.update(prev2_list[i - 2])

    if "next" in enabled_features:
        next_list = [shifted("next_", f) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < count - 1:
                ngram.update(next_list[i + 1])
            else:
                # No successor: mark the sentence end instead
                ngram[("next", "*")] = 1

    if "next2" in enabled_features:
        next2_list = [shifted("next2_", f, halve=True) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < count - 2:
                ngram.update(next2_list[i + 2])
            elif i == count - 2:
                ngram[("next2", "**")] = 1
            else:
                ngram[("next2", "*")] = 1

    # Merge into fresh dicts; neighbour features win on a key clash
    merged_list = []
    for word_feats, ngram in zip(features_list, ngram_features):
        merged = dict(word_feats)
        merged.update(ngram)
        merged_list.append(merged)

    return merged_list
def extract_sentence_features(sentence):
    """
    extract_sentence_features

    Compute a list of dict-based feature representation for a list of tokens.

    Every token starts from feat_word.extract_word_features(token). The
    result is then extended according to the names present in the
    module-level enabled_sentence_features collection:

      'unigram_context'  up to 3 neighbouring tokens on each side
      'pos'              the tag nltk_tagger assigns to the token
      'pos_context'      the tags of up to 3 neighbouring tokens on each side
      'GENIA', 'UMLS'    per-token dicts from feat_genia / feat_umls, only
                         when the matching enabled[...] flag is also truthy
      'prev', 'next'     a copy of the neighbouring token's features with
                         the feature name prefixed by 'prev_' / 'next_'
      'prev2', 'next2'   same for the token two positions away, with the
                         values halved

    @param sentence. A list of tokens.
    @return          A list of feature dictionaries, one per token. The keys
                     added here are (feature name, feature value) tuples.
    """
    enabled_features = enabled_sentence_features
    window = 3
    n = len(sentence)

    # Get a feature set for each word in the sentence
    features_list = [feat_word.extract_word_features(word)
                     for word in sentence]

    # Feature: Bag of Words unigram context (window=3)
    if 'unigram_context' in enabled_features:
        for i in range(n):
            # Slices clamp at the end of the sentence on their own; only the
            # start index has to be kept from going negative.
            for j, unigram in enumerate(sentence[max(i - window, 0):i]):
                features_list[i][('prev_unigrams-%d' % j, unigram)] = 1
            for j, unigram in enumerate(sentence[i + 1:i + 1 + window]):
                features_list[i][('next_unigrams-%d' % j, unigram)] = 1

    # Only POS tag once. Both 'pos' and 'pos_context' read the tags, so tag
    # when either is enabled (otherwise 'pos_context' alone hits a NameError).
    if 'pos' in enabled_features or 'pos_context' in enabled_features:
        pos_tagged = nltk_tagger.tag(sentence)

    # Allow for particular features to be enabled
    for feature in enabled_features:

        # Feature: Part of Speech
        if feature == 'pos':
            for i, tagged in enumerate(pos_tagged):
                features_list[i][('pos', tagged[1])] = 1

        # Feature: POS context (window=3)
        elif feature == 'pos_context':
            for i in range(n):
                before = pos_tagged[max(i - window, 0):i]
                for j, tagged in enumerate(before):
                    features_list[i][('prev_pos_context-%d' % j,
                                      tagged[1])] = 1
                # Following tags get their own 'next_' key and the same
                # window as the unigram context above.
                after = pos_tagged[i + 1:i + 1 + window]
                for j, tagged in enumerate(after):
                    features_list[i][('next_pos_context-%d' % j,
                                      tagged[1])] = 1

        # GENIA features
        elif feature == 'GENIA':
            if enabled['GENIA']:
                genia_feat_list = feat_genia.features(sentence)
                for i, feat_dict in enumerate(genia_feat_list):
                    features_list[i].update(feat_dict)

        # Feature: UMLS Word Features (only use prose ones)
        elif feature == 'UMLS':
            if enabled['UMLS']:
                umls_features = feat_umls.extract_umls_features(sentence)
                for i in range(n):
                    features_list[i].update(umls_features[i])

    def shifted(prefix, feats, halve=False):
        # Copy a feature dict, prefixing every feature name; the two-away
        # neighbours contribute with half weight.
        if halve:
            return {(prefix + k[0], k[1]): v / 2.0 for k, v in feats.items()}
        return {(prefix + k[0], k[1]): v for k, v in feats.items()}

    # Used for 'prev' and 'next' features. The shifted copies are built as
    # real lists (not lazy map objects) because they are indexed below.
    count = len(features_list)
    ngram_features = [{} for _ in range(count)]

    if "prev" in enabled_features:
        prev_list = [shifted("prev_", f) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i == 0:
                # No predecessor: mark the sentence start instead
                ngram[("prev", "*")] = 1
            else:
                ngram.update(prev_list[i - 1])

    if "prev2" in enabled_features:
        prev2_list = [shifted("prev2_", f, halve=True) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < 2:
                ngram[("prev2", "*")] = 1
            else:
                ngram.update(prev2_list[i - 2])

    if "next" in enabled_features:
        next_list = [shifted("next_", f) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < count - 1:
                ngram.update(next_list[i + 1])
            else:
                # No successor: mark the sentence end instead
                ngram[("next", "*")] = 1

    if "next2" in enabled_features:
        next2_list = [shifted("next2_", f, halve=True) for f in features_list]
        for i, ngram in enumerate(ngram_features):
            if i < count - 2:
                ngram.update(next2_list[i + 2])
            elif i == count - 2:
                ngram[("next2", "**")] = 1
            else:
                ngram[("next2", "*")] = 1

    # Merge into fresh dicts; neighbour features win on a key clash.
    # copy + update works on both Python 2 and 3, unlike items() + items().
    merged_list = []
    for word_feats, ngram in zip(features_list, ngram_features):
        merged = dict(word_feats)
        merged.update(ngram)
        merged_list.append(merged)

    return merged_list