def add_features_generic(mention_id, pheno_words, sentence):
    """Emit features for a phenotype mention candidate, using ONLY the
    generic ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator.
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/pheno_var.tsv", "VARKW")
    ddlib.load_dictionary(
        BASE_DIR + "/dicts/features/pheno_patient.tsv", "PATIENTKW")
    # Pack the sentence into the (awkward) per-column structure that the
    # ddlib interface requires.
    packed = dict()
    packed['lemma'] = [tok.lemma for tok in sentence.words]
    packed['words'] = [tok.word for tok in sentence.words]
    packed['ner'] = [tok.ner for tok in sentence.words]
    packed['pos'] = [tok.pos for tok in sentence.words]
    packed['dep_graph'] = [
        "\t".join((str(tok.dep_parent + 1), tok.dep_path,
                   str(tok.in_sent_idx + 1)))
        for tok in sentence.words]
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    # Collect into a set first (deduplication), then print each feature.
    for feature in set(
            ddlib.get_generic_features_mention(word_obj_list, span)):
        print_feature(sentence.doc_id, mention_id, feature)
def add_features_generic(relation_id, gene_words, pheno_words, sentence):
    """Emit features for a gene-phenotype relation candidate, using ONLY
    the generic ddlib feature library."""
    # Create the objects used by ddlib. ddlib interface is so ugly.
    obj = dict()
    obj['lemma'] = []
    obj['words'] = []
    obj['ner'] = []
    obj['pos'] = []
    obj['dep_graph'] = []
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        obj['dep_graph'].append(
            str(word.dep_parent + 1) + "\t" + word.dep_path + "\t" +
            str(word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # BUG FIX: was 'pheno_words[0].ins_sent_idx' -- a typo that raises
    # AttributeError at runtime ('in_sent_idx' everywhere else).
    pheno_span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    features = set()
    # BUG FIX: ddlib's relation feature generator is named
    # get_generic_features_relation (plural "features", matching
    # get_generic_features_mention used by the mention extractors);
    # the original called the non-existent get_generic_feature_relation.
    for feature in ddlib.get_generic_features_relation(
            word_obj_list, gene_span, pheno_span):
        features.add(feature)
    for feature in features:
        print_feature(sentence.doc_id, relation_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Emit features for a gene mention candidate, using ONLY the generic
    ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator,
    # loaded in the same order as before.
    for dict_file, dict_name in (
            ("gene_var.tsv", "VARKW"),
            ("gene_knock.tsv", "KNOCKKW"),
            ("gene_amino.tsv", "AMINOKW"),
            ("gene_antigene.tsv", "ANTIGENEKW"),
            ("gene_dna.tsv", "DNAKW"),
            ("gene_downregulation.tsv", "DOWNREGKW"),
            ("gene_upregulation.tsv", "UPREGKW"),
            ("gene_tumor.tsv", "TUMORKW"),
            ("gene_gene.tsv", "GENEKW"),
            ("gene_expression.tsv", "EXPRESSKW")):
        ddlib.load_dictionary(
            BASE_DIR + "/dicts/features/" + dict_file, dict_name)
    # Pack the sentence into the (awkward) per-column structure that the
    # ddlib interface requires.
    packed = {
        'lemma': [tok.lemma for tok in sentence.words],
        'words': [tok.word for tok in sentence.words],
        'ner': [tok.ner for tok in sentence.words],
        'pos': [tok.pos for tok in sentence.words],
        'dep_graph': [
            str(tok.dep_parent + 1) + "\t" + tok.dep_path + "\t" +
            str(tok.in_sent_idx + 1) for tok in sentence.words],
    }
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # Collect into a set first (deduplication), then print each feature.
    for feature in set(ddlib.get_generic_features_mention(
            word_obj_list, gene_span)):
        print_feature(sentence.doc_id, mention_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Emit features for a gene mention candidate, using ONLY the generic
    ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator.
    features_dir = BASE_DIR + "/dicts/features/"
    ddlib.load_dictionary(features_dir + "gene_var.tsv", "VARKW")
    ddlib.load_dictionary(features_dir + "gene_knock.tsv", "KNOCKKW")
    ddlib.load_dictionary(features_dir + "gene_amino.tsv", "AMINOKW")
    ddlib.load_dictionary(features_dir + "gene_antigene.tsv", "ANTIGENEKW")
    ddlib.load_dictionary(features_dir + "gene_dna.tsv", "DNAKW")
    ddlib.load_dictionary(features_dir + "gene_downregulation.tsv",
                          "DOWNREGKW")
    ddlib.load_dictionary(features_dir + "gene_upregulation.tsv", "UPREGKW")
    ddlib.load_dictionary(features_dir + "gene_tumor.tsv", "TUMORKW")
    ddlib.load_dictionary(features_dir + "gene_gene.tsv", "GENEKW")
    ddlib.load_dictionary(features_dir + "gene_expression.tsv", "EXPRESSKW")
    # Pack the sentence into the per-column structure that the ddlib
    # interface requires.
    packed = {'lemma': [], 'words': [], 'ner': [], 'pos': [],
              'dep_graph': []}
    for tok in sentence.words:
        packed['lemma'].append(tok.lemma)
        packed['words'].append(tok.word)
        packed['ner'].append(tok.ner)
        packed['pos'].append(tok.pos)
        packed['dep_graph'].append(
            str(tok.dep_parent + 1) + "\t" + tok.dep_path + "\t" +
            str(tok.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # Collect into a set first (deduplication), then print each feature.
    distinct = set(ddlib.get_generic_features_mention(
        word_obj_list, gene_span))
    for feature in distinct:
        print_feature(sentence.doc_id, mention_id, feature)
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a gene mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                # Ignoring "be" comes from pharm (Emily)
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'VERB_[' + minw + ']' + minp)
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in KNOCK_KWS:
                    kw = "_KNOCKOUT"
                elif word2.lemma in ANTIGENE_KWS:
                    kw = "_ANTIGENE"
                elif word2.lemma in AMINO_ACID_KWS:
                    kw = "_AMINOACID"
                # elif word2.lemma in DNA_KWS:
                #     kw = "_DNA"
                elif word2.lemma in DOWNREGULATION_KWS:
                    kw = "_DOWNREGULATION"
                elif word2.lemma in UPREGULATION_KWS:
                    kw = "_UPREGULATION"
                # elif word2.lemma in TUMOR_KWS:
                #     kw = "_TUMOR"
                # elif word2.lemma in GENE_KWS:
                #     kw = "_GENE"
                # elif word2.lemma in COEXPRESSION_KWS:
                #     ke = "_COEXPRESSION"
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
                if len(p) < 100:
                    print_feature(
                        sentence.doc_id, mention_id,
                        "KEYWORD_[" + kw + "]" + p)
    # Special features for the keyword on the shortest dependency path
    if minw:
        print_feature(
            sentence.doc_id, mention_id,
            'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(
            sentence.doc_id, mention_id, 'KEYWORD_MIN_[' + minw + ']')
    # If another gene is present in the sentence, add a feature with that
    # gene and the path to it. This comes from pharm.
    minl = 100
    minp = None
    minw = None
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    for word in mention_words:
        for word2 in sentence.words:
            if word2.in_sent_idx not in mention_wordidxs and \
                    word2.word in merged_genes_dict:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'OTHER_GENE_[' + minw + ']' + minp)
        # print_feature(sentence.doc_id, mention_id,
        #               'OTHER_GENE_['+minw+']')
    # The lemma on the left of the candidate, whatever it is.
    # BUG FIX: the original wrapped this in try/except IndexError, but an
    # index of -1 (mention at sentence start) silently wraps to the LAST
    # word instead of raising -- guard explicitly.
    if mention_words[0].in_sent_idx > 0:
        left = sentence.words[mention_words[0].in_sent_idx - 1].lemma
        try:
            float(left)
            left = "_NUMBER"
        except ValueError:
            pass
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_1_[" + left + "]")
    # The lemma on the right of the candidate, whatever it is
    try:
        right = sentence.words[mention_words[-1].in_sent_idx + 1].lemma
        try:
            float(right)
            right = "_NUMBER"
        except ValueError:
            pass
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_1_[" + right + "]")
    except IndexError:
        pass
    # We now check whether the lemma on the left and on the right are
    # "special", for example a year or a gene.
    # The concept of left or right is a little tricky here, as we are
    # actually looking at the first word that contains only letters and is
    # not a stopword.
    idx = mention_words[0].in_sent_idx - 1
    gene_on_left = None
    gene_on_right = None
    # NOTE(review): 'sentence.words[idx] in merged_genes_dict' tests the
    # Word OBJECT (not its text) against the dict and so is very likely
    # always False; it was probably meant to be '.word'. Kept as-is to
    # preserve behavior -- confirm before changing.
    # BUG FIX below: the number regex had an unescaped '.' which matched
    # any character (e.g. "12a34"); now requires a literal decimal point.
    while idx >= 0 and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx -= 1
    if idx >= 0:
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_left = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_LEFT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    # The word on the right of the mention, if present, provided it's
    # alphanumeric but not a number
    idx = mention_words[-1].in_sent_idx + 1
    while idx < len(sentence.words) and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx += 1
    if idx < len(sentence.words):
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_right = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_RIGHT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    if gene_on_left and gene_on_right:
        print_feature(sentence.doc_id, mention_id, "IS_BETWEEN_GENES")
    elif gene_on_left:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_LEFT")
    elif gene_on_right:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_RIGHT")
    # The candidate is a single word that appears many times (more than 4)
    # in the sentence
    if len(mention_words) == 1 and \
            [w.word for w in sentence.words].count(
                mention_words[0].word) > 4:
        print_feature(
            sentence.doc_id, mention_id, "APPEARS_MANY_TIMES_IN_SENTENCE")
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a gene mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                # Ignoring "be" comes from pharm (Emily)
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'VERB_[' + minw + ']' + minp)
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in KNOCK_KWS:
                    kw = "_KNOCKOUT"
                elif word2.lemma in ANTIGENE_KWS:
                    kw = "_ANTIGENE"
                elif word2.lemma in AMINO_ACID_KWS:
                    kw = "_AMINOACID"
                # elif word2.lemma in DNA_KWS:
                #     kw = "_DNA"
                elif word2.lemma in DOWNREGULATION_KWS:
                    kw = "_DOWNREGULATION"
                elif word2.lemma in UPREGULATION_KWS:
                    kw = "_UPREGULATION"
                # elif word2.lemma in TUMOR_KWS:
                #     kw = "_TUMOR"
                # elif word2.lemma in GENE_KWS:
                #     kw = "_GENE"
                # elif word2.lemma in COEXPRESSION_KWS:
                #     ke = "_COEXPRESSION"
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
                if len(p) < 100:
                    print_feature(sentence.doc_id, mention_id,
                                  "KEYWORD_[" + kw + "]" + p)
    # Special features for the keyword on the shortest dependency path
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(sentence.doc_id, mention_id,
                      'KEYWORD_MIN_[' + minw + ']')
    # If another gene is present in the sentence, add a feature with that
    # gene and the path to it. This comes from pharm.
    minl = 100
    minp = None
    minw = None
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    for word in mention_words:
        for word2 in sentence.words:
            if word2.in_sent_idx not in mention_wordidxs and \
                    word2.word in merged_genes_dict:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'OTHER_GENE_[' + minw + ']' + minp)
        # print_feature(sentence.doc_id, mention_id,
        #               'OTHER_GENE_['+minw+']')
    # The lemma on the left of the candidate, whatever it is.
    # BUG FIX: the original wrapped this in try/except IndexError, but an
    # index of -1 (mention at sentence start) silently wraps to the LAST
    # word instead of raising -- guard explicitly.
    if mention_words[0].in_sent_idx > 0:
        left = sentence.words[mention_words[0].in_sent_idx - 1].lemma
        try:
            float(left)
            left = "_NUMBER"
        except ValueError:
            pass
        print_feature(sentence.doc_id, mention_id,
                      "NGRAM_LEFT_1_[" + left + "]")
    # The lemma on the right of the candidate, whatever it is
    try:
        right = sentence.words[mention_words[-1].in_sent_idx + 1].lemma
        try:
            float(right)
            right = "_NUMBER"
        except ValueError:
            pass
        print_feature(sentence.doc_id, mention_id,
                      "NGRAM_RIGHT_1_[" + right + "]")
    except IndexError:
        pass
    # We now check whether the lemma on the left and on the right are
    # "special", for example a year or a gene.
    # The concept of left or right is a little tricky here, as we are
    # actually looking at the first word that contains only letters and is
    # not a stopword.
    idx = mention_words[0].in_sent_idx - 1
    gene_on_left = None
    gene_on_right = None
    # NOTE(review): 'sentence.words[idx] in merged_genes_dict' tests the
    # Word OBJECT (not its text) against the dict and so is very likely
    # always False; it was probably meant to be '.word'. Kept as-is to
    # preserve behavior -- confirm before changing.
    # BUG FIX below: the number regex had an unescaped '.' which matched
    # any character (e.g. "12a34"); now requires a literal decimal point.
    while idx >= 0 and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx -= 1
    if idx >= 0:
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_left = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_LEFT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    # The word on the right of the mention, if present, provided it's
    # alphanumeric but not a number
    idx = mention_words[-1].in_sent_idx + 1
    while idx < len(sentence.words) and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx += 1
    if idx < len(sentence.words):
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_right = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_RIGHT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    if gene_on_left and gene_on_right:
        print_feature(sentence.doc_id, mention_id, "IS_BETWEEN_GENES")
    elif gene_on_left:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_LEFT")
    elif gene_on_right:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_RIGHT")
    # The candidate is a single word that appears many times (more than 4)
    # in the sentence
    if len(mention_words) == 1 and \
            [w.word for w in sentence.words].count(
                mention_words[0].word) > 4:
        print_feature(sentence.doc_id, mention_id,
                      "APPEARS_MANY_TIMES_IN_SENTENCE")
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a phenotype mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    # The first alphanumeric lemma on the left of the mention, if present.
    idx = mention_words[0].in_sent_idx - 1
    left_lemma_idx = -1
    left_lemma = ""
    while idx >= 0 and not sentence.words[idx].word.isalnum():
        idx -= 1
    # BUG FIX: the original used try/except IndexError here, but idx == -1
    # silently wraps to the LAST word of the sentence instead of raising;
    # guard explicitly.
    if idx >= 0:
        left_lemma = sentence.words[idx].lemma
        try:
            float(left_lemma)
            left_lemma = "_NUMBER"
        except ValueError:
            pass
        left_lemma_idx = idx
        print_feature(
            sentence.doc_id, mention_id,
            "NGRAM_LEFT_1_[{}]".format(left_lemma))
    # The first alphanumeric lemma on the right of the mention, if present.
    idx = mention_wordidxs[-1] + 1
    right_lemma_idx = -1
    right_lemma = ""
    while idx < len(sentence.words) and \
            not sentence.words[idx].word.isalnum():
        idx += 1
    if idx < len(sentence.words):
        right_lemma = sentence.words[idx].lemma
        try:
            float(right_lemma)
            right_lemma = "_NUMBER"
        except ValueError:
            pass
        right_lemma_idx = idx
        print_feature(
            sentence.doc_id, mention_id,
            "NGRAM_RIGHT_1_[{}]".format(right_lemma))
    # The lemma "two on the left" of the mention, if present.
    # BUG FIX: when no left neighbor was found (sentinel -1) or it was the
    # first word, 'left_lemma_idx - 1' wrapped around via negative
    # indexing; guard instead of relying on IndexError.
    if left_lemma_idx > 0:
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_2_[{}]".format(
                sentence.words[left_lemma_idx - 1].lemma))
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_2_C_[{} {}]".format(
                sentence.words[left_lemma_idx - 1].lemma, left_lemma))
    # The lemma "two on the right" of the mention, if present.
    # BUG FIX: when no right neighbor was found (sentinel -1), the original
    # printed sentence.words[0]; require a found neighbor with a word
    # after it.
    if right_lemma_idx >= 0 and right_lemma_idx + 1 < len(sentence.words):
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_2_[{}]".format(
                sentence.words[right_lemma_idx + 1].lemma))
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_2_C_[{} {}]".format(
                right_lemma, sentence.words[right_lemma_idx + 1].lemma))
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in PATIENT_KWS:
                    kw = "_HUMAN"
                print_feature(
                    sentence.doc_id, mention_id, "KEYWORD_[" + kw + "]" + p)
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
    # Special feature for the keyword on the shortest dependency path
    if minw:
        print_feature(
            sentence.doc_id, mention_id,
            'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(
            sentence.doc_id, mention_id, 'KEYWORD_MIN_[' + minw + ']')
    # The verb closest to the candidate
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.word.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'VERB_[' + minw + ']' + minp)
def add_features(relation_id, gene_words, pheno_words, sentence):
    """Emit hand-crafted features for a gene-phenotype relation candidate.

    Features are written out via print_feature(doc_id, relation_id, name);
    features get an "INV_" prefix when the phenotype precedes the gene.
    """
    # Find the start/end indices of the mentions composing the relation
    gene_start = gene_words[0].in_sent_idx
    pheno_start = pheno_words[0].in_sent_idx
    gene_end = gene_words[-1].in_sent_idx
    pheno_end = pheno_words[-1].in_sent_idx
    limits = sorted((gene_start, pheno_start, gene_end, pheno_end))
    start = limits[0]
    betw_start = limits[1]
    betw_end = limits[2]
    end = limits[3]
    # If the gene comes first, we do not prefix, otherwise we do.
    if start == gene_start:
        inv = ""
    else:
        inv = "INV_"
    # Verbs between the mentions
    verbs_between = []
    minl_gene = 100
    minp_gene = None
    minw_gene = None
    mini_gene = None
    minl_pheno = 100
    # minp_pheno = None
    minw_pheno = None
    mini_pheno = None
    neg_found = False
    # Look at all the words, as in the dependency path there could be words
    # that are close to both mentions but not between them
    for i in range(len(sentence.words)):
        # The filtering of the brackets and commas is from Emily's code.
        if re.search('^VB[A-Z]*$', sentence.words[i].pos) and \
                sentence.words[i].word not in \
                ["{", "}", "(", ")", "[", "]"] and \
                "," not in sentence.words[i].word:
            (p_gene, l_gene) = sentence.get_word_dep_path(
                betw_start, sentence.words[i].in_sent_idx)
            (p_pheno, l_pheno) = sentence.get_word_dep_path(
                sentence.words[i].in_sent_idx, betw_end)
            if l_gene < minl_gene:
                minl_gene = l_gene
                minp_gene = p_gene
                minw_gene = sentence.words[i].lemma
                mini_gene = sentence.words[i].in_sent_idx
            if l_pheno < minl_pheno:
                minl_pheno = l_pheno
                # minp_pheno = p_pheno
                minw_pheno = sentence.words[i].lemma
                mini_pheno = sentence.words[i].in_sent_idx
            # Look for negation.
            if i > 0 and sentence.words[i - 1].lemma in \
                    ["no", "not", "neither", "nor"]:
                if i < betw_end - 2:
                    neg_found = True
                    # BUG FIX: this call was missing the sentence.doc_id
                    # argument that every other print_feature call passes.
                    print_feature(
                        sentence.doc_id, relation_id,
                        inv + "NEG_VERB_[" + sentence.words[i - 1].word +
                        "]-" + sentence.words[i].lemma)
            else:
                verbs_between.append(sentence.words[i])
    if len(verbs_between) == 1 and not neg_found:
        print_feature(sentence.doc_id, relation_id,
                      inv + "SINGLE_VERB_[%s]" % verbs_between[0].lemma)
    else:
        for verb in verbs_between:
            if verb.in_sent_idx > betw_start and \
                    verb.in_sent_idx < betw_end:
                print_feature(sentence.doc_id, relation_id,
                              inv + "VERB_[%s]" % verb.lemma)
    if mini_pheno == mini_gene and mini_gene is not None and \
            len(minp_gene) < 50:  # and "," not in minw_gene:
        # feature = inv + 'MIN_VERB_[' + minw_gene + ']' + minp_gene
        # features.append(feature)
        feature = inv + 'MIN_VERB_[' + minw_gene + ']'
        print_feature(sentence.doc_id, relation_id, feature)
    else:
        feature = inv
        if mini_gene is not None:
            # feature = 'MIN_VERB_GENE_[' + minw_gene + ']' + minp_gene
            # print_feature(sentence.doc_id, relation_id, feature)
            feature += 'MIN_VERB_GENE_[' + minw_gene + ']'
        else:
            feature += 'MIN_VERB_GENE_[NULL]'
        if mini_pheno is not None:
            # feature = 'MIN_VERB_pheno_[' + minw_pheno + ']' + minp_pheno
            # print_feature(sentence.doc_id, relation_id, feature)
            feature += '_pheno_[' + minw_pheno + ']'
        else:
            feature += '_pheno_[NULL]'
        print_feature(sentence.doc_id, relation_id, feature)
    # The following features are only added if the two mentions are "close
    # enough" to avoid overfitting. The concept of "close enough" is
    # somewhat arbitrary.
    neg_word_index = -1
    if betw_end - betw_start - 1 < 8:
        for i in range(betw_start + 1, betw_end):
            # Feature for separation between entities.
            # TODO Think about merging these?
            # I think these should be some kind of supervision rule instead?
            if "while" == sentence.words[i].lemma:
                print_feature(sentence.doc_id, relation_id, "SEP_BY_[while]")
            if "whereas" == sentence.words[i].lemma:
                print_feature(sentence.doc_id, relation_id,
                              "SEP_BY_[whereas]")
            if sentence.words[i].lemma in ["no", "not", "neither", "nor"]:
                neg_word_index = i
        # Features for the negative words
        # TODO: We would probably need distant supervision for these
        if neg_word_index > -1:
            gene_p = None
            gene_l = 100
            for word in sentence.words[gene_start:gene_end + 1]:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    neg_word_index)
                if l < gene_l:
                    gene_p = p
                    gene_l = l
            if gene_p:
                print_feature(sentence.doc_id, relation_id,
                              inv + "NEG_[" + gene_p + "]")
            # pheno_p = None
            # pheno_l = 100
            # for word in sentence.words[pheno_start:pheno_end+1]:
            #     p = sentence.get_word_dep_path(
            #         word.in_sent_idx, neg_word_index)
            #     if len(p) < pheno_l:
            #         pheno_p = p
            #         pheno_l = len(p)
            # if pheno_p:
            #     print_feature(
            #         relation_id, inv + "pheno_TO_NEG_[" + pheno_p + "]")
        # The sequence of lemmas between the two mentions and the sequence
        # of lemmas between the two mentions but using the NERs, if
        # present, and the sequence of POSes between the mentions
        seq_list_ners = []
        seq_list_lemmas = []
        seq_list_poses = []
        for word in sentence.words[betw_start + 1:betw_end]:
            if word.ner != "O":
                seq_list_ners.append(word.ner)
            else:
                seq_list_ners.append(word.lemma)
            seq_list_lemmas.append(word.lemma)
            seq_list_poses.append(word.pos)
        seq_ners = " ".join(seq_list_ners)
        seq_lemmas = " ".join(seq_list_lemmas)
        seq_poses = "_".join(seq_list_poses)
        print_feature(sentence.doc_id, relation_id,
                      inv + "WORD_SEQ_[" + seq_lemmas + "]")
        print_feature(sentence.doc_id, relation_id,
                      inv + "WORD_SEQ_NER_[" + seq_ners + "]")
        print_feature(sentence.doc_id, relation_id,
                      inv + "POS_SEQ_[" + seq_poses + "]")
        # Shortest dependency path between the two mentions
        (path, length) = sentence.dep_path(gene_words, pheno_words)
        print_feature(sentence.doc_id, relation_id,
                      inv + "DEP_PATH_[" + path + "]")
    # Number of words between the mentions
    # TODO I think this should be some kind of supervision rule instead?
    # print_feature(sentence.doc_id, relation_id,
    #     inv + "WORD_SEQ_LEN_[" + str(betw_end - betw_start - 1) + "]")
    # 2-gram between the mentions
    # BUG FIX: the upper bound was written 'betw_start - betw_end - 1 < 15'
    # (always true, since betw_start <= betw_end); the intended window is
    # at most 15 words between the mentions.
    if betw_end - betw_start - 1 > 4 and betw_end - betw_start - 1 < 15:
        for i in range(betw_start + 1, betw_end - 1):
            print_feature(
                sentence.doc_id, relation_id,
                "BETW_2_GRAM_[" + sentence.words[i].lemma + "_" +
                sentence.words[i + 1].lemma + "]")
    # Lemmas on the exterior of the mentions and on the interior
    feature = inv
    if start > 0:
        feature += "EXT_NGRAM_[" + sentence.words[start - 1].lemma + "]"
    else:
        feature += "EXT_NGRAM_[NULL]"
    if end < len(sentence.words) - 1:
        feature += "_[" + sentence.words[end + 1].lemma + "]"
    else:
        feature += "_[NULL]"
    print_feature(sentence.doc_id, relation_id, feature)
    feature = inv + "INT_NGRAM_[" + sentence.words[betw_start + 1].lemma + \
        "]" + "_[" + sentence.words[betw_end - 1].lemma + "]"
    print_feature(sentence.doc_id, relation_id, feature)