def supervise(mentions, sentence): phrase = " ".join([x.word for x in sentence.words]) new_mentions = [] for mention in mentions: new_mentions.append(mention) if mention.is_correct is not None: continue # The candidate is a long name. if " ".join([word.word for word in mention.words]) in \ inverted_long_names: mention.is_correct = True mention.type = "GENE_SUP_long" continue # The candidate is a MIM entry if mention.words[0].word == "MIM": mention_word_idx = mention.words[0].in_sent_idx if mention_word_idx < len(sentence.words) - 1: next_word = sentence.words[mention_word_idx + 1].word if next_word.casefold() in ["no", "no.", "#", ":"] and \ mention_word_idx + 2 < len(sentence.words): next_word = sentence.words[mention_word_idx + 2].word try: int(next_word) mention.is_correct = False mention.type = "GENE_SUP_MIM" continue except ValueError: pass # The phrase starts with words that are indicative of the candidate not # being a mention of a gene # We add a feature for this, as it is a context property if phrase.startswith("Performed the experiments :") or \ phrase.startswith("Wrote the paper :") or \ phrase.startswith("W'rote the paper :") or \ phrase.startswith("Wlrote the paper") or \ phrase.startswith("Contributed reagents") or \ phrase.startswith("Analyzed the data :") or \ phrase.casefold().startswith("address"): # An unsupervised copy with the special feature unsuper_enriched = Mention( "GENE_dontsup", mention.entity, mention.words) unsuper_enriched.features = mention.features.copy() unsuper_enriched.add_feature("IN_CONTRIB_PHRASE") new_mentions.append(unsuper_enriched) # This candidate contain only the 'special' feature. super_spec = Mention( "GENE_SUP_contr_2", mention.entity, mention.words) super_spec.is_correct = False super_spec.add_feature("IN_CONTRIB_PHRASE") new_mentions.append(super_spec) # Set is_correct and type. mention.is_correct = False mention.type = "GENE_SUP_contr_1" continue # The candidate is an entry in Gene Ontology if len(mention.words) == 1 and mention.words[0].word == "GO": try: if sentence.words[mention.words[0].in_sent_idx + 1][0] == ":": mention.is_correct = False mention.type = "GENE_SUP_go" except: pass continue # Index of the word on the left idx = mention.wordidxs[0] - 1 if idx >= 0: # The candidate is preceded by a "%" (it's probably a quantity) if sentence.words[idx].word == "%": mention.is_correct = False mention.type = "GENE_SUP_%" continue # The candidate comes after a "document element" (e.g., table, or # figure) if sentence.words[idx].word.casefold() in DOC_ELEMENTS: mention.is_correct = False mention.type = "GENE_SUP_doc" continue # The candidate comes after an "individual" word (e.g., # "individual") if sentence.words[idx].word.casefold() in INDIVIDUALS and \ not mention.words[0].word.isalpha() and \ not len(mention.words[0].word) > 4: mention.is_correct = False mention.type = "GENE_SUP_indiv" continue # The candidate comes after a "type" word, and it is made only of # the letters "I" and "V" if sentence.words[idx].lemma.casefold() in TYPES and \ set(mention.words[0].word).issubset(set(["I", "V"])): mention.is_correct = False mention.type = "GENE_SUP_type" continue # Index of the word on the right idx = mention.wordidxs[-1] + 1 if idx < len(sentence.words): # The candidate is followed by a "=" (it's probably a quantity) if sentence.words[idx].word == "=": mention.is_correct = False mention.type = "GENE_SUP_=" continue # The candidate is followed by a ":" and the word after it is a # number (it's probably a quantity) if sentence.words[idx].word == ":": try: float(sentence.words[idx + 1].word) mention.is_correct = False mention.type = "GENE_SUP_:" except: # both ValueError and IndexError pass continue # The candidate comes before "et" if sentence.words[idx].word == "et": mention.is_correct = False mention.type = "GENE_SUP_et" continue # The candidate is a DNA triplet # We check this by looking at whether the word before or after is also # a DNA triplet. if len(mention.words) == 1 and len(mention.words[0].word) == 3 and \ set(mention.words[0].word) <= set("ACGT"): done = False idx = mention.wordidxs[0] - 1 if idx > 0: if set(sentence.words[idx].word) <= set("ACGT"): mention.is_correct = False mention.type = "GENE_SUP_dna" continue idx = mention.wordidxs[-1] + 1 if not done and idx < len(sentence.words): if set(sentence.words[idx].word) <= set("ACGT"): mention.is_correct = False mention.type = "GENE_SUP_dna" continue # If it's "II", it's most probably wrong. if mention.words[0].word == "II": mention.is_correct = False mention.type = "GENE_SUP_ii" continue # Snowball positive features # Commented out to avoid overfitting # if mention.features & snowball_pos_feats: # supervised = Mention("GENE_SUP", mention.entity, # mention.words) # supervised.features = mention.features - snowball_pos_feats # supervised.is_correct = True # new_mentions.append(supervised) # supervised2 = Mention("GENE_SUP", mention.entity, # mention.words) # supervised2.features = mention.features & snowball_pos_feats # supervised2.is_correct = True # new_mentions.append(supervised2) # continue # Some negative features # if "EXT_KEYWORD_MIN_[chromosome]@nn" in mention.features: # supervised = Mention("GENE_SUP", mention.entity, mention.words) # supervised.features = mention.features.copy() # supervised.is_correct = False # new_mentions.append(supervised) # continue # if "IS_YEAR_RIGHT" in mention.features: # supervised = Mention("GENE_SUP", mention.entity, mention.words) # supervised.features = mention.features.copy() # supervised.is_correct = False # new_mentions.append(supervised) # continue # The candidate comes after an organization, or a location, or a # person. We skip commas as they may trick us. comes_after = None loc_idx = mention.wordidxs[0] - 1 while loc_idx >= 0 and sentence.words[loc_idx].lemma == ",": loc_idx -= 1 if loc_idx >= 0 and \ sentence.words[loc_idx].ner in \ ["ORGANIZATION", "LOCATION", "PERSON"] and \ sentence.words[loc_idx].word not in merged_genes_dict: comes_after = sentence.words[loc_idx].ner # The candidate comes before an organization, or a location, or a # person. We skip commas, as they may trick us. comes_before = None loc_idx = mention.wordidxs[-1] + 1 while loc_idx < len(sentence.words) and \ sentence.words[loc_idx].lemma == ",": loc_idx += 1 if loc_idx < len(sentence.words) and sentence.words[loc_idx].ner in \ ["ORGANIZATION", "LOCATION", "PERSON"] and \ sentence.words[loc_idx].word not in merged_genes_dict: comes_before = sentence.words[loc_idx].ner # Not correct if it's most probably a person name. if comes_before and comes_after: mention.is_correct = False mention.type = "GENE_SUP_name" continue # Comes after person and before "," or ":", so it's probably a person # name if comes_after == "PERSON" and \ mention.words[-1].in_sent_idx + 1 < len(sentence.words) and \ sentence.words[mention.words[-1].in_sent_idx + 1].word \ in [",", ":"]: mention.is_correct = False mention.type = "GENE_SUP_name2" continue if comes_after == "PERSON" and mention.words[0].ner == "PERSON": mention.is_correct = False mention.type = "GENE_SUP_name3" continue # Is a location and comes before a location so it's probably wrong if comes_before == "LOCATION" and mention.words[0].ner == "LOCATION": mention.is_correct = False mention.type = "GENE_SUP_loc" continue return new_mentions
def supervise(mentions, sentence): phrase = " ".join([x.word for x in sentence.words]) new_mentions = [] for mention in mentions: new_mentions.append(mention) if mention.is_correct is not None: continue # The candidate is a long name. if " ".join([word.word for word in mention.words]) in \ inverted_long_names: mention.is_correct = True mention.type = "GENE_SUP_long" continue # The candidate is a MIM entry if mention.words[0].word == "MIM": mention_word_idx = mention.words[0].in_sent_idx if mention_word_idx < len(sentence.words) - 1: next_word = sentence.words[mention_word_idx + 1].word if next_word.casefold() in ["no", "no.", "#", ":"] and \ mention_word_idx + 2 < len(sentence.words): next_word = sentence.words[mention_word_idx + 2].word try: int(next_word) mention.is_correct = False mention.type = "GENE_SUP_MIM" continue except ValueError: pass # The phrase starts with words that are indicative of the candidate not # being a mention of a gene # We add a feature for this, as it is a context property if phrase.startswith("Performed the experiments :") or \ phrase.startswith("Wrote the paper :") or \ phrase.startswith("W'rote the paper :") or \ phrase.startswith("Wlrote the paper") or \ phrase.startswith("Contributed reagents") or \ phrase.startswith("Analyzed the data :") or \ phrase.casefold().startswith("address"): # An unsupervised copy with the special feature unsuper_enriched = Mention("GENE_dontsup", mention.entity, mention.words) unsuper_enriched.features = mention.features.copy() unsuper_enriched.add_feature("IN_CONTRIB_PHRASE") new_mentions.append(unsuper_enriched) # This candidate contain only the 'special' feature. super_spec = Mention("GENE_SUP_contr_2", mention.entity, mention.words) super_spec.is_correct = False super_spec.add_feature("IN_CONTRIB_PHRASE") new_mentions.append(super_spec) # Set is_correct and type. mention.is_correct = False mention.type = "GENE_SUP_contr_1" continue # The candidate is an entry in Gene Ontology if len(mention.words) == 1 and mention.words[0].word == "GO": try: if sentence.words[mention.words[0].in_sent_idx + 1][0] == ":": mention.is_correct = False mention.type = "GENE_SUP_go" except: pass continue # Index of the word on the left idx = mention.wordidxs[0] - 1 if idx >= 0: # The candidate is preceded by a "%" (it's probably a quantity) if sentence.words[idx].word == "%": mention.is_correct = False mention.type = "GENE_SUP_%" continue # The candidate comes after a "document element" (e.g., table, or # figure) if sentence.words[idx].word.casefold() in DOC_ELEMENTS: mention.is_correct = False mention.type = "GENE_SUP_doc" continue # The candidate comes after an "individual" word (e.g., # "individual") if sentence.words[idx].word.casefold() in INDIVIDUALS and \ not mention.words[0].word.isalpha() and \ not len(mention.words[0].word) > 4: mention.is_correct = False mention.type = "GENE_SUP_indiv" continue # The candidate comes after a "type" word, and it is made only of # the letters "I" and "V" if sentence.words[idx].lemma.casefold() in TYPES and \ set(mention.words[0].word).issubset(set(["I", "V"])): mention.is_correct = False mention.type = "GENE_SUP_type" continue # Index of the word on the right idx = mention.wordidxs[-1] + 1 if idx < len(sentence.words): # The candidate is followed by a "=" (it's probably a quantity) if sentence.words[idx].word == "=": mention.is_correct = False mention.type = "GENE_SUP_=" continue # The candidate is followed by a ":" and the word after it is a # number (it's probably a quantity) if sentence.words[idx].word == ":": try: float(sentence.words[idx + 1].word) mention.is_correct = False mention.type = "GENE_SUP_:" except: # both ValueError and IndexError pass continue # The candidate comes before "et" if sentence.words[idx].word == "et": mention.is_correct = False mention.type = "GENE_SUP_et" continue # The candidate is a DNA triplet # We check this by looking at whether the word before or after is also # a DNA triplet. if len(mention.words) == 1 and len(mention.words[0].word) == 3 and \ set(mention.words[0].word) <= set("ACGT"): done = False idx = mention.wordidxs[0] - 1 if idx > 0: if set(sentence.words[idx].word) <= set("ACGT"): mention.is_correct = False mention.type = "GENE_SUP_dna" continue idx = mention.wordidxs[-1] + 1 if not done and idx < len(sentence.words): if set(sentence.words[idx].word) <= set("ACGT"): mention.is_correct = False mention.type = "GENE_SUP_dna" continue # If it's "II", it's most probably wrong. if mention.words[0].word == "II": mention.is_correct = False mention.type = "GENE_SUP_ii" continue # Snowball positive features # Commented out to avoid overfitting # if mention.features & snowball_pos_feats: # supervised = Mention("GENE_SUP", mention.entity, # mention.words) # supervised.features = mention.features - snowball_pos_feats # supervised.is_correct = True # new_mentions.append(supervised) # supervised2 = Mention("GENE_SUP", mention.entity, # mention.words) # supervised2.features = mention.features & snowball_pos_feats # supervised2.is_correct = True # new_mentions.append(supervised2) # continue # Some negative features # if "EXT_KEYWORD_MIN_[chromosome]@nn" in mention.features: # supervised = Mention("GENE_SUP", mention.entity, mention.words) # supervised.features = mention.features.copy() # supervised.is_correct = False # new_mentions.append(supervised) # continue # if "IS_YEAR_RIGHT" in mention.features: # supervised = Mention("GENE_SUP", mention.entity, mention.words) # supervised.features = mention.features.copy() # supervised.is_correct = False # new_mentions.append(supervised) # continue # The candidate comes after an organization, or a location, or a # person. We skip commas as they may trick us. comes_after = None loc_idx = mention.wordidxs[0] - 1 while loc_idx >= 0 and sentence.words[loc_idx].lemma == ",": loc_idx -= 1 if loc_idx >= 0 and \ sentence.words[loc_idx].ner in \ ["ORGANIZATION", "LOCATION", "PERSON"] and \ sentence.words[loc_idx].word not in merged_genes_dict: comes_after = sentence.words[loc_idx].ner # The candidate comes before an organization, or a location, or a # person. We skip commas, as they may trick us. comes_before = None loc_idx = mention.wordidxs[-1] + 1 while loc_idx < len(sentence.words) and \ sentence.words[loc_idx].lemma == ",": loc_idx += 1 if loc_idx < len(sentence.words) and sentence.words[loc_idx].ner in \ ["ORGANIZATION", "LOCATION", "PERSON"] and \ sentence.words[loc_idx].word not in merged_genes_dict: comes_before = sentence.words[loc_idx].ner # Not correct if it's most probably a person name. if comes_before and comes_after: mention.is_correct = False mention.type = "GENE_SUP_name" continue # Comes after person and before "," or ":", so it's probably a person # name if comes_after == "PERSON" and \ mention.words[-1].in_sent_idx + 1 < len(sentence.words) and \ sentence.words[mention.words[-1].in_sent_idx + 1].word \ in [",", ":"]: mention.is_correct = False mention.type = "GENE_SUP_name2" continue if comes_after == "PERSON" and mention.words[0].ner == "PERSON": mention.is_correct = False mention.type = "GENE_SUP_name3" continue # Is a location and comes before a location so it's probably wrong if comes_before == "LOCATION" and mention.words[0].ner == "LOCATION": mention.is_correct = False mention.type = "GENE_SUP_loc" continue return new_mentions