def extract(sentence):
    """Extract gene mention candidates from a sentence.

    Every ``(start, end)`` phrase yielded by ``get_all_phrases_in_sentence``
    (called with ``max_mention_length``) is looked up in two dictionaries:

    * ``hpoterms_with_gene`` -- a hit yields a ``GENE_SUP_HPO`` mention whose
      ``is_correct`` is set to ``False`` (a negative example);
    * ``merged_genes_dict`` -- a hit, for phrases longer than one character,
      yields a ``GENE`` mention whose entity is the ``"|"``-joined sequence
      stored under that phrase.

    A single phrase may hit both dictionaries and produce two mentions.
    Word indexes covered by an emitted mention are recorded, and any later
    phrase overlapping them is skipped.

    Args:
        sentence: object exposing ``words``, a sequence of word objects
            that each carry a string ``word`` attribute.

    Returns:
        The list of mentions found, in discovery order. The list is empty
        when no word longer than two characters is found in
        ``english_dict`` (either as written or casefolded).
    """
    words = sentence.words

    # Skip the sentence if there are no English words in it.
    has_english_word = any(
        len(word.word) > 2 and
        (word.word in english_dict or
         word.word.casefold() in english_dict)
        for word in words)
    if not has_english_word:
        return []

    # Phrases of an all-uppercase sentence are casefolded before the
    # dictionary lookups. This may not be a great idea...
    sentence_is_upper = " ".join(word.word for word in words).isupper()

    mentions = []
    # Indexes of the words that already belong to a mention.
    history = set()
    # Scan all subsequences of the sentence of length up to max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # `end` is exclusive (the phrase is words[start:end]), so the phrase
        # covers exactly range(start, end). Testing `end` itself would look
        # at the word *after* the phrase and miss the phrase's last word.
        covered = range(start, end)
        if not history.isdisjoint(covered):
            continue

        phrase = " ".join(word.word for word in words[start:end])
        if sentence_is_upper:
            phrase = phrase.casefold()

        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative
        if phrase in hpoterms_with_gene:
            mention = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(mention, sentence)
            mention.is_correct = False
            mentions.append(mention)
            history.update(covered)

        # If the phrase is in the gene dictionary, then it is a mention
        # candidate
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have
            # the phrase as symbol. They're separated by "|".
            mention = Mention("GENE", "|".join(merged_genes_dict[phrase]),
                              words[start:end])
            add_features(mention, sentence)
            mentions.append(mention)
            # Record the indexes so they are not used for another mention.
            history.update(covered)
    return mentions
def extract(sentence):
    """Extract phenotype (HPO term) mention candidates from a sentence.

    The extraction runs in three stages:

    1. Phrases found in ``genes_with_hpoterm`` become ``HPOTERM_SUP_GENEL``
       mentions with ``is_correct`` set to ``False``.
    2. For every phrase that does not overlap an earlier mention, the set of
       stems of its words (ignoring symbol-only words and stopwords) is
       looked up in ``hpoterms_dict``. A hit becomes an ``HPOTERM`` mention
       whose entity is ``"<id>|<name>"``, built only from the words that
       contributed a distinct stem and lemma.
    3. If nothing was found and ``random.random() <= NEG_PROB``, a randomly
       drawn word whose POS tag starts with ``"NN"`` becomes an
       ``HPOTERM_SUP_rand`` mention with ``is_correct`` set to ``False``.

    Side effect: ``stem`` is set on every word of the sentence, including
    when the function returns early.

    Args:
        sentence: object exposing ``words``, a sequence of word objects
            with ``word``, ``lemma``, ``pos`` and ``in_sent_idx``
            attributes.

    Returns:
        The list of mentions found (possibly empty). The list is empty when
        no word longer than two characters is in ``english_dict`` (as
        written or casefolded).
    """
    mentions = []
    mention_ids = set()
    words = sentence.words

    # If there are no English words in the sentence, we skip it.
    no_english_words = True
    for word in words:
        # No early exit here: every word must get a stem, because the
        # stems are read by the second pass below.
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and (
                word.word in english_dict or
                word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions

    # Indexes of the words that already belong to a mention.
    history = set()

    # First pass: gene long names containing a phenotype name are turned
    # into candidates that we supervise as negative.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join(word.word for word in words[start:end])
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase, words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            history.update(word.in_sent_idx for word in words[start:end])

    # Matches words made only of underscores and non-word characters.
    symbols_only = re.compile(r"^(_|\W)+$")

    # Second pass: match the stem set of each remaining phrase.
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Skip phrases sharing any word with an earlier mention.
        if not history.isdisjoint(range(start, end)):
            continue
        phrase_words = words[start:end]
        # The stems in the phrase, not from stopwords or symbols.
        # Single-character words are never treated as stopwords.
        phrase_stems_set = frozenset(
            word.stem for word in phrase_words
            if not symbols_only.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue

        # Find the word objects that match: the first word for each stem,
        # also requiring a lemma that has not been used yet.
        mention_words = []
        seen_lemmas = set()
        seen_stems = set()
        for word in phrase_words:
            if word.stem not in phrase_stems_set or word.stem in seen_stems:
                continue
            lemma = word.lemma.casefold()
            if lemma in seen_lemmas:
                continue
            seen_lemmas.add(lemma)
            seen_stems.add(word.stem)
            mention_words.append(word)
            if len(mention_words) == len(phrase_stems_set):
                break

        # Take the first name, in iteration order, stored for this stem set.
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention("HPOTERM", hponames_to_ids[entity] + "|" + entity,
                          mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        mention_id = mention.id()
        if mention_id in mention_ids:
            continue
        mention_ids.add(mention_id)
        add_features(mention, sentence)
        mentions.append(mention)
        history.update(word.in_sent_idx for word in mention_words)

    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        last_index = len(words) - 1
        index = random.randint(0, last_index)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, last_index)
            tries -= 1
        if words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              words[index].lemma.casefold(),
                              words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
def extract(sentence):
    """Return the phenotype (HPO term) mention candidates of a sentence.

    Three kinds of mentions can be produced:

    * ``HPOTERM_SUP_GENEL`` -- the phrase text is in ``genes_with_hpoterm``;
      ``is_correct`` is set to ``False``.
    * ``HPOTERM`` -- the frozenset of stems of the phrase's words (excluding
      symbol-only words and stopwords) is a key of ``hpoterms_dict``. The
      entity is ``hponames_to_ids[name] + "|" + name`` and the mention
      keeps only the words contributing a new stem and a new lemma.
    * ``HPOTERM_SUP_rand`` -- produced only when no other mention was found
      and ``random.random() <= NEG_PROB``: a randomly drawn word whose POS
      tag starts with ``"NN"``; ``is_correct`` is set to ``False``.

    As a side effect, every word in ``sentence.words`` gets a ``stem``
    attribute computed with ``stemmer.stem``.

    Args:
        sentence: object exposing ``words``, a sequence of word objects
            with ``word``, ``lemma``, ``pos`` and ``in_sent_idx``
            attributes.

    Returns:
        The list of mentions, empty when the sentence has no word longer
        than two characters that is in ``english_dict`` (as written or
        casefolded).
    """
    mentions = []
    mention_ids = set()

    # If there are no English words in the sentence, we skip it.
    found_english_word = False
    for word in sentence.words:
        # Computed for all words (no early exit) so every word has a stem.
        word.stem = stemmer.stem(word.word)
        if len(word.word) <= 2:
            continue
        if word.word in english_dict or word.word.casefold() in english_dict:
            found_english_word = True
    if not found_english_word:
        return mentions

    # Indexes of the words already used by a mention.
    history = set()

    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join([word.word for word in sentence.words[start:end]])
        # If the phrase is a gene long name containing a phenotype name,
        # create a candidate that we supervise as negative
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase,
                              sentence.words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            for word in sentence.words[start:end]:
                history.add(word.in_sent_idx)

    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # A phrase sharing any word with an earlier mention is ignored.
        if any(i in history for i in range(start, end)):
            continue
        candidates = sentence.words[start:end]

        # The stems in the phrase, not from stopwords or symbols.
        # Raw string: the pattern contains the backslash escape for
        # "non-word character", which must reach the regex engine as is.
        phrase_stems = []
        for word in candidates:
            if re.match(r"^(_|\W)+$", word.word):
                continue
            if len(word.word) == 1 or \
                    word.lemma.casefold() not in stopwords_dict:
                phrase_stems.append(word.stem)
        phrase_stems_set = frozenset(phrase_stems)
        if phrase_stems_set not in hpoterms_dict:
            continue

        # Find the word objects that match
        mention_words = []
        mention_lemmas = []
        mention_stems = []
        for word in candidates:
            if word.stem in phrase_stems_set and \
                    word.lemma.casefold() not in mention_lemmas and \
                    word.stem not in mention_stems:
                mention_lemmas.append(word.lemma.casefold())
                mention_words.append(word)
                mention_stems.append(word.stem)
                # Every stem of the set is represented: stop looking.
                if len(mention_words) == len(phrase_stems_set):
                    break

        # First name, in iteration order, registered for this stem set.
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention(
            "HPOTERM", hponames_to_ids[entity] + "|" + entity,
            mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        # Features
        add_features(mention, sentence)
        mentions.append(mention)
        for word in mention_words:
            history.add(word.in_sent_idx)

    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        index = random.randint(0, len(sentence.words) - 1)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while tries > 0 and not sentence.words[index].pos.startswith("NN"):
            index = random.randint(0, len(sentence.words) - 1)
            tries -= 1
        if sentence.words[index].pos.startswith("NN"):
            mention = Mention(
                "HPOTERM_SUP_rand", sentence.words[index].lemma.casefold(),
                sentence.words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions