def extract(sentence):
    """Extract gene mention candidates from a sentence.

    Every phrase yielded by ``get_all_phrases_in_sentence`` (bounded by
    ``max_mention_length``) is looked up in two dictionaries:

    * ``hpoterms_with_gene``: a hit creates a ``GENE_SUP_HPO`` mention that
      is supervised as negative (``is_correct = False``);
    * ``merged_genes_dict``: a hit on a phrase longer than one character
      creates a ``GENE`` mention whose entity is the "|"-joined list of
      symbols stored for that phrase.

    Args:
        sentence: object exposing ``words``, a sequence of items that have
            a ``word`` string attribute.

    Returns:
        The list of mentions created, in the order they were found. The
        list is empty when no word longer than two characters is found in
        ``english_dict``.
    """
    words = sentence.words
    # Skip the sentence if there are no English words in it (only words
    # longer than two characters are checked against the dictionary)
    has_english_word = any(
        len(word.word) > 2 and
        (word.word in english_dict or word.word.casefold() in english_dict)
        for word in words)
    if not has_english_word:
        return []

    # Phrases from an all-uppercase sentence are casefolded before the
    # dictionary lookups. This may not be a great idea...
    sentence_is_upper = " ".join(word.word for word in words).isupper()

    mentions = []
    # Word indexes that are already part of a mention
    history = set()
    # Scan all subsequences of the sentence of length up to max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # `end` is exclusive (the phrase is words[start:end]), so the words
        # of the phrase are start..end-1: skip the phrase if any of them is
        # already used by a mention
        if not history.isdisjoint(range(start, end)):
            continue
        phrase = " ".join(word.word for word in words[start:end])
        if sentence_is_upper:
            phrase = phrase.casefold()
        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative
        if phrase in hpoterms_with_gene:
            mention = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(mention, sentence)
            mention.is_correct = False
            mentions.append(mention)
            history.update(range(start, end))
        # If the phrase is in the gene dictionary, then it is a mention
        # candidate. This check is not exclusive with the previous one: a
        # phrase present in both dictionaries yields both mentions.
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have
            # the phrase as symbol. They're separated by "|".
            mention = Mention("GENE",
                              "|".join(merged_genes_dict[phrase]),
                              words[start:end])
            add_features(mention, sentence)
            mentions.append(mention)
            # Record the indexes so that they are not used for another
            # mention
            history.update(range(start, end))
    return mentions
def extract(sentence):
    """Return the gene mention candidates found in ``sentence``.

    The phrases produced by ``get_all_phrases_in_sentence`` (limited by
    ``max_mention_length``) are checked against ``hpoterms_with_gene`` and
    ``merged_genes_dict``. A hit in the former creates a ``GENE_SUP_HPO``
    mention supervised as negative (``is_correct = False``); a hit in the
    latter, for phrases longer than one character, creates a ``GENE``
    mention whose entity is the "|"-joined value stored for the phrase.

    Args:
        sentence: object exposing ``words``, a sequence of items that have
            a ``word`` string attribute.

    Returns:
        A list with the mentions created, in discovery order; an empty list
        when no word longer than two characters is in ``english_dict``.
    """
    words = sentence.words

    def is_english(token):
        # Only words longer than two characters count as evidence
        return len(token) > 2 and (
            token in english_dict or token.casefold() in english_dict)

    # Skip the sentence if there are no English words in the sentence
    if not any(is_english(word.word) for word in words):
        return []

    # When the whole sentence is uppercase the phrases are casefolded before
    # being looked up. This may not be a great idea...
    sentence_is_upper = " ".join(word.word for word in words).isupper()

    mentions = []
    # Indexes of the words that already belong to a mention
    history = set()
    # Scan all subsequences of the sentence of length up to max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        covered = range(start, end)
        # The phrase is words[start:end], i.e. `end` is exclusive: skip the
        # phrase when any of its own words (start..end-1) is already taken
        if not history.isdisjoint(covered):
            continue
        phrase = " ".join(word.word for word in words[start:end])
        if sentence_is_upper:
            phrase = phrase.casefold()
        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative
        if phrase in hpoterms_with_gene:
            mention = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(mention, sentence)
            mention.is_correct = False
            mentions.append(mention)
            history.update(covered)
        # If the phrase is in the gene dictionary, then it is a mention
        # candidate (also when the previous check matched the same phrase)
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have
            # the phrase as symbol. They're separated by "|".
            mention = Mention("GENE", "|".join(merged_genes_dict[phrase]),
                              words[start:end])
            add_features(mention, sentence)
            mentions.append(mention)
            # Add indexes to history so that they are not used for another
            # mention
            history.update(covered)
    return mentions
# Example no. 3
# 0
def extract(sentence):
    """Extract HPO term mention candidates from a sentence.

    Three kinds of mentions are produced:

    * ``HPOTERM_SUP_GENEL``: the phrase is in ``genes_with_hpoterm`` (a gene
      long name containing a phenotype name); supervised as negative.
    * ``HPOTERM``: the set of stems of the phrase (symbols and stopwords
      excluded) is a key of ``hpoterms_dict``.
    * ``HPOTERM_SUP_rand``: with probability ``NEG_PROB``, a randomly picked
      noun of a sentence that produced no other mention; supervised as
      negative.

    Side effect: sets ``word.stem`` on every word of the sentence.

    Args:
        sentence: object exposing ``words``, a sequence of items with the
            attributes ``word``, ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        The list of mentions created; empty when no word longer than two
        characters is found in ``english_dict``.
    """
    mentions = []
    mention_ids = set()
    # If there are no English words in the sentence, we skip it.
    no_english_words = True
    for word in sentence.words:
        # No early exit from this loop: every word must get its stem
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions
    # Indexes (in_sent_idx) of the words already used for a mention
    history = set()
    # First pass over each phrase of length at most max_mention_length:
    # if the phrase is a gene long name containing a phenotype name, create
    # a candidate that we supervise as negative
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join(word.word for word in sentence.words[start:end])
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase,
                              sentence.words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            history.update(
                word.in_sent_idx for word in sentence.words[start:end])
    # Matches words made only of underscores and non-word characters. Raw
    # string: "\W" is an invalid escape sequence in a plain string literal.
    symbol_re = re.compile(r"^(_|\W)+$")
    # Second pass over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Skip the phrase if any of its indexes was already used
        if not history.isdisjoint(range(start, end)):
            continue
        # The set of stems in the phrase (not from stopwords or symbols);
        # single-character words are kept without checking the stopwords
        phrase_stems_set = frozenset(
            word.stem for word in sentence.words[start:end]
            if not symbol_re.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue
        # Find the word objects of the match: the first word for each stem,
        # never reusing a lemma or a stem
        mention_words = []
        mention_lemmas = set()
        mention_stems = set()
        for word in sentence.words[start:end]:
            if word.stem not in phrase_stems_set:
                continue
            lemma = word.lemma.casefold()
            if lemma in mention_lemmas or word.stem in mention_stems:
                continue
            mention_lemmas.add(lemma)
            mention_words.append(word)
            mention_stems.add(word.stem)
            if len(mention_words) == len(phrase_stems_set):
                break
        # Several names may share the same set of stems: take the first one
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention("HPOTERM",
                          hponames_to_ids[entity] + "|" + entity,
                          mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        # Features
        add_features(mention, sentence)
        mentions.append(mention)
        history.update(word.in_sent_idx for word in mention_words)
    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        last_index = len(sentence.words) - 1
        index = random.randint(0, last_index)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not sentence.words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, last_index)
            tries -= 1
        if sentence.words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              sentence.words[index].lemma.casefold(),
                              sentence.words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
def extract(sentence):
    """Return the HPO term mention candidates found in ``sentence``.

    The sentence is scanned twice over the phrases produced by
    ``get_all_phrases_in_sentence`` (limited by ``max_mention_length``):

    1. phrases found in ``genes_with_hpoterm`` (gene long names containing
       a phenotype name) become ``HPOTERM_SUP_GENEL`` mentions, supervised
       as negative;
    2. phrases whose set of stems (symbols and stopwords excluded) is a key
       of ``hpoterms_dict`` become ``HPOTERM`` mentions.

    If neither pass produced a mention, then with probability ``NEG_PROB``
    a randomly chosen noun becomes a ``HPOTERM_SUP_rand`` mention,
    supervised as negative.

    Side effect: sets ``word.stem`` on every word of the sentence.

    Args:
        sentence: object exposing ``words``, a sequence of items with the
            attributes ``word``, ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        A list with the mentions created; an empty list when no word longer
        than two characters is in ``english_dict``.
    """
    mentions = []
    mention_ids = set()
    sent_words = sentence.words
    # If there are no English words in the sentence, we skip it.
    no_english_words = True
    for word in sent_words:
        # Done for every word (no early exit) so that all words have a stem
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions
    # in_sent_idx of the words that are already part of a mention
    history = set()
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join(word.word for word in sent_words[start:end])
        # If the phrase is a gene long name containing a phenotype name,
        # create a candidate that we supervise as negative
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL",
                              phrase,
                              sent_words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            history.update(word.in_sent_idx for word in sent_words[start:end])
    # Words consisting only of underscores and non-word characters are
    # symbols. The pattern is a raw string because "\W" is an invalid escape
    # sequence in a plain string literal.
    only_symbols = re.compile(r"^(_|\W)+$")
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Phrases touching an index already used for a mention are skipped
        if not history.isdisjoint(range(start, end)):
            continue
        phrase_words = sent_words[start:end]
        # The stems in the phrase (not from stopwords or symbols);
        # one-character words are never tested against the stopwords
        phrase_stems_set = frozenset(
            word.stem for word in phrase_words
            if not only_symbols.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict))
        if phrase_stems_set not in hpoterms_dict:
            continue
        # Find the word objects of that match: one word per stem, without
        # repeating a lemma or a stem
        mention_words = []
        mention_lemmas = set()
        mention_stems = set()
        for word in phrase_words:
            if word.stem not in phrase_stems_set:
                continue
            lemma = word.lemma.casefold()
            if lemma in mention_lemmas or word.stem in mention_stems:
                continue
            mention_lemmas.add(lemma)
            mention_words.append(word)
            mention_stems.add(word.stem)
            if len(mention_words) == len(phrase_stems_set):
                break
        # The first name stored for this set of stems is the entity
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention(
            "HPOTERM", hponames_to_ids[entity] + "|" + entity,
            mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        # Features
        add_features(mention, sentence)
        mentions.append(mention)
        history.update(word.in_sent_idx for word in mention_words)
    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        index = random.randint(0, len(sent_words) - 1)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not sent_words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, len(sent_words) - 1)
            tries -= 1
        if sent_words[index].pos.startswith("NN"):
            mention = Mention(
                "HPOTERM_SUP_rand", sent_words[index].lemma.casefold(),
                sent_words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions