コード例 #1
0
def extract(sentence):
    """Extract gene mention candidates from a sentence.

    Every (start, end) span yielded by ``get_all_phrases_in_sentence`` is
    joined into a phrase and looked up in two dictionaries:

    * ``hpoterms_with_gene``: the phrase becomes a ``GENE_SUP_HPO``
      candidate supervised as negative (``is_correct = False``);
    * ``merged_genes_dict``: the phrase becomes a ``GENE`` candidate whose
      entity is the "|"-joined list stored for that phrase.

    The two lookups are independent, so one phrase can produce both
    candidates.  Word indexes used by a candidate are remembered, and later
    spans that start or end on one of them are skipped.

    Args:
        sentence: object exposing ``words``, a sequence of word objects that
            each carry a ``word`` string.

    Returns:
        The list of candidates, or an empty list when no word longer than two
        characters is found in ``english_dict`` (as written or casefolded).
    """
    words = sentence.words
    # Skip the sentence if there are no English words in it
    has_english_word = any(
        len(w.word) > 2 and
        (w.word in english_dict or w.word.casefold() in english_dict)
        for w in words)
    if not has_english_word:
        return []

    mentions = []
    # Phrases from an all-uppercase sentence are looked up casefolded.
    # This may not be a great idea...
    sentence_is_upper = " ".join([w.word for w in words]).isupper()
    # Indexes of the words that already belong to a mention
    history = set()
    # Scan all subsequences of the sentence of length up to max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # `end` is exclusive (the phrase is words[start:end]), so the last
        # word of the span sits at end - 1; testing `end` itself would look
        # at the word *after* the phrase.
        if start in history or end - 1 in history:
            continue
        phrase = " ".join([w.word for w in words[start:end]])
        if sentence_is_upper:
            phrase = phrase.casefold()
        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative
        if phrase in hpoterms_with_gene:
            mention = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(mention, sentence)
            mention.is_correct = False
            mentions.append(mention)
            history.update(range(start, end))
        # If the phrase is in the gene dictionary, then it is a mention
        # candidate
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have the
            # phrase as symbol. They're separated by "|".
            mention = Mention("GENE",
                              "|".join(merged_genes_dict[phrase]),
                              words[start:end])
            add_features(mention, sentence)
            mentions.append(mention)
            # Mark the words as used so that they do not start or end another
            # mention
            history.update(range(start, end))
    return mentions
コード例 #2
0
def extract(sentence):
    """Return the gene mention candidates found in ``sentence``.

    The sentence is ignored (an empty list is returned) unless at least one
    of its words is longer than two characters and present in
    ``english_dict``, either as written or casefolded.

    Otherwise each (start, end) span from ``get_all_phrases_in_sentence`` is
    turned into a space-joined phrase, casefolded when the whole sentence is
    uppercase, and checked against:

    * ``hpoterms_with_gene`` -> a ``GENE_SUP_HPO`` candidate with
      ``is_correct`` set to False;
    * ``merged_genes_dict`` -> a ``GENE`` candidate whose entity is the
      "|"-joined value stored for the phrase.

    Both checks run for every phrase.  The indexes of the words of each
    candidate are recorded, and spans whose first or last word is already
    recorded are skipped.

    Args:
        sentence: object with a ``words`` sequence; each element has a
            ``word`` string attribute.

    Returns:
        A list of ``Mention`` candidates (possibly empty).
    """
    mentions = []
    # Skip the sentence if there are no English words in the sentence
    no_english_words = True
    for word in sentence.words:
        if len(word.word) <= 2:
            continue
        if word.word in english_dict or word.word.casefold() in english_dict:
            no_english_words = False
            break
    if no_english_words:
        return []

    words = sentence.words
    sentence_is_upper = " ".join([x.word for x in words]).isupper()
    # The following set keeps the indexes we already looked at and which
    # contained a mention
    history = set()
    # Scan all subsequences of the sentence of length up to max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # The span is half-open: its last word is words[end - 1].  Checking
        # `end` would inspect the word following the phrase instead.
        last = end - 1
        if start in history or last in history:
            continue
        phrase = " ".join([word.word for word in words[start:end]])
        if sentence_is_upper:  # This may not be a great idea...
            phrase = phrase.casefold()
        # If the phrase is a hpoterm name containing a gene, then it is a
        # mention candidate to supervise as negative
        if phrase in hpoterms_with_gene:
            negative = Mention("GENE_SUP_HPO", phrase, words[start:end])
            add_features(negative, sentence)
            negative.is_correct = False
            mentions.append(negative)
            history.update(range(start, end))
        # If the phrase is in the gene dictionary, then it is a mention
        # candidate
        if len(phrase) > 1 and phrase in merged_genes_dict:
            # The entity is a list of all the main symbols that could have the
            # phrase as symbol. They're separated by "|".
            candidate = Mention("GENE", "|".join(merged_genes_dict[phrase]),
                                words[start:end])
            add_features(candidate, sentence)
            mentions.append(candidate)
            # Add indexes to history so that they are not used for another
            # mention
            history.update(range(start, end))
    return mentions
コード例 #3
0
     no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
     TSVstring2list, TSVstring2list, TSVstring2list, TSVstring2list,
     lambda x: TSVstring2list(x, int), TSVstring2list, no_op,
     lambda x: TSVstring2list(x, int), TSVstring2bool, no_op, no_op,
     lambda x: TSVstring2list(x, int), TSVstring2bool, no_op
 ])
 # Create the sentence object where the two mentions appear
 sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                     line_dict["wordidxs"], line_dict["words"],
                     line_dict["poses"], line_dict["ners"],
                     line_dict["lemmas"], line_dict["dep_paths"],
                     line_dict["dep_parents"],
                     line_dict["bounding_boxes"])
 # Create the mentions
 gene_1_mention = Mention(
     "GENE", line_dict["gene_1_entity"],
     [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
 gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
 gene_1_mention.type = line_dict["gene_1_type"]
 gene_2_mention = Mention(
     "GENE", line_dict["gene_2_entity"],
     [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
 gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
 gene_2_mention.type = line_dict["gene_2_type"]
 # If the word indexes do not overlap, create the relation candidate
 # TODO there may be other cases. Check with Emily.
 if not set(line_dict["gene_1_wordidxs"]) & \
         set(line_dict["gene_2_wordidxs"]):
     relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)
     # Add features
     add_features(relation, gene_1_mention, gene_2_mention,
コード例 #4
0
def supervise(mentions, sentence):
    """Apply distant-supervision rules to gene mention candidates.

    Candidates whose ``is_correct`` is already set are passed through
    untouched.  Every other candidate is run through an ordered list of
    rules; when a rule fires it sets ``is_correct`` and ``type`` on the
    candidate and processing moves on to the next candidate.  Candidates
    matching no rule keep ``is_correct = None``.

    Args:
        mentions: iterable of candidates; they are modified in place.
        sentence: the sentence the candidates belong to; only its ``words``
            sequence is read.

    Returns:
        A new list holding every input candidate in order.  For a candidate
        in a "contribution"/"address" sentence, two extra mentions
        (``GENE_dontsup`` and ``GENE_SUP_contr_2``) are inserted right after
        it.
    """
    words = sentence.words
    num_words = len(words)
    phrase = " ".join([x.word for x in words])
    # The phrase starts with words that are indicative of the candidate not
    # being a mention of a gene.  This depends only on the sentence, so it is
    # evaluated once; str.startswith accepts a tuple of prefixes.
    in_contrib_phrase = phrase.startswith((
        "Performed the experiments :",
        "Wrote the paper :",
        "W'rote the paper :",
        "Wlrote the paper",
        "Contributed reagents",
        "Analyzed the data :")) or phrase.casefold().startswith("address")
    dna_letters = set("ACGT")
    entity_ners = ["ORGANIZATION", "LOCATION", "PERSON"]
    new_mentions = []
    for mention in mentions:
        new_mentions.append(mention)
        if mention.is_correct is not None:
            continue
        # The candidate is a long name.
        if " ".join([word.word for word in mention.words]) in \
                inverted_long_names:
            mention.is_correct = True
            mention.type = "GENE_SUP_long"
            continue
        # The candidate is a MIM entry: "MIM" followed by an integer,
        # optionally with one of "no", "no.", "#", ":" in between.
        if mention.words[0].word == "MIM":
            mention_word_idx = mention.words[0].in_sent_idx
            if mention_word_idx < num_words - 1:
                next_word = words[mention_word_idx + 1].word
                if next_word.casefold() in ["no", "no.", "#", ":"] and \
                        mention_word_idx + 2 < num_words:
                    next_word = words[mention_word_idx + 2].word
                try:
                    int(next_word)
                except ValueError:
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_MIM"
                    continue
        # Contribution / address sentence.
        # We add a feature for this, as it is a context property
        if in_contrib_phrase:
            # An unsupervised copy with the special feature
            unsuper_enriched = Mention(
                "GENE_dontsup", mention.entity, mention.words)
            unsuper_enriched.features = mention.features.copy()
            unsuper_enriched.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(unsuper_enriched)
            # This candidate contains only the 'special' feature.
            super_spec = Mention(
                "GENE_SUP_contr_2", mention.entity, mention.words)
            super_spec.is_correct = False
            super_spec.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(super_spec)
            # Set is_correct and type.
            mention.is_correct = False
            mention.type = "GENE_SUP_contr_1"
            continue
        # The candidate is an entry in Gene Ontology ("GO" followed by ":").
        # The next token is read through its `word` text; indexing the word
        # object itself raised an error that a bare `except` silently
        # swallowed, so this rule could never fire.
        if len(mention.words) == 1 and mention.words[0].word == "GO":
            next_idx = mention.words[0].in_sent_idx + 1
            if next_idx < num_words and \
                    words[next_idx].word.startswith(":"):
                mention.is_correct = False
                mention.type = "GENE_SUP_go"
            # A lone "GO" is never examined by the remaining rules, whether
            # or not it was supervised here.
            continue
        # Index of the word on the left
        idx = mention.wordidxs[0] - 1
        if idx >= 0:
            # The candidate is preceded by a "%" (it's probably a quantity)
            if words[idx].word == "%":
                mention.is_correct = False
                mention.type = "GENE_SUP_%"
                continue
            # The candidate comes after a "document element" (e.g., table, or
            # figure)
            if words[idx].word.casefold() in DOC_ELEMENTS:
                mention.is_correct = False
                mention.type = "GENE_SUP_doc"
                continue
            # The candidate comes after an "individual" word (e.g.,
            # "individual"), is not purely alphabetic and is at most four
            # characters long
            if words[idx].word.casefold() in INDIVIDUALS and \
                    not mention.words[0].word.isalpha() and \
                    not len(mention.words[0].word) > 4:
                mention.is_correct = False
                mention.type = "GENE_SUP_indiv"
                continue
            # The candidate comes after a "type" word, and it is made only of
            # the letters "I" and "V"
            if words[idx].lemma.casefold() in TYPES and \
                    set(mention.words[0].word).issubset(set(["I", "V"])):
                mention.is_correct = False
                mention.type = "GENE_SUP_type"
                continue
        # Index of the word on the right
        idx = mention.wordidxs[-1] + 1
        if idx < num_words:
            # The candidate is followed by a "=" (it's probably a quantity)
            if words[idx].word == "=":
                mention.is_correct = False
                mention.type = "GENE_SUP_="
                continue
            # The candidate is followed by a ":" and the word after it is a
            # number (it's probably a quantity)
            if words[idx].word == ":":
                try:
                    float(words[idx + 1].word)
                except (ValueError, IndexError):
                    # Not a number, or ":" is the last word of the sentence
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_:"
                # A candidate followed by ":" skips the remaining rules even
                # when it was not supervised here.
                continue
            # The candidate comes before "et"
            if words[idx].word == "et":
                mention.is_correct = False
                mention.type = "GENE_SUP_et"
                continue
        # The candidate is a DNA triplet
        # We check this by looking at whether the word before or after is
        # also made only of the letters A, C, G and T.
        if len(mention.words) == 1 and len(mention.words[0].word) == 3 and \
                set(mention.words[0].word) <= dna_letters:
            # idx == 0 is a valid left neighbour (the first word of the
            # sentence), hence ">= 0" like the left-context rules above.
            idx = mention.wordidxs[0] - 1
            if idx >= 0 and set(words[idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue
            idx = mention.wordidxs[-1] + 1
            if idx < num_words and set(words[idx].word) <= dna_letters:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue
        # If it's "II", it's most probably wrong.
        if mention.words[0].word == "II":
            mention.is_correct = False
            mention.type = "GENE_SUP_ii"
            continue
        # NOTE: supervision based on snowball positive features used to live
        # here and was disabled to avoid overfitting; the rules based on the
        # "EXT_KEYWORD_MIN_[chromosome]@nn" and "IS_YEAR_RIGHT" features are
        # disabled as well.
        # The candidate comes after an organization, or a location, or a
        # person. We skip commas as they may trick us.
        comes_after = None
        loc_idx = mention.wordidxs[0] - 1
        while loc_idx >= 0 and words[loc_idx].lemma == ",":
            loc_idx -= 1
        if loc_idx >= 0 and \
                words[loc_idx].ner in entity_ners and \
                words[loc_idx].word not in merged_genes_dict:
            comes_after = words[loc_idx].ner
        # The candidate comes before an organization, or a location, or a
        # person. We skip commas, as they may trick us.
        comes_before = None
        loc_idx = mention.wordidxs[-1] + 1
        while loc_idx < num_words and words[loc_idx].lemma == ",":
            loc_idx += 1
        if loc_idx < num_words and \
                words[loc_idx].ner in entity_ners and \
                words[loc_idx].word not in merged_genes_dict:
            comes_before = words[loc_idx].ner
        # Not correct if it's most probably a person name.
        if comes_before and comes_after:
            mention.is_correct = False
            mention.type = "GENE_SUP_name"
            continue
        # Comes after person and before "," or ":", so it's probably a person
        # name
        if comes_after == "PERSON" and \
                mention.words[-1].in_sent_idx + 1 < num_words and \
                words[mention.words[-1].in_sent_idx + 1].word in [",", ":"]:
            mention.is_correct = False
            mention.type = "GENE_SUP_name2"
            continue
        if comes_after == "PERSON" and mention.words[0].ner == "PERSON":
            mention.is_correct = False
            mention.type = "GENE_SUP_name3"
            continue
        # Is a location and comes before a location so it's probably wrong
        if comes_before == "LOCATION" and mention.words[0].ner == "LOCATION":
            mention.is_correct = False
            mention.type = "GENE_SUP_loc"
            continue
    return new_mentions
コード例 #5
0
 line_dict["hpoterm_types"] = new_hpoterm_types
 # Create the sentence object where the two mentions appear
 sentence = Sentence(
     line_dict["doc_id"], line_dict["sent_id"],
     line_dict["wordidxs"], line_dict["words"], line_dict["poses"],
     line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"],
     line_dict["dep_parents"], line_dict["bounding_boxes"])
 # Skip weird sentences
 if sentence.is_weird():
     continue
 # Iterate over each pair of (gene,phenotype) mention
 for g_idx in range(len(line_dict["gene_is_corrects"])):
     g_wordidxs = TSVstring2list(
         line_dict["gene_wordidxss"][g_idx], int)
     gene_mention = Mention(
         "GENE", line_dict["gene_entities"][g_idx],
         [sentence.words[j] for j in g_wordidxs])
     if line_dict["gene_is_corrects"][g_idx] == "n":
         gene_mention.is_correct = None
     elif line_dict["gene_is_corrects"][g_idx] == "f":
         gene_mention.is_correct = False
     elif line_dict["gene_is_corrects"][g_idx] == "t":
         gene_mention.is_correct = True
     else:
         assert False
     gene_mention.type = line_dict["gene_types"][g_idx]
     assert not gene_mention.type.endswith("_UNSUP")
     for h_idx in range(len(line_dict["hpoterm_is_corrects"])):
         h_wordidxs = TSVstring2list(
             line_dict["hpoterm_wordidxss"][h_idx], int)
         hpoterm_mention = Mention(
コード例 #6
0
            "gene_2_is_correct", "gene_2_type"],
     [no_op, int, lambda x: TSVstring2list(x, int), TSVstring2list,
         TSVstring2list, TSVstring2list, TSVstring2list,
         TSVstring2list, lambda x: TSVstring2list(x, int),
         TSVstring2list, no_op, lambda x: TSVstring2list(x, int),
         TSVstring2bool, no_op, no_op, lambda x: TSVstring2list(x,
         int), TSVstring2bool, no_op])
 # Create the sentence object where the two mentions appear
 sentence = Sentence(
     line_dict["doc_id"], line_dict["sent_id"],
     line_dict["wordidxs"], line_dict["words"], line_dict["poses"],
     line_dict["ners"], line_dict["lemmas"], line_dict["dep_paths"],
     line_dict["dep_parents"], line_dict["bounding_boxes"])
 # Create the mentions
 gene_1_mention = Mention(
     "GENE", line_dict["gene_1_entity"],
     [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
 gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
 gene_1_mention.type = line_dict["gene_1_type"]
 gene_2_mention = Mention(
     "GENE", line_dict["gene_2_entity"],
     [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
 gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
 gene_2_mention.type = line_dict["gene_2_type"]
 # If the word indexes do not overlap, create the relation candidate
 # TODO there may be other cases. Check with Emily.
 if not set(line_dict["gene_1_wordidxs"]) & \
         set(line_dict["gene_2_wordidxs"]):
     relation = Relation(
         "GENEGENE", gene_1_mention, gene_2_mention)
     # Add features
コード例 #7
0
def extract(sentence):
    """Extract phenotype (HPO term) mention candidates from a sentence.

    The function works in three passes:

    1. Phrases found in ``genes_with_hpoterm`` (gene long names containing a
       phenotype name) become ``HPOTERM_SUP_GENEL`` candidates supervised as
       negative, and their words are marked as used.
    2. For every remaining phrase that shares no word with a previous
       candidate, the set of stems of its words (ignoring symbols-only words
       and stopwords) is looked up in ``hpoterms_dict``; a hit becomes an
       ``HPOTERM`` candidate.
    3. If nothing was found, with probability ``NEG_PROB`` one random word is
       drawn (retrying up to 10 times until its POS tag starts with "NN")
       and, if it is such a noun, emitted as an ``HPOTERM_SUP_rand``
       negative candidate.

    As a side effect every word of the sentence gets a ``stem`` attribute.

    Args:
        sentence: object exposing ``words``; each word provides ``word``,
            ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        The list of candidates; empty when no word longer than two characters
        is found in ``english_dict`` (as written or casefolded).
    """
    mentions = []
    mention_ids = set()
    # If there are no English words in the sentence, we skip it.
    no_english_words = True
    for word in sentence.words:
        # No early exit from this loop: every word must receive its stem.
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            no_english_words = False
    if no_english_words:
        return mentions
    history = set()
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase = " ".join([word.word for word in sentence.words[start:end]])
        # If the phrase is a gene long name containing a phenotype name, create
        # a candidate that we supervise as negative
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL", phrase,
                              sentence.words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            for word in sentence.words[start:end]:
                history.add(word.in_sent_idx)
    # Words made only of underscores and non-word characters.  Raw string:
    # "\W" in a plain string literal is an invalid escape sequence.
    symbols_only = re.compile(r"^(_|\W)+$")
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # Skip phrases overlapping a word already used for a mention
        if any(i in history for i in range(start, end)):
            continue
        # The list of stems in the phrase (not from stopwords or symbols)
        phrase_stems = [
            word.stem for word in sentence.words[start:end]
            if not symbols_only.match(word.word) and
            (len(word.word) == 1 or
             word.lemma.casefold() not in stopwords_dict)]
        phrase_stems_set = frozenset(phrase_stems)
        if phrase_stems_set in hpoterms_dict:
            # Find the word objects of that match: one word per stem, never
            # two words with the same lemma.
            mention_words = []
            mention_lemmas = []
            mention_stems = []
            for word in sentence.words[start:end]:
                if word.stem in phrase_stems_set and \
                        word.lemma.casefold() not in mention_lemmas and \
                        word.stem not in mention_stems:
                    mention_lemmas.append(word.lemma.casefold())
                    mention_words.append(word)
                    mention_stems.append(word.stem)
                    if len(mention_words) == len(phrase_stems_set):
                        break
            # Only the first entity stored for this set of stems is used
            entity = list(hpoterms_dict[phrase_stems_set])[0]
            mention = Mention("HPOTERM",
                              hponames_to_ids[entity] + "|" + entity,
                              mention_words)
            # The following is a way to avoid duplicates.
            # It's ugly and not perfect
            if mention.id() in mention_ids:
                continue
            mention_ids.add(mention.id())
            # Features
            add_features(mention, sentence)
            mentions.append(mention)
            for word in mention_words:
                history.add(word.in_sent_idx)
    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        index = random.randint(0, len(sentence.words) - 1)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not sentence.words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, len(sentence.words) - 1)
            tries -= 1
        if sentence.words[index].pos.startswith("NN"):
            mention = Mention("HPOTERM_SUP_rand",
                              sentence.words[index].lemma.casefold(),
                              sentence.words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
コード例 #8
0
 # Create the sentence object where the two mentions appear
 sentence = Sentence(line_dict["doc_id"], line_dict["sent_id"],
                     line_dict["wordidxs"], line_dict["words"],
                     line_dict["poses"], line_dict["ners"],
                     line_dict["lemmas"], line_dict["dep_paths"],
                     line_dict["dep_parents"],
                     line_dict["bounding_boxes"])
 # Skip weird sentences
 if sentence.is_weird():
     continue
 # Iterate over each pair of (gene,phenotype) mention
 for g_idx in range(len(line_dict["gene_is_corrects"])):
     g_wordidxs = TSVstring2list(line_dict["gene_wordidxss"][g_idx],
                                 int)
     gene_mention = Mention("GENE",
                            line_dict["gene_entities"][g_idx],
                            [sentence.words[j] for j in g_wordidxs])
     if line_dict["gene_is_corrects"][g_idx] == "n":
         gene_mention.is_correct = None
     elif line_dict["gene_is_corrects"][g_idx] == "f":
         gene_mention.is_correct = False
     elif line_dict["gene_is_corrects"][g_idx] == "t":
         gene_mention.is_correct = True
     else:
         assert False
     gene_mention.type = line_dict["gene_types"][g_idx]
     assert not gene_mention.type.endswith("_UNSUP")
     for h_idx in range(len(line_dict["hpoterm_is_corrects"])):
         h_wordidxs = TSVstring2list(
             line_dict["hpoterm_wordidxss"][h_idx], int)
         hpoterm_mention = Mention(
コード例 #9
0
def extract(sentence):
    """Return the phenotype (HPO term) mention candidates of ``sentence``.

    Steps, in order:

    * Stem every word (stored on the word as ``stem``) and give up with an
      empty list unless some word longer than two characters is in
      ``english_dict`` (as written or casefolded).
    * Emit an ``HPOTERM_SUP_GENEL`` negative candidate for each phrase found
      in ``genes_with_hpoterm`` and mark its words as used.
    * For each phrase not overlapping a used word, look up the frozenset of
      its stems (symbols-only words and stopwords excluded) in
      ``hpoterms_dict`` and emit an ``HPOTERM`` candidate on a hit, unless a
      candidate with the same ``id()`` was already emitted.
    * If there is still no candidate, with probability ``NEG_PROB`` pick a
      random word (up to 10 retries to land on a POS tag starting with "NN")
      and emit it as an ``HPOTERM_SUP_rand`` negative candidate when it is
      such a noun.

    Args:
        sentence: object with a ``words`` sequence whose elements expose
            ``word``, ``lemma``, ``pos`` and ``in_sent_idx``.

    Returns:
        The list of ``Mention`` candidates (possibly empty).
    """
    mentions = []
    mention_ids = set()
    # If there are no English words in the sentence, we skip it.
    found_english_word = False
    for word in sentence.words:
        # Here so all words have stem (hence no `break` below)
        word.stem = stemmer.stem(word.word)
        if len(word.word) > 2 and \
                (word.word in english_dict or
                 word.word.casefold() in english_dict):
            found_english_word = True
    if not found_english_word:
        return mentions
    history = set()
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        if start in history or end - 1 in history:
            continue
        phrase_words = sentence.words[start:end]
        phrase = " ".join([word.word for word in phrase_words])
        # If the phrase is a gene long name containing a phenotype name, create
        # a candidate that we supervise as negative
        if len(phrase) > 1 and phrase in genes_with_hpoterm:
            mention = Mention("HPOTERM_SUP_GENEL",
                              phrase,
                              sentence.words[start:end])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
            history.update(word.in_sent_idx for word in phrase_words)
    # Matches words consisting solely of underscores / non-word characters.
    # Written as a raw string because "\W" is not a valid escape sequence in
    # an ordinary string literal.
    symbol_pattern = re.compile(r"^(_|\W)+$")
    # Iterate over each phrase of length at most max_mention_length
    for start, end in get_all_phrases_in_sentence(sentence,
                                                  max_mention_length):
        # A phrase sharing any word with an earlier mention is skipped
        if not history.isdisjoint(range(start, end)):
            continue
        # The list of stems in the phrase (not from stopwords or symbols)
        phrase_stems = []
        for word in sentence.words[start:end]:
            if symbol_pattern.match(word.word):
                continue
            if len(word.word) == 1 or \
                    word.lemma.casefold() not in stopwords_dict:
                phrase_stems.append(word.stem)
        phrase_stems_set = frozenset(phrase_stems)
        if phrase_stems_set not in hpoterms_dict:
            continue
        # Find the word objects of that match: at most one word per stem and
        # per lemma, stopping once every stem has its word.
        mention_words = []
        mention_lemmas = []
        mention_stems = []
        for word in sentence.words[start:end]:
            if word.stem in phrase_stems_set and \
                    word.lemma.casefold() not in mention_lemmas and \
                    word.stem not in mention_stems:
                mention_lemmas.append(word.lemma.casefold())
                mention_words.append(word)
                mention_stems.append(word.stem)
                if len(mention_words) == len(phrase_stems_set):
                    break
        # Only the first entity stored for this set of stems is used
        entity = list(hpoterms_dict[phrase_stems_set])[0]
        mention = Mention(
            "HPOTERM", hponames_to_ids[entity] + "|" + entity,
            mention_words)
        # The following is a way to avoid duplicates.
        # It's ugly and not perfect
        if mention.id() in mention_ids:
            continue
        mention_ids.add(mention.id())
        # Features
        add_features(mention, sentence)
        mentions.append(mention)
        for word in mention_words:
            history.add(word.in_sent_idx)
    # Generate some negative candidates at random, if this sentence didn't
    # contain any other candidate. We want the candidates to be nouns.
    if not mentions and random.random() <= NEG_PROB:
        index = random.randint(0, len(sentence.words) - 1)
        # We may not get a noun at random, so we try again if we don't.
        tries = 10
        while not sentence.words[index].pos.startswith("NN") and tries > 0:
            index = random.randint(0, len(sentence.words) - 1)
            tries -= 1
        if sentence.words[index].pos.startswith("NN"):
            mention = Mention(
                "HPOTERM_SUP_rand", sentence.words[index].lemma.casefold(),
                sentence.words[index:index + 1])
            mention.is_correct = False
            add_features(mention, sentence)
            mentions.append(mention)
    return mentions
コード例 #10
0
def supervise(mentions, sentence):
    """Apply distant-supervision rules to gene mention candidates.

    Every candidate whose ``is_correct`` is still ``None`` is run through an
    ordered list of heuristic rules. The first rule that matches sets
    ``is_correct`` and rewrites ``type`` to a ``GENE_SUP_*`` label; the
    remaining rules are skipped for that candidate. Candidates that match no
    rule are left unsupervised (``is_correct`` stays ``None``).

    Args:
        mentions: iterable of Mention candidates found in ``sentence``. The
            objects are modified in place.
        sentence: the sentence the candidates come from; only its ``words``
            list is read.

    Returns:
        A new list containing every input mention, in order. For candidates
        in a "contribution"/"address" sentence, two extra Mention objects
        carrying the ``IN_CONTRIB_PHRASE`` feature are inserted right after
        the candidate they were derived from.
    """
    named_entity_tags = ("ORGANIZATION", "LOCATION", "PERSON")
    dna_letters = set("ACGT")
    # Sentence prefixes indicating an author-contribution statement (the odd
    # spellings are kept verbatim; they look like OCR variants).
    contrib_prefixes = (
        "Performed the experiments :",
        "Wrote the paper :",
        "W'rote the paper :",
        "Wlrote the paper",
        "Contributed reagents",
        "Analyzed the data :",
    )
    words = sentence.words
    num_words = len(words)
    phrase = " ".join([x.word for x in words])
    # This is a property of the sentence, not of the candidate, so it is
    # computed once instead of once per mention.
    in_contrib_phrase = phrase.startswith(contrib_prefixes) or \
        phrase.casefold().startswith("address")
    new_mentions = []
    for mention in mentions:
        new_mentions.append(mention)
        # Already supervised elsewhere: nothing to do.
        if mention.is_correct is not None:
            continue
        # The candidate is a long name.
        if " ".join([word.word for word in mention.words]) in \
                inverted_long_names:
            mention.is_correct = True
            mention.type = "GENE_SUP_long"
            continue
        # The candidate is a MIM entry: "MIM" followed (optionally after
        # "no", "no.", "#" or ":") by an integer.
        if mention.words[0].word == "MIM":
            mention_word_idx = mention.words[0].in_sent_idx
            if mention_word_idx < num_words - 1:
                next_word = words[mention_word_idx + 1].word
                if next_word.casefold() in ("no", "no.", "#", ":") and \
                        mention_word_idx + 2 < num_words:
                    next_word = words[mention_word_idx + 2].word
                try:
                    int(next_word)
                except ValueError:
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_MIM"
                    continue
        # The phrase starts with words that are indicative of the candidate not
        # being a mention of a gene
        # We add a feature for this, as it is a context property
        if in_contrib_phrase:
            # An unsupervised copy with the special feature
            unsuper_enriched = Mention("GENE_dontsup", mention.entity,
                                       mention.words)
            unsuper_enriched.features = mention.features.copy()
            unsuper_enriched.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(unsuper_enriched)
            # This candidate contain only the 'special' feature.
            super_spec = Mention("GENE_SUP_contr_2", mention.entity,
                                 mention.words)
            super_spec.is_correct = False
            super_spec.add_feature("IN_CONTRIB_PHRASE")
            new_mentions.append(super_spec)
            # Set is_correct and type.
            mention.is_correct = False
            mention.type = "GENE_SUP_contr_1"
            continue
        # The candidate is an entry in Gene Ontology ("GO" followed by ":").
        if len(mention.words) == 1 and mention.words[0].word == "GO":
            # Compare the text of the following word. The previous code
            # indexed the word object itself and hid any resulting error
            # behind a bare ``except``; an explicit bounds check replaces it.
            following_idx = mention.words[0].in_sent_idx + 1
            if following_idx < num_words and \
                    words[following_idx].word == ":":
                mention.is_correct = False
                mention.type = "GENE_SUP_go"
            # A lone "GO" candidate is never examined by the later rules,
            # whether or not it was supervised here.
            continue
        # Index of the word on the left
        idx = mention.wordidxs[0] - 1
        if idx >= 0:
            # The candidate is preceded by a "%" (it's probably a quantity)
            if words[idx].word == "%":
                mention.is_correct = False
                mention.type = "GENE_SUP_%"
                continue
            # The candidate comes after a "document element" (e.g., table, or
            # figure)
            if words[idx].word.casefold() in DOC_ELEMENTS:
                mention.is_correct = False
                mention.type = "GENE_SUP_doc"
                continue
            # The candidate comes after an "individual" word (e.g.,
            # "individual"), is not purely alphabetic and is at most 4
            # characters long
            if words[idx].word.casefold() in INDIVIDUALS and \
                    not mention.words[0].word.isalpha() and \
                    not len(mention.words[0].word) > 4:
                mention.is_correct = False
                mention.type = "GENE_SUP_indiv"
                continue
            # The candidate comes after a "type" word, and it is made only of
            # the letters "I" and "V"
            if words[idx].lemma.casefold() in TYPES and \
                    set(mention.words[0].word) <= {"I", "V"}:
                mention.is_correct = False
                mention.type = "GENE_SUP_type"
                continue
        # Index of the word on the right
        idx = mention.wordidxs[-1] + 1
        if idx < num_words:
            # The candidate is followed by a "=" (it's probably a quantity)
            if words[idx].word == "=":
                mention.is_correct = False
                mention.type = "GENE_SUP_="
                continue
            # The candidate is followed by a ":" and the word after it is a
            # number (it's probably a quantity)
            if words[idx].word == ":":
                try:
                    float(words[idx + 1].word)
                except (ValueError, IndexError):
                    # Not a number, or ":" is the last word of the sentence.
                    pass
                else:
                    mention.is_correct = False
                    mention.type = "GENE_SUP_:"
                # Candidates followed by ":" skip the remaining rules even
                # when no number follows.
                continue
            # The candidate comes before "et"
            if words[idx].word == "et":
                mention.is_correct = False
                mention.type = "GENE_SUP_et"
                continue
        # The candidate is a DNA triplet
        # We check this by looking at whether the word before or after is also
        # made only of the letters A, C, G, T.
        if len(mention.words) == 1 and len(mention.words[0].word) == 3 and \
                set(mention.words[0].word) <= dna_letters:
            left_idx = mention.wordidxs[0] - 1
            right_idx = mention.wordidxs[-1] + 1
            # ``>= 0`` so that a neighbour sitting at the very start of the
            # sentence is inspected too, as in the left-neighbour rules above.
            left_is_dna = left_idx >= 0 and \
                set(words[left_idx].word) <= dna_letters
            right_is_dna = right_idx < num_words and \
                set(words[right_idx].word) <= dna_letters
            if left_is_dna or right_is_dna:
                mention.is_correct = False
                mention.type = "GENE_SUP_dna"
                continue
        # If it's "II", it's most probably wrong.
        if mention.words[0].word == "II":
            mention.is_correct = False
            mention.type = "GENE_SUP_ii"
            continue
        # Snowball positive features
        # Commented out to avoid overfitting
        # if mention.features & snowball_pos_feats:
        #    supervised = Mention("GENE_SUP", mention.entity,
        #                         mention.words)
        #    supervised.features = mention.features - snowball_pos_feats
        #    supervised.is_correct = True
        #    new_mentions.append(supervised)
        #    supervised2 = Mention("GENE_SUP", mention.entity,
        #                          mention.words)
        #    supervised2.features = mention.features & snowball_pos_feats
        #    supervised2.is_correct = True
        #    new_mentions.append(supervised2)
        #    continue
        # Some negative features
        # if "EXT_KEYWORD_MIN_[chromosome]@nn" in mention.features:
        #    supervised = Mention("GENE_SUP", mention.entity, mention.words)
        #    supervised.features = mention.features.copy()
        #    supervised.is_correct = False
        #    new_mentions.append(supervised)
        #    continue
        # if "IS_YEAR_RIGHT" in mention.features:
        #    supervised = Mention("GENE_SUP", mention.entity, mention.words)
        #    supervised.features = mention.features.copy()
        #    supervised.is_correct = False
        #    new_mentions.append(supervised)
        #    continue
        # The candidate comes after an organization, or a location, or a
        # person. We skip commas as they may trick us.
        comes_after = None
        loc_idx = mention.wordidxs[0] - 1
        while loc_idx >= 0 and words[loc_idx].lemma == ",":
            loc_idx -= 1
        if loc_idx >= 0 and \
                words[loc_idx].ner in named_entity_tags and \
                words[loc_idx].word not in merged_genes_dict:
            comes_after = words[loc_idx].ner
        # The candidate comes before an organization, or a location, or a
        # person. We skip commas, as they may trick us.
        comes_before = None
        loc_idx = mention.wordidxs[-1] + 1
        while loc_idx < num_words and words[loc_idx].lemma == ",":
            loc_idx += 1
        if loc_idx < num_words and \
                words[loc_idx].ner in named_entity_tags and \
                words[loc_idx].word not in merged_genes_dict:
            comes_before = words[loc_idx].ner
        # Not correct if it's most probably a person name.
        if comes_before and comes_after:
            mention.is_correct = False
            mention.type = "GENE_SUP_name"
            continue
        # Comes after person and before "," or ":", so it's probably a person
        # name
        following_idx = mention.words[-1].in_sent_idx + 1
        if comes_after == "PERSON" and \
                following_idx < num_words and \
                words[following_idx].word in (",", ":"):
            mention.is_correct = False
            mention.type = "GENE_SUP_name2"
            continue
        # Comes after a person and is itself tagged as a person
        if comes_after == "PERSON" and mention.words[0].ner == "PERSON":
            mention.is_correct = False
            mention.type = "GENE_SUP_name3"
            continue
        # Is a location and comes before a location so it's probably wrong
        if comes_before == "LOCATION" and mention.words[0].ner == "LOCATION":
            mention.is_correct = False
            mention.type = "GENE_SUP_loc"
            continue
    return new_mentions
コード例 #11
0
 # Skip weird sentences
 if sentence.is_weird():
     continue
 gene_mentions = []
 hpoterm_mentions = []
 positive_relations = []
 gene_wordidxs = set()
 hpoterm_wordidxs = set()
 # Iterate over each pair of (gene,phenotype) mentions
 for g_idx in range(len(line_dict["gene_is_corrects"])):
     g_wordidxs = TSVstring2list(
         line_dict["gene_wordidxss"][g_idx], int)
     for idx in g_wordidxs:
         gene_wordidxs.add(idx)
     gene_mention = Mention(
         "GENE", line_dict["gene_entities"][g_idx],
         [sentence.words[j] for j in g_wordidxs])
     if line_dict["gene_is_corrects"][g_idx] == "n":
         gene_mention.is_correct = None
     elif line_dict["gene_is_corrects"][g_idx] == "f":
         gene_mention.is_correct = False
     elif line_dict["gene_is_corrects"][g_idx] == "t":
         gene_mention.is_correct = True
     else:
         assert False
     gene_mention.type = line_dict["gene_types"][g_idx]
     assert not gene_mention.type.endswith("_UNSUP")
     gene_mentions.append(gene_mention)
     for h_idx in range(len(line_dict["hpoterm_is_corrects"])):
         h_wordidxs = TSVstring2list(
             line_dict["hpoterm_wordidxss"][h_idx], int)