"GENE", line_dict["gene_1_entity"],
                [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
            # Carry the supervision label and the type of the first gene
            # mention over from the input row.
            gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
            gene_1_mention.type = line_dict["gene_1_type"]
            # Build the second gene mention from the sentence words selected
            # by its word indexes, then label it the same way.
            gene_2_entity = line_dict["gene_2_entity"]
            gene_2_idxs = line_dict["gene_2_wordidxs"]
            gene_2_words = [sentence.words[j] for j in gene_2_idxs]
            gene_2_mention = Mention("GENE", gene_2_entity, gene_2_words)
            gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
            gene_2_mention.type = line_dict["gene_2_type"]
            # A relation candidate is only created when the two mentions
            # share no word index.
            # TODO there may be other cases. Check with Emily.
            gene_1_idxs = line_dict["gene_1_wordidxs"]
            if set(gene_1_idxs).isdisjoint(gene_2_idxs):
                relation = Relation(
                    "GENEGENE", gene_1_mention, gene_2_mention)
                # Attach the features computed for this pair of mentions.
                add_features(
                    relation, gene_1_mention, gene_2_mention, sentence)
                # Supervision: when at least one of the two mentions is
                # explicitly labelled False, the relation is labelled False
                # as well. No copy is created in this case because there
                # will already be an unsupervised copy built on the
                # unsupervised copies of the mentions.
                has_false_mention = any(
                    mention.is_correct is False
                    for mention in (gene_1_mention, gene_2_mention))
                if has_false_mention:
                    relation.is_correct = False
                    relation.type = "GENEGENE_SUP_F"
                # TODO Check in Emily's code how to supervise as True
                # Emit the candidate as a TSV row on standard output.
                print(relation.tsv_dump())
            # Build both gene mentions from the fields of the current row:
            # the words are picked from the sentence through the word
            # indexes, and the supervision label and type are copied over.
            gene_1_entity = line_dict["gene_1_entity"]
            gene_1_idxs = line_dict["gene_1_wordidxs"]
            gene_1_words = [sentence.words[j] for j in gene_1_idxs]
            gene_1_mention = Mention("GENE", gene_1_entity, gene_1_words)
            gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
            gene_1_mention.type = line_dict["gene_1_type"]
            gene_2_entity = line_dict["gene_2_entity"]
            gene_2_idxs = line_dict["gene_2_wordidxs"]
            gene_2_words = [sentence.words[j] for j in gene_2_idxs]
            gene_2_mention = Mention("GENE", gene_2_entity, gene_2_words)
            gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
            gene_2_mention.type = line_dict["gene_2_type"]
            # A relation candidate is only created when the two mentions
            # share no word index.
            # TODO there may be other cases. Check with Emily.
            if set(gene_1_idxs).isdisjoint(gene_2_idxs):
                relation = Relation(
                    "GENEGENE", gene_1_mention, gene_2_mention)
                # Attach the features computed for this pair of mentions.
                add_features(
                    relation, gene_1_mention, gene_2_mention, sentence)
                # Supervision: when at least one of the two mentions is
                # explicitly labelled False, the relation is labelled False
                # as well. No copy is created in this case because there
                # will already be an unsupervised copy built on the
                # unsupervised copies of the mentions.
                has_false_mention = any(
                    mention.is_correct is False
                    for mention in (gene_1_mention, gene_2_mention))
                if has_false_mention:
                    relation.is_correct = False
                    relation.type = "GENEGENE_SUP_F"
                # TODO Check in Emily's code how to supervise as True
                # Emit the candidate as a TSV row on standard output.
                print(relation.tsv_dump())
Example #3
0
 # random.choice needs an indexable sequence, so turn the available word
 # indexes into a list first.
 avail_wordidxs = list(avail_wordidxs)
 if avail_wordidxs:
     fake_rels = []
     # For every positive (gene, HPO term) pair, emit two artificial
     # negative relations: one where the gene is replaced by a random
     # available word, and one where the HPO term is replaced by it.
     for gene_mention, hpoterm_mention in positive_relations:
         random_idx = random.choice(avail_wordidxs)
         other_word = sentence.words[random_idx]
         fake_lemma = other_word.lemma
         # Each fake mention gets its own single-word list.
         fake_gene_mention = Mention("FAKE_GENE", fake_lemma, [other_word])
         fake_hpo_mention = Mention(
             "FAKE_HPOTERM", fake_lemma, [other_word])
         fake_rel_1 = Relation(
             "GENEPHENO_SUP_POSFAKEGENE",
             fake_gene_mention,
             hpoterm_mention)
         fake_rel_2 = Relation(
             "GENEPHENO_SUP_POSFAKEHPO",
             gene_mention,
             fake_hpo_mention)
         fake_pair = (fake_rel_1, fake_rel_2)
         # Both artificial relations are supervised as False...
         for fake_relation in fake_pair:
             fake_relation.is_correct = False
         # ...and then written out as TSV rows, in creation order.
         for fake_relation in fake_pair:
             print(fake_relation.tsv_dump())
 # Create more artificial negative examples:
 # for each gene candidate G in the sentence, if the pattern G
 # <Verb> X appears in the same sentence and X is not a phenotype
 # mention candidate, add (gene, X) as negative examples
 for gene_mention in gene_mentions:
     try:
         next_word = sentence.words[gene_mention.wordidxs[-1] + 1]
     except IndexError:
         continue
     if re.search('^VB[A-Z]*$', next_word.pos) and \
             next_word.word not in ["{", "}", "(", ")", "[", "]"]: