"GENE", line_dict["gene_1_entity"], [sentence.words[j] for j in line_dict["gene_1_wordidxs"]]) gene_1_mention.is_correct = line_dict["gene_1_is_correct"] gene_1_mention.type = line_dict["gene_1_type"] gene_2_mention = Mention( "GENE", line_dict["gene_2_entity"], [sentence.words[j] for j in line_dict["gene_2_wordidxs"]]) gene_2_mention.is_correct = line_dict["gene_2_is_correct"] gene_2_mention.type = line_dict["gene_2_type"] # If the word indexes do not overlap, create the relation candidate # TODO there may be other cases. Check with Emily. if not set(line_dict["gene_1_wordidxs"]) & \ set(line_dict["gene_2_wordidxs"]): relation = Relation( "GENEGENE", gene_1_mention, gene_2_mention) # Add features add_features(relation, gene_1_mention, gene_2_mention, sentence) # Supervise # One of the two mentions (or both) is labelled as False # We do not create a copy in this case because there will # already be an unsupervised copy built on the unsupervised # copies of the mentions. if gene_1_mention.is_correct is False or \ gene_2_mention.is_correct is False: relation.is_correct = False relation.type = "GENEGENE_SUP_F" # TODO Check in Emily's code how to supervise as True # Print! print(relation.tsv_dump())
# Build the first gene mention from the fields of `line_dict`; its words
# are looked up in `sentence.words` by the listed word indexes. The
# supervision label and the type are copied over unchanged.
gene_1_mention = Mention(
    "GENE",
    line_dict["gene_1_entity"],
    [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
gene_1_mention.type = line_dict["gene_1_type"]

# Same construction for the second gene mention.
gene_2_mention = Mention(
    "GENE",
    line_dict["gene_2_entity"],
    [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
gene_2_mention.type = line_dict["gene_2_type"]

# A relation candidate is only produced when the two mentions have no
# word index in common.
# TODO there may be other cases. Check with Emily.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source; the dump is kept inside this branch because `relation` is only
# bound here -- confirm against the original layout.
if set(line_dict["gene_1_wordidxs"]).isdisjoint(
        line_dict["gene_2_wordidxs"]):
    relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)

    # Attach the features computed for this pair of mentions.
    add_features(relation, gene_1_mention, gene_2_mention, sentence)

    # Supervision: when at least one of the two mentions is explicitly
    # labelled False, the relation is labelled False as well. No extra
    # copy is created in this case because an unsupervised copy will
    # already be built from the unsupervised copies of the mentions.
    # The identity test (`is False`) is deliberate: only an explicit
    # False label triggers this branch.
    if any(mention.is_correct is False
           for mention in (gene_1_mention, gene_2_mention)):
        relation.is_correct = False
        relation.type = "GENEGENE_SUP_F"
    # TODO Check in Emily's code how to supervise as True

    # Write the candidate out as one TSV row on standard output.
    print(relation.tsv_dump())
avail_wordidxs = list(avail_wordidxs) if len(avail_wordidxs) > 0: fake_rels = [] for (gene_mention, hpoterm_mention) in positive_relations: other_word = sentence.words[random.choice(avail_wordidxs)] fake_gene_mention = Mention( "FAKE_GENE", other_word.lemma, [other_word, ]) fake_hpo_mention = Mention( "FAKE_HPOTERM", other_word.lemma, [other_word, ]) fake_rel_1 = Relation( "GENEPHENO_SUP_POSFAKEGENE", fake_gene_mention, hpoterm_mention) fake_rel_2 = Relation( "GENEPHENO_SUP_POSFAKEHPO", gene_mention, fake_hpo_mention) fake_rel_1.is_correct = False fake_rel_2.is_correct = False # Print! print(fake_rel_1.tsv_dump()) print(fake_rel_2.tsv_dump()) # Create more artificial negative examples: # for each gene candidate G in the sentence, if the pattern G # <Verb> X appears in the same sentence and X is not a phenotype # mention candidate, add (gene, X) as negative examples for gene_mention in gene_mentions: try: next_word = sentence.words[gene_mention.wordidxs[-1] + 1] except IndexError: continue if re.search('^VB[A-Z]*$', next_word.pos) and \ next_word.word not in ["{", "}", "(", ")", "[", "]"]: