# Create the mentions
 gene_1_mention = Mention(
     "GENE", line_dict["gene_1_entity"],
     [sentence.words[j] for j in line_dict["gene_1_wordidxs"]])
 gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
 gene_1_mention.type = line_dict["gene_1_type"]
 gene_2_mention = Mention(
     "GENE", line_dict["gene_2_entity"],
     [sentence.words[j] for j in line_dict["gene_2_wordidxs"]])
 gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
 gene_2_mention.type = line_dict["gene_2_type"]
 # If the word indexes do not overlap, create the relation candidate
 # TODO there may be other cases. Check with Emily.
 if not set(line_dict["gene_1_wordidxs"]) & \
         set(line_dict["gene_2_wordidxs"]):
     relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)
     # Add features
     add_features(relation, gene_1_mention, gene_2_mention,
                  sentence)
     # Supervise
     # One of the two mentions (or both) is labelled as False
     # We do not create a copy in this case because there will
     # already be an unsupervised copy built on the unsupervised
     # copies of the mentions.
     if gene_1_mention.is_correct is False or \
             gene_2_mention.is_correct is False:
         relation.is_correct = False
         relation.type = "GENEGENE_SUP_F"
     # TODO Check in Emily's code how to supervise as True
     # Print!
     print(relation.tsv_dump())
                        hpoterm_mention.is_correct = True
                    else:
                        assert False
                    # Take the mention type from the input line; a type ending
                    # in "_UNSUP" is not expected here and trips the assertion.
                    hpoterm_mention.type = line_dict["hpoterm_types"][h_idx]
                    assert not hpoterm_mention.type.endswith("_UNSUP")
                    # Skip if the word indexes overlap
                    if set(g_wordidxs) & set(h_wordidxs):
                        continue
                    # Skip if the mentions are too far away
                    # (presumably wordidxs is in ascending order, so [0] and
                    # [-1] are the first and last word of a mention -- TODO
                    # confirm)
                    gene_start = gene_mention.wordidxs[0]
                    hpoterm_start = hpoterm_mention.wordidxs[0]
                    gene_end = gene_mention.wordidxs[-1]
                    hpoterm_end = hpoterm_mention.wordidxs[-1]
                    # After sorting the four boundaries, positions 1 and 2
                    # hold the two inner ones; their difference is the
                    # distance compared against the cut-off below.
                    limits = sorted(
                        (gene_start, hpoterm_start, gene_end, hpoterm_end))
                    # NOTE(review): `start` is not read in the lines visible
                    # here -- check whether it is still needed.
                    start = limits[0]
                    betw_start = limits[1]
                    betw_end = limits[2]
                    # Drop the pair when the inner boundaries are more than
                    # 50 apart
                    if betw_end - betw_start > 50:
                        continue
                    relation = Relation(
                        "GENEHPOTERM", gene_mention, hpoterm_mention)
                    # Add features
                    add_features(relation, gene_mention, hpoterm_mention,
                                 sentence)
                    # Supervise
                    supervise(relation, gene_mention, hpoterm_mention,
                              sentence)
                    # Print the relation's TSV dump to standard output
                    print(relation.tsv_dump())
Exemple #3
0
                        hpoterm_mention.is_correct = True
                    else:
                        assert False
                    # Take the mention type from the input line; a type ending
                    # in "_UNSUP" is not expected here and trips the assertion.
                    hpoterm_mention.type = line_dict["hpoterm_types"][h_idx]
                    assert not hpoterm_mention.type.endswith("_UNSUP")
                    # Skip if the word indexes overlap
                    if set(g_wordidxs) & set(h_wordidxs):
                        continue
                    # Skip if the mentions are too far away
                    # (presumably wordidxs is in ascending order, so [0] and
                    # [-1] are the first and last word of a mention -- TODO
                    # confirm)
                    gene_start = gene_mention.wordidxs[0]
                    hpoterm_start = hpoterm_mention.wordidxs[0]
                    gene_end = gene_mention.wordidxs[-1]
                    hpoterm_end = hpoterm_mention.wordidxs[-1]
                    # After sorting the four boundaries, positions 1 and 2
                    # hold the two inner ones; their difference is the
                    # distance compared against the cut-off below.
                    limits = sorted(
                        (gene_start, hpoterm_start, gene_end, hpoterm_end))
                    # NOTE(review): `start` is not read in the lines visible
                    # here -- check whether it is still needed.
                    start = limits[0]
                    betw_start = limits[1]
                    betw_end = limits[2]
                    # Drop the pair when the inner boundaries are more than
                    # 50 apart
                    if betw_end - betw_start > 50:
                        continue
                    relation = Relation("GENEHPOTERM", gene_mention,
                                        hpoterm_mention)
                    # Add features
                    add_features(relation, gene_mention, hpoterm_mention,
                                 sentence)
                    # Supervise
                    supervise(relation, gene_mention, hpoterm_mention,
                              sentence)
                    # Print the relation's TSV dump to standard output
                    print(relation.tsv_dump())
 # Build one GENE mention per candidate described on this input line
 gene_1_mention = Mention(
     "GENE",
     line_dict["gene_1_entity"],
     [sentence.words[j] for j in line_dict["gene_1_wordidxs"]],
 )
 gene_1_mention.is_correct = line_dict["gene_1_is_correct"]
 gene_1_mention.type = line_dict["gene_1_type"]
 gene_2_mention = Mention(
     "GENE",
     line_dict["gene_2_entity"],
     [sentence.words[j] for j in line_dict["gene_2_wordidxs"]],
 )
 gene_2_mention.is_correct = line_dict["gene_2_is_correct"]
 gene_2_mention.type = line_dict["gene_2_type"]
 # A relation candidate is only created when the two mentions share no
 # word index.
 # TODO there may be other cases. Check with Emily.
 if set(line_dict["gene_1_wordidxs"]).isdisjoint(
         line_dict["gene_2_wordidxs"]):
     relation = Relation("GENEGENE", gene_1_mention, gene_2_mention)
     # Add features
     add_features(relation, gene_1_mention, gene_2_mention, sentence)
     # Supervise: if at least one of the two mentions is explicitly
     # labelled False, the relation is labelled False too. No copy is
     # created in this case because there will already be an
     # unsupervised copy built on the unsupervised copies of the
     # mentions.
     if any(mention.is_correct is False
            for mention in (gene_1_mention, gene_2_mention)):
         relation.is_correct = False
         relation.type = "GENEGENE_SUP_F"
     # TODO Check in Emily's code how to supervise as True
     # Emit the candidate as a TSV row on standard output
     print(relation.tsv_dump())
Exemple #5
0
         # Skip if the word indexes overlap
         if set(g_wordidxs) & set(h_wordidxs):
             continue
         # Skip if the mentions are too far away
         # (presumably wordidxs is in ascending order, so [0] and [-1] are
         # the first and last word of a mention -- TODO confirm)
         gene_start = gene_mention.wordidxs[0]
         hpoterm_start = hpoterm_mention.wordidxs[0]
         gene_end = gene_mention.wordidxs[-1]
         hpoterm_end = hpoterm_mention.wordidxs[-1]
         # After sorting the four boundaries, positions 1 and 2 hold the
         # two inner ones; their difference is the distance compared
         # against the cut-off below.
         limits = sorted(
             (gene_start, hpoterm_start, gene_end, hpoterm_end))
         # NOTE(review): `start` is not read in the lines visible here --
         # check whether it is still needed.
         start = limits[0]
         betw_start = limits[1]
         betw_end = limits[2]
         # Drop the pair when the inner boundaries are more than 50 apart
         if betw_end - betw_start > 50:
             continue
         relation = Relation(
             "GENEPHENO", gene_mention, hpoterm_mention)
         # Supervise
         supervise(relation, gene_mention, hpoterm_mention,
                   sentence)
         # Remember the pairs whose relation has a truthy is_correct after
         # supervision
         if relation.is_correct:
             positive_relations.append(
                 (gene_mention, hpoterm_mention))
         # Print the relation's TSV dump to standard output
         print(relation.tsv_dump())
 # Create some artificial negative examples:
 # for each (gene, phenotype) pair that is labelled as a positive
 # example, select one word w in the same sentence that (1) is not a
 # gene mention candidate and (2) is not a phenotype mention
 # candidate, and add (gene, w) and (w, phenotype) as negative examples
 avail_wordidxs = (
     set(line_dict["wordidxs"]) - set(hpoterm_wordidxs)) - \