relation.add_feature("GENE_2_NGRAM_RIGHT_1_[" +
                             sentence.words[gene_2_end + 1].lemma + "]")


if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("bounding_boxes", TSVstring2list),
        ("gene_1_entity", no_op),
        ("gene_1_wordidxs", _int_list),
        ("gene_1_is_correct", TSVstring2bool),
        ("gene_1_type", no_op),
        ("gene_2_entity", no_op),
        ("gene_2_wordidxs", _int_list),
        ("gene_2_is_correct", TSVstring2bool),
        ("gene_2_type", no_op),
    )
    # line_dict keys passed to Sentence, in positional-argument order
    _SENTENCE_KEYS = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
        "dep_paths", "dep_parents", "bounding_boxes",
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Build the sentence object in which the two mentions appear
            sentence = Sentence(*[line_dict[key] for key in _SENTENCE_KEYS])
            # Create the mentions
            gene_1_mention = Mention(
    # There are many PERSONs/ORGANIZATIONs/LOCATIONs in the sentence
    # for ner in ["PERSON", "ORGANIZATION", "LOCATION"]:
    #    if [x.ner for x in sentence.words].count(ner) > 4:
    #        print_feature(
    #           sentence.doc_id, mention_id, "MANY_{}_IN_SENTENCE".format(ner))


if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("mention_id", no_op),
        ("mention_wordidxs", _int_list),
    )
    # line_dict keys passed to Sentence ahead of the placeholder list
    _SENTENCE_KEYS = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
        "dep_paths", "dep_parents",
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # The last Sentence argument is not part of the input, so a list
            # of None with one entry per word index stands in for it
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                *([line_dict[key] for key in _SENTENCE_KEYS] + [null_list]))
            # Nothing is done for sentences flagged as weird
            if sentence.is_weird():
                continue
            # Collect the sentence's word objects at the mention's indexes
            mention_words = []
            for mention_wordidx in line_dict["mention_wordidxs"]:
                mention_words.append(sentence.words[mention_wordidx])
import fileinput

from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")

    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("gene", no_op),
    )
    # line_dict keys passed to Sentence ahead of the placeholder lists
    _SENTENCE_KEYS = ("doc_id", "sent_id", "wordidxs", "words")

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Only the first four Sentence arguments come from the input;
            # the remaining six all receive the same list of None
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                *([line_dict[key] for key in _SENTENCE_KEYS]
                  + [null_list] * 6))
            # The 'labelled' gene that we know is in the sentence, replaced
            # by its merged_genes_dict entry when it has one and otherwise
            # wrapped in a one-element list
            gene = line_dict["gene"]
            gene = (merged_genes_dict[gene] if gene in merged_genes_dict
                    else [gene])
            # Skip sentences that are "( GENE )", as they give no info about
    # for ner in ["PERSON", "ORGANIZATION", "LOCATION"]:
    #    if [x.ner for x in sentence.words].count(ner) > 4:
    #        print_feature(
    #           sentence.doc_id, mention_id, "MANY_{}_IN_SENTENCE".format(ner))


if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("mention_id", no_op),
        ("mention_wordidxs", _int_list),
    )
    # line_dict keys passed to Sentence ahead of the placeholder list
    _SENTENCE_KEYS = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
        "dep_paths", "dep_parents",
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # The last Sentence argument is not part of the input, so a list
            # of None with one entry per word index stands in for it
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                *([line_dict[key] for key in _SENTENCE_KEYS] + [null_list]))
            # Nothing is done for sentences flagged as weird
            if sentence.is_weird():
                continue
from dstruct.Sentence import Sentence
from extract_gene_mentions import extract, add_features
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")

    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("bounding_boxes", TSVstring2list),
        ("gene", no_op),
    )
    # line_dict keys passed to Sentence, in positional-argument order
    _SENTENCE_KEYS = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
        "dep_paths", "dep_parents", "bounding_boxes",
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Build the Sentence object from the sentence-level columns
            sentence = Sentence(*[line_dict[key] for key in _SENTENCE_KEYS])
            # The 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
            # Get the main symbol (or list of symbols) for the labelled gene
            if gene in merged_genes_dict:
                gene = merged_genes_dict[gene]
            else:
        relation.add_feature("GENE_2_NGRAM_RIGHT_1_[" +
            sentence.words[gene_2_end+1].lemma + "]")


if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("bounding_boxes", TSVstring2list),
        ("gene_1_entity", no_op),
        ("gene_1_wordidxs", _int_list),
        ("gene_1_is_correct", TSVstring2bool),
        ("gene_1_type", no_op),
        ("gene_2_entity", no_op),
        ("gene_2_wordidxs", _int_list),
        ("gene_2_is_correct", TSVstring2bool),
        ("gene_2_type", no_op),
    )
    # line_dict keys passed to Sentence, in positional-argument order
    _SENTENCE_KEYS = (
        "doc_id", "sent_id", "wordidxs", "words", "poses", "ners", "lemmas",
        "dep_paths", "dep_parents", "bounding_boxes",
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Build the sentence object in which the two mentions appear
            sentence = Sentence(*[line_dict[key] for key in _SENTENCE_KEYS])
            # Create the mentions
            gene_1_mention = Mention(
                "GENE", line_dict["gene_1_entity"],
# Load the gene<->hpoterm dictionary
genehpoterms_dict = load_dict("genehpoterms")

if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    def _marker_list(field):
        """Call TSVstring2list on `field` with sep="!~!"."""
        return TSVstring2list(field, sep="!~!")

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        # these are for the sentence
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
        ("dep_paths", TSVstring2list),
        ("dep_parents", _int_list),
        ("bounding_boxes", TSVstring2list),
        # these are for the genes
        ("gene_entities", TSVstring2list),
        ("gene_wordidxss", _marker_list),
        ("gene_is_corrects", TSVstring2list),
        ("gene_types", TSVstring2list),
        # these are for the HPO
        ("hpoterm_entities", TSVstring2list),
        ("hpoterm_wordidxss", _marker_list),
        ("hpoterm_is_corrects", TSVstring2list),
        ("hpoterm_types", TSVstring2list),
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Remove the genes that are unsupervised copies or duplicates:
            # start from two empty index sets
            supervised_idxs, unsupervised_idxs = set(), set()
            for i in range(len(line_dict["gene_is_corrects"])):
                if line_dict["gene_is_corrects"][i] == "n":
                    unsupervised_idxs.add(i)
                else:
                    if line_dict["gene_types"][i] != "GENE_SUP_contr_2":
# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    def _marker_list(field):
        """Call TSVstring2list on `field` with sep='!~!'."""
        return TSVstring2list(field, sep='!~!')

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline. All columns after sent_ids
    # go through the '!~!' split.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_ids", _int_list),
        ("wordidxss", _marker_list),
        ("wordss", _marker_list),
        ("posess", _marker_list),
        ("nerss", _marker_list),
        ("lemmass", _marker_list),
        ("dep_pathss", _marker_list),
        ("dep_parentss", _marker_list),
        ("bounding_boxess", _marker_list),
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Acronyms defined in the document
            acronyms = {}
            # One pass per sentence id; each per-sentence entry is itself
            # run through TSVstring2list
            for idx in range(len(line_dict["sent_ids"])):
                wordidxs = _int_list(line_dict["wordidxss"][idx])
                words = TSVstring2list(line_dict["wordss"][idx])
                poses = TSVstring2list(line_dict["posess"][idx])
                ners = TSVstring2list(line_dict["nerss"][idx])
                lemmas = TSVstring2list(line_dict["lemmass"][idx])
                dep_paths = TSVstring2list(line_dict["dep_pathss"][idx])
Beispiel #9
0
 # Parse the TSV line. get_dict_from_TSVline receives two parallel lists:
 # the column names and, at matching positions, one callable per column
 # (presumably applied to the raw field -- confirm against the helper).
 # The result is stored in line_dict.
 line_dict = get_dict_from_TSVline(
     line,
     [
         "doc_id", "sent_id", "wordidxs", "words", "poses", "ners",
         "lemmas", "dep_paths", "dep_parents", "bounding_boxes",
         "gene_entities", "gene_wordidxss", "gene_is_corrects",
         "gene_types", "hpoterm_entities", "hpoterm_wordidxss",
         "hpoterm_is_corrects", "hpoterm_types"
     ],
     [
         no_op,  # doc_id
         int,  # sent_id
         lambda x: TSVstring2list(x, int),  # wordidxs
         TSVstring2list,  # words
         TSVstring2list,  # poses
         TSVstring2list,  # ners
         TSVstring2list,  # lemmas
         TSVstring2list,  # dep_paths
         lambda x: TSVstring2list(x, int),  # dep_parents
         TSVstring2list,  # bounding_boxes; these are for the sentence
         TSVstring2list,  # gene_entities
         # gene_wordidxss: the only gene column given sep="!~!"
         lambda x: TSVstring2list(x, sep="!~!"),
         TSVstring2list,  # gene_is_corrects
         TSVstring2list,  # gene_types; these are for the genes
         TSVstring2list,  # hpoterm_entities
         # hpoterm_wordidxss: the only HPO column given sep="!~!"
         lambda x: TSVstring2list(x, sep="!~!"),
         TSVstring2list,  # hpoterm_is_corrects
         TSVstring2list,  # hpoterm_types; these are for the HPO
     ])
 # Remove the genes that are unsupervised copies or duplicates
Beispiel #10
0
from dstruct.Sentence import Sentence
from extract_gene_mentions import extract
from helper.easierlife import get_dict_from_TSVline, TSVstring2list, no_op
from helper.dictionaries import load_dict

if __name__ == "__main__":
    # Load the merged genes dictionary
    merged_genes_dict = load_dict("merged_genes")

    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("gene", no_op),
    )
    # line_dict keys passed to Sentence ahead of the placeholder lists
    _SENTENCE_KEYS = ("doc_id", "sent_id", "wordidxs", "words")

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Only the first four Sentence arguments come from the input;
            # the remaining six all receive the same list of None
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                *([line_dict[key] for key in _SENTENCE_KEYS]
                  + [null_list] * 6))
            # The 'labelled' gene that we know is in the sentence
            gene = line_dict["gene"]
            # Get the main symbol (or list of symbols) for the labelled gene
            if gene in merged_genes_dict:
                gene = merged_genes_dict[gene]
            mentions.append(mention)
            # Add indexes to history so that they are not used for another
            # mention
            for i in range(start, end):
                history.add(i)
    return mentions


if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_id", int),
        ("wordidxs", _int_list),
        ("words", TSVstring2list),
        ("poses", TSVstring2list),
        ("ners", TSVstring2list),
        ("lemmas", TSVstring2list),
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Every input column feeds Sentence in order; its last three
            # arguments all receive the same list of None
            null_list = [None] * len(line_dict["wordidxs"])
            sentence = Sentence(
                *([line_dict[column] for column, _ in _COLUMNS]
                  + [null_list] * 3))
            # Skip weird sentences
            if sentence.is_weird():
                continue
            # Candidate mentions found in this sentence
            mentions = extract(sentence)
            # Supervise them
Beispiel #12
0
# Load the genes dictionary
merged_genes_dict = load_dict("merged_genes")
inverted_long_names = load_dict("inverted_long_names")

if __name__ == "__main__":
    def _int_list(field):
        """Call TSVstring2list on `field`, passing int as second argument."""
        return TSVstring2list(field, int)

    def _marker_list(field):
        """Call TSVstring2list on `field` with sep='!~!'."""
        return TSVstring2list(field, sep='!~!')

    # Every input column next to the callable given for it, in the order
    # both are handed to get_dict_from_TSVline. All columns after sent_ids
    # go through the '!~!' split.
    _COLUMNS = (
        ("doc_id", no_op),
        ("sent_ids", _int_list),
        ("wordidxss", _marker_list),
        ("wordss", _marker_list),
        ("posess", _marker_list),
        ("nerss", _marker_list),
        ("lemmass", _marker_list),
        ("dep_pathss", _marker_list),
        ("dep_parentss", _marker_list),
        ("bounding_boxess", _marker_list),
    )

    # Read every line of the input files (or stdin)
    with fileinput.input() as input_files:
        for line in input_files:
            # Split the TSV line into a dictionary indexed by column name;
            # fresh lists are built for each call
            line_dict = get_dict_from_TSVline(
                line,
                [column for column, _ in _COLUMNS],
                [parser for _, parser in _COLUMNS])
            # Acronyms defined in the document
            acronyms = {}
            # One pass per sentence id; each per-sentence entry is itself
            # run through TSVstring2list
            for idx in range(len(line_dict["sent_ids"])):
                wordidxs = _int_list(line_dict["wordidxss"][idx])
                words = TSVstring2list(line_dict["wordss"][idx])
                poses = TSVstring2list(line_dict["posess"][idx])
                ners = TSVstring2list(line_dict["nerss"][idx])
                lemmas = TSVstring2list(line_dict["lemmass"][idx])
                dep_paths = TSVstring2list(line_dict["dep_pathss"][idx])