def add_features_generic(mention_id, pheno_words, sentence):
    """Emit features for a phenotype mention candidate, using ONLY the
    generic ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator.
    ddlib.load_dictionary(BASE_DIR + "/dicts/features/pheno_var.tsv", "VARKW")
    ddlib.load_dictionary(
        BASE_DIR + "/dicts/features/pheno_patient.tsv", "PATIENTKW")
    # Pack the sentence into the (awkward) per-column structure that the
    # ddlib interface requires.
    packed = dict()
    packed['lemma'] = [tok.lemma for tok in sentence.words]
    packed['words'] = [tok.word for tok in sentence.words]
    packed['ner'] = [tok.ner for tok in sentence.words]
    packed['pos'] = [tok.pos for tok in sentence.words]
    packed['dep_graph'] = [
        "\t".join((str(tok.dep_parent + 1), tok.dep_path,
                   str(tok.in_sent_idx + 1)))
        for tok in sentence.words]
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    # Collect into a set first (deduplication), then print each feature.
    for feature in set(
            ddlib.get_generic_features_mention(word_obj_list, span)):
        print_feature(sentence.doc_id, mention_id, feature)
def add_features_generic(relation_id, gene_words, pheno_words, sentence):
    """Emit features for a gene-phenotype relation candidate, using ONLY
    the generic ddlib feature library."""
    # Create the objects used by ddlib. ddlib interface is so ugly.
    obj = dict()
    obj['lemma'] = []
    obj['words'] = []
    obj['ner'] = []
    obj['pos'] = []
    obj['dep_graph'] = []
    for word in sentence.words:
        obj['lemma'].append(word.lemma)
        obj['words'].append(word.word)
        obj['ner'].append(word.ner)
        obj['pos'].append(word.pos)
        obj['dep_graph'].append(
            str(word.dep_parent + 1) + "\t" + word.dep_path + "\t" +
            str(word.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        obj, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # BUG FIX: was 'pheno_words[0].ins_sent_idx' -- a typo that raises
    # AttributeError at runtime ('in_sent_idx' everywhere else).
    pheno_span = ddlib.get_span(pheno_words[0].in_sent_idx, len(pheno_words))
    features = set()
    # BUG FIX: ddlib's relation feature generator is named
    # get_generic_features_relation (plural "features", matching
    # get_generic_features_mention used by the mention extractors);
    # the original called the non-existent get_generic_feature_relation.
    for feature in ddlib.get_generic_features_relation(
            word_obj_list, gene_span, pheno_span):
        features.add(feature)
    for feature in features:
        print_feature(sentence.doc_id, relation_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Emit features for a gene mention candidate, using ONLY the generic
    ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator,
    # loaded in the same order as before.
    for dict_file, dict_name in (
            ("gene_var.tsv", "VARKW"),
            ("gene_knock.tsv", "KNOCKKW"),
            ("gene_amino.tsv", "AMINOKW"),
            ("gene_antigene.tsv", "ANTIGENEKW"),
            ("gene_dna.tsv", "DNAKW"),
            ("gene_downregulation.tsv", "DOWNREGKW"),
            ("gene_upregulation.tsv", "UPREGKW"),
            ("gene_tumor.tsv", "TUMORKW"),
            ("gene_gene.tsv", "GENEKW"),
            ("gene_expression.tsv", "EXPRESSKW")):
        ddlib.load_dictionary(
            BASE_DIR + "/dicts/features/" + dict_file, dict_name)
    # Pack the sentence into the (awkward) per-column structure that the
    # ddlib interface requires.
    packed = {
        'lemma': [tok.lemma for tok in sentence.words],
        'words': [tok.word for tok in sentence.words],
        'ner': [tok.ner for tok in sentence.words],
        'pos': [tok.pos for tok in sentence.words],
        'dep_graph': [
            str(tok.dep_parent + 1) + "\t" + tok.dep_path + "\t" +
            str(tok.in_sent_idx + 1) for tok in sentence.words],
    }
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # Collect into a set first (deduplication), then print each feature.
    for feature in set(ddlib.get_generic_features_mention(
            word_obj_list, gene_span)):
        print_feature(sentence.doc_id, mention_id, feature)
def add_features_generic(mention_id, gene_words, sentence):
    """Emit features for a gene mention candidate, using ONLY the generic
    ddlib feature library."""
    # Keyword dictionaries consulted by the generic feature generator.
    features_dir = BASE_DIR + "/dicts/features/"
    ddlib.load_dictionary(features_dir + "gene_var.tsv", "VARKW")
    ddlib.load_dictionary(features_dir + "gene_knock.tsv", "KNOCKKW")
    ddlib.load_dictionary(features_dir + "gene_amino.tsv", "AMINOKW")
    ddlib.load_dictionary(features_dir + "gene_antigene.tsv", "ANTIGENEKW")
    ddlib.load_dictionary(features_dir + "gene_dna.tsv", "DNAKW")
    ddlib.load_dictionary(features_dir + "gene_downregulation.tsv",
                          "DOWNREGKW")
    ddlib.load_dictionary(features_dir + "gene_upregulation.tsv", "UPREGKW")
    ddlib.load_dictionary(features_dir + "gene_tumor.tsv", "TUMORKW")
    ddlib.load_dictionary(features_dir + "gene_gene.tsv", "GENEKW")
    ddlib.load_dictionary(features_dir + "gene_expression.tsv", "EXPRESSKW")
    # Pack the sentence into the per-column structure that the ddlib
    # interface requires.
    packed = {'lemma': [], 'words': [], 'ner': [], 'pos': [],
              'dep_graph': []}
    for tok in sentence.words:
        packed['lemma'].append(tok.lemma)
        packed['words'].append(tok.word)
        packed['ner'].append(tok.ner)
        packed['pos'].append(tok.pos)
        packed['dep_graph'].append(
            str(tok.dep_parent + 1) + "\t" + tok.dep_path + "\t" +
            str(tok.in_sent_idx + 1))
    word_obj_list = ddlib.unpack_words(
        packed, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph', dep_graph_parser=ddlib.dep_graph_parser_triplet)
    gene_span = ddlib.get_span(gene_words[0].in_sent_idx, len(gene_words))
    # Collect into a set first (deduplication), then print each feature.
    distinct = set(ddlib.get_generic_features_mention(
        word_obj_list, gene_span))
    for feature in distinct:
        print_feature(sentence.doc_id, mention_id, feature)
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a gene mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                # Ignoring "be" comes from pharm (Emily)
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'VERB_[' + minw + ']' + minp)
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in KNOCK_KWS:
                    kw = "_KNOCKOUT"
                elif word2.lemma in ANTIGENE_KWS:
                    kw = "_ANTIGENE"
                elif word2.lemma in AMINO_ACID_KWS:
                    kw = "_AMINOACID"
                # elif word2.lemma in DNA_KWS:
                #     kw = "_DNA"
                elif word2.lemma in DOWNREGULATION_KWS:
                    kw = "_DOWNREGULATION"
                elif word2.lemma in UPREGULATION_KWS:
                    kw = "_UPREGULATION"
                # elif word2.lemma in TUMOR_KWS:
                #     kw = "_TUMOR"
                # elif word2.lemma in GENE_KWS:
                #     kw = "_GENE"
                # elif word2.lemma in COEXPRESSION_KWS:
                #     ke = "_COEXPRESSION"
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
                if len(p) < 100:
                    print_feature(
                        sentence.doc_id, mention_id,
                        "KEYWORD_[" + kw + "]" + p)
    # Special features for the keyword on the shortest dependency path
    if minw:
        print_feature(
            sentence.doc_id, mention_id,
            'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(
            sentence.doc_id, mention_id, 'KEYWORD_MIN_[' + minw + ']')
    # If another gene is present in the sentence, add a feature with that
    # gene and the path to it. This comes from pharm.
    minl = 100
    minp = None
    minw = None
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    for word in mention_words:
        for word2 in sentence.words:
            if word2.in_sent_idx not in mention_wordidxs and \
                    word2.word in merged_genes_dict:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'OTHER_GENE_[' + minw + ']' + minp)
        # print_feature(sentence.doc_id, mention_id,
        #               'OTHER_GENE_['+minw+']')
    # The lemma on the left of the candidate, whatever it is.
    # BUG FIX: the original wrapped this in try/except IndexError, but an
    # index of -1 (mention at sentence start) silently wraps to the LAST
    # word instead of raising -- guard explicitly.
    if mention_words[0].in_sent_idx > 0:
        left = sentence.words[mention_words[0].in_sent_idx - 1].lemma
        try:
            float(left)
            left = "_NUMBER"
        except ValueError:
            pass
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_1_[" + left + "]")
    # The lemma on the right of the candidate, whatever it is
    try:
        right = sentence.words[mention_words[-1].in_sent_idx + 1].lemma
        try:
            float(right)
            right = "_NUMBER"
        except ValueError:
            pass
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_1_[" + right + "]")
    except IndexError:
        pass
    # We now check whether the lemma on the left and on the right are
    # "special", for example a year or a gene.
    # The concept of left or right is a little tricky here, as we are
    # actually looking at the first word that contains only letters and is
    # not a stopword.
    idx = mention_words[0].in_sent_idx - 1
    gene_on_left = None
    gene_on_right = None
    # NOTE(review): 'sentence.words[idx] in merged_genes_dict' tests the
    # Word OBJECT (not its text) against the dict and so is very likely
    # always False; it was probably meant to be '.word'. Kept as-is to
    # preserve behavior -- confirm before changing.
    # BUG FIX below: the number regex had an unescaped '.' which matched
    # any character (e.g. "12a34"); now requires a literal decimal point.
    while idx >= 0 and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx -= 1
    if idx >= 0:
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_left = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_LEFT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    # The word on the right of the mention, if present, provided it's
    # alphanumeric but not a number
    idx = mention_words[-1].in_sent_idx + 1
    while idx < len(sentence.words) and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx += 1
    if idx < len(sentence.words):
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_right = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_RIGHT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    if gene_on_left and gene_on_right:
        print_feature(sentence.doc_id, mention_id, "IS_BETWEEN_GENES")
    elif gene_on_left:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_LEFT")
    elif gene_on_right:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_RIGHT")
    # The candidate is a single word that appears many times (more than 4)
    # in the sentence
    if len(mention_words) == 1 and \
            [w.word for w in sentence.words].count(
                mention_words[0].word) > 4:
        print_feature(
            sentence.doc_id, mention_id, "APPEARS_MANY_TIMES_IN_SENTENCE")
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a gene mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    # The verb closest to the candidate, with the path to it.
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                # Ignoring "be" comes from pharm (Emily)
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'VERB_[' + minw + ']' + minp)
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in KNOCK_KWS:
                    kw = "_KNOCKOUT"
                elif word2.lemma in ANTIGENE_KWS:
                    kw = "_ANTIGENE"
                elif word2.lemma in AMINO_ACID_KWS:
                    kw = "_AMINOACID"
                # elif word2.lemma in DNA_KWS:
                #     kw = "_DNA"
                elif word2.lemma in DOWNREGULATION_KWS:
                    kw = "_DOWNREGULATION"
                elif word2.lemma in UPREGULATION_KWS:
                    kw = "_UPREGULATION"
                # elif word2.lemma in TUMOR_KWS:
                #     kw = "_TUMOR"
                # elif word2.lemma in GENE_KWS:
                #     kw = "_GENE"
                # elif word2.lemma in COEXPRESSION_KWS:
                #     ke = "_COEXPRESSION"
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
                if len(p) < 100:
                    print_feature(sentence.doc_id, mention_id,
                                  "KEYWORD_[" + kw + "]" + p)
    # Special features for the keyword on the shortest dependency path
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(sentence.doc_id, mention_id,
                      'KEYWORD_MIN_[' + minw + ']')
    # If another gene is present in the sentence, add a feature with that
    # gene and the path to it. This comes from pharm.
    minl = 100
    minp = None
    minw = None
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    for word in mention_words:
        for word2 in sentence.words:
            if word2.in_sent_idx not in mention_wordidxs and \
                    word2.word in merged_genes_dict:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(sentence.doc_id, mention_id,
                      'OTHER_GENE_[' + minw + ']' + minp)
        # print_feature(sentence.doc_id, mention_id,
        #               'OTHER_GENE_['+minw+']')
    # The lemma on the left of the candidate, whatever it is.
    # BUG FIX: the original wrapped this in try/except IndexError, but an
    # index of -1 (mention at sentence start) silently wraps to the LAST
    # word instead of raising -- guard explicitly.
    if mention_words[0].in_sent_idx > 0:
        left = sentence.words[mention_words[0].in_sent_idx - 1].lemma
        try:
            float(left)
            left = "_NUMBER"
        except ValueError:
            pass
        print_feature(sentence.doc_id, mention_id,
                      "NGRAM_LEFT_1_[" + left + "]")
    # The lemma on the right of the candidate, whatever it is
    try:
        right = sentence.words[mention_words[-1].in_sent_idx + 1].lemma
        try:
            float(right)
            right = "_NUMBER"
        except ValueError:
            pass
        print_feature(sentence.doc_id, mention_id,
                      "NGRAM_RIGHT_1_[" + right + "]")
    except IndexError:
        pass
    # We now check whether the lemma on the left and on the right are
    # "special", for example a year or a gene.
    # The concept of left or right is a little tricky here, as we are
    # actually looking at the first word that contains only letters and is
    # not a stopword.
    idx = mention_words[0].in_sent_idx - 1
    gene_on_left = None
    gene_on_right = None
    # NOTE(review): 'sentence.words[idx] in merged_genes_dict' tests the
    # Word OBJECT (not its text) against the dict and so is very likely
    # always False; it was probably meant to be '.word'. Kept as-is to
    # preserve behavior -- confirm before changing.
    # BUG FIX below: the number regex had an unescaped '.' which matched
    # any character (e.g. "12a34"); now requires a literal decimal point.
    while idx >= 0 and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx -= 1
    if idx >= 0:
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_left = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_LEFT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    # The word on the right of the mention, if present, provided it's
    # alphanumeric but not a number
    idx = mention_words[-1].in_sent_idx + 1
    while idx < len(sentence.words) and \
            ((((not sentence.words[idx].lemma.isalnum() and not
                sentence.words[idx] in merged_genes_dict) or
               (not sentence.words[idx].word.isupper() and
                sentence.words[idx].lemma in stopwords_dict)) and
              not re.match(r"^[0-9]+(\.[0-9]+)?$",
                           sentence.words[idx].word) and
              not sentence.words[idx] in merged_genes_dict) or
             len(sentence.words[idx].lemma) == 1):
        idx += 1
    if idx < len(sentence.words):
        if sentence.words[idx].word in merged_genes_dict and \
                len(sentence.words[idx].word) > 3:
            gene_on_right = sentence.words[idx].word
        try:
            year = float(sentence.words[idx].word)
            if round(year) == year and year > 1950 and year <= 2014:
                print_feature(sentence.doc_id, mention_id, "IS_YEAR_RIGHT")
        except ValueError:  # narrowed from bare 'except:'
            pass
    if gene_on_left and gene_on_right:
        print_feature(sentence.doc_id, mention_id, "IS_BETWEEN_GENES")
    elif gene_on_left:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_LEFT")
    elif gene_on_right:
        print_feature(sentence.doc_id, mention_id, "GENE_ON_RIGHT")
    # The candidate is a single word that appears many times (more than 4)
    # in the sentence
    if len(mention_words) == 1 and \
            [w.word for w in sentence.words].count(
                mention_words[0].word) > 4:
        print_feature(sentence.doc_id, mention_id,
                      "APPEARS_MANY_TIMES_IN_SENTENCE")
def add_features(mention_id, mention_words, sentence):
    """Emit hand-crafted features for a phenotype mention candidate.

    Features are written out via print_feature(doc_id, mention_id, name).
    """
    mention_wordidxs = []
    for word in mention_words:
        mention_wordidxs.append(word.in_sent_idx)
    # The first alphanumeric lemma on the left of the mention, if present.
    idx = mention_words[0].in_sent_idx - 1
    left_lemma_idx = -1
    left_lemma = ""
    while idx >= 0 and not sentence.words[idx].word.isalnum():
        idx -= 1
    # BUG FIX: the original used try/except IndexError here, but idx == -1
    # silently wraps to the LAST word of the sentence instead of raising;
    # guard explicitly.
    if idx >= 0:
        left_lemma = sentence.words[idx].lemma
        try:
            float(left_lemma)
            left_lemma = "_NUMBER"
        except ValueError:
            pass
        left_lemma_idx = idx
        print_feature(
            sentence.doc_id, mention_id,
            "NGRAM_LEFT_1_[{}]".format(left_lemma))
    # The first alphanumeric lemma on the right of the mention, if present.
    idx = mention_wordidxs[-1] + 1
    right_lemma_idx = -1
    right_lemma = ""
    while idx < len(sentence.words) and \
            not sentence.words[idx].word.isalnum():
        idx += 1
    if idx < len(sentence.words):
        right_lemma = sentence.words[idx].lemma
        try:
            float(right_lemma)
            right_lemma = "_NUMBER"
        except ValueError:
            pass
        right_lemma_idx = idx
        print_feature(
            sentence.doc_id, mention_id,
            "NGRAM_RIGHT_1_[{}]".format(right_lemma))
    # The lemma "two on the left" of the mention, if present.
    # BUG FIX: when no left neighbor was found (sentinel -1) or it was the
    # first word, 'left_lemma_idx - 1' wrapped around via negative
    # indexing; guard instead of relying on IndexError.
    if left_lemma_idx > 0:
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_2_[{}]".format(
                sentence.words[left_lemma_idx - 1].lemma))
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_LEFT_2_C_[{} {}]".format(
                sentence.words[left_lemma_idx - 1].lemma, left_lemma))
    # The lemma "two on the right" of the mention, if present.
    # BUG FIX: when no right neighbor was found (sentinel -1), the original
    # printed sentence.words[0]; require a found neighbor with a word
    # after it.
    if right_lemma_idx >= 0 and right_lemma_idx + 1 < len(sentence.words):
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_2_[{}]".format(
                sentence.words[right_lemma_idx + 1].lemma))
        print_feature(
            sentence.doc_id, mention_id, "NGRAM_RIGHT_2_C_[{} {}]".format(
                right_lemma, sentence.words[right_lemma_idx + 1].lemma))
    # The keywords that appear in the sentence with the mention
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.lemma in KEYWORDS:
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                kw = word2.lemma
                if word2.lemma in PATIENT_KWS:
                    kw = "_HUMAN"
                print_feature(
                    sentence.doc_id, mention_id, "KEYWORD_[" + kw + "]" + p)
                if l < minl:
                    minl = l
                    minp = p
                    minw = kw
    # Special feature for the keyword on the shortest dependency path
    if minw:
        print_feature(
            sentence.doc_id, mention_id,
            'EXT_KEYWORD_MIN_[' + minw + ']' + minp)
        print_feature(
            sentence.doc_id, mention_id, 'KEYWORD_MIN_[' + minw + ']')
    # The verb closest to the candidate
    minl = 100
    minp = None
    minw = None
    for word in mention_words:
        for word2 in sentence.words:
            if word2.word.isalpha() and re.search('^VB[A-Z]*$', word2.pos) \
                    and word2.lemma != 'be':
                (p, l) = sentence.get_word_dep_path(
                    word.in_sent_idx, word2.in_sent_idx)
                if l < minl:
                    minl = l
                    minp = p
                    minw = word2.lemma
    if minw:
        print_feature(
            sentence.doc_id, mention_id, 'VERB_[' + minw + ']' + minp)
def add_features(relation_id, gene_words, pheno_words, sentence):
    """Emit hand-crafted features for a gene-phenotype relation candidate.

    Features are written out via print_feature(doc_id, relation_id, name);
    features get an "INV_" prefix when the phenotype precedes the gene.
    """
    # Find the start/end indices of the mentions composing the relation
    gene_start = gene_words[0].in_sent_idx
    pheno_start = pheno_words[0].in_sent_idx
    gene_end = gene_words[-1].in_sent_idx
    pheno_end = pheno_words[-1].in_sent_idx
    limits = sorted((gene_start, pheno_start, gene_end, pheno_end))
    start = limits[0]
    betw_start = limits[1]
    betw_end = limits[2]
    end = limits[3]
    # If the gene comes first, we do not prefix, otherwise we do.
    if start == gene_start:
        inv = ""
    else:
        inv = "INV_"
    # Verbs between the mentions
    verbs_between = []
    minl_gene = 100
    minp_gene = None
    minw_gene = None
    mini_gene = None
    minl_pheno = 100
    # minp_pheno = None
    minw_pheno = None
    mini_pheno = None
    neg_found = False
    # Look at all the words, as in the dependency path there could be words
    # that are close to both mentions but not between them
    for i in range(len(sentence.words)):
        # The filtering of the brackets and commas is from Emily's code.
        if re.search('^VB[A-Z]*$', sentence.words[i].pos) and \
                sentence.words[i].word not in \
                ["{", "}", "(", ")", "[", "]"] and \
                "," not in sentence.words[i].word:
            (p_gene, l_gene) = sentence.get_word_dep_path(
                betw_start, sentence.words[i].in_sent_idx)
            (p_pheno, l_pheno) = sentence.get_word_dep_path(
                sentence.words[i].in_sent_idx, betw_end)
            if l_gene < minl_gene:
                minl_gene = l_gene
                minp_gene = p_gene
                minw_gene = sentence.words[i].lemma
                mini_gene = sentence.words[i].in_sent_idx
            if l_pheno < minl_pheno:
                minl_pheno = l_pheno
                # minp_pheno = p_pheno
                minw_pheno = sentence.words[i].lemma
                mini_pheno = sentence.words[i].in_sent_idx
            # Look for negation.
            if i > 0 and sentence.words[i - 1].lemma in \
                    ["no", "not", "neither", "nor"]:
                if i < betw_end - 2:
                    neg_found = True
                    # BUG FIX: this call was missing the sentence.doc_id
                    # argument that every other print_feature call passes.
                    print_feature(
                        sentence.doc_id, relation_id,
                        inv + "NEG_VERB_[" + sentence.words[i - 1].word +
                        "]-" + sentence.words[i].lemma)
            else:
                verbs_between.append(sentence.words[i])
    if len(verbs_between) == 1 and not neg_found:
        print_feature(sentence.doc_id, relation_id,
                      inv + "SINGLE_VERB_[%s]" % verbs_between[0].lemma)
    else:
        for verb in verbs_between:
            if verb.in_sent_idx > betw_start and \
                    verb.in_sent_idx < betw_end:
                print_feature(sentence.doc_id, relation_id,
                              inv + "VERB_[%s]" % verb.lemma)
    if mini_pheno == mini_gene and mini_gene is not None and \
            len(minp_gene) < 50:  # and "," not in minw_gene:
        # feature = inv + 'MIN_VERB_[' + minw_gene + ']' + minp_gene
        # features.append(feature)
        feature = inv + 'MIN_VERB_[' + minw_gene + ']'
        print_feature(sentence.doc_id, relation_id, feature)
    else:
        feature = inv
        if mini_gene is not None:
            # feature = 'MIN_VERB_GENE_[' + minw_gene + ']' + minp_gene
            # print_feature(sentence.doc_id, relation_id, feature)
            feature += 'MIN_VERB_GENE_[' + minw_gene + ']'
        else:
            feature += 'MIN_VERB_GENE_[NULL]'
        if mini_pheno is not None:
            # feature = 'MIN_VERB_pheno_[' + minw_pheno + ']' + minp_pheno
            # print_feature(sentence.doc_id, relation_id, feature)
            feature += '_pheno_[' + minw_pheno + ']'
        else:
            feature += '_pheno_[NULL]'
        print_feature(sentence.doc_id, relation_id, feature)
    # The following features are only added if the two mentions are "close
    # enough" to avoid overfitting. The concept of "close enough" is
    # somewhat arbitrary.
    neg_word_index = -1
    if betw_end - betw_start - 1 < 8:
        for i in range(betw_start + 1, betw_end):
            # Feature for separation between entities.
            # TODO Think about merging these?
            # I think these should be some kind of supervision rule instead?
            if "while" == sentence.words[i].lemma:
                print_feature(sentence.doc_id, relation_id, "SEP_BY_[while]")
            if "whereas" == sentence.words[i].lemma:
                print_feature(sentence.doc_id, relation_id,
                              "SEP_BY_[whereas]")
            if sentence.words[i].lemma in ["no", "not", "neither", "nor"]:
                neg_word_index = i
        # Features for the negative words
        # TODO: We would probably need distant supervision for these
        if neg_word_index > -1:
            gene_p = None
            gene_l = 100
            for word in sentence.words[gene_start:gene_end + 1]:
                (p, l) = sentence.get_word_dep_path(word.in_sent_idx,
                                                    neg_word_index)
                if l < gene_l:
                    gene_p = p
                    gene_l = l
            if gene_p:
                print_feature(sentence.doc_id, relation_id,
                              inv + "NEG_[" + gene_p + "]")
            # pheno_p = None
            # pheno_l = 100
            # for word in sentence.words[pheno_start:pheno_end+1]:
            #     p = sentence.get_word_dep_path(
            #         word.in_sent_idx, neg_word_index)
            #     if len(p) < pheno_l:
            #         pheno_p = p
            #         pheno_l = len(p)
            # if pheno_p:
            #     print_feature(
            #         relation_id, inv + "pheno_TO_NEG_[" + pheno_p + "]")
        # The sequence of lemmas between the two mentions and the sequence
        # of lemmas between the two mentions but using the NERs, if
        # present, and the sequence of POSes between the mentions
        seq_list_ners = []
        seq_list_lemmas = []
        seq_list_poses = []
        for word in sentence.words[betw_start + 1:betw_end]:
            if word.ner != "O":
                seq_list_ners.append(word.ner)
            else:
                seq_list_ners.append(word.lemma)
            seq_list_lemmas.append(word.lemma)
            seq_list_poses.append(word.pos)
        seq_ners = " ".join(seq_list_ners)
        seq_lemmas = " ".join(seq_list_lemmas)
        seq_poses = "_".join(seq_list_poses)
        print_feature(sentence.doc_id, relation_id,
                      inv + "WORD_SEQ_[" + seq_lemmas + "]")
        print_feature(sentence.doc_id, relation_id,
                      inv + "WORD_SEQ_NER_[" + seq_ners + "]")
        print_feature(sentence.doc_id, relation_id,
                      inv + "POS_SEQ_[" + seq_poses + "]")
        # Shortest dependency path between the two mentions
        (path, length) = sentence.dep_path(gene_words, pheno_words)
        print_feature(sentence.doc_id, relation_id,
                      inv + "DEP_PATH_[" + path + "]")
    # Number of words between the mentions
    # TODO I think this should be some kind of supervision rule instead?
    # print_feature(sentence.doc_id, relation_id,
    #     inv + "WORD_SEQ_LEN_[" + str(betw_end - betw_start - 1) + "]")
    # 2-gram between the mentions
    # BUG FIX: the upper bound was written 'betw_start - betw_end - 1 < 15'
    # (always true, since betw_start <= betw_end); the intended window is
    # at most 15 words between the mentions.
    if betw_end - betw_start - 1 > 4 and betw_end - betw_start - 1 < 15:
        for i in range(betw_start + 1, betw_end - 1):
            print_feature(
                sentence.doc_id, relation_id,
                "BETW_2_GRAM_[" + sentence.words[i].lemma + "_" +
                sentence.words[i + 1].lemma + "]")
    # Lemmas on the exterior of the mentions and on the interior
    feature = inv
    if start > 0:
        feature += "EXT_NGRAM_[" + sentence.words[start - 1].lemma + "]"
    else:
        feature += "EXT_NGRAM_[NULL]"
    if end < len(sentence.words) - 1:
        feature += "_[" + sentence.words[end + 1].lemma + "]"
    else:
        feature += "_[NULL]"
    print_feature(sentence.doc_id, relation_id, feature)
    feature = inv + "INT_NGRAM_[" + sentence.words[betw_start + 1].lemma + \
        "]" + "_[" + sentence.words[betw_end - 1].lemma + "]"
    print_feature(sentence.doc_id, relation_id, feature)