Example #1
def get_features_for_candidate(row):
    """Extract features for candidate mention- both generic ones from ddlib & custom features"""
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                relation_id=row.relation_id,
                name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    gene_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                           length=len(row.gene_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    for feat in ddlib.get_generic_features_relation(dds, gene_span,
                                                    pheno_span):
        if take_feature(feat):
            features.append(f._replace(name=feat))
    features.extend(
        [f._replace(name=feat) for feat in get_custom_features(row, dds)])
    # These start-of-sentence features seemed to hurt performance (unclear why), so they are disabled:
    # start_span = ddlib.Span(begin_word_id=0, length=4)
    # for feat in ddlib.get_generic_features_mention(dds, start_span, length_bin_size=2):
    #     features.append(f._replace(name='START_SENT_%s' % feat))
    # With these custom features, precision drops slightly while recall improves slightly (surprisingly):
    # features += [f._replace(name=feat) for feat in create_ners_between(row.gene_wordidxs, row.pheno_wordidxs, row.ners)]
    return features
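Example #1 relies on two helpers that are not shown: a Feature record and a take_feature filter. The f._replace(name=...) calls imply a namedtuple with exactly these fields; the sketch below is a hypothetical reconstruction, not the project's actual code.

from collections import namedtuple

# Hypothetical reconstruction of the helpers Example #1 assumes; the real
# project defines these in its own util modules.
Feature = namedtuple('Feature', ['doc_id', 'section_id', 'relation_id', 'name'])

def take_feature(feat):
    """Placeholder filter that accepts every generated feature name.
    The original presumably drops noisy or overly sparse feature classes here."""
    return True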
Example #2
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents, wordidxs, relation_id, wordidxs_1, wordidxs_2):
  try:
    import ddlib
  except ImportError:
    import os
    DD_HOME = os.environ['DEEPDIVE_HOME']
    from sys import path
    path.append('%s/ddlib' % DD_HOME)
    import ddlib

  # Pack the per-token columns into the dict format expected by ddlib.unpack_words
  obj = dict()
  obj['lemma'] = []
  obj['words'] = []
  obj['ner'] = []
  obj['pos'] = []
  obj['dep_graph'] = []
  for i in range(len(words)):
      obj['lemma'].append(lemmas[i])
      obj['words'].append(words[i])
      obj['ner'].append(ners[i])
      obj['pos'].append(poses[i])
      obj['dep_graph'].append(
          str(int(dep_parents[i])) + "\t" + dep_paths[i] + "\t" + str(i))
  word_obj_list = ddlib.unpack_words(
      obj, lemma='lemma', pos='pos', ner='ner', words='words', dep_graph='dep_graph')
  gene_span = ddlib.get_span(wordidxs_1[0], len(wordidxs_1))
  pheno_span = ddlib.get_span(wordidxs_2[0], len(wordidxs_2))
  features = set()
  for feature in ddlib.get_generic_features_relation(word_obj_list, gene_span, pheno_span):
    features.add(feature)
  for feature in features:
    yield doc_id, relation_id, feature
Example #3
def extract(
    p_id="text",
    e_id="text",
    p_begin_index="int",
    p_end_index="int",
    e_begin_index="int",
    e_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the employment relation.
    """
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(
        os.path.abspath("../../../job_no_employ_keyword.txt"),
        dict_id="no_employment")
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p_span = ddlib.Span(begin_word_id=p_begin_index,
                        length=(p_end_index - p_begin_index + 1))
    e_span = ddlib.Span(begin_word_id=e_begin_index,
                        length=(e_end_index - e_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p_span, e_span):
        yield [p_id, e_id, feature]
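The extract functions in this listing follow DeepDive's Python UDF convention: the keyword-argument defaults declare the input column types, and the function yields output rows. In DeepDive's tutorials such a function is wired up roughly as below (a sketch assuming the 0.8-era deepdive helper module; decorator names as in the spouse-example tutorial):

from deepdive import tsv_extractor, returns  # DeepDive's UDF harness helpers
import ddlib

@tsv_extractor  # parses each TSV row from stdin into the typed arguments below
@returns(lambda p1_id="text", p2_id="text", feature="text": [])  # declares the output columns
def extract(p1_id="text", p2_id="text", tokens="text[]"):
    # Minimal body just to show the harness shape; the real extractors above
    # build a ddlib sentence and yield generic relation features instead.
    for t in tokens:
        yield [p1_id, p2_id, "TOKEN_" + t]  # illustrative feature names only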
Example #4
def get_features_for_candidate(row):
    """Extract features for candidate mention- both generic ones from ddlib & custom features"""
    features = []
    f = Feature(doc_id=row.doc_id,
                section_id=row.section_id,
                relation_id=row.relation_id,
                name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    genevar_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                              length=len(row.genevar_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    features += [f._replace(name=feat)
                 for feat in ddlib.get_generic_features_relation(
                     dds, genevar_span, pheno_span)]
    return features
Example #5
def extract(
        chemical_id             = "text",
        disease_id              = "text",
        chemical_begin_index    = "int",
        chemical_end_index      = "int",
        disease_begin_index     = "int",
        disease_end_index       = "int",
        doc_id                  = "text",
        sent_index              = "int",
        tokens                  = "text[]",
        lemmas                  = "text[]",
        pos_tags                = "text[]",
        ner_tags                = "text[]",
        my_ner_tags             = "text[]",
        my_ner_tags_token_ids   = "int[]",
        dep_types               = "text[]",
        dep_parents             = "int[]",
    ):
    """
    Uses DDLIB to generate features for the chemical-disease relation candidates.
    """

    # creates a dictionary of tags from the sparse my_ner_tags array
    my_ner_tags_dict = { i:tag for i,tag in zip(my_ner_tags_token_ids, my_ner_tags) }

    sent = []
    for i,t in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=t,
            lemma=lemmas[i],
            pos=pos_tags[i],
            # replace NER tag if one is found for that token in my_ner_tags:
            ner=my_ner_tags_dict[i] if i in my_ner_tags_dict else ner_tags[i],
            dep_par=dep_parents[i] - 1,  # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
            dep_label=dep_types[i]))

    # Create DDLIB Spans for the chemical and disease mentions
    chemical_span = ddlib.Span(begin_word_id=chemical_begin_index, length=(chemical_end_index-chemical_begin_index+1))
    disease_span = ddlib.Span(begin_word_id=disease_begin_index, length=(disease_end_index-disease_begin_index+1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, chemical_span, disease_span):
        yield [chemical_id, disease_id, feature]
Example #6
def extract(
    gene_id="text",
    variation_id="text",
    gene_begin_index="int",
    gene_end_index="int",
    var_begin_index="int",
    var_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the gene-variation relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the gene and variation mentions
    # (the end indices here appear to be exclusive, hence no +1 as in the other examples)
    gene_span = ddlib.Span(begin_word_id=gene_begin_index,
                           length=gene_end_index - gene_begin_index)
    variation_span = ddlib.Span(begin_word_id=var_begin_index,
                                length=var_end_index - var_begin_index)

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, gene_span,
                                                       variation_span):
        yield [gene_id, variation_id, feature]
Example #7
def extract(
    p1_id="text",
    p2_id="text",
    p1_begin_index="int",
    p1_end_index="int",
    p2_begin_index="int",
    p2_end_index="int",
    doc_id="text",
    sent_index="int",
    tokens="text[]",
    lemmas="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
    dep_types="text[]",
    dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the MED-ARD relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p1_span = ddlib.Span(begin_word_id=p1_begin_index,
                         length=(p1_end_index - p1_begin_index + 1))
    p2_span = ddlib.Span(begin_word_id=p2_begin_index,
                         length=(p2_end_index - p2_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span):
        yield [p1_id, p2_id, feature]
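A recurring detail in these extractors is the span-length arithmetic: with inclusive end indices, a mention covering tokens begin..end has length end - begin + 1 (Example #6 appears to be the exception, using exclusive end indices). A one-line sanity check:

begin_index, end_index = 3, 5             # a mention covering tokens 3, 4, 5 (end inclusive)
assert end_index - begin_index + 1 == 3   # inclusive-end convention used by the other examples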
Example #8
def extract(S_id="text",
            O_id="text",
            S_begin_index="int",
            S_end_index="int",
            O_begin_index="int",
            O_end_index="int",
            sent_id="text",
            tokens="text[]",
            pos_tags="text[]",
            ner_tags="text[]",
            dep_types="text[]",
            dep_tokens="int[]"):
    """
    Uses DDLIB to generate features for the S-O relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    if len(tokens) != len(pos_tags):
        # Warn about token/POS length mismatches (sys is imported at module scope, not shown)
        print('===>>>', sent_id, len(tokens), len(pos_tags), file=sys.stderr)
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],  # no lemma column in this schema; the raw token stands in
                pos=pos_tags[i],
                ner=ner_tags[i],
                dep_par=dep_tokens[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the S and O mentions
    S_span = ddlib.Span(begin_word_id=S_begin_index,
                        length=(S_end_index - S_begin_index + 1))
    O_span = ddlib.Span(begin_word_id=O_begin_index,
                        length=(O_end_index - O_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, S_span, O_span):
        yield [S_id, O_id, feature]
Example #9
def extract(
        p1_id          = "text",
        p2_id          = "text",
        p1_begin_index = "int",
        p1_end_index   = "int",
        p2_begin_index = "int",
        p2_end_index   = "int",
        doc_id         = "text",
        sent_index     = "int",
        tokens         = "text[]",
        lemmas         = "text[]",
        pos_tags       = "text[]",
        ner_tags       = "text[]",
        dep_types      = "text[]",
        dep_parents    = "int[]",
    ):
    """
    Uses DDLIB to generate features for the spouse relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i,t in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=t,
            lemma=lemmas[i],
            pos=pos_tags[i],
            ner=ner_tags[i],
            dep_par=dep_parents[i] - 1,  # CoreNLP stores ROOT as 0; DDLIB expects -1
            dep_label=dep_types[i]))

    # Create DDLIB Spans for the two person mentions
    p1_span = ddlib.Span(begin_word_id=p1_begin_index, length=(p1_end_index-p1_begin_index+1))
    p2_span = ddlib.Span(begin_word_id=p2_begin_index, length=(p2_end_index-p2_begin_index+1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span):
        yield [p1_id, p2_id, feature]
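For readers who have not used ddlib directly, here is a self-contained toy run of the same API the examples above rely on (assuming ddlib is importable; the exact generic feature set emitted depends on the ddlib version):

import ddlib

# Toy three-token sentence: "Alice married Bob".
tokens     = ["Alice", "married", "Bob"]
lemmas     = ["alice", "marry", "bob"]
pos_tags   = ["NNP", "VBD", "NNP"]
ner_tags   = ["PERSON", "O", "PERSON"]
dep_par    = [1, -1, 1]                  # already 0-based, with -1 as ROOT (DDLIB convention)
dep_labels = ["nsubj", "root", "dobj"]

sent = [ddlib.Word(begin_char_offset=None, end_char_offset=None,
                   word=t, lemma=lemmas[i], pos=pos_tags[i], ner=ner_tags[i],
                   dep_par=dep_par[i], dep_label=dep_labels[i])
        for i, t in enumerate(tokens)]

span1 = ddlib.Span(begin_word_id=0, length=1)   # "Alice"
span2 = ddlib.Span(begin_word_id=2, length=1)   # "Bob"

for feature in ddlib.get_generic_features_relation(sent, span1, span2):
    print(feature)   # dependency-path, word-sequence, NER-based features, etc.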
Example #10
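  # Fragment: body of a loop over TSV rows read from stdin. `parts` (the split row),
  # `words`, `lemmas`, `poses`, `ners`, `dependencies`, and `relation_id` are
  # parsed from the current row above (not shown in this excerpt).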
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]]

  # Get a sentence from ddlib -- array of "Word" objects
  if len(dependencies) == 0:
    print(str(relation_id) + '\t' + 'DEP_PATH_EMPTY', file=sys.stderr)
    continue

  try:
    sentence = ddlib.get_sentence(
        [0, ] * len(words),  [0, ] * len(words), words, lemmas, poses,
        dependencies, ners)
  except Exception:
    # ddlib could not parse the dependency list; log it and skip this candidate
    print(dependencies, file=sys.stderr)
    continue

  # Create two spans of person mentions
  span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
  span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

  # Collect the features generated for this candidate pair
  features = set()

  # Get generic features generated by ddlib
  for feature in ddlib.get_generic_features_relation(sentence, span1, span2):
    features.add(feature)
    # TODO: clean LENGTH features?
    # if not 'LENGTH' in feature:
    #   features.add(feature)
  for feature in features:
    print(str(relation_id) + '\t' + feature)