def get_features_for_candidate(row):
    """Return Feature tuples for a gene-pheno candidate: the ddlib generic
    relation features (filtered through take_feature) plus our custom ones."""
    proto = Feature(doc_id=row.doc_id, section_id=row.section_id,
                    relation_id=row.relation_id, name=None)
    sentence = util.create_ddlib_sentence(row)
    # (1) GENERIC FEATURES from ddlib
    g_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                        length=len(row.gene_wordidxs))
    p_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                        length=len(row.pheno_wordidxs))
    feats = [proto._replace(name=name)
             for name in ddlib.get_generic_features_relation(sentence, g_span, p_span)
             if take_feature(name)]
    # (2) custom features
    for name in get_custom_features(row, sentence):
        feats.append(proto._replace(name=name))
    # NOTE(review): two earlier experiments stay disabled — START_SENT_*
    # mention features over the sentence head (seemed to hurt), and
    # create_ners_between features (traded a little precision for a
    # little recall).
    return feats
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents,
        wordidxs, relation_id, wordidxs_1, wordidxs_2):
    """Yield (doc_id, relation_id, feature) rows for one gene-pheno candidate.

    Builds the column-oriented sentence dict that ddlib.unpack_words expects,
    then emits the de-duplicated generic relation features for the two spans.
    """
    try:
        import ddlib
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and unrelated errors raised while importing; only a failed import
    # should trigger the DEEPDIVE_HOME fallback.
    except ImportError:
        import os
        DD_HOME = os.environ['DEEPDIVE_HOME']
        from sys import path
        path.append('%s/ddlib' % DD_HOME)
        import ddlib
    obj = dict()
    obj['lemma'] = []
    obj['words'] = []
    obj['ner'] = []
    obj['pos'] = []
    obj['dep_graph'] = []
    for i, word in enumerate(words):
        obj['lemma'].append(lemmas[i])
        obj['words'].append(word)
        obj['ner'].append(ners[i])
        obj['pos'].append(poses[i])
        # ddlib wants "parent\tlabel\tchild" triples, one per token
        obj['dep_graph'].append(
            str(int(dep_parents[i])) + "\t" + dep_paths[i] + "\t" + str(i))
    word_obj_list = ddlib.unpack_words(
        obj, lemma='lemma', pos='pos', ner='ner', words='words',
        dep_graph='dep_graph')
    gene_span = ddlib.get_span(wordidxs_1[0], len(wordidxs_1))
    pheno_span = ddlib.get_span(wordidxs_2[0], len(wordidxs_2))
    # collect into a set first so duplicate features are emitted once
    features = set()
    for feature in ddlib.get_generic_features_relation(word_obj_list,
                                                       gene_span, pheno_span):
        features.add(feature)
    for feature in features:
        yield doc_id, relation_id, feature
def extract(
        p_id="text",
        e_id="text",
        p_begin_index="int",
        p_end_index="int",
        e_begin_index="int",
        e_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the spouse relation, with two
    keyword dictionaries (employment / no-employment) loaded first.
    """
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(os.path.abspath("../../../job_no_employ_keyword.txt"),
                          dict_id="no_employment")
    # The ddlib "sentence" is just a list of ddlib.Word objects, one per token
    sent = [
        ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=tok,
            lemma=lemmas[idx],
            pos=pos_tags[idx],
            ner=ner_tags[idx],
            # CoreNLP stores 0 for ROOT, but ddlib expects -1 for ROOT
            dep_par=dep_parents[idx] - 1,
            dep_label=dep_types[idx])
        for idx, tok in enumerate(tokens)
    ]
    # Spans for the two mentions; end indices are inclusive
    span_p = ddlib.Span(begin_word_id=p_begin_index,
                        length=p_end_index - p_begin_index + 1)
    span_e = ddlib.Span(begin_word_id=e_begin_index,
                        length=e_end_index - e_begin_index + 1)
    for feature in ddlib.get_generic_features_relation(sent, span_p, span_e):
        yield [p_id, e_id, feature]
def get_features_for_candidate(row):
    """Return Feature tuples (ddlib generic relation features) for a
    genevar-pheno candidate mention pair."""
    proto = Feature(doc_id=row.doc_id, section_id=row.section_id,
                    relation_id=row.relation_id, name=None)
    sentence = util.create_ddlib_sentence(row)
    # (1) GENERIC FEATURES from ddlib
    gv_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                         length=len(row.genevar_wordidxs))
    ph_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                         length=len(row.pheno_wordidxs))
    feats = []
    for name in ddlib.get_generic_features_relation(sentence, gv_span, ph_span):
        feats.append(proto._replace(name=name))
    return feats
def extract(
        chemical_id="text",
        disease_id="text",
        chemical_begin_index="int",
        chemical_end_index="int",
        disease_begin_index="int",
        disease_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        my_ner_tags="text[]",
        my_ner_tags_token_ids="int[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the chemical-disease relation
    candidates. Custom NER tags (a sparse array keyed by token index)
    take precedence over the CoreNLP ones.
    """
    # sparse token-index -> tag mapping from the two parallel arrays
    override = dict(zip(my_ner_tags_token_ids, my_ner_tags))
    sent = []
    for idx, tok in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=tok,
            lemma=lemmas[idx],
            pos=pos_tags[idx],
            # prefer the custom NER tag when one exists for this token
            ner=override.get(idx, ner_tags[idx]),
            # CoreNLP stores 0 for ROOT, but ddlib expects -1 for ROOT
            dep_par=dep_parents[idx] - 1,
            dep_label=dep_types[idx]))
    # Spans for the two mentions; end indices are inclusive
    chem_span = ddlib.Span(begin_word_id=chemical_begin_index,
                           length=chemical_end_index - chemical_begin_index + 1)
    dis_span = ddlib.Span(begin_word_id=disease_begin_index,
                          length=disease_end_index - disease_begin_index + 1)
    for feature in ddlib.get_generic_features_relation(sent, chem_span, dis_span):
        yield [chemical_id, disease_id, feature]
def extract(
        gene_id="text",
        variation_id="text",
        gene_begin_index="int",
        gene_end_index="int",
        var_begin_index="int",
        var_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the gene-variation relation.

    Yields [gene_id, variation_id, feature] rows.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))
    # Create DDLIB Spans for the gene and variation mentions.
    # BUG FIX: every other extractor in this file treats *_end_index as
    # inclusive (length = end - begin + 1); the missing +1 here dropped the
    # last token of each mention from its span. (If this extractor's end
    # indices were instead exclusive, revert — confirm against the caller.)
    gene_span = ddlib.Span(begin_word_id=gene_begin_index,
                           length=gene_end_index - gene_begin_index + 1)
    variation_span = ddlib.Span(begin_word_id=var_begin_index,
                                length=var_end_index - var_begin_index + 1)
    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, gene_span,
                                                       variation_span):
        yield [gene_id, variation_id, feature]
def extract(
        p1_id="text",
        p2_id="text",
        p1_begin_index="int",
        p1_end_index="int",
        p2_begin_index="int",
        p2_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the relation of MED and ARD.
    """
    # The ddlib "sentence" is just a list of ddlib.Word objects, one per token
    sent = [
        ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=tok,
            lemma=lemmas[idx],
            pos=pos_tags[idx],
            ner=ner_tags[idx],
            # CoreNLP stores 0 for ROOT, but ddlib expects -1 for ROOT
            dep_par=dep_parents[idx] - 1,
            dep_label=dep_types[idx])
        for idx, tok in enumerate(tokens)
    ]
    # Spans for the two mentions; end indices are inclusive
    span_a = ddlib.Span(begin_word_id=p1_begin_index,
                        length=p1_end_index - p1_begin_index + 1)
    span_b = ddlib.Span(begin_word_id=p2_begin_index,
                        length=p2_end_index - p2_begin_index + 1)
    for feature in ddlib.get_generic_features_relation(sent, span_a, span_b):
        yield [p1_id, p2_id, feature]
def extract(S_id="text",
            O_id="text",
            S_begin_index="int",
            S_end_index="int",
            O_begin_index="int",
            O_end_index="int",
            sent_id="text",
            tokens="text[]",
            pos_tags="text[]",
            ner_tags="text[]",
            dep_types="text[]",
            dep_tokens="int[]"):
    """
    Uses DDLIB to generate features for relation.

    Yields [S_id, O_id, feature] rows for the subject/object candidate.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    if len(tokens) != len(pos_tags):
        # parallel arrays out of sync -- log it (Word() below would then
        # index past the shorter array)
        print >> sys.stderr, '===>>>', sent_id, len(tokens), len(pos_tags)
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],  # no lemma column; reuse the surface token
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_tokens[i] - 1,
                dep_label=dep_types[i]))
    # Create DDLIB Spans for the two mentions.
    # BUG FIX: lengths were computed as begin - end + 1, which is zero or
    # negative for any multi-token mention; with inclusive end indices the
    # length is end - begin + 1, matching the other extractors in this file.
    S_span = ddlib.Span(begin_word_id=S_begin_index,
                        length=(S_end_index - S_begin_index + 1))
    O_span = ddlib.Span(begin_word_id=O_begin_index,
                        length=(O_end_index - O_begin_index + 1))
    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, S_span, O_span):
        yield [S_id, O_id, feature]
def extract(
        p1_id="text",
        p2_id="text",
        p1_begin_index="int",
        p1_end_index="int",
        p2_begin_index="int",
        p2_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the spouse relation.
    """
    # One ddlib.Word per token makes up the ddlib "sentence"
    words = []
    for position, surface in enumerate(tokens):
        words.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=surface,
            lemma=lemmas[position],
            pos=pos_tags[position],
            ner=ner_tags[position],
            # CoreNLP stores 0 for ROOT, but ddlib expects -1 for ROOT
            dep_par=dep_parents[position] - 1,
            dep_label=dep_types[position]))
    # Spans for the two person mentions; end indices are inclusive
    first_span = ddlib.Span(begin_word_id=p1_begin_index,
                            length=p1_end_index - p1_begin_index + 1)
    second_span = ddlib.Span(begin_word_id=p2_begin_index,
                             length=p2_end_index - p2_begin_index + 1)
    for feature in ddlib.get_generic_features_relation(words, first_span,
                                                       second_span):
        yield [p1_id, p2_id, feature]
def extract(
        S_id="text",
        O_id="text",
        S_begin_index="int",
        S_end_index="int",
        O_begin_index="int",
        O_end_index="int",
        sent_id="text",
        tokens="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_tokens="int[]"
):
    """
    Uses DDLIB to generate features for relation.

    Yields [S_id, O_id, feature] rows for the subject/object candidate.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    if len(tokens) != len(pos_tags):
        # parallel arrays out of sync -- log it before building Words
        print >>sys.stderr, '===>>>', sent_id, len(tokens), len(pos_tags)
    for i, t in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=t,
            lemma=tokens[i],  # no lemma column; reuse the surface token
            pos=pos_tags[i],
            ner=ner_tags[i],
            # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
            dep_par=dep_tokens[i] - 1,
            dep_label=dep_types[i]))
    # Create DDLIB Spans for the two mentions.
    # BUG FIX: the original computed length as begin - end + 1, which is
    # zero or negative whenever the mention spans more than one token;
    # inclusive end indices require end - begin + 1, as in the other
    # extractors in this file.
    S_span = ddlib.Span(begin_word_id=S_begin_index,
                        length=(S_end_index - S_begin_index + 1))
    O_span = ddlib.Span(begin_word_id=O_begin_index,
                        length=(O_end_index - O_begin_index + 1))
    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, S_span, O_span):
        yield [S_id, O_id, feature]
def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents,
        wordidxs, relation_id, wordidxs_1, wordidxs_2):
    """Yield (doc_id, relation_id, feature) rows for one candidate pair.

    Packs the token columns into the dict layout ddlib.unpack_words needs,
    then emits the de-duplicated generic relation features for the two spans.
    """
    try:
        import ddlib
    # BUG FIX: was a bare `except:`; only an ImportError should trigger the
    # DEEPDIVE_HOME fallback path, not arbitrary errors or interrupts.
    except ImportError:
        import os
        DD_HOME = os.environ['DEEPDIVE_HOME']
        from sys import path
        path.append('%s/ddlib' % DD_HOME)
        import ddlib
    obj = dict()
    obj['lemma'] = []
    obj['words'] = []
    obj['ner'] = []
    obj['pos'] = []
    obj['dep_graph'] = []
    for i, word in enumerate(words):
        obj['lemma'].append(lemmas[i])
        obj['words'].append(word)
        obj['ner'].append(ners[i])
        obj['pos'].append(poses[i])
        # ddlib wants "parent\tlabel\tchild" triples, one per token
        obj['dep_graph'].append(
            str(int(dep_parents[i])) + "\t" + dep_paths[i] + "\t" + str(i))
    word_obj_list = ddlib.unpack_words(obj, lemma='lemma', pos='pos',
                                       ner='ner', words='words',
                                       dep_graph='dep_graph')
    gene_span = ddlib.get_span(wordidxs_1[0], len(wordidxs_1))
    pheno_span = ddlib.get_span(wordidxs_2[0], len(wordidxs_2))
    # de-duplicate via a set before emitting
    features = set()
    for feature in ddlib.get_generic_features_relation(word_obj_list,
                                                       gene_span, pheno_span):
        features.add(feature)
    for feature in features:
        yield doc_id, relation_id, feature
p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[6:]] # Get a sentence from ddlib -- array of "Word" objects if len(dependencies) == 0: print >>sys.stderr, str(relation_id) + '\t' + 'DEP_PATH_EMPTY' continue try: sentence = ddlib.get_sentence( [0, ] * len(words), [0, ] * len(words), words, lemmas, poses, dependencies, ners) except: print >>sys.stderr, dependencies continue # Create two spans of person mentions span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length) span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length) # Features for this pair come in here features = set() # Get generic features generated by ddlib for feature in ddlib.get_generic_features_relation(sentence, span1, span2): features.add(feature) # TODO: clean LENGTH features? # if not 'LENGTH' in feature: # features.add(feature) for feature in features: print str(relation_id) + '\t' + feature