def get_features_for_candidate(row):
    """Extract features for a candidate mention: both generic ones from ddlib & custom features"""
    features = []
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                relation_id=row.relation_id, name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    gene_span = ddlib.Span(begin_word_id=row.gene_wordidxs[0],
                           length=len(row.gene_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    for feat in ddlib.get_generic_features_relation(dds, gene_span, pheno_span):
        if take_feature(feat):
            features.append(f._replace(name=feat))
    features.extend(
        [f._replace(name=feat) for feat in get_custom_features(row, dds)])

    # These seem to be hurting (?)
    # start_span = ddlib.Span(begin_word_id=0, length=4)
    # for feat in ddlib.get_generic_features_mention(dds, start_span, length_bin_size=2):
    #     features.append(f._replace(name='START_SENT_%s' % feat))

    # WITH these custom features, I get a little LESS precision and a little MORE recall (!)
    # features += [f._replace(name=feat)
    #              for feat in create_ners_between(row.gene_wordidxs, row.pheno_wordidxs, row.ners)]
    return features

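# `take_feature` and `get_custom_features` above are project helpers not defined
# in this snippet. A minimal, hypothetical sketch of the filter, assuming it
# drops generic features by a configured prefix blacklist (all names here are
# assumptions, not from the source):
FEATURE_PREFIX_BLACKLIST = ('LENGTHS_',)  # assumed configuration

def take_feature(feat):
    # Keep a generic ddlib feature unless it starts with a blacklisted prefix.
    return not feat.startswith(FEATURE_PREFIX_BLACKLIST)
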
def test_tokens_between_spans(self):
    span1 = dd.Span(0, 2)
    span2 = dd.Span(3, 5)
    words_between = dd.tokens_between_spans(self.words, span1, span2)
    self.assertEqual(words_between[:], (False, ["Jake"]))
    words_between = dd.tokens_between_spans(self.words, span2, span1)
    self.assertEqual(words_between[:], (True, ["Jake"]))
    words_between = dd.tokens_between_spans(self.words, span1, span1)
    self.assertEqual(words_between[:], (False, []))

def test_tokens_between_spans(self):
    span1 = dd.Span(0, 2)
    span2 = dd.Span(3, 5)
    words_between = dd.tokens_between_spans(self.words, span1, span2)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [False, ["Jake"]])
    words_between = dd.tokens_between_spans(self.words, span2, span1)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [True, ["Jake"]])
    words_between = dd.tokens_between_spans(self.words, span1, span1)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [False, []])

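# A minimal reference sketch consistent with the two tests above (the actual
# ddlib implementation may differ; the first field name of the returned tuple
# is an assumption, while `elements` is used by other snippets in this file):
from collections import namedtuple

Span = namedtuple('Span', ['begin_word_id', 'length'])
TokensBetween = namedtuple('TokensBetween', ['is_reversed', 'elements'])

def tokens_between_spans(words, span1, span2):
    # Return the tokens strictly between the two spans, plus a flag that is
    # True when span2 precedes span1 in the sentence.
    if span1.begin_word_id <= span2.begin_word_id:
        left, right, is_reversed = span1, span2, False
    else:
        left, right, is_reversed = span2, span1, True
    return TokensBetween(
        is_reversed, words[left.begin_word_id + left.length:right.begin_word_id])
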
def extract(
        p_id="text",
        e_id="text",
        p_begin_index="int",
        p_end_index="int",
        e_begin_index="int",
        e_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the employment relation.
    """
    ddlib.load_dictionary(os.path.abspath("../../../job_employ_keyword.txt"),
                          dict_id="has_employment")
    ddlib.load_dictionary(os.path.abspath("../../../job_no_employ_keyword.txt"),
                          dict_id="no_employment")

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p_span = ddlib.Span(begin_word_id=p_begin_index,
                        length=(p_end_index - p_begin_index + 1))
    e_span = ddlib.Span(begin_word_id=e_begin_index,
                        length=(e_end_index - e_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p_span, e_span):
        yield [p_id, e_id, feature]

def get_features_for_row(row):
    OPTS = config.PHENO_ACRONYMS['F']
    features = []
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                mention_id=row.mention_id, name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    allWordIdxs = row.short_wordidxs + row.long_wordidxs
    start = min(allWordIdxs)
    # +1 so the span includes the last word of the mention
    length = max(allWordIdxs) - start + 1
    span = ddlib.Span(begin_word_id=start, length=length)
    assert span.length > 0, row
    assert start + length <= len(row.words), (start + length, len(row.words), row)
    generic_features = [
        f._replace(name=feat)
        for feat in ddlib.get_generic_features_mention(sentence, span)
    ]

    # Optionally filter out some generic features
    if OPTS.get('exclude_generic'):
        generic_features = filter(
            lambda feat: not feat.name.startswith(tuple(OPTS['exclude_generic'])),
            generic_features)

    features += generic_features
    return features

def get_features_for_row(row):
    # OPTS = config.GENE['F']
    features = []
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                mention_id=row.mention_id, name=None)

    # (1) Get generic ddlib features
    sentence = util.create_ddlib_sentence(row)
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    generic_features = [
        f._replace(name=feat)
        for feat in ddlib.get_generic_features_mention(sentence, span)
    ]
    features += generic_features
    features += [f._replace(name=feat) for feat in get_custom_features(row)]

    # (2) Include gene type as a feature
    # Note: including this as a feature creates massive overfitting, for obvious reasons.
    # We need neg supervision of canonical & noncanonical symbols, then can / should try adding this feature.
    """
    for t in ENSEMBL_TYPES:
        if re.search(re.escape(t), row.mention_type, flags=re.I):
            features.append(f._replace(name='GENE_TYPE[%s]' % t))
            break
    """
    return features

def extract(
        organization_id="text",
        begin_index="int",
        end_index="int",
        doc_id="text",
        sentence_index="int",
        tokens="text[]",
        pos_tags="text[]",
        dep_types="text[]",
        dep_heads="int[]",
):
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],  # no lemmas available; fall back to the raw token
                pos=pos_tags[i],
                ner=None,
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_heads[i] - 1,
                dep_label=dep_types[i]))

    org_span = ddlib.Span(begin_word_id=begin_index,
                          length=(end_index - begin_index + 1))
    for feature in ddlib.get_generic_features_mention(sent, org_span):
        yield [organization_id, feature]

def extract(
        mention_id="text",
        doc_begin_index="int",
        doc_end_index="int",
        doc_id="text",
        position="text",
        sentence_index="int",
        tokens="text[]",
        pos_tags="text[]",
):
    # Load keyword dictionaries using ddlib, for domain-specific features.
    # Words in the "crime" dictionary are indicative of a crime;
    # words in the "non_crime" dictionary are indicative of a non-crime.
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_crime.txt", dict_id="crime")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_crime.txt",
                          dict_id="non_crime")

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Get all subsequences of the left context, with WINDOW_SIZE = 10
    low_tokens = map(lambda token: token.lower(), tokens)
    left_window = get_left_window(doc_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercase
                pos=pos_tags[i],
                ner=None,
                dep_par=-1,  # no dependency parse available; -1 is ROOT for DDLIB
                dep_label=None))

    # Create the DDLIB Span for the candidate mention
    penalty_span = ddlib.Span(begin_word_id=doc_begin_index,
                              length=(doc_end_index - doc_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [mention_id, feature]

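# `get_left_window` and `get_all_phrases_in_sentence` are project helpers not
# defined in these snippets. A minimal sketch of plausible implementations
# (hypothetical; the originals may differ):
def get_left_window(begin_index, tokens, window_size):
    # The up-to-`window_size` tokens immediately preceding the mention.
    return tokens[max(0, begin_index - window_size):begin_index]

def get_all_phrases_in_sentence(tokens, max_phrase_length):
    # Every contiguous token n-gram up to `max_phrase_length`, joined with
    # spaces, so that multi-word keywords like "phạt tù" can be matched.
    for start in range(len(tokens)):
        for end in range(start + 1,
                         min(start + max_phrase_length, len(tokens)) + 1):
            yield " ".join(tokens[start:end])
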
def get_features_for_candidate(row):
    """Extract generic ddlib features for a candidate genevar-pheno relation mention"""
    features = []
    f = Feature(doc_id=row.doc_id, section_id=row.section_id,
                relation_id=row.relation_id, name=None)
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    genevar_span = ddlib.Span(begin_word_id=row.genevar_wordidxs[0],
                              length=len(row.genevar_wordidxs))
    pheno_span = ddlib.Span(begin_word_id=row.pheno_wordidxs[0],
                            length=len(row.pheno_wordidxs))
    features += [f._replace(name=feat)
                 for feat in ddlib.get_generic_features_relation(
                     dds, genevar_span, pheno_span)]
    return features

def extract(
        chemical_id="text",
        disease_id="text",
        chemical_begin_index="int",
        chemical_end_index="int",
        disease_begin_index="int",
        disease_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        my_ner_tags="text[]",
        my_ner_tags_token_ids="int[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the chemical-disease relation candidates.
    """
    # Create a dictionary of tags from the sparse my_ner_tags array
    my_ner_tags_dict = {
        i: tag for i, tag in zip(my_ner_tags_token_ids, my_ner_tags)
    }

    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                # Replace the NER tag if one is found for this token in my_ner_tags
                ner=my_ner_tags_dict[i] if i in my_ner_tags_dict else ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the chemical and disease mentions
    chemical_span = ddlib.Span(begin_word_id=chemical_begin_index,
                               length=(chemical_end_index - chemical_begin_index + 1))
    disease_span = ddlib.Span(begin_word_id=disease_begin_index,
                              length=(disease_end_index - disease_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, chemical_span, disease_span):
        yield [chemical_id, disease_id, feature]

def extract(
        gene_id="text",
        variation_id="text",
        gene_begin_index="int",
        gene_end_index="int",
        var_begin_index="int",
        var_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the gene-variation relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the gene and variation mentions
    # (end indices are treated as exclusive here, hence no +1)
    gene_span = ddlib.Span(begin_word_id=gene_begin_index,
                           length=gene_end_index - gene_begin_index)
    variation_span = ddlib.Span(begin_word_id=var_begin_index,
                                length=var_end_index - var_begin_index)

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, gene_span, variation_span):
        yield [gene_id, variation_id, feature]

def extract(
        p1_id="text",
        p2_id="text",
        p1_begin_index="int",
        p1_end_index="int",
        p2_begin_index="int",
        p2_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        lemmas="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the relation of MED and ARD.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=lemmas[i],
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the two mentions
    p1_span = ddlib.Span(begin_word_id=p1_begin_index,
                         length=(p1_end_index - p1_begin_index + 1))
    p2_span = ddlib.Span(begin_word_id=p2_begin_index,
                         length=(p2_end_index - p2_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span):
        yield [p1_id, p2_id, feature]

def extract(S_id="text",
            O_id="text",
            S_begin_index="int",
            S_end_index="int",
            O_begin_index="int",
            O_end_index="int",
            sent_id="text",
            tokens="text[]",
            pos_tags="text[]",
            ner_tags="text[]",
            dep_types="text[]",
            dep_tokens="int[]"):
    """
    Uses DDLIB to generate features for relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    if len(tokens) != len(pos_tags):
        print >> sys.stderr, '===>>>', sent_id, len(tokens), len(pos_tags)
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=tokens[i],  # no lemmas available; fall back to the raw token
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_tokens[i] - 1,
                dep_label=dep_types[i]))

    # Create DDLIB Spans for the subject and object mentions
    S_span = ddlib.Span(begin_word_id=S_begin_index,
                        length=(S_end_index - S_begin_index + 1))
    O_span = ddlib.Span(begin_word_id=O_begin_index,
                        length=(O_end_index - O_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, S_span, O_span):
        yield [S_id, O_id, feature]

def get_features_for_candidate(row):
    """Extract features for a candidate mention: both generic ones from ddlib & custom features"""
    features = []
    dds = util.create_ddlib_sentence(row)

    # (1) GENERIC FEATURES from ddlib
    span = ddlib.Span(begin_word_id=row.mention_wordidxs[0],
                      length=len(row.mention_wordidxs))
    features += [(row.doc_id, row.section_id, row.mention_id, feat)
                 for feat in ddlib.get_generic_features_mention(dds, span)]

    # (2) Add the closest verb by raw distance
    if OPTS.get('closest-verb'):
        verb_idxs = [i for i, p in enumerate(row.poses) if p.startswith("VB")]
        if len(verb_idxs) > 0:
            dists = filter(lambda d: d[0] > 0,
                           [(min([abs(i - j) for j in row.mention_wordidxs]), i)
                            for i in verb_idxs])
            if len(dists) > 0:
                verb = row.lemmas[min(dists)[1]]
                features.append((row.doc_id, row.section_id, row.mention_id,
                                 'NEAREST_VERB_[%s]' % (verb,)))
    return features

def run(doc_id, sent_id, words, lemmas, poses, ners, dep_paths, dep_parents,
        mention_id, wordidxs):
    try:
        import ddlib
    except ImportError:
        import os
        DD_HOME = os.environ['DEEPDIVE_HOME']
        from sys import path
        path.append('%s/ddlib' % DD_HOME)
        import ddlib

    def unpack_(begin_char_offsets, end_char_offsets, words, lemmas, poses,
                ners, dep_parents, dep_paths):
        wordobjs = []
        for i in range(0, len(words)):
            wordobjs.append(
                ddlib.Word(
                    begin_char_offset=None,
                    end_char_offset=None,
                    word=words[i],
                    lemma=lemmas[i],
                    pos=poses[i],
                    ner='',  # NER is noisy on medical docs
                    dep_par=dep_parents[i],
                    dep_label=dep_paths[i]))
        return wordobjs

    begin_char_offsets = None
    end_char_offsets = None
    sentence = unpack_(begin_char_offsets, end_char_offsets, words, lemmas,
                       poses, ners, dep_parents, dep_paths)
    span = ddlib.Span(begin_word_id=wordidxs[0], length=len(wordidxs))
    for feature in ddlib.get_generic_features_mention(sentence, span):
        yield doc_id, mention_id, feature

# File: udf/ext_has_spouse_features.py
# Sample input data (piped into STDIN):
'''
{"p2_length":2,"p1_length":2,"lemma":["Sen.","Barack","Obama","and","he","wife",",","Michelle","Obama",",","have","release","eight","year","of","joint","return","."],"words":["Sen.","Barack","Obama","and","his","wife",",","Michelle","Obama",",","have","released","eight","years","of","joint","returns","."],"relation_id":"118238@10_7_118238@10_1","p1_start_position":7,"p2_start_position":1}
'''
import sys, json
import ddlib  # DeepDive python utility

# For each input tuple
for row in sys.stdin:
    obj = json.loads(row)
    words = obj["words"]

    # Unpack input into tuples.
    # (The sample record carries "p1_start_position"/"p2_start_position",
    # so those keys are read here.)
    span1 = ddlib.Span(begin_word_id=obj['p1_start_position'],
                       length=obj['p1_length'])
    span2 = ddlib.Span(begin_word_id=obj['p2_start_position'],
                       length=obj['p2_length'])

    # Features for this pair come in here
    features = set()

    # Feature 1: Bag of words between the two phrases
    words_between = ddlib.tokens_between_spans(words, span1, span2)
    for word in words_between.elements:
        features.add("word_between=" + word)

    # Feature 2: Number of words between the two phrases
    features.add("num_words_between=%s" % len(words_between.elements))

    # Feature 3: Does the last word (last name) match?
    last_word_left = ddlib.materialize_span(words, span1)[-1]

import sys
import ddlib

ARR_DELIM = '~^~'

# For each input tuple
for row in sys.stdin:
    parts = row.strip().split('\t')
    if len(parts) != 6:
        print >> sys.stderr, 'Failed to parse row:', row
        continue

    # Get all fields from a row
    words = parts[0].split(ARR_DELIM)
    relation_id = parts[1]
    p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[2:]]

    # Unpack input into tuples.
    span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
    span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

    # Features for this pair come in here
    features = set()

    # Feature 1: Bag of words between the two phrases
    words_between = ddlib.tokens_between_spans(words, span1, span2)
    for word in words_between.elements:
        features.add("word_between=" + word)

    # Feature 2: Number of words between the two phrases
    features.add("num_words_between=%s" % len(words_between.elements))

    # Feature 3: Does the last word (last name) match?
    last_word_left = ddlib.materialize_span(words, span1)[-1]

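# Both spouse-feature snippets above break off right after computing
# `last_word_left`. A plausible completion of "Feature 3", based only on its
# comment (hypothetical; not from the source):
#
#     last_word_right = ddlib.materialize_span(words, span2)[-1]
#     if last_word_left == last_word_right:
#         features.add("potential_last_name_match")
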
def extract(
        p_id="text",
        p_begin_index="int",
        p_end_index="int",
        doc_id="text",
        sent_index="int",
        tokens="text[]",
        pos_tags="text[]",
        ner_tags="text[]",
        dep_types="text[]",
        dep_parents="int[]",
):
    """
    Uses DDLIB to generate features for the legal penalty mention.
    """
    # Load keyword dictionaries using ddlib, for domain-specific features.
    # Words in the "legal_penalty" dictionary are indicative of a legal penalty;
    # words in the "non_legal_penalty" dictionary are indicative of a non-penalty.
    APP_HOME = os.environ['APP_HOME']
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_legal_penalty.txt",
                          dict_id="legal_penalty")
    ddlib.load_dictionary(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt",
                          dict_id="non_legal_penalty")
    kw_non_legal_penalty = map(
        lambda word: word.strip(),
        open(APP_HOME + "/udf/dicts/kw_non_legal_penalty.txt", 'r').readlines())

    # Non-penalty signals on the left of the candidate mention
    NON_PENAL_SIGNALS_LEFT = frozenset(kw_non_legal_penalty)

    WINDOW_SIZE = 10
    MAX_PHRASE_LENGTH = 5

    # Get all subsequences of the left context, with WINDOW_SIZE = 10
    low_tokens = map(lambda token: token.lower(), tokens)
    left_window = get_left_window(p_begin_index, low_tokens, WINDOW_SIZE)
    phrases_in_sentence_left = list(
        get_all_phrases_in_sentence(left_window, MAX_PHRASE_LENGTH))

    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(
            ddlib.Word(
                begin_char_offset=None,
                end_char_offset=None,
                word=t,
                lemma=t.lower(),  # lemma for Vietnamese: lowercase
                pos=pos_tags[i],
                ner=ner_tags[i],
                # Note: as stored from CoreNLP, 0 is ROOT, but for DDLIB -1 is ROOT
                dep_par=dep_parents[i] - 1,
                dep_label=dep_types[i]))

    # Create the DDLIB Span for the penalty candidate
    penalty_span = ddlib.Span(begin_word_id=p_begin_index,
                              length=(p_end_index - p_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_mention(sent, penalty_span):
        yield [p_id, feature]

    # Keywords indicating a non-legal-penalty appear on the left
    if len(NON_PENAL_SIGNALS_LEFT.intersection(phrases_in_sentence_left)) > 0:
        yield [p_id, 'APPEAR_LEFT_KW_NON_LEGAL_PENALTY']

    # "phạt tù" ("imprisonment") appears on the left of the mention
    if "phạt tù" in phrases_in_sentence_left:
        yield [p_id, 'APPEAR_LEFT_PHAT_TU']

#! /usr/bin/env python
# File: udf/ext_has_spouse_features.py
import sys, json
import ddlib

# For each input tuple
# TODO: Sample Data and the input schema.
# sample json
for row in sys.stdin:
    # Unpack input into tuples.
    obj = json.loads(row)
    words, lemmas = obj["words"], obj["lemma"]
    span1 = ddlib.Span(begin_word_id=obj['p1.start_position'],
                       length=obj['p1.length'])
    span2 = ddlib.Span(begin_word_id=obj['p2.start_position'],
                       length=obj['p2.length'])
    features = set()

    # Feature 1: Find out if a lemma of marry occurs.
    # A better feature would ensure this is on the dependency path between the two.
    lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
    married_words = ('marry', 'widow')
    for lemma in lemma_between.elements:
        if lemma in married_words:
            features.add("important_word=%s" % lemma)

    # Feature 2: The number of words between the two phrases.
    # Intuition: if they are close by, the link may be stronger.

def test_materialize_span(self):
    span1 = dd.Span(0, 3)
    materialized_span = dd.materialize_span(self.words, span1)
    self.assertEqual(materialized_span[:], ["Tanja", "married", "Jake"])

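# The behavior this test pins down, as a minimal reference sketch (the actual
# ddlib implementation may differ):
def materialize_span(words, span):
    # The tokens covered by the span: `length` words starting at `begin_word_id`.
    return words[span.begin_word_id:span.begin_word_id + span.length]
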