# Pull the relation id, the second mention's text and the four integer
# span fields out of the already-split input row (`parts`).
relation_id = parts[1]
p2_text = parts[2]
p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[3:]]

# Unpack input into tuples.
span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

# Features for this pair come in here
features = set()

# Feature 1: Bag of words between the two phrases.
# Only words at 1-based positions strictly below
# nbWordsBetweenPeopleCompanyConsidered are kept, i.e. at most N - 1 words.
# NOTE(review): the name suggests N words were intended -- confirm whether
# this cutoff is an off-by-one before changing it.
words_between = ddlib.tokens_between_spans(words, span1, span2)
for position, word in enumerate(words_between.elements, start=1):
    if position >= nbWordsBetweenPeopleCompanyConsidered:
        break
    features.add("word_between=" + word)

# Feature 2: Number of words between the two phrases
features.add("num_words_between=%s" % len(words_between.elements))

# Feature 3: Is the last name of the founder included in the name of the
# company? (substring test of the first mention's last word in p2_text)
last_word_left = ddlib.materialize_span(words, span1)[-1]
if last_word_left in p2_text:
    features.add("potential_last_name_match")

# Emit one tab-separated "<relation_id>\t<feature>" line per feature.
# The parenthesised form is valid on both Python 2 and Python 3.
for feature in features:
    print(str(relation_id) + '\t' + feature)
# Build the two mention spans from the start/length fields of the input object.
span1 = ddlib.Span(begin_word_id=obj['p1_start'], length=obj['p1_length'])
span2 = ddlib.Span(begin_word_id=obj['p2_start'], length=obj['p2_length'])

# Features for this pair come in here
features = set()

# Feature 1: Bag of words between the two phrases
words_between = ddlib.tokens_between_spans(words, span1, span2)
features.update("word_between=" + token for token in words_between.elements)

# Feature 2: Number of words between the two phrases
between_count = len(words_between.elements)
features.add("num_words_between=%s" % between_count)

# Feature 3: Does the last word (last name) match?
last_word_left = ddlib.materialize_span(words, span1)[-1]
last_word_right = ddlib.materialize_span(words, span2)[-1]
if last_word_left == last_word_right:
    features.add("potential_last_name_match")

########################
# Improved Feature Set #
########################

# # Feature 1: Find out if a lemma of marry occurs.
# # A better feature would ensure this is on the dependency path between the two.
# words_between = ddlib.tokens_between_spans(words, span1, span2)
# lemma_between = ddlib.tokens_between_spans(obj["lemma"], span1, span2)
# married_words = ['marry', 'widow', 'wife', 'fiancee', 'spouse']
# non_married_words = ['father', 'mother', 'brother', 'sister', 'son']
# # Make sure the distance between mention pairs is not too long
def test_materialize_span(self):
    # Materialize dd.Span(0, 3) over the fixture words and compare a sliced
    # copy of the result against the expected tokens.
    expected_tokens = ["Tanja", "married", "Jake"]
    actual = dd.materialize_span(self.words, dd.Span(0, 3))
    self.assertEqual(actual[:], expected_tokens)
# Record which trigger lemmas occur between the two mentions, but only when
# at most 10 lemmas separate them.
if len(lemma_between.elements) <= 10:
    for mw in death_words:
        if mw in lemma_between.elements:
            features.add("important_word=%s" % mw)
            # TODO: window

# Feature 2: Number of words between the two phrases
# Intuition: if they are close by, the link may be stronger.
l = len(words_between.elements)
if l < 5:
    features.add("num_words_between=%d" % l)
else:
    features.add("many_words_between")

# Feature 3: number size
toll_number = ' '.join(ddlib.materialize_span(words, span2))
toll_number = toll_number.lower().replace(',', '')  # 84,887 -> 84887
number_dict = ["zero", "one", 'two', 'three', 'four', 'five', 'six',
               'seven', 'eight', 'nine', 'ten']
if toll_number in number_dict:
    # Spelled-out number: its position in number_dict is its value.
    toll_number = number_dict.index(toll_number)
    features.add("toll_num=%d" % toll_number)
else:
    # Only the int() conversion is expected to fail, so it alone is guarded.
    try:
        toll_number = int(toll_number)
    except ValueError:
        features.add("toll_not_parsed")
    else:
        # set.add() takes exactly one argument, so the feature string must be
        # built with "%" first. Passing the value as a second argument raised
        # TypeError, which the old bare except turned into "toll_not_parsed".
        if toll_number < 10:
            features.add("toll_num=%d" % toll_number)
        else:
            # Bucket larger counts by the integer part of their natural log.
            features.add("toll_num_log=%d" % int(math.log(toll_number)))
# Feature 1: flag every marriage-related lemma found between the two mentions.
lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
married_words = ('marry', 'widow')
features.update(
    "important_word=%s" % lemma
    for lemma in lemma_between.elements
    if lemma in married_words
)

# Feature 2: The number of words between the two phrases.
# Intuition: if they are close by, the link may be stronger.
#
words_between = ddlib.tokens_between_spans(words, span1, span2)
l = len(list(words_between.elements))
if l < 5:
    features.add("num_words_between=%s" % l)
else:
    features.add("many_words_between")

# Feature 3: Check if the last name matches heuristically.
#
last_word_left = list(ddlib.materialize_span(words, span1))[-1]
last_word_right = list(ddlib.materialize_span(words, span2))[-1]
if last_word_left == last_word_right:
    features.add("potential_last_name_match")

# Use this line if you want to print out all features extracted
#
#ddlib.log(features)

# Emit one JSON object per feature, in sorted order.
for feature in sorted(features):
    record = {
        "relation_id": obj["relation_id"],
        "feature": feature,
    }
    print(json.dumps(record, sort_keys=True))