def test_tokens_between_spans(self):
    """Check ``dd.tokens_between_spans`` for three orderings of two spans.

    The call is made with (span1, span2), with (span2, span1) and with
    (span1, span1). The first item of each result is expected to be
    False, True and False respectively; the second item is expected to
    hold ``["Jake"]`` for the first two calls and nothing for the third.

    ``self.words`` is presumably a token-list fixture prepared outside this
    method -- TODO confirm against the test class's setUp.
    """
    span1 = dd.Span(0, 2)
    span2 = dd.Span(3, 5)

    # The flag and the tokens are compared separately, with the tokens
    # coerced through list(), so the assertion does not depend on the
    # result being sliceable into a plain tuple or on the tokens being
    # stored in a list specifically.
    words_between = dd.tokens_between_spans(self.words, span1, span2)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [False, ["Jake"]])

    # Same spans, passed in the opposite order: the flag flips to True.
    words_between = dd.tokens_between_spans(self.words, span2, span1)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [True, ["Jake"]])

    # A span paired with itself: no tokens in between, flag stays False.
    words_between = dd.tokens_between_spans(self.words, span1, span1)
    self.assertEqual(
        [words_between[0], list(words_between[1])], [False, []])
def test_tokens_between_spans(self):
    """Check ``dd.tokens_between_spans`` for three orderings of two spans.

    Each case lists the two spans in call order together with the expected
    ``[flag, tokens]`` pair, where ``flag`` is the first item of the result
    and ``tokens`` is its second item converted to a list.
    """
    first = dd.Span(0, 2)
    second = dd.Span(3, 5)

    # Cases run in this order; the first failing case aborts the test.
    cases = (
        (first, second, [False, ["Jake"]]),
        (second, first, [True, ["Jake"]]),
        (first, first, [False, []]),
    )
    for left, right, expected in cases:
        between = dd.tokens_between_spans(self.words, left, right)
        self.assertEqual([between[0], list(between[1])], expected)
def test_tokens_between_spans(self):
    """Check ``dd.tokens_between_spans`` for three orderings of two spans.

    The result of each call is normalised to ``[first_item, list(second_item)]``
    before being compared with the expected pair.
    """
    span_a = dd.Span(0, 2)
    span_b = dd.Span(3, 5)

    def check(left, right, expected_flag, expected_tokens):
        # One call plus one assertion, exactly as in the unrolled form.
        outcome = dd.tokens_between_spans(self.words, left, right)
        self.assertEqual(
            [outcome[0], list(outcome[1])],
            [expected_flag, expected_tokens])

    # span_a then span_b: flag False, "Jake" in between.
    check(span_a, span_b, False, ["Jake"])
    # Arguments swapped: flag True, same token in between.
    check(span_b, span_a, True, ["Jake"])
    # Same span twice: flag False, nothing in between.
    check(span_a, span_a, False, [])
# Pull the individual fields out of the already-split row ``parts``:
# a delimited word list, a relation id, the second phrase's text, and four
# integer offsets describing the two spans.
words = parts[0].split(ARR_DELIM)
relation_id = parts[1]
p2_text = parts[2]
p1_start, p1_length, p2_start, p2_length = map(int, parts[3:])

# Describe both phrases as ddlib spans.
span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

# All features for this pair are accumulated here.
features = set()

# Feature 1: bag of words between the two phrases, limited by a word cap.
# NOTE(review): positions are 1-based and compared with ``<``, so at most
# nbWordsBetweenPeopleCompanyConsidered - 1 words are emitted -- confirm
# that this is the intended cap.
words_between = ddlib.tokens_between_spans(words, span1, span2)
features.update(
    "word_between=" + token
    for position, token in enumerate(words_between.elements, start=1)
    if position < nbWordsBetweenPeopleCompanyConsidered
)

# Feature 2: how many words lie between the two phrases.
num_between = len(words_between.elements)
features.add("num_words_between=%s" % num_between)

# Feature 3: does the last word of the first phrase occur anywhere inside
# the text of the second phrase?
last_word_left = ddlib.materialize_span(words, span1)[-1]
if last_word_left in p2_text:
    features.add("potential_last_name_match")
import json
import sys

import ddlib  # DeepDive python utility

# Each line on stdin is one JSON-encoded input tuple.
for row in sys.stdin:
    obj = json.loads(row)
    words = obj["words"]

    # Describe both phrases as ddlib spans.
    span1 = ddlib.Span(begin_word_id=obj["p1_start"], length=obj["p1_length"])
    span2 = ddlib.Span(begin_word_id=obj["p2_start"], length=obj["p2_length"])

    # Feature 1: bag of words between the two phrases. The feature set for
    # this pair starts out as exactly these entries.
    words_between = ddlib.tokens_between_spans(words, span1, span2)
    features = {"word_between=" + token for token in words_between.elements}

    # Feature 2: how many words lie between the two phrases.
    features.add("num_words_between=%s" % len(words_between.elements))

    # Feature 3: do both phrases end in the same word (last name)?
    last_word_left, last_word_right = (
        ddlib.materialize_span(words, span)[-1] for span in (span1, span2)
    )
    if last_word_left == last_word_right:
        features.add("potential_last_name_match")

    ########################
    # Improved Feature Set #
    ########################
# sample json
for row in sys.stdin:
    # Decode the row and pull out the token and lemma sequences plus the
    # two phrase spans.
    obj = json.loads(row)
    words = obj["words"]
    lemmas = obj["lemma"]
    span1 = ddlib.Span(
        begin_word_id=obj["p1.start_position"], length=obj["p1.length"])
    span2 = ddlib.Span(
        begin_word_id=obj["p2.start_position"], length=obj["p2.length"])

    features = set()

    # Feature 1: record every marriage-related lemma found between the
    # phrases. A better feature would require the lemma to lie on the
    # dependency path between the two.
    married_words = ('marry', 'widow')
    lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
    features.update(
        "important_word=%s" % candidate
        for candidate in lemma_between.elements
        if candidate in married_words
    )

    # Feature 2: distance between the phrases, with everything from five
    # words upwards collapsed into one bucket.
    # Intuition: if they are close by, the link may be stronger.
    words_between = ddlib.tokens_between_spans(words, span1, span2)
    num_between = len(words_between.elements)
    if num_between < 5:
        features.add("num_words_between=%s" % num_between)
    else:
        features.add("many_words_between")

    # Feature 3: check heuristically whether the last names match.
    last_word_left = ddlib.materialize_span(words, span1)[-1]