relation_id = parts[1]
  p2_text = parts[2]
  p1_start, p1_length, p2_start, p2_length = [int(x) for x in parts[3:]]

  # Unpack input into tuples.
  span1 = ddlib.Span(begin_word_id=p1_start, length=p1_length)
  span2 = ddlib.Span(begin_word_id=p2_start, length=p2_length)

  # Features for this pair come in here
  features = set()
  
  # Feature 1: Bag of words between the two phrases
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  count = 1
  for word in words_between.elements:
    if count < nbWordsBetweenPeopleCompanyConsidered:
      features.add("word_between=" + word)
    count +=1
    

  # Feature 2: Number of words between the two phrases
  features.add("num_words_between=%s" % len(words_between.elements))

  # Feature 3: Is the last name of the founder included in the name of the company?
  last_word_left = ddlib.materialize_span(words, span1)[-1]
  if (last_word_left in p2_text):
    features.add("potential_last_name_match")

  for feature in features:
    print str(relation_id) + '\t' + feature 
Example #2
0
  # Build one span per mention from the start index and length stored in `obj`.
  span1 = ddlib.Span(begin_word_id=obj['p1_start'], length=obj['p1_length'])
  span2 = ddlib.Span(begin_word_id=obj['p2_start'], length=obj['p2_length'])

  # All features for this pair are accumulated in a set (duplicates collapse).
  features = set()

  # Feature 1: one bag-of-words feature per token between the two phrases.
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  between_tokens = words_between.elements
  features.update("word_between=" + token for token in between_tokens)

  # Feature 2: how many tokens separate the two phrases.
  features.add("num_words_between=%s" % len(between_tokens))

  # Feature 3: flag pairs whose final words (heuristically, last names) are equal.
  last_word_left = ddlib.materialize_span(words, span1)[-1]
  last_word_right = ddlib.materialize_span(words, span2)[-1]
  if last_word_left == last_word_right:
    features.add("potential_last_name_match")

  ########################
  # Improved Feature Set #
  ########################

  # # Feature 1: Find out if a lemma of marry occurs.
  # # A better feature would ensure this is on the dependency path between the two.
  # words_between = ddlib.tokens_between_spans(words, span1, span2)
  # lemma_between = ddlib.tokens_between_spans(obj["lemma"], span1, span2)
  # married_words = ['marry', 'widow', 'wife', 'fiancee', 'spouse']
  # non_married_words = ['father', 'mother', 'brother', 'sister', 'son']
  # # Make sure the distance between mention pairs is not too long
Example #3
0
 def test_materialize_span(self):
   # Materializing the span built by dd.Span(0, 3) over self.words must
   # yield exactly these three tokens.
   materialized_span = dd.materialize_span(self.words, dd.Span(0, 3))
   self.assertEqual(
     materialized_span[:],
     ["Tanja", "married", "Jake"],
   )
  # Only look for trigger lemmas when at most 10 lemmas lie between the mentions.
  if len(lemma_between.elements) <= 10:
    for mw in death_words:
      if mw in lemma_between.elements:
        features.add("important_word=%s" % mw)

  # TODO: window

  # Feature 2: Number of words between the two phrases
  # Intuition: if they are close by, the link may be stronger.
  num_words = len(words_between.elements)
  if num_words < 5:
    features.add("num_words_between=%d" % num_words)
  else:
    features.add("many_words_between")

  # Feature 3: number size
  # Normalize the second mention's text: lower-case it and strip thousands
  # separators, e.g. 84,887 -> 84887.
  toll_text = ' '.join(ddlib.materialize_span(words, span2))
  toll_text = toll_text.lower().replace(',', '')
  number_words = ["zero", "one", 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
  if toll_text in number_words:
    # A spelled-out number maps to its position in the list.
    features.add("toll_num=%d" % number_words.index(toll_text))
  else:
    # Keep the try body to the parse alone so that only a genuinely
    # non-numeric string is reported as unparsed. Previously set.add() was
    # called with two arguments, and the resulting TypeError was swallowed by
    # a bare except, so every numeric toll became "toll_not_parsed".
    try:
      toll_number = int(toll_text)
    except ValueError:
      features.add("toll_not_parsed")
    else:
      if toll_number < 10:
        features.add("toll_num=%d" % toll_number)
      else:
        # Bucket larger tolls by the integer part of their natural logarithm.
        features.add("toll_num_log=%d" % int(math.log(toll_number)))
Example #5
0
 def test_materialize_span(self):
     # The span built by dd.Span(0, 3), materialized over self.words, is
     # expected to contain exactly these three tokens.
     expected_tokens = ["Tanja", "married", "Jake"]
     materialized_span = dd.materialize_span(self.words, dd.Span(0, 3))
     self.assertEqual(materialized_span[:], expected_tokens)
Example #6
0
  # Emit an "important_word" feature for every marriage-related lemma that
  # occurs between the two mentions.
  lemma_between = ddlib.tokens_between_spans(lemmas, span1, span2)
  married_words = ('marry', 'widow')
  features.update(
    "important_word=%s" % lemma
    for lemma in lemma_between.elements
    if lemma in married_words
  )

  # Feature 2: The number of words between the two phrases.
  # Intuition: if they are close by, the link may be stronger.
  # Gaps of five or more words share a single coarse feature.
  words_between = ddlib.tokens_between_spans(words, span1, span2)
  gap_size = len(list(words_between.elements))
  if gap_size < 5:
    features.add("num_words_between=%s" % gap_size)
  else:
    features.add("many_words_between")

  # Feature 3: Check if the last name matches heuristically, by comparing
  # the final word of each mention.
  last_word_left = list(ddlib.materialize_span(words, span1))[-1]
  last_word_right = list(ddlib.materialize_span(words, span2))[-1]
  if last_word_left == last_word_right:
    features.add("potential_last_name_match")

  # Use this line if you want to print out all features extracted
  #
  #ddlib.log(features)

  # Write one JSON object per feature, in sorted feature order with sorted keys.
  for feature in sorted(features):
    record = {
      "relation_id": obj["relation_id"],
      "feature": feature
    }
    print(json.dumps(record, sort_keys=True))