Esempio n. 1
0
def get_keywords_from_pos_entries(pos_entries):
  """Return the set of distinct link words in pos_entries that pass the
  keyword validity check (is_valid_keyword on the entry's page/word)."""
  return {
      entry['word']
      for entry in pos_entries
      if annotator.is_link_entry(entry)
      and is_valid_keyword(entry['page'], entry['word'])
  }
Esempio n. 2
0
def get_probs_for_sense(sense, lang, vocab, poses, training_data, stop_words):
  """Build per-sense feature probabilities from the paragraphs in
  training_data.

  Returns the sentinel True when training_data is the literal True or
  contains no paragraphs (no usable training data); otherwise returns
  the tables produced by counters_to_probs.
  """
  pos_entries_by_paragraph = training_data

  # Use `is True` (identity), not `== True`: an integer 1 should be treated
  # as real data, not as the sentinel.  Short-circuiting also keeps len()
  # from being called on the non-sequence sentinel.
  if training_data is True or len(pos_entries_by_paragraph) == 0:
    return True

  # Accumulate feature counts from every link entry in every paragraph.
  counters_for_sense = get_empty_counters_for_sense()
  for pos_entries in pos_entries_by_paragraph:
    for i, pos_entry in enumerate(pos_entries):
      if annotator.is_link_entry(pos_entry):
        update_features(pos_entries, i, counters_for_sense, stop_words)

  return counters_to_probs(counters_for_sense, vocab, poses)
Esempio n. 3
0
def is_likely_lower_sense(pos_entries_by_paragraph):
  """Return True when at least 10% of the link entries across all
  paragraphs resolve to a valid keyword; False when there are no
  link entries at all."""
  link_count = 0
  lower_count = 0
  for paragraph in pos_entries_by_paragraph:
    for entry in paragraph:
      if not annotator.is_link_entry(entry):
        continue
      link_count += 1
      if is_valid_keyword(entry['page'], entry['word']):
        lower_count += 1
  if link_count == 0:
    return False
  return (1.0 * lower_count / link_count) >= 0.1
Esempio n. 4
0
def wsd_page(pageid, title, content, lang, stop_words):
  """Run word-sense disambiguation over one page: train a per-keyword
  sense model from the cached training data, then predict and output a
  sense for every link occurrence whose word has training data."""
  cache = get_required_data_cache(pageid, title, content, lang)
  pos_entries_by_paragraph = cache['doc']
  vocab = cache['vocab']
  poses = cache['poses']
  training_data_by_keyword = cache['training_data_by_keyword']

  # Train a sense-probability model for every keyword we have data for.
  keyword_sense_probs = {}
  for keyword, training_data in training_data_by_keyword.iteritems():
    debug("Training for keyword " + keyword)
    keyword_sense_probs[keyword] = get_probs_by_sense(
        keyword, lang, vocab, poses, training_data, stop_words)

  # Predict a sense for each trained link occurrence in the document.
  for pos_entries in pos_entries_by_paragraph:
    for index, entry in enumerate(pos_entries):
      if not annotator.is_link_entry(entry):
        continue
      keyword = entry['word']
      if keyword in training_data_by_keyword:
        sense = predict_sense(
            pos_entries, index, keyword_sense_probs[keyword], stop_words)
        output_prediction(keyword, sense)
Esempio n. 5
0
def print_links_in_pos_entries(pos_entries):
  """Emit a debug line with the link text words of every link entry."""
  link_entries = (e for e in pos_entries if annotator.is_link_entry(e))
  for entry in link_entries:
    debug(entry['link_text_words'])