def get_keywords_from_pos_entries(pos_entries):
    """Collect the distinct words of link entries that pass keyword validation.

    An entry contributes its ``'word'`` only when ``annotator.is_link_entry``
    accepts it and ``is_valid_keyword(entry['page'], entry['word'])`` is
    truthy.

    Args:
        pos_entries: iterable of entry mappings; entries recognised as links
            are read through their ``'word'`` and ``'page'`` keys.

    Returns:
        A set of the accepted words (empty when nothing qualifies).
    """
    found = set()
    for item in pos_entries:
        # Non-link entries are never inspected any further.
        if not annotator.is_link_entry(item):
            continue
        candidate = item['word']
        if is_valid_keyword(item['page'], candidate):
            found.add(candidate)
    return found
def get_probs_for_sense(sense, lang, vocab, poses, training_data, stop_words):
    """Turn the training paragraphs of one sense into probabilities.

    Feature counters are accumulated with ``update_features`` for every link
    entry found in the training paragraphs and then converted with
    ``counters_to_probs``.

    Args:
        sense: not used by this function's body.
        lang: not used by this function's body.
        vocab: forwarded to ``counters_to_probs``.
        poses: forwarded to ``counters_to_probs``.
        training_data: either a value equal to ``True`` or a sized iterable
            of paragraphs, each paragraph being a sequence of entries.
        stop_words: forwarded to ``update_features``.

    Returns:
        ``True`` when ``training_data`` equals ``True`` or has length zero;
        otherwise whatever ``counters_to_probs`` returns.
    """
    # Equality (not identity) with True is deliberate here: it keeps the
    # exact set of inputs that take the short-circuit path.
    if training_data == True or not len(training_data):
        return True

    sense_counters = get_empty_counters_for_sense()
    for paragraph in training_data:
        for position, candidate in enumerate(paragraph):
            if not annotator.is_link_entry(candidate):
                continue
            update_features(paragraph, position, sense_counters, stop_words)
    return counters_to_probs(sense_counters, vocab, poses)
def is_likely_lower_sense(pos_entries_by_paragraph):
    """Tell whether at least 10% of the link entries are valid keywords.

    Every entry accepted by ``annotator.is_link_entry`` is counted; those for
    which ``is_valid_keyword(entry['page'], entry['word'])`` is truthy are
    counted separately, and the ratio of the two counts is compared to 0.1.

    Args:
        pos_entries_by_paragraph: iterable of paragraphs, each an iterable of
            entry mappings.

    Returns:
        ``False`` when there are no link entries at all; otherwise ``True``
        if the valid-keyword fraction is greater than or equal to 0.1.
    """
    link_count = 0
    valid_count = 0
    for paragraph in pos_entries_by_paragraph:
        for candidate in paragraph:
            if not annotator.is_link_entry(candidate):
                continue
            link_count += 1
            if is_valid_keyword(candidate['page'], candidate['word']):
                valid_count += 1

    # Without any link entry there is no ratio to compute.
    if not link_count:
        return False
    # The leading 1.0 forces float division regardless of interpreter version.
    return 1.0 * valid_count / link_count >= 0.1
def wsd_page(pageid, title, content, lang, stop_words):
    """Train per-keyword sense probabilities for a page and emit predictions.

    The page data comes from ``get_required_data_cache``. For each keyword in
    its ``'training_data_by_keyword'`` mapping, probabilities are obtained
    from ``get_probs_by_sense``. Then every link entry of the page document
    whose ``'word'`` has training data gets a sense from ``predict_sense``,
    which is handed to ``output_prediction``.

    Args:
        pageid: forwarded to ``get_required_data_cache``.
        title: forwarded to ``get_required_data_cache``.
        content: forwarded to ``get_required_data_cache``.
        lang: forwarded to ``get_required_data_cache`` and
            ``get_probs_by_sense``.
        stop_words: forwarded to ``get_probs_by_sense`` and ``predict_sense``.

    Returns:
        None; results are reported through ``output_prediction``.
    """
    page_data = get_required_data_cache(pageid, title, content, lang)
    paragraphs = page_data['doc']
    vocab = page_data['vocab']
    poses = page_data['poses']
    training_by_keyword = page_data['training_data_by_keyword']

    # Training phase: one probability model per keyword.
    # NOTE(review): iteritems() is the Python 2 mapping API; revisit if this
    # module is ever run on Python 3.
    probs_by_keyword = {}
    for keyword, training_data in training_by_keyword.iteritems():
        debug("Training for keyword " + keyword)
        probs_by_keyword[keyword] = get_probs_by_sense(
            keyword, lang, vocab, poses, training_data, stop_words)

    # Prediction phase: only link entries with a trained keyword are handled.
    for paragraph in paragraphs:
        for position, candidate in enumerate(paragraph):
            if not annotator.is_link_entry(candidate):
                continue
            keyword = candidate['word']
            if keyword not in training_by_keyword:
                continue
            keyword_probs = probs_by_keyword[keyword]
            predicted = predict_sense(paragraph, position, keyword_probs,
                                      stop_words)
            output_prediction(keyword, predicted)
def print_links_in_pos_entries(pos_entries):
    """Pass the ``'link_text_words'`` of every link entry to ``debug``.

    Args:
        pos_entries: iterable of entry mappings; entries accepted by
            ``annotator.is_link_entry`` are read through their
            ``'link_text_words'`` key.

    Returns:
        None.
    """
    # Lazy filter: each entry is tested right before it is reported.
    link_entries = (item for item in pos_entries
                    if annotator.is_link_entry(item))
    for link_entry in link_entries:
        debug(link_entry['link_text_words'])