def build_all_datasets():

    # Build up the cache of Wikipedia editor usernames
    if prompt_and_print.prompt_for_build_wikipedia_username_cache():
        crosssite_username_dataset_mgr.build_wikipedia_editor_username_cache()

    # Prompt to ask from which site we want to build a dataset
    try:
        site = prompt_and_print.prompt_for_site()
    except KeyError:
        print "Sorry, that is not a recognized site. Exiting."
        return

    # Build up the spreadsheet of usernames that
    # exist on both Wikipedia and the passed site
    if prompt_and_print.prompt_for_build_username_csv():
        crosssite_username_dataset_mgr.build_crosssite_username_dataset(site)

    # Get the confirmed usernames from the spreadsheet since these will
    # be the usernames from which Wikipedia edits and short texts are fetched
    crosssite_usernames = crosssite_username_dataset_mgr.get_confirmed_usernames(site)

    # Build the spreadsheet of articles that
    # these usernames have edited on Wikipedia
    if prompt_and_print.prompt_for_build_edits_csv():
        wikipedia_edits_dataset_mgr.build_wikipedia_edits_dataset(crosssite_usernames, site)

    # Build the spreadsheet of short texts that
    # these usernames have posted on the input site
    if prompt_and_print.prompt_for_build_shorttexts_csv(site):
        short_text_dataset_mgr.build_shorttexts_dataset(crosssite_usernames, site)

    # Get the short texts fetched from the given site
    shorttext_rows = short_text_dataset_mgr.get_shorttext_rows(site)

    # Cache the nouns and named entities detected by nltk,
    # which are used when validating entities later
    if prompt_and_print.prompt_for_cache_nltk_entities(site):
        nltk_extraction_dataset_mgr.extract_entities(shorttext_rows, site)

    # Build the spreadsheet of named entities that are
    # contained within these short texts on the given site
    if prompt_and_print.prompt_for_build_entity_csv(site):
        entity_dataset_mgr.build_entities_dataset(shorttext_rows, site)
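# build_all_datasets() and run_RESLVE() both treat a KeyError from
# prompt_and_print.prompt_for_site() as "unrecognized site", which suggests the
# prompt resolves the typed answer through a dict lookup. A minimal sketch of
# that behavior, assuming a hypothetical registry of supported sites (the
# function name and entries below are illustrative, not the real module's):
def _example_prompt_for_site():
    supported_sites = {'twitter': 'Twitter', 'flickr': 'Flickr'} # hypothetical entries
    response = raw_input("From which site? ").strip().lower()
    return supported_sites[response] # unrecognized input raises KeyError for the caller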
def run_RESLVE():

    # Prompt to ask from which site we want to disambiguate entities
    try:
        site = prompt_and_print.prompt_for_site()
    except KeyError:
        print "Sorry, that is not a recognized site. Exiting."
        return

    reslve_algorithms = __get_RESLVE_algorithm_constructors__()
    alg_num = raw_input("Which RESLVE algorithm do you want to run? "+\
                        "\n1=Article Content, 2=Article ID, 3=Article Title, 4=Direct Category ID, "+\
                        "5=Direct Category Title, 6=Category Graph ID, 7=Category Graph Title, 8=Article Content WSD: ")
    cache_response = raw_input("Cache resolved entities? (Y/N): ")
    # Convert the Y/N answer to a boolean; passing the raw response string along
    # would be truthy even for "N"
    cache_resolved_entities = (cache_response=='Y' or cache_response=='y')

    RESLVE_alg = reslve_algorithms[int(alg_num)]()
    resolved_entities = RESLVE_rankings_mgr.run_all_algorithms(RESLVE_alg, site, cache_resolved_entities)

    # Evaluate and compare performance
    performance.compare_ranking_precision(resolved_entities)
    performance.eval_annotator_agreement(site)
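# The menu above numbers the algorithms 1-8 and indexes straight into
# reslve_algorithms[int(alg_num)], so the constructors are presumably keyed by
# those same numbers; a 0-based list would be off by one. A hedged sketch of
# that shape, with placeholder lambdas standing in for the real RESLVE
# algorithm classes (this helper is illustrative, not part of the repo):
def _example_get_RESLVE_algorithm_constructors():
    return {1: lambda: 'Article Content',       2: lambda: 'Article ID',
            3: lambda: 'Article Title',         4: lambda: 'Direct Category ID',
            5: lambda: 'Direct Category Title', 6: lambda: 'Category Graph ID',
            7: lambda: 'Category Graph Title',  8: lambda: 'Article Content WSD'}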
            # Tally this worker's true/false judgment for the selected candidate
            if judgment=='true':
                num_true = num_true+1
            else:
                num_false = num_false+1
            selected_candidates[selected_candidate_title] = (num_true, num_false)
            judgments[judged_entity_id] = selected_candidates

            # Also record the decision under the individual worker's ID
            turkerID = row[turkerID_col]
            annotator_decisions[turkerID].append({selected_candidate_title:judgment})

            row_num = row_num+1
        except:
            continue # just ignore a problematic row

    # Cache each annotator's decisions for later inter-rater agreement calculations
    entity_dataset_mgr.save_annotator_decisions(annotator_decisions, site)

    print "Cached a total of "+str(len(judgments))+" entities judged by human Mechanical Turk annotators"
    entity_dataset_mgr.save_entity_judgements(judgments, site)
    return judgments

prompt_make_or_extract = raw_input("Make entities task for Turkers (A) or analyze completed task (B)? ")
if 'A'==prompt_make_or_extract or 'a'==prompt_make_or_extract:
    make_tweet_entities_csv_for_turk()
elif 'B'==prompt_make_or_extract or 'b'==prompt_make_or_extract:
    site = prompt_and_print.prompt_for_site()
    analyze_entity_judgments(site)
else:
    print "Unrecognized input, exiting."
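# annotator_decisions[turkerID].append(...) above assumes each worker ID already
# maps to a list, e.g. via collections.defaultdict(list). A sketch of that setup
# plus one way to consume the cached judgments structure
# ({entity_id: {candidate_title: (num_true, num_false)}}) as a majority vote;
# _example_majority_vote is illustrative, not part of the repo:
from collections import defaultdict

annotator_decisions_example = defaultdict(list) # append works without prior initialization

def _example_majority_vote(judgments):
    accepted = {}
    for entity_id, selected_candidates in judgments.iteritems():
        # Keep only the candidate titles that more annotators judged true than false
        accepted[entity_id] = [title for title, (num_true, num_false)
                               in selected_candidates.iteritems()
                               if num_true > num_false]
    return accepted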