def extract_entities(shorttext_rows, site):
    # { short text id -> (noun entities, named entities) }
    shorttext_entities = {}
    
    # nltk entity classes
    nltk_entity_types = __get_nltk_entity_types__()
    
    for shorttext_row in shorttext_rows:
        shorttext_id = shorttext_row[0]
        shorttext_str = shorttext_row[1]
        
        noun_entities = []
        named_entities = []
        
        sentences = nltk.sent_tokenize(shorttext_str)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
        for tree in chunked_sentences:
            __extract_valid_entities__(tree, (noun_entities, named_entities), nltk_entity_types)
            
        shorttext_entities[shorttext_id] = (noun_entities, named_entities)
        
    # Cache extracted entities
    pkl_util.write_pickle(__output_str__, shorttext_entities, __get_nltk_entities_cache_path__(site))
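# __extract_valid_entities__ is called above but defined elsewhere in the project.
# The sketch below shows one plausible implementation under the NLTK 2.x API used
# above (nltk.batch_ne_chunk produces Trees whose chunk label is exposed as .node).
# The _sketch suffix marks it as an illustration of the assumed behavior, not the
# project's actual helper.
import nltk

def __extract_valid_entities_sketch__(tree, entity_lists, nltk_entity_types):
    noun_entities, named_entities = entity_lists
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            if subtree.node in nltk_entity_types:
                # a chunk labeled PERSON, ORGANIZATION, GPE, etc. counts as a named entity
                named_entities.append(' '.join(word for word, tag in subtree.leaves()))
        else:
            word, tag = subtree
            if tag.startswith('NN'):
                # an unchunked noun token is kept as a plain noun entity
                noun_entities.append(word)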
def build_wikipedia_editor_username_cache():
    ''' Fetches a large number of active Wikipedia editors who have made edits 
    recently and stores their usernames in a cache, which we can access while 
    attempting to find usernames that exist on both Wikipedia and a short text 
    source site. (The occurrence of such cross-site username matches may be low, 
    so we want a large cache of Wikipedia editors to draw upon.) 
    The cache is a list of editor usernames; the mapping of 
    { username -> { edited page -> number of edits on page } } is built separately 
    by build_wikipedia_edits_dataset(). '''
    
    # Load the cache of Wikipedia editor usernames
    output_str = "Wikipedia editor usernames and their edited pages..."
    editor_usernames = pkl_util.load_pickle(output_str, __wikipedia_editors_cache_path__)
    if editor_usernames is None:
        editor_usernames = []
        
    # Prompt for how many Wikipedia usernames to fetch, then query Wikipedia until that many have been retrieved
    desired_num_editors = prompt_and_print.prompt_num_entries_to_build("active Wikipedia editors", editor_usernames)
    pre_fetch_len = len(editor_usernames)
    wikipedia_api_util.query_editors_of_recentchanges(desired_num_editors, editor_usernames)
    print "Fetched "+str(len(editor_usernames)-pre_fetch_len)+" more recent and active Wikipedia editors"
    
    # make sure all usernames are lowercase
    editor_usernames = [u.lower() for u in editor_usernames]
    
    # Update the cache
    print "Cached a total of "+str(len(editor_usernames))+" Wikipedia editor usernames"
    pkl_util.write_pickle(output_str, editor_usernames, __wikipedia_editors_cache_path__)
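# wikipedia_api_util.query_editors_of_recentchanges is defined elsewhere in the
# project; the sketch below shows the kind of MediaWiki API call it is assumed to
# make (list=recentchanges with rcprop=user), appending newly seen editor usernames
# to the passed-in list. The helper name and exact behavior here are assumptions.
import json
import urllib
import urllib2

def __query_recent_editors_sketch__(desired_num_editors, editor_usernames):
    params = {'action':'query', 'list':'recentchanges', 'rcprop':'user',
              'rcshow':'!bot|!anon', 'rclimit':500, 'format':'json'}
    url = 'http://en.wikipedia.org/w/api.php?'+urllib.urlencode(params)
    response = json.load(urllib2.urlopen(url))
    for change in response['query']['recentchanges']:
        username = change.get('user')
        if username is not None and username not in editor_usernames:
            editor_usernames.append(username)
        if len(editor_usernames) >= desired_num_editors:
            break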
def build_crosssite_username_dataset(site):
    ''' Searches the given site for the Wikipedia editor usernames we have 
    previously cached, continuing until we have a sufficient set of unique users 
    who have both active Wikipedia accounts and active accounts on the given site. 
    Saves those users in a csv file and a pkl cache, and also writes to cache the 
    usernames that are determined to NOT exist on the given site so we don't 
    bother searching for them again in the future.
    @param site: Should be a Site object '''
    siteNameStr = str(site.siteName)
    
    # Load or create/initialize the spreadsheet of usernames
    usernames_csv_path = __get_usernames_csv_path__(site)
    csv_string = 'usernames that exist on both Wikipedia and '+siteNameStr
    headers = [COLUMN_USERNAME, __COLUMN_SAME_INDIVIDUAL__]
    usernames_in_csv = csv_util.load_or_initialize_csv(usernames_csv_path, csv_string, headers, COLUMN_USERNAME)
    
    # Load the caches of Wikipedia usernames:
    editor_names_cache = pkl_util.load_pickle("Wikipedia editor usernames", __wikipedia_editors_cache_path__)
    editor_usernames = [] if (editor_names_cache is None) else editor_names_cache
    
    # editor usernames that do NOT exist on the given site
    nonexistent_usernames_path = __get_nonexistent_usernames_cache_path__(site)
    nonexistent_usernames_cache = pkl_util.load_pickle("Wikipedia usernames that do NOT exist on "+siteNameStr+"...", nonexistent_usernames_path)
    if nonexistent_usernames_cache is None:
        nonexistent_usernames_cache = []
        
    # only need to analyze those usernames that we haven't 
    # already determined do or do not exist on the given site
    usernames_todo = __get_remaining_todo__(editor_usernames, [usernames_in_csv, nonexistent_usernames_cache])
    
    # Prompt for how many matching usernames to fetch from the given site
    desired_num_usernames = prompt_and_print.prompt_num_entries_to_build(csv_string, usernames_in_csv)
    num_to_append = desired_num_usernames - len(usernames_in_csv)
    if len(usernames_todo) < num_to_append:
        print "Only "+str(len(usernames_todo))+" unanalyzed Wikipedia usernames in cache. If you "+\
        "want "+str(desired_num_usernames)+" total in the cross-site usernames csv, you'll have to "+\
        "re-run the script and choose to first fetch more Wikipedia editor usernames."
    prompt_count = 0
    while len(usernames_in_csv) < desired_num_usernames and len(usernames_todo) > 0:
        
        # Intermittently prompt user whether to continue fetching matching usernames or exit script
        if prompt_count >= __PROMPT_COUNT__:
            continue_searching = prompt_and_print.prompt_continue_building(csv_string, usernames_in_csv, desired_num_usernames)
            if not continue_searching:
                break
            prompt_count = 0 # reset count
        prompt_count = prompt_count + 1
        
        # get lists of usernames that do or do not also exist on the given site
        match_response = site.fetching_existence_status(usernames_todo, desired_num_usernames)
        existing = match_response[site.get_existing_response_key()]
        nonexisting = match_response[site.get_nonexisting_response_key()]
        print "Found "+str(len(existing))+" existing and active usernames on "+siteNameStr
        
        # update the spreadsheet with any new usernames that have been fetched
        existing_rows = [[username, __VALUE_UNCONFIRMED__] for username in existing]
        csv_util.append_to_spreadsheet(csv_string, usernames_csv_path, usernames_in_csv, existing_rows)
        # and update the list of usernames in the csv so we know how 
        # many more we still need to fetch to reach the desired num
        usernames_in_csv.extend(existing)
        
        # Also update the cache of Wikipedia usernames that do NOT exist on the given site
        nonexistent_usernames_cache.extend(nonexisting)
        nonexistent_write_str = "usernames that DO NOT exist on both Wikipedia and "+siteNameStr+"..."
        pkl_util.write_pickle(nonexistent_write_str, nonexistent_usernames_cache, nonexistent_usernames_path)
        
        # remove any usernames that we now determined do not exist on the given site
        usernames_todo = __get_remaining_todo__(usernames_todo, [existing, nonexistent_usernames_cache])
        
        rate_limited = match_response[site.get_rate_limit_key()]
        if rate_limited:
            break # reached rate limit, so break
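# __get_remaining_todo__ is used above but defined elsewhere; the minimal sketch
# below captures the assumed behavior: filter out every candidate username that
# already appears in any of the "already handled" lists. The _sketch suffix marks
# it as an illustration rather than the project's actual helper.
def __get_remaining_todo_sketch__(candidates, already_handled_lists):
    handled = set()
    for handled_list in already_handled_lists:
        handled.update(handled_list)
    # preserve the original ordering of the remaining candidates
    return [username for username in candidates if username not in handled]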
def build_wikipedia_edits_dataset(crosssite_usernames, prompt=True):
    ''' Fetches the Wikipedia edits made by the given cross-site usernames and 
    stores them in a csv of edit rows as well as in a cache that maps 
    { username -> { edited page -> num edits } }.
    @param crosssite_usernames: usernames confirmed to exist on both Wikipedia 
    and a short text source site
    @param prompt: whether to ask how many editors' edits to fetch '''
    
    # Load or create/initialize the spreadsheet of users' wikipedia edits
    csv_string = 'Wikipedia edits made by usernames that also exist on a site that is a source of short texts'
    headers = [COLUMN_USERNAME, __COLUMN_ARTICLE_ID__, __COLUMN_NUM_EDITS__]
    usernames_in_csv = csv_util.load_or_initialize_csv(__edits_csv_path__, csv_string, headers, COLUMN_USERNAME)
    
    # Load the cache of edits, a dict: { username -> { edited page -> num edits } }
    editor_names_to_edits_cache = pkl_util.load_pickle("Wikipedia editor usernames to their edited pages+counts", __edits_cache_path__)
    if editor_names_to_edits_cache is None:
        editor_names_to_edits_cache = {}
        
    # only need to fetch the edits for usernames that we haven't already done
    editors_todo = [u for u in crosssite_usernames if u not in usernames_in_csv]
    
    # Exit if all available names are done
    if len(editors_todo)==0:
        print "Wikipedia edit data fetched and stored for all "+\
        str(len(crosssite_usernames))+" confirmed cross-site editors. Exiting."
        return
    print str(len(crosssite_usernames))+" cross-site editors total, and "+\
    str(len(editors_todo))+" editors not yet in spreadsheet of edits "
    
    # Prompt for how many users to fetch edits for
    if prompt:
        desired_num_editors = prompt_and_print.prompt_num_entries_to_build(csv_string, usernames_in_csv)
        num_to_append = desired_num_editors - len(usernames_in_csv)
        if len(editors_todo) < num_to_append:
            print "Only "+str(len(editors_todo))+" cross-site usernames available. If you want "+\
            str(desired_num_editors)+" total editors' edits in the edits csv, you'll have to "+\
            "re-run the script and choose to first fetch more cross-site usernames."
    else:
        desired_num_editors = 1
        
    edits_rows = []
    #prompt_count = 0
    progress_count = 1
    for username in editors_todo:
        
        if len(usernames_in_csv) >= desired_num_editors:
            # have enough so exit
            break
        
        ''' 
        # Intermittently prompt user whether to continue fetching matching usernames or exit script
        if prompt and prompt_count >= __PROMPT_COUNT__:
            continue_searching = prompt_and_print.prompt_continue_building(csv_string, usernames_in_csv, desired_num_editors)
            if not continue_searching:
                break
            prompt_count = 0 # reset count
        prompt_count = prompt_count + 1
        '''
        
        if progress_count%10==0:
            print "Querying for pages edited by cross-site usernames... Number of "+\
            "usernames whose edits have been fetched so far: "+str(progress_count)
        progress_count = progress_count+1
        
        user_edits = wikipedia_api_util.query_usercontribs(username, False)
        for article_id in user_edits:
            num_times_edited = user_edits[article_id]
            edits_row = [username, article_id, num_times_edited]
            edits_rows.append(edits_row)
            
        # keep track that we'll be adding this username to the csv
        usernames_in_csv.append(username)
        editor_names_to_edits_cache[username] = user_edits # add that user+edits to cache
        
    # update the spreadsheet with any new editors' edits that have been fetched
    csv_util.append_to_spreadsheet(csv_string, __edits_csv_path__, usernames_in_csv, edits_rows, False)
    
    # update the edit mapping cache
    pkl_util.write_pickle("user edits to file...", editor_names_to_edits_cache, __edits_cache_path__)
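# wikipedia_api_util.query_usercontribs is defined elsewhere in the project; the
# sketch below shows the kind of MediaWiki API call it is assumed to make
# (list=usercontribs), tallying how many of the user's recent contributions touch
# each page id. The helper name, the meaning of the second (boolean) argument in
# the call above, and the exact return format are assumptions.
import json
import urllib
import urllib2

def __query_usercontribs_sketch__(username):
    params = {'action':'query', 'list':'usercontribs', 'ucuser':username,
              'ucprop':'ids', 'uclimit':500, 'format':'json'}
    url = 'http://en.wikipedia.org/w/api.php?'+urllib.urlencode(params)
    response = json.load(urllib2.urlopen(url))
    user_edits = {} # { edited page id -> number of edits on that page }
    for contrib in response['query']['usercontribs']:
        page_id = contrib['pageid']
        user_edits[page_id] = user_edits.get(page_id, 0) + 1
    return user_edits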
def __save_resolved_entities__(resolved_entities, site, use_cache):
    if use_cache:
        pkl_util.write_pickle(__resolved_entities_output_str__, resolved_entities, __get_resolved_entities_cache_path__(site))
def save_annotator_decisions(annotator_decisions, site):
    pkl_util.write_pickle(__annotator_output_str__, annotator_decisions, __get_annotator_cache_path__(site))
def save_entity_judgements(judgments, site):
    pkl_util.write_pickle(__candidate_judgments_output_str__, judgments, __get_candidate_judgments_cache_path__(site))
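# pkl_util is the project's pickle-cache wrapper used throughout this module; the
# sketch below shows the assumed shape of its two helpers (print a status string,
# then read or write the pickle at the given path, returning None when no cache
# exists yet). The actual module may differ, so treat this as an illustration only.
import os
import pickle

def __load_pickle_sketch__(output_str, cache_path):
    if not os.path.exists(cache_path):
        print "No cache yet of "+output_str
        return None
    print "Loading cache of "+output_str
    cache_file = open(cache_path, 'rb')
    try:
        return pickle.load(cache_file)
    finally:
        cache_file.close()

def __write_pickle_sketch__(output_str, data, cache_path):
    print "Writing cache of "+output_str
    cache_file = open(cache_path, 'wb')
    try:
        pickle.dump(data, cache_file)
    finally:
        cache_file.close()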
def build_entities_dataset(shorttext_rows, site):
    siteNameStr = str(site.siteName)
    
    # Load or create/initialize the spreadsheet of users' short texts
    entity_csv_path = __get_entities_csv_path__(site)
    output_str = __get_detected_entities_output_str__(site)
    headers = [COLUMN_ENTITY_ID, __COLUMN_ENTITY_STRING__, COLUMN_SHORTTEXT_ID, COLUMN_SHORTTEXT_STRING, COLUMN_USERNAME]
    entities_in_csv = csv_util.load_or_initialize_csv(entity_csv_path, output_str, headers, COLUMN_ENTITY_ID)
    shorttexts_in_csv = csv_util.get_all_column_values(entity_csv_path, COLUMN_SHORTTEXT_ID)
    print "A total of "+str(len(shorttext_rows))+" short texts available to detect and resolve entities in..."
    
    # Load the cache of ambiguous entity objects
    ne_objs = pkl_util.load_pickle(output_str, __get_ne_cache_path__(site))
    if ne_objs is None:
        ne_objs = []
        
    # Load the cache of short texts that contain no entities 
    # and that we don't need to keep querying services with
    entityless_shorttexts = get_entityless_shorttexts(site)
    
    # Load the cache of problematic short texts that we can 
    # go back and look at later
    problematic_shorttexts = get_problematic_shorttexts(site)
    
    # Prompt for how many entities to detect and store
    desired_num_entities = prompt_and_print.prompt_num_entries_to_build(output_str, shorttexts_in_csv)
    
    entities_rows = []
    progress_count = 1
    all_shorttexts_done = True
    for shorttext_row in shorttext_rows:
        shorttext_id = shorttext_row[0]
        if shorttext_id in shorttexts_in_csv or shorttext_id in entityless_shorttexts or shorttext_id in problematic_shorttexts:
            # already did entities for this shorttext (and either successfully 
            # detected some, successfully detected none, or encountered an error)
            continue
        
        all_shorttexts_done = False
        try:
            if len(entities_in_csv) >= desired_num_entities:
                # have enough so exit
                break
            
            if progress_count%10==0:
                print "Detecting named entities in short texts posted on "+siteNameStr+\
                " by cross-site usernames... Number of short texts whose entities have been fetched so far: \n"+\
                str(len(entities_in_csv))
            progress_count = progress_count+1
            
            original_shorttext = shorttext_row[1]
            username = shorttext_row[2]
            
            # get the entities contained in each short text;
            # clean the short text before attempting to detect entities in it
            clean_shorttext = text_util.format_text_for_NER(original_shorttext, site)
            if clean_shorttext=='':
                # whole string was invalid, perhaps a URL or 
                # some other content that gets totally filtered
                problematic_shorttexts.append(shorttext_id)
                continue
            
            detected_entities = named_entity_finder.find_and_construct_named_entities(shorttext_id, original_shorttext, username, site)
            if len(detected_entities)==0:
                entityless_shorttexts.append(shorttext_id)
                
            for ne_obj in detected_entities:
                # cache this entity object
                ne_objs.append(ne_obj)
                
                # make a row in the spreadsheet for this entity
                ne_id = ne_obj.get_entity_id()
                entity_row = [ne_id, ne_obj.surface_form, shorttext_id, original_shorttext, username]
                entities_rows.append(entity_row)
                
                # keep track that we'll be adding this entity to the csv
                entities_in_csv.append(ne_id)
        except Exception as st_e:
            print "Problematic short text "+str(shorttext_row[1]), st_e
            if 'referenced before assignment' in str(st_e):
                raise # it's a server error so we need to stop
            problematic_shorttexts.append(shorttext_id)
            continue
        
    # update the spreadsheet with any new users' short texts that have been fetched
    csv_util.append_to_spreadsheet(output_str, entity_csv_path, entities_in_csv, entities_rows, False)
    
    # update the cache of ambiguous surface form objects
    pkl_util.write_pickle(output_str, ne_objs, __get_ne_cache_path__(site))
    pkl_util.write_pickle(__entityless_output_str__, entityless_shorttexts, __get_entityless_cache_path__(site))
    pkl_util.write_pickle(__problematic_output_str__, problematic_shorttexts, __get_problematic_cache_path__(site))
    print "Cached a total of "+str(len(ne_objs))+" ambiguous named entities"
    
    if all_shorttexts_done:
        print "Completed detecting and resolving entities in all short texts available."
    else:
        print "More short texts available to detect and resolve entities for."
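# The builders above form a pipeline: cache active Wikipedia editors, find which
# of them also exist on the short text site, fetch their Wikipedia edits, then
# detect entities in their short texts. The driver below is a hypothetical sketch
# of that order; the way short text rows (id, text, username) are obtained from
# the site is project-specific and only stubbed here.
def __run_dataset_pipeline_sketch__(site):
    # 1. cache a pool of recently active Wikipedia editor usernames
    build_wikipedia_editor_username_cache()
    
    # 2. find which of those usernames also exist on the given site
    build_crosssite_username_dataset(site)
    crosssite_usernames = csv_util.get_all_column_values(__get_usernames_csv_path__(site), COLUMN_USERNAME)
    
    # 3. fetch the Wikipedia edits made by the cross-site users
    build_wikipedia_edits_dataset(crosssite_usernames)
    
    # 4. detect and cache named entities in those users' short texts
    shorttext_rows = [] # in practice: rows of (short text id, short text string, username) fetched from the site
    build_entities_dataset(shorttext_rows, site)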