import json
import os
from functools import partial
from multiprocessing import Pool

import numpy as np
import scipy.sparse as spsp

# Project-local helpers (load_pickle, store_pickle, get_tokenizer, chunks, etc.)
# are assumed to be imported from elsewhere in the package.


def worker_function(file_name_list, source_folder, target_folder):
    # NOTE: this variant is shadowed by the redefinition below, which is the one
    # make_annotation actually dispatches to the worker pool.
    source_path_list = (source_folder + "/" + file_name for file_name in file_name_list)
    target_path_list = (target_folder + "/" + file_name[:-4] + ".json" for file_name in file_name_list)

    sent_tokenize, _treebank_word_tokenize = get_tokenizer()
    tagger = get_braupt_tagger()
    lemmatizer, lemmatize = get_lemmatizer("wordnet")
    stopset = get_stopset()
    first_cap_re, all_cap_re = get_camel_case_regexes()
    digits_punctuation_whitespace_re = get_digits_punctuation_whitespace_regex()
    pos_set = get_pos_set()

    # Process the Twitter lists of each user; iterating the two generators with
    # zip keeps source and target paths aligned even when a corpus without a
    # "lists" key is skipped.
    for source_path, target_path in zip(source_path_list, target_path_list):
        twitter_lists_corpus = load_pickle(source_path)
        if "lists" in twitter_lists_corpus.keys():
            twitter_lists_corpus = twitter_lists_corpus["lists"]
        else:
            continue

        bag_of_lemmas, lemma_to_keywordbag = user_twitter_list_bag_of_words(
            twitter_lists_corpus,
            sent_tokenize,
            _treebank_word_tokenize,
            tagger,
            lemmatizer,
            lemmatize,
            stopset,
            first_cap_re,
            all_cap_re,
            digits_punctuation_whitespace_re,
            pos_set,
        )

        user_annotation = dict()
        user_annotation["bag_of_lemmas"] = bag_of_lemmas
        user_annotation["lemma_to_keywordbag"] = lemma_to_keywordbag

        with open(target_path, "w", encoding="utf-8") as fp:
            json.dump(user_annotation, fp)
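# get_camel_case_regexes and the other text-cleaning helpers above live
# elsewhere in the package. As an illustration only, camel-case splitting is
# typically done with the standard two-pattern recipe sketched below; the
# function names here are hypothetical, not this package's API.
import re


def _example_camel_case_regexes():
    # Boundary between any character and a capitalized word: "ABCWord" -> "ABC_Word".
    first_cap_re = re.compile(r"(.)([A-Z][a-z]+)")
    # Boundary between a lowercase letter or digit and an uppercase letter: "myWord" -> "my_Word".
    all_cap_re = re.compile(r"([a-z0-9])([A-Z])")
    return first_cap_re, all_cap_re


def _example_split_camel_case(name):
    # Insert underscores at the detected boundaries, then lowercase and split.
    first_cap_re, all_cap_re = _example_camel_case_regexes()
    with_boundaries = all_cap_re.sub(r"\1_\2", first_cap_re.sub(r"\1_\2", name))
    return with_boundaries.lower().split("_")  # "NewsAndPolitics" -> ["news", "and", "politics"]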
def worker_function(file_name_list, lemmatizing, source_folder, target_folder):
    source_path_list = (source_folder + "/" + file_name for file_name in file_name_list)
    target_path_list = (target_folder + "/" + file_name[:-4] + ".json" for file_name in file_name_list)

    # Process the Twitter lists of each user; iterating the two generators with
    # zip keeps source and target paths aligned even when a corpus without a
    # "lists" key is skipped.
    for source_path, target_path in zip(source_path_list, target_path_list):
        twitter_lists_corpus = load_pickle(source_path)
        if "lists" in twitter_lists_corpus.keys():
            twitter_lists_corpus = twitter_lists_corpus["lists"]
        else:
            continue

        bag_of_lemmas, lemma_to_keywordbag = user_twitter_list_bag_of_words(twitter_lists_corpus, lemmatizing)

        user_annotation = dict()
        user_annotation["bag_of_lemmas"] = bag_of_lemmas
        user_annotation["lemma_to_keywordbag"] = lemma_to_keywordbag

        with open(target_path, "w", encoding="utf-8") as fp:
            json.dump(user_annotation, fp)
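# make_annotation below fans worker_function out over a process pool, using a
# chunks helper that is defined elsewhere. A minimal sketch, assuming
# chunks(seq, n) splits a sequence into n roughly equal consecutive parts
# (one per worker); the name _example_chunks is illustrative:
def _example_chunks(sequence, n):
    items = list(sequence)
    size, remainder = divmod(len(items), n)
    start = 0
    for i in range(n):
        # The first `remainder` chunks get one extra item each.
        stop = start + size + (1 if i < remainder else 0)
        if stop > start:
            yield items[start:stop]
        start = stop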
def make_annotation(
    twitter_lists_folder,
    twitter_lists_keywords_folder,
    weakly_connected_graph_folder,
    weakly_connected_label_folder,
    full_graph_folder,
):
    # TODO: Move keywords from Mongo to the folder.

    # Read the set of users.
    weakly_connected_user_id_set = load_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl")
    weakly_connected_node_to_id = load_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl")
    id_to_name = load_pickle(full_graph_folder + "/id_to_name" + ".pkl")

    # Read the set of Twitter lists.
    twitter_list_file_list = os.listdir(twitter_lists_folder)
    twitter_list_file_list = [int(file_name[:-4]) for file_name in twitter_list_file_list]

    # Read which users are already annotated.
    user_keywords_file_list = os.listdir(twitter_lists_keywords_folder)
    user_keywords_file_list = [int(file_name[:-5]) for file_name in user_keywords_file_list]

    # Find which Twitter lists still need to be preprocessed.
    user_twitter_id_list = [
        file_name for file_name in twitter_list_file_list if file_name in weakly_connected_user_id_set
    ]
    user_twitter_id_list = [
        file_name for file_name in user_twitter_id_list if file_name not in user_keywords_file_list
    ]
    twitter_list_file_list = [str(file_name) + ".pkl" for file_name in user_twitter_id_list]

    # Preprocess the remaining lists in parallel.
    pool = Pool(processes=get_threads_number() * 2)
    user_chunks = chunks(twitter_list_file_list, get_threads_number() * 2)
    pool.map(
        partial(
            worker_function,
            lemmatizing="wordnet",
            source_folder=twitter_lists_folder,
            target_folder=twitter_lists_keywords_folder,
        ),
        user_chunks,
    )

    # Make the user-label matrix.
    user_keywords_file_list = [str(file_name) for file_name in user_keywords_file_list]
    user_twitter_list_keywords_gen = read_local_user_annotations(
        twitter_lists_keywords_folder, user_keywords_file_list
    )

    weakly_connected_id_to_node = dict(
        zip(weakly_connected_node_to_id.values(), weakly_connected_node_to_id.keys())
    )

    # Keep only the annotated users that belong to the weakly connected component.
    implicated_user_twitter_list_keywords_gen = (
        (int(user_twitter_id), twitter_list_keywords)
        for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen
        if int(user_twitter_id) in weakly_connected_id_to_node.keys()
    )

    ####################################################################################################################
    # Semi-automatic user annotation.
    ####################################################################################################################
    reveal_set = get_reveal_set()
    topic_keyword_dict = get_topic_keyword_dictionary()
    available_topics = set(topic_keyword_dict.keys())

    # Collect the keywords of the topics in the reveal set that have keyword dictionaries.
    keyword_list = list()
    for topic in reveal_set:
        if topic in available_topics:
            keyword_list.extend(topic_keyword_dict[topic])

    # Lemmatize the keywords.
    lemma_set = set()
    for keyword in keyword_list:
        lemma = clean_single_word(keyword, lemmatizing="wordnet")
        lemma_set.add(lemma)

    # Invert the topic-to-keywords map into a keyword-to-topic map.
    keyword_topic_dict = dict()
    for topic, keyword_set in topic_keyword_dict.items():
        for keyword in keyword_set:
            keyword_topic_dict[keyword] = topic

    user_label_matrix, annotated_nodes, label_to_lemma, node_to_lemma_tokeywordbag = form_user_term_matrix(
        implicated_user_twitter_list_keywords_gen,
        weakly_connected_id_to_node,
        lemma_set=lemma_set,
        keyword_to_topic_manual=keyword_topic_dict,
    )

    # Store the unfiltered annotation artifacts.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/unfiltered_user_label_matrix" + ".tsv",
        user_label_matrix,
        "\t",
        directed=True,
    )
    store_pickle(weakly_connected_label_folder + "/unfiltered_annotated_nodes" + ".pkl", annotated_nodes)
    store_pickle(weakly_connected_label_folder + "/unfiltered_label_to_lemma" + ".pkl", label_to_lemma)
    store_pickle(
        weakly_connected_label_folder + "/unfiltered_node_to_lemma_tokeywordbag" + ".pkl",
        node_to_lemma_tokeywordbag,
    )

    user_label_matrix, annotated_user_ids, label_to_lemma = filter_user_term_matrix(
        user_label_matrix, annotated_nodes, label_to_lemma, max_number_of_labels=None
    )
    lemma_to_keyword = form_lemma_tokeyword_map(annotated_nodes, node_to_lemma_tokeywordbag)
    # user_label_matrix, annotated_user_ids, label_to_lemma, lemma_to_keyword = semi_automatic_user_annotation(implicated_user_twitter_list_keywords_gen, weakly_connected_id_to_node)

    # Store the user-label binary matrix.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/user_label_matrix" + ".tsv", user_label_matrix, "\t", directed=True
    )

    # Store the user-label keyword matrix.
    write_screen_name_to_topics(
        weakly_connected_label_folder + "/user_name_to_topics" + ".tsv",
        user_label_matrix,
        weakly_connected_node_to_id,
        id_to_name,
        label_to_lemma,
        lemma_to_keyword,
        separator="\t",
    )

    return twitter_lists_folder
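# scipy_sparse_to_csv is imported from elsewhere in the package. A minimal
# sketch of the assumed semantics (dump the nonzeros of a sparse matrix as a
# "row<sep>col<sep>value" edge list; the mirroring behavior for undirected
# output is a guess, and the name _example_scipy_sparse_to_csv is illustrative):
def _example_scipy_sparse_to_csv(path, matrix, separator, directed=True):
    matrix = spsp.coo_matrix(matrix)
    with open(path, "w", encoding="utf-8") as fp:
        for i, j, v in zip(matrix.row, matrix.col, matrix.data):
            fp.write(str(i) + separator + str(j) + separator + str(v) + "\n")
            if not directed:
                # Presumably the symmetric entry is written as well.
                fp.write(str(j) + separator + str(i) + separator + str(v) + "\n")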
def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
    # Read relevant data.
    mention_graph = load_pickle(full_graph_folder + "/mention_graph" + ".pkl")
    mention_graph = spsp.coo_matrix(spsp.csr_matrix(mention_graph))
    retweet_graph = load_pickle(full_graph_folder + "/retweet_graph" + ".pkl")
    retweet_graph = spsp.coo_matrix(spsp.csr_matrix(retweet_graph))
    user_lemma_matrix = load_pickle(full_graph_folder + "/user_lemma_matrix" + ".pkl")
    user_lemma_matrix = spsp.coo_matrix(spsp.csr_matrix(user_lemma_matrix))
    user_id_set = load_pickle(full_graph_folder + "/user_id_set" + ".pkl")
    node_to_id = load_pickle(full_graph_folder + "/node_to_id" + ".pkl")

    # Extract the weakly connected component of the combined mention/retweet graph.
    weakly_connected_men_ret_graph, weakly_connected_node_to_id, old_node_list = extract_connected_components(
        spsp.coo_matrix(spsp.csr_matrix(mention_graph + retweet_graph)), "weak", node_to_id
    )

    # Calculate the user Twitter id set for the weakly connected component.
    weakly_connected_user_id_set = set(weakly_connected_node_to_id.values())

    node_array = np.array(old_node_list, dtype=np.int64)

    # Extract the corresponding mention graph, retweet graph, and user lemma matrix.
    weakly_connected_mention_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(mention_graph), node_array, directed=True
    )
    weakly_connected_retweet_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(retweet_graph), node_array, directed=True
    )
    user_lemma_matrix = spsp.csr_matrix(user_lemma_matrix)
    weakly_connected_user_lemma_matrix = user_lemma_matrix[node_array, :]

    # Change sparse matrices to coordinate format in order to save them as edge lists.
    weakly_connected_mention_graph = spsp.coo_matrix(weakly_connected_mention_graph)
    weakly_connected_retweet_graph = spsp.coo_matrix(weakly_connected_retweet_graph)
    weakly_connected_user_lemma_matrix = spsp.coo_matrix(weakly_connected_user_lemma_matrix)

    # Store the weakly connected data.
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/mention_graph.tsv",
        weakly_connected_mention_graph,
        separator="\t",
        directed=True,
    )
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/retweet_graph.tsv",
        weakly_connected_retweet_graph,
        separator="\t",
        directed=True,
    )
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/user_lemma_matrix.tsv",
        weakly_connected_user_lemma_matrix,
        separator="\t",
        directed=True,
    )
    store_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl", weakly_connected_user_id_set)
    store_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl", weakly_connected_node_to_id)
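# extract_connected_components is defined elsewhere in the package. One way the
# weak-component extraction above could be implemented on top of
# scipy.sparse.csgraph; this stand-in (returning the largest weak component, a
# re-indexed node-to-id map, and the kept old node indices) is illustrative,
# not the project's implementation:
from scipy.sparse.csgraph import connected_components


def _example_extract_weak_component(adjacency, node_to_id):
    adjacency = spsp.csr_matrix(adjacency)
    # Label every node with its weakly connected component.
    n_components, labels = connected_components(adjacency, directed=True, connection="weak")
    # Keep the nodes of the largest component, preserving their original order.
    largest_label = np.argmax(np.bincount(labels))
    old_node_list = np.where(labels == largest_label)[0]
    # Re-index the kept nodes to 0..k-1 and carry their Twitter ids along.
    subgraph = adjacency[old_node_list, :][:, old_node_list]
    new_node_to_id = {new: node_to_id[int(old)] for new, old in enumerate(old_node_list)}
    return spsp.coo_matrix(subgraph), new_node_to_id, list(old_node_list)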