def process_tweet_collection(tweet_generator, full_graph_folder):
    """Extract graphs and lemma data from tweets and persist them to disk.

    The eight objects returned by ``extract_graphs_and_lemmas_from_tweets`` are
    written into ``full_graph_folder``.  The three matrices (mention graph,
    retweet graph, user-lemma matrix) are stored both as a pickle and as a
    tab-separated file; the remaining objects are stored as pickles only.

    :param tweet_generator: Passed unchanged to the extraction helper.
    :param full_graph_folder: Folder path that receives all output files.
    :return: None.
    """
    (
        mention_graph,
        retweet_graph,
        user_lemma_matrix,
        tweet_id_set,
        user_id_set,
        node_to_id,
        lemma_to_attribute,
        id_to_name,
    ) = extract_graphs_and_lemmas_from_tweets(tweet_generator)

    # Matrices: one pickle plus one directed, tab-separated dump each.
    matrix_outputs = (
        ("mention_graph", mention_graph),
        ("retweet_graph", retweet_graph),
        ("user_lemma_matrix", user_lemma_matrix),
    )
    for stem, matrix in matrix_outputs:
        target = full_graph_folder + "/" + stem
        store_pickle(target + ".pkl", matrix)
        scipy_sparse_to_csv(target + ".tsv", matrix, "\t", directed=True)

    # Everything else is only pickled.
    pickle_only_outputs = (
        ("tweet_id_set", tweet_id_set),
        ("user_id_set", user_id_set),
        ("node_to_id", node_to_id),
        ("lemma_to_attribute", lemma_to_attribute),
        ("id_to_name", id_to_name),
    )
    for stem, payload in pickle_only_outputs:
        store_pickle(full_graph_folder + "/" + stem + ".pkl", payload)
def make_annotation(
    twitter_lists_folder,
    twitter_lists_keywords_folder,
    weakly_connected_graph_folder,
    weakly_connected_label_folder,
    full_graph_folder,
):
    """Annotate users of the weakly connected graph and store the label data.

    Steps, in order:

    1. Load the user id set and node-to-id map from
       ``weakly_connected_graph_folder`` and the id-to-name map from
       ``full_graph_folder``.
    2. Run ``worker_function`` in a process pool over the twitter-list files
       of users that are in the weakly connected user id set and have no file
       yet in ``twitter_lists_keywords_folder``.
    3. Read the stored annotations with ``read_local_user_annotations`` and
       build a user-label matrix with ``form_user_term_matrix``.
    4. Store the unfiltered matrix and its companions, filter the matrix with
       ``filter_user_term_matrix`` and store the filtered matrix plus a
       screen-name-to-topics file in ``weakly_connected_label_folder``.

    :param twitter_lists_folder: Folder whose file names are a user id followed
        by a four-character extension (``".pkl"`` is re-appended below).
    :param twitter_lists_keywords_folder: Folder whose file names are a user id
        followed by a five-character extension.
    :param weakly_connected_graph_folder: Folder holding ``user_id_set.pkl``
        and ``node_to_id.pkl``.
    :param weakly_connected_label_folder: Output folder for label files.
    :param full_graph_folder: Folder holding ``id_to_name.pkl``.
    :return: ``twitter_lists_folder``, unchanged.
    """
    # TODO: Move keywords from Mongo to the folder.
    # Read set of users.
    weakly_connected_user_id_set = load_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl")
    weakly_connected_node_to_id = load_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl")
    id_to_name = load_pickle(full_graph_folder + "/id_to_name" + ".pkl")

    # User ids that own a twitter-list file (extension of 4 characters stripped).
    # NOTE(review): int() raises ValueError on any stray non-numeric file name.
    twitter_list_user_ids = [int(file_name[:-4]) for file_name in os.listdir(twitter_lists_folder)]

    # User ids that already have a keyword file (extension of 5 characters stripped).
    annotated_user_ids_on_disk = [int(file_name[:-5]) for file_name in os.listdir(twitter_lists_keywords_folder)]
    # Set copy so the membership test below is O(1) instead of a list scan per user.
    already_annotated = set(annotated_user_ids_on_disk)

    # Twitter lists that still need preprocessing: user is in the weakly
    # connected component and has no keyword file yet.
    pending_user_ids = [
        user_id
        for user_id in twitter_list_user_ids
        if user_id in weakly_connected_user_id_set and user_id not in already_annotated
    ]
    twitter_list_file_list = [str(user_id) + ".pkl" for user_id in pending_user_ids]

    worker_count = get_threads_number() * 2
    pool = Pool(processes=worker_count)
    try:
        user_chunks = chunks(twitter_list_file_list, worker_count)
        pool.map(
            partial(
                worker_function,
                lemmatizing="wordnet",
                source_folder=twitter_lists_folder,
                target_folder=twitter_lists_keywords_folder,
            ),
            user_chunks,
        )
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    # Make user-label matrix.
    # NOTE(review): this listing was taken before the pool ran, so keyword files
    # presumably produced by this run are not read until the next invocation --
    # confirm that this is intended.
    user_keywords_file_list = [str(user_id) for user_id in annotated_user_ids_on_disk]
    user_twitter_list_keywords_gen = read_local_user_annotations(twitter_lists_keywords_folder, user_keywords_file_list)

    # Inverse of node_to_id: twitter id -> node.
    weakly_connected_id_to_node = {
        twitter_id: node for node, twitter_id in weakly_connected_node_to_id.items()
    }

    # Keep only annotations of users that belong to the weakly connected graph.
    implicated_user_twitter_list_keywords_gen = (
        (int(user_twitter_id), twitter_list_keywords)
        for user_twitter_id, twitter_list_keywords in user_twitter_list_keywords_gen
        if int(user_twitter_id) in weakly_connected_id_to_node
    )

    ####################################################################################################################
    # Semi-automatic user annotation.
    ####################################################################################################################
    reveal_set = get_reveal_set()
    topic_keyword_dict = get_topic_keyword_dictionary()

    # Lemmas of every keyword that belongs to a reveal-set topic we have keywords for.
    lemma_set = set()
    for topic in reveal_set:
        if topic not in topic_keyword_dict:
            continue
        for keyword in topic_keyword_dict[topic]:
            lemma_set.add(clean_single_word(keyword, lemmatizing="wordnet"))

    # Reverse map keyword -> topic; a keyword listed under several topics keeps
    # the topic that is iterated last.
    keyword_topic_dict = dict()
    for topic, keyword_set in topic_keyword_dict.items():
        for keyword in keyword_set:
            keyword_topic_dict[keyword] = topic

    user_label_matrix, annotated_nodes, label_to_lemma, node_to_lemma_tokeywordbag = form_user_term_matrix(
        implicated_user_twitter_list_keywords_gen,
        weakly_connected_id_to_node,
        lemma_set=lemma_set,
        keyword_to_topic_manual=keyword_topic_dict,
    )

    # Store the unfiltered results before filtering overwrites the local names.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/unfiltered_user_label_matrix" + ".tsv", user_label_matrix, "\t", directed=True
    )
    store_pickle(weakly_connected_label_folder + "/unfiltered_annotated_nodes" + ".pkl", annotated_nodes)
    store_pickle(weakly_connected_label_folder + "/unfiltered_label_to_lemma" + ".pkl", label_to_lemma)
    store_pickle(
        weakly_connected_label_folder + "/unfiltered_node_to_lemma_tokeywordbag" + ".pkl", node_to_lemma_tokeywordbag
    )

    # The second returned value (annotated user ids) is not used further.
    user_label_matrix, _annotated_user_ids, label_to_lemma = filter_user_term_matrix(
        user_label_matrix, annotated_nodes, label_to_lemma, max_number_of_labels=None
    )

    # Built from the unfiltered annotated nodes, as before.
    lemma_to_keyword = form_lemma_tokeyword_map(annotated_nodes, node_to_lemma_tokeywordbag)

    # Store user-label binary matrix.
    scipy_sparse_to_csv(
        weakly_connected_label_folder + "/user_label_matrix" + ".tsv", user_label_matrix, "\t", directed=True
    )

    # Store user-label keyword matrix.
    write_screen_name_to_topics(
        weakly_connected_label_folder + "/user_name_to_topics" + ".tsv",
        user_label_matrix,
        weakly_connected_node_to_id,
        id_to_name,
        label_to_lemma,
        lemma_to_keyword,
        separator="\t",
    )
    return twitter_lists_folder
def make_implicit_graphs(
    weakly_connected_graph_folder,
    simple_undirected_graph_folder,
    combinatorial_implicit_graph_folder,
    directed_implicit_graph_folder,
):
    """Derive undirected and implicit graphs from the mention/retweet graphs.

    Reads ``mention_graph.tsv`` and ``retweet_graph.tsv`` from
    ``weakly_connected_graph_folder`` and writes, always with
    ``directed=False`` and a tab separator:

    * symmetrized mention, retweet and averaged mention+retweet graphs into
      ``simple_undirected_graph_folder``;
    * graphs returned by ``get_adjacency_matrix_via_combinatorial_laplacian``
      into ``combinatorial_implicit_graph_folder``;
    * graphs returned by ``get_adjacency_matrix_via_directed_laplacian`` into
      ``directed_implicit_graph_folder``;
    * the two multiview graphs returned by ``get_implicit_adjacency_matrices``
      as ``men_ret_graph.tsv`` in the combinatorial and directed folders.

    Lemma-graph variants are currently disabled and not produced.

    :return: None.
    """

    def _to_coo(matrix):
        # Convert to CSR first and then to COO before the matrix is written.
        return spsp.coo_matrix(spsp.csr_matrix(matrix))

    def _write_undirected(folder, stem, matrix):
        scipy_sparse_to_csv(folder + "/" + stem + ".tsv", matrix, separator="\t", directed=False)

    def _read_component_graph(stem):
        return read_adjacency_matrix(weakly_connected_graph_folder + "/" + stem + ".tsv", separator="\t")

    # Read relevant data.
    mention_graph = _read_component_graph("mention_graph")
    retweet_graph = _read_component_graph("retweet_graph")

    ####################################################################################################################
    # Make simple undirected graphs.
    ####################################################################################################################
    symmetrized = []
    for stem, graph, message in (
        ("mention_graph", mention_graph, "Simple Undirected Mention Graph."),
        ("retweet_graph", retweet_graph, "Simple Undirected Retweet Graph."),
    ):
        # Average the graph with its transpose.
        undirected = _to_coo((graph + graph.transpose()) / 2)
        _write_undirected(simple_undirected_graph_folder, stem, undirected)
        gc.collect()
        print(message)
        symmetrized.append(undirected)

    undirected_mention, undirected_retweet = symmetrized
    undirected_men_ret = _to_coo((undirected_mention + undirected_retweet) / 2)
    _write_undirected(simple_undirected_graph_folder, "men_ret_graph", undirected_men_ret)
    gc.collect()
    print("Simple Undirected Mention+Retweet Graph.")

    ####################################################################################################################
    # Make combinatorial implicit graphs, then directed implicit graphs.
    ####################################################################################################################
    laplacian_variants = (
        (
            get_adjacency_matrix_via_combinatorial_laplacian,
            combinatorial_implicit_graph_folder,
            "Implicit Combinatorial Mention Graph.",
            "Implicit Combinatorial Retweet Graph.",
        ),
        (
            get_adjacency_matrix_via_directed_laplacian,
            directed_implicit_graph_folder,
            "Implicit Directed Mention Graph.",
            "Implicit Directed Retweet Graph.",
        ),
    )
    for build_implicit, out_folder, mention_message, retweet_message in laplacian_variants:
        for stem, graph, message in (
            ("mention_graph", mention_graph, mention_message),
            ("retweet_graph", retweet_graph, retweet_message),
        ):
            # The second returned value is not needed here.
            implicit_graph, _phi = build_implicit(graph, 0.1)
            implicit_graph = _to_coo(implicit_graph)
            _write_undirected(out_folder, stem, implicit_graph)
            gc.collect()
            print(message)
            print(implicit_graph.sum(axis=1))

    ####################################################################################################################
    # Make multiview transition matrices.
    ####################################################################################################################
    men_ret_transition_matrix = get_multiview_transition_matrix(
        [mention_graph, retweet_graph], weights=None, method="zhou"
    )
    (
        implicit_combinatorial_men_ret_graph,
        _com_phi,
        implicit_directed_men_ret_graph,
        _dir_phi,
    ) = get_implicit_adjacency_matrices(men_ret_transition_matrix, rho=0.1)

    # These two are written exactly as returned, without the CSR/COO conversion.
    _write_undirected(combinatorial_implicit_graph_folder, "men_ret_graph", implicit_combinatorial_men_ret_graph)
    _write_undirected(directed_implicit_graph_folder, "men_ret_graph", implicit_directed_men_ret_graph)
    gc.collect()
    print("Implicit Mention-Retweet Graphs.")
def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
    """Restrict the full graph data to a weakly connected component and store it.

    The component is obtained by calling ``extract_connected_components`` in
    ``"weak"`` mode on the sum of the mention and retweet graphs.  The mention
    graph, retweet graph and user-lemma matrix are then reduced to the nodes it
    returns and written as directed, tab-separated files into
    ``weakly_connected_graph_folder``, together with pickles of the component's
    user id set and node-to-id map.

    :param full_graph_folder: Folder holding the pickled full-graph data.
    :param weakly_connected_graph_folder: Output folder.
    :return: None.
    """

    def _to_coo(matrix):
        return spsp.coo_matrix(spsp.csr_matrix(matrix))

    def _load_full(stem):
        return load_pickle(full_graph_folder + "/" + stem + ".pkl")

    def _write_directed(stem, matrix):
        scipy_sparse_to_csv(
            weakly_connected_graph_folder + "/" + stem + ".tsv",
            matrix,
            separator="\t",
            directed=True,
        )

    # Read relevant data.
    full_mention = _to_coo(_load_full("mention_graph"))
    full_retweet = _to_coo(_load_full("retweet_graph"))
    full_user_lemma = _to_coo(_load_full("user_lemma_matrix"))
    _load_full("user_id_set")  # Still loaded, although the value is not used below.
    node_to_id = _load_full("node_to_id")

    # Component of the combined mention + retweet graph; the returned component
    # matrix itself is not used, only the node mapping and the kept node list.
    _component_graph, component_node_to_id, kept_nodes = extract_connected_components(
        _to_coo(full_mention + full_retweet), "weak", node_to_id
    )

    # Twitter ids of the users inside the component.
    component_user_ids = set(component_node_to_id.values())

    node_array = np.array(kept_nodes, dtype=np.int64)

    # Pull the matching sub-graphs and the matching rows of the user-lemma matrix.
    component_mention = submatrix_pull_via_networkx(spsp.coo_matrix(full_mention), node_array, directed=True)
    component_retweet = submatrix_pull_via_networkx(spsp.coo_matrix(full_retweet), node_array, directed=True)
    component_user_lemma = spsp.csr_matrix(full_user_lemma)[node_array, :]

    # Coordinate format is used for writing the edge lists.
    component_mention = spsp.coo_matrix(component_mention)
    component_retweet = spsp.coo_matrix(component_retweet)
    component_user_lemma = spsp.coo_matrix(component_user_lemma)

    # Store weakly connected data.
    _write_directed("mention_graph", component_mention)
    _write_directed("retweet_graph", component_retweet)
    _write_directed("user_lemma_matrix", component_user_lemma)

    store_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl", component_user_ids)
    store_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl", component_node_to_id)