Example #1
def integrate_graphs(mention_graph, retweet_graph, node_to_id, restart_probability, number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix as a SciPy sparse matrix.
             - number_of_threads:

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    # Form a symmetric aggregate adjacency matrix from the mention and retweet graphs.
    adjacency_matrix = (0.25 * mention_graph +
                        0.25 * mention_graph.transpose() +
                        0.25 * retweet_graph +
                        0.25 * retweet_graph.transpose())

    # Extract the weakly connected component of the aggregate graph.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract graph-structure proximity features with ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances
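The aggregation step above is just an equally weighted, symmetrized sum of the two interaction graphs. The following self-contained sketch illustrates it on toy data; the matrices and the symmetry check are purely illustrative and not part of the original pipeline.

import numpy as np
import scipy.sparse as spsp

# Toy directed mention/retweet graphs (illustrative data only).
mention_toy = spsp.csr_matrix(np.array([[0, 2, 0],
                                        [0, 0, 1],
                                        [0, 0, 0]], dtype=np.float64))
retweet_toy = spsp.csr_matrix(np.array([[0, 0, 1],
                                        [1, 0, 0],
                                        [0, 0, 0]], dtype=np.float64))

# Equally weighted, symmetrized aggregation, as in integrate_graphs above.
aggregate = (0.25 * mention_toy + 0.25 * mention_toy.transpose() +
             0.25 * retweet_toy + 0.25 * retweet_toy.transpose())

# The result is symmetric, so it can be treated as an undirected weighted graph.
assert np.allclose(aggregate.toarray(), aggregate.toarray().T)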
Example #2
def integrate_graphs(mention_graph,
                     retweet_graph,
                     user_lemma_matrix,
                     node_to_id,
                     lemma_to_attribute,
                     restart_probability,
                     number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix as a SciPy sparse matrix.
             - number_of_threads:

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    text_graph = make_text_graph(user_lemma_matrix,
                                 dimensionality=50,
                                 metric="angular",
                                 number_of_estimators=5,
                                 number_of_neighbors=3)

    # Fuse the mention, retweet and text graphs into a single adjacency matrix.
    adjacency_matrix, laplacian_matrix = graph_fusion_directed(
        adjacency_matrix_list=[mention_graph, retweet_graph, text_graph],
        weights=[1.0, 1.0, 1.0],
        fusion_type="zhou",
        laplacian_type="directed")

    # Extract the weakly connected component of the fused graph.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract graph-structure proximity features with ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances, old_node_list
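A minimal sketch of the idea behind this fused variant: build a k-nearest-neighbour text-similarity graph from the user-lemma matrix and blend it with the interaction graphs. It uses scikit-learn's kneighbors_graph and an equally weighted sum purely as stand-ins for the package's make_text_graph and graph_fusion_directed, whose internals are not shown here; the toy data and weights are assumptions for illustration only.

import numpy as np
import scipy.sparse as spsp
from sklearn.neighbors import kneighbors_graph

rng = np.random.default_rng(0)
user_lemma_toy = spsp.csr_matrix(rng.random((5, 8)))           # 5 users, 8 lemmas (toy data)
mention_toy = spsp.random(5, 5, density=0.3, random_state=0)   # toy mention graph
retweet_toy = spsp.random(5, 5, density=0.3, random_state=1)   # toy retweet graph

# Text graph: connect each user to its 2 most similar users by cosine distance
# (a stand-in for make_text_graph, which uses an approximate angular-metric index).
text_graph_toy = kneighbors_graph(user_lemma_toy, n_neighbors=2, metric="cosine")

# Naive fusion: an equally weighted sum of the three graphs
# (a stand-in for graph_fusion_directed with fusion_type="zhou").
fused_toy = (mention_toy + retweet_toy + text_graph_toy) / 3.0
print(fused_toy.nnz, "non-zero edges in the fused toy graph")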
Example #3
import numpy as np
import scipy.sparse as spsp


def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
    """
    Restrict the full graph data to the weakly connected component of the combined
    mention + retweet graph and store the resulting matrices and id maps to disk.
    """
    # Read relevant data.
    mention_graph = load_pickle(full_graph_folder + "/mention_graph.pkl")
    mention_graph = spsp.coo_matrix(spsp.csr_matrix(mention_graph))
    retweet_graph = load_pickle(full_graph_folder + "/retweet_graph.pkl")
    retweet_graph = spsp.coo_matrix(spsp.csr_matrix(retweet_graph))
    user_lemma_matrix = load_pickle(full_graph_folder + "/user_lemma_matrix.pkl")
    user_lemma_matrix = spsp.coo_matrix(spsp.csr_matrix(user_lemma_matrix))
    user_id_set = load_pickle(full_graph_folder + "/user_id_set.pkl")
    node_to_id = load_pickle(full_graph_folder + "/node_to_id.pkl")

    # Extract the weakly connected component of the combined mention + retweet graph.
    weakly_connected_men_ret_graph, weakly_connected_node_to_id, old_node_list = extract_connected_components(
        spsp.coo_matrix(spsp.csr_matrix(mention_graph + retweet_graph)), "weak", node_to_id
    )

    # Calculate the user Twitter id set for the weakly connected component.
    weakly_connected_user_id_set = set(weakly_connected_node_to_id.values())

    node_array = np.array(old_node_list, dtype=np.int64)

    # Extract the corresponding mention graph, retweet graph and user lemma matrix.
    weakly_connected_mention_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(mention_graph), node_array, directed=True
    )

    weakly_connected_retweet_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(retweet_graph), node_array, directed=True
    )

    user_lemma_matrix = spsp.csr_matrix(user_lemma_matrix)
    weakly_connected_user_lemma_matrix = user_lemma_matrix[node_array, :]

    # Change sparse matrices to coordinate format in order to save as an edge list.
    weakly_connected_mention_graph = spsp.coo_matrix(weakly_connected_mention_graph)
    weakly_connected_retweet_graph = spsp.coo_matrix(weakly_connected_retweet_graph)
    weakly_connected_user_lemma_matrix = spsp.coo_matrix(weakly_connected_user_lemma_matrix)

    # Store weakly connected data.
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/mention_graph.tsv",
        weakly_connected_mention_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/retweet_graph.tsv",
        weakly_connected_retweet_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/user_lemma_matrix.tsv",
        weakly_connected_user_lemma_matrix,
        separator="\t",
        directed=True,
    )

    store_pickle(weakly_connected_graph_folder + "/user_id_set.pkl", weakly_connected_user_id_set)
    store_pickle(weakly_connected_graph_folder + "/node_to_id.pkl", weakly_connected_node_to_id)
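For reference, the following sketch shows how the component-extraction and edge-list-export steps can be reproduced with plain SciPy. It is an assumption about what extract_connected_components, submatrix_pull_via_networkx and scipy_sparse_to_csv do (keep the largest weak component, pull the induced submatrix, write row/column/weight triplets); the toy graph is illustrative only.

import numpy as np
import scipy.sparse as spsp
from scipy.sparse.csgraph import connected_components

# Toy directed graph with two weak components: {0, 1} and {2, 3}.
toy_graph = spsp.csr_matrix(np.array([[0, 1, 0, 0],
                                      [0, 0, 0, 0],
                                      [0, 0, 0, 1],
                                      [0, 0, 0, 0]], dtype=np.float64))

n_components, labels = connected_components(toy_graph, directed=True, connection="weak")

# Keep the nodes of the largest weak component and pull the induced submatrix.
largest_label = np.argmax(np.bincount(labels))
kept_nodes = np.flatnonzero(labels == largest_label)
sub_graph = toy_graph[kept_nodes, :][:, kept_nodes]

# Write the submatrix as a tab-separated edge list, as the stored .tsv files above are.
sub_coo = spsp.coo_matrix(sub_graph)
for row, col, weight in zip(sub_coo.row, sub_coo.col, sub_coo.data):
    print(f"{row}\t{col}\t{weight}")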