def integrate_graphs(mention_graph, retweet_graph, node_to_id, restart_probability, number_of_threads):
    """
    Combine the mention and retweet graphs into a single aggregate adjacency
    matrix and compute graph-based features on it.

    NOTE(review): a second function named `integrate_graphs` is defined later
    in this module and shadows this one at import time -- confirm which
    definition is intended to be live.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - restart_probability: Random-walk restart probability passed to ARCTE.
             - number_of_threads: Number of threads passed to ARCTE.

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated
                         by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    # Average the two directed graphs together with their transposes so the
    # aggregate matrix is symmetric.
    aggregate_matrix = 0.25 * mention_graph + \
        0.25 * mention_graph.transpose() + \
        0.25 * retweet_graph + \
        0.25 * retweet_graph.transpose()

    # Restrict to the weakly connected component and remap node ids.
    aggregate_matrix, node_to_id, old_node_list = extract_connected_components(aggregate_matrix,
                                                                               "weak",
                                                                               node_to_id)

    # Structural proximity features via ARCTE.
    features = arcte(adjacency_matrix=aggregate_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(aggregate_matrix)

    return aggregate_matrix, node_to_id, features, node_importances
def integrate_graphs(mention_graph, retweet_graph, user_lemma_matrix, node_to_id, lemma_to_attribute, restart_probability, number_of_threads):
    """
    Fuse the mention, retweet and text-similarity graphs into a single
    aggregate graph and compute graph-based features on it.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix
                                  as a SciPy sparse matrix.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - lemma_to_attribute: NOTE(review): accepted but never used in
                                   this function -- confirm intent.
             - restart_probability: Random-walk restart probability passed to ARCTE.
             - number_of_threads: Number of threads passed to ARCTE.

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated
                         by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
             - old_node_list: The node indices retained by the connected-
                              component extraction.
    """
    # Build a text-similarity graph out of the user-lemma representations.
    text_graph = make_text_graph(user_lemma_matrix,
                                 dimensionality=50,
                                 metric="angular",
                                 number_of_estimators=5,
                                 number_of_neighbors=3)

    # Fuse the three views into one adjacency matrix. The Laplacian returned
    # by the fusion is not consumed here.
    adjacency_matrix, _laplacian_matrix = graph_fusion_directed(
        adjacency_matrix_list=[mention_graph, retweet_graph, text_graph],
        weights=[1.0, 1.0, 1.0],
        fusion_type="zhou",
        laplacian_type="directed")

    # Restrict to the weakly connected component and remap node ids.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix,
                                                                               "weak",
                                                                               node_to_id)

    # Structural proximity features via ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances, old_node_list
def weakly_connected_graph(full_graph_folder, weakly_connected_graph_folder):
    """
    Restrict the full graph data to its weakly connected component and write
    the results out as TSV edge lists plus pickled id maps.

    Inputs:  - full_graph_folder: Folder holding the pickled full-graph data
               (mention_graph, retweet_graph, user_lemma_matrix, user_id_set,
               node_to_id).
             - weakly_connected_graph_folder: Output folder for the weakly
               connected versions of the same data.
    """
    # Read relevant data.
    mention_graph = load_pickle(full_graph_folder + "/mention_graph" + ".pkl")
    mention_graph = spsp.coo_matrix(spsp.csr_matrix(mention_graph))

    retweet_graph = load_pickle(full_graph_folder + "/retweet_graph" + ".pkl")
    retweet_graph = spsp.coo_matrix(spsp.csr_matrix(retweet_graph))

    user_lemma_matrix = load_pickle(full_graph_folder + "/user_lemma_matrix" + ".pkl")
    user_lemma_matrix = spsp.coo_matrix(spsp.csr_matrix(user_lemma_matrix))

    # NOTE(review): user_id_set is loaded but never used in this function.
    user_id_set = load_pickle(full_graph_folder + "/user_id_set" + ".pkl")
    node_to_id = load_pickle(full_graph_folder + "/node_to_id" + ".pkl")

    # Extract weakly connected graph for the mention graph.
    # The component is computed on the union (sum) of mention and retweet
    # edges. NOTE(review): the returned combined graph itself is never used
    # below -- only the remapped node_to_id and the old node list are.
    weakly_connected_men_ret_graph, weakly_connected_node_to_id, old_node_list = extract_connected_components(
        spsp.coo_matrix(spsp.csr_matrix(mention_graph + retweet_graph)),
        "weak",
        node_to_id
    )

    # Calculate the user twitter id set for the weakly connected component.
    weakly_connected_user_id_set = set(list(weakly_connected_node_to_id.values()))

    # Old node indices (positions in the full graphs) retained by the component.
    node_array = np.array(old_node_list, dtype=np.int64)

    # Extract corresponding retweet graph and user lemma matrix.
    weakly_connected_mention_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(mention_graph),
        node_array,
        directed=True
    )

    weakly_connected_retweet_graph = submatrix_pull_via_networkx(
        spsp.coo_matrix(retweet_graph),
        node_array,
        directed=True
    )

    # CSR supports row slicing; select the rows of the retained nodes.
    user_lemma_matrix = spsp.csr_matrix(user_lemma_matrix)
    weakly_connected_user_lemma_matrix = user_lemma_matrix[node_array, :]

    # Change sparse matrices to coordinate format in order to save as an edge list.
    weakly_connected_mention_graph = spsp.coo_matrix(weakly_connected_mention_graph)
    weakly_connected_retweet_graph = spsp.coo_matrix(weakly_connected_retweet_graph)
    weakly_connected_user_lemma_matrix = spsp.coo_matrix(weakly_connected_user_lemma_matrix)

    # Store weakly connected data.
    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/mention_graph.tsv",
        weakly_connected_mention_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/retweet_graph.tsv",
        weakly_connected_retweet_graph,
        separator="\t",
        directed=True,
    )

    scipy_sparse_to_csv(
        weakly_connected_graph_folder + "/user_lemma_matrix.tsv",
        weakly_connected_user_lemma_matrix,
        separator="\t",
        directed=True,
    )

    store_pickle(weakly_connected_graph_folder + "/user_id_set" + ".pkl", weakly_connected_user_id_set)
    store_pickle(weakly_connected_graph_folder + "/node_to_id" + ".pkl", weakly_connected_node_to_id)