def integrate_graphs(mention_graph, retweet_graph, node_to_id, restart_probability, number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - restart_probability: The restart probability for the vertex-centric PageRank calculation in ARCTE.
             - number_of_threads: The number of parallel threads to use for feature extraction.

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in SciPy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    # Form the aggregate adjacency matrix as the symmetrized average of the mention and retweet graphs.
    adjacency_matrix = 0.25*mention_graph +\
                       0.25*mention_graph.transpose() +\
                       0.25*retweet_graph +\
                       0.25*retweet_graph.transpose()

    # Keep the weakly connected components and reindex the nodes accordingly.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract graph structure proximity features with ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances
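# A hedged usage sketch for integrate_graphs above: the toy sparse matrices and the node map
# are illustrative assumptions, not data or defaults taken from this code base.
import scipy.sparse as spsp

toy_mention_graph = spsp.csr_matrix([[0.0, 1.0], [0.0, 0.0]])   # user 0 mentions user 1
toy_retweet_graph = spsp.csr_matrix([[0.0, 0.0], [1.0, 0.0]])   # user 1 retweets user 0
toy_node_to_id = {0: "111", 1: "222"}

aggregate_adjacency, node_map, arcte_features, importances = integrate_graphs(
    mention_graph=toy_mention_graph,
    retweet_graph=toy_retweet_graph,
    node_to_id=toy_node_to_id,
    restart_probability=0.1,
    number_of_threads=4)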
def feature_extraction(adjacency_matrix, feature_extraction_method_name, thread_num, feature_extraction_parameters):
    start_time = time.time()

    if feature_extraction_method_name == "arcte":
        epsilon = feature_extraction_parameters["epsilon"]
        rho = feature_extraction_parameters["rho"]
        feature_matrix = arcte(adjacency_matrix, rho, epsilon, thread_num)
    elif feature_extraction_method_name == "mroc":
        alpha = feature_extraction_parameters["alpha"]
        feature_matrix = mroc(adjacency_matrix, alpha)
    elif feature_extraction_method_name == "louvain":
        feature_matrix = louvain(adjacency_matrix)
    elif feature_extraction_method_name == "basecomm":
        feature_matrix = base_communities(adjacency_matrix)
    elif feature_extraction_method_name == "lapeig":
        dimensionality = feature_extraction_parameters["dimensionality"]
        feature_matrix = laplacian_eigenmaps(adjacency_matrix, dimensionality)
    elif feature_extraction_method_name == "repeig":
        dimensionality = feature_extraction_parameters["dimensionality"]
        feature_matrix = replicator_eigenmaps(adjacency_matrix, dimensionality)
    else:
        raise RuntimeError("Invalid feature extraction method name: " + feature_extraction_method_name)

    elapsed_time = time.time() - start_time

    return feature_matrix, elapsed_time
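# A hedged usage sketch for the feature_extraction dispatcher above. The parameter values mirror
# the ARCTE defaults used elsewhere in this code (rho=0.1, epsilon=1.0e-05), while the toy
# adjacency matrix and the lapeig dimensionality are assumptions for illustration only.
import scipy.sparse as spsp

toy_adjacency = spsp.csr_matrix([[0.0, 1.0, 1.0],
                                 [1.0, 0.0, 0.0],
                                 [1.0, 0.0, 0.0]])

arcte_parameters = {"rho": 0.1, "epsilon": 1.0e-05}
feature_matrix, elapsed_time = feature_extraction(toy_adjacency,
                                                  feature_extraction_method_name="arcte",
                                                  thread_num=4,
                                                  feature_extraction_parameters=arcte_parameters)

# The eigenmap-based methods ("lapeig", "repeig") expect a "dimensionality" entry instead.
lapeig_parameters = {"dimensionality": 2}
feature_matrix, elapsed_time = feature_extraction(toy_adjacency,
                                                  feature_extraction_method_name="lapeig",
                                                  thread_num=4,
                                                  feature_extraction_parameters=lapeig_parameters)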
def integrate_graphs(mention_graph, retweet_graph, user_lemma_matrix, node_to_id, lemma_to_attribute, restart_probability, number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix as a SciPy sparse matrix.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - lemma_to_attribute: A lemma to attribute index map.
             - restart_probability: The restart probability for the vertex-centric PageRank calculation in ARCTE.
             - number_of_threads: The number of parallel threads to use for feature extraction.

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in SciPy sparse matrix format.
             - node_importances: A vector containing node importance values.
             - old_node_list: The original node indices retained after connected component extraction.
    """
    # Build an approximate text-similarity graph from the user lemma representations.
    text_graph = make_text_graph(user_lemma_matrix,
                                 dimensionality=50,
                                 metric="angular",
                                 number_of_estimators=5,
                                 number_of_neighbors=3)

    # Form the aggregate adjacency matrix by fusing the mention, retweet and text graphs.
    adjacency_matrix,\
        laplacian_matrix = graph_fusion_directed(adjacency_matrix_list=[mention_graph, retweet_graph, text_graph],
                                                 weights=[1.0, 1.0, 1.0],
                                                 fusion_type="zhou",
                                                 laplacian_type="directed")

    # Keep the weakly connected components and reindex the nodes accordingly.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract graph structure proximity features with ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances, old_node_list
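# A hedged usage sketch of the fusion-based variant above. All toy inputs below are
# assumptions for illustration (including the lemma map format); note that this variant
# returns five values, the last being the retained old node indices.
toy_user_lemma_matrix = spsp.csr_matrix([[1.0, 0.0, 2.0],
                                         [0.0, 3.0, 0.0]])       # 2 users x 3 lemmas
toy_lemma_to_attribute = {"lemma_a": 0, "lemma_b": 1, "lemma_c": 2}

(aggregate_adjacency, node_map, arcte_features,
 importances, retained_nodes) = integrate_graphs(mention_graph=toy_mention_graph,
                                                 retweet_graph=toy_retweet_graph,
                                                 user_lemma_matrix=toy_user_lemma_matrix,
                                                 node_to_id=toy_node_to_id,
                                                 lemma_to_attribute=toy_lemma_to_attribute,
                                                 restart_probability=0.1,
                                                 number_of_threads=4)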
def main():
    ####################################################################################################################
    # Parse arguments.
    ####################################################################################################################
    parser = argparse.ArgumentParser()

    # File paths.
    parser.add_argument("-i", "--input", dest="input_edge_list_path",
                        help="This is the file path of the graph in edge list format.",
                        type=str, required=True)
    parser.add_argument("-o", "--output", dest="output_feature_path",
                        help="This is the file path where the extracted features will be written.",
                        type=str, required=True)

    # Edge list parsing configuration.
    parser.add_argument("-s", "--separator", dest="separator",
                        help="The character(s) separating the values in the edge list (default is tab: \"\t\").",
                        type=str, required=False, default="\t")
    parser.add_argument("-u", "--undirected", dest="undirected",
                        help="Also create the reciprocal edge for each edge in the edge list.",
                        action="store_true", default=False)

    # Algorithm configuration.
    parser.add_argument("-r", "--rho", dest="restart_probability",
                        help="The restart probability for the vertex-centric PageRank calculation.",
                        type=float, required=False, default=0.1)
    parser.add_argument("-e", "--epsilon", dest="epsilon_threshold",
                        help="The tolerance for calculating vertex-centric PageRank values.",
                        type=float, required=False, default=1.0e-05)
    parser.add_argument("-nt", "--tasks", dest="number_of_tasks",
                        help="The number of parallel tasks to create.",
                        type=int, required=False, default=None)

    args = parser.parse_args()

    input_edge_list_path = args.input_edge_list_path
    output_feature_path = args.output_feature_path
    separator = args.separator
    undirected = args.undirected
    restart_probability = args.restart_probability
    epsilon_threshold = args.epsilon_threshold
    number_of_tasks = args.number_of_tasks
    if number_of_tasks is None:
        number_of_tasks = get_threads_number()

    ####################################################################################################################
    # Perform algorithm.
    ####################################################################################################################
    # Read the adjacency matrix.
    adjacency_matrix,\
        node_to_id = read_adjacency_matrix(file_path=input_edge_list_path,
                                           separator=separator,
                                           undirected=undirected)

    # Make sure we are dealing with a symmetric adjacency matrix.
    adjacency_matrix = spsp.csr_matrix(adjacency_matrix)
    adjacency_matrix = (adjacency_matrix + adjacency_matrix.transpose())/2

    # Perform ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=epsilon_threshold,
                     number_of_threads=number_of_tasks)
    features = spsp.csr_matrix(features)

    # Write features to output file.
    write_features(file_path=output_feature_path,
                   features=features,
                   separator=separator,
                   node_to_id=node_to_id)
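# A small illustrative sketch of the symmetrization step used in main() above: averaging a
# directed adjacency matrix with its transpose yields a symmetric matrix in which each one-way
# edge keeps half its weight in both directions. The toy matrix is an assumption for illustration.
import scipy.sparse as spsp

toy_directed = spsp.csr_matrix([[0.0, 2.0, 0.0],
                                [0.0, 0.0, 4.0],
                                [0.0, 0.0, 0.0]])
toy_symmetric = (toy_directed + toy_directed.transpose()) / 2
# toy_symmetric[0, 1] == toy_symmetric[1, 0] == 1.0, and the 1 -> 2 edge becomes 2.0 in both directions.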
def run_prototype(snow_tweets_folder, prototype_output_folder, restart_probability, number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
        - Reads a set of tweets from a local folder.
        - Forms graphs and text-based vector representation for the users involved.
        - Fetches Twitter lists for influential users.
        - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
        - Extracts graph-based features using the ARCTE algorithm.
        - Performs user classification for the rest of the users.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path, separator='\t')
    number_of_nodes = adjacency_matrix.shape[0]

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path, '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    features = arcte(adjacency_matrix, restart_probability, 0.00001, number_of_threads=number_of_threads)
    features = normalize_columns(features)

    percentages = np.arange(1, 11, dtype=int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    mean_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    F1 = np.zeros((percentages.size, number_of_categories), dtype=np.float64)

    for p in np.arange(percentages.size):
        percentage = percentages[p]

        # Initialize the metric storage arrays to zero.
        macro_precision = np.zeros(trial_num, dtype=np.float64)
        micro_precision = np.zeros(trial_num, dtype=np.float64)
        macro_recall = np.zeros(trial_num, dtype=np.float64)
        micro_recall = np.zeros(trial_num, dtype=np.float64)
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=np.float64)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)

        for trial in np.arange(trial_num):
            train, test = next(folds)

            ############################################################################################################
            # Separate train and test sets.
            ############################################################################################################
            X_train, X_test, y_train, y_test = features[train, :],\
                                               features[test, :],\
                                               user_label_matrix[train, :],\
                                               user_label_matrix[test, :]

            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            ############################################################################################################
            # Train model.
            ############################################################################################################
            # Train classifier.
            model = OneVsRestClassifier(svm.LinearSVC(C=1,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=True),
                                        n_jobs=number_of_threads)
            model.fit(X_train, y_train)

            ############################################################################################################
            # Make predictions.
            ############################################################################################################
            y_pred = model.decision_function(X_test)
            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            ############################################################################################################
            # Calculate measures.
            ############################################################################################################
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_recall[trial] = measures[0]
            micro_recall[trial] = measures[1]
            macro_precision[trial] = measures[2]
            micro_precision[trial] = measures[3]
            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]
            trial_F1[trial, :] = measures[6]

        mean_macro_precision[p] = np.mean(macro_precision)
        std_macro_precision[p] = np.std(macro_precision)
        mean_micro_precision[p] = np.mean(micro_precision)
        std_micro_precision[p] = np.std(micro_precision)
        mean_macro_recall[p] = np.mean(macro_recall)
        std_macro_recall[p] = np.std(macro_recall)
        mean_micro_recall[p] = np.mean(micro_recall)
        std_micro_recall[p] = np.std(micro_recall)
        mean_macro_F1[p] = np.mean(macro_F1)
        std_macro_F1[p] = np.std(macro_F1)
        mean_micro_F1[p] = np.mean(micro_F1)
        std_micro_F1[p] = np.std(micro_F1)
        F1[p, :] = np.mean(trial_F1, axis=0)

    measure_list = [(mean_macro_precision, std_macro_precision),
                    (mean_micro_precision, std_micro_precision),
                    (mean_macro_recall, std_macro_recall),
                    (mean_micro_recall, std_micro_recall),
                    (mean_macro_F1, std_macro_F1),
                    (mean_micro_F1, std_micro_F1),
                    F1]

    write_results(measure_list, os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))
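# A hedged end-to-end sketch: running the prototype experiment on a local data folder.
# The folder paths are illustrative assumptions, not paths from this code base; passing
# number_of_threads=None falls back to get_threads_number(), as in the function above.
if __name__ == "__main__":
    run_prototype(snow_tweets_folder="/data/snow_tweets",
                  prototype_output_folder="/data/prototype_output",
                  restart_probability=0.1,
                  number_of_threads=None)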