def integrate_graphs(mention_graph, retweet_graph, node_to_id, restart_probability, number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix as a SciPy sparse matrix.
             - number_of_threads:

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    # Form the adjacency matrix.
    adjacency_matrix = 0.25*mention_graph +\
                       0.25*mention_graph.transpose() +\
                       0.25*retweet_graph +\
                       0.25*retweet_graph.transpose()

    # Keep only the largest weakly-connected component of the graph.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract features
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances
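

# A minimal usage sketch of integrate_graphs above, assuming two tiny interaction graphs.
# The matrices, Twitter ids and parameter values are hypothetical; in the prototype they
# come from the tweet preprocessing step.
def example_integrate_graphs():
    import numpy as np
    import scipy.sparse as spsp

    # Hypothetical 3-user mention and retweet graphs (entry [i, j] counts interactions from user i to user j).
    mention_graph = spsp.csr_matrix(np.array([[0., 2., 0.],
                                              [0., 0., 1.],
                                              [1., 0., 0.]]))
    retweet_graph = spsp.csr_matrix(np.array([[0., 1., 0.],
                                              [0., 0., 0.],
                                              [0., 3., 0.]]))
    node_to_id = {0: 1111, 1: 2222, 2: 3333}  # node index -> Twitter user id

    adjacency_matrix, node_to_id, features, node_importances = integrate_graphs(mention_graph,
                                                                                 retweet_graph,
                                                                                 node_to_id,
                                                                                 restart_probability=0.1,
                                                                                 number_of_threads=2)
    return adjacency_matrix, node_to_id, features, node_importances
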
def feature_extraction(adjacency_matrix,
                       feature_extraction_method_name,
                       thread_num,
                       feature_extraction_parameters):
    start_time = time.time()
    if feature_extraction_method_name == "arcte":
        epsilon = feature_extraction_parameters["epsilon"]
        rho = feature_extraction_parameters["rho"]

        feature_matrix = arcte(adjacency_matrix, rho, epsilon, thread_num)
    elif feature_extraction_method_name == "mroc":
        alpha = feature_extraction_parameters["alpha"]
        feature_matrix = mroc(adjacency_matrix, alpha)
    elif feature_extraction_method_name == "louvain":
        feature_matrix = louvain(adjacency_matrix)
    elif feature_extraction_method_name == "basecomm":
        feature_matrix = base_communities(adjacency_matrix)
    elif feature_extraction_method_name == "lapeig":
        dimensionality = feature_extraction_parameters["dimensionality"]

        feature_matrix = laplacian_eigenmaps(adjacency_matrix, dimensionality)
    elif feature_extraction_method_name == "repeig":
        dimensionality = feature_extraction_parameters["dimensionality"]

        feature_matrix = replicator_eigenmaps(adjacency_matrix, dimensionality)
    else:
        raise RuntimeError("Invalid feature extraction method name: " + feature_extraction_method_name)
    elapsed_time = time.time() - start_time

    return feature_matrix, elapsed_time
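

# A minimal sketch of driving feature_extraction above with a toy adjacency matrix;
# the parameter values are illustrative defaults, not tuned settings.
def example_feature_extraction():
    import numpy as np
    import scipy.sparse as spsp

    # Hypothetical undirected 3-node star graph.
    adjacency_matrix = spsp.csr_matrix(np.array([[0., 1., 1.],
                                                 [1., 0., 0.],
                                                 [1., 0., 0.]]))

    # Each method reads only its own keys from feature_extraction_parameters.
    arcte_features, arcte_time = feature_extraction(adjacency_matrix,
                                                    "arcte",
                                                    thread_num=2,
                                                    feature_extraction_parameters={"rho": 0.1,
                                                                                   "epsilon": 1.0e-05})
    lapeig_features, lapeig_time = feature_extraction(adjacency_matrix,
                                                      "lapeig",
                                                      thread_num=2,
                                                      feature_extraction_parameters={"dimensionality": 2})
    return arcte_features, lapeig_features
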
def integrate_graphs(mention_graph,
                     retweet_graph,
                     user_lemma_matrix,
                     node_to_id,
                     lemma_to_attribute,
                     restart_probability,
                     number_of_threads):
    """
    A bit of post-processing of the graphs to end up with a single aggregate graph.

    Inputs:  - mention_graph: The mention graph as a SciPy sparse matrix.
             - retweet_graph: The retweet graph as a SciPy sparse matrix.
             - user_lemma_matrix: The user lemma vector representation matrix as a SciPy sparse matrix.
             - number_of_threads:

    Outputs: - adjacency_matrix: An aggregate, post-processed view of the graphs.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - features: The graph structure proximity features as calculated by ARCTE in scipy sparse matrix format.
             - node_importances: A vector containing node importance values.
    """
    text_graph = make_text_graph(user_lemma_matrix,
                                 dimensionality=50,
                                 metric="angular",
                                 number_of_estimators=5,
                                 number_of_neighbors=3)

    # Form the adjacency matrix.
    adjacency_matrix,\
    laplacian_matrix = graph_fusion_directed(adjacency_matrix_list=[mention_graph, retweet_graph, text_graph],
                                             weights=[1.0, 1.0, 1.0],
                                             fusion_type="zhou",
                                             laplacian_type="directed")

    # Keep only the largest weakly-connected component of the graph.
    adjacency_matrix, node_to_id, old_node_list = extract_connected_components(adjacency_matrix, "weak", node_to_id)

    # Extract features
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=0.00001,
                     number_of_threads=number_of_threads)

    node_importances = calculate_node_importances(adjacency_matrix)

    return adjacency_matrix, node_to_id, features, node_importances, old_node_list
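

# A minimal call sketch for this fused variant, assuming the mention/retweet graphs, the
# user-lemma matrix and both dictionaries come from the tweet preprocessing step; the
# variable names below are illustrative only.
#
#     adjacency_matrix, node_to_id, features, node_importances, old_node_list = \
#         integrate_graphs(mention_graph,
#                          retweet_graph,
#                          user_lemma_matrix,
#                          node_to_id,
#                          lemma_to_attribute,
#                          restart_probability=0.1,
#                          number_of_threads=4)
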
def main():
    ####################################################################################################################
    # Parse arguments.
    ####################################################################################################################
    parser = argparse.ArgumentParser()

    # File paths.
    parser.add_argument(
        "-i",
        "--input",
        dest="input_edge_list_path",
        help="This is the file path of the graph in edge list format.",
        type=str,
        required=True)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_feature_path",
        help="This is the file path of the graph in edge list format.",
        type=str,
        required=True)

    # Edge list parsing configuration.
    parser.add_argument(
        "-s",
        "--separator",
        dest="separator",
        help=
        "The character(s) separating the values in the edge list (default is tab: \"\t\").",
        type=str,
        required=False,
        default="\t")
    parser.add_argument(
        "-u",
        "--undirected",
        dest="undirected",
        help="Also create the reciprocal edge for each edge in the edge list.",
        action="store_true")

    # Algorithm configuration.
    parser.add_argument(
        "-r",
        "--rho",
        dest="restart_probability",
        help=
        "The restart probability for the vertex-centric PageRank calculation.",
        type=float,
        required=False,
        default=0.1)
    parser.add_argument(
        "-e",
        "--epsilon",
        dest="epsilon_threshold",
        help="The tolerance for calculating vertex-centric PageRank values.",
        type=float,
        required=False,
        default=1.0e-05)
    parser.add_argument("-nt",
                        "--tasks",
                        dest="number_of_tasks",
                        help="The number of parallel tasks to create.",
                        type=int,
                        required=False,
                        default=None)

    args = parser.parse_args()

    input_edge_list_path = args.input_edge_list_path
    output_feature_path = args.output_feature_path

    separator = args.separator
    undirected = args.undirected

    restart_probability = args.restart_probability
    epsilon_threshold = args.epsilon_threshold
    number_of_tasks = args.number_of_tasks

    if number_of_tasks is None:
        number_of_tasks = get_threads_number()

    ####################################################################################################################
    # Perform algorithm.
    ####################################################################################################################
    # Read the adjacency matrix.
    adjacency_matrix,\
    node_to_id = read_adjacency_matrix(file_path=input_edge_list_path,
                                       separator=separator,
                                       undirected=undirected)

    # Make sure we are dealing with a symmetric adjacency matrix.
    adjacency_matrix = spsp.csr_matrix(adjacency_matrix)
    adjacency_matrix = (adjacency_matrix + adjacency_matrix.transpose()) / 2

    # Perform ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=epsilon_threshold,
                     number_of_threads=number_of_tasks)
    features = spsp.csr_matrix(features)

    # Write features to output file.
    write_features(file_path=output_feature_path,
                   features=features,
                   separator=separator,
                   node_to_id=node_to_id)
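

# A hypothetical invocation of the command-line interface defined in main() above, assuming
# the script is saved as arcte_features.py and graph.tsv is a tab-separated edge list; the
# file names are illustrative only.
#
#     python arcte_features.py -i graph.tsv -o features.tsv -r 0.1 -e 1e-5 -nt 4
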
def run_prototype(snow_tweets_folder,
                  prototype_output_folder,
                  restart_probability,
                  number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
           - Reads a set of tweets from a local folder.
           - Forms graphs and text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path,
                                             separator='\t')
    number_of_nodes = adjacency_matrix.shape[0]

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path,
                                                                                            '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    features = arcte(adjacency_matrix,
                     restart_probability,
                     0.00001,
                     number_of_threads=number_of_threads)

    features = normalize_columns(features)

    percentages = np.arange(1, 11, dtype=int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    mean_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_macro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    std_micro_precision = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_macro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    std_micro_recall = np.zeros(percentages.size, dtype=np.float64)
    mean_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_macro_F1 = np.zeros(percentages.size, dtype=np.float64)
    mean_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    std_micro_F1 = np.zeros(percentages.size, dtype=np.float64)
    F1 = np.zeros((percentages.size, number_of_categories), dtype=np.float64)
    for p in np.arange(percentages.size):
        percentage = percentages[p]
        # Initialize the metric storage arrays to zero
        macro_precision = np.zeros(trial_num, dtype=np.float64)
        micro_precision = np.zeros(trial_num, dtype=np.float64)
        macro_recall = np.zeros(trial_num, dtype=np.float64)
        micro_recall = np.zeros(trial_num, dtype=np.float64)
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=np.float64)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)
        for trial in np.arange(trial_num):
            train, test = next(folds)
            ########################################################################################################
            # Separate train and test sets
            ########################################################################################################
            X_train, X_test, y_train, y_test = (features[train, :],
                                                features[test, :],
                                                user_label_matrix[train, :],
                                                user_label_matrix[test, :])

            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            ####################################################################################################
            # Train model
            ####################################################################################################
            # Train classifier
            model = OneVsRestClassifier(svm.LinearSVC(C=1,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=True),
                                        n_jobs=number_of_threads)

            model.fit(X_train, y_train)
            ####################################################################################################
            # Make predictions
            ####################################################################################################
            y_pred = model.decision_function(X_test)

            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            ########################################################################################################
            # Calculate measures
            ########################################################################################################
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_recall[trial] = measures[0]
            micro_recall[trial] = measures[1]

            macro_precision[trial] = measures[2]
            micro_precision[trial] = measures[3]

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

            trial_F1[trial, :] = measures[6]

        mean_macro_precision[p] = np.mean(macro_precision)
        std_macro_precision[p] = np.std(macro_precision)
        mean_micro_precision[p] = np.mean(micro_precision)
        std_micro_precision[p] = np.std(micro_precision)
        mean_macro_recall[p] = np.mean(macro_recall)
        std_macro_recall[p] = np.std(macro_recall)
        mean_micro_recall[p] = np.mean(micro_recall)
        std_micro_recall[p] = np.std(micro_recall)
        mean_macro_F1[p] = np.mean(macro_F1)
        std_macro_F1[p] = np.std(macro_F1)
        mean_micro_F1[p] = np.mean(micro_F1)
        std_micro_F1[p] = np.std(micro_F1)
        F1[p, :] = np.mean(trial_F1, axis=0)

    measure_list = [(mean_macro_precision, std_macro_precision),
                    (mean_micro_precision, std_micro_precision),
                    (mean_macro_recall, std_macro_recall),
                    (mean_micro_recall, std_micro_recall),
                    (mean_macro_F1, std_macro_F1),
                    (mean_micro_F1, std_micro_F1),
                    F1]

    write_results(measure_list,
                  os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))
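

# A minimal sketch of calling run_prototype above; the folder paths are hypothetical and should
# point to a directory containing graph.tsv and user_label_matrix.tsv and to a writable output
# directory, respectively.
#
#     run_prototype(snow_tweets_folder="/data/snow_tweets",
#                   prototype_output_folder="/data/prototype_output",
#                   restart_probability=0.1,
#                   number_of_threads=None)  # None -> use get_threads_number()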