import argparse

import scipy.sparse as spsp

# NOTE: the helper imports below are assumed placeholders; adjust the module
# paths to the actual repository layout.
# from <local modules> import arcte, get_threads_number, read_adjacency_matrix, write_features


def main():
    ####################################################################################################################
    # Parse arguments.
    ####################################################################################################################
    parser = argparse.ArgumentParser()

    # File paths.
    parser.add_argument(
        "-i",
        "--input",
        dest="input_edge_list_path",
        help="This is the file path of the graph in edge list format.",
        type=str,
        required=True)
    parser.add_argument(
        "-o",
        "--output",
        dest="output_feature_path",
        help="This is the file path where the extracted feature matrix will be written.",
        type=str,
        required=True)

    # Edge list parsing configuration.
    parser.add_argument(
        "-s",
        "--separator",
        dest="separator",
        help="The character(s) separating the values in the edge list (default is tab: \"\\t\").",
        type=str,
        required=False,
        default="\t")
    # argparse's type=bool is a footgun (any non-empty string, including
    # "False", is truthy), so expose this option as a store_true flag instead.
    parser.add_argument(
        "-u",
        "--undirected",
        dest="undirected",
        help="Also create the reciprocal edge for each edge in the edge list.",
        action="store_true",
        required=False,
        default=False)

    # Algorithm configuration.
    parser.add_argument(
        "-r",
        "--rho",
        dest="restart_probability",
        help="The restart probability for the vertex-centric PageRank calculation.",
        type=float,
        required=False,
        default=0.1)
    parser.add_argument(
        "-e",
        "--epsilon",
        dest="epsilon_threshold",
        help="The tolerance for calculating vertex-centric PageRank values.",
        type=float,
        required=False,
        default=1.0e-05)
    parser.add_argument(
        "-nt",
        "--tasks",
        dest="number_of_tasks",
        help="The number of parallel tasks to create.",
        type=int,
        required=False,
        default=None)

    args = parser.parse_args()

    input_edge_list_path = args.input_edge_list_path
    output_feature_path = args.output_feature_path

    separator = args.separator
    undirected = args.undirected

    restart_probability = args.restart_probability
    epsilon_threshold = args.epsilon_threshold
    number_of_tasks = args.number_of_tasks

    if number_of_tasks is None:
        number_of_tasks = get_threads_number()

    ####################################################################################################################
    # Perform algorithm.
    ####################################################################################################################
    # Read the adjacency matrix.
    adjacency_matrix, node_to_id = read_adjacency_matrix(
        file_path=input_edge_list_path,
        separator=separator,
        undirected=undirected)

    # Make sure we are dealing with a symmetric adjacency matrix.
    adjacency_matrix = spsp.csr_matrix(adjacency_matrix)
    adjacency_matrix = (adjacency_matrix + adjacency_matrix.transpose()) / 2

    # Perform ARCTE.
    features = arcte(adjacency_matrix=adjacency_matrix,
                     rho=restart_probability,
                     epsilon=epsilon_threshold,
                     number_of_threads=number_of_tasks)
    features = spsp.csr_matrix(features)

    # Write features to output file.
    write_features(file_path=output_feature_path,
                   features=features,
                   separator=separator,
                   node_to_id=node_to_id)
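
# Example invocation, assuming the fragment above is saved as a script with
# the usual "if __name__ == '__main__': main()" entry point (the script name
# "arcte_cli.py" here is hypothetical). The input is a separator-delimited
# edge list with one "source<TAB>target" pair per line:
#
#     python arcte_cli.py -i graph.tsv -o features.tsv -u -r 0.1 -e 1e-05 -nt 4
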
        classifier_parameters["fit_intercept"] = True
    elif feature_extraction_method_name == "lapeig":
        classifier_parameters["C"] = 50.0
        classifier_parameters["fit_intercept"] = False
    elif feature_extraction_method_name == "repeig":
        classifier_parameters["C"] = 50.0
        classifier_parameters["fit_intercept"] = False
    else:
        raise RuntimeError("Invalid feature extraction method name: " +
                           feature_extraction_method_name)

    return classifier_parameters
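
# The keys produced above ("C", "fit_intercept") match scikit-learn's linear
# classifiers. A hypothetical hand-off (sketch only; the classifier actually
# used by run_experiment is not shown in this fragment):
#
#     from sklearn.linear_model import LogisticRegression
#
#     params = get_classifier_parameters("lapeig")
#     classifier = LogisticRegression(C=params["C"],
#                                     fit_intercept=params["fit_intercept"])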

PERCENTAGES = np.arange(1, 11)  # [1, 10]
TRIAL_NUM = 10
THREAD_NUM = get_threads_number()

########################################################################################################################
# Experiment execution.
########################################################################################################################

feature_extraction_parameters = get_feature_extraction_parameters(FEATURE_EXTRACTION_METHOD_NAME)
classifier_parameters = get_classifier_parameters(FEATURE_EXTRACTION_METHOD_NAME)

run_experiment(DATASET_NAME,
               DATASET_FOLDER,
               FEATURE_EXTRACTION_METHOD_NAME,
               PERCENTAGES,
               TRIAL_NUM,
               THREAD_NUM,
               feature_extraction_parameters,
               classifier_parameters)


import multiprocessing as mp

import numpy as np
import scipy.sparse as sparse

# NOTE: the helper imports below are assumed placeholders; adjust the module
# paths to the actual repository layout.
# from <local modules> import arcte_worker, get_natural_random_walk_matrix, get_threads_number, parallel_chunks


def arcte(adjacency_matrix, rho, epsilon, number_of_threads=None):
    """
    Extracts local community features for all graph nodes based on the partitioning of node-centric similarity vectors.

    Inputs:  - A in R^(nxn): Adjacency matrix of an undirected network as a SciPy sparse matrix (converted to CSR).
             - rho: Restart probability.
             - epsilon: Approximation threshold.
             - number_of_threads: Number of parallel worker processes (defaults to the detected thread count).

    Outputs: - X in R^(nxC_n): The latent space embedding represented as a SciPy sparse CSR matrix.
    """
    adjacency_matrix = sparse.csr_matrix(adjacency_matrix)
    number_of_nodes = adjacency_matrix.shape[0]

    if number_of_threads is None:
        number_of_threads = get_threads_number()
    if number_of_threads == 1:
        # Calculate natural random walk transition probability matrix.
        rw_transition, out_degree, in_degree = get_natural_random_walk_matrix(adjacency_matrix, make_shared=False)

        # Keep nodes with more than one incident edge, ordered by descending
        # edge count.
        a = adjacency_matrix.copy()
        a.data = np.ones_like(a.data)
        edge_count_vector = np.squeeze(np.asarray(a.sum(axis=0), dtype=np.int64))

        iterate_nodes = np.where(edge_count_vector != 0)[0]
        argsort_indices = np.argsort(edge_count_vector[iterate_nodes])
        iterate_nodes = iterate_nodes[argsort_indices][::-1]
        iterate_nodes = iterate_nodes[np.where(edge_count_vector[iterate_nodes] > 1)[0]]

        # iterate_nodes = np.where(out_degree != 0)[0]
        # argsort_indices = np.argsort(out_degree[iterate_nodes])
        # iterate_nodes = iterate_nodes[argsort_indices][::-1]
        # iterate_nodes = iterate_nodes[np.where(out_degree[iterate_nodes] > 1.0)[0]]

        local_features = arcte_worker(iterate_nodes,
                                      rw_transition.indices,
                                      rw_transition.indptr,
                                      rw_transition.data,
                                      out_degree,
                                      in_degree,
                                      rho,
                                      epsilon)
    else:
        # Calculate natural random walk transition probability matrix.
        rw_transition, out_degree, in_degree = get_natural_random_walk_matrix(adjacency_matrix, make_shared=True)

        # Keep nodes with more than one incident edge, ordered by descending
        # edge count.
        a = adjacency_matrix.copy()
        a.data = np.ones_like(a.data)
        edge_count_vector = np.squeeze(np.asarray(a.sum(axis=0), dtype=np.int64))

        iterate_nodes = np.where(edge_count_vector != 0)[0]
        argsort_indices = np.argsort(edge_count_vector[iterate_nodes])
        iterate_nodes = iterate_nodes[argsort_indices][::-1]
        iterate_nodes = iterate_nodes[np.where(edge_count_vector[iterate_nodes] > 1)[0]]

        # iterate_nodes = np.where(out_degree != 0)[0]
        # argsort_indices = np.argsort(out_degree[iterate_nodes])
        # iterate_nodes = iterate_nodes[argsort_indices][::-1]
        # iterate_nodes = iterate_nodes[np.where(out_degree[iterate_nodes] > 1.0)[0]]

        pool = mp.Pool(number_of_threads)
        node_chunks = list(parallel_chunks(iterate_nodes, number_of_threads))
        results = list()
        for chunk_no in range(len(node_chunks)):
            pool.apply_async(arcte_worker,
                             args=(node_chunks[chunk_no],
                                   rw_transition.indices,
                                   rw_transition.indptr,
                                   rw_transition.data,
                                   out_degree,
                                   in_degree,
                                   rho,
                                   epsilon),
                             callback=results.append)
        pool.close()
        pool.join()
        # The workers return same-shape sparse matrices covering disjoint node
        # subsets, and the callback order is nondeterministic, so sum the
        # partial results instead of stacking them.
        local_features = results[0]
        for additive_features in results[1:]:
            local_features = local_features + additive_features
        local_features = sparse.csr_matrix(local_features)

    # Form the base community feature matrix.
    identity_matrix = sparse.eye(number_of_nodes, number_of_nodes, dtype=np.float64, format="csr")
    # Copy before overwriting .data so the input matrix is not mutated.
    adjacency_matrix_ones = adjacency_matrix.copy()
    adjacency_matrix_ones.data = np.ones_like(adjacency_matrix_ones.data)
    base_community_features = identity_matrix + adjacency_matrix_ones

    # Stack the matrices horizontally to form the feature matrix.
    try:
        features = sparse.hstack([base_community_features, local_features]).tocsr()
    except ValueError:
        print("Failure with horizontal feature stacking.")
        features = base_community_features

    return features
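
# A minimal usage sketch for arcte (hypothetical 4-node path graph; assumes
# numpy and scipy.sparse are imported as above):
#
#     row = np.array([0, 1, 1, 2, 2, 3])
#     col = np.array([1, 0, 2, 1, 3, 2])
#     data = np.ones(6, dtype=np.float64)
#     adjacency = sparse.coo_matrix((data, (row, col)), shape=(4, 4))
#
#     features = arcte(adjacency, rho=0.1, epsilon=1.0e-05, number_of_threads=1)
#     print(features.shape)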
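
# For illustration only: a generic power-iteration form of personalized
# PageRank, showing how the restart probability (rho) and the tolerance
# (epsilon) used throughout these fragments interact. This is NOT the
# repository's arcte_worker, which uses its own vertex-centric approximation.
import numpy as np


def personalized_pagerank_sketch(rw_transition, seed_node, rho, epsilon,
                                 max_iterations=1000):
    # rw_transition: row-stochastic transition matrix in CSR format, e.g. as
    # returned by get_natural_random_walk_matrix above.
    number_of_nodes = rw_transition.shape[0]
    restart_vector = np.zeros(number_of_nodes, dtype=np.float64)
    restart_vector[seed_node] = 1.0
    pagerank = restart_vector.copy()
    for _ in range(max_iterations):
        # Follow the random walk with probability (1 - rho); restart at the
        # seed node with probability rho.
        new_pagerank = (1.0 - rho) * rw_transition.T.dot(pagerank) + rho * restart_vector
        # Stop once the l1 change drops below the tolerance.
        if np.sum(np.abs(new_pagerank - pagerank)) < epsilon:
            return new_pagerank
        pagerank = new_pagerank
    return pagerank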