def read_performance_measures(file_path, number=10):
    """
    Parses a tab-separated performance report and extracts F1 statistics.

    Inputs:  - file_path: Path to the tab-separated results file.
             - number: Expected number of scores per measure row.

    Outputs: - Four NumPy float64 arrays of length `number`:
               F1 macro mean/std and F1 micro mean/std.
    """
    row_gen = get_file_row_generator(file_path, "\t")

    macro_mean = np.zeros(number, dtype=np.float64)
    macro_std = np.zeros(number, dtype=np.float64)
    micro_mean = np.zeros(number, dtype=np.float64)
    micro_std = np.zeros(number, dtype=np.float64)

    # Skip ahead: the 18th row read holds the macro-F1 means.
    # NOTE(review): fixed offsets assume a specific report layout — confirm.
    for _ in range(18):
        raw = next(row_gen)
    macro_mean[:] = [float(score) for score in raw]

    # The row immediately after holds the macro-F1 standard deviations.
    raw = next(row_gen)
    macro_std[:] = [float(score) for score in raw]

    # Skip two more rows; the third holds the micro-F1 means.
    for _ in range(3):
        raw = next(row_gen)
    micro_mean[:] = [float(score) for score in raw]

    # The following row holds the micro-F1 standard deviations.
    raw = next(row_gen)
    micro_std[:] = [float(score) for score in raw]

    return macro_mean, macro_std, micro_mean, micro_std
def read_performance_measures(file_path, number=10):
    """
    Parses a tab-separated performance report and extracts F1 statistics.

    Inputs:  - file_path: Path to the tab-separated results file.
             - number: Expected number of scores per measure row.

    Outputs: - Four NumPy float64 arrays of length `number`:
               F1 macro mean/std and F1 micro mean/std.
    """
    gen = get_file_row_generator(file_path, "\t")

    def _advance_and_parse(rows_to_read):
        # Advance `rows_to_read` rows and parse the last one as floats.
        for _ in range(rows_to_read):
            line = next(gen)
        return [float(token) for token in line]

    F1_macro_mean = np.zeros(number, dtype=np.float64)
    F1_macro_std = np.zeros(number, dtype=np.float64)
    F1_micro_mean = np.zeros(number, dtype=np.float64)
    F1_micro_std = np.zeros(number, dtype=np.float64)

    # Report layout: 17 header rows, then macro mean, macro std,
    # two separator rows, micro mean, micro std.
    # NOTE(review): fixed offsets assume a specific report layout — confirm.
    F1_macro_mean[:] = _advance_and_parse(18)
    F1_macro_std[:] = _advance_and_parse(1)
    F1_micro_mean[:] = _advance_and_parse(3)
    F1_micro_std[:] = _advance_and_parse(1)

    return F1_macro_mean, F1_macro_std, F1_micro_mean, F1_micro_std
def read_graph_raw_data_file(filepath, number_of_nodes):
    """
    Reads a space-separated edge list (Matrix-Market-like layout) and returns
    the adjacency matrix in SciPy Sparse COOrdinate format.

    Inputs:  - filepath: The path of the raw edge-list file.
             - number_of_nodes: The number of nodes; used as the matrix shape.

    Outputs: - matrix: The adjacency matrix in COO format with duplicate
               entries summed.
    """
    file_row_gen = get_file_row_generator(filepath, " ")

    # Skip "%"-prefixed comment lines; the first data row is metadata whose
    # third field is the declared number of edges.
    file_row = next(file_row_gen)
    while file_row[0][0] == "%":
        file_row = next(file_row_gen)

    number_of_edges = int(file_row[2])

    row = np.empty(number_of_edges, dtype=np.int32)
    col = np.empty(number_of_edges, dtype=np.int32)
    data = np.empty(number_of_edges, dtype=np.float64)

    edge_counter = 0
    for file_row in file_row_gen:
        if file_row[0] == "":
            break
        row[edge_counter] = int(file_row[0])
        col[edge_counter] = int(file_row[1])
        data[edge_counter] = float(file_row[2])

        edge_counter += 1

    # BUGFIX: keep only the entries actually filled. np.empty leaves trailing
    # slots uninitialized, so if the file holds fewer edges than declared the
    # original code fed garbage values into the matrix.
    row = row[:edge_counter] - 1  # convert 1-based node ids to 0-based indices
    col = col[:edge_counter] - 1
    data = data[:edge_counter]

    matrix = spsp.coo_matrix((data, (row, col)), shape=(number_of_nodes, number_of_nodes))
    # Round-trip through CSR to sum duplicate entries, then return as COO.
    matrix = spsp.coo_matrix(spsp.csr_matrix(matrix))

    return matrix
# Beispiel #4
# 0
def read_node_label_matrix(file_path, separator, numbering="matlab"):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: * "matlab"
                                                 * "c"

    Outputs: - node_label_matrix: The node-label associations in CSR format.
             - number_of_categories: The number of categories/classes the nodes may belong to.
             - labelled_node_indices: A NumPy array containing the labelled node indices.

    Raises:  - RuntimeError: On an invalid numbering style.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Header row: field 1 holds the row count, field 3 the category count.
    file_row = next(file_row_generator)
    # BUGFIX: cast to int — the raw string field would make the shape
    # argument of coo_matrix below invalid.
    number_of_rows = int(file_row[1])
    number_of_categories = int(file_row[3])

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Populate the arrays: one (node, label) pair per file row.
    for file_row in file_row_generator:
        node = np.int64(file_row[0])
        label = np.int64(file_row[1])

        # Add label
        append_row(node)
        append_col(label)

    labelled_node_indices = np.array(list(set(row)))

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)

    # "matlab" input is 1-based and must be shifted to 0-based indices.
    if numbering == "matlab":
        row -= 1
        col -= 1
        labelled_node_indices -= 1
    elif numbering == "c":
        pass
    else:
        print("Invalid numbering style.")
        raise RuntimeError

    # Form sparse node-label matrix and convert to CSR for fast row slicing.
    node_label_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_categories))
    node_label_matrix = node_label_matrix.tocsr()

    return node_label_matrix, number_of_categories, labelled_node_indices
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Builds a node-label matrix from a community file and writes it to disk.

    Inputs:  - dataset: Dataset name; locates <dataset>.ids and
               <dataset>.communities inside raw_data_folder.
             - raw_data_folder: Folder holding the raw input files.
             - preprocessed_data_folder: Folder where node_label_matrix.tsv
               is written (side effect; nothing is returned).
    """
    node_file_path = raw_data_folder + "/" + dataset + ".ids"

    file_row_gen = get_file_row_generator(node_file_path, " ")

    user_twitter_id_list = list()

    # Read twitter ids until the first blank row.
    for file_row in file_row_gen:
        if file_row[0] == "":
            break
        else:
            user_twitter_id_list.append(int(file_row[0]))

    # Map each twitter id to a contiguous 0-based node index (file order).
    id_to_node = dict(zip(user_twitter_id_list, range(len(user_twitter_id_list))))
    user_twitter_id_list = set(user_twitter_id_list)

    core_file_path = raw_data_folder + "/" + dataset + ".communities"

    file_row_gen = get_file_row_generator(core_file_path, ",")

    row = list()
    col = list()

    # Each file row is one community; its index becomes the label column.
    category_counter = 0
    for file_row in file_row_gen:
        id_list = list()
        # The first field carries a prefix token before the member id.
        # NOTE(review): assumes format "<token> <id>" — confirm against data.
        first_id = file_row[0].strip().split(" ")
        first_id = id_to_node[int(first_id[1])]
        id_list.append(first_id)
        # Remaining fields are plain member twitter ids.
        # FIX: renamed loop variable from `id` to avoid shadowing the builtin.
        for member_id in file_row[1:]:
            id_list.append(id_to_node[int(member_id)])

        row.extend(id_list)
        col.extend(category_counter*np.ones(len(id_list), dtype=np.int32))

        category_counter += 1

    row = np.array(row, dtype=np.int32)
    col = np.array(col, dtype=np.int32)
    data = np.ones_like(row, dtype=np.int8)

    node_label_matrix = spsp.coo_matrix((data, (row, col)), shape=(len(user_twitter_id_list), category_counter))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t", directed=True, numbering="matlab")
def read_node_label_matrix(file_path, separator, numbering="matlab"):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: * "matlab"
                                                 * "c"

    Outputs: - node_label_matrix: The node-label associations in CSR format.
             - number_of_categories: The number of categories/classes the nodes may belong to.
             - labelled_node_indices: A NumPy array containing the labelled node indices.

    Raises:  - RuntimeError: On an invalid numbering style.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Header row: field 1 holds the row count, field 3 the category count.
    file_row = next(file_row_generator)
    # BUGFIX: cast to int — the raw string field would make the shape
    # argument of coo_matrix below invalid.
    number_of_rows = int(file_row[1])
    number_of_categories = int(file_row[3])

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    append_row = row.append
    append_col = col.append

    # Populate the arrays: one (node, label) pair per file row.
    for file_row in file_row_generator:
        node = np.int64(file_row[0])
        label = np.int64(file_row[1])

        # Add label
        append_row(node)
        append_col(label)

    labelled_node_indices = np.array(list(set(row)))

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.ones_like(row, dtype=np.float64)

    # "matlab" input is 1-based and must be shifted to 0-based indices.
    if numbering == "matlab":
        row -= 1
        col -= 1
        labelled_node_indices -= 1
    elif numbering == "c":
        pass
    else:
        print("Invalid numbering style.")
        raise RuntimeError

    # Form sparse node-label matrix and convert to CSR for fast row slicing.
    node_label_matrix = spsp.coo_matrix((data, (row, col)), shape=(number_of_rows, number_of_categories))
    node_label_matrix = node_label_matrix.tocsr()

    return node_label_matrix, number_of_categories, labelled_node_indices
def make_labelling(dataset, raw_data_folder, preprocessed_data_folder):
    """
    Builds a binary core/non-core node-label matrix and writes it to disk.

    Inputs:  - dataset: Dataset name; locates <dataset>.ids and <dataset>.core
               inside raw_data_folder.
             - raw_data_folder: Folder holding the raw input files.
             - preprocessed_data_folder: Folder where node_label_matrix.tsv
               is written (side effect; nothing is returned).
    """
    node_file_path = raw_data_folder + "/" + dataset + ".ids"

    file_row_gen = get_file_row_generator(node_file_path, " ")

    user_twitter_id_list = list()

    # Read twitter ids until the first blank row.
    for file_row in file_row_gen:
        if file_row[0] == "":
            break
        else:
            user_twitter_id_list.append(int(file_row[0]))

    # Map each twitter id to a contiguous 0-based node index (file order).
    id_to_node = dict(zip(user_twitter_id_list, range(len(user_twitter_id_list))))
    user_twitter_id_list = set(user_twitter_id_list)

    core_file_path = raw_data_folder + "/" + dataset + ".core"

    file_row_gen = get_file_row_generator(core_file_path, " ")

    core_user_twitter_id_list = list()

    for file_row in file_row_gen:
        if file_row[0] == "":
            break
        else:
            core_user_twitter_id_list.append(int(file_row[0]))

    # Keep only core ids that are actually known nodes.
    core_user_twitter_id_list = user_twitter_id_list.intersection(core_user_twitter_id_list)

    non_core_user_twitter_id_set = user_twitter_id_list.difference(core_user_twitter_id_list)

    # FIX: renamed comprehension variables from `id` to avoid shadowing the builtin.
    # Label column 1 marks core users, column 0 marks non-core users.
    row = [id_to_node[twitter_id] for twitter_id in core_user_twitter_id_list] \
        + [id_to_node[twitter_id] for twitter_id in non_core_user_twitter_id_set]
    row = np.array(row, dtype=np.int32)
    col = [1 for _ in core_user_twitter_id_list] + [0 for _ in non_core_user_twitter_id_set]
    col = np.array(col, dtype=np.int32)
    data = np.ones(len(user_twitter_id_list), dtype=np.int8)

    node_label_matrix = spsp.coo_matrix((data, (row, col)), shape=(len(user_twitter_id_list), 2))

    target_path = preprocessed_data_folder + "/" + "node_label_matrix" + ".tsv"
    scipy_sparse_to_csv(target_path, node_label_matrix, separator="\t", directed=True, numbering="matlab")
def read_node_label_matrix(file_path, separator, number_of_nodes):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - number_of_nodes: The number of nodes of the full graph. It is possible that not all nodes are labelled.

    Outputs: - node_label_matrix: The node-label associations in CSR format.
             - number_of_categories: The number of categories/classes the nodes may belong to.
             - labelled_node_indices: A NumPy array containing the labelled node indices.
    """
    row_generator = get_file_row_generator(file_path, separator)

    # Collect one (node, label) pair per file row.
    nodes = []
    labels = []
    for record in row_generator:
        nodes.append(np.int64(record[0]))
        labels.append(np.int64(record[1]))

    # Assumes no missing labels; nodes may be missing.
    number_of_categories = len(set(labels))
    labelled_node_indices = np.array(list(set(nodes)))

    node_array = np.array(nodes, dtype=np.int64)
    label_array = np.array(labels, dtype=np.int64)
    weights = np.ones_like(node_array, dtype=np.float64)

    # File uses 1-based counting; shift everything to 0-based indices.
    node_array -= 1
    label_array -= 1
    labelled_node_indices -= 1

    # Build the sparse matrix and convert to CSR for fast row slicing.
    node_label_matrix = sparse.coo_matrix(
        (weights, (node_array, label_array)),
        shape=(number_of_nodes, number_of_categories)).tocsr()

    return node_label_matrix, number_of_categories, labelled_node_indices
def get_number_of_nodes(raw_data_folder, dataset):
    """
    Counts the id rows in <dataset>.ids, stopping at the first blank row.

    Inputs:  - raw_data_folder: Folder holding the raw input files.
             - dataset: Dataset name used to form the .ids file path.

    Outputs: - The number of nodes (one per non-blank row).
    """
    ids_path = f"{raw_data_folder}/{dataset}.ids"

    count = 0
    for record in get_file_row_generator(ids_path, " "):
        if record[0] == "":
            break
        count += 1

    return count
def read_adjacency_matrix(file_path, separator):
    """
    Reads an undirected edge list in csv format and returns the adjacency
    matrix in SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")

    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse COOrdinate format.
    """
    edges_src = []
    edges_dst = []

    # Store each edge in both directions since the network is undirected.
    for record in get_file_row_generator(file_path, separator):
        u = np.int64(record[0])
        v = np.int64(record[1])
        edges_src.extend((u, v))
        edges_dst.extend((v, u))

    src = np.array(edges_src, dtype=np.int64)
    dst = np.array(edges_dst, dtype=np.int64)
    weights = np.ones_like(src, dtype=np.float64)

    # Assumes node ids are 1-based with no missing ids at the end.
    number_of_nodes = np.max(src)

    # Shift 1-based ids to 0-based indices.
    src -= 1
    dst -= 1

    adjacency_matrix = sparse.coo_matrix(
        (weights, (src, dst)), shape=(number_of_nodes, number_of_nodes))

    return adjacency_matrix
def read_node_label_matrix(file_path, separator, number_of_nodes):
    """
    Reads node-label pairs in csv format and returns a node-label matrix.

    Inputs:  - file_path: The path where the node-label matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - number_of_nodes: The number of nodes of the full graph. It is possible that not all nodes are labelled.

    Outputs: - node_label_matrix: The node-label associations in CSR format.
             - number_of_categories: The number of categories/classes the nodes may belong to.
             - labelled_node_indices: A NumPy array containing the labelled node indices.
    """
    # One (node, label) pair per file row.
    pairs = [(np.int64(record[0]), np.int64(record[1]))
             for record in get_file_row_generator(file_path, separator)]

    node_list = [node for node, _ in pairs]
    label_list = [label for _, label in pairs]

    # Assumes no missing labels; nodes may be missing.
    number_of_categories = len(set(label_list))
    labelled_node_indices = np.array(list(set(node_list)))

    # Shift the file's 1-based counting to 0-based array indices.
    row_idx = np.array(node_list, dtype=np.int64) - 1
    col_idx = np.array(label_list, dtype=np.int64) - 1
    labelled_node_indices -= 1
    weights = np.ones(row_idx.shape[0], dtype=np.float64)

    # Build the sparse matrix and convert to CSR for fast row slicing.
    node_label_matrix = sparse.coo_matrix(
        (weights, (row_idx, col_idx)),
        shape=(number_of_nodes, number_of_categories)).tocsr()

    return node_label_matrix, number_of_categories, labelled_node_indices
def read_adjacency_matrix(file_path, separator):
    """
    Reads an undirected edge list in csv format and returns the adjacency
    matrix in SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")

    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse COOrdinate format.
    """
    sources = []
    targets = []

    for record in get_file_row_generator(file_path, separator):
        u, v = np.int64(record[0]), np.int64(record[1])
        # Symmetrise: record both edge directions.
        sources += [u, v]
        targets += [v, u]

    sources = np.array(sources, dtype=np.int64)
    targets = np.array(targets, dtype=np.int64)
    weights = np.ones_like(sources, dtype=np.float64)

    # Assumes node ids are 1-based with no missing ids at the end.
    number_of_nodes = np.max(sources)

    # Shift 1-based ids to 0-based indices in the constructor call.
    adjacency_matrix = sparse.coo_matrix(
        (weights, (sources - 1, targets - 1)),
        shape=(number_of_nodes, number_of_nodes))

    return adjacency_matrix
def get_number_of_nodes(raw_data_folder, dataset):
    """
    Counts the nodes in <dataset>.ids and builds an id-to-index map.

    Inputs:  - raw_data_folder: Folder holding the raw input files.
             - dataset: Dataset name used to form the .ids file path.

    Outputs: - The number of nodes (one per non-blank row).
             - A dict mapping each twitter id to its 0-based file-order index.
    """
    gen = get_file_row_generator(raw_data_folder + "/" + dataset + ".ids", " ")

    ids = []
    for record in gen:
        if record[0] == "":
            break
        ids.append(int(record[0]))

    id_to_node = {twitter_id: index for index, twitter_id in enumerate(ids)}

    return len(ids), id_to_node
# Beispiel #14
# 0
def read_adjacency_matrix(file_path, separator, numbering="matlab"):
    """
    Reads an edge list in csv format and returns the adjacency matrix in SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: * "matlab"
                                                 * "c"

    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse COOrdinate format.

    Raises:  - RuntimeError: On invalid metadata or numbering style.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Metadata header row: fields 1 and 3 hold the matrix dimensions,
    # field 7 the directedness flag as the string "True"/"False".
    file_row = next(file_row_generator)
    # BUGFIX: cast the dimensions to int — the raw string fields would make
    # the shape argument of coo_matrix below invalid.
    number_of_rows = int(file_row[1])
    number_of_columns = int(file_row[3])
    directed = file_row[7]
    if directed == "True":
        directed = True
    elif directed == "False":
        directed = False
    else:
        print("Invalid metadata.")
        raise RuntimeError

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    data = list()
    append_row = row.append
    append_col = col.append
    append_data = data.append

    # Read all file rows: source, target, weight.
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        edge_weight = np.float64(file_row[2])

        # Add edge
        append_row(source_node)
        append_col(target_node)
        append_data(edge_weight)

        # For an undirected network also add the reciprocal edge,
        # skipping self-loops to avoid double-counting them.
        if not directed:
            if source_node != target_node:
                append_row(target_node)
                append_col(source_node)
                append_data(edge_weight)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    # "matlab" input is 1-based and must be shifted to 0-based indices.
    if numbering == "matlab":
        row -= 1
        col -= 1
    elif numbering == "c":
        pass
    else:
        print("Invalid numbering style.")
        raise RuntimeError

    # Form sparse adjacency matrix
    adjacency_matrix = spsp.coo_matrix(
        (data, (row, col)), shape=(number_of_rows, number_of_columns))

    return adjacency_matrix
def read_adjacency_matrix(file_path, separator, numbering="matlab"):
    """
    Reads an edge list in csv format and returns the adjacency matrix in SciPy Sparse COOrdinate format.

    Inputs:  - file_path: The path where the adjacency matrix is stored.
             - separator: The delimiter among values (e.g. ",", "\t", " ")
             - numbering: Array numbering style: * "matlab"
                                                 * "c"

    Outputs: - adjacency_matrix: The adjacency matrix in SciPy Sparse COOrdinate format.

    Raises:  - RuntimeError: On invalid metadata or numbering style.
    """
    # Open file
    file_row_generator = get_file_row_generator(file_path, separator)

    # Metadata header row: fields 1 and 3 hold the matrix dimensions,
    # field 7 the directedness flag as the string "True"/"False".
    file_row = next(file_row_generator)
    # BUGFIX: cast the dimensions to int — the raw string fields would make
    # the shape argument of coo_matrix below invalid.
    number_of_rows = int(file_row[1])
    number_of_columns = int(file_row[3])
    directed = file_row[7]
    if directed == "True":
        directed = True
    elif directed == "False":
        directed = False
    else:
        print("Invalid metadata.")
        raise RuntimeError

    # Initialize lists for row and column sparse matrix arguments
    row = list()
    col = list()
    data = list()
    append_row = row.append
    append_col = col.append
    append_data = data.append

    # Read all file rows: source, target, weight.
    for file_row in file_row_generator:
        source_node = np.int64(file_row[0])
        target_node = np.int64(file_row[1])
        edge_weight = np.float64(file_row[2])

        # Add edge
        append_row(source_node)
        append_col(target_node)
        append_data(edge_weight)

        # For an undirected network also add the reciprocal edge,
        # skipping self-loops to avoid double-counting them.
        if not directed:
            if source_node != target_node:
                append_row(target_node)
                append_col(source_node)
                append_data(edge_weight)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    # "matlab" input is 1-based and must be shifted to 0-based indices.
    if numbering == "matlab":
        row -= 1
        col -= 1
    elif numbering == "c":
        pass
    else:
        print("Invalid numbering style.")
        raise RuntimeError

    # Form sparse adjacency matrix
    adjacency_matrix = spsp.coo_matrix((data, (row, col)), shape=(number_of_rows, number_of_columns))

    return adjacency_matrix
def get_folds_generator(node_label_matrix,
                        labelled_node_indices,
                        number_of_categories,
                        dataset_memory_folder,
                        percentage,
                        number_of_folds=10):
    """
    Read or form and store the seed nodes for training and testing.

    Inputs: - node_label_matrix: The node-label ground truth in a SciPy sparse matrix format.
            - labelled_node_indices: A NumPy array containing the labelled node indices.
            - number_of_categories: The number of categories/classes in the learning.
            - dataset_memory_folder: The folder where the fold cache file is stored.
            - percentage: The percentage of labelled samples that will be used for training.
            - number_of_folds: The number of train/test folds to produce (default 10).

    Output: - folds: A generator yielding (train, test) NumPy index-array pairs,
              one per fold.
    """
    number_of_labeled_nodes = labelled_node_indices.size
    # Round up so at least one node ends up in the training set.
    training_set_size = int(np.ceil(percentage*number_of_labeled_nodes/100))

    ####################################################################################################################
    # Read or generate folds
    ####################################################################################################################
    # Folds are cached per training percentage; regenerate only on a cache miss.
    fold_file_path = dataset_memory_folder + "/folds/" + str(percentage) + "_folds.txt"
    train_list = list()
    test_list = list()
    if not os.path.exists(fold_file_path):
        # Cache miss: generate the folds and persist them as they are produced.
        with open(fold_file_path, "w") as fp:
            for trial in np.arange(number_of_folds):
                # valid_train_test splits positions within the labelled subset;
                # trial is presumably used as a seed/trial index — TODO confirm.
                train, test = valid_train_test(node_label_matrix[labelled_node_indices, :],
                                               training_set_size,
                                               number_of_categories,
                                               trial)
                # Map positions back to actual node indices.
                train = labelled_node_indices[train]
                test = labelled_node_indices[test]

                # Write test nodes. NOTE: the file layout is two tab-separated
                # rows per trial, TEST first then TRAIN; the read branch below
                # must mirror this exact order.
                row = [str(node) for node in test]
                row = "\t".join(row) + "\n"
                fp.write(row)

                # Write train nodes
                row = [str(node) for node in train]
                row = "\t".join(row) + "\n"
                fp.write(row)

                train_list.append(train)
                test_list.append(test)
    else:
        # Cache hit: replay the stored folds in the same order they were written.
        file_row_gen = get_file_row_generator(fold_file_path, "\t")

        for trial in np.arange(number_of_folds):
            # Read test nodes (written first for each trial).
            test = next(file_row_gen)
            test = [int(node) for node in test]
            test = np.array(test)

            # Read train nodes (written second for each trial).
            train = next(file_row_gen)
            train = [int(node) for node in train]
            train = np.array(train)

            train_list.append(train)
            test_list.append(test)

    # Lazily yield the (train, test) pairs in fold order.
    folds = ((train, test) for train, test in zip(train_list, test_list))
    return folds