def print_results(row_labels, prediction, labels, graph: gc.Graph):
    """Print per-node predictions, grouped by the actual label."""
    print("connected")
    for i in range(len(labels)):
        if labels[i] == 1:
            print(row_labels[i], "predicted:", prediction[i], "actual:",
                  labels[i], "correct:", prediction[i] == labels[i],
                  "degree of node:", graph.degree(row_labels[i]))
    print("not connected")
    for i in range(len(labels)):
        if labels[i] == 0:
            print(row_labels[i], "predicted:", prediction[i], "actual:",
                  labels[i], "correct:", prediction[i] == labels[i],
                  "degree of node:", graph.degree(row_labels[i]))
def test_per_node(nodes_to_train_on: List[int], graph: gc.Graph,
                  save_info: sl.MemoryAccess, feature_type: ft.FeatureType,
                  num_of_bins: int, limit_num_training_graphs: Optional[int],
                  sampling_strategy: Optional, c, removed_node: int):
    if nodes_to_train_on is None:
        raise ValueError(
            "Training node list is not given, but it is required")
    tr_node_list = nodes_to_train_on[removed_node]
    train_data = save_info.load_list_of_training_data(
        removed_node=removed_node,
        graph=graph.delete_node(removed_node),
        feature_type=feature_type,
        num_of_bins=num_of_bins,
        limit_num=limit_num_training_graphs,
        tr_node_list=tr_node_list)

    utils.assert_df_no_nan(
        train_data, text=f'Training data for removed node {removed_node}')

    test_data = save_info.load_test_data(removed_node=removed_node,
                                         feature_type=feature_type,
                                         num_of_bins=num_of_bins)
    utils.assert_df_no_nan(test_data,
                           text=f'Test data for removed node {removed_node}')

    tr_labels, tr_predicted, tr_probabilities, te_labels, te_predicted, te_probabilities = \
        _train(c, train_data=train_data, test_data=test_data, sampling_strategy=sampling_strategy)

    train_results = evaluate(tr_labels, tr_predicted, tr_probabilities)
    test_results = evaluate(te_labels, te_predicted, te_probabilities)

    # add some additional information
    test_results["degree"] = graph.degree(removed_node)

    test_results["avg_neighbour_degree"] = graph.average_neighbour_degree(
        removed_node)

    test_results["avg dist to pos pred"] = \
        calculate_avg_distance_to_positive_predicted_nodes(graph=graph, removed_node=removed_node,
                                                           labels=test_data.index.values,
                                                           predicted=te_predicted)

    test_results["num training features"] = len(train_data)
    test_results["num test features"] = len(test_data)

    test_results["train false negative"] = train_results["false negative"]
    test_results["train true positive"] = train_results["true positive"]
    test_results["train accuracy"] = train_results["accuracy"]
    test_results["train precision"] = train_results["precision"]
    test_results["train recall"] = train_results["recall"]
    test_results["train auc"] = train_results["auc"]

    return pd.Series(test_results), removed_node
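For context, a minimal sketch of collecting the `(pd.Series, removed_node)` pairs that `test_per_node` returns into one summary table; the metric values below are made up for illustration.

import pandas as pd

# Assume test_per_node was run for removed nodes 0..2 and returned these pairs.
per_node_results = [
    (pd.Series({"accuracy": 0.90, "degree": 4}), 0),
    (pd.Series({"accuracy": 0.80, "degree": 2}), 1),
    (pd.Series({"accuracy": 0.95, "degree": 7}), 2),
]
summary = pd.DataFrame({node: series for series, node in per_node_results}).T
print(summary)  # one row of evaluation metrics per removed node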
Example 3
def compute_degrees(graph: gc.Graph, labels: List[int]):
    """Return the degree of each node in `labels`, min-max scaled to [0, 1]."""
    degrees = pd.DataFrame(0, index=labels, columns=["degree"])
    for label in labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees to [0, 1]
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(degrees.astype(float))

    return pd.DataFrame(x_scaled, index=labels, columns=["degree"])
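The same min-max normalisation on a concrete graph, using networkx in place of `gc.Graph` (an assumption, since the wrapper's API is not shown here):

import networkx as nx
import pandas as pd
from sklearn import preprocessing

G = nx.path_graph(5)  # degrees: 1, 2, 2, 2, 1
labels = list(G.nodes)
degrees = pd.DataFrame({"degree": [G.degree(n) for n in labels]}, index=labels)

scaler = preprocessing.MinMaxScaler()  # maps the min degree to 0, the max to 1
scaled = scaler.fit_transform(degrees.astype(float))
print(pd.DataFrame(scaled, index=labels, columns=["degree"]))
# endpoints (degree 1) -> 0.0, interior nodes (degree 2) -> 1.0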
Example 4
def __create_degree_column_for_feature(graph: gc.Graph, row_labels: List[int]):
    degrees = pd.DataFrame(0, index=row_labels, columns=["degree"])
    for label in row_labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees by the maximum degree
    degrees = degrees / degrees["degree"].max()

    return pd.DataFrame(degrees, index=row_labels, columns=["degree"])
Example 5
def get_sample_with_degree(graph: gc.Graph, node_list: List[int], degree: int, quantity: int):
    degrees = np.array([graph.degree(n) for n in node_list])
    samples_to_find = quantity

    # start with nodes that have exactly the requested degree
    candidates = np.where(degrees == degree)[0]
    if len(candidates) > samples_to_find:
        candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
    sample = candidates
    samples_to_find -= len(candidates)

    # widen the degree range step by step until enough samples are found
    offset = 1
    while samples_to_find > 0:
        if degree + offset > degrees.max() and degree - offset < degrees.min():
            raise ValueError(f'Not enough training samples: required {quantity}, '
                             f'got {quantity - samples_to_find}')
        candidates = np.concatenate([np.where(degrees == (degree + offset))[0],
                                     np.where(degrees == (degree - offset))[0]])
        if len(candidates) > samples_to_find:
            candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
        sample = np.concatenate([sample, candidates])
        samples_to_find -= len(candidates)
        offset += 1

    return [node_list[s] for s in sample]
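A minimal check of the widening behaviour, using a hypothetical stub graph (only `.degree` is required):

# Hypothetical stub standing in for gc.Graph.
class _DegreeStub:
    def __init__(self, degrees):
        self._degrees = degrees

    def degree(self, node):
        return self._degrees[node]

g = _DegreeStub({0: 2, 1: 3, 2: 3, 3: 4, 4: 5})
print(get_sample_with_degree(g, node_list=[0, 1, 2, 3, 4], degree=3, quantity=4))
# [1, 2, 3, 0]: the two exact matches (degree 3), then degree 4 and 2 at offset 1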
def _get_avg_degree_of_neighbours(graph: gc.Graph, node: int):
    # mean degree over the direct neighbours of `node`
    neighbour_degrees = [graph.degree(n) for n in graph.neighbours(node)]
    return sum(neighbour_degrees) / len(neighbour_degrees)
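On a real graph this helper agrees with networkx's built-in, shown here on a path graph (swapping networkx in for `gc.Graph`):

import networkx as nx

G = nx.path_graph(4)  # edges: 0-1, 1-2, 2-3
neighbour_degrees = [G.degree(n) for n in G.neighbors(1)]
print(sum(neighbour_degrees) / len(neighbour_degrees))  # (1 + 2) / 2 = 1.5
print(nx.average_neighbor_degree(G)[1])                 # same value: 1.5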