Exemple #1
0
def main():
    """Build the graph from the data files and run the reporting helpers.

    The graph returned by ``GraphUtils().make_graph`` is passed, in order,
    to ``print_all_graph_quantitatively``, ``print_quantitatively_plt``
    (with ``./plot/`` as its base directory) and ``get_centrality``.
    """
    edge_list_file = "./data/edge_list.txt"
    class_info_file = "./data/class_info.txt"
    plot_dir = "./plot/"

    # make_graph also returns the train/test node tables; they are not
    # needed for the reports below.
    graph, _train_nodes, _test_nodes = GraphUtils().make_graph(
        edge_list_file, class_info_file)

    GraphUtils().print_all_graph_quantitatively(graph)
    GraphUtils().print_quantitatively_plt(graph, plot_dir)
    GraphUtils().get_centrality(graph)
def main():
    """Embed the graph with random walks, obtain a classifier and predict.

    Steps, in order: build the graph, generate random walks (only when no
    saved embedding file is configured), obtain the embedding, prepare the
    train/test data, run cross-validated model selection and save the
    embedding (only when no saved classifier file is configured), and
    finally hand everything to ``n2v_prediction``.
    """
    edge_list_file = "./data/edge_list.txt"
    class_info_file = "./data/class_info.txt"

    embedding_dir = "./emb/"
    os.makedirs(embedding_dir, exist_ok=True)

    classifier_dir = "./model/"
    os.makedirs(classifier_dir, exist_ok=True)

    # Random-walk and embedding settings.
    p, q = 1, 1
    num_walks = 50
    walk_length = 200
    emb_dimensions = 128
    context_window = 15
    n_workers = 8
    n_iter = 3

    # Saved artefacts to reuse. Set either one to "" to rebuild it instead:
    # an empty embedding path triggers fresh random walks, an empty
    # classifier path triggers model selection and saving of the embedding.
    saved_embedding = "./emb/50_300_f1_0.916_nede2vec.emb"
    saved_classifier = "./model/50_300_128_LogisticRegressionCV_n2v_0.916.bin"

    classifier = None
    classifier_file = saved_classifier
    walks = None

    graph, _train_nodes, _test_nodes = GraphUtils().make_graph(
        edge_list_file, class_info_file)

    # Walks are only generated when there is no embedding file to load.
    if not saved_embedding:
        walks = make_random_walk(graph, p, q, num_walks, walk_length)

    embedding = get_embedding(walks, emb_dimensions, context_window,
                              n_workers, n_iter, saved_embedding)

    (features, targets, train_ids, test_ids, test_features,
     _test_targets) = preprocessing_data(embedding, class_info_file)

    if not saved_classifier:
        classifier, classifier_file, best_f1 = cv_model_selection(
            features, targets, num_walks, walk_length, emb_dimensions,
            classifier_dir)

        # Keep the embedding together with the parameters that produced it.
        emb_save(embedding, num_walks, walk_length, best_f1, embedding_dir)

    n2v_prediction(classifier, features, test_features, train_ids, targets,
                   test_ids, classifier_file, saved_classifier)
def n2v_prediction(best_model,
                   X,
                   test_X,
                   train_val,
                   y,
                   test_val,
                   best_model_file,
                   model_file_load=None):
    """Score a classifier on the training data and save its test predictions.

    Parameters
    ----------
    best_model
        Classifier used when `model_file_load` is empty or ``None``; it
        must provide ``predict``.
    X, y
        Training features and targets. They are only used to print the
        accuracy and F1 score of the chosen model.
    test_X
        Features of the nodes to predict.
    train_val
        Unused; kept so existing callers keep working.
    test_val
        Identifiers of the nodes in `test_X`, in the same order. Each one
        is converted with ``int``.
    best_model_file
        Path of the model file; its base name (without extension) becomes
        part of the result file name.
    model_file_load
        When non-empty, the classifier is loaded from this path with
        ``load_model`` and `best_model` is ignored.

    Notes
    -----
    The predictions are written, sorted by node identifier, through
    ``GraphUtils().save_result``. The target path is built by plain string
    concatenation from ``result_dir``, which is not defined in this
    function and must be provided at module level.
    """
    if model_file_load:
        model = load_model(model_file_load)
    else:
        model = best_model

    model_pred_test = model.predict(test_X)
    model_pred_train = model.predict(X)

    acc = accuracy_score(y, model_pred_train)
    f1 = f1_score(y, model_pred_train)
    print("saving model eval -> acc : {0} f1 : {1}".format(acc, f1))

    # Build the mapping first (so duplicate ids collapse exactly once) and
    # only then sort it by node id.
    predictions = dict(zip(map(int, test_val), model_pred_test))
    predictions = dict(sorted(predictions.items()))

    model_name = os.path.splitext(os.path.basename(best_model_file))[0]
    result_file = result_dir + model_name + "_" + str(f1) + ".txt"
    GraphUtils().save_result(predictions, result_file)
Exemple #4
0
def main():
    """Train the PPNP model, evaluate it and export its predictions.

    The graph is built from the data files, turned into data generators by
    ``GraphUtils().preprocessing`` and ``GraphUtils().preprocessing_predict``,
    and the model created by ``make_ppnp`` is trained, evaluated, run on
    all nodes through ``all_node_test`` and finally passed to
    ``GraphUtils().show_embedding``.
    """
    embedding_file = "./emb/18_100_f1_0.916_nede2vec.emb"
    edge_list_file = "./data/edge_list.txt"
    class_info_file = "./data/class_info.txt"

    graph, train_node, test_node = GraphUtils().make_graph(
        edge_list_file, class_info_file)

    # Only the generators and the training targets are used further down.
    (_train_graph, train_gen, train_targets, val_gen, _val_targets,
     _test_targets, test_gen, all_gen,
     _train_generator) = GraphUtils().preprocessing(graph, train_node,
                                                    embedding_file)

    # NOTE(review): the generator handed to make_ppnp below is the one
    # returned here, not the one returned by preprocessing() above.
    # Confirm that this is intended.
    _predict_graph, test_800_gen, generator = (
        GraphUtils().preprocessing_predict(graph, test_node, embedding_file))

    ppnp_model, embedding_model = make_ppnp(train_targets, generator)
    ppnp_train(ppnp_model, train_gen, val_gen)

    GraphUtils().evaluate(ppnp_model, test_gen)
    all_node_test(ppnp_model, all_gen, train_node, test_800_gen, test_node)
    GraphUtils().show_embedding(embedding_model, all_gen, train_node, 'PPNP')
Exemple #5
0
def main():
    """Run label propagation on the graph and store the labels on its nodes.

    The labels returned by ``asyn_lpa_communities`` are attached to the
    graph as the ``'community'`` node attribute. ``checkpoint_dir`` and
    ``base_dir`` are not defined here and must exist at module level.
    """
    edge_list_file = "./data/edge_list.txt"
    class_info_file = "./data/class_info.txt"

    labelled_graph = GraphUtils().preprocess_label_propagation(
        edge_list_file, class_info_file)

    # Node positions used for plotting are read from a previously saved
    # file rather than being recomputed.
    layout_file = "./obj/spring_layout_pos.pkl"

    # Asynchronous label propagation.
    node_labels = asyn_lpa_communities(
        G=labelled_graph,
        pos_file=layout_file,
        limit_epoch=100,
        chk_dir=checkpoint_dir,
        base_dir=base_dir,
        weight=None,
    )
    nx.set_node_attributes(labelled_graph, node_labels, 'community')
Exemple #6
0
def all_node_test(model, all_gen, train_node, test_800_gen, test_node):
    """Score the model on the labeled nodes and save test-node predictions.

    Model outputs above 0.5 become the class ``'1'``, everything else
    ``'-1'``. Predictions for `all_gen` are compared with
    ``train_node['values']`` to obtain accuracy and F1. Predictions for
    `test_800_gen` are paired with ``test_node['node']`` (converted with
    ``int``) and written to ``./result/gcn_result_<f1>.txt`` through
    ``GraphUtils().save_result``, where ``<f1>`` is the F1 score rounded
    to two decimals. Accuracy and F1 are printed at the end.
    """

    def _as_class_labels(outputs):
        # Threshold the raw model outputs into the two class strings.
        return np.where(outputs > 0.5, '1', '-1')

    labelled_pred = _as_class_labels(model.predict(all_gen))

    # Accuracy is computed on string labels, F1 on their integer form.
    score = accuracy_score(train_node['values'].values.astype(str),
                           labelled_pred.squeeze())
    f1 = f1_score(train_node['values'].values.astype(int),
                  labelled_pred.squeeze().astype(int))

    unlabelled_pred = _as_class_labels(model.predict(test_800_gen))
    node_ids = [int(node) for node in test_node['node'].values]
    dict_800 = dict(zip(node_ids, list(unlabelled_pred.squeeze())))

    result_file = "./result/" + "gcn_result" + "_" + str(round(f1, 2)) + ".txt"
    GraphUtils().save_result(dict_800, result_file)

    print("\nTest train all node: \n \tacc : {0} f1: {1}".format(score, f1))
Exemple #7
0
def asyn_lpa_communities(G,
                         pos_file,
                         limit_epoch,
                         chk_dir,
                         base_dir,
                         weight=None):
    """Spread the known node labels of `G` to its unlabeled nodes.

    This is a semi-supervised variant of the asynchronous label
    propagation algorithm described in [1]_. Every node of `G` must carry
    a ``'label'`` attribute; nodes whose label is ``0`` are treated as
    unlabeled and are the only ones that get relabeled. In each epoch the
    unlabeled nodes are visited in random order, and each one takes the
    label with the largest total weight among its already labeled
    neighbors (ties are broken at random). A new label is visible
    immediately to the nodes visited later in the same epoch, which is
    what makes the algorithm asynchronous.

    The loop stops after the first epoch in which no tie occurred, or once
    more than `limit_epoch` epochs have run. The algorithm is
    probabilistic and the result may vary on different executions.

    Parameters
    ----------
    G : Graph
        Graph whose nodes all have a ``'label'`` attribute.

    pos_file : string or None
        Passed to ``load_obj``; the loaded object is forwarded to
        ``GraphUtils().draw_network`` as the layout. With ``None`` nothing
        is loaded and ``None`` is forwarded instead.
        NOTE(review): confirm that ``draw_network`` accepts ``None``.

    limit_epoch : int
        The loop ends once the number of completed epochs exceeds this.

    chk_dir : string
        Forwarded to ``save_graphml`` whenever a snapshot is recorded.

    base_dir : string
        Forwarded to ``GraphUtils().draw_network``.

    weight : string
        The edge attribute representing the weight of an edge.
        If None, each edge is assumed to have weight one. A higher weight
        means the neighbor's label counts more.

    Returns
    -------
    labels : dict
        Mapping of node to label. A snapshot is recorded at the end of an
        epoch when no node is unlabeled any more and the number of nodes
        processed before the first tie of that epoch exceeds the best
        value seen so far; the snapshot with the highest such value is
        returned. If no snapshot was ever recorded, the labels as they
        stand when the loop ends are returned (they may still contain
        ``0``).

    Notes
    ------
    Edge weight attributes must be numerical.

    Side effects: progress is printed every epoch. When a snapshot is
    recorded, ``save_graphml`` is called, the graph is drawn, and the
    labels of the initially unlabeled nodes are written to
    ``./result/epoch_<epoch>_fcnt_<count>_lp_result.txt``. While unlabeled
    nodes remain, the current state is drawn after every epoch.

    References
    ----------
    .. [1] Raghavan, Usha Nandini, Réka Albert, and Soundar Kumara. "Near
           linear time algorithm to detect community structures in large-scale
           networks." Physical Review E 76.3 (2007): 036106.
    """
    # Always bind `pos`, so drawing never fails with a NameError.
    pos = load_obj(pos_file) if pos_file is not None else None

    labels = {n: G.nodes[n]['label'] for n in G}
    node_with_zero = [n for n, label in labels.items() if label == 0]
    # Set copy for O(1) membership tests when exporting a snapshot.
    initially_unlabeled = set(node_with_zero)

    total_cont = True
    cont = True
    number_loop = 0
    best_epoch = 0
    label_dict = dict()

    while total_cont and cont:
        cont = False

        random.shuffle(node_with_zero)
        check_nodes = dict()
        for node in node_with_zero:
            if len(G[node]) < 1:
                continue

            # Label weights among labeled neighbors. Depending on the
            # visiting order some neighbors already carry this epoch's
            # label and others the previous one.
            label_freq = Counter()
            for v in G[node]:
                if labels[v] != 0:
                    label_freq[labels[v]] += (G.edges[v, node][weight]
                                              if weight else 1)
            if not label_freq:
                # No labeled neighbor yet; try again next epoch.
                continue

            # Choose the label with the highest frequency. If more than
            # one label has the highest frequency choose one randomly.
            max_freq = max(label_freq.values())
            best_labels = [
                label for label, freq in label_freq.items()
                if freq == max_freq
            ]
            labels[node] = random.choice(best_labels)

            # Continue until every node has a single best label. `cont`
            # is cumulative, so every node processed after the first tie
            # of the epoch is recorded as True as well.
            cont = cont or len(best_labels) > 1
            check_nodes[node] = cont

        chk_flg = Counter(check_nodes.values())
        # Labels only change inside the node loop, so counting once here
        # gives the same result as counting after every update, and it is
        # also defined when no node was updated at all.
        end_freq = Counter(labels.values())

        if end_freq[0] == 0:
            # Every node is labeled. Score the epoch by the number of
            # nodes processed before the first tie.
            current_epoch = chk_flg[False]
            if current_epoch > best_epoch:
                plot_g = G.copy()
                nx.set_node_attributes(plot_g, labels, 'community')
                best_epoch = current_epoch
                # Store a copy: `labels` keeps changing in later epochs
                # and must not overwrite this snapshot.
                label_dict[best_epoch] = dict(labels)
                # NOTE(review): this saves G, which does not carry the
                # 'community' attribute set on plot_g. Confirm which
                # graph is meant to be checkpointed.
                save_graphml(G, chk_dir, best_epoch)
                print("result : {0}, epoch {1} false_cnt {2}".format(
                    end_freq, number_loop, current_epoch))
                GraphUtils().draw_network(plot_g, base_dir, pos, best_epoch)

                best_dict = {
                    node: label
                    for node, label in labels.items()
                    if node in initially_unlabeled
                }
                result_file = "./result/" + "epoch_{0}_fcnt_{1}_lp_result.txt".format(
                    number_loop, current_epoch)
                GraphUtils().save_result(best_dict, result_file)
        else:
            # Unlabeled nodes remain: show the intermediate state.
            plot_g = G.copy()
            nx.set_node_attributes(plot_g, labels, 'community')
            print("result : {0}, epoch {1}".format(end_freq, number_loop))
            GraphUtils().draw_network(plot_g, base_dir, pos, best_epoch)

        number_loop += 1
        if number_loop > limit_epoch:
            total_cont = False
        print("cont Counter : {0} epoch {1}".format(chk_flg, number_loop))

    if not label_dict:
        # No snapshot was recorded; return the current labels instead of
        # failing on an empty dict.
        return labels
    return label_dict[max(label_dict)]