Example #1
def predict():
    string = str('test')
    hist_pred_n = string + "hist_pred.jpeg"

    # Loading from .pkl files
    pkl_hnd = store(app.config['static_path'], app.root_path)
    clf = pkl_hnd.load('model')
    n_labels = pkl_hnd.load('n_labels')
    enc = pkl_hnd.load('enc')

    # Feature extraction
    data = utils.file_parser_test(
        os.path.join(app.config['upload_path'], "test.txt"))
    features = utils.feature_extractor(data['text'], 5000)

    # Preprocessing features
    data_x = utils.preprocess_features(features, 2500)

    # Predicting
    pr = predict_model(data_x)
    pred_enc = pr.predict_model(clf)

    # Decoding the encoded prediction
    pred = utils.label_encoder(pred_enc, True, enc)
    # Saving predicted value and data into .csv file
    pkl_hnd.save_pred(data_x, pred)

    # Plotting histogram of prediction
    pkl_hnd.plot_hist(pred, hist_pred_n)

    return render_template(
        "predict_result.html",
        img_hist_pred=url_for(app.config['static_path'], filename=hist_pred_n),
    )
Example #2
def process_data(self):
    data = load_data('cora')
    adj, feas = data[:2]
    self.adj = adj.todense()
    self.normed_adj = preprocess_adj(adj)
    self.feas = preprocess_features(feas, False)
    self.y_train, self.y_val, self.y_test = data[2:5]
    self.train_mask, self.val_mask, self.test_mask = data[5:]
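preprocess_features itself is not shown in these snippets. In the widely used Kipf & Welling GCN utilities, a helper of this name row-normalizes the feature matrix (a sparse variant additionally converts the result to a (coords, values, shape) tuple); the examples here appear to follow that convention, but the exact implementation may differ. A minimal sketch of the row-normalization step, under that assumption:

import numpy as np
import scipy.sparse as sp

def preprocess_features_sketch(features):
    # Sketch only: row-normalize a (sparse) feature matrix so each row sums to 1.
    features = sp.csr_matrix(features, dtype=np.float32)
    rowsum = np.array(features.sum(axis=1)).flatten()
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1.0)
    r_inv[np.isinf(r_inv)] = 0.0          # rows with no features stay all-zero
    return sp.diags(r_inv).dot(features)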
Example #3
def run(args):
    (
        adj,
        features,
        y_train,
        y_val,
        y_test,
        train_mask,
        val_mask,
        test_mask,
        train_size,
        test_size,
    ) = load_corpus(args.select_data)

    train_mask = train_mask + val_mask
    y_train = y_train + y_val

    adj_dense = preprocess_adj(adj).toarray().astype(np.float32)
    features_dense = preprocess_features(features).toarray().astype(np.float32)

    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)
    train_mask = train_mask.astype(np.float32)
    test_mask = test_mask.astype(np.float32)

    gcn_model = GCN(
        tf.convert_to_tensor(adj_dense),
        layers=args.layers,
        hidden_size=args.hidden_size,
        dropout=args.dropout,
    )

    loss_fn = masked_softmax_cross_entropy

    # acc_fn = masked_accuracy

    optimizer = Adam(learning_rate=args.lr)
    # print("Model Layers: ", gcn_model.trainable_variables)
    model_textGCN = TextGCN(model=gcn_model,
                            loss=loss_fn,
                            optimizer=optimizer,
                            args=args)

    model_textGCN.train(features_dense, y_train, train_mask)

    sns.distplot(model_textGCN.train_accuracy)
    plt.savefig("train_acc.png")

    plt.clf()

    sns.distplot(model_textGCN.train_losses)
    plt.savefig("train_losses.png")

    eval_result = model_textGCN.evaluate(features_dense, y_test, test_mask)

    print(f"Final Evaluation Result: {eval_result}")
Example #4
def load(dataset):
    datadir = os.path.join('data', dataset)

    if not os.path.exists(datadir):
        os.makedirs(datadir)
        ds = download(dataset)
        adj = nx.to_numpy_array(ds.graph)
        diff = compute_ppr(ds.graph, 0.2)
        feat = ds.features[:]
        labels = ds.labels[:]

        idx_train = np.argwhere(ds.train_mask == 1).reshape(-1)
        idx_val = np.argwhere(ds.val_mask == 1).reshape(-1)
        idx_test = np.argwhere(ds.test_mask == 1).reshape(-1)

        np.save(f'{datadir}/adj.npy', adj)
        np.save(f'{datadir}/diff.npy', diff)
        np.save(f'{datadir}/feat.npy', feat)
        np.save(f'{datadir}/labels.npy', labels)
        np.save(f'{datadir}/idx_train.npy', idx_train)
        np.save(f'{datadir}/idx_val.npy', idx_val)
        np.save(f'{datadir}/idx_test.npy', idx_test)
    else:
        adj = np.load(f'{datadir}/adj.npy')
        diff = np.load(f'{datadir}/diff.npy')
        feat = np.load(f'{datadir}/feat.npy')
        labels = np.load(f'{datadir}/labels.npy')
        idx_train = np.load(f'{datadir}/idx_train.npy')
        idx_val = np.load(f'{datadir}/idx_val.npy')
        idx_test = np.load(f'{datadir}/idx_test.npy')

    if dataset == 'citeseer':
        feat = preprocess_features(feat)

        epsilons = [1e-5, 1e-4, 1e-3, 1e-2]
        avg_degree = np.sum(adj) / adj.shape[0]
        epsilon = epsilons[np.argmin([
            abs(avg_degree - np.argwhere(diff >= e).shape[0] / diff.shape[0])
            for e in epsilons
        ])]

        diff[diff < epsilon] = 0.0
        scaler = MinMaxScaler()
        scaler.fit(diff)
        diff = scaler.transform(diff)

    ori_adj = copy.deepcopy(adj)
    # print(ori_adj)
    adj = normalize_adj(adj + sp.eye(adj.shape[0])).todense()

    return ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test
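For citeseer, the snippet above thresholds the diffusion matrix with the epsilon whose per-node count of retained entries best matches the graph's average degree. A self-contained illustration of that selection rule with toy data (all numbers are made up):

import numpy as np

rng = np.random.default_rng(0)
diff = rng.random((100, 100)) * 1e-2                   # stand-in for compute_ppr output
adj = (rng.random((100, 100)) < 0.05).astype(float)    # stand-in adjacency

epsilons = [1e-5, 1e-4, 1e-3, 1e-2]
avg_degree = np.sum(adj) / adj.shape[0]

# For each candidate epsilon, count how many diffusion entries survive per node,
# then keep the epsilon whose density is closest to the average degree.
densities = [np.argwhere(diff >= e).shape[0] / diff.shape[0] for e in epsilons]
epsilon = epsilons[int(np.argmin([abs(avg_degree - d) for d in densities]))]
diff[diff < epsilon] = 0.0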
Example #5
def main(args):
    #
    save_dir = args.save_dir
    log_dir = args.log_dir
    train_dir = args.data_dir

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = utils.load_data(
        args.data_type)
    features = utils.preprocess_features(features)
    support = [utils.preprocess_adj(adj)]
    args.num_supports = 1
    args.input_size, args.features_size = features[2][1], features[2]
    args.output_size = y_train.shape[1]

    config_proto = utils.get_config_proto()
    sess = tf.Session(config=config_proto)
    model = GCN(args, sess, name="gcn")
    summary_writer = tf.summary.FileWriter(log_dir)

    for epoch in range(1, args.nb_epoch + 1):
        epoch_start_time = time.time()

        feed_dict = utils.construct_feed_dict(model, features, support,
                                              y_train, train_mask)
        _, train_loss, train_acc, summaries = model.train(feed_dict)

        if epoch % args.summary_epoch == 0:
            summary_writer.add_summary(summaries, epoch)

        if epoch % args.print_epoch == 0:
            feed_dict_val = utils.construct_feed_dict(model, features, support,
                                                      y_val, val_mask)
            val_loss, val_acc = model.evaluate(feed_dict_val)
            print "epoch %d, train_loss %f, train_acc %f, val_loss %f, val_acc %f, time %.5fs" % \
              (epoch, train_loss, train_acc, val_loss, val_acc, time.time()-epoch_start_time)

        if args.anneal and epoch >= args.anneal_start:
            sess.run(model.lr_decay_op)

    model.saver.save(sess, os.path.join(save_dir, "model.ckpt"))
    print "Model stored...."
def get_data(dataset):
    # Load output_data
    (adj, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size,
     test_size) = utils.load_data(dataset)

    features = sparse.identity(adj.shape[1])

    # Some preprocessing
    features = utils.preprocess_features(features)
    support = [utils.preprocess_adj(adj)]

    # Define placeholders
    t_features = torch.from_numpy(features)
    t_y_train = torch.from_numpy(y_train)
    t_y_val = torch.from_numpy(y_val)
    t_y_test = torch.from_numpy(y_test)
    t_train_mask = torch.from_numpy(train_mask.astype(np.float32))

    t_support = []
    for i in range(len(support)):
        t_support.append(torch.Tensor(support[i]))

    return (t_features, t_y_train, t_y_val, t_y_test, t_train_mask, t_support,
            val_mask, test_mask, train_size, test_size)
Example #7
def load_data(dataset_name,
              splits_file_path=None,
              train_percentage=None,
              val_percentage=None,
              embedding_mode=None,
              embedding_method=None,
              embedding_method_graph=None,
              embedding_method_space=None):
    if dataset_name in {'cora', 'citeseer', 'pubmed'}:
        adj, features, labels, _, _, _ = utils.load_data(dataset_name)
        labels = np.argmax(labels, axis=-1)
        features = features.todense()
        G = nx.DiGraph(adj)
    else:
        graph_adjacency_list_file_path = os.path.join('new_data', dataset_name,
                                                      'out1_graph_edges.txt')
        graph_node_features_and_labels_file_path = os.path.join(
            'new_data', dataset_name, f'out1_node_feature_label.txt')

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        if dataset_name == 'film':
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    feature_blank = np.zeros(932, dtype=np.uint8)
                    feature_blank[np.array(line[1].split(','),
                                           dtype=np.uint16)] = 1
                    graph_node_features_dict[int(line[0])] = feature_blank
                    graph_labels_dict[int(line[0])] = int(line[2])
        else:
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    graph_node_features_dict[int(line[0])] = np.array(
                        line[1].split(','), dtype=np.uint8)
                    graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array([
            features for _, features in sorted(G.nodes(data='features'),
                                               key=lambda x: x[0])
        ])
        labels = np.array([
            label
            for _, label in sorted(G.nodes(data='label'), key=lambda x: x[0])
        ])

    features = utils.preprocess_features(features)

    if not embedding_mode:
        g = DGLGraph(adj + sp.eye(adj.shape[0]))
    else:
        if embedding_mode == 'ExperimentTwoAll':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_all',
                f'outf_nodes_relation_{dataset_name}all_embedding_methods.txt')
        elif embedding_mode == 'ExperimentTwoPairs':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_in_pairs',
                f'outf_nodes_relation_{dataset_name}_graph_{embedding_method_graph}_space_{embedding_method_space}.txt'
            )
        else:
            embedding_file_path = os.path.join(
                'structural_neighborhood',
                f'outf_nodes_space_relation_{dataset_name}_{embedding_method}.txt'
            )
        space_and_relation_type_to_idx_dict = {}

        with open(embedding_file_path) as embedding_file:
            for line in embedding_file:
                if line.rstrip() == 'node1,node2	space	relation_type':
                    continue
                line = re.split(r'[\t,]', line.rstrip())
                assert (len(line) == 4)
                assert (int(line[0]) in G and int(line[1]) in G)
                if (line[2], int(
                        line[3])) not in space_and_relation_type_to_idx_dict:
                    space_and_relation_type_to_idx_dict[(line[2], int(
                        line[3]))] = len(space_and_relation_type_to_idx_dict)
                if G.has_edge(int(line[0]), int(line[1])):
                    G.remove_edge(int(line[0]), int(line[1]))
                G.add_edge(int(line[0]),
                           int(line[1]),
                           subgraph_idx=space_and_relation_type_to_idx_dict[(
                               line[2], int(line[3]))])

        space_and_relation_type_to_idx_dict['self_loop'] = len(
            space_and_relation_type_to_idx_dict)
        for node in sorted(G.nodes()):
            if G.has_edge(node, node):
                G.remove_edge(node, node)
            G.add_edge(
                node,
                node,
                subgraph_idx=space_and_relation_type_to_idx_dict['self_loop'])
        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        g = DGLGraph(adj)

        for u, v, feature in G.edges(data='subgraph_idx'):
            g.edges[g.edge_id(u,
                              v)].data['subgraph_idx'] = th.tensor([feature])

    if splits_file_path:
        with np.load(splits_file_path) as splits_file:
            train_mask = splits_file['train_mask']
            val_mask = splits_file['val_mask']
            test_mask = splits_file['test_mask']
    else:
        assert (train_percentage is not None and val_percentage is not None)
        assert (train_percentage < 1.0 and val_percentage < 1.0
                and train_percentage + val_percentage < 1.0)

        if dataset_name in {'cora', 'citeseer'}:
            disconnected_node_file_path = os.path.join(
                'unconnected_nodes', f'{dataset_name}_unconnected_nodes.txt')
            with open(disconnected_node_file_path) as disconnected_node_file:
                disconnected_node_file.readline()
                disconnected_nodes = []
                for line in disconnected_node_file:
                    line = line.rstrip()
                    disconnected_nodes.append(int(line))

            disconnected_nodes = np.array(disconnected_nodes)
            connected_nodes = np.setdiff1d(np.arange(features.shape[0]),
                                           disconnected_nodes)

            connected_labels = labels[connected_nodes]

            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage +
                             val_percentage).split(
                                 np.empty_like(connected_labels),
                                 connected_labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(connected_labels[train_and_val_index]),
                    connected_labels[train_and_val_index]))
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            train_mask = np.zeros_like(labels)
            train_mask[connected_nodes[train_index]] = 1
            val_mask = np.zeros_like(labels)
            val_mask[connected_nodes[val_index]] = 1
            test_mask = np.zeros_like(labels)
            test_mask[connected_nodes[test_index]] = 1
        else:
            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage +
                             val_percentage).split(np.empty_like(labels),
                                                   labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(labels[train_and_val_index]),
                    labels[train_and_val_index]))
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            train_mask = np.zeros_like(labels)
            train_mask[train_index] = 1
            val_mask = np.zeros_like(labels)
            val_mask[val_index] = 1
            test_mask = np.zeros_like(labels)
            test_mask[test_index] = 1

    num_features = features.shape[1]
    num_labels = len(np.unique(labels))
    assert (np.array_equal(np.unique(labels),
                           np.arange(len(np.unique(labels)))))

    features = th.FloatTensor(features)
    labels = th.LongTensor(labels)
    train_mask = th.BoolTensor(train_mask)
    val_mask = th.BoolTensor(val_mask)
    test_mask = th.BoolTensor(test_mask)

    # Adapted from https://docs.dgl.ai/tutorials/models/1_gnn/1_gcn.html
    degs = g.in_degrees().float()
    norm = th.pow(degs, -0.5).cuda()
    norm[th.isinf(norm)] = 0
    g.ndata['norm'] = norm.unsqueeze(1)

    return g, features, labels, train_mask, val_mask, test_mask, num_features, num_labels
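The per-node norm computed at the end of Example #7 implements the usual symmetric normalization: scaling each message by norm[i] * norm[j] is equivalent to multiplying by D^-1/2 (A + I) D^-1/2. A tiny NumPy check of that equivalence (illustrative only):

import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])
A_hat = A + np.eye(3)                        # self-loops, as in the snippet above
deg = A_hat.sum(axis=1)
norm = deg ** -0.5

sym = np.diag(norm) @ A_hat @ np.diag(norm)              # D^-1/2 A_hat D^-1/2
assert np.allclose(sym, A_hat * np.outer(norm, norm))    # == norm[i] * A_hat[i, j] * norm[j]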
Example #8
A = A + np.eye(A.shape[0])  # Add self-loops
# print('A = A+np.eye(A.shape[0])')
# print(type(A))
# print(A)

#=========================  X      =======================
#================================== Normalization ===================================

data_pre = []
data_train = []
data_test = []
# Preprocessing operations
A_list = [A for i in range(5)]

for i in range(5):
    data_pre.append(preprocess_features(data[i]))

for i in range(2):
    data_train.append(data_pre[i])

for i in range(2, 4):
    data_test.append(data_pre[i])

# for i in range(5):
#     data_pre.append(preprocess_features(data[i]))
# for i in range(3):
#     data_train.append([data_pre[i], A])
# for i in range(2,4):
#     data_test.append([data_pre[i], A])

#--ValueError: could not broadcast input array from shape (12,7) into shape (12)
Example #9
import torch 
import numpy as np
import pickle
from utils import load_data, preprocess_features, preprocess_adj, tuple_to_torchSparseTensor
from gcn_model import GCN

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("cora")

adj_hat = preprocess_adj(adj)
features = preprocess_features(features)
# features[0].shape == (49216, 2)
# features[1].shape == (49216,)
# features[2] == (2708, 1433)

# Convert to torch.Tensor
sparse_adj_hat = tuple_to_torchSparseTensor(adj_hat)
sparse_features = tuple_to_torchSparseTensor(features)

y_train = torch.FloatTensor(y_train)  # dtype = torch.float32
y_val = torch.FloatTensor(y_val)
y_test = torch.FloatTensor(y_test)

train_mask = torch.from_numpy(train_mask)  # dtype = torch.bool
val_mask = torch.from_numpy(val_mask)
test_mask = torch.from_numpy(test_mask)
model_file = 'training_dir/gcn_model.pkl'
model = torch.load(model_file)
output = model(sparse_adj_hat, sparse_features)
test_loss = model.loss(output, y_test, test_mask)
test_acc = model.accuracy(output, y_test, test_mask)
print("model_file={}, test_loss={}, test_acc={}".format(model_file, test_loss.item(), test_acc.item()))
Example #10
from models.gat import create_gat_model
from models.sgc import create_sgc_model
from models.gfnn import create_gfnn_model
#from models.graphsage import create_graphsage_model
from models.masked_gcn import create_masked_gcn_model
# The 'gcn' branch below calls create_gcn_model; assuming a parallel module path:
from models.gcn import create_gcn_model
from train import run
from utils import preprocess_features

import argparse

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--dataset', type=str, default='small')
    parser.add_argument('--model', type=str, default='sgc')
    parser.add_argument('--niter', type=int, default=10)

    args = parser.parse_args()

    data = Data.load(args.dataset)  #load_data(args.dataset)
    data.update_mask(0)
    data.features = preprocess_features(data.features)

    if args.model == 'sgc':
        model, optimizer = create_sgc_model(data, lr=0.2, K=2)
    elif args.model == 'gcn':
        model, optimizer = create_gcn_model(data)
    else:
        raise ValueError(args.model)
    run(data, model, optimizer, verbose=False, niter=args.niter, patience=10)
Example #11
def exp(dataset, data_seed, init_seed):
    '''
    dataset   - name of the dataset
    data_seed - seed that determines the train/dev/test split
    init_seed - seed for initializing NN weights
    '''
    print('running {} on {}'.format(FLAGS.model, dataset))

    tf.reset_default_graph()
    adj, subgraphs, features, labels, train_mask, val_mask, test_mask = load_data(
        dataset, data_seed)
    features = preprocess_features(features)

    # If early stopping is not used, fold the validation data into the test data
    if FLAGS.early_stop == 0:
        mask = np.logical_or(val_mask, test_mask)
        test_mask = mask
        val_mask = mask

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    #config.log_device_placement = True
    config.gpu_options.allow_growth = True

    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []
    with tf.Graph().as_default():
        random.seed(init_seed)
        np.random.seed(init_seed)
        tf.set_random_seed(init_seed)

        with tf.Session(config=config) as sess:

            model, support, placeholders = build_model(adj, features,
                                                       labels.shape[1],
                                                       subgraphs)
            sess.run(tf.global_variables_initializer())

            def evaluate(labels_mask, noise=0., dropout=0.):
                feed_dict_val = construct_feed_dict(features, support, labels,
                                                    labels_mask, placeholders,
                                                    noise, dropout)
                outs_val = sess.run([model.loss, model.accuracy],
                                    feed_dict=feed_dict_val)
                return outs_val[0], outs_val[1]

            start_t = time.time()
            for epoch in range(FLAGS.epochs):
                feed_dict = construct_feed_dict(features, support, labels,
                                                train_mask, placeholders,
                                                FLAGS.fisher_noise,
                                                FLAGS.dropout)
                feed_dict.update({tf.keras.backend.learning_phase(): 1})
                outs = sess.run([model.opt_op, model.loss, model.accuracy],
                                feed_dict=feed_dict)
                train_loss.append(outs[1])
                train_acc.append(outs[2])

                # Validation
                outs = evaluate(val_mask)
                valid_loss.append(outs[0])
                valid_acc.append(outs[1])

                if (epoch + 1) % 10 == 0:
                    print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                          "{:.5f}".format(train_loss[-1]), "train_acc=",
                          "{:.5f}".format(train_acc[-1]), "val_loss=",
                          "{:.5f}".format(valid_loss[-1]), "val_acc=",
                          "{:.5f}".format(valid_acc[-1]))
                    #print( 'perterbation radius:', sess.run( pradius ) )

                if FLAGS.early_stop == 0:
                    if epoch > 10 and (train_loss[-1] > 1.5 * train_loss[0]
                                       or np.isnan(train_loss[-1])):
                        print("Early stopping at epoch {}...".format(epoch))
                        break

                elif FLAGS.early_stop == 1:  # simple early stopping
                    if epoch > 20 and valid_loss[-1] > np.mean( valid_loss[-10:] ) \
                                  and valid_acc[-1] < np.mean( valid_acc[-10:] ):
                        print("Early stopping at epoch {}...".format(epoch))
                        break

                elif FLAGS.early_stop == 2:  # more strict conditions
                    if epoch > 100 \
                        and np.mean( valid_loss[-10:] ) > np.mean( valid_loss[-100:] ) \
                        and np.mean( valid_acc[-10:] ) < np.mean( valid_acc[-100:] ):
                        print("Early stopping at epoch {}...".format(epoch))
                        break
                else:
                    print('unknown early stopping strategy:', FLAGS.early_stop)
                    sys.exit(0)

            test_loss, test_acc = evaluate(test_mask)
            sec_per_epoch = (time.time() - start_t) / (epoch + 1)
            print("Test set results:", "loss=", "{:.5f}".format(test_loss),
                  "accuracy=", "{:.5f}".format(test_acc), "epoch_secs=",
                  "{:.2f}".format(sec_per_epoch))

    tf.reset_default_graph()

    return {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
    }
Example #12
A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('cora')

# Parameters
N = X.shape[0]  # Number of nodes in the graph
F = X.shape[1]  # Original feature dimension
n_classes = Y_train.shape[1]  # Number of classes
F_ = 8  # Output size of first GraphAttention layer
n_attn_heads = 8  # Number of attention heads in first GAT layer
dropout_rate = 0.6  # Dropout rate (between and inside GAT layers)
l2_reg = 5e-4 / 2  # Factor for l2 regularization
learning_rate = 5e-3  # Learning rate for Adam
epochs = 10000  # Number of training epochs
es_patience = 100  # Patience for early stopping

# Preprocessing operations
X = preprocess_features(X)
A = A + np.eye(A.shape[0])  # Add self-loops

# Model definition (as per Section 3.3 of the paper)
X_in = Input(shape=(F, ))
A_in = Input(shape=(N, ))

dropout1 = Dropout(dropout_rate)(X_in)
graph_attention_1 = GraphAttention(
    F_,
    attn_heads=n_attn_heads,
    attn_heads_reduction='concat',
    dropout_rate=dropout_rate,
    activation='elu',
    kernel_regularizer=l2(l2_reg),
    attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
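The excerpt stops after the first attention layer. A hedged sketch of how such a model is often completed, assuming the GraphAttention signature shown above and that Model and Adam come from the same Keras package as the other layers (layer sizes and names here are illustrative, not the original author's code):

# Second attention layer: average the heads and project to class probabilities
dropout2 = Dropout(dropout_rate)(graph_attention_1)
graph_attention_2 = GraphAttention(
    n_classes,
    attn_heads=1,
    attn_heads_reduction='average',
    dropout_rate=dropout_rate,
    activation='softmax',
    kernel_regularizer=l2(l2_reg),
    attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

# Build and compile; node masks are typically supplied later via sample_weight
model = Model(inputs=[X_in, A_in], outputs=graph_attention_2)
model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss='categorical_crossentropy',
              weighted_metrics=['acc'])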
Example #13
            elif 'bias' in name:
                torch.nn.init.constant_(w, 0)
            else:
                pass


if __name__ == '__main__':
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
        'cora')

    print('adj:', adj.shape)
    print('features:', features.shape)
    print('y:', y_train.shape, y_val.shape, y_test.shape)
    print('mask:', train_mask.shape, val_mask.shape, test_mask.shape)

    features = preprocess_features(
        features)  # [49216, 2], [49216], [2708, 1433]
    supports = preprocess_adj(adj)

    train_label = torch.from_numpy(y_train).long().to(device)
    num_classes = train_label.shape[1]
    train_label = train_label.argmax(dim=1)
    train_mask = torch.from_numpy(train_mask.astype(np.int64)).to(device)
    val_label = torch.from_numpy(y_val).long().to(device)
    val_label = val_label.argmax(dim=1)
    val_mask = torch.from_numpy(val_mask.astype(np.int64)).to(device)
    test_label = torch.from_numpy(y_test).long().to(device)
    test_label = test_label.argmax(dim=1)
    test_mask = torch.from_numpy(test_mask.astype(np.int64)).to(device)

    i = torch.from_numpy(features[0].astype(np.int64)).long().to(device)
    v = torch.from_numpy(features[1]).to(device)
Example #14
flags.DEFINE_float('weight_decay', 5e-4,
                   'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10,
                     'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
flags.DEFINE_string('gpu', '1', 'GPU selection.')
flags.DEFINE_string('method', args.method, 'Adversarial attack method')

os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
    FLAGS.dataset_dir, FLAGS.dataset)

# Some preprocessing
features_dense, features = preprocess_features(features)
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
Example #15
            print("Ignored fold {}, since store_data_dicts already contains {} folds".format(fold_i, len(store_data_dicts)))
            continue
        
        data["train_ind"] = train_ind
        data["val_ind"] = val_ind
        data["fold_i"] = fold_i
        
        # Get the CNN embedding in the case of a Parisot model; it depends on fold_i.
        if args.model == "parisot_tf" or args.model == "parisot_py":
            data['cnn_embedding'] = get_data_raw_cnn(args, fold_i)
            if data['cnn_embedding'] is None:
                print("No cnn embedding loaded, this fold is therefore ignored.")
                continue
            
            # Preprocess the features for the parisot models.
            data['cnn_embedding'] = preprocess_features(data['cnn_embedding']) 

            # Rename to the generic key 'input_features', independent of the actual data source.
            data['input_features'] = data.pop('cnn_embedding')
            
            # Create the adjacency matrix based on the VAE in the case of a Parisot model.
            # This must happen at this point since it can depend on input_features for the sparsity part.
            data['vae'] = get_data_raw_amc_vae(data['id'])
            data['adj_raw'] = get_adjacency_matrix_vae(args, data['vae'], data)
            data['adj_support'] = chebyshev_polynomials(data['adj_raw'], args.polynomial_degree)
        
        store_data_dict = train_single_fold(args, data)
        store_data_dicts.append(store_data_dict)
            
        save_datadicts(args, store_data_dicts)
Example #16
def train():

    clf = request.form['train']
    if allowed_classifier(clf):
        string = str('train')
        hist_n = string + "hist.jpeg"
        cnmt_n = string + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)

        # Feature extraction
        data = utils.file_parser(
            os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape

        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing encoder

        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(
            data_x, data_y, 0.2)

        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)

        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()

        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)

        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()

        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )

        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'],
                                                filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'],
                                                filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
Example #17
def train_Model(dataset, data_seed, init_seed):
    print('{} Model on {}'.format(FLAGS.model, dataset))

    tf.reset_default_graph()
    adj, features, labels, train_mask, val_mask, test_mask = load_data(
        dataset, data_seed)

    #Feature Selection part
    y_train = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_train = np.argmax(y_train, axis=1)
    #     clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(features[train_mask], y_train[train_mask])
    #     model = SelectFromModel(clf, prefit=True)
    #     features = model.transform(features)

    # alfa*A (clique-augmented) + beta*A' (original)  -- clique finding
    graphMain = nx.from_numpy_matrix(adj.todense())
    listClique = list(nx.find_cliques(graphMain))
    tmp = deepcopy(np.matrix(adj.todense()))
    for i in listClique:
        for j in i:
            for k in i:
                if j != k:
                    adj[j, k] = len(i) - 1
                    adj[k, j] = len(i) - 1
    adj = FLAGS.alfa * np.matrix(adj.todense()) + FLAGS.beta * tmp

    features = preprocess_features(features)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    #config.log_device_placement = True
    config.gpu_options.allow_growth = True

    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []
    with tf.Graph().as_default():
        random.seed(init_seed)
        np.random.seed(init_seed)
        tf.set_random_seed(init_seed)

        sess = tf.Session(config=config)

        model, support, placeholders = build_model(adj, features,
                                                   labels.shape[1])
        sess.run(tf.global_variables_initializer())

        start_t = time.time()
        for epoch in range(FLAGS.epochs):
            feed_dict = construct_feed_dict(features, support, labels,
                                            train_mask, placeholders,
                                            FLAGS.dropout, FLAGS.alfa,
                                            FLAGS.beta)
            feed_dict.update({tf.keras.backend.learning_phase(): 1})
            outs = sess.run([model.opt_op, model.loss, model.accuracy],
                            feed_dict=feed_dict)
            train_loss.append(outs[1])
            train_acc.append(outs[2])

            # Validation
            outs = evaluate(sess, model, features, support, labels,
                            placeholders, val_mask)
            valid_loss.append(outs[0])
            valid_acc.append(outs[1])

            if epoch > FLAGS.early_stoping \
                    and np.mean( valid_loss[-10:] ) > np.mean( valid_loss[-100:] ) \
                    and np.mean( valid_acc[-10:] ) < np.mean( valid_acc[-100:] ):
                print("Early stopping at epoch {}...".format(epoch))
                break

        test_loss, test_acc = evaluate(sess, model, features, support, labels,
                                       placeholders, test_mask)
        print("Test set results:", "loss=", "{:.5f}".format(test_loss),
              "accuracy=", "{:.5f}".format(test_acc))

    tf.reset_default_graph()
    from importlib import reload
    import scipy.io as sio
    sio.savemat('train_lossCoolClique.mat',
                {'train_loss_GOOLnorm': train_loss})
    sio.savemat('train_accCoolClique.mat', {'train_loss_GOOLnorm': train_acc})
    sio.savemat('valid_lossCoolClique.mat',
                {'train_loss_GOOLnorm': valid_loss, 'valid_acc': valid_acc})
    sio.savemat('valid_accgCoolClique.mat', {'train_loss_GOOLnorm': valid_acc})

    return {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
    }
Example #18
neg_test_path = dataset + '/neg_test.pkl'
changedadj_path = dataset + '/changed_adj.pkl'
linkspath = dataset + '/links.pkl'

# Load data
if dataset == 'nell.0.001':
    features = load_nell(dataset)[1]
else:
    features = load_data(dataset)[1]

with open(changedadj_path, 'rb') as load_cha_adj:
    changed_adj = pickle.load(load_cha_adj)

# Some preprocessing
if FLAGS.features == 0:
    changed_features = preprocess_features(changed_adj +
                                           sp.eye(changed_adj.shape[0]))
else:
    changed_features = preprocess_features(features)

support = [preprocess_adj(changed_adj)]
num_supports = 1
model_func = GCN

# Define placeholders
placeholders = {
    'support':
    [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features':
    tf.sparse_placeholder(tf.float32,
                          shape=tf.constant(changed_features[2],
                                            dtype=tf.int64)),
Example #19
print('Dataset: ' + dataset)
print('----- Opt. hyperparams -----')
print('lr: {}'.format(lr))
print('l2_coef: {}'.format(l2_coef))
print('feed forward dropout: {}'.format(ff_dropout))
print('attention dropout: {}'.format(attn_dropout))
print('patience: {}'.format(patience))
print('----- Archi. hyperparams -----')
print('no. layers: {}'.format(n_layer))
print('no. hidden units: {}'.format(hidden_units))
print('nonlinearity: {}'.format(nonlinearity))
print('model: {}'.format(model))

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
    dataset)
features, spars = preprocess_features(features)

n_node = features.shape[0]
ft_size = features.shape[1]
n_class = y_train.shape[1]

adj = adj.todense()

features = torch.from_numpy(features)
y_train = torch.from_numpy(y_train)
y_val = torch.from_numpy(y_val)
y_test = torch.from_numpy(y_test)
train_mask = torch.from_numpy(np.array(train_mask, dtype=np.uint8))
val_mask = torch.from_numpy(np.array(val_mask, dtype=np.uint8))
test_mask = torch.from_numpy(np.array(test_mask, dtype=np.uint8))