Example #1
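All seven snippets assume roughly the same header; a minimal sketch of the imports they rely on (the model classes and data loaders such as Encoder, MeanAggregator, SupervisedGraphSage, and load_cora are defined in the repositories the snippets come from):

import time
import random

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score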
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    feat_data, labels, adj_lists = load_cora()
    adj_score_list, adj_score_sum = findNeighbor(adj_lists)
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    # features.cuda()

    agg1 = MeanAggregator(adj_score_list, adj_score_sum, features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(adj_score_list,
                          adj_score_sum,
                          lambda nodes: enc1(nodes).t(),
                          cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   128,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   gcn=True,
                   cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5

    graphsage = SupervisedGraphSage(7, enc2)
    #    graphsage.cuda()
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       graphsage.parameters()),
                                lr=0.7)
    times = []
    for batch in range(100):
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(
            batch_nodes,
            Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        print(batch, loss.data.item())

    val_output = graphsage.forward(val)
    print(
        "Validation F1:",
        f1_score(labels[val],
                 val_output.data.numpy().argmax(axis=1),
                 average="micro"))
    print("Average batch time:", np.mean(times))
Example #2
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    mini_batch_size = 256
    feat_data, labels, adj_lists = load_cora()
    print(feat_data.shape)
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
    # features.cuda()

    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False, gcn=True)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
            base_model=enc1, gcn=True, cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5

    graphsage = SupervisedGraphSage(7, enc2)
    # graphsage.cuda()
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []

    num_train_nodes = len(train)

    mini_batches_iterator = minibatch_iter(num_train_nodes, mini_batch_size)

    n_epochs = 3
    for epoch in range(n_epochs):
        # one epoch
        print("Start running epoch {0} / {1}".format(epoch + 1, n_epochs))
        random.shuffle(train)
        mini_batches = iter(mini_batches_iterator)
        for start, end in mini_batches:
            batch_nodes = train[start:end]
            
            start_time = time.time()
            optimizer.zero_grad()
            loss = graphsage.loss(batch_nodes, 
                    Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
            loss.backward()
            optimizer.step()
            end_time = time.time()
            times.append(end_time-start_time)
            print("\t", start, end, loss.data.item())

        val_output = graphsage.forward(val) 
        print("Validation F1:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
        print("Average batch time:", np.mean(times) if len(times) else 0)
Example #3
def run_cora():
    np.random.seed(1)
    random.seed(1)

    # load data
    num_nodes = 2708
    feat_data, labels, adj_lists = load_cora()
    train, test, val = split_data(labels)

    # construct model
    ## layer1 : Embedding layer
    features = nn.Embedding(feat_data.shape[0], feat_data.shape[1])
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    #features.cuda()
    ## layer2 : Sample and Aggregate 1433->128
    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    ## layer3 : Sample and Aggregate 128->128
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   128,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   gcn=True,
                   cuda=False)
    ## layer4 : Classification layer
    enc1.num_samples = 5
    enc2.num_samples = 5
    graphsage = SupervisedGraphSage(7, enc2)

    # optimizer
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       graphsage.parameters()),
                                lr=0.7)

    times = []
    for batch in range(100):
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(
            batch_nodes,
            Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        if batch % 10 == 0:
            print(batch, float(loss.data))
    print('Finished training.')
    print('******************************')

    test_output = graphsage.forward(test)
    test_scores = test_output.data.numpy()
    test_labels = labels[test]
    test_preds = np.argmax(test_scores, axis=1)
    print('Test Accuracy:', accuracy_score(test_labels, test_preds))
    print('Average batch time:', np.mean(times))
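Example #3 also calls a split_data helper that is not shown; a plausible sketch, assuming it mirrors the 1000/500/rest random split used in Example #1 (the return order matches the unpacking train, test, val above):

def split_data(labels, num_test=1000, num_val=500):
    # Hypothetical helper: shuffle all node indices, then carve out
    # test, val, and train index sets.
    rand_indices = np.random.permutation(len(labels))
    test = rand_indices[:num_test]
    val = rand_indices[num_test:num_test + num_val]
    train = list(rand_indices[num_test + num_val:])
    return train, test, val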
Example #4
def run_data(input_node, input_edge_train, input_edge_test, output_file, name):
    feat_data, edge_train, label_train, edge_test, label_test, adj_lists, adj_time = load_data(
        input_node, input_edge_train, input_edge_test)
    print("Finish Loading Data")
    features = nn.Embedding(len(feat_data), 1000)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    #features.cuda()
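    # 'dimension' (used below) is assumed to be a module-level constant
    # giving the hidden embedding size.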
    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features,
                   1000,
                   dimension,
                   adj_lists,
                   adj_time,
                   agg1,
                   gcn=True,
                   cuda=False)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   dimension,
                   adj_lists,
                   adj_time,
                   agg2,
                   base_model=enc1,
                   gcn=True,
                   cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5
    enc2.last = True
    graphsage = SupervisedGraphSage(2, enc2, name)
    #graphsage.cuda()
    f = open(output_file, 'a+')
    f.write("Training\n")
    # Plain SGD keeps no internal state, so the optimizer can be built once
    # instead of being re-created every epoch.
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       graphsage.parameters()),
                                lr=0.7)
    for epoch in range(0, 20):
        optimizer.zero_grad()
        f.write("epoch " + str(epoch) + "\n")
        loss, predict_y = graphsage.loss(
            Variable(torch.LongTensor(label_train)), agg1, agg2, edge_train)
        print("AUC: " + str(metrics.roc_auc_score(label_train, predict_y)) +
              "\n")
        f.write("AUC: " + str(metrics.roc_auc_score(label_train, predict_y)) +
                "\n")
        loss.backward()
        optimizer.step()
    f.write("Testing\n")
    loss, predict_y = graphsage.loss(Variable(torch.LongTensor(label_test)),
                                     agg1, agg2, edge_test)
    f.write("AUC: " + str(metrics.roc_auc_score(label_test, predict_y)) + "\n")
    # Threshold the predicted probabilities at 0.5 to get hard labels.
    predict_y1 = [1 if p > 0.5 else 0 for p in predict_y]

    f.write("Micro-f1 score: " +
            str(metrics.f1_score(label_test, predict_y1, average="micro")) +
            "\n")
    f.write("Macro-f1 score: " +
            str(metrics.f1_score(label_test, predict_y1, average="macro")) +
            "\n")
    f.write("recall: " + str(metrics.recall_score(label_test, predict_y1)) +
            "\n")
    f.close()
Example #5
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    feat_data, labels, adj_lists, IDs = load_cora()
    print(feat_data[0])
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    # features.cuda()

    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, cuda=False)

    print(enc1.embed_dim)

    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   128,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   cuda=False,
                   text_label="Encoder_1")

    agg3 = MeanAggregator(lambda nodes: enc2(nodes).t(), cuda=False)
    enc3 = Encoder(lambda nodes: enc2(nodes).t(),
                   enc2.embed_dim,
                   128,
                   adj_lists,
                   agg3,
                   base_model=enc2,
                   cuda=False,
                   text_label="Encoder_2")

    enc1.num_samples = 5
    enc2.num_samples = 5
    enc3.num_samples = 5
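    # Note: enc3/agg3 are constructed above, but the classifier below is
    # built on enc2, so the third layer never participates in training.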

    graphsage = SupervisedGraphSage(7, enc2)
    #    graphsage.cuda()
    # Deterministic split: indices are not permuted here, despite the name.
    rand_indices = range(num_nodes)
    test = rand_indices[2166:]
    val = rand_indices[2165:2166]
    train = list(rand_indices[:2165])
    print(np.shape(train))

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       graphsage.parameters()),
                                lr=0.6,
                                weight_decay=4e-5)
    times = []
    best_acc = 0

    for batch in range(150):
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(
            batch_nodes,
            Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)

        if batch % 25 == 0:
            val_output = graphsage.forward(val)
            val_predict = val_output.data.numpy().argmax(axis=1)

            val_acc = accuracy_score(labels[val], val_predict)
            if best_acc < val_acc:
                best_acc = val_acc
                if best_acc > 0.80:
                    output = open('u6014942.csv', 'w')
                    output.write('id,label\n')
                    test_output = graphsage.forward(test)
                    test_predict = test_output.data.numpy().argmax(axis=1)
                    for cnt_output, i in enumerate(test):
                        output.write('%s,%s\n' %
                                     (IDs[i], test_predict[cnt_output] + 1))
                    output.close()
            print("batch", batch, " Validation accuracy:", val_acc,
                  "best_acc:", best_acc)

    output = open('u6014942_final.csv', 'w')
    output.write('id,label\n')
    test_output = graphsage.forward(test)
    test_predict = test_output.data.numpy().argmax(axis=1)
    for cnt_output, i in enumerate(test):
        output.write('%s,%s\n' % (IDs[i], test_predict[cnt_output] + 1))
    output.close()
    print("Average batch time:", np.mean(times))
Example #6
def run_graphsage(feat_data,
                  labels,
                  adj_lists,
                  train,
                  val,
                  test,
                  num_classes,
                  model_class=SupervisedGraphSage):
    np.random.seed(1)
    random.seed(1)
    num_nodes = feat_data.shape[0]
    # feat_data, labels, adj_lists = load_cora()
    features = nn.Embedding(feat_data.shape[0], feat_data.shape[1])
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    if args.cuda:
        features.cuda()

    if model_class == SupervisedGraphSageConcat2:
        # The code below seems not to work, hence the early bail-out.
        raise NotImplementedError()
        linear_embed_weights = nn.Parameter(torch.FloatTensor(
            feat_data.shape[1], args.hid_units),
                                            requires_grad=True)
        init.xavier_uniform_(linear_embed_weights)
        features.weight = nn.Parameter(
            features.weight.mm(linear_embed_weights), requires_grad=False)

    agg1 = MeanAggregator(features, cuda=args.cuda, gcn=args.gcn_aggregator)
    enc1 = Encoder(features,
                   features.weight.shape[1],
                   args.hid_units,
                   adj_lists,
                   agg1,
                   gcn=args.gcn_encoder,
                   cuda=args.cuda)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(),
                          cuda=args.cuda,
                          gcn=args.gcn_aggregator)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   args.hid_units,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   gcn=args.gcn_encoder,
                   cuda=args.cuda)
    enc1.num_samples = args.num_samples[0]
    enc2.num_samples = args.num_samples[1]

    if model_class in (SupervisedGraphSageConcat, SupervisedGraphSageConcat2):
        graphsage = model_class(num_classes, enc1, enc2)
    else:
        graphsage = model_class(num_classes, enc2)
    if args.cuda:
        graphsage.cuda()

    optimizer = torch.optim.SGD(
        [p for p in graphsage.parameters() if p.requires_grad], lr=args.lr)
    times = []
    record_dict = dict()
    best_val_record_dict = None

    for batch in range(args.epochs):
        batch_nodes = train[:args.batch_size]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(
            batch_nodes,
            Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)

        train_acc = accuracy(graphsage.forward(train), labels[train])
        val_acc = accuracy(graphsage.forward(val), labels[val])
        test_acc = accuracy(graphsage.forward(test), labels[test])
        print(batch, loss.data, train_acc, val_acc, test_acc)
        record_dict.update(
            dict(epoch=int(batch + 1),
                 train_loss=float(loss.data),
                 train_acc=float(train_acc),
                 val_acc=float(val_acc),
                 test_accuracy=float(test_acc),
                 time=str(end_time - start_time),
                 early_stopping=False))

        if (best_val_record_dict is None) or (record_dict["val_acc"] >=
                                              best_val_record_dict["val_acc"]):
            best_val_record_dict = record_dict.copy()

    val_output = graphsage.forward(val)
    print(
        "Validation F1:",
        f1_score(labels[val],
                 val_output.data.numpy().argmax(axis=1),
                 average="micro"))
    print("Average batch time:", np.mean(times))
    print(best_val_record_dict)

    if args.use_signac:
        with open(job.fn("results.json"), "w") as f:
            json.dump(best_val_record_dict, f)
        print("Results recorded to {}".format(job.fn("results.json")))
        job.data[f"correct_label"] = labels
Example #7
def train_batch(solver_base,
                data_loader,
                total_loss,
                rep,
                epoch,
                model_list,
                device,
                batch_replication,
                hidden_dimension,
                feature_dim,
                train_graph_recurrence_num,
                train_outer_recurrence_num,
                use_cuda=True,
                is_train=True,
                randomized=True):
    # np.random.seed(1)

    random.seed(1)
    # Parameter groups to optimize
    optim_list = [{
        'params':
        filter(lambda p: p.requires_grad, model.parameters())
    } for model in model_list]
    total_example_num = 0
    # Enumerate from index 1; each iteration reads one item from the data_loader
    for (j, data) in enumerate(data_loader, 1):
        segment_num = len(data[0])
        print('Train CNF:', j)
        for seg in range(segment_num):
            # Move the loaded data onto the GPU
            (graph_map, batch_variable_map, batch_function_map, edge_feature,
             graph_feat, label, answers, var,
             func) = [_to_cuda(d[seg], use_cuda, device) for d in data]
            total_example_num += (batch_variable_map.max() + 1)
            sat_problem = SATProblem(
                (graph_map, batch_variable_map, batch_function_map,
                 edge_feature, answers, None), device, batch_replication)
            loss = torch.zeros(1, device=device, requires_grad=False)
            # Concatenate the answers of all CNFs; run the GraphSAGE model
            # only when a solution exists:
            # if len(answers[0].flatten()) > 0:
            #     answers = np.concatenate(answers, axis=0)
            # Flatten the variables (absolute values) of all clauses
            variable_map = torch.cat(
                ((torch.abs(sat_problem.nodes).to(torch.long) - 1).reshape(
                    1, -1), graph_map[1].to(torch.long).reshape(1, -1)),
                dim=0)
            # feat_data is the [variable, clause] matrix of the input CNF
            feat_data = torch.sparse.FloatTensor(
                variable_map, edge_feature.squeeze(1),
                torch.Size([sum(var), sum(func)])).to_dense()
            # feat_data = feat_data[np.argwhere(torch.sum(torch.abs(feat_data), 1) > 0)[0]]
            num_nodes_x = feat_data.shape[0]
            num_nodes_y = feat_data.shape[1]
            # Encode the loaded data
            features = nn.Embedding(num_nodes_x, num_nodes_y)
            # Use the features initialized by the SP model as the embedding weights
            features.weight = nn.Parameter(feat_data, requires_grad=False)
            if use_cuda:
                features = features.cuda()

            agg1 = MeanAggregator(features, device=device)
            enc1 = Encoder(device,
                           features,
                           num_nodes_y,
                           sat_problem._edge_num,
                           sat_problem.adj_lists,
                           sat_problem.node_adj_lists,
                           agg1,
                           gru=True)
            agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), device=device)
            enc2 = Encoder(device,
                           lambda nodes: enc1(nodes).t(),
                           enc1.embed_dim,
                           sat_problem._edge_num,
                           sat_problem.adj_lists,
                           sat_problem.node_adj_lists,
                           agg2,
                           base_model=enc1,
                           gru=False)
            enc1.num_samples = 15
            enc2.num_samples = 5
            graphsage = SupervisedGraphSage(device, hidden_dimension,
                                            feature_dim, enc2, 'sp-nueral')
            # Add the GraphSAGE model's parameters to the optimization list
            optim_list.append({
                'params':
                filter(lambda p: p.requires_grad, graphsage.parameters())
            })
            optimizer = torch.optim.SGD(optim_list, lr=0.3, weight_decay=0.01)
            optimizer.zero_grad()
            nodes = [i for i in range(sat_problem._variable_num)]
            # sample_length = int(len(nodes)/train_outer_recurrence_num)
            for i in range(train_graph_recurrence_num):
                loss += graphsage.loss(nodes, sat_problem, i <
                                       (train_graph_recurrence_num - 1))
            # else:
            #     # optimizer = torch.optim.SGD(optim_list, lr = 0.3, weight_decay = 0.01)
            #     optimizer = torch.optim.Adam(optim_list, lr = 0.3, weight_decay = 0.01)
            #     optimizer.zero_grad()
            for (k, model) in enumerate(model_list):
                # Initialize the variable state (optionally randomized);
                # batch_replication is how many times the same CNF is replicated
                state = _module(model).get_init_state(graph_map, randomized,
                                                      batch_replication)
                # train_outer_recurrence_num is how many times the same batch
                # is re-trained; the losses accumulate
                for i in torch.arange(train_outer_recurrence_num,
                                      dtype=torch.int32,
                                      device=device):
                    variable_prediction, state = model(init_state=state,
                                                       sat_problem=sat_problem,
                                                       is_training=True)
                    # Compute the sp_aggregator loss
                    # loss += model.compute_loss(is_train, variable_prediction, label, sat_problem._graph_map,
                    #                            sat_problem._batch_variable_map, sat_problem._batch_function_map,
                    #                            sat_problem._edge_feature, sat_problem._meta_data)

                    loss += solver_base._compute_loss(_module(model), None,
                                                      is_train,
                                                      variable_prediction,
                                                      label, sat_problem)

                    for p in variable_prediction:
                        del p

                for s in state:
                    del s

                print('rep: %d, epoch: %d, data segment: %d, loss: %f' %
                      (rep, epoch, seg, loss))
                total_loss[k] += loss.detach().cpu().numpy()
                loss.backward()

            optimizer.step()

        for model in model_list:
            _module(model)._global_step += 1

        # Free the references from the last segment before the next CNF.
        del graph_map
        del batch_variable_map
        del batch_function_map
        del graph_feat
        del label
        del edge_feature
    return total_loss / total_example_num.cpu().numpy()