def __init__(
     self,
     rw_hops=64,
     subgraph_size=64,
     restart_prob=0.8,
     positional_embedding_size=32,
     step_dist=[1.0, 0.0, 0.0],
 ):
     super(GraphDataset, self).__init__()
     self.rw_hops = rw_hops
     self.subgraph_size = subgraph_size
     self.restart_prob = restart_prob
     self.positional_embedding_size = positional_embedding_size
     self.step_dist = step_dist
     assert abs(sum(step_dist) - 1.0) < 1e-6  # float-safe: step_dist must be a probability distribution
     assert positional_embedding_size > 1
     graphs, _ = dgl.data.utils.load_graphs(
         "data_bin/dgl/lscc_graphs.bin", [0, 1, 2]
     )
     for name in ["cs", "physics"]:
         g = Coauthor(name)[0]
         g.remove_nodes((g.in_degrees() == 0).nonzero().squeeze())
         g.readonly()
         graphs.append(g)
     for name in ["computers", "photo"]:
         g = AmazonCoBuy(name)[0]
         g.remove_nodes((g.in_degrees() == 0).nonzero().squeeze())
         g.readonly()
         graphs.append(g)
     # more graphs are coming ...
     print("Graph loading done")
     self.graphs = graphs
     self.length = sum([g.number_of_nodes() for g in self.graphs])
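
# A minimal sketch of the companion Dataset methods (they are not part of this
# excerpt). It assumes DGL 0.4.x, where
# dgl.contrib.sampling.random_walk_with_restart is available; the subgraph
# construction below is illustrative, not the repo's exact sampling logic.
def __len__(self):
    return self.length

def __getitem__(self, idx):
    # Map the flat index to a (graph, node) pair.
    graph_idx, node_idx = 0, idx
    for g in self.graphs:
        if node_idx < g.number_of_nodes():
            break
        node_idx -= g.number_of_nodes()
        graph_idx += 1
    g = self.graphs[graph_idx]
    # Grow a neighborhood around the seed node via random walk with restart.
    traces = dgl.contrib.sampling.random_walk_with_restart(
        g,
        seeds=[node_idx],
        restart_prob=self.restart_prob,
        max_nodes_per_seed=self.rw_hops,
    )
    sub_nodes = torch.unique(torch.cat(traces[0])).tolist()
    return g.subgraph(sub_nodes)
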
def extract_dataset():
    parser = argparse.ArgumentParser(description='DATA')
    register_data_args(parser)
    args = parser.parse_args()
    dataset_names = [
        'cora', 'citeseer', 'pubmed', 'reddit', 'CoraFull', 'Coauthor_cs',
        'Coauthor_physics', 'AmazonCoBuy_computers', 'AmazonCoBuy_photo'
    ]

    print("Now PATH IS ", os.getcwd())
    for name in dataset_name:
        if name in ['cora', 'citeseer', 'pubmed', 'reddit']:

            args.dataset = name
            print('args.dataset = ', args.dataset)
            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)

            print("Now PATH IS ", os.getcwd())

            data = load_data(args)
            features = data.features
            labels = data.labels
            graph = data.graph
            edges = graph.edges
            train_mask = data.train_mask
            val_mask = data.val_mask
            test_mask = data.test_mask

            n_nodes = features.shape[0]
            n_edges = data.graph.number_of_edges()

            if args.dataset == 'reddit':
                graph, features, labels, train_mask, val_mask, test_mask = cut_graph(
                    graph, n_nodes, n_edges, features, labels, train_mask,
                    val_mask, test_mask, 0.85)


            if name != 'reddit':
                # Build the (E, 2) edge array in one pass instead of growing
                # it row by row with np.append.
                edges_list = np.array([[u, v] for u, v in edges])

            if name == 'reddit':
                edges = graph.edges()
                edge_x = edges[0].numpy().reshape((-1, 1))
                edge_y = edges[1].numpy().reshape((-1, 1))
                edges_list = np.hstack((edge_x, edge_y))

            print('features_shape', features.shape)
            print('labels_shape', labels.shape)
            print('edges_shape', edges_list.shape)

            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)

            # Note: only the train mask is persisted for these datasets.
            np.save('train_mask.npy', train_mask)

            print('Finished writing dataset', name)
            os.chdir('..')
            print('Changed directory to', os.getcwd())

        else:

            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)

            if name == 'CoraFull':
                data = CoraFull()
            elif name == 'Coauthor_cs':
                data = Coauthor('cs')
            elif name == 'Coauthor_physics':
                data = Coauthor('physics')
            elif name == 'AmazonCoBuy_computers':
                data = AmazonCoBuy('computers')
            elif name == 'AmazonCoBuy_photo':
                data = AmazonCoBuy('photo')
            else:
                raise ValueError("No such dataset: {}".format(name))

            graph = data.data[0]
            features = torch.FloatTensor(graph.ndata['feat']).numpy()
            labels = torch.LongTensor(graph.ndata['label']).numpy()

            print('dataset ', name)

            features_shape = features.shape
            labels_shape = labels.shape

            n_nodes = features_shape[0]
            edges_u, edges_v = graph.all_edges()

            edges_u = edges_u.numpy()
            edges_v = edges_v.numpy()

            # Stack source/destination node ids into an (E, 2) array directly
            # instead of growing it row by row with np.append.
            edges_list = np.stack([edges_u, edges_v], axis=1)

            print('features_shape', features_shape)
            print('labels_shape', labels_shape)
            print('edges_shape', edges_list.shape)

            # Mark the first 500 nodes as training examples (no val/test
            # split is generated for these datasets).
            train_mask = np.zeros(n_nodes, dtype=bool)
            train_mask[:500] = True

            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)
            np.save('train_mask.npy', train_mask)

            print('Finished writing dataset', name)
            os.chdir('..')
            print('Changed directory to', os.getcwd())
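
# main() below uses an evaluate() helper that is not shown in this excerpt.
# A minimal sketch under the assumption that it returns the classifier's
# accuracy on the masked nodes:
def evaluate(model, embeds, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(embeds)
        _, preds = torch.max(logits[mask], dim=1)
        correct = (preds == labels[mask]).sum().item()
        return correct / mask.int().sum().item()
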
def main(args):

    torch.manual_seed(1234)

    if args.dataset in ('cora', 'citeseer', 'pubmed'):
        data = load_data(args)
        features = torch.FloatTensor(data.features)

        labels = torch.LongTensor(data.labels)
        in_feats = features.shape[1]
        g = data.graph
        if args.dataset == 'cora':
            g.remove_edges_from(nx.selfloop_edges(g))
            g.add_edges_from(zip(g.nodes(), g.nodes()))
        g = DGLGraph(g)
        attr_matrix = data.features
        labels = data.labels

    else:
        if args.dataset == 'physics':
            data = Coauthor('physics')
        elif args.dataset == 'cs':
            data = Coauthor('cs')
        elif args.dataset == 'computers':
            data = AmazonCoBuy('computers')
        elif args.dataset == 'photo':
            data = AmazonCoBuy('photo')
        else:
            raise ValueError('Unknown dataset: {}'.format(args.dataset))

        g = data[0]
        attr_matrix = g.ndata['feat']
        labels = g.ndata['label']

        features = torch.FloatTensor(g.ndata['feat'])

    ### Keep only the largest connected component (LCC) of the graph
    n_components = 1
    sparse_graph = g.adjacency_matrix_scipy(return_edge_ids=False)
    _, component_indices = sp.csgraph.connected_components(sparse_graph)
    component_sizes = np.bincount(component_indices)
    components_to_keep = np.argsort(
        component_sizes
    )[::-1][:n_components]  # reverse order to sort descending
    nodes_to_keep = [
        idx for (idx, component) in enumerate(component_indices)
        if component in components_to_keep
    ]

    adj_matrix = sparse_graph[nodes_to_keep][:, nodes_to_keep]
    num_nodes = len(nodes_to_keep)
    g = DGLGraph(adj_matrix)
    g = remove_self_loop(g)
    g = add_self_loop(g)

    g.ndata['feat'] = attr_matrix[nodes_to_keep]
    features = torch.FloatTensor(g.ndata['feat'].float())
    if args.dataset == 'cora' or args.dataset == 'pubmed':
        features = features / (features.norm(dim=1) + 1e-8)[:, None]
    g.ndata['label'] = labels[nodes_to_keep]
    labels = torch.LongTensor(g.ndata['label'])

    in_feats = features.shape[1]

    unique_l = np.unique(labels)
    n_classes = len(unique_l)
    n_nodes = g.number_of_nodes()
    n_edges = g.number_of_edges()

    print('Number of nodes', n_nodes, 'Number of edges', n_edges)

    enc = OneHotEncoder()
    enc.fit(labels.reshape(-1, 1))
    ylabels = enc.transform(labels.reshape(-1, 1)).toarray()
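    # get_train_val_test_split (called inside the loops below) presumably
    # consumes these one-hot rows to count examples per class when sampling
    # the splits.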

    for beta in [args.beta]:
        for K in [args.num_clusters]:
            for alpha in [args.alpha]:
                accs = []
                t_st = time.time()

                sets = "imbalanced"

                for k in range(2):  # number of independent training runs
                    #print(k)

                    random_state = np.random.RandomState()
                    if sets == "imbalanced":
                        train_idx, val_idx, test_idx = get_train_val_test_split(
                            random_state,
                            ylabels,
                            train_examples_per_class=None,
                            val_examples_per_class=None,
                            test_examples_per_class=None,
                            train_size=20 * n_classes,
                            val_size=30 * n_classes,
                            test_size=None)
                    elif sets == "balanced":
                        train_idx, val_idx, test_idx = get_train_val_test_split(
                            random_state,
                            ylabels,
                            train_examples_per_class=20,
                            val_examples_per_class=30,
                            test_examples_per_class=None,
                            train_size=None,
                            val_size=None,
                            test_size=None)
                    else:
                        raise ValueError(
                            "No such set configuration (imbalanced/balanced)")

                    n_nodes = len(nodes_to_keep)
                    train_mask = np.zeros(n_nodes)
                    train_mask[train_idx] = 1
                    val_mask = np.zeros(n_nodes)
                    val_mask[val_idx] = 1
                    test_mask = np.zeros(n_nodes)
                    test_mask[test_idx] = 1
                    train_mask = torch.BoolTensor(train_mask)
                    val_mask = torch.BoolTensor(val_mask)
                    test_mask = torch.BoolTensor(test_mask)
                    """
                    Planetoid Split for CORA, CiteSeer, PubMed
                    train_mask = torch.BoolTensor(data.train_mask)
                    val_mask = torch.BoolTensor(data.val_mask)
                    test_mask = torch.BoolTensor(data.test_mask)
                    train_mask2 = torch.BoolTensor(data.train_mask)
                    val_mask2 = torch.BoolTensor(data.val_mask)
                    test_mask2 = torch.BoolTensor(data.test_mask)
                    """

                    if args.gpu < 0:
                        cuda = False

                    else:
                        cuda = True
                        torch.cuda.set_device(args.gpu)
                        features = features.cuda()
                        labels = labels.cuda()
                        train_mask = train_mask.cuda()
                        val_mask = val_mask.cuda()
                        test_mask = test_mask.cuda()

                    gic = GIC(g, in_feats, args.n_hidden, args.n_layers,
                              nn.PReLU(args.n_hidden), args.dropout, K, beta,
                              alpha)

                    if cuda:
                        gic.cuda()

                    gic_optimizer = torch.optim.Adam(
                        gic.parameters(),
                        lr=args.gic_lr,
                        weight_decay=args.weight_decay)

                    # train GIC
                    cnt_wait = 0
                    best = 1e9
                    best_t = 0
                    dur = []

                    for epoch in range(args.n_gic_epochs):
                        gic.train()
                        if epoch >= 3:
                            t0 = time.time()

                        gic_optimizer.zero_grad()
                        loss = gic(features)
                        #print(loss)
                        loss.backward()
                        gic_optimizer.step()

                        if loss < best:
                            best = loss
                            best_t = epoch
                            cnt_wait = 0
                            torch.save(gic.state_dict(), 'best_gic.pkl')
                        else:
                            cnt_wait += 1

                        if cnt_wait == args.patience:
                            #print('Early stopping!')
                            break

                        if epoch >= 3:
                            dur.append(time.time() - t0)

                        #print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
                        #"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                        #n_edges / np.mean(dur) / 1000))

                    # train classifier
                    #print('Loading {}th epoch'.format(best_t))
                    gic.load_state_dict(torch.load('best_gic.pkl'))
                    embeds = gic.encoder(features, corrupt=False)
                    # Row-normalize embeddings (epsilon on the norm, matching
                    # the feature normalization above).
                    embeds = embeds / (embeds.norm(dim=1) + 1e-8)[:, None]
                    embeds = embeds.detach()

                    # create classifier model
                    classifier = Classifier(args.n_hidden, n_classes)
                    if cuda:
                        classifier.cuda()

                    classifier_optimizer = torch.optim.Adam(
                        classifier.parameters(),
                        lr=args.classifier_lr,
                        weight_decay=args.weight_decay)

                    dur = []
                    best_a = 0
                    cnt_wait = 0
                    for epoch in range(args.n_classifier_epochs):
                        classifier.train()
                        if epoch >= 3:
                            t0 = time.time()

                        classifier_optimizer.zero_grad()
                        preds = classifier(embeds)
                        loss = F.nll_loss(preds[train_mask],
                                          labels[train_mask])
                        loss.backward()
                        classifier_optimizer.step()

                        if epoch >= 3:
                            dur.append(time.time() - t0)

                        acc = evaluate(
                            classifier, embeds, labels, val_mask
                        )  #+ evaluate(classifier, embeds, labels, train_mask)

                        if acc > best_a and epoch > 100:
                            best_a = acc
                            best_t = epoch

                            torch.save(classifier.state_dict(),
                                       'best_class.pkl')

                        #print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
                        #"ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                        #acc, n_edges / np.mean(dur) / 1000))

                    acc = evaluate(classifier, embeds, labels, test_mask)
                    accs.append(acc)

                print('=================== ', ' alpha', alpha, ' beta ', beta,
                      'K', K)
                print(args.dataset, ' Acc (mean)', mean(accs), ' (std)',
                      stdev(accs))
                print('=================== time (min)', int(
                    (time.time() - t_st) / 60))
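
# A hedged sketch of an entry point (not part of this excerpt). It registers
# only the flags that main() actually reads; every default below is an
# assumption, not the repo's tuned value.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='GIC')
    register_data_args(parser)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--n-hidden', type=int, default=512)
    parser.add_argument('--n-layers', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--num-clusters', type=int, default=128)
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--beta', type=float, default=100.0)
    parser.add_argument('--gic-lr', type=float, default=1e-3)
    parser.add_argument('--classifier-lr', type=float, default=1e-2)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--n-gic-epochs', type=int, default=2000)
    parser.add_argument('--n-classifier-epochs', type=int, default=1000)
    parser.add_argument('--patience', type=int, default=20)
    main(parser.parse_args())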