# Shared imports assumed by the examples below; project-specific helpers such as
# load_data, get_splits, data_utils_cora, Graph, Line, AliasSampling,
# alias_sampler and sampling are defined elsewhere in the respective projects.
import random
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from gensim.models import Word2Vec
from numpy import linalg as la
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize


def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    # row-normalize the adjacency matrix to get the 1-step transition matrix
    scaled_A = A / A.sum(axis=1).reshape(-1, 1)
    size = args.size
    K = args.Kstep
    assert size % K == 0
    dim = int(size / K)
    t1 = time.time()
    A_k = np.identity(scaled_A.shape[0])
    Rep = np.zeros((scaled_A.shape[0], size))
    for i in range(K):
        print("K:", i)
        # k-step transition probabilities
        A_k = np.dot(A_k, scaled_A)
        # positive log-probability ratio shifted by log(1/N), GraRep-style
        prob_trans = np.log(A_k / np.tile(np.sum(A_k, axis=0),
                                          (scaled_A.shape[0], 1))) - np.log(
                                              1.0 / scaled_A.shape[0])
        prob_trans[prob_trans < 0] = 0
        # NaN never compares equal to itself, so use np.isnan to clear the
        # entries coming from all-zero columns
        prob_trans[np.isnan(prob_trans)] = 0
        # truncated SVD: keep the top `dim` components scaled by sqrt(singular value)
        U, S, VT = la.svd(prob_trans)
        Ud = U[:, 0:dim]
        Sd = S[0:dim]
        R_k = np.array(Ud) * np.power(Sd, 0.5).reshape(dim)
        R_k = normalize(R_k, axis=1, norm='l2')
        Rep[:, dim * i:dim * (i + 1)] = R_k[:, :]
    print("done.., cost: {}s".format(time.time() - t1))
    np.save(args.output + ".npy", np.asarray(Rep, dtype=np.float32))
    print("saved.")
def train_py(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)

    # row-normalize the adjacency matrix to get the 1-step transition matrix
    scaled_A = A / A.sum(axis=1).reshape(-1, 1)
    size = args.size
    K = args.Kstep
    assert size % K == 0
    dim = int(size / K)
    t1 = time.time()
    A_k = np.identity(scaled_A.shape[0])
    Rep = np.zeros((scaled_A.shape[0], size))

    # move the dense matrices onto the GPU
    scaled_A = torch.FloatTensor(scaled_A).cuda()
    A_k = torch.FloatTensor(A_k).cuda()
    Rep = torch.FloatTensor(Rep).cuda()
    for i in range(K):
        print("K:", i)
        # torch.dot only handles 1-D tensors; matrix multiplication needs torch.mm
        A_k = torch.mm(A_k, scaled_A)
        prob_trans = torch.log(A_k / torch.sum(A_k, dim=0).repeat(
            scaled_A.shape[0], 1)) - np.log(1.0 / scaled_A.shape[0])
        prob_trans[prob_trans < 0] = 0
        # NaN never compares equal to itself, so use torch.isnan
        prob_trans[torch.isnan(prob_trans)] = 0
        # torch.svd is deprecated in recent PyTorch in favour of torch.linalg.svd
        U, S, VT = torch.svd(prob_trans)
        Ud = U[:, 0:dim]
        Sd = S[0:dim]
        R_k = Ud * torch.pow(Sd, 0.5).view(dim)
        R_k = F.normalize(R_k, p=2, dim=1)
        Rep[:, dim * i:dim * (i + 1)] = R_k[:, :]
    print("done.., cost: {}s".format(time.time() - t1))
    np.save(args.output + ".npy", Rep.cpu().numpy())
    print("saved.")
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    # store the edges as string node ids so they can double as Word2Vec tokens
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)),
                           axis=1).astype(dtype=np.dtype(str))
    print("build")
    t1 = time.time()
    G = {}
    for [i, j] in edges:
        if i not in G:
            G[i] = []
        if j not in G:
            G[j] = []
        G[i].append(j)
        G[j].append(i)
    for node in G:
        G[node] = list(sorted(set(G[node])))
        if node in G[node]:
            G[node].remove(node)

    nodes = list(sorted(G.keys()))
    print("len(G.keys()):", len(G.keys()), "\tnode_num:", A.shape[0])
    corpus = []
    for cnt in range(args.number_walks):
        random.shuffle(nodes)
        for idx, node in enumerate(nodes):
            path = [node]
            # truncated random walk: with probability alpha restart from the
            # starting node, otherwise move to a random neighbour
            while len(path) < args.walk_length:
                cur = path[-1]
                if len(G[cur]) > 0:
                    if random.random() >= args.alpha:
                        path.append(random.choice(G[cur]))
                    else:
                        path.append(path[0])
                else:
                    break
            corpus.append(path)
    t2 = time.time()
    print("cost: {}s".format(t2 - t1))
    print("train...")
    # skip-gram (sg=1) with hierarchical softmax; `size` is the embedding
    # dimension (renamed `vector_size` in gensim >= 4.0)
    model = Word2Vec(corpus,
                     size=args.size,
                     window=args.window,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=args.workers)
    print("done.., cost: {}s".format(time.time() - t2))
    output = []
    for i in range(A.shape[0]):
        # nodes without any edge never enter the walk corpus; give them zeros
        if str(i) in model.wv:
            output.append(model.wv[str(i)])
        else:
            output.append(np.zeros(args.size))
    np.save(args.output + ".npy", np.asarray(output, dtype=np.float32))
    print("saved.")
Example 4
def deepWalk(_windowSize=5, _embeddingSize=128, _walkLength=35):
    X, A, y = data_utils_cora.load_data(dataset='cora')
    graph = Graph(A)

    # build the corpus: one random walk per node
    walk = []
    vector = list(graph.vector)
    for vect in range(len(vector)):
        walk.append(graph.randomWalk(_walkLength, vect))

    # set hyperparameters - Word2Vec with the skip-gram algorithm (sg=1) builds
    # the node embeddings
    model = Word2Vec(walk,
                     size=_embeddingSize,
                     window=_windowSize,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=4)

    # one-hot labels -> integer class ids
    y = np.ravel(np.array([np.where(y[i] == 1)[0] for i in range(y.shape[0])]))
    # embedding matrix in node-id order (one row per node)
    X = np.array([model.wv[str(i)] for i in range(A.shape[0])])

    y_train, y_val, y_test, idx_train, idx_val, idx_test = data_utils_cora.get_splits(
        y)
    x_train, x_val, x_test, idx_train, idx_val, idx_test = data_utils_cora.get_splits(
        X)
    # evaluate the embeddings with a one-vs-rest logistic regression classifier
    clf = LogisticRegression(max_iter=500, multi_class='ovr')
    clf.fit(x_train, y_train)
    print(clf.score(x_train, y_train))
    print(clf.score(x_test, y_test))
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)), axis=1)
    # sample edges proportionally to their weight, and negative nodes from the
    # degree distribution raised to the 0.75 power (as in word2vec / LINE)
    edge_sampler = AliasSampling(probs=A.data / np.sum(A.data))
    node_weights = np.power(np.asarray(A.sum(axis=0)).flatten(), 0.75)
    node_sampler = AliasSampling(probs=node_weights / np.sum(node_weights))

    learning_rate = args.rho
    line = Line(A.shape[0], args.size)
    optimizer = optim.Adadelta(line.parameters(), lr=learning_rate)
    if args.gpu and torch.cuda.is_available():
        line.cuda()

    sampling_time, training_time = 0, 0
    line.train()
    for i in range(args.batch_num):
        t1 = time.time()
        u_i, u_j, label = get_batch(A,
                                    edges=edges,
                                    edge_sampler=edge_sampler,
                                    node_sampler=node_sampler,
                                    batch_size=args.batch_size,
                                    negative=args.negative)
        t2 = time.time()
        sampling_time += t2 - t1

        if args.gpu and torch.cuda.is_available():
            u_i, u_j, label = u_i.cuda(), u_j.cuda(), label.cuda()

        optimizer.zero_grad()
        loss = line(u_i, u_j, label)
        loss.backward()
        optimizer.step()
        training_time += time.time() - t2

        # linearly anneal the learning rate, but never below rho * 1e-4; update
        # the existing optimizer in place so its state is preserved
        if learning_rate > args.rho * 1e-4:
            learning_rate = args.rho * (1 - i / args.batch_num)
        else:
            learning_rate = args.rho * 1e-4
        for group in optimizer.param_groups:
            group['lr'] = learning_rate

        if i % 100 == 0 and i != 0:
            print('Batch_no: {:06d}'.format(i),
                  'loss: {:.4f}'.format(loss.item()),
                  'rho: {:.4f}'.format(learning_rate),
                  'sampling_time: {:.4f}'.format(sampling_time),
                  'training_time: {:.4f}'.format(training_time))
            sampling_time, training_time = 0, 0

    print("done..")
    # L2-normalize each embedding row before saving
    if args.gpu and torch.cuda.is_available():
        np.save(args.output + "_" + str(args.order) + ".npy",
                F.normalize(line.embeddings.cpu().weight).data.numpy())
    else:
        np.save(args.output + "_" + str(args.order) + ".npy",
                F.normalize(line.embeddings.weight).data.numpy())
    print("saved.")
np.random.seed(2018)
torch.manual_seed(2018)


class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        out = self.linear(x)
        # out = F.relu(out)
        return F.log_softmax(out, dim=1)


X, A, y = load_data(dataset='cora')
y_train, y_val, y_test, idx_train, idx_val, idx_test = get_splits('cora', y)

METHOD = "line"
PARA = "_4_0.25" if METHOD == "node2vec" else ""
if METHOD == "line":
    PARA = "_2"

# embeddings = np.genfromtxt("workspace/vec_2nd_wo_norm10000.txt", skip_header=1, dtype=np.float32)[:, 1:]
# from sklearn.preprocessing import normalize
# embeddings = normalize(embeddings, axis=1)

embeddings = np.load("workspace/" + METHOD + "_embedding_cora" + PARA + ".npy")

# embeddings = np.load("workspace/line.tensorflow_7w.npy")
# print(embeddings[0:5])
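
# The rest of the classification script is not shown above. As a minimal sketch
# only (the clf_* names are made up, and y is assumed to hold one-hot label
# rows), the LogisticRegression module defined above is typically trained on
# the loaded embeddings like this:
clf_features = torch.FloatTensor(embeddings)
clf_labels = torch.LongTensor(np.where(y)[1])            # one-hot rows -> class ids
clf = LogisticRegression(clf_features.shape[1], int(clf_labels.max()) + 1)
clf_optimizer = optim.Adam(clf.parameters(), lr=0.01)
for clf_epoch in range(200):
    clf_optimizer.zero_grad()
    clf_loss = F.nll_loss(clf(clf_features[idx_train]), clf_labels[idx_train])
    clf_loss.backward()
    clf_optimizer.step()
clf_preds = clf(clf_features[idx_test]).argmax(dim=1)
print("test accuracy:",
      (clf_preds == clf_labels[idx_test]).float().mean().item())
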
Example 7
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)),
                           axis=1).astype(dtype=np.dtype(str))
    print("build")
    t1 = time.time()
    G, node_samplers, edge_samplers = {}, {}, {}
    for [i, j] in edges:
        if i not in G:
            G[i] = []
        if j not in G:
            G[j] = []
        G[i].append(j)
        G[j].append(i)
    for node in G:
        # sort neighbours numerically so their order matches the column order of
        # the nonzeros in row `node` of A (assuming A has an empty diagonal),
        # which the alias samplers below rely on
        G[node] = list(sorted(set(G[node]), key=int))
        if node in G[node]:
            G[node].remove(node)
        node_samplers[node] = alias_sampler(probs=A[int(node), :].data /
                                            np.sum(A[int(node), :].data))

    # second-order samplers: after a step i -> j, the next move from j weights a
    # return to i by 1/p, nodes also adjacent to i by 1, and all other
    # neighbours by 1/q (node2vec's return / in-out parameters)
    for [i, j] in edges:
        edge_weights = []
        for j_nbr in G[j]:
            if j_nbr == i:
                edge_weights.append(A[int(j), int(j_nbr)] / args.p)
            elif A[int(j_nbr), int(i)] >= 1e-4:
                edge_weights.append(A[int(j), int(j_nbr)])
            else:
                edge_weights.append(A[int(j), int(j_nbr)] / args.q)
        edge_weights = np.asarray(edge_weights, dtype=np.float32)
        edge_samplers[i + "-" + j] = alias_sampler(probs=edge_weights /
                                                   edge_weights.sum())

    nodes = list(sorted(G.keys()))
    print("len(G.keys()):", len(G.keys()), "\tnode_num:", A.shape[0])
    corpus = []
    for cnt in range(args.number_walks):
        random.shuffle(nodes)
        for idx, node in enumerate(nodes):
            path = [node]
            while len(path) < args.walk_length:
                cur = path[-1]
                if len(G[cur]) > 0:
                    # first step uses the node sampler; later steps use the
                    # (prev, cur) edge sampler built above
                    if len(path) == 1:
                        path.append(G[cur][sampling(node_samplers[cur][0],
                                                    node_samplers[cur][1])])
                    else:
                        prev = path[-2]
                        path.append(G[cur][sampling(
                            edge_samplers[prev + "-" + cur][0],
                            edge_samplers[prev + "-" + cur][1])])
                else:
                    break
            corpus.append(path)
    t2 = time.time()
    print("cost: {}s".format(t2 - t1))
    print("train...")
    model = Word2Vec(corpus,
                     size=args.size,
                     window=args.window,
                     min_count=0,
                     sg=1,
                     workers=args.workers)
    print("done.., cost: {}s".format(time.time() - t2))
    output = []
    for i in range(A.shape[0]):
        if str(i) in model.wv:
            output.append(model.wv[str(i)])
        else:
            output.append(np.zeros(args.size))
    np.save(args.output + "_" + str(args.p) + "_" + str(args.q) + ".npy",
            np.asarray(output, dtype=np.float32))
    print("saved.")