# Example #1
# 0
def get_degrees(edges, is_directed, length):
    """Count node degrees from an edge list.

    Fixes a typo: the parameter was declared as ``dges`` while the body
    referenced ``edges``, so every call raised NameError.

    Args:
        edges: list of edge pairs; each entry is cast to a pair of ints.
        is_directed: 1 -> directed graph, 2 -> undirected graph.
        length: number of edges to convert; in the undirected branch it is
            also used as the number of nodes (ids assumed 1..length --
            NOTE(review): confirm callers keep these two counts equal).

    Returns:
        dict keyed by str(node_id): ``[out_degree, in_degree]`` lists when
        directed, or a single (possibly fractional) degree when undirected.
    """
    for i in range(length):
        edges[i] = [int(x) for x in edges[i]]
    edges = np.array(edges)
    res = {}
    if is_directed == 1:
        # directed: 'node_id' -> [out_deg, in_deg]
        for edge in edges:
            s = str(edge[0])
            d = str(edge[1])

            if s in res:
                res[s][0] += 1
            else:
                res[s] = [1, 0]
            if d in res:
                res[d][1] += 1
            else:
                res[d] = [0, 1]

    # undirected: each edge presumably appears once per direction, hence
    # the halving below -- NOTE(review): confirm edge lists are symmetric,
    # otherwise degrees come out fractional.
    elif is_directed == 2:
        total = 0  # renamed from 'sum' to avoid shadowing the builtin
        for i in range(1, length + 1):
            res[str(i)] = len(np.where(edges[:, 0] == i)[0])
            res[str(i)] += len(np.where(edges[:, 1] == i)[0])
            res[str(i)] /= 2
            total += res[str(i)]
            if i % 50000 == 0:  # periodic progress + timing report
                print(str(i) + '/' + str(length))
                toc()
                tic()
        print(total)
    return res
# Example #2
# 0
def vec_to_node(vecs, length, pool_size, cri, fname):
    """Convert node vectors back into graph links via a worker pool.

    Args:
        vecs: sequence of node embedding vectors, indexable by node id.
        length: number of nodes (rows of ``vecs``).
        pool_size: worker-process count for the multiprocessing pool.
        cri: reconstruction criterion -- 1 links nodes whose vector
            distance falls under a computed threshold; any other value
            matches nodes against their original degrees loaded from a
            JSON file chosen interactively.
        fname: output name forwarded to the degree-based worker.

    Side effects: spawns worker pools whose ``parallel_check_link_*``
    helpers record the reconstructed links; prints progress; for
    ``cri != 1`` it prompts on stdin and reads
    ``degrees/recon_degree<N>.json``.
    """
    print('vec_to_node')
    if cri == 1:
        threshold = get_treshold(vecs)
        print('the distance threshold:', threshold)
        print('converting vectors back into a graph by distance threshold...')
        chunks = split_vecs(vecs, pool_size)
    else:
        print('converting vectors back into a graph by original degrees...')
        d = int(input('recon number: '))
        # Fix: the handle was opened with a bare open() and never closed
        # (resource leak); a context manager guarantees closure.
        with open('degrees/' + 'recon_degree' + str(d) + '.json', 'r') as j:
            degrees = json.load(j)
        ids = np.arange(length)
        # Each work item pairs a vector with its node id for the worker.
        V = []
        for i in range(length):
            V.append([list(vecs[i]), [ids[i]]])

    tic()

    if cri == 1:
        # One pool per node: workers scan every chunk for neighbours of
        # vecs[i] closer than the threshold.
        for i in range(length):
            with poolcontext(processes=pool_size) as pool:
                pool.map(
                    partial(parallel_check_link_dis,
                            v=vecs[i],
                            v_id=i,
                            threshold=threshold), chunks)
            if i % 20 == 0:  # for measuring computational-time tests
                # if i % 5000 == 0 : # for actual executions
                print(str(i) + '/' + str(length))
                toc()
                tic()
    else:
        # Degree-based reconstruction: a single pool maps over all nodes.
        tic()
        with poolcontext(processes=pool_size) as pool:
            pool.map(
                partial(parallel_check_link_deg,
                        vecs=vecs,
                        degrees=degrees,
                        fname=fname), V)
        toc()
# Example #3
# 0
def make_BOW(data, label_dic, label_onehot):
    """Build bag-of-words features for labeled and unlabeled documents.

    Args:
        data: dict mapping node key -> raw text document.
        label_dic: dict mapping node key -> label; only labeled keys appear.
        label_onehot: dict mapping label -> one-hot label vector.

    Returns:
        Tuple ``(allx_tx, ally_ty, allx_tx_dic, nb_trains)``: sparse CSR
        feature matrix (SVD-reduced), label array (zero vectors for
        unlabeled rows), key -> reduced-feature-row dict, and a
        training-row count.
    """
    vectorizer = CountVectorizer()
    print('constructing bag of words...')
    # Fits the vocabulary on every document; the transformed matrix
    # returned here is discarded -- rows are re-transformed per key below.
    vectorizer.fit_transform(list(data.values()))
    ally_ty = []        # label rows, in append order
    allx_tx = []        # feature rows, in append order
    allx_tx_dic = {}    # node key -> (filled later) reduced feature row
    i = 0               # running row index into allx_tx
    tmp_dic = {}        # node key -> row index
    print('appending training data...')
    # First pass: labeled keys only, so training rows occupy the prefix.
    for k in tqdm(data):
        if k not in label_dic:
            continue
        tmp_vec = vectorizer.transform([data[k]]).toarray()
        allx_tx.append(tmp_vec)
        allx_tx_dic[k] = 0
        tmp_dic[k] = i
        one_hot = label_onehot[label_dic[k]]
        ally_ty.append(one_hot)
        i += 1

    # NOTE(review): i already equals the number of labeled rows appended,
    # so 'i - 1' looks like an off-by-one -- confirm against callers.
    nb_trains = i-1
    print('appending unlabeled data...')
    # Second pass: unlabeled keys get an all-zero label vector.
    for k in tqdm(data):
        if k in label_dic:
            continue
        tmp_vec = vectorizer.transform([data[k]]).toarray()
        allx_tx.append(tmp_vec)
        allx_tx_dic[k] = 0
        tmp_dic[k] = i
        # Assumes at least one labeled row exists so ally_ty[0] is
        # defined -- TODO confirm.
        ally_ty.append(np.zeros(len(ally_ty[0])))
        i += 1

    print('Reducing dimensionality with Truncated SVD...')
    tic()
    # NOTE(review): each appended row is a (1, vocab) array, making
    # np.array(allx_tx) 3-D; confirm np.matrix handles that as intended.
    allx_tx = sp.sparse.csr_matrix(np.matrix(np.array(allx_tx)))
    allx_tx = svd(allx_tx, dim=2000, n_iter=1)
    toc()
    # Replace the placeholder 0 values with each key's reduced row.
    for k in allx_tx_dic:
        allx_tx_dic[k] = allx_tx[tmp_dic[k]]

    return sp.sparse.csr_matrix(allx_tx), np.array(ally_ty), allx_tx_dic, nb_trains
def vec_to_node(vecs, threshold, length, pool_size=1):
    """Reconstruct graph links from node vectors by a distance threshold.

    NOTE(review): this function appears to be a corrupted merge of two
    routines. From ``if len(line) < 10:`` onward it reads ``line`` and
    ``dic``, neither of which is defined anywhere in this scope, so any
    loop iteration with ``i % 10 != 0`` raises NameError. The trailing
    parsing code (and the commented-out block below it) looks like it
    belongs to an embedding-file reader, not to this function. Needs to
    be re-split against the original source before use.
    """
    print('vec_to_node')
    chunks = split_vecs(vecs, pool_size)
    #vecs: length * dim, thresolds: dim * 1
    # print(chunks)
    print(array(chunks).shape)
    # print(chunks)

    links = []  # NOTE(review): never appended to or returned -- dead?
    print('converting vectors back into a graph...')
    tic()
    for i in range(length):
        # print('node #' + str(i+1))
        # One pool per node: workers check each chunk for links to vecs[i].
        with poolcontext(processes=pool_size) as pool:
            pool.map(
                partial(parallel_check_link,
                        v=vecs[i],
                        v_id=i,
                        threshold=threshold), chunks)
        if i % 10 == 0:
            print(str(i) + '/' + str(length))
            toc()
            tic()
            continue
        # NOTE(review): everything below reads 'line', which is undefined
        # here -- reached (and crashing) whenever i % 10 != 0.
        if len(line) < 10:
            node_id = int(line.split()[0])
        else:
            sl = line.split(' ')
            if len(sl) < 2:
                node_id = sl
            else:
                ebd = sl
                ebd[-1] = ebd[-1].replace('\n', '')
                embedding = [float(x) for x in ebd[1:]]
                # NOTE(review): 'dic' is also undefined in this scope.
                dic[int(node_id)] = embedding
                # print('[node]: ', node_id)
                # print('[ebd]: ', embedding[0:3], len(embedding))

        # if i % 2 == 0: # node_id
        #     print(i)
        #     print('@@: ',line)
        #     node_id = int(line.split()[0])
        # else:   # embedding
        #     ebd = line.split(' ')
        #     ebd[-1] = ebd[-1].replace('\n', '')
        #     print('##: ', ebd[1:])
        #     embedding = [float(x) for x in ebd[1:]]
        #     dic[node_id] = embedding
        # i += 1
    with open('embeddings.json', 'w') as g:
        print(len(dic))
        json.dump(dic, g)
# NOTE(review): stray module-level toc() -- likely the tail of the other
# spliced function; runs at import time as written.
toc()