import json
from functools import partial

import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


def get_degrees(edges, is_directed, length):
    # NOTE: `length` is the number of edges for the conversion loop below,
    # but the undirected branch also iterates node ids 1..length, so that
    # branch additionally assumes node ids are 1-based and covered by it.
    for i in range(length):
        edges[i] = [int(x) for x in edges[i]]
    edges = np.array(edges)
    res = {}
    # directed: 'node_id': [out_deg, in_deg]
    if is_directed == 1:
        for edge in edges:
            s = str(edge[0])
            d = str(edge[1])
            if s in res:
                res[s][0] += 1
            else:
                res[s] = [1, 0]
            if d in res:
                res[d][1] += 1
            else:
                res[d] = [0, 1]
    # undirected: 'node_id': degree
    elif is_directed == 2:
        deg_sum = 0  # renamed from `sum`, which shadowed the builtin
        for i in range(1, length + 1):
            res[str(i)] = len(np.where(edges[:, 0] == i)[0])
            res[str(i)] += len(np.where(edges[:, 1] == i)[0])
            # halved: the edge list is assumed to store each undirected
            # edge in both directions
            res[str(i)] /= 2
            deg_sum += res[str(i)]
            if i % 50000 == 0:
                print(str(i) + '/' + str(length))
                toc()
                tic()
        print(deg_sum)
    return res
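# tic()/toc() are timing helpers defined elsewhere in this repo. A minimal
# sketch consistent with how they are called here (tic starts a stopwatch,
# toc prints the elapsed time); the real implementation may differ:
import time

_timer_start = None

def tic():
    # Start (or restart) the stopwatch.
    global _timer_start
    _timer_start = time.time()

def toc():
    # Print seconds elapsed since the last tic().
    if _timer_start is not None:
        print('elapsed: %.3f s' % (time.time() - _timer_start))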
def vec_to_node(vecs, length, pool_size, cri, fname):
    print('vec_to_node')
    if cri == 1:
        threshold = get_treshold(vecs)  # helper defined elsewhere in the repo
        print('the distance threshold:', threshold)
        print('converting vectors back into a graph by distance threshold...')
        chunks = split_vecs(vecs, pool_size)
        # print(chunks)
        # print(np.array(chunks).shape)
    else:
        print('converting vectors back into a graph by original degrees...')
        d = int(input('recon number: '))
        with open('degrees/recon_degree' + str(d) + '.json', 'r') as j:
            degrees = json.load(j)
        # degrees = get_degrees(make_graph('BlogCatalog/data/edges.csv'), 2)
    ids = np.arange(length)
    V = []
    for i in range(length):
        V.append([list(vecs[i]), [ids[i]]])
    # print(V[0:50])
    tic()
    if cri == 1:
        for i in range(length):
            with poolcontext(processes=pool_size) as pool:
                pool.map(
                    partial(parallel_check_link_dis,
                            v=vecs[i], v_id=i, threshold=threshold),
                    chunks)
            if i % 20 == 0:  # for measuring computational-time tests
                # if i % 5000 == 0:  # for actual executions
                print(str(i) + '/' + str(length))
                toc()
                tic()
    else:
        tic()
        with poolcontext(processes=pool_size) as pool:
            pool.map(
                partial(parallel_check_link_deg,
                        vecs=vecs, degrees=degrees, fname=fname),
                V)
        toc()
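# vec_to_node relies on two helpers defined elsewhere in the repo. These are
# sketches under stated assumptions, not the repo's actual implementations:
# poolcontext wraps multiprocessing.Pool as a context manager, and split_vecs
# partitions the (length x dim) vector array into one chunk per worker.
from contextlib import contextmanager
from multiprocessing import Pool

@contextmanager
def poolcontext(*args, **kwargs):
    # Let multiprocessing.Pool be used in a `with` block.
    pool = Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        pool.terminate()

def split_vecs(vecs, pool_size):
    # Split the vectors into pool_size roughly equal chunks.
    return np.array_split(np.asarray(vecs), pool_size)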
def make_BOW(data, label_dic, label_onehot):
    vectorizer = CountVectorizer()
    print('constructing bag of words...')
    vectorizer.fit_transform(list(data.values()))
    ally_ty = []
    allx_tx = []
    allx_tx_dic = {}
    i = 0
    tmp_dic = {}
    print('appending training data...')
    for k in tqdm(data):
        if k not in label_dic:
            continue
        # flatten the (1, vocab) row so allx_tx stacks into a 2-D matrix
        tmp_vec = vectorizer.transform([data[k]]).toarray()[0]
        allx_tx.append(tmp_vec)
        allx_tx_dic[k] = 0
        tmp_dic[k] = i
        one_hot = label_onehot[label_dic[k]]
        ally_ty.append(one_hot)
        i += 1
    nb_trains = i - 1  # one less than the number of labeled rows appended
    print('appending unlabeled data...')
    for k in tqdm(data):
        if k in label_dic:
            continue
        tmp_vec = vectorizer.transform([data[k]]).toarray()[0]
        allx_tx.append(tmp_vec)
        allx_tx_dic[k] = 0
        tmp_dic[k] = i
        ally_ty.append(np.zeros(len(ally_ty[0])))
        i += 1
    print('Reducing dimensionality with Truncated SVD...')
    tic()
    allx_tx = sp.sparse.csr_matrix(np.array(allx_tx))
    allx_tx = svd(allx_tx, dim=2000, n_iter=1)
    toc()
    for k in allx_tx_dic:
        allx_tx_dic[k] = allx_tx[tmp_dic[k]]
    return sp.sparse.csr_matrix(allx_tx), np.array(ally_ty), allx_tx_dic, nb_trains
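# The svd() helper called above is defined elsewhere in the repo; judging by
# the call site, it is assumed to wrap scikit-learn's TruncatedSVD. A minimal
# sketch consistent with that assumption:
from sklearn.decomposition import TruncatedSVD

def svd(X, dim=2000, n_iter=1):
    # Reduce the sparse bag-of-words matrix X to `dim` dense components.
    return TruncatedSVD(n_components=dim, n_iter=n_iter).fit_transform(X)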
def vec_to_node(vecs, threshold, length, pool_size=1):
    # Simpler variant: reconstruct links by a fixed distance threshold only.
    print('vec_to_node')
    # vecs: length x dim, threshold: dim x 1
    chunks = split_vecs(vecs, pool_size)
    # print(chunks)
    print(np.array(chunks).shape)
    links = []
    print('converting vectors back into a graph...')
    tic()
    for i in range(length):
        # print('node #' + str(i + 1))
        with poolcontext(processes=pool_size) as pool:
            pool.map(
                partial(parallel_check_link,
                        v=vecs[i], v_id=i, threshold=threshold),
                chunks)
        if i % 10 == 0:
            print(str(i) + '/' + str(length))
            toc()
            tic()
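# parallel_check_link (like the _dis/_deg variants earlier) is a worker
# defined elsewhere in the repo; pool.map feeds it one chunk of vectors at a
# time. A hypothetical sketch assuming a scalar Euclidean-distance criterion;
# the real worker likely also tracks each chunk's global offset and writes
# the recovered links to disk rather than returning them:
def parallel_check_link(chunk, v, v_id, threshold):
    links = []
    for j, u in enumerate(chunk):
        # link v_id to every vector in the chunk closer than the threshold
        if np.linalg.norm(np.asarray(u) - np.asarray(v)) < threshold:
            links.append((v_id, j))
    return links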
# Fragment: parse a word2vec-style text embedding file into a
# {node_id: embedding} dict and dump it to JSON. The enclosing file handle
# and loop header were missing from the source; the reconstruction below
# assumes a `for line in f:` loop that skips blank lines.
dic = {}
tic()
for line in f:
    if line.strip() == '':
        continue
    if len(line) < 10:
        # short line: carries only a node id
        node_id = int(line.split()[0])
    else:
        sl = line.split(' ')
        if len(sl) < 2:
            node_id = int(sl[0])  # originally `node_id = sl`, which stored the list
        else:
            # embedding line: strip the trailing newline and parse everything
            # after the first token as floats
            ebd = sl
            ebd[-1] = ebd[-1].replace('\n', '')
            embedding = [float(x) for x in ebd[1:]]
            dic[int(node_id)] = embedding
            # print('[node]: ', node_id)
            # print('[ebd]: ', embedding[0:3], len(embedding))

with open('embeddings.json', 'w') as g:
    print(len(dic))
    json.dump(dic, g)
toc()
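# Hypothetical illustration of the input format the parser above assumes:
# one "<node_id> <v1> <v2> ..." line per node, as produced by word2vec-style
# embedding tools. Running the loop above over such lines would yield
# {1: [0.12, -0.45, 0.08], 2: [0.22, 0.0, -0.11]}.
toy_lines = ['1 0.12 -0.45 0.08\n', '2 0.22 0.0 -0.11\n']
with open('toy_embeddings.txt', 'w') as f:
    f.writelines(toy_lines)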