def parseData():
    """
    Parse the flag file of the target network and count how many nodes
    carry each flag.
    """
    target = 'georgetown'
    flagFile = '../data/flag_' + target + '.txt'
    tree, total_level, all_leaves = etl.prepare_tree('../data/tree2_' + target)
    flag = {}
    with open(flagFile, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            # Group node ids by their flag value.
            if items[1] in flag:
                flag[items[1]].append(items[0])
            else:
                flag[items[1]] = [items[0]]
    print(flag['3'])
    flagCnt = {}
    for k, v in flag.items():
        flagCnt[k] = len(v)
    print(flagCnt)
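# A hedged illustration (not a file from the repo) of the two-column format
# parseData() consumes: one "<node_id> <flag>" pair per line, e.g.
#
#     0 3
#     1 3
#     2 7
#
# which would yield flag == {'3': ['0', '1'], '7': ['2']} and
# flagCnt == {'3': 2, '7': 1}.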
def main(args):
    """
    Training entrance.
    :param args:
    :return:
    """
    # Verify the parameters.
    if not args.do_train:
        raise ValueError('error.')
    if args.hidden_dim % 2 != 0:
        raise ValueError('hidden_error')
    args.single_dim = args.hidden_dim // 2

    # Select a device, cpu or gpu.
    if args.usecuda:
        device = "cuda:2" if torch.cuda.is_available() else "cpu"
    else:
        device = "cpu"

    # Load the tree and some properties of the tree.
    tree, total_level, all_leaves = etl.prepare_tree(args.data_path)
    # Load the graph.
    graph = etl.prepare_graph(args.network_path)
    # Define the root node.
    root = len(tree) - 1

    # Calc the graph similarity, i.e. the matrix $\mathcal{A}$ in the paper.
    leaves_similarity = etl.get_leaves_similarity(graph)

    # Initialize the result and fix the root node's embedding.
    root_embedding_lower = torch.zeros(1, args.single_dim)
    root_embedding_upper = args.single_circle_range * torch.ones(1, args.single_dim)
    root_embedding = torch.cat((root_embedding_lower, root_embedding_upper), 1)[0]
    res = torch.zeros(len(tree), args.hidden_dim)
    res[root] = root_embedding

    # Initialize the layer dict containing lists of nodes of each layer.
    layerCounter = [[] for _ in range(total_level)]
    for node in tree:
        layerCounter[node.level - 1].append(node.id)

    # Train HASNE layer by layer.
    layerWiseTraining(0, res, args, tree, leaves_similarity, device, layerCounter)

    # Recalc the graph similarity before the depth-first pass.
    leaves_similarity = etl.get_leaves_similarity(graph)
    train_dfs(root, res, args, tree, leaves_similarity, device)

    res_output = os.path.join(args.res_path, "res_" + str(int(time.time())))
    write_to_file(res_output, json.dumps(res.numpy().tolist()))
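# A minimal sketch (an assumption, not part of the repo) of how the argument
# namespace consumed by main() could be built. Only attribute names that
# main() actually reads are declared; every default below is illustrative.
import argparse

def build_args():
    parser = argparse.ArgumentParser(description='HASNE training (sketch)')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--hidden_dim', type=int, default=64)  # must be even
    parser.add_argument('--usecuda', action='store_true')
    parser.add_argument('--data_path', type=str,
                        default='../data/tree2_georgetown')
    parser.add_argument('--network_path', type=str,
                        default='../data/edges_georgetown.txt')
    parser.add_argument('--res_path', type=str, default='../res')
    # Upper bound of the root node's interval on each dimension.
    parser.add_argument('--single_circle_range', type=float, default=100.0)
    return parser.parse_args()

# e.g. main(build_args())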
def visualizationZoom(embedding):
    """
    Visualization for a simplified version.
    :param embedding:
    :return:
    """
    pdf_output = 'path to the results'
    tree, total_level, all_leaves = etl.prepare_tree('path to tree')
    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(1, 1, 1)
    # Which nodes we want to display.
    displayNode = [122, 123]
    for index in range(len(embedding)):
        mark = 0
        # The root node is always displayed.
        if index == 128:
            mark = 1
        for disp in displayNode:
            if disp in tree[index].path:
                mark = 1
                break
        if mark != 1:
            continue
        each = embedding[index]
        start = each[0]
        end = each[1]
        # Draw the node's interval [start, end] as an arc of the spiral
        # x = t*cos(t/10), y = t*sin(t/10).
        tmp = start
        xx = []
        yy = []
        while tmp < end:
            xx.append(float(tmp * math.cos(tmp / 10)))
            yy.append(float(tmp * math.sin(tmp / 10)))
            tmp += math.pi * 0.001
        ax.plot(xx, yy, label='vis', ls='-', lw=10)
    plt.show()
    pp = PdfPages(pdf_output)
    pp.savefig(fig)
    pp.close()
def display(embedding):
    embedding = np.around(embedding, decimals=5)
    # The first 8 dims are the lower bounds, the last 8 the upper bounds.
    lower = embedding[:, :8]
    higher = embedding[:, 8:]
    # Interval width on each dimension.
    diff = higher - lower
    displayfile = '../res/metric_display.txt'
    tree, total_level, all_leaves = etl.prepare_tree('../data/tree2_hamilton')
    tmp = {}
    for i in range(len(diff)):
        tmpl = ['%.03f' % each for each in diff[i].tolist()]
        tmp[i] = str(i) + ' ' + str(json.dumps(tmpl)) + "\r\n"
    # Dump node 2618 and three levels of its descendants.
    write_to_file(displayfile, tmp[2618])
    write_to_file(displayfile, "\r\n\r\n")
    secondLevel = tree[2618].direct_children
    for i in secondLevel:
        write_to_file(displayfile, tmp[i])
    write_to_file(displayfile, "\r\n\r\n")
    thirdLevel = tree[secondLevel[0]].direct_children
    for j in thirdLevel:
        write_to_file(displayfile, tmp[j])
    write_to_file(displayfile, "\r\n\r\n")
    fourthLevel = tree[thirdLevel[0]].direct_children
    for k in fourthLevel:
        write_to_file(displayfile, tmp[k])
import json

import torch

import codes.utils.data_etl as etl

# number of nodes
nums = 4000
treePath = "path to the tree file"
tree, total_level, all_leaves = etl.prepare_tree(treePath)

f = open("path to the results of GNE", encoding='utf-8')
content = f.read()
gne_res = json.loads(content)
embeddings = gne_res['coordinates']
radii = gne_res['radius']
nodesNum = len(radii)

# We draw 10 layers.
layerDict = {i: [] for i in range(1, 11)}
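# A hedged sketch of how layerDict could be filled -- the original file is
# truncated here, so this is an assumption. It groups node ids by their
# 1-based tree level, using the same node.level / node.id attributes the
# other scripts rely on.
for node in tree:
    if node.level in layerDict:
        layerDict[node.level].append(node.id)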
def reconstruction(embedding, target):
    """
    Reconstruction task for the network.
    :param embedding:
    :param target:
    :return:
    """
    singleDim = 32
    flag_file = f"../data/flag_{target}.txt"
    edge_file = f"../data/edges_{target}.txt"
    tree_file = f"../data/tree2_{target}"
    tree, total_level, all_leaves = etl.prepare_tree(tree_file)
    adjM = etl.prepare_graph(edge_file)
    objects = np.array(list(adjM.adj.keys()))
    nodesNum = len(objects)

    def calcLowerBoundDist(lowerBoundEmbedding, singleDim, needSum=True):
        """
        Calculate the pairwise distances based on the lower bounds of the embeddings.
        :param lowerBoundEmbedding:
        :return:
        """
        nodesNum = len(lowerBoundEmbedding)
        emb1 = torch.reshape(lowerBoundEmbedding.t(),
                             (lowerBoundEmbedding.numel(), 1))
        emb1 = emb1.repeat(1, nodesNum)
        emb2 = torch.repeat_interleave(lowerBoundEmbedding.t(),
                                       repeats=nodesNum,
                                       dim=0)
        dimMixedDiff = torch.abs(emb1 - emb2)
        lowerDist = torch.unsqueeze(dimMixedDiff, 0).reshape(
            singleDim, nodesNum, nodesNum)
        if needSum:
            lowerDist = torch.sum(lowerDist, dim=0)
        return lowerDist

    def calcDist(tree, embedding):
        """
        Distance metric.
        :param tree:
        :param embedding:
        :return:
        """
        embeddingInTorch = torch.from_numpy(embedding)
        embeddingInTorchLower, embeddingInTorchHigher = torch.split(
            embeddingInTorch, singleDim, dim=1)
        # Group nodes by layer and collect their lower-bound embeddings.
        layerBasedIndex = {}
        layerBasedTorch = {}
        for i in range(total_level):
            layerTmp = []
            for node in tree:
                if node.level - 1 == i:
                    layerTmp.append(node.id)
            layerBasedTorch[i] = torch.index_select(
                embeddingInTorchLower, dim=0, index=torch.tensor(layerTmp))
            layerBasedIndex[i] = layerTmp
        # Pairwise distances within each layer.
        layerBasedDist = {}
        for each in layerBasedTorch:
            layerBasedDist[each] = calcLowerBoundDist(
                layerBasedTorch[each], singleDim)
        leaves = layerBasedIndex[total_level - 1]
        leavesNum = len(leaves)
        # For every leaf, record its ancestor on each layer.
        leavesParentDict = {}
        for eachLayer in range(total_level):
            leavesParentDict[eachLayer] = []
        for node in tree:
            if node.id < leavesNum:
                for eachLayer in range(total_level):
                    leavesParentDict[eachLayer].append(node.path[eachLayer])
        leavesParentIndexList = {}
        for layerCounter in range(total_level):
            leavesParentIndexList[layerCounter] = [
                layerBasedIndex[layerCounter].index(x)
                for x in leavesParentDict[layerCounter]
            ]
        # The distance between two leaves is the sum of the distances between
        # their ancestors on every layer below the root.
        finalDist = torch.zeros((leavesNum, leavesNum))
        for u in range(leavesNum):
            print('handling u:%d' % u)
            for v in range(u + 1, leavesNum):
                for index in leavesParentIndexList:
                    if index == 0:
                        continue
                    indexu = leavesParentIndexList[index][u]
                    indexv = leavesParentIndexList[index][v]
                    added = finalDist[u][v] + layerBasedDist[index][indexu][indexv]
                    finalDist[u][v] = finalDist[v][u] = added
        return finalDist

    ranksum = nranks = ap_scores = iters = 0
    labels = np.empty(nodesNum)
    distMatrix = calcDist(tree, embedding)
    for obj in objects:
        labels.fill(0)
        neighbors = np.array(list(adjM.adj[obj]))
        if len(neighbors) == 0:
            continue
        objDist = distMatrix[obj]
        objDist[obj] = 1e5
        sorted_dists, sorted_idx = objDist.sort()
        ranks, = np.where(np.in1d(sorted_idx.detach().cpu().numpy(), neighbors))
        # The above gives us the 0-based positions of the neighbors in sorted
        # order. We want to count, for each neighbor, the number of
        # non-neighbors ranked ahead of it.
        ranks += 1
        N = ranks.shape[0]
        # To discount the other neighbors ranked ahead, we subtract N*(N-1)/2.
        # As an example, assume the 0-based ranks of the neighbors are
        #   0, 1, 4, 5, 6, 8
        # The number of non-neighbors ranked ahead of each one is
        #   0+0+2+2+2+3 = 9,
        # i.e. (0 + 1 + 4 + 5 + 6 + 8) - (0 + 1 + 2 + 3 + 4 + 5),
        # where (0 + 1 + ... + (N-1)) == N * (N - 1) / 2. Since the ranks were
        # shifted to be 1-based above, the sum below comes out N larger,
        # giving each neighbor's 1-based rank among the non-neighbors.
        ranksum += ranks.sum() - (N * (N - 1) / 2)
        nranks += ranks.shape[0]
        labels[neighbors] = 1
        ap_scores += average_precision_score(
            labels, -objDist.detach().cpu().numpy())
        iters += 1
    # Report the mean rank and the mean average precision (MAP).
    return float(ranksum) / nranks, float(ap_scores) / iters
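# A hedged usage sketch (the file name and helper below are assumptions, not
# from the repo): load a result file written by main() -- a JSON-encoded list
# of per-node embeddings -- back into a numpy array and run the
# reconstruction evaluation on it.
import json
import numpy as np

def evaluate_saved_result(res_path='../res/res_1600000000',
                          target='georgetown'):
    with open(res_path, encoding='utf-8') as f:
        embedding = np.array(json.loads(f.read()))
    mean_rank, ap = reconstruction(embedding, target)
    print('mean rank: %.4f, MAP: %.4f' % (mean_rank, ap))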
def drawGNE():
    """
    Visualization of the GNE model.
    :return:
    """
    pdf_output = 'path of the result'
    f = open(
        'path of GNE results which can be trained by the project on github',
        encoding='utf-8')
    content = f.read()
    gne_res = json.loads(content)
    embeddings = gne_res['coordinates']
    tree, total_level, all_leaves = etl.prepare_tree('path to the tree file')
    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(1, 1, 1)
    plt.style.use('bmh')
    # Collect the node coordinates.
    xx = []
    yy = []
    for index in range(len(embeddings)):
        each = embeddings[index]
        xx.append(float(each[0]))
        yy.append(float(each[1]))
    # Collect the tree edges along every root-to-node path.
    edges_set = set()
    for node in tree:
        path = node.path
        for i in range(len(path) - 1):
            if (path[i], path[i + 1]) not in edges_set:
                edges_set.add((path[i], path[i + 1]))
    # Draw the edges first, then the nodes on top.
    for each in edges_set:
        xxx = [float(embeddings[each[0]][0]), float(embeddings[each[1]][0])]
        yyy = [float(embeddings[each[0]][1]), float(embeddings[each[1]][1])]
        ax.plot(xxx, yyy, '-', label='debug', linewidth=2, color='#2F54EB')
    ax.plot(xx, yy, 'o', label='debug', marker='o', markersize=20,
            color='#F64C4C')
    plt.show()
    pp = PdfPages(pdf_output)
    pp.savefig(fig)
    pp.close()
def drawPoincare():
    """
    Draw the Poincare visualization.
    :return:
    """
    model = torch.load(
        "path to poincare results, can be trained by gensim or the project on github"
    )
    embeddings = model['embeddings']
    pdf_output = 'path to the result'
    tree, total_level, all_leaves = etl.prepare_tree('path to tree file')
    fig = plt.figure(figsize=(15, 15))
    ax = fig.add_subplot(1, 1, 1)
    # Collect the node coordinates (scaled up for readability).
    xx = []
    yy = []
    for index in range(len(embeddings)):
        each = embeddings[index]
        xx.append(float(each[0] * 1000))
        yy.append(float(each[1] * 1000))
    # Collect the tree edges along every root-to-node path.
    edges_set = set()
    for node in tree:
        path = node.path
        for i in range(len(path) - 1):
            if (path[i], path[i + 1]) not in edges_set:
                edges_set.add((path[i], path[i + 1]))
    # Draw the edges first, then the nodes on top.
    for each in edges_set:
        xxx = [float(embeddings[each[0]][0] * 1000),
               float(embeddings[each[1]][0] * 1000)]
        yyy = [float(embeddings[each[0]][1] * 1000),
               float(embeddings[each[1]][1] * 1000)]
        ax.plot(xxx, yyy, '-', label='debug', linewidth=2, color='#2F54EB')
    ax.plot(xx, yy, 'o', label='debug', marker='o', markersize=15,
            color='#F64C4C')
    plt.show()
    pp = PdfPages(pdf_output)
    pp.savefig(fig)
    pp.close()