import random
import time
from collections import defaultdict
from itertools import combinations

import networkx as nx
import numpy as np
from scipy.optimize import check_grad, minimize
from sklearn.neighbors import BallTree

# `distance` and `make_edge` are project-local helpers (hedged sketches are
# given after this function); `Q`, `GradQ`, `PoincareModel`,
# `BinaryPairGenerator`, `SGD`, `LogLoss`, `MSE` and `fringe` are also
# project-local and must be importable for the 'fit*' modes below.


def evaluate_embeddings(embeddings, edges, cda=True, greedy_routing=False,
                        cda_max_vertices=1000, gr_max_pairs=10000,
                        eval_core=False, core_exponent=0.5):
    "evaluate quality of embeddings compared with the real edge set"
    report = []

    # keep only the largest connected component
    true_graph = nx.Graph()
    true_graph.add_edges_from(edges)
    Gcc = sorted(nx.connected_component_subgraphs(true_graph), key=len, reverse=True)
    true_graph = Gcc[0]

    # use a BallTree for efficient nearest-neighbour queries
    print "construct BallTree"
    vertices = list(true_graph.nodes())
    n = len(vertices)
    if eval_core:
        # restrict evaluation to the high-degree "core" vertices
        vertices = [v for v in vertices if true_graph.degree(v) >= n ** core_exponent]
        true_graph = true_graph.subgraph(vertices)
    # canonical edge set: makes the membership tests and the set algebra below valid
    edges = set(make_edge(v1, v2) for v1, v2 in true_graph.edges())
    embeddings_array = np.array([embeddings[v] for v in vertices])
    bt = BallTree(embeddings_array, metric=distance)

    degrees = defaultdict(int)
    for v1, v2 in edges:
        degrees[v1] += 1
        degrees[v2] += 1

    # compute number of correct DIRECTED arcs assuming that degrees are known
    if cda:
        print "compute number of correct directed arcs"
        all_correct_arcs = set()
        cda_vertices = vertices[:]
        if len(cda_vertices) > cda_max_vertices:
            np.random.shuffle(cda_vertices)
            cda_vertices = cda_vertices[:cda_max_vertices]
        for v_i, v in enumerate(cda_vertices):
            start = time.time()
            degree = degrees[v]
            # k = degree + 1 because one of the neighbours is the vertex itself
            dist, ind = bt.query(np.array(embeddings[v]).reshape(1, -1), k=degree + 1)
            neigh = [vertices[i] for i in ind[0].tolist() if vertices[i] != v]
            for ne in neigh:
                if make_edge(v, ne) in edges:
                    all_correct_arcs.add((v, ne))
            finish = time.time()
        # note: when the vertices were subsampled above, this underestimates
        # the true ratio, since the denominator still counts all 2|E| arcs
        report.append(['ratio of correct arcs for known degrees',
                       float(len(all_correct_arcs)) / (2 * len(edges))])

    if greedy_routing:
        print "compute greedy routing efficiency"
        random_pairs = set()
        if n * (n - 1) / 2 <= gr_max_pairs:
            random_pairs = set(combinations(vertices, 2))
        else:
            while len(random_pairs) < gr_max_pairs:
                v1 = np.random.choice(vertices)
                v2 = np.random.choice(vertices)
                if v1 != v2:
                    random_pairs.add((v1, v2))
        total_distribution = defaultdict(int)
        success_distribution = defaultdict(int)
        complete_fails_distribution = defaultdict(int)
        all_path_length_pairs = defaultdict(int)
        for i, pair in enumerate(random_pairs):
            src, dst = pair
            # shortest path is the baseline; slot 0 accumulates the totals
            best_path_length = nx.shortest_path_length(true_graph, source=src, target=dst)
            total_distribution[0] += 1
            total_distribution[best_path_length] += 1
            # greedy path: always move to the unvisited neighbour closest to dst
            curr_src = src
            path_length = 0
            seen = set()
            while curr_src != dst:
                seen.add(curr_src)
                unseen_neighbors = filter(lambda x: x not in seen,
                                          true_graph.neighbors(curr_src))
                if not len(unseen_neighbors):
                    # greedy algorithm is stuck in a 'leaf'
                    path_length = np.nan
                    break

                def curr_distance(v):
                    return distance(embeddings[dst], embeddings[v])

                closest_neigh = min(unseen_neighbors, key=curr_distance)
                path_length += 1
                curr_src = closest_neigh
            if path_length == best_path_length:
                success_distribution[0] += 1
                success_distribution[best_path_length] += 1
            if np.isnan(path_length):
                complete_fails_distribution[0] += 1
                complete_fails_distribution[best_path_length] += 1
            all_path_length_pairs[(best_path_length, path_length)] += 1
        all_success = success_distribution[0]
        all_complete_fails = complete_fails_distribution[0]
        all_total = total_distribution[0]
        all_ratio = float(all_success) / all_total * 100
        print "Complete fails: {} / {} ({:.2f} %)".format(
            all_complete_fails, all_total,
            float(all_complete_fails) / all_total * 100)
        print "Success: {} / {} ({:.2f} %)".format(all_success, all_total, all_ratio)
        for pl in sorted(set(total_distribution.keys()) | set(success_distribution.keys())):
            if pl == 0:
                continue
            total = total_distribution.get(pl, 0)
            success = success_distribution.get(pl, 0)
            ratio = float(success) / total * 100
            print "Success at path length = {}: {} / {} ({:.2f} %)".format(
                pl, success, total, ratio)

    if False:  # depends on R, bad for subgraphs -- not used
        n = len(vertices)
        R = 2 * np.log(n)
        coshR = np.cosh(R)
        predicted_edges = set()
        print "predict edges"
        for v in vertices:
            coords = np.array(embeddings[v]).reshape(1, -1)
            neigh_idx = bt.query_radius(coords, R)
            neigh = [vertices[i] for i in neigh_idx[0].tolist() if vertices[i] != v]
            predicted_edges.update([make_edge(v, ne) for ne in neigh])
        report.append(['total_predicted_edges', len(predicted_edges)])
        # contingency matrix
        print "compute contingency matrix"
        report.append(['true positive', len(edges & predicted_edges)])
        report.append(['false positive', len(predicted_edges - edges)])
        report.append(['false negative', len(edges - predicted_edges)])
        report.append(['true negative', n * (n - 1) / 2 - len(edges | predicted_edges)])

    return report
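# The two helpers used above are project-local and not shown in this file.
# A minimal sketch under assumptions: `make_edge` is assumed to canonicalize a
# vertex pair so undirected edges compare equal regardless of order, and
# `distance` is assumed to be the hyperbolic distance between points in native
# polar coordinates (r, phi), via
# cosh d = cosh r1 cosh r2 - sinh r1 sinh r2 cos(dphi).
def make_edge(v1, v2):
    # canonical (sorted) form of an undirected edge
    return (v1, v2) if v1 <= v2 else (v2, v1)


def distance(p1, p2):
    # hyperbolic distance in the native (r, phi) representation
    r1, phi1 = p1
    r2, phi2 = p2
    # angle difference wrapped into [0, pi]
    dphi = np.pi - np.abs(np.pi - np.abs(phi1 - phi2))
    arg = np.cosh(r1) * np.cosh(r2) - np.sinh(r1) * np.sinh(r2) * np.cos(dphi)
    # clip guards against rounding pushing the arccosh argument below 1
    return np.arccosh(np.clip(arg, 1.0, None))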
def find_embeddings(vertices, edges, mode, learning_rate=0.1, n_epoch=100,
                    ratio_to_second=2.0, ratio_between_first=1.0,
                    ratio_random=1.0, silent=False):
    "find (r, phi) for each vertex"
    vertices = list(vertices)
    n = len(vertices)
    R = 2 * np.log(n)
    print "mode: {}".format(mode)
    np.random.seed(0)

    degrees = defaultdict(int)
    print "count degrees"
    for v1, v2 in edges:
        degrees[v1] += 1
        degrees[v2] += 1

    if mode == "random":
        # phi = rand(0, 2*pi), r = rand(0, R)
        return {v: (np.random.uniform(0.0, R), np.random.uniform(0.0, 2 * np.pi))
                for v in vertices}
    elif mode == "degrees":
        # phi = rand(0, 2*pi), r = 2*log(n/k); float() avoids integer division
        return {v: (2 * np.log(float(n) / degrees[v]),
                    np.random.uniform(0.0, 2 * np.pi))
                for v in vertices}
    elif mode.startswith("fit"):
        # initial point: degree-based radii, random angles
        x0 = []
        for (r, phi) in zip([2 * np.log(float(n) / degrees[v]) for v in vertices],
                            [np.random.uniform(0.0, 2 * np.pi) for v in vertices]):
            x0.append(r)
            x0.append(phi)
        x0 = np.array(x0)

        # "nedges" are negative examples: vertex pairs that are NOT edges
        nedges = set()
        all_nedges = set()
        for (v1, v2) in combinations(vertices, 2):
            e = make_edge(v1, v2)
            if e not in edges:
                all_nedges.add(e)
        if mode == "fit_random":
            a = list(all_nedges)
            random.shuffle(a)
            nedges = set(a[:len(edges)])
        elif mode == "fit_degrees":
            K = float(ratio_to_second)      # ratio of nedges to second neighbours
            L = float(ratio_between_first)  # ratio of nedges between first neighbours
            M = float(ratio_random)         # ratio of random nedges
            G = nx.Graph()
            G.add_edges_from(edges)
            srt_vertices = sorted(degrees.keys(), key=lambda v: -degrees[v])
            shuf_vertices = srt_vertices[:]
            random.shuffle(shuf_vertices)
            for v in srt_vertices:
                # first and second neighbourhoods of v
                first_neigh = set(G.neighbors(v))
                second_neigh = set()
                for neigh in first_neigh:
                    second_neigh.update(G.neighbors(neigh))
                second_neigh.remove(v)
                n_vertex_nedges = 0
                # from v to second neighbours
                for i, sec_n in enumerate(second_neigh):
                    if i + 1 > degrees[v] * K:
                        continue
                    e = make_edge(v, sec_n)
                    if e not in nedges:
                        nedges.add(e)
                        n_vertex_nedges += 1
                # between first neighbours
                for j, pair in enumerate(combinations(first_neigh, 2)):
                    if j + 1 > degrees[v] * L:
                        continue
                    v1, v2 = pair
                    e = make_edge(v1, v2)
                    if e not in nedges:
                        nedges.add(e)
                # random nedges
                max_n_random_vertices = int(degrees[v] * M)
                n_random_vertices = 0
                for rand_v in shuf_vertices:
                    if n_random_vertices >= max_n_random_vertices:
                        break
                    e = make_edge(v, rand_v)
                    if e not in nedges and e not in edges:
                        nedges.add(e)
                        n_random_vertices += 1
        else:
            nedges = all_nedges.copy()
        print "number of nedges={}".format(len(nedges))
        q = Q(vertices, edges, nedges)
        grad_q = GradQ(vertices, edges, nedges)

        if mode == "fit_degrees_sgd":
            print "Learning rate: {}".format(learning_rate)
            print "Ratio to second: {}".format(ratio_to_second)
            print "Ratio between first: {}".format(ratio_between_first)
            print "Ratio random: {}".format(ratio_random)
            G = nx.Graph()
            G.add_edges_from(edges)

            # construct a connected(!) core of high-degree vertices
            core_exponent = 0.4
            core_vertices, fringe_vertices = [], []
            for v in vertices:
                if degrees[v] >= n ** core_exponent:
                    core_vertices.append(v)
                else:
                    fringe_vertices.append(v)
            # add high-degree fringe vertices until the core is connected
            fringe_vertices.sort(key=lambda v: -degrees[v])
            while not nx.is_connected(G.subgraph(core_vertices)):
                core_vertices.append(fringe_vertices.pop(0))
            print "Core size: {}".format(len(core_vertices))
            G_core = G.subgraph(core_vertices)
            print "Is core connected:", nx.is_connected(G_core)

            #loss_function = MSE(binary_edges=True)
            loss_function = LogLoss(binary_edges=True)
            optimizer = SGD(n_epoch=n_epoch, learning_rate=learning_rate,
                            verbose=not silent)

            # embed the core first, then grow it by fixed-size fringes,
            # keeping already-embedded vertices fixed at each step;
            # `fringe` is project-local, see the sketch after this function
            FRINGE_FRACTION = 0.1
            max_fringe_size = int(G.number_of_nodes() * FRINGE_FRACTION)
            curr_graph = G.subgraph(core_vertices)
            curr_core_vertices = set(core_vertices)
            curr_embedding_model = PoincareModel(curr_graph, fit_radius=False)
            curr_pair_generator = BinaryPairGenerator(curr_graph, batch_size=1)
            optimizer.optimize_embedding(curr_embedding_model, loss_function,
                                         curr_pair_generator)
            for i in range(int(1 / FRINGE_FRACTION) + 1):
                total_fringe = fringe(G, curr_core_vertices)
                fringe_vertices = set(
                    sorted(total_fringe, key=lambda v: -G.degree(v))[:max_fringe_size])
                if not fringe_vertices:
                    break
                curr_graph = G.subgraph(curr_core_vertices | fringe_vertices)
                curr_embedding_model = PoincareModel(
                    curr_graph, fit_radius=False,
                    init_embedding=curr_embedding_model)
                curr_pair_generator = BinaryPairGenerator(curr_graph, batch_size=1)
                optimizer.optimize_embedding(curr_embedding_model, loss_function,
                                             curr_pair_generator,
                                             fixed_vertices=curr_core_vertices)
                curr_core_vertices |= fringe_vertices
            embedding_model = curr_embedding_model
            """
            core_embedding_model = PoincareModel(G_core, fit_radius=False)
            core_pair_generator = BinaryPairGenerator(G_core, batch_size=1)
            optimizer.optimize_embedding(core_embedding_model, loss_function, core_pair_generator)
            #optimizer = SGD(n_epoch=n_epoch, learning_rate=learning_rate, verbose=not silent)
            embedding_model = PoincareModel(G, fit_radius=False, init_embedding=core_embedding_model)
            pair_generator = BinaryPairGenerator(G, batch_size=1)
            optimizer.optimize_embedding(embedding_model, loss_function, pair_generator, fixed_vertices=core_vertices)
            #print "Radius before: {}".format(embedding_model.embedding['radius'])
            #print "Radius after: {}".format(embedding_model.embedding['radius'])
            """
            return (embedding_model.embedding["vertices"],
                    {"core": list(G.edges())})
        else:
            print "Check gradient: ", check_grad(q, grad_q, x0)
            res = minimize(q, x0, method="BFGS", jac=grad_q)
            x = res.x
            retval = {}
            for i in range(len(vertices)):
                r = x[2 * i]
                phi = x[2 * i + 1]
                retval[vertices[i]] = (r, phi)
            return retval
    else:
        raise Exception("unknown mode")
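# `fringe` above is also project-local; a plausible sketch, assuming it returns
# the vertices adjacent to the current core but not yet in it:
def fringe(G, core_vertices):
    result = set()
    for v in core_vertices:
        result.update(G.neighbors(v))
    return result - set(core_vertices)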
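# A minimal end-to-end usage sketch on a synthetic graph (hypothetical data;
# the 'fit*' modes additionally need the project-local Q, GradQ, PoincareModel,
# BinaryPairGenerator, SGD and LogLoss to be importable):
if __name__ == '__main__':
    g = nx.barabasi_albert_graph(200, 2, seed=0)
    toy_edges = set(make_edge(v1, v2) for v1, v2 in g.edges())
    emb = find_embeddings(g.nodes(), toy_edges, mode='degrees')
    for row in evaluate_embeddings(emb, toy_edges, cda=True, greedy_routing=True):
        print row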