def main():

    args = parse_args()

    _, _, node_labels = load_data(args)
    print("Loaded dataset")

    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep, header=header, index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    embedding = embedding_df.values
    print(embedding.shape)

    # project to a space with straight euclidean lines
    if dist_fn == "poincare":
        embedding = poincare_ball_to_hyperboloid(embedding)
        embedding = hyperboloid_to_klein(embedding)
    elif dist_fn == "hyperboloid":
        embedding = hyperboloid_to_klein(embedding)
    print(embedding.shape)

    label_percentages, f1_micros, f1_macros = \
        evaluate_node_classification(embedding, node_labels)

    test_results = {}
    for label_percentage, f1_micro, f1_macro in zip(
            label_percentages, f1_micros, f1_macros):
        print("{:.2f}".format(label_percentage),
              "micro = {:.2f}".format(f1_micro),
              "macro = {:.2f}".format(f1_macro))
        test_results.update(
            {"{:.2f}_micro".format(label_percentage): f1_micro})
        test_results.update(
            {"{:.2f}_macro".format(label_percentage): f1_macro})

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)
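
# The geometry helpers used above are defined elsewhere in the repository.
# Below is a minimal sketch of how they might be implemented, assuming
# embeddings are numpy arrays with one point per row and, for hyperboloid
# points, the time-like coordinate stored last:
#   - a Poincare ball point y lifts to the hyperboloid as
#     (2y, 1 + |y|^2) / (1 - |y|^2)
#   - a hyperboloid point (x, t) maps to the Klein ball as x / t,
#     where hyperbolic geodesics become straight euclidean chords.

import numpy as np


def poincare_ball_to_hyperboloid(X):
    # |y|^2 for each row, kept 2-d for broadcasting
    norm_sq = np.sum(X ** 2, axis=-1, keepdims=True)
    return np.concatenate([2 * X, 1 + norm_sq], axis=-1) / (1 - norm_sq)


def hyperboloid_to_klein(X):
    # divide the space-like coordinates by the time-like coordinate
    return X[..., :-1] / X[..., -1:]
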
def main():

    args = parse_args()
    seed = args.seed

    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output,
                                     "seed={:03d}".format(seed),
                                     "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir, "test_non_edges.tsv")

    graph, features, node_labels = load_data(args)
    print("loaded dataset")

    edges = list(graph.edges())
    non_edges = list(nx.non_edges(graph))

    _, (val_edges, val_non_edges), (test_edges, test_non_edges) = \
        split_edges(edges, non_edges, seed)

    for edge in test_edges:
        assert edge in graph.edges() or edge[::-1] in graph.edges()

    graph.remove_edges_from(val_edges + test_edges)
    # ensure that every node appears at least once by adding self loops
    graph.add_edges_from((u, u, {"weight": 0}) for u in graph.nodes())

    print("removed edges")

    nx.write_edgelist(graph, training_edgelist_fn,
                      delimiter="\t", data=["weight"])

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
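
# write_edgelist_to_file is a small helper defined elsewhere; a minimal
# sketch, assuming edges are (u, v) pairs written one per line as
# tab-separated values, matching the .tsv filenames above:

def write_edgelist_to_file(edgelist, filename, delimiter="\t"):
    with open(filename, "w") as f:
        for u, v in edgelist:
            f.write("{}{}{}\n".format(u, delimiter, v))
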
def main():

    args = parse_args()

    _, _, node_labels = load_data(args)
    print("Loaded dataset")

    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep, header=header, index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    embedding = embedding_df.values

    # project to a space with straight euclidean lines
    if dist_fn == "poincare":
        embedding = poincare_ball_to_hyperboloid(embedding)
        embedding = hyperboloid_to_klein(embedding)
    elif dist_fn == "hyperboloid":
        embedding = hyperboloid_to_klein(embedding)

    k = 5
    k_fold_f1_micro, k_fold_f1_macro = \
        evaluate_kfold_label_classification(embedding, node_labels, k=k)

    test_results = {}
    test_results.update({
        "{}-fold-f1_micro".format(k): k_fold_f1_micro,
        "{}-fold-f1_macro".format(k): k_fold_f1_macro
    })

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)
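
# evaluate_kfold_label_classification is defined elsewhere (another script
# unpacks four metrics from a similar helper); a minimal sketch matching
# the two-value signature used here, assuming single-label nodes and a
# logistic regression fit on the embedding, with micro/macro F1 averaged
# over the folds:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


def evaluate_kfold_label_classification(embedding, node_labels, k=5):
    labels = node_labels.flatten()
    f1_micros, f1_macros = [], []
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
    for train_idx, test_idx in splitter.split(embedding, labels):
        clf = LogisticRegression(max_iter=1000)
        clf.fit(embedding[train_idx], labels[train_idx])
        predictions = clf.predict(embedding[test_idx])
        f1_micros.append(f1_score(labels[test_idx], predictions, average="micro"))
        f1_macros.append(f1_score(labels[test_idx], predictions, average="macro"))
    return np.mean(f1_micros), np.mean(f1_macros)
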
def main():

    args = parse_args()
    seed = args.seed

    training_edgelist_dir = os.path.join(args.output, "training_edges")
    removed_edges_dir = os.path.join(args.output, "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir, "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    # choose a timestamp so that roughly 20% of edges fall after it
    pivot_time = get_pivot_time(graph, wanted_ratio=0.2, min_ratio=0.1)
    print("pivot time:", pivot_time)

    edges = list(graph.edges.data())

    _, (val_edges, val_non_edges), (test_edges, test_non_edges) = \
        split_edges(edges, graph, pivot_time)

    for edge in test_edges:
        assert edge in graph.edges() or edge[::-1] in graph.edges()

    graph.remove_edges_from(val_edges + test_edges)
    # self loops are not added here (cf. the random-split script):
    # graph.add_edges_from((u, u, {"weight": 0}) for u in graph.nodes())

    print("removed edges")

    nx.write_edgelist(graph, training_edgelist_fn,
                      delimiter="\t", data=["time"])

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
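
# get_pivot_time is defined elsewhere; a minimal sketch of one plausible
# implementation, assuming every edge carries a "time" attribute and the
# pivot is chosen so that roughly wanted_ratio of edges fall strictly
# after it (the min_ratio fallback is simplified to an assertion here):

def get_pivot_time(graph, wanted_ratio=0.2, min_ratio=0.1):
    times = sorted(t for _, _, t in graph.edges.data("time"))
    pivot = times[int(len(times) * (1 - wanted_ratio))]
    held_out = sum(1 for t in times if t > pivot)
    assert held_out >= min_ratio * len(times), \
        "timestamps too coarse to hold out min_ratio of edges"
    return pivot
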
def main():

    args = parse_args()

    assert not (args.visualise and args.embedding_dim > 2), \
        "Can only visualise two dimensions"
    assert args.embedding_path is not None, \
        "you must specify a path to save the embedding"
    if not args.no_walks:
        assert args.walk_path is not None, \
            "you must specify a path to save walks"

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    graph, features, node_labels = load_data(args)
    print("Loaded dataset")

    # plot_degree_dist(graph, "degree distribution")  # optional diagnostic

    configure_paths(args)
    print("Configured paths")

    # build model
    num_nodes = len(graph)
    model = build_model(num_nodes, args)
    model, initial_epoch = load_weights(model, args)

    optimizer = ExponentialMappingOptimizer(lr=args.lr)
    loss = hyperbolic_softmax_loss(sigma=args.sigma)

    model.compile(optimizer=optimizer, loss=loss,
                  target_tensors=[tf.placeholder(dtype=tf.int32)])
    model.summary()

    callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor="loss",
                      patience=args.patience,
                      verbose=True),
        Checkpointer(epoch=initial_epoch,
                     nodes=sorted(graph.nodes()),
                     embedding_directory=args.embedding_path)
    ]

    positive_samples, negative_samples, probs = \
        determine_positive_and_negative_samples(graph, features, args)

    del features  # remove features reference to free up memory

    if args.use_generator:
        print("Training with data generator with {} worker threads".format(
            args.workers))
        training_generator = TrainingDataGenerator(positive_samples,
                                                   probs,
                                                   model,
                                                   args)
        model.fit_generator(training_generator,
                            workers=args.workers,
                            max_queue_size=10,
                            use_multiprocessing=args.workers > 0,
                            epochs=args.num_epochs,
                            steps_per_epoch=len(training_generator),
                            initial_epoch=initial_epoch,
                            verbose=args.verbose,
                            callbacks=callbacks)
    else:
        print("Training without data generator")
        train_x = np.append(positive_samples, negative_samples, axis=-1)
        train_y = np.zeros([len(train_x), 1, 1], dtype=np.int32)
        model.fit(train_x, train_y,
                  shuffle=True,
                  batch_size=args.batch_size,
                  epochs=args.num_epochs,
                  initial_epoch=initial_epoch,
                  verbose=args.verbose,
                  callbacks=callbacks)

    print("Training complete")

    if args.visualise:
        embedding = model.get_weights()[0]
        if embedding.shape[1] == 3:
            print("projecting to poincare ball")
            embedding = hyperboloid_to_poincare_ball(embedding)
        draw_graph(graph,
                   embedding,
                   node_labels,
                   path="2d-poincare-disk-visualisation.png")
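
# hyperboloid_to_poincare_ball is defined elsewhere; a minimal sketch,
# assuming the time-like coordinate is stored last and using the standard
# stereographic projection (x, t) -> x / (1 + t):

def hyperboloid_to_poincare_ball(X):
    return X[..., :-1] / (1 + X[..., -1:])
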
def main():

    args = parse_args()

    graph, _, _ = load_data(args)
    print("Loaded dataset")

    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep, header=header, index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    # row 0 is the embedding for node 0,
    # row 1 is the embedding for node 1, etc.
    embedding = embedding_df.values

    if dist_fn == "poincare":
        dists = hyperbolic_distance_poincare(embedding)
    elif dist_fn == "hyperboloid":
        dists = hyperbolic_distance_hyperboloid(embedding, embedding)
    else:
        dists = euclidean_distance(embedding)

    test_edges = np.array(list(graph.edges()))
    test_non_edges = np.array(list(nx.non_edges(graph)))

    np.random.seed(args.seed)
    idx = np.random.permutation(len(test_non_edges))[:len(test_edges)]
    test_non_edges = test_non_edges[idx]

    test_results = dict()

    scores = -dists

    (mean_rank_recon, ap_recon, roc_recon) = \
        evaluate_rank_and_AP(scores, test_edges, test_non_edges)
    test_results.update({
        "mean_rank_recon": mean_rank_recon,
        "ap_recon": ap_recon,
        "roc_recon": roc_recon
    })

    map_reconstruction = evaluate_mean_average_precision(
        scores, test_edges, test_non_edges)
    test_results.update({"map_recon": map_reconstruction})

    for k in (1, 3, 5, 10):
        precision_at_k = evaluate_precision_at_k(scores,
                                                 test_edges,
                                                 test_non_edges,
                                                 k=k)
        test_results.update({"precision_{}".format(k): precision_at_k})

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)

    print("done")
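
# The distance helpers are defined elsewhere; minimal dense sketches
# (O(n^2) memory, so the real helpers may work in chunks), using the
# standard identities
#   d(u, v) = arccosh(1 + 2|u - v|^2 / ((1 - |u|^2)(1 - |v|^2)))
# on the Poincare ball and d(u, v) = arccosh(-<u, v>_L) on the
# hyperboloid, with <., .>_L the Minkowski inner product:

import numpy as np


def hyperbolic_distance_poincare(X):
    norm_sq = np.sum(X ** 2, axis=-1)
    diff_sq = np.sum((X[:, None] - X[None, :]) ** 2, axis=-1)
    denom = (1 - norm_sq[:, None]) * (1 - norm_sq[None, :])
    return np.arccosh(1 + 2 * diff_sq / denom)


def hyperbolic_distance_hyperboloid(X, Y):
    # time-like coordinate last; clamp to 1 to absorb rounding error
    minkowski = X[:, :-1] @ Y[:, :-1].T - np.outer(X[:, -1], Y[:, -1])
    return np.arccosh(np.maximum(-minkowski, 1.0))
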
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")

    if check_complete(test_results_filename, args.seed):
        return

    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    graph, _, _ = load_data(args)
    assert not nx.is_directed(graph)
    print("Loaded dataset")
    print()

    seed = args.seed
    random.seed(seed)

    removed_edges_dir = args.removed_edges_dir

    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir, "test_non_edges.tsv")

    print("loading test edges from {}".format(test_edgelist_fn))
    print("loading test non-edges from {}".format(test_non_edgelist_fn))

    test_edges = read_edgelist(test_edgelist_fn)
    test_non_edges = read_edgelist(test_non_edgelist_fn)

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

    (mean_rank_lp, ap_lp, roc_lp) = evaluate_rank_AUROC_AP(
        embedding, test_edges, test_non_edges, args.dist_fn)
    test_results.update({
        "mean_rank_lp": mean_rank_lp,
        "ap_lp": ap_lp,
        "roc_lp": roc_lp
    })

    map_lp, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn, graph_edges=graph.edges())
    test_results.update({"map_lp": map_lp})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 seed,
                                 data=test_results)

    print("done")
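
# read_edgelist is a small helper defined elsewhere; a minimal sketch,
# assuming the tab-separated format produced by write_edgelist_to_file
# and integer node identifiers:

def read_edgelist(filename, delimiter="\t"):
    edges = []
    with open(filename, "r") as f:
        for line in f:
            u, v = line.rstrip("\n").split(delimiter)[:2]
            edges.append((int(u), int(v)))
    return edges
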
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")

    if check_complete(test_results_filename, args.seed):
        return

    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    _, _, node_labels = load_data(args)
    print("Loaded dataset")

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    min_count = 10
    if node_labels.shape[1] == 1:
        # remove any node belonging to an under-represented class
        label_counts = Counter(node_labels.flatten())
        mask = np.array([label_counts[l] >= min_count
                         for l in node_labels.flatten()])
        embedding = embedding[mask]
        node_labels = node_labels[mask]
    else:
        assert node_labels.shape[1] > 1
        # remove under-represented label columns, then any node left
        # with no labels at all
        idx = node_labels.sum(0) >= min_count
        node_labels = node_labels[:, idx]
        idx = node_labels.any(-1)
        embedding = embedding[idx]
        node_labels = node_labels[idx]

    if args.dist_fn == "hyperboloid":
        print("loaded a hyperboloid embedding")
        # print("projecting from hyperboloid to klein")
        # embedding = hyperboloid_to_klein(embedding)
        print("projecting from hyperboloid to poincare")
        embedding = hyperboloid_to_poincare_ball(embedding)
        print("projecting from poincare to klein")
        embedding = poincare_ball_to_klein(embedding)
    elif args.dist_fn == "poincare":
        print("loaded a poincare embedding")
        # print("projecting from poincare to klein")
        # embedding = poincare_ball_to_hyperboloid(embedding)
        # embedding = hyperboloid_to_klein(embedding)
        print("projecting from poincare to klein")
        embedding = poincare_ball_to_klein(embedding)

    test_results = {}

    label_percentages, f1_micros, f1_macros = \
        evaluate_node_classification(embedding, node_labels)
    for label_percentage, f1_micro, f1_macro in zip(
            label_percentages, f1_micros, f1_macros):
        print("{:.2f}".format(label_percentage),
              "micro = {:.2f}".format(f1_micro),
              "macro = {:.2f}".format(f1_macro))
        test_results.update({"{:.2f}_micro".format(label_percentage): f1_micro})
        test_results.update({"{:.2f}_macro".format(label_percentage): f1_macro})

    k = 10
    k_fold_roc, k_fold_f1, k_fold_precision, k_fold_recall = \
        evaluate_kfold_label_classification(embedding, node_labels, k=k)
    test_results.update({
        "{}-fold-roc".format(k): k_fold_roc,
        "{}-fold-f1".format(k): k_fold_f1,
        "{}-fold-precision".format(k): k_fold_precision,
        "{}-fold-recall".format(k): k_fold_recall,
    })

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)
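
# poincare_ball_to_klein is defined elsewhere; a minimal sketch using the
# standard map p -> 2p / (1 + |p|^2). Klein geodesics are straight
# euclidean chords, which is why the linear classifiers in
# evaluate_node_classification can be applied directly:

import numpy as np


def poincare_ball_to_klein(X):
    norm_sq = np.sum(X ** 2, axis=-1, keepdims=True)
    return 2 * X / (1 + norm_sq)
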
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")

    if check_complete(test_results_filename, args.seed):
        return

    test_results_lock_filename = os.path.join(test_results_dir, "test_results.lock")
    touch(test_results_lock_filename)

    graph, _, _ = load_data(args)
    assert not args.directed
    assert not nx.is_directed(graph)
    print("Loaded dataset")
    print()

    random.seed(args.seed)

    # score every true edge in both directions against an equal number
    # of sampled non-edges
    test_edges = list(graph.edges())
    test_edges += [(v, u) for u, v in test_edges]
    num_edges = len(test_edges)

    test_non_edges = sample_non_edges(graph, set(test_edges), num_edges)

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

    (mean_rank_recon, ap_recon, roc_recon) = evaluate_rank_AUROC_AP(
        embedding, test_edges, test_non_edges, args.dist_fn)
    test_results.update({
        "mean_rank_recon": mean_rank_recon,
        "ap_recon": ap_recon,
        "roc_recon": roc_recon
    })

    map_recon, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn)
    test_results.update({"map_recon": map_recon})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)

    print("done")
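
# sample_non_edges is defined elsewhere; a minimal sketch using rejection
# sampling, assuming `edges` is a set of ordered pairs containing both
# directions (as built above):

import random


def sample_non_edges(graph, edges, num_samples):
    nodes = list(graph.nodes())
    non_edges = set()
    while len(non_edges) < num_samples:
        u, v = random.sample(nodes, 2)  # two distinct nodes
        if (u, v) not in edges and (u, v) not in non_edges:
            non_edges.add((u, v))
    return list(non_edges)
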
def main():

    args = parse_args()
    args.directed = True

    seed = args.seed

    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output,
                                     "seed={:03d}".format(seed),
                                     "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir, "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    assert nx.is_directed(graph)

    N = len(graph)
    for u in range(N):
        assert u in graph

    edges = list(graph.edges())
    print("enumerated edges")

    (training_edges,
     (val_edges, val_non_edges),
     (test_edges, test_non_edges)) = split_edges(graph, edges, seed, val_split=0)

    # every node must retain at least one edge in the training graph
    assert len(nx.DiGraph(training_edges)) == N

    print("number of val edges", len(val_edges),
          "number of val non edges", len(val_non_edges))
    print("number of test edges", len(test_edges),
          "number of test non edges", len(test_non_edges))

    # remove val and test edges
    graph.remove_edges_from(val_edges + test_edges)
    print("removed edges")

    nx.write_edgelist(graph, training_edgelist_fn,
                      delimiter="\t", data=["weight"])

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
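
# split_edges is defined elsewhere; a minimal sketch of a random split,
# assuming fixed val/test fractions and non-edges sampled to match
# (val_split=0 above disables the validation set). Note the real helper
# must also guarantee that every node keeps at least one training edge,
# which the assertion in main() checks but this sketch does not enforce:

import random

import networkx as nx


def split_edges(graph, edges, seed, val_split=0.05, test_split=0.1):
    random.seed(seed)
    edges = list(edges)
    random.shuffle(edges)
    num_val = int(len(edges) * val_split)
    num_test = int(len(edges) * test_split)
    val_edges = edges[:num_val]
    test_edges = edges[num_val:num_val + num_test]
    training_edges = edges[num_val + num_test:]
    non_edges = list(nx.non_edges(graph))
    random.shuffle(non_edges)
    val_non_edges = non_edges[:num_val]
    test_non_edges = non_edges[num_val:num_val + num_test]
    return (training_edges,
            (val_edges, val_non_edges),
            (test_edges, test_non_edges))
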