import os
import pickle as pkl
import random

import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN
from scipy.sparse import save_npz

# Project-specific helpers (parse_args, load_data, split_edges, split_nodes,
# sample_non_edges, write_edgelist_to_file, read_edgelist, load_embedding,
# evaluate_rank_AUROC_AP, evaluate_mean_average_precision, build_headnet,
# load_weights, configure_paths, determine_positive_and_negative_samples,
# Checkpointer, TrainingDataGenerator, draw_graph) are assumed to be
# imported from the project's own modules.


def main():

    args = parse_args()
    args.directed = True

    seed = args.seed

    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output,
                                     "seed={:03d}".format(seed),
                                     "removed_edges")

    # exist_ok=True already tolerates existing directories
    os.makedirs(training_edgelist_dir, exist_ok=True)
    os.makedirs(removed_edges_dir, exist_ok=True)

    # training_edgelist_fn = os.path.join(training_edgelist_dir,
    #                                     "graph.npz")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph,
                                    nodelist=sorted(graph),
                                    weight=None).astype(bool)

    nodes = set(range(graph.shape[0]))
    edges = list(zip(*graph.nonzero()))
    print("enumerated edges")
    print("number of edges", len(edges))

    (_,
     (val_edges, val_non_edges),
     (test_edges, test_non_edges)) = split_edges(nodes, edges, seed,
                                                 val_split=0)

    print("number of val edges", len(val_edges),
          "number of val non edges", len(val_non_edges))
    print("number of test edges", len(test_edges),
          "number of test non edges", len(test_non_edges))

    # remove val and test edges from the training graph
    for edge in val_edges + test_edges:
        graph[edge] = 0
    graph.eliminate_zeros()

    # every node must retain at least one in- or out-edge
    assert np.all(np.logical_or(graph.A.any(0).flatten(),
                                graph.A.any(1).flatten()))

    for u, v in val_edges:
        assert not graph[u, v]
    for u, v in test_edges:
        assert not graph[u, v]

    print("removed edges")

    training_sparse_filename = os.path.join(training_edgelist_dir,
                                            "graph.npz")
    print("writing adjacency matrix to", training_sparse_filename)
    save_npz(training_sparse_filename, graph)

    training_edgelist_filename = os.path.join(training_edgelist_dir,
                                              "edgelist.tsv.gz")
    print("writing training edgelist to", training_edgelist_filename)
    graph = graph.astype(int)
    nx.write_weighted_edgelist(nx.from_scipy_sparse_matrix(
        graph, create_using=nx.DiGraph()),
        training_edgelist_filename,
        delimiter="\t")

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
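# write_edgelist_to_file is defined elsewhere in the project. As an
# illustration only, a minimal sketch of what it is assumed to do -- write
# one tab-separated node pair per line -- might look like the following;
# the project's actual implementation may differ.

def write_edgelist_to_file(edgelist, filename):
    """Write (u, v) pairs to `filename`, one tab-separated pair per line."""
    with open(filename, "w") as f:
        for u, v in edgelist:
            f.write("{}\t{}\n".format(u, v))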
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir,
                                         "{}.pkl".format(args.seed))

    # if check_complete(test_results_filename, args.seed):
    #     return

    # test_results_lock_filename = os.path.join(test_results_dir,
    #                                           "test_results.lock")
    # touch(test_results_lock_filename)

    args.directed = True

    graph, _, _ = load_data(args)
    # assert nx.is_directed(graph)
    print("Loaded dataset")
    print()

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph,
                                    nodelist=sorted(graph),
                                    weight=None).astype(bool)

    random.seed(args.seed)

    test_edges = list(zip(*graph.nonzero()))
    num_edges = len(test_edges)

    nodes = set(range(graph.shape[0]))
    del graph

    # sample one non-edge for every true edge
    test_non_edges = sample_non_edges(nodes, set(test_edges), num_edges)

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

    (mean_rank_recon, ap_recon, roc_recon) = evaluate_rank_AUROC_AP(
        embedding, test_edges, test_non_edges, args.dist_fn)

    test_results.update({
        "mean_rank_recon": mean_rank_recon,
        "ap_recon": ap_recon,
        "roc_recon": roc_recon
    })

    map_recon, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn)
    test_results.update({"map_recon": map_recon})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))
    test_results = pd.Series(test_results)
    with open(test_results_filename, "wb") as f:
        pkl.dump(test_results, f, pkl.HIGHEST_PROTOCOL)

    print("done")
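# sample_non_edges is assumed to draw uniformly random ordered node pairs
# that do not appear in the edge set. A minimal rejection-sampling sketch
# under that assumption (not necessarily the project's implementation);
# rejection sampling is cheap here because real graphs are sparse, so most
# random pairs are non-edges.

def sample_non_edges(nodes, edge_set, sample_size):
    """Sample `sample_size` ordered pairs (u, v), u != v, absent from edge_set."""
    nodes = list(nodes)
    non_edges = set()
    while len(non_edges) < sample_size:
        u, v = random.sample(nodes, 2)  # two distinct nodes, no self-loops
        if (u, v) not in edge_set and (u, v) not in non_edges:
            non_edges.add((u, v))
    return list(non_edges)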
def main():

    args = parse_args()
    args.directed = True

    assert not (args.visualise and args.embedding_dim > 2), \
        "Can only visualise two dimensions"
    assert args.embedding_path is not None, \
        "you must specify a path to save embedding"

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    graph, features, node_labels = load_data(args)

    # node labels are only needed for visualisation
    if not args.visualise and node_labels is not None:
        node_labels = None

    print("Loaded dataset")

    configure_paths(args)
    print("Configured paths")

    positive_samples, negative_samples, node_map = \
        determine_positive_and_negative_samples(graph, args)

    N = graph.shape[0]

    if not args.visualise:
        del graph

    # build model
    embedder, model = build_headnet(
        N,
        features,
        args.embedding_dim,
        args.num_negative_samples,
        identity_variance=args.identity_variance,
    )
    model, initial_epoch = load_weights(model, args.embedding_path)
    model.summary()

    best_model_path = os.path.join(args.embedding_path, "best_model.h5")

    callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor="loss",
                      patience=args.patience,
                      mode="min",
                      verbose=True),
        ModelCheckpoint(best_model_path,
                        save_best_only=True,
                        save_weights_only=True,
                        monitor="loss",
                        mode="min"),
        Checkpointer(epoch=initial_epoch,
                     embedding_directory=args.embedding_path,
                     model=model,
                     embedder=embedder,
                     features=features if features is not None
                     else np.arange(N)),
    ]

    print("Training with data generator with {} worker threads".format(
        args.workers))
    training_generator = TrainingDataGenerator(
        features,
        positive_samples,
        negative_samples,
        node_map,
        args,
    )

    model.fit_generator(training_generator,
                        workers=args.workers,
                        use_multiprocessing=False,
                        steps_per_epoch=len(training_generator),
                        epochs=args.num_epochs,
                        initial_epoch=initial_epoch,
                        verbose=args.verbose,
                        callbacks=callbacks)

    print("Training complete")

    if os.path.exists(best_model_path):
        print("Loading best model from", best_model_path)
        model.load_weights(best_model_path)

    print("saving final embedding")

    if features is not None:
        embedding, sigmas = embedder.predict(features)
    else:
        embedding, sigmas = embedder.predict(np.arange(N))

    embedding = np.squeeze(embedding, 1)
    sigmas = np.squeeze(sigmas, 1)

    assert np.isfinite(embedding).all()
    assert np.isfinite(sigmas).all()

    embedding_filename = os.path.join(args.embedding_path,
                                      "final_embedding.csv")
    print("saving embedding to", embedding_filename)
    embedding_df = pd.DataFrame(embedding)
    embedding_df.to_csv(embedding_filename)

    variance_filename = os.path.join(args.embedding_path,
                                     "final_variance.csv")
    print("saving variance to", variance_filename)
    variance_df = pd.DataFrame(sigmas)
    variance_df.to_csv(variance_filename)

    if args.visualise:
        draw_graph(graph,
                   embedding,
                   node_labels,
                   path="2d-poincare-disk-visualisation.png")
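# load_weights is assumed to resume training from the most recent checkpoint
# in the embedding directory, returning the model and the epoch to resume
# from (0 if no checkpoint exists). A hypothetical sketch only: the
# "<epoch>.h5" checkpoint naming is an assumption, not the project's
# confirmed convention.

import re


def load_weights(model, embedding_path):
    """Restore the latest '<epoch>.h5' checkpoint in `embedding_path`, if any."""
    checkpoints = [f for f in os.listdir(embedding_path)
                   if re.match(r"^\d+\.h5$", f)]
    if not checkpoints:
        return model, 0
    latest = max(checkpoints, key=lambda f: int(f.split(".")[0]))
    model.load_weights(os.path.join(embedding_path, latest))
    return model, int(latest.split(".")[0])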
def main():

    args = parse_args()
    args.directed = True

    seed = args.seed
    random.seed(seed)

    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output,
                                     "seed={:03d}".format(seed),
                                     "removed_edges")

    os.makedirs(training_edgelist_dir, exist_ok=True)
    os.makedirs(removed_edges_dir, exist_ok=True)

    # training_edgelist_fn = os.path.join(training_edgelist_dir,
    #                                     "graph.npz")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph,
                                    nodelist=sorted(graph),
                                    weight=None).astype(bool)

    nodes = range(graph.shape[0])

    train_nodes, val_nodes, test_nodes = split_nodes(nodes,
                                                     seed,
                                                     val_split=0.0,
                                                     test_split=0.1)

    removed_nodes = np.append(val_nodes, test_nodes)
    for u in removed_nodes:
        assert u not in train_nodes

    print("num train nodes:", len(train_nodes))
    print("num val nodes:", len(val_nodes))
    print("num test nodes:", len(test_nodes))

    edge_set = set(zip(*graph.nonzero()))

    if len(val_nodes) > 0:
        # an edge belongs to the val set if either endpoint is a val node
        val_edges = [(u, v) for u, v in edge_set
                     if u in val_nodes or v in val_nodes]
        val_non_edges = sample_non_edges(nodes, edge_set, len(val_edges))
    else:
        val_edges = []
        val_non_edges = []
    print("determined val edges")

    if len(test_nodes) > 0:
        test_edges = [(u, v) for u, v in edge_set
                      if u in test_nodes or v in test_nodes]
        test_non_edges = sample_non_edges(
            nodes,
            edge_set.union(val_non_edges),  # do not re-sample val non-edges
            len(test_edges))
    else:
        test_edges = []
        test_non_edges = []
    print("determined test edges")

    # remove val and test edges from the training graph
    for edge in val_edges + test_edges:
        graph[edge] = 0
    graph.eliminate_zeros()

    for u, v in val_edges:
        assert not graph[u, v]
    for u, v in test_edges:
        assert not graph[u, v]
    # removed nodes must now be fully isolated
    for u in removed_nodes:
        assert not graph[u].A.flatten().any()
        assert not graph.T[u].A.flatten().any()

    print("removed edges")

    training_sparse_filename = os.path.join(training_edgelist_dir,
                                            "graph.npz")
    print("writing adjacency matrix to", training_sparse_filename)
    save_npz(training_sparse_filename, graph)

    training_edgelist_filename = os.path.join(training_edgelist_dir,
                                              "edgelist.tsv.gz")
    print("writing training edgelist to", training_edgelist_filename)
    graph = graph.astype(int)
    nx.write_weighted_edgelist(nx.from_scipy_sparse_matrix(
        graph, create_using=nx.DiGraph()),
        training_edgelist_filename,
        delimiter="\t")

    removed_nodes_filename = os.path.join(removed_edges_dir,
                                          "removed_nodes.txt")
    print("writing removed nodes to", removed_nodes_filename)
    with open(removed_nodes_filename, "w") as f:
        f.write("\n".join(str(u) for u in removed_nodes))

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
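# split_nodes is assumed to shuffle the node set with the given seed and
# partition it into train/val/test according to the given fractions. A
# minimal sketch under that assumption; the project's implementation may
# order or round the splits differently.

def split_nodes(nodes, seed, val_split=0.0, test_split=0.1):
    """Randomly partition nodes into train/val/test node arrays."""
    nodes = np.random.RandomState(seed).permutation(list(nodes))
    num_val = int(len(nodes) * val_split)
    num_test = int(len(nodes) * test_split)
    val_nodes = nodes[:num_val]
    test_nodes = nodes[num_val:num_val + num_test]
    train_nodes = nodes[num_val + num_test:]
    return train_nodes, val_nodes, test_nodes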
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir,
                                         "{}.pkl".format(args.seed))

    args.directed = True

    graph, _, _ = load_data(args)
    # assert nx.is_directed(graph)
    print("Loaded dataset")
    print()

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph,
                                    nodelist=sorted(graph),
                                    weight=None).astype(bool)

    N = graph.shape[0]
    print("network has", N, "nodes")

    graph_edges = list(zip(*graph.nonzero()))
    del graph

    seed = args.seed
    random.seed(seed)

    removed_edges_dir = args.removed_edges_dir
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    print("loading test edges from {}".format(test_edgelist_fn))
    print("loading test non-edges from {}".format(test_non_edgelist_fn))

    test_edges = read_edgelist(test_edgelist_fn)
    test_non_edges = read_edgelist(test_non_edgelist_fn)
    # test_non_edges = sample_non_edges(range(N),
    #                                   set(graph_edges).union(test_edges),
    #                                   sample_size=10*len(test_edges))

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

    (mean_rank_lp, ap_lp, roc_lp) = evaluate_rank_AUROC_AP(
        embedding, test_edges, test_non_edges, args.dist_fn)

    test_results.update({
        "mean_rank_lp": mean_rank_lp,
        "ap_lp": ap_lp,
        "roc_lp": roc_lp
    })

    map_lp, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn, graph_edges=graph_edges)
    test_results.update({"map_lp": map_lp})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))
    test_results = pd.Series(test_results)
    with open(test_results_filename, "wb") as f:
        pkl.dump(test_results, f, pkl.HIGHEST_PROTOCOL)

    print("done")
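# read_edgelist is assumed to be the inverse of write_edgelist_to_file
# sketched above: it parses one tab-separated integer node pair per line.
# A minimal sketch under that assumption.

def read_edgelist(filename):
    """Read (u, v) integer pairs from a tab-separated edgelist file."""
    edges = []
    with open(filename, "r") as f:
        for line in f:
            u, v = line.rstrip("\n").split("\t")
            edges.append((int(u), int(v)))
    return edges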