def main():

    args = parse_args()
    args.directed = True

    seed = args.seed
    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output, "seed={:03d}".format(seed),
                                     "removed_edges")

    os.makedirs(training_edgelist_dir, exist_ok=True)
    os.makedirs(removed_edges_dir, exist_ok=True)

    # training_edgelist_fn = os.path.join(training_edgelist_dir,
    # 	"graph.npz")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph, nodelist=sorted(graph),
                                    weight=None).astype(bool)

    nodes = set(range(graph.shape[0]))
    edges = list(zip(*graph.nonzero()))
    print("enumerated edges")
    print("number of edges", len(edges))

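    # split_edges (defined elsewhere) is assumed to return the training
    # split plus (edges, non-edges) pairs for validation and test;
    # val_split=0 should leave the validation split empty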
    (_, (val_edges, val_non_edges),
     (test_edges, test_non_edges)) = split_edges(nodes,
                                                 edges,
                                                 seed,
                                                 val_split=0)

    print("number of val edges", len(val_edges), "number of val non edges",
          len(val_edges))
    print("number of test edges", len(test_edges), "number of test non edges",
          len(test_edges))

    # remove val and test edges
    for edge in val_edges + test_edges:
        graph[edge] = 0
    graph.eliminate_zeros()

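    # sanity check: every node should retain at least one in-edge
    # (column-wise any) or out-edge (row-wise any)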
    assert np.all(
        np.logical_or(graph.A.any(0).flatten(),
                      graph.A.any(1).flatten()))
    for u, v in val_edges:
        assert not graph[u, v]
    for u, v in test_edges:
        assert not graph[u, v]

    print("removed edges")

    training_sparse_filename = os.path.join(training_edgelist_dir, "graph.npz")
    print("writing adjacency matrix to", training_sparse_filename)
    save_npz(training_sparse_filename, graph)

    training_edgelist_filename = os.path.join(training_edgelist_dir,
                                              "edgelist.tsv.gz")
    print("writing training edgelist to", training_edgelist_filename)
    graph = graph.astype(int)
    nx.write_weighted_edgelist(nx.from_scipy_sparse_matrix(
        graph, create_using=nx.DiGraph()),
                               training_edgelist_filename,
                               delimiter="\t")

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    os.makedirs(test_results_dir, exist_ok=True)

    test_results_filename = os.path.join(test_results_dir,
                                         "{}.pkl".format(args.seed))

    # if check_complete(test_results_filename, args.seed):
    # 	return

    # test_results_lock_filename = os.path.join(test_results_dir,
    # 	"test_results.lock")
    # touch(test_results_lock_filename)

    args.directed = True

    graph, _, _ = load_data(args)
    # assert nx.is_directed(graph)
    print("Loaded dataset")
    print()

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph, nodelist=sorted(graph),
                                    weight=None).astype(bool)

    random.seed(args.seed)

    test_edges = list(zip(*graph.nonzero()))
    num_edges = len(test_edges)

    nodes = set(range(graph.shape[0]))
    del graph
    test_non_edges = sample_non_edges(nodes, set(test_edges), num_edges)

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

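    # reconstruction setting: rank every known edge against sampled non-edges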
    (mean_rank_recon, ap_recon,
     roc_recon) = evaluate_rank_AUROC_AP(embedding, test_edges, test_non_edges,
                                         args.dist_fn)

    test_results.update({
        "mean_rank_recon": mean_rank_recon,
        "ap_recon": ap_recon,
        "roc_recon": roc_recon
    })

    map_recon, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn)
    test_results.update({"map_recon": map_recon})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk
         for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))

    test_results = pd.Series(test_results)
    with open(test_results_filename, "wb") as f:
        pkl.dump(test_results, f, pkl.HIGHEST_PROTOCOL)

    print("done")
# Example 3
def main():

	args = parse_args()

	args.directed = True

	assert not (args.visualise and args.embedding_dim > 2), "Can only visualise two dimensions"
	assert args.embedding_path is not None, "you must specify a path to save embedding"

	random.seed(args.seed)
	np.random.seed(args.seed)
	tf.set_random_seed(args.seed)

	graph, features, node_labels = \
		load_data(args)
	if not args.visualise and node_labels is not None:
		node_labels = None
	print ("Loaded dataset")

	configure_paths(args)

	print ("Configured paths")

	positive_samples, negative_samples, node_map = \
		determine_positive_and_negative_samples(graph, args)

	N = graph.shape[0]

	if not args.visualise:
		del graph 

	# build model
	embedder, model = build_headnet(
		N,
		features, 
		args.embedding_dim, 
		args.num_negative_samples, 
		identity_variance=args.identity_variance,
		)
	model, initial_epoch = load_weights(
		model, 
		args.embedding_path)

	model.summary()

	best_model_path = os.path.join(args.embedding_path, 
		"best_model.h5")

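	# stop on NaN loss, stop early when the loss stalls, checkpoint the best
	# weights, and pass the embedder to the custom Checkpointer callback
	# (defined elsewhere), which is assumed to write out intermediate embeddings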
	callbacks = [
		TerminateOnNaN(),
		EarlyStopping(monitor="loss", 
			patience=args.patience, 
			mode="min",
			verbose=True),
		ModelCheckpoint(best_model_path,
			save_best_only=True,
			save_weights_only=True,
			monitor="loss",
			mode="min"),
		Checkpointer(epoch=initial_epoch,
			embedding_directory=args.embedding_path,
			model=model,
			embedder=embedder,
			features=features if features is not None else np.arange(N))
	]

	print ("Training with data generator with {} worker threads".format(args.workers))
	training_generator = TrainingDataGenerator(
		features,
		positive_samples,  
		negative_samples,
		node_map,
		args,
	)

	model.fit_generator(training_generator, 
		workers=args.workers,
		use_multiprocessing=False,
		steps_per_epoch=len(training_generator),
		epochs=args.num_epochs, 
		initial_epoch=initial_epoch, 
		verbose=args.verbose,
		callbacks=callbacks,
	)


	print ("Training complete")
	if os.path.exists(best_model_path):
		print ("Loading best model from", best_model_path)
		model.load_weights(best_model_path)

	print ("saving final embedding")

	if features is not None:
		embedding, sigmas = embedder.predict(features)
	else:
		embedding, sigmas = embedder.predict(np.arange(N))

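		# drop the extra length-one axis returned in this branch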
		embedding = np.squeeze(embedding, 1)
		sigmas = np.squeeze(sigmas, 1)

	assert np.isfinite(embedding).all()
	assert np.isfinite(sigmas).all()

	embedding_filename = os.path.join(args.embedding_path,
		"final_embedding.csv")
	print ("saving embedding to", embedding_filename)
	embedding_df = pd.DataFrame(embedding)
	embedding_df.to_csv(embedding_filename)

	variance_filename = os.path.join(args.embedding_path,
		"final_variance.csv")
	print ("saving variance to", variance_filename)
	variance_df = pd.DataFrame(sigmas)
	variance_df.to_csv(variance_filename)

	if args.visualise:
		draw_graph(graph,
			embedding,
			node_labels, 
			path="2d-poincare-disk-visualisation.png")

# Example 4
def main():

    args = parse_args()
    args.directed = True

    seed = args.seed
    random.seed(seed)

    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output, "seed={:03d}".format(seed),
                                     "removed_edges")

    os.makedirs(training_edgelist_dir, exist_ok=True)
    os.makedirs(removed_edges_dir, exist_ok=True)

    # training_edgelist_fn = os.path.join(training_edgelist_dir,
    # 	"graph.npz")

    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph, nodelist=sorted(graph),
                                    weight=None).astype(bool)

    nodes = range(graph.shape[0])
    train_nodes, val_nodes, test_nodes = split_nodes(nodes,
                                                     seed,
                                                     val_split=0.0,
                                                     test_split=0.1)

    removed_nodes = np.append(val_nodes, test_nodes)
    for u in removed_nodes:
        assert u not in train_nodes

    print("num train nodes:", len(train_nodes))
    print("num val nodes:", len(val_nodes))
    print("num test nodes:", len(test_nodes))

    edge_set = set(zip(*graph.nonzero()))

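    # withhold every edge incident to a val or test node for evaluation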
    if len(val_nodes) > 0:
        val_edges = [(u, v) for u, v in edge_set
                     if u in val_nodes or v in val_nodes]
        val_non_edges = sample_non_edges(nodes, edge_set, len(val_edges))
    else:
        val_edges = []
        val_non_edges = []

    print("determined val edges")

    if len(test_nodes) > 0:
        test_edges = [(u, v) for u, v in edge_set
                      if u in test_nodes or v in test_nodes]
        test_non_edges = sample_non_edges(
            nodes,
            edge_set.union(val_non_edges),  # do not re-draw val non-edges
            len(test_edges))
    else:
        test_edges = []
        test_non_edges = []

    print("determined test edges")

    for edge in val_edges + test_edges:
        graph[edge] = 0
    graph.eliminate_zeros()

    for u, v in val_edges:
        assert not graph[u, v]
    for u, v in test_edges:
        assert not graph[u, v]

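    # removed nodes must have no remaining out-edges (rows) or in-edges (columns)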
    for u in removed_nodes:
        assert not graph[u].A.flatten().any()
        assert not graph.T[u].A.flatten().any()

    print("removed edges")

    training_sparse_filename = os.path.join(training_edgelist_dir, "graph.npz")
    print("writing adjacency matrix to", training_sparse_filename)
    save_npz(training_sparse_filename, graph)

    training_edgelist_filename = os.path.join(training_edgelist_dir,
                                              "edgelist.tsv.gz")
    print("writing training edgelist to", training_edgelist_filename)
    graph = graph.astype(int)
    nx.write_weighted_edgelist(nx.from_scipy_sparse_matrix(
        graph, create_using=nx.DiGraph()),
                               training_edgelist_filename,
                               delimiter="\t")

    removed_nodes_filename = os.path.join(removed_edges_dir,
                                          "removed_nodes.txt")
    print("writing removed nodes to", removed_nodes_filename)
    with open(removed_nodes_filename, "w") as f:
        f.write("\n".join((str(u) for u in removed_nodes)))

    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir,
                                         "{}.pkl".format(args.seed))

    args.directed = True

    graph, _, _ = load_data(args)
    # assert nx.is_directed(graph)
    print("Loaded dataset")
    print()

    if isinstance(graph, nx.DiGraph):
        graph = nx.adjacency_matrix(graph, nodelist=sorted(graph),
                                    weight=None).astype(bool)

    N = graph.shape[0]
    print("network has", N, "nodes")

    graph_edges = list(zip(*graph.nonzero()))
    del graph

    seed = args.seed
    random.seed(seed)

    removed_edges_dir = args.removed_edges_dir

    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    print("loading test edges from {}".format(test_edgelist_fn))
    print("loading test non-edges from {}".format(test_non_edgelist_fn))

    test_edges = read_edgelist(test_edgelist_fn)
    test_non_edges = read_edgelist(test_non_edgelist_fn)
    # test_non_edges = sample_non_edges(range(N),
    # 	set(graph_edges).union(test_edges),
    # 	sample_size=10*len(test_edges))

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

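    # link prediction: rank the held-out test edges against the loaded non-edges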
    (mean_rank_lp, ap_lp,
     roc_lp) = evaluate_rank_AUROC_AP(embedding, test_edges, test_non_edges,
                                      args.dist_fn)

    test_results.update({
        "mean_rank_lp": mean_rank_lp,
        "ap_lp": ap_lp,
        "roc_lp": roc_lp
    })

    map_lp, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn, graph_edges=graph_edges)

    test_results.update({"map_lp": map_lp})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk
         for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))

    test_results = pd.Series(test_results)

    with open(test_results_filename, "wb") as f:
        pkl.dump(test_results, f, pkl.HIGHEST_PROTOCOL)

    print("done")