Example #1
def main():

    args = parse_args()

    _, _, node_labels = load_data(args)
    print("Loaded dataset")
    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep,
                               header=header,
                               index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    embedding = embedding_df.values

    print(embedding.shape)  # shape as loaded from file

    # project to a space with straight euclidean lines
    if dist_fn == "poincare":
        embedding = poincare_ball_to_hyperboloid(embedding)
        embedding = hyperboloid_to_klein(embedding)
    elif dist_fn == "hyperboloid":
        embedding = hyperboloid_to_klein(embedding)

    print(embedding.shape)  # shape after any projection to the Klein model

    label_percentages, f1_micros, f1_macros = \
        evaluate_node_classification(embedding, node_labels)

    test_results = {}
    for label_percentage, f1_micro, f1_macro in zip(label_percentages,
                                                    f1_micros, f1_macros):
        print("{:.2f}".format(label_percentage),
              "micro = {:.2f}".format(f1_micro),
              "macro = {:.2f}".format(f1_macro))
        test_results.update(
            {"{:.2f}_micro".format(label_percentage): f1_micro})
        test_results.update(
            {"{:.2f}_macro".format(label_percentage): f1_macro})

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir,
                                              "test_results.lock")

    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))
    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)
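
Examples #1 and #3 depend on projection helpers that the listing does not show. As a reference, here is a minimal NumPy sketch of `poincare_ball_to_hyperboloid` and `hyperboloid_to_klein`, assuming unit curvature and the time-like coordinate stored first; the project's own convention may differ.

import numpy as np

def poincare_ball_to_hyperboloid(p):
    # map points inside the unit ball onto the hyperboloid:
    # x_0 = (1 + |p|^2) / (1 - |p|^2),  x_i = 2 p_i / (1 - |p|^2)
    norm_sq = np.sum(p ** 2, axis=-1, keepdims=True)
    x0 = (1. + norm_sq) / (1. - norm_sq)
    xi = 2. * p / (1. - norm_sq)
    return np.concatenate([x0, xi], axis=-1)

def hyperboloid_to_klein(x):
    # divide the space-like coordinates by the time-like one
    return x[..., 1:] / x[..., :1]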
Example #2
def main():

    args = parse_args()

    seed = args.seed
    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output, "seed={:03d}".format(seed),
                                     "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, features, node_labels = load_data(args)
    print("loaded dataset")

    edges = list(graph.edges())
    non_edges = list(nx.non_edges(graph))

    _, (val_edges, val_non_edges), (test_edges, test_non_edges) = \
        split_edges(edges, non_edges, seed)

    for edge in test_edges:
        assert edge in graph.edges() or edge[::-1] in graph.edges()

    graph.remove_edges_from(val_edges + test_edges)
    # ensure that every node appears at least once by adding self-loops
    graph.add_edges_from((u, u, {"weight": 0}) for u in graph.nodes())

    print("removed edges")

    nx.write_edgelist(graph,
                      training_edgelist_fn,
                      delimiter="\t",
                      data=["weight"])
    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
Example #3
def main():

    args = parse_args()

    _, _, node_labels = load_data(args)
    print("Loaded dataset")
    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep,
                               header=header,
                               index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    embedding = embedding_df.values

    # project to a space with straight euclidean lines
    if dist_fn == "poincare":
        embedding = poincare_ball_to_hyperboloid(embedding)
        embedding = hyperboloid_to_klein(embedding)
    elif dist_fn == "hyperboloid":
        embedding = hyperboloid_to_klein(embedding)

    k = 5
    k_fold_f1_micro, k_fold_f1_macro = \
        evaluate_kfold_label_classification(embedding, node_labels, k=k)

    test_results = {}
    test_results.update({
        "{}-fold-f1-micro".format(k): k_fold_f1_micro,
        "{}-fold-f1-macro".format(k): k_fold_f1_macro
    })

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir,
                                              "test_results.lock")

    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))
    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)
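
`evaluate_kfold_label_classification` is not shown. A sketch of one way to implement it with scikit-learn, assuming single-label targets and mean micro/macro F1 over the folds (the project's version may handle multi-label data differently):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def evaluate_kfold_label_classification(embedding, labels, k=5):
    labels = labels.flatten()
    micros, macros = [], []
    for train_idx, test_idx in StratifiedKFold(
            n_splits=k, shuffle=True).split(embedding, labels):
        clf = LogisticRegression(max_iter=1000)
        clf.fit(embedding[train_idx], labels[train_idx])
        pred = clf.predict(embedding[test_idx])
        micros.append(f1_score(labels[test_idx], pred, average="micro"))
        macros.append(f1_score(labels[test_idx], pred, average="macro"))
    return np.mean(micros), np.mean(macros)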
Example #4
def main():

    args = parse_args()

    seed = args.seed
    training_edgelist_dir = os.path.join(args.output, "training_edges")
    removed_edges_dir = os.path.join(args.output, "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")
    pivot_time = get_pivot_time(graph, wanted_ratio=0.2, min_ratio=0.1)
    print(pivot_time)
    edges = list(graph.edges.data())

    _, (val_edges, val_non_edges), (test_edges, test_non_edges) = split_edges(
        edges, graph, pivot_time)

    for edge in test_edges:
        assert edge in graph.edges() or edge[::-1] in graph.edges()

    graph.remove_edges_from(val_edges + test_edges)
    # note: unlike Example #2, self-loops are not added back here
    print("removed edges")

    nx.write_edgelist(graph,
                      training_edgelist_fn,
                      delimiter="\t",
                      data=["time"])
    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")
Example #5
File: main.py Project: Jappy0/heat
def main():

	args = parse_args()

	assert not (args.visualise and args.embedding_dim > 2), "Can only visualise two dimensions"
	assert args.embedding_path is not None, "you must specify a path to save embedding"
	if not args.no_walks:
		assert args.walk_path is not None, "you must specify a path to save walks"

	random.seed(args.seed)
	np.random.seed(args.seed)
	tf.set_random_seed(args.seed)

	graph, features, node_labels = load_data(args)
	print ("Loaded dataset")


	configure_paths(args)

	print ("Configured paths")

	# build model
	num_nodes = len(graph)
	
	model = build_model(num_nodes, args)
	model, initial_epoch = load_weights(model, args)
	optimizer = ExponentialMappingOptimizer(lr=args.lr)
	loss = hyperbolic_softmax_loss(sigma=args.sigma)
	model.compile(optimizer=optimizer, 
		loss=loss, 
		target_tensors=[tf.placeholder(dtype=tf.int32)])
	model.summary()

	callbacks = [
		TerminateOnNaN(),
		EarlyStopping(monitor="loss", 
			patience=args.patience, 
			verbose=True),
		Checkpointer(epoch=initial_epoch, 
			nodes=sorted(graph.nodes()), 
			embedding_directory=args.embedding_path)
	]			

	positive_samples, negative_samples, probs = \
			determine_positive_and_negative_samples(graph, 
			features, args)

	del features # remove features reference to free up memory

	if args.use_generator:
		print ("Training with data generator with {} worker threads".format(args.workers))
		training_generator = TrainingDataGenerator(positive_samples,  
				probs,
				model,
				args)

		model.fit_generator(training_generator, 
			workers=args.workers,
			max_queue_size=10, 
			use_multiprocessing=args.workers>0, 
			epochs=args.num_epochs, 
			steps_per_epoch=len(training_generator),
			initial_epoch=initial_epoch, 
			verbose=args.verbose,
			callbacks=callbacks
		)

	else:
		print ("Training without data generator")

		train_x = np.append(positive_samples, negative_samples, axis=-1)
		train_y = np.zeros([len(train_x), 1, 1], dtype=np.int32 )

		model.fit(train_x, train_y,
			shuffle=True,
			batch_size=args.batch_size, 
			epochs=args.num_epochs, 
			initial_epoch=initial_epoch, 
			verbose=args.verbose,
			callbacks=callbacks
		)

	print ("Training complete")

	if args.visualise:
		embedding = model.get_weights()[0]
		if embedding.shape[1] == 3:
			print ("projecting to poincare ball")
			embedding = hyperboloid_to_poincare_ball(embedding)
		draw_graph(graph, 
			embedding, 
			node_labels, 
			path="2d-poincare-disk-visualisation.png")
Example #6
def main():

    args = parse_args()

    graph, _, _ = load_data(args)
    print("Loaded dataset")

    dist_fn = args.dist_fn

    sep = ","
    header = "infer"
    if dist_fn == "euclidean":
        sep = " "
        header = None

    embedding_df = pd.read_csv(args.embedding_filename,
                               sep=sep,
                               header=header,
                               index_col=0)
    embedding_df = embedding_df.reindex(sorted(embedding_df.index))
    # row 0 is embedding for node 0
    # row 1 is embedding for node 1 etc...
    embedding = embedding_df.values

    if dist_fn == "poincare":
        dists = hyperbolic_distance_poincare(embedding)
    elif dist_fn == "hyperboloid":
        dists = hyperbolic_distance_hyperboloid(embedding, embedding)
    else:
        dists = euclidean_distance(embedding)

    test_edges = np.array(list(graph.edges()))
    test_non_edges = np.array(list(nx.non_edges(graph)))

    np.random.seed(args.seed)
    idx = np.random.permutation(len(test_non_edges))[:len(test_edges)]
    test_non_edges = test_non_edges[idx]

    test_results = dict()

    scores = -dists

    (mean_rank_recon, ap_recon,
     roc_recon) = evaluate_rank_and_AP(scores, test_edges, test_non_edges)

    test_results.update({
        "mean_rank_recon": mean_rank_recon,
        "ap_recon": ap_recon,
        "roc_recon": roc_recon
    })

    map_reconstruction = evaluate_mean_average_precision(
        scores, test_edges, test_non_edges)

    test_results.update({"map_recon": map_reconstruction})

    for k in (1, 3, 5, 10):
        precision_at_k = evaluate_precision_at_k(scores,
                                                 test_edges,
                                                 test_non_edges,
                                                 k=k)
        test_results.update({"precision_{}".format(k): precision_at_k})

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")
    test_results_lock_filename = os.path.join(test_results_dir,
                                              "test_results.lock")
    touch(test_results_lock_filename)

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 args.seed,
                                 data=test_results)

    print("done")
Example #7
def main():

    args = parse_args()

    test_results_dir = args.test_results_dir
    if not os.path.exists(test_results_dir):
        os.makedirs(test_results_dir, exist_ok=True)
    test_results_filename = os.path.join(test_results_dir, "test_results.csv")

    if check_complete(test_results_filename, args.seed):
        return

    test_results_lock_filename = os.path.join(test_results_dir,
                                              "test_results.lock")
    touch(test_results_lock_filename)

    graph, _, _ = load_data(args)
    assert not nx.is_directed(graph)
    print("Loaded dataset")
    print()

    seed = args.seed
    random.seed(seed)

    removed_edges_dir = args.removed_edges_dir

    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    print("loading test edges from {}".format(test_edgelist_fn))
    print("loading test non-edges from {}".format(test_non_edgelist_fn))

    test_edges = read_edgelist(test_edgelist_fn)
    test_non_edges = read_edgelist(test_non_edgelist_fn)

    test_edges = np.array(test_edges)
    test_non_edges = np.array(test_non_edges)

    print("number of test edges:", len(test_edges))
    print("number of test non edges:", len(test_non_edges))

    embedding = load_embedding(args.dist_fn, args.embedding_directory)

    test_results = dict()

    (mean_rank_lp, ap_lp,
     roc_lp) = evaluate_rank_AUROC_AP(embedding, test_edges, test_non_edges,
                                      args.dist_fn)

    test_results.update({
        "mean_rank_lp": mean_rank_lp,
        "ap_lp": ap_lp,
        "roc_lp": roc_lp
    })

    map_lp, precisions_at_k = evaluate_mean_average_precision(
        embedding, test_edges, args.dist_fn, graph_edges=graph.edges())

    test_results.update({"map_lp": map_lp})

    for k, pk in precisions_at_k.items():
        print("precision at", k, pk)
    test_results.update(
        {"p@{}".format(k): pk
         for k, pk in precisions_at_k.items()})

    print("saving test results to {}".format(test_results_filename))

    threadsafe_save_test_results(test_results_lock_filename,
                                 test_results_filename,
                                 seed,
                                 data=test_results)

    print("done")
Example #8
def main():

	args = parse_args()

	test_results_dir = args.test_results_dir
	if not os.path.exists(test_results_dir):
		os.makedirs(test_results_dir, exist_ok=True)
	test_results_filename = os.path.join(test_results_dir, 
		"test_results.csv")

	if check_complete(test_results_filename, args.seed):
		return

	test_results_lock_filename = os.path.join(test_results_dir, 
		"test_results.lock")
	touch(test_results_lock_filename)

	_, _, node_labels = load_data(args)
	print ("Loaded dataset")

	embedding = load_embedding(args.dist_fn, args.embedding_directory)

	min_count = 10
	if node_labels.shape[1] == 1: # remove any node belonging to an under-represented class
		label_counts = Counter(node_labels.flatten())
		mask = np.array([label_counts[l] >= min_count
			for l in node_labels.flatten()])
		embedding = embedding[mask]
		node_labels = node_labels[mask]
	else:
		assert node_labels.shape[1] > 1
		idx = node_labels.sum(0) >= min_count
		node_labels = node_labels[:, idx]
		idx = node_labels.any(-1)
		embedding = embedding[idx]
		node_labels = node_labels[idx]

	if args.dist_fn == "hyperboloid":
		print ("loaded a hyperboloid embedding")
		# print ("projecting from hyperboloid to klein")
		# embedding = hyperboloid_to_klein(embedding)
		print ("projecting from hyperboloid to poincare")
		embedding = hyperboloid_to_poincare_ball(embedding)
		print ("projecting from poincare to klein")
		embedding = poincare_ball_to_klein(embedding)

	elif args.dist_fn == "poincare":
		print ("loaded a poincare embedding")
		# print ("projecting from poincare to klein")
		# embedding = poincare_ball_to_hyperboloid(embedding)
		# embedding = hyperboloid_to_klein(embedding)
		print ("projecting from poincare to klein")
		embedding = poincare_ball_to_klein(embedding)

	test_results = {}
	
	label_percentages, f1_micros, f1_macros = \
		evaluate_node_classification(embedding, node_labels)

	for label_percentage, f1_micro, f1_macro in zip(label_percentages, f1_micros, f1_macros):
		print ("{:.2f}".format(label_percentage), 
			"micro = {:.2f}".format(f1_micro), 
			"macro = {:.2f}".format(f1_macro) )
		test_results.update({"{:.2f}_micro".format(label_percentage): f1_micro})
		test_results.update({"{:.2f}_macro".format(label_percentage): f1_macro})

	k = 10
	k_fold_roc, k_fold_f1, k_fold_precision, k_fold_recall = \
		evaluate_kfold_label_classification(embedding, node_labels, k=k)

	test_results.update({
		"{}-fold-roc".format(k): k_fold_roc, 
		"{}-fold-f1".format(k): k_fold_f1,
		"{}-fold-precision".format(k): k_fold_precision,
		"{}-fold-recall".format(k): k_fold_recall,
		})

	print ("saving test results to {}".format(test_results_filename))
	threadsafe_save_test_results(test_results_lock_filename, test_results_filename, args.seed, data=test_results )
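
`poincare_ball_to_klein` in the snippet above maps a ball point p to 2p / (1 + |p|^2), which composes the two hyperboloid projections into one step. A one-function sketch:

import numpy as np

def poincare_ball_to_klein(p):
    norm_sq = np.sum(p ** 2, axis=-1, keepdims=True)
    return 2. * p / (1. + norm_sq)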
Example #9
def main():

	args = parse_args()

	test_results_dir = args.test_results_dir
	if not os.path.exists(test_results_dir):
		os.makedirs(test_results_dir, exist_ok=True)
	test_results_filename = os.path.join(test_results_dir, 
		"test_results.csv")

	if check_complete(test_results_filename, args.seed):
		return

	test_results_lock_filename = os.path.join(test_results_dir, 
		"test_results.lock")
	touch(test_results_lock_filename)

	graph, _, _ = load_data(args)
	assert not args.directed 
	assert not nx.is_directed(graph)
	print ("Loaded dataset")
	print ()

	random.seed(args.seed)
	
	test_edges = list(graph.edges())

	test_edges += [(v, u) for u, v in test_edges]

	num_edges = len(test_edges)

	test_non_edges = sample_non_edges(graph, 
		set(test_edges),
		num_edges)

	test_edges = np.array(test_edges)
	test_non_edges = np.array(test_non_edges)


	print ("number of test edges:", len(test_edges))
	print ("number of test non edges:", len(test_non_edges))


	embedding = load_embedding(args.dist_fn, 
		args.embedding_directory)
	
	test_results = dict()

	(mean_rank_recon, ap_recon, 
		roc_recon) = evaluate_rank_AUROC_AP(
			embedding,
			test_edges, 
			test_non_edges,
			args.dist_fn)

	test_results.update({"mean_rank_recon": mean_rank_recon, 
		"ap_recon": ap_recon,
		"roc_recon": roc_recon})

	map_recon, precisions_at_k = evaluate_mean_average_precision(
		embedding, 
		test_edges,
		args.dist_fn)
	test_results.update({"map_recon": map_recon})

	for k, pk in precisions_at_k.items():
		print ("precision at", k, pk)
	test_results.update({"p@{}".format(k): pk
		for k, pk in precisions_at_k.items()})

	print ("saving test results to {}".format(
		test_results_filename))

	threadsafe_save_test_results(test_results_lock_filename, 
		test_results_filename, args.seed, data=test_results )

	print ("done")
Example #10
def main():

    args = parse_args()
    args.directed = True

    seed = args.seed
    training_edgelist_dir = os.path.join(args.output,
                                         "seed={:03d}".format(seed),
                                         "training_edges")
    removed_edges_dir = os.path.join(args.output, "seed={:03d}".format(seed),
                                     "removed_edges")

    if not os.path.exists(training_edgelist_dir):
        os.makedirs(training_edgelist_dir, exist_ok=True)
    if not os.path.exists(removed_edges_dir):
        os.makedirs(removed_edges_dir, exist_ok=True)

    training_edgelist_fn = os.path.join(training_edgelist_dir, "edgelist.tsv")
    val_edgelist_fn = os.path.join(removed_edges_dir, "val_edges.tsv")
    val_non_edgelist_fn = os.path.join(removed_edges_dir, "val_non_edges.tsv")
    test_edgelist_fn = os.path.join(removed_edges_dir, "test_edges.tsv")
    test_non_edgelist_fn = os.path.join(removed_edges_dir,
                                        "test_non_edges.tsv")

    graph, _, _ = load_data(args)
    print("loaded dataset")
    assert nx.is_directed(graph)
    N = len(graph)

    for u in range(N):
        assert u in graph

    edges = list(graph.edges())
    print("enumerated edges")

    (training_edges, (val_edges, val_non_edges),
     (test_edges, test_non_edges)) = split_edges(graph,
                                                 edges,
                                                 seed,
                                                 val_split=0)

    assert len(nx.DiGraph(training_edges)) == N

    print("number of val edges", len(val_edges), "number of val non edges",
          len(val_edges))
    print("number of test edges", len(test_edges), "number of test non edges",
          len(test_edges))

    # remove val and test edges
    graph.remove_edges_from(val_edges + test_edges)

    print("removed edges")

    nx.write_edgelist(graph,
                      training_edgelist_fn,
                      delimiter="\t",
                      data=["weight"])
    write_edgelist_to_file(val_edges, val_edgelist_fn)
    write_edgelist_to_file(val_non_edges, val_non_edgelist_fn)
    write_edgelist_to_file(test_edges, test_edgelist_fn)
    write_edgelist_to_file(test_non_edges, test_non_edgelist_fn)

    print("done")