Exemple #1
0
# Input: name of the dataset and the directory its corpus lives in
dataset = "MUTAG"
corpus_data_dir = "data/{}".format(dataset)

# Output: path used for the embeddings file
output_embedding_fh = "Graph2Vec_Embeddings.json"

# Hyperparameters
wl_depth = 2
# Minimum number of occurrences a subgraph pattern needs in order to be
# considered part of the vocabulary of subgraph patterns
min_count_patterns = 0

#######
# Step 1: create the corpus data for the neural language model.
# The files are kept permanently for the sake of deeper post studies and
# testing.
#######
graph_files = utils.get_files(corpus_data_dir, ".gexf", max_files=0)
wl_corpus(graph_files, wl_depth)
# Extension of the graph documents; it embeds the WL depth used above
extension = ".wld{}".format(wl_depth)

######
# Step 2: Train a neural language model to learn distributed representations
#         of the graphs directly or of their substructures. Here we learn
#         the graph representations directly; for an example of the latter,
#         check out the DGK models.
######
# Instantiate a PV-DBOW trainer to learn distributed representations directly.
trainer = InMemoryTrainer(corpus_dir=corpus_data_dir,
                          extension=extension,
                          max_files=0,
                          output_fh=output_embedding_fh,
                          emb_dimension=32,
                          batch_size=128,
Exemple #2
0
import pandas
import geometric2dr.embedding_methods.utils as utils

# Setup: the folder name is split on "_" into exactly four fields, so it must
# contain exactly three underscores or the unpacking raises ValueError.
perf_folder = "DGK_WL_Performance_MUTAG"
(dgk, mode, performance, dataset) = perf_folder.strip().split("_")

if mode == "GK":
    # Dataframe setup
    data = []
    header = [
        "dataset", "num_graphlet", "sample_size", "emb_dimension",
        "batch_size", "epochs", "run", "accuracy", "std"
    ]

    perf_files = utils.get_files(perf_folder, "", max_files=0)

    for perf_file in perf_files:
        perf_file_basename = os.path.basename(perf_file)
        dataset, num_graphlet, sample_size, emb_dimension, batch_size, epochs, run = perf_file_basename.strip(
        ).split("_")
        print(perf_file_basename.strip().split("_"))

        # Get the accuracy and the std in it
        with open(perf_file, "r") as fh:
            lines = fh.readlines()
            for line in lines:
                mean_acc, std = line.strip().split(",")
                mean_acc = float(mean_acc)
                std = float(std)
Exemple #3
0
    graph = nx.read_gexf(file_handle)
    adj_matrix = nx.to_numpy_matrix(graph)
    return graph, adj_matrix


# Dataset name and the paths derived from it
dataset = "MUTAG"
path_to_gexf_data = "data/"
# <path_to_gexf_data><dataset>
dataset_path = "{}{}".format(path_to_gexf_data, dataset)
# <path_to_gexf_data><dataset>.Labels
graph_class_labels_fh = "{}.Labels".format(dataset_path)

# Containers for the Yanardag style dataset
data = {}
labels = []
# NOTE(review): this placeholder is rebound by the get_files() call right
# below, so the empty dict is never read.
graph_files = {}

graph_files = utils.get_files(dataset_path, extension=".gexf", max_files=0)
label_tuples = utils.get_class_labels_tuples(
    graph_files, graph_class_labels_fh)

# Sort the tuples by their first element and keep only the second element
# of each; the resulting array is stored both as `graph_classes` and under
# data['labels'].
data['labels'] = graph_classes = np.array(
    [label for _, label in sorted(label_tuples, key=lambda pair: pair[0])])

# NOTE(review): `gf` is rebound by the `for gf in graph_files` loop that
# follows, so this value looks unused; the lookup still raises IndexError
# when no files were found.
gf = graph_files[0]

graph_data = {}

for gf in graph_files:
    gindex = int(os.path.basename(gf).split(".")[0]) - 1
    nx_graph, adj_matrix = load_graph(gf)

    graph_data[gindex] = {}
    for node_string in nx_graph.nodes():
Exemple #4
0
# Corpus directory of the selected dataset; passed as `corpus_dir` to
# cross_val_accuracy below. `dataset` is defined earlier in the file
# (outside this excerpt).
corpus_data_dir = "data/" + dataset
# Path of the class-labels file: <data_path><dataset>.Labels; passed as
# `class_labels_fname` to cross_val_accuracy below.
# NOTE(review): `data_path` is defined outside this excerpt and presumably
# ends with a path separator — confirm.
class_labels_fh = data_path + dataset + ".Labels"

if method == "graph2vec":
    embeddings_folder = "Graph2vec_Embeddings_" + dataset
    csv_fh = method + "_" + dataset + "_results.csv"
    csv_fh_avg = method + "_" + dataset + "fullCV_results.csv"
    # Dataframe setup
    data = []
    header = [
        "dataset", "wl_depth", "embedding_dimension", "batch_size", "epochs",
        "initial_lr", "run", "accuracy", "std"
    ]

    # if method == "graph2vec"
    embedding_files = utils.get_files(embeddings_folder, "", max_files=0)

    for embedding_file in embedding_files:
        embedding_file_basename = os.path.basename(embedding_file)
        dataset, wl_depth, embedding_dimension, batch_size, epochs, initial_lr, run = embedding_file_basename.split(
            "_")
        print(embedding_file_basename.strip().split("_"))
        extension = ".wld" + wl_depth

        classify_scores = cross_val_accuracy(
            corpus_dir=corpus_data_dir,
            extension=extension,
            embedding_fname=embedding_file,
            class_labels_fname=class_labels_fh)
        mean_acc, std_dev = classify_scores
        print("Mean accuracy using 10 cross fold accuracy: %s with std %s" %