Example #1
def inference(embeddings):
    G = form_graph("data/cutted_edges.csv", "data/cutted_features.csv", "data/cutted_edges_to.csv", embeddings)
    edge_splitter_full = EdgeSplitter(G)

    # Keras plus the StellarGraph custom layers are needed to deserialise the saved model.
    import keras
    from stellargraph.layer import MeanAggregator, LinkEmbedding
    model = keras.models.load_model(
        'data/w_rev1.h5',
        custom_objects={'MeanAggregator': MeanAggregator, 'LinkEmbedding': LinkEmbedding})

    G_full, edge_ids_full, edge_labels_full = edge_splitter_full.train_test_split(
        p=0.5, method="global", keep_connected=True
    )

    batch_size = 20
    num_samples = [20, 10]

    generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
    hold_out_gen = generator.flow(edge_ids_full, edge_labels_full)

    hold_out_predictions_pr = model.predict(hold_out_gen)

    # Collect the indices of all held-out edges that touch the query node.
    ID = 111180
    edge_indices = [i for i in range(len(edge_ids_full))
                    if edge_ids_full[i][0] == ID or edge_ids_full[i][1] == ID]

    # Pair each predicted score with the endpoint that is not the query node itself.
    predictions = []
    for i in edge_indices:
        other = edge_ids_full[i][1] if edge_ids_full[i][0] == ID else edge_ids_full[i][0]
        predictions.append([hold_out_predictions_pr[i][0], other])

    # Keep the ten highest-scoring candidate neighbours.
    top = sorted(predictions, reverse=True)[:10]
    sorted_ids = [pair[1] for pair in top]

    return sorted_ids
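If the candidate pairs are known in advance, the same model can score them directly instead of scanning every edge produced by EdgeSplitter. A minimal sketch, assuming G, model, batch_size and num_samples from the function above are in scope and candidate_nodes is a hypothetical list of node ids:

import numpy as np

candidate_ids = [(111180, n) for n in candidate_nodes]        # hypothetical candidate pairs
dummy_labels = np.zeros(len(candidate_ids))                   # labels are ignored by predict()
candidate_flow = GraphSAGELinkGenerator(G, batch_size, num_samples).flow(candidate_ids, dummy_labels)
scores = model.predict(candidate_flow)[:, 0]                  # one probability per candidate pair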
Example #2
    def initialize(self, **hyper_params):
        # Fall back to a default for any hyper-parameter that was not supplied.
        batch_size = hyper_params.get("batch_size", 16)
        num_samples = hyper_params.get("num_samples", [25, 10])
        layer_sizes = hyper_params.get("layer_sizes", [256, 256])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.0)
        lr = hyper_params.get("lr", 1e-3)
        num_walks = hyper_params.get("num_walks", 1)
        length = hyper_params.get("length", 5)

        self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        del self.nodes_df
        del self.edges_df

        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
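A minimal usage sketch for the method above, assuming a hypothetical wrapper class GraphSAGETrainer whose constructor sets nodes_df and edges_df; with recent tf.keras, fit and predict accept the Keras Sequences produced by the generators directly:

trainer = GraphSAGETrainer(nodes_df, edges_df)               # hypothetical wrapper class
trainer.initialize(batch_size=32, num_walks=2, length=10)    # omitted keys fall back to the defaults above
trainer.model.fit(trainer.train_flow, epochs=5, verbose=2)   # unsupervised link-prediction objective
node_embeddings = trainer.embedding_model.predict(trainer.node_gen)
print(node_embeddings.shape)                                 # (number of nodes, layer_sizes[-1])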
Example #3
    def initialize(self, **hyper_params):
        # Fall back to a default for any hyper-parameter that was not supplied.
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [20, 20])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.3)
        lr = hyper_params.get("lr", 1e-3)
        train_split = hyper_params.get("train_split", 0.2)

        self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph)
        graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=train_split, method="global", keep_connected=True)

        # Train iterators
        train_gen = GraphSAGELinkGenerator(graph_train, batch_size,
                                           num_samples)
        self.train_flow = train_gen.flow(edge_ids_train,
                                         edge_labels_train,
                                         shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(layer_sizes=layer_sizes,
                              generator=train_gen,
                              bias=bias,
                              dropout=dropout)

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=["acc"],
        )

        return self.model.get_weights()
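Since each default above is only used when the corresponding key is absent, any subset of hyper-parameters can be overridden at call time. A short sketch; the trainer object and its nodes/edges attributes are assumptions:

trainer.nodes, trainer.edges = nodes_df, edges_df            # assumed to be prepared beforehand
initial_weights = trainer.initialize(
    batch_size=32,
    layer_sizes=[32, 32],
    num_samples=[15, 10],
    dropout=0.1,
)
print(len(initial_weights))                                  # one numpy array per weight tensor in the Keras model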
Example #4
    def preprocess_train(self, G, edge_ids, edge_labels, mode='train'):
        """
        ```
        preprocess training set
        Args:
          G (networkx graph): networkx graph
          edge_ids(list): list of tuples representing edge ids
          edge_labels(list): edge labels (1 or 0 indicating whether the edge is a true edge in the original graph or not)
        ```
        """
        # import stellargraph; SG_ERRMSG and `version` (from the packaging package)
        # are defined elsewhere in the surrounding module
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGELinkGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)

        #edge_labels = to_categorical(edge_labels)
        G_sg = sg.StellarGraph(G, node_features="feature")
        #print(G_sg.info())
        shuffle = (mode == 'train')
        link_seq = GraphSAGELinkGenerator(
            G_sg, U.DEFAULT_BS, self.sample_sizes).flow(edge_ids,
                                                        edge_labels,
                                                        shuffle=shuffle)
        from .sg_wrappers import LinkSequenceWrapper
        return LinkSequenceWrapper(link_seq)
Example #5
def _dispatch_generator(graph, model_name, params,
                        generator_type="node"):
    """Create a graph generator."""
    if model_name == "watchyourstep":
        return AdjacencyPowerGenerator(
            graph, num_powers=params["num_powers"])
    elif model_name in ["complex", "distmult"]:
        return KGTripleGenerator(graph, params["batch_size"])
    elif model_name == "attri2vec":
        if generator_type == "node":
            return Attri2VecNodeGenerator(
                graph, params["batch_size"])
        else:
            return Attri2VecLinkGenerator(
                graph, params["batch_size"])
    elif model_name in ["graphsage", "graphsage_dgi"]:
        if generator_type == "node":
            return GraphSAGENodeGenerator(
                graph, params["batch_size"], params["num_samples"])
        else:
            return GraphSAGELinkGenerator(
                graph, params["batch_size"], params["num_samples"])
    elif model_name in ["gcn_dgi", "gat_dgi"]:
        return FullBatchNodeGenerator(graph, sparse=False)
    elif model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
        return ClusterNodeGenerator(
            graph, clusters=params["clusters"],
            q=params["clusters_q"])
    else:
        raise ValueError(f"Unknown model name '{model_name}'")
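A short usage sketch for the dispatcher above; graph is assumed to be a StellarGraph instance and the parameter values are illustrative only:

params = {"batch_size": 50, "num_samples": [10, 5]}
node_gen = _dispatch_generator(graph, "graphsage", params, generator_type="node")
link_gen = _dispatch_generator(graph, "graphsage", params, generator_type="link")
# An unrecognised name fails fast:
# _dispatch_generator(graph, "unknown_model", params)  ->  ValueError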
Example #6
def create_graphSAGE_model(graph, link_prediction=False):

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=train_gen,
                           bias=True,
                           dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes
        # (node_model() is the older StellarGraph API; newer releases use in_out_tensors()):
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        # re-pack into a list where (source, destination) inputs alternate, for link inputs:
        x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
        # same for outputs:
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
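A brief sketch of consuming the helper above for link prediction; graph is assumed to be a StellarGraph whose node ids include 1, 2 and 3 (the hard-coded training edges), and with recent tf.keras the Sequence returned by generator.flow can be passed to fit directly (older versions used fit_generator):

base_model, keras_model, generator, train_gen = create_graphSAGE_model(graph, link_prediction=True)
keras_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
keras_model.fit(train_gen, epochs=2, verbose=2)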
def create_train_gen(G):
    # This generates random walk samples from the graph
    unsupervised_samples = UnsupervisedSampler(
        G,
        nodes=list(G.nodes()),
        length=config.WALK_LENGTH,
        number_of_walks=config.NUM_WALKS,
    )

    return GraphSAGELinkGenerator(G, config.BATCH_SIZE, config.NUM_SAMPLES).flow(
        unsupervised_samples
    )
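A minimal sketch of feeding the unsupervised flow returned above into a compiled GraphSAGE link-prediction model; model and config.EPOCHS are assumptions, mirroring the config constants already used in the helper:

train_flow = create_train_gen(G)
history = model.fit(train_flow, epochs=config.EPOCHS, verbose=2)   # model: compiled GraphSAGE + link_classification head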
    def initialize(self, **hyper_params):
        # Fall back to a default for any hyper-parameter that was not supplied.
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [10, 10])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.1)
        lr = hyper_params.get("lr", 1e-2)

        graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Test split
        edge_splitter_test = EdgeSplitter(graph)
        self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph_test)
        self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

        # Test iterators
        test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[
                keras.metrics.BinaryAccuracy(),
                keras.metrics.Recall(),
                keras.metrics.AUC(),
                keras.metrics.Precision(),
            ],
        )

        # return number of training and testing examples
        return edge_ids_train.shape[0], edge_ids_test.shape[0]
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Load the serialized model and evaluate it on a random balanced subset of all links in the graph.
    Note that the set of links the model is evaluated on may contain links from the model's training set.
    To avoid this, set the seed of the edge splitter to the same seed used for link splitting in train().

    Args:
        G: NetworkX graph file
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator})

    # Get required input shapes from model
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(1, len(model.input_shape) - 1, 2)
    ]

    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # Convert G_test to StellarGraph object (undirected, as required by GraphSAGE):
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Generator feeds data from (source, target) sampled subgraphs to GraphSAGE model
    test_gen = GraphSAGELinkGenerator(
        G_test,
        batch_size,
        num_samples,
        name="test",
    ).flow(edge_ids_test, edge_labels_test)

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTest Set Evaluation:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True
)

# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)



# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3
)

x_inp, x_out = graphsage.in_out_tensors()

prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)
# GraphSAGE model:

# In[12]:


num_samples = [20, 10]


# ### Create the generators for training

# For training we create a generator on the `G_train` graph. The `shuffle=True` argument is passed to the `flow` method so that the training batches are reshuffled each epoch, which improves training.

# In[13]:


generator = GraphSAGELinkGenerator(G_train, batch_size, num_samples)


# In[14]:


train_gen = generator.flow(edge_ids_train, edge_labels_train, shuffle=True)


# At test time we use the `G_test` graph and don't specify the `shuffle` argument (it defaults to `False`).

# In[15]:


test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples).flow(
    edge_ids_test, edge_labels_test
)
Example #12
g_test = g.copy()
edgelist = [(start, end) for start, end in zip(link_ids_train[:, 0], link_ids_train[:, 1])]
g_test.remove_edges_from(edgelist)

G = StellarGraph.from_networkx(g, node_features="feature")
g_train = StellarGraph.from_networkx(g_train, node_features="feature")
g_test = StellarGraph.from_networkx(g_test, node_features="feature")

print(g_train.info())
print(g_test.info())

##
batch_size = 40
num_samples = [15, 10, 5]

train_gen = GraphSAGELinkGenerator(g_train, batch_size, num_samples)
train_flow = train_gen.flow(link_ids_train, link_labels_train, shuffle=True)

test_gen = GraphSAGELinkGenerator(g_test, batch_size, num_samples)
test_flow = test_gen.flow(link_ids_test, link_labels_test)

traintest_gen = GraphSAGELinkGenerator(G, batch_size, num_samples)
traintest_flow = traintest_gen.flow(link_ids, link_labels)

## =================== model -==============================
graphsage_model = GraphSAGE(layer_sizes=[64, 32, 16], generator=train_gen,
                            activations=["relu", "relu", "linear"], bias=True,
                            aggregator=MaxPoolingAggregator, dropout=0.0)

x_inp, x_out = graphsage_model.in_out_tensors()

def custom_layer(x):
Example #13
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for training and evaluation
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: Dropout rate (0 to 1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
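Both train() and test() above read a module-level args object; a minimal argparse sketch covering just the attributes they use (the flag names and defaults here are assumptions, not taken from the original script):

import argparse

parser = argparse.ArgumentParser(description="GraphSAGE link prediction")
parser.add_argument("--edge_sampling_method", default="global",
                    help="Negative-edge sampling method passed to EdgeSplitter.train_test_split")
parser.add_argument("--edge_sampling_probs", nargs="*", type=float, default=None,
                    help="Distance probabilities for the 'local' sampling method (leave unset for 'global')")
parser.add_argument("--edge_embedding_method", default="ip",
                    help="Edge embedding method used by link_classification, e.g. 'ip' or 'l2'")
args = parser.parse_args()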
Example #14
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

nodes = list(G.nodes())
number_of_walks = 1
length = 5
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks
)

batch_size = 50
epochs = 4
num_samples = [10, 5]

generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)
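Because the flow is a Keras Sequence, a single batch can be inspected before training; a quick sketch:

# inputs is a list of feature arrays for the sampled source/target neighbourhoods,
# targets is the array of 1/0 labels generated by the unsupervised sampler.
inputs, targets = train_gen[0]
print(len(inputs), targets.shape)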



# +
# feature extraction and preprocessing data
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
import os
from PIL import Image
import pathlib
import csv
Example #15
## Train the embedding model
G = sg.StellarGraph(Gnx, node_features=node_features)
Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features)

# The graph G together with the unsupervised sampler will be used to generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size,
                                   num_samples).flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=train_gen,
                      bias=bias,
                      dropout=0.0,
                      normalize="l2")
x_inp, x_out = graphsage.build(flatten_output=False)
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),