Example #1
def graph_link_predictor(name,
                         train_data,
                         preproc,
                         layer_sizes=[20, 20],
                         verbose=1):
    """
    Build and return a neural link prediction model.

    Args:
        name (string): one of:
                       - 'graphsage' for GraphSAGE model
                         (only GraphSAGE is currently supported)
        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc (LinkPreprocessor): a LinkPreprocessor instance
        layer_sizes (list): hidden layer sizes; length must match len(preproc.sample_sizes)
        verbose (boolean): verbosity of output
    Returns:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import LinkSequenceWrapper

    # check arguments
    if not isinstance(train_data, LinkSequenceWrapper):
        raise ValueError(
            'train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object')
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError(
            'number of layer_sizes must match len(preproc.sample_sizes)')

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(layer_sizes=layer_sizes,
                          generator=train_data,
                          bias=True,
                          dropout=0.3)
    x_inp, x_out = graphsage.build()
    prediction = link_classification(output_dim=1,
                                     output_act="relu",
                                     edge_embedding_method='ip')(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT,
                  loss='binary_crossentropy',
                  metrics=["accuracy"])
    return model
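A minimal usage sketch (not from the ktrain sources): assuming trn and val are LinkSequenceWrapper objects produced by ktrain's graph data loaders, the returned model can be trained through a ktrain learner.

import ktrain

model = graph_link_predictor('graphsage', trn, preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val)
learner.fit_onecycle(0.01, 5)  # learning rate and epoch count are placeholders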
Example #2
    def initialize(self, **hyper_params):

        # Read hyperparameters, falling back to defaults when a key is absent.
        batch_size = hyper_params.get("batch_size", 16)
        num_samples = hyper_params.get("num_samples", [25, 10])
        layer_sizes = hyper_params.get("layer_sizes", [256, 256])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.0)
        lr = hyper_params.get("lr", 1e-3)
        num_walks = hyper_params.get("num_walks", 1)
        length = hyper_params.get("length", 5)

        self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        del self.nodes_df
        del self.edges_df

        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
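A usage sketch for the class above (the instance name and hyperparameter values are illustrative assumptions): after initialize(), the link model is trained on the unsupervised walk samples and node embeddings are read off the embedding model.

weights = trainer.initialize(batch_size=16, num_walks=2)    # hypothetical instance
trainer.model.fit(trainer.train_flow, epochs=5, verbose=1)  # unsupervised training
node_embeddings = trainer.embedding_model.predict(trainer.node_gen)  # one row per node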
Example #3
    def initialize(self, **hyper_params):

        if(not "batch_size" in hyper_params.keys()):
            batch_size = 20
        if(not "layer_sizes" in hyper_params.keys()):
            num_samples = [20, 10]
        if(not "num_samples" in hyper_params.keys()):
            layer_sizes = [10, 10 ]
        if(not "bias" in hyper_params.keys()):
            bias = True
        if(not "dropout" in hyper_params.keys()):
            dropout = 0.1
        if(not "lr" in hyper_params.keys()):
            lr = 1e-2

        graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Test split
        edge_splitter_test = EdgeSplitter(graph)
        self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph_test)
        self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

        # Test iterators
        test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[
                keras.metrics.BinaryAccuracy(),
                keras.metrics.Recall(),
                keras.metrics.AUC(),
                keras.metrics.Precision(),
            ],
        )

        # return the number of training and testing examples
        return edge_ids_train.shape[0], edge_ids_test.shape[0]
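A sketch of how this variant would typically be driven (the instance name and epoch count are assumptions): train on train_flow, then evaluate on the held-out test edges.

n_train, n_test = trainer.initialize(batch_size=20)  # hypothetical instance
trainer.model.fit(trainer.train_flow, epochs=10, verbose=1)
results = trainer.model.evaluate(trainer.test_flow)
print(dict(zip(trainer.model.metrics_names, results)))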
Example #4
    def initialize(self, **hyper_params):

        if (not "batch_size" in hyper_params.keys()):
            batch_size = 20
        if (not "layer_sizes" in hyper_params.keys()):
            num_samples = [20, 10]
        if (not "num_samples" in hyper_params.keys()):
            layer_sizes = [20, 20]
        if (not "bias" in hyper_params.keys()):
            bias = True
        if (not "dropout" in hyper_params.keys()):
            dropout = 0.3
        if (not "lr" in hyper_params.keys()):
            lr = 1e-3
        if (not "train_split" in hyper_params.keys()):
            train_split = 0.2

        self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph)
        graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=train_split, method="global", keep_connected=True)

        # Train iterators
        train_gen = GraphSAGELinkGenerator(graph_train, batch_size,
                                           num_samples)
        self.train_flow = train_gen.flow(edge_ids_train,
                                         edge_labels_train,
                                         shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(layer_sizes=layer_sizes,
                              generator=train_gen,
                              bias=bias,
                              dropout=dropout)

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=["acc"],
        )

        return self.model.get_weights()
Example #5
    def _fit_inductive_embedder(self, train_graph):
        """Fit inductive embedder (predictive model and embeddings)."""
        if self.model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
            return _fit_deep_graph_infomax(train_graph, self.params,
                                           self.model_name)

        unsupervised_samples = UnsupervisedSampler(
            train_graph,
            nodes=train_graph.nodes(),
            length=self.params["length"],
            number_of_walks=self.params["number_of_walks"])

        generator = _dispatch_generator(train_graph,
                                        self.model_name,
                                        self.params,
                                        generator_type="edge")
        layer_sizes = _dispatch_layer_sizes(self.model_name, self.params)
        embedding_layer = _dispatch_inductive_layer(layer_sizes, generator,
                                                    self.model_name,
                                                    self.params)

        x_inp, x_out = embedding_layer.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="sigmoid",
                                         edge_embedding_method="ip")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        model.compile(
            optimizer=optimizers.Adam(learning_rate=1e-3),
            loss=losses.binary_crossentropy,
            metrics=[metrics.binary_accuracy],
        )
        train_generator = generator.flow(unsupervised_samples)

        model.fit(train_generator,
                  epochs=self.params["epochs"],
                  shuffle=True,
                  verbose=0)

        if self.model_name == "attri2vec":
            x_inp_src = x_inp[0]
        elif self.model_name == "graphsage":
            x_inp_src = x_inp[0::2]

        x_out_src = x_out[0]

        embedding_model = Model(inputs=x_inp_src, outputs=x_out_src)
        return embedding_model
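A hedged follow-up for the GraphSAGE case: the returned embedding model can embed every node through a node generator whose sample sizes match the edge generator above. The construction below is a sketch, assuming the params dict carries a num_samples entry and that the code runs inside the same class.

from stellargraph.mapper import GraphSAGENodeGenerator

embedding_model = self._fit_inductive_embedder(train_graph)
node_flow = GraphSAGENodeGenerator(
    train_graph, batch_size=50,
    num_samples=self.params["num_samples"]).flow(train_graph.nodes())
node_embeddings = embedding_model.predict(node_flow)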
Example #6
def create_graphSAGE_model(graph, link_prediction=False):

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=train_gen,
                           bias=True,
                           dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes:
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        # re-pack into a list where (source, destination) inputs alternate, for link inputs:
        x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
        # same for outputs:
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
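A hypothetical invocation, assuming graph is a StellarGraph whose node ids include 1, 2 and 3 (the hard-coded training edges above); the compile settings are placeholders.

base_model, keras_model, generator, train_gen = create_graphSAGE_model(
    graph, link_prediction=True)
keras_model.compile(optimizer="adam",
                    loss="binary_crossentropy",
                    metrics=["acc"])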
Example #7
def create_model(graph_sage):
    x_inp, x_out = graph_sage.build(flatten_output=False)

    # classification layer that takes the pair of node embeddings, combines them, puts them
    # through a dense layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method="ip",
    )(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    return x_inp, x_out, model
Example #8
# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)



# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3
)

x_inp, x_out = graphsage.in_out_tensors()

prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)

# Set weights
weights = np.load(path_weights, allow_pickle=True)
model.set_weights(weights)

print("Training started")
Example #9
def get_hinsage_model(generator,
                      train_gen,
                      test_gen,
                      num_samples=[8, 4],
                      hinsage_layer_sizes=[32, 32],
                      bias=True,
                      dropout=0.0,
                      lr=1e-2,
                      edge_embedding_method='concat',
                      output_act='sigmoid'):

    assert len(hinsage_layer_sizes) == len(num_samples)

    hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                      generator=generator,
                      bias=bias,
                      dropout=dropout)

    # Expose input and output sockets of hinsage:
    x_inp, x_out = hinsage.in_out_tensors()

    # Final estimator layer
    score_prediction = link_classification(
        output_dim=1,
        output_act=output_act,
        edge_embedding_method=edge_embedding_method)(x_out)

    def root_mean_square_error(s_true, s_pred):
        return K.sqrt(K.mean(K.pow(s_true - s_pred, 2)))

    def recall_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def f1_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        precision = precision_m(y_true, y_pred)
        recall = recall_m(y_true, y_pred)
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    model = Model(inputs=x_inp, outputs=score_prediction)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=lr),
        # loss=losses.mean_squared_error,
        loss=losses.binary_crossentropy,
        metrics=[
            metrics.binary_accuracy,
            metrics.Precision(),
            metrics.Recall()
        ],
        # metrics=[root_mean_square_error, metrics.mae, 'acc'],
    )

    return model
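A sketch of how the generator arguments might be built for a user-item recommendation graph; the head-node types, DataFrame names and batch size are illustrative assumptions.

from stellargraph.mapper import HinSAGELinkGenerator

generator = HinSAGELinkGenerator(G, batch_size=200, num_samples=[8, 4],
                                 head_node_types=["user", "item"])
train_gen = generator.flow(edges_train[["user", "item"]].values, labels_train)
test_gen = generator.flow(edges_test[["user", "item"]].values, labels_test)
model = get_hinsage_model(generator, train_gen, test_gen)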
Example #10
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph
        layer_size: A list of the number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 to 1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
Example #11
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size,
                                   num_samples).flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=train_gen,
                      bias=bias,
                      dropout=0.0,
                      normalize="l2")
x_inp, x_out = graphsage.build(flatten_output=False)
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# Train the model
history = model.fit_generator(
    train_gen,
    epochs=nepochs,
    verbose=verbose,
    use_multiprocessing=False,
    workers=nworkers,
)
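A common follow-up step (a sketch, not part of the original snippet): reuse the trained source branch of the model to compute one embedding per node.

from stellargraph.mapper import GraphSAGENodeGenerator

x_inp_src = x_inp[0::2]  # source inputs alternate with destination inputs
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)
node_gen = GraphSAGENodeGenerator(Gtrain, batch_size,
                                  num_samples).flow(Gtrain.nodes())
node_embeddings = embedding_model.predict_generator(node_gen)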
Example #12
def main():
    with open(r"training.txt", "r") as f:
        reader = csv.reader(f)
        training = list(reader)
    # in order of training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        reader = csv.reader(f)
        testing = list(reader)
    # in order of testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))
    # Uncomment lines below for a reduced corpus with stopword removal.
    # In the future, integrate a stemmer here (multi-language support).
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        with open(corpus_path, 'rb') as f:
            corpus = pickle.load(f)
        with open(ids_path, 'rb') as f:
            ids = pickle.load(f)
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY),
                             position=0,
                             leave=True):
            with open(NODE_INFO_DIRECTORY + filename,
                      'r',
                      encoding='UTF-8',
                      errors='ignore') as f:
                doc_string = []
                for line in f:
                    for token in line.lower().strip().split(" "):
                        if token != "":
                            doc_string.append(token.strip())
                corpus.append(' '.join(doc_string))
                ids.append(filename[:-4])
        with open(corpus_path, 'wb') as f:
            pickle.dump(corpus, f)
        with open(ids_path, 'wb') as f:
            pickle.dump(ids, f)

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if os.path.exists(stemmed_corpus_path):
        with open(stemmed_corpus_path, 'rb') as f:
            stemmed_corpus = pickle.load(f)
    else:
        raise FileNotFoundError('Stemmed corpus unavailable')

    # in order of alphabetical text information i.e. 0, 1, 10, 100
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    train_graph_split_path = 'pickles/train_graph_split.PICKLE'

    if os.path.exists(train_graph_split_path):
        with open(train_graph_split_path, 'rb') as f:
            keep_indices = pickle.load(f)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        with open(train_graph_split_path, 'wb') as f:
            pickle.dump(keep_indices, f)

    data_train_val = training.iloc[keep_indices]

    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if os.path.exists(lda_path):
        with open(lda_path, 'rb') as f:
            lda = pickle.load(f)
    else:
        raise FileNotFoundError('LDA matrix unavailable')

    print("LDA matrix shape: {}".format(lda.shape))

    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)

    all_node_features = node_data[feature_names]

    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)

    print(G_all.info())

    print(G_all.get_feature_for_nodes(['0']))

    # Get DBLP subgraph with papers published before a threshold year

    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)

    subgraph_node_ids = sorted(list(G_sub_nx.nodes))

    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)

    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)

    print(G_sub.info())

    # Train attri2vec on the DBLP subgraph

    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))

    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)

    batch_size = 50
    epochs = int(input('Enter number of epochs: '))

    generator = Attri2VecLinkGenerator(G_sub, batch_size)

    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec, for node pair inputs:
    x_inp, x_out = attri2vec.build()

    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    history = model.fit_generator(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=bool(int(input('Multiprocessing? 1/0: '))),
        workers=int(input('Number of workers: ')),
        shuffle=True,
    )
    print(history.history)
    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))
    return model
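A hypothetical last step: score candidate links with the trained model. test_pairs (an array of [source, target] node ids present in G_all) is an assumption, not defined in the original; model, G_all and batch_size are taken from the function body above.

test_flow = Attri2VecLinkGenerator(G_all, batch_size).flow(test_pairs)
link_probabilities = model.predict_generator(test_flow)  # one score per candidate pair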