Example #1
def graph_link_predictor(name,
                         train_data,
                         preproc,
                         layer_sizes=[20, 20],
                         verbose=1):
    """
    Build and return a neural link prediction model.

    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported)

        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc (LinkPreprocessor): a LinkPreprocessor instance
        layer_sizes (list): hidden layer sizes for each GraphSAGE layer;
                            must have the same length as preproc.sample_sizes
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import LinkSequenceWrapper

    # check argument
    if not isinstance(train_data, LinkSequenceWrapper):
        err = """
            train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError(
            'number of layer_sizes must match len(preproc.sample_sizes)')

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(layer_sizes=layer_sizes,
                          generator=train_data,
                          bias=True,
                          dropout=0.3)
    x_inp, x_out = graphsage.build()
    prediction = link_classification(output_dim=1,
                                     output_act="relu",
                                     edge_embedding_method='ip')(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT,
                  loss='binary_crossentropy',
                  metrics=["accuracy"])
    return model
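
A minimal usage sketch (not part of the original example; it assumes `train_data` and `preproc` were already produced by the library's graph data-loading step):

# Hypothetical call to the factory defined above
model = graph_link_predictor('graphsage', train_data, preproc,
                             layer_sizes=[20, 20])
model.summary()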
Example #2
def graph_node_classifier(name, train_data, layer_sizes=[32, 32], verbose=1):
    """
    Build and return a neural node classification model.
    Notes: Only mutually-exclusive class labels are supported.
    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported)
        train_data (NodeSequenceWrapper): a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object
        layer_sizes (list): hidden layer sizes for the two GraphSAGE layers
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import NodeSequenceWrapper

    # check argument
    if not isinstance(train_data, NodeSequenceWrapper):
        err = """
            train_data must be a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != 2:
        raise ValueError('layer_sizes must be of length 2')

    num_classes = U.nclasses_from_data(train_data)

    # determine multilabel
    multilabel = U.is_multilabel(train_data)
    if multilabel:
        raise ValueError(
            'Multi-label classification not currently supported for graphs.')
    U.vprint("Is Multi-Label? %s" % (multilabel), verbose=verbose)

    # set loss and activations
    loss_func = 'categorical_crossentropy'
    activation = 'softmax'

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE node classification model
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_data,
        bias=True,
        dropout=0.5,
    )
    # x_inp, x_out = graphsage_model.default_model(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = Dense(units=num_classes, activation=activation)(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer='adam', loss=loss_func, metrics=["accuracy"])
    U.vprint('done', verbose=verbose)
    return model
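
A minimal usage sketch (hypothetical; `train_data` is assumed to be a NodeSequenceWrapper, which wraps a Keras Sequence and can therefore be passed to fit directly):

model = graph_node_classifier('graphsage', train_data)
model.fit(train_data, epochs=5)  # epoch count chosen arbitrarily for illustration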
Example #3
    def initialize(self, **hyper_params):

        # Read hyperparameters, falling back to defaults when absent
        batch_size = hyper_params.get("batch_size", 16)
        num_samples = hyper_params.get("num_samples", [25, 10])
        layer_sizes = hyper_params.get("layer_sizes", [256, 256])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.0)
        lr = hyper_params.get("lr", 1e-3)
        num_walks = hyper_params.get("num_walks", 1)
        length = hyper_params.get("length", 5)

        self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        del self.nodes_df
        del self.edges_df

        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)
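        # (x_inp alternates source/destination inputs, so the even-indexed
        # tensors above select the source-node inputs; with the source output
        # they form a standalone node-embedding model)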

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
Example #4
    def initialize(self, **hyper_params):

        # Read hyperparameters, falling back to defaults when absent
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [10, 10])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.1)
        lr = hyper_params.get("lr", 1e-2)

        graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Test split
        edge_splitter_test = EdgeSplitter(graph)
        self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph_test)
        self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

        # Test iterators
        test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[
                keras.metrics.BinaryAccuracy(),
                keras.metrics.Recall(),
                keras.metrics.AUC(),
                keras.metrics.Precision(),
            ],
        )

        # return number of training and testing examples
        return edge_ids_train.shape[0],edge_ids_test.shape[0]
Example #5
    def train_clf(self, graph, L):
        '''
        Train a GraphSAGE model with the updated labeled pool L.
        Returns the newly trained model.
        '''
        train_targets = self.target_encoding.transform(
            self.df_targets.loc[L].to_dict("records"))
        train_gen = self.generator.flow(L, train_targets)

        gsage = GraphSAGE(layer_sizes=[32, 32],
                          generator=self.generator,
                          bias=True,
                          dropout=0.5)

        x_inp, x_out = gsage.build()
        predictions = layers.Dense(units=train_targets.shape[1],
                                   activation="softmax")(x_out)

        class_support = dict(Counter(self.df_targets.loc[L]["label"]))
        classes = sorted(self.data.class_labels)
        counts = [class_support.get(c, 0) for c in classes]
        weights = np.sum(counts) / np.array(counts)
        weighted_loss = self.weighted_categorical_crossentropy(weights)

        model = Model(inputs=x_inp, outputs=predictions)
        model.compile(
            optimizer=optimizers.Adam(lr=0.2),
            # loss=losses.categorical_crossentropy,
            loss=weighted_loss,
            metrics=["acc"],
        )

        # if not os.path.isdir("model_logs"):
        #     os.makedirs("model_logs")
        # es_callback = EarlyStopping(
        #     monitor="acc", patience=50
        # )  # patience is the number of epochs to wait before early stopping in case of no further improvement
        # mc_callback = ModelCheckpoint(
        #     "model_logs/best_model.h5", monitor="acc", save_best_only=True, save_weights_only=True
        # )

        history = model.fit_generator(
            train_gen,
            epochs=50,
            verbose=0,
            # shuffle must stay False: shuffling data here means shuffling the whole graph
            shuffle=False,
            # callbacks=[es_callback, mc_callback],
        )

        # model.load_weights("model_logs/best_model.h5")

        return model
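
The `weighted_categorical_crossentropy` helper used above is not shown in this example; a common Keras implementation it might correspond to (an assumption, not the original code):

from tensorflow.keras import backend as K

def weighted_categorical_crossentropy(weights):
    """Categorical crossentropy scaled by a fixed per-class weight vector."""
    w = K.constant(weights)

    def loss(y_true, y_pred):
        # normalize predictions and clip to avoid log(0)
        y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
        return -K.sum(y_true * K.log(y_pred) * w, axis=-1)

    return loss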
Example #6
    def initialize(self, **hyper_params):

        # Read hyperparameters, falling back to defaults when absent
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [20, 20])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.3)
        lr = hyper_params.get("lr", 1e-3)
        train_split = hyper_params.get("train_split", 0.2)

        self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph)
        graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=train_split, method="global", keep_connected=True)

        # Train iterators
        train_gen = GraphSAGELinkGenerator(graph_train, batch_size,
                                           num_samples)
        self.train_flow = train_gen.flow(edge_ids_train,
                                         edge_labels_train,
                                         shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(layer_sizes=layer_sizes,
                              generator=train_gen,
                              bias=bias,
                              dropout=dropout)

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=["acc"],
        )

        return self.model.get_weights()
Example #7
    def run_model(self):
        graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph()
        (train_targets, valid_targets, test_targets,
         train_labels, valid_labels, test_labels) = self.get_train_valid_test(
            label_series_sampled)

        batch_size = self.hyperparams["batch_size"]
        num_samples = self.hyperparams["num_samples"]
        generator = GraphSAGENodeGenerator(graph_sampled, batch_size,
                                           num_samples)
        train_gen = generator.flow(train_labels.index,
                                   train_targets,
                                   shuffle=True)
        graphsage_model = GraphSAGE(
            layer_sizes=self.hyperparams["layer_sizes"],
            generator=generator,
            bias=self.hyperparams["bias"],
            dropout=self.hyperparams["dropout"],
        )
        x_inp, x_out = graphsage_model.in_out_tensors()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=self.hyperparams["lr"]),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )

        valid_gen = generator.flow(valid_labels.index, valid_targets)

        history = model.fit(
            train_gen,
            epochs=self.hyperparams["n_epochs"],
            validation_data=valid_gen,
            verbose=self.hyperparams["verbose"],
            shuffle=True,
            use_multiprocessing=True,
        )

        sg.utils.plot_history(history)

        test_gen = generator.flow(test_labels.index, test_targets)
        test_metrics = model.evaluate(test_gen)
        print("\nTest Set Metrics:")
        for name, valid in zip(model.metrics_names, test_metrics):
            print("\t{}: {:0.4f}".format(name, valid))
Example #8
def graphsage_pipeline(G, node_subjects, layer_sizes=[32, 32]):
    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(train_subjects.index,
                               train_subjects.values,
                               shuffle=True)
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=generator,
        bias=True,
        dropout=0.5,
    )

    model = build_model(graphsage_model, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    es_callback = EarlyStopping(monitor="val_acc",
                                patience=50,
                                restore_best_weights=True)
    history = model.fit(train_gen,
                        epochs=200,
                        validation_data=val_gen,
                        verbose=0,
                        shuffle=False,
                        callbacks=[es_callback])

    plot_results(history)
    test_metrics(generator, model, test_subjects)
Example #9
def create_graphSAGE_model(graph, link_prediction=False):

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=train_gen,
                           bias=True,
                           dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes:
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        # re-pack into a list where (source, destination) inputs alternate, for link inputs:
        x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
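        # (the comprehension interleaves the lists:
        #  zip([s1, s2], [d1, d2]) -> [s1, d1, s2, d2])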
        # same for outputs:
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
Example #10
def create_graphsage(train_gen):
    return GraphSAGE(
        layer_sizes=config.LAYER_SIZES,
        generator=train_gen,
        bias=True,
        dropout=config.DROPOUT,
        normalize="l2",
    )
Example #11
def _dispatch_inductive_layer(layer_sizes, generator, model_name, params):
    if model_name == "attri2vec":
        embedding_layer = Attri2Vec(
            layer_sizes=layer_sizes,
            generator=generator,
            bias=False, normalize=None
        )
    elif model_name == "graphsage":
        embedding_layer = GraphSAGE(
            layer_sizes=layer_sizes,
            generator=generator,
            bias=True,
            dropout=0.0,
            normalize="l2"
        )
    else:
        raise ValueError(f"Unknown model name {model_name}")
    return embedding_layer
Example #12
def _fit_deep_graph_infomax(train_graph, params, model_name):
    """Train unsupervised Deep Graph Infomax."""
    if "gcn_dgi" in model_name or "gat_dgi" in model_name:
        if "cluster" in model_name:
            generator = ClusterNodeGenerator(
                train_graph, clusters=params["clusters"],
                q=params["clusters_q"])
        else:
            generator = FullBatchNodeGenerator(train_graph, sparse=False)

        if "gcn_dgi" in model_name:
            embedding_layer = GCN(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"], generator=generator)
        elif "gat_dgi" in model_name:
            embedding_layer = GAT(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"], generator=generator, attn_heads=8)
    elif model_name == "graphsage_dgi":
        generator = GraphSAGENodeGenerator(
            train_graph, batch_size=50, num_samples=[5])
        embedding_layer = GraphSAGE(
            layer_sizes=[params["embedding_dimension"]], activations=["relu"],
            generator=generator
        )
    else:
        raise ValueError(f"Unknown mode name {model_name}")

    embedding_model = _execute_deep_graph_infomax(
        train_graph, embedding_layer, generator, params)

    # Here the models can be both inductive and transductive
    if model_name in ["gcn_dgi", "gat_dgi", "graphsage_dgi"]:
        return embedding_model.predict(
            generator.flow(train_graph.nodes()))
    else:
        return embedding_model
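
The `_execute_deep_graph_infomax` helper is not shown above; a sketch of what it plausibly does, following StellarGraph's standard Deep Graph Infomax training pattern (CorruptedGenerator and DeepGraphInfomax are StellarGraph classes; the epoch count and optimizer settings here are assumptions):

import tensorflow as tf
from tensorflow.keras import Model, optimizers
from stellargraph.mapper import CorruptedGenerator
from stellargraph.layer import DeepGraphInfomax

def _execute_deep_graph_infomax(train_graph, embedding_layer, generator, params):
    # Pair each batch of real nodes with a corrupted (feature-shuffled) copy
    corrupted_generator = CorruptedGenerator(generator)
    train_flow = corrupted_generator.flow(train_graph.nodes())

    # DGI trains a discriminator between real and corrupted representations
    infomax = DeepGraphInfomax(embedding_layer, corrupted_generator)
    x_in, x_out = infomax.in_out_tensors()

    model = Model(inputs=x_in, outputs=x_out)
    model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                  optimizer=optimizers.Adam(learning_rate=1e-3))
    model.fit(train_flow, epochs=params.get("epochs", 100), verbose=0)

    # Re-expose the trained embedding layer as its own model
    # (full-batch layers like GCN/GAT may additionally need tf.squeeze on the output)
    x_emb_in, x_emb_out = embedding_layer.in_out_tensors()
    return Model(inputs=x_emb_in, outputs=x_emb_out)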
Example #13
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ids_train, edge_labels_train will be used for model training
    # G_test, edge_ids_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
Example #14
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
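    # e.g. [{"subject": "AI"}, {"subject": "ML"}] -> [[1., 0.], [0., 1.]]
    # (DictVectorizer creates one indicator column per "subject=<value>")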
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Example #15
    train_ids = nodes[:5000]
    test_ids = nodes[5000:]
    train_labels = [graph.nodes[id]["_class"] for id in train_ids]
    test_labels = [graph.nodes[id]["_class"] for id in test_ids]
    all_labels = train_labels + test_labels
    train_labels = np.array(train_labels).reshape(len(train_ids), 1)
    test_labels = np.array(test_labels).reshape(len(test_ids), 1)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))
    generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10, 10])
    train_data_gen = generator.flow(train_ids, train_labels)
    test_data_gen = generator.flow(test_ids, test_labels)
    all_gen = generator.flow(list(nodes), all_labels)

    print("Node Gen done!")
    base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
    x_in, x_out = base_model.build()
    prediction = layers.Dense(units=2, activation="softmax")(x_out)

    print("model building done")

    model = Model(inputs=x_in, outputs=prediction)
    # labels are integer class ids of shape (N, 1), so use the sparse loss
    model.compile(optimizer=optimizers.Adam(lr=0.005),
                  loss=losses.sparse_categorical_crossentropy, metrics=["acc"])
    tensorboard = callbacks.TensorBoard(log_dir="logs", embeddings_freq=1, update_freq=1, histogram_freq=1)
    tboard = model.fit(train_data_gen, epochs=4, validation_data=test_data_gen, verbose=True,
                       shuffle=False, callbacks=[tensorboard])
    print(tboard)
    print("prediction done")

    y_pred = model.predict(train_data_gen, verbose=1)
    labels = np.argmax(y_pred, axis=1)
Example #16
# The nodes of the training graph, together with the unsupervised sampler, will be used to generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size,
                                   num_samples).flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=train_gen,
                      bias=bias,
                      dropout=0.0,
                      normalize="l2")
x_inp, x_out = graphsage.build(flatten_output=False)
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# Train the model
history = model.fit_generator(
Example #17
# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)



# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3
)

x_inp, x_out = graphsage.in_out_tensors()

prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)
Example #18
unsupervised_samples = UnsupervisedSampler(G,
                                           nodes=nodes,
                                           length=length,
                                           number_of_walks=number_of_walks)

## ====================== create a node pair generator =======================
batch_size = 50
epochs = 30
num_samples = [10, 5]

generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)

layer_sizes = [50, 50]
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=generator,
                      bias=True,
                      dropout=0.0,
                      normalize="l2")

# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.in_out_tensors()

# link classification layer
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method="ip")(x_out)

# model training
model = Model(inputs=x_inp, outputs=prediction)

model.compile(
Example #19
def train_model(Gnx, train_data, test_data, all_features):
    output_results = {}
    from collections import Counter
    # TODO: save size of dataset, train_data, and test data
    # save the count of each subject in the blocks
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    #node_features = train_data[feature_names]
    #print (feature_names)
    G = sg.StellarGraph(Gnx, node_features=all_features)
    #TODO: save graph info
    print(G.info())
    print("writing graph.dot")
    #write_dot(Gnx,"graph.dot")
    output_results['graph_info'] = G.info()
    print("building the graph generator...")

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    #generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data["subject"].to_list()),
        y=train_data["subject"].to_list())
    # Keras expects class weights as a {class_index: weight} dict
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(test_data[["subject"
                                                        ]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)
    graphsage_model = GraphSAGE(
        #graphsage_model = HinSAGE(
        #layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,  #train_gen,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    #x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )
    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  #, "program":test_data['program']})
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))
    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    #pred_program = np.unique(df['program'].values)
    # save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
Example #20
def train(G_list,
          nodes_subjects_list,
          run_num=1,
          start_month_id=220,
          end_month_id=264):
    # Pre-define lists to record results; the outer-loop lists hold the inner-loop lists
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []

    # run_num: how many times to repeat training
    # start_month_id / end_month_id: range of months to train on

    # Create folders to save models, histories, figures, and test results
    for folder in ('model', 'history', 'figure',
                   'figure_distribution', 'test_result'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # The outer loop repeats training; multiple runs reduce variance
    # The inner loop iterates over the training months
    for j in range(run_num):
        num_samples = [40]

        # Pre-define lists to record the inner-loop results
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []

        # i == 0 corresponds to month 220
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()

            # Train on one month, validate on the following month
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2

            # Build the training generator from the train_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(list(
                nodes_subjects_list[train_idx].index),
                                       nodes_subjects_list[train_idx].values,
                                       shuffle=False)

            # Build the GraphSAGE model
            graphsage_model = GraphSAGE(layer_sizes=[1],
                                        generator=generator,
                                        bias=True,
                                        aggregator=sg.layer.MeanAggregator,
                                        normalize=None)

            # Extract the input/output tensors to build the model with Keras
            x_inp, x_out = graphsage_model.in_out_tensors()
            #         prediction = layers.Dense(units=1)(x_out)

            # Build the validation generator from the val_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(list(nodes_subjects_list[val_idx].index),
                                     nodes_subjects_list[val_idx].values)

            # Build the test generator from the test_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)

            # Build the model from the input/output tensors
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(monitor='val_loss',
                                    min_delta=1e-3,
                                    patience=10,
                                    verbose=2,
                                    mode='auto',
                                    restore_best_weights=True)
            model.compile(optimizer=optimizers.Adam(lr=0.05),
                          loss=losses.mean_squared_error,
                          metrics=[pearson_r])

            history = model.fit(train_gen,
                                epochs=500,
                                validation_data=val_gen,
                                verbose=0,
                                shuffle=False,
                                callbacks=[monitor])

            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            json.dump(
                test_result_dict,
                open(
                    'test_result/' + str(train_idx + 220) + "_" +
                    str(test_idx + 220) + '.json', 'w'))

            test_preds = model.predict(test_gen)

            end = time.time()

            # Save results
            graph_history_list.append(history)  # training history
            model_list.append(model)  # the model itself
            train_gen_list.append(train_gen)  # kept for later intermediate-layer computations
            time_list.append(end - start)  # elapsed time
            model_weight_list.append(model.weights)  # model weights
            test_result.append(test_metrics[1])

            # # Save the model
            # model.save('model/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.h5')
            # # Save the training history
            # json.dump(history.history,
            #           open('history/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.json', 'w'))
            # # Save the training-history figure
            # sg.utils.plot_history(history)
            # plt.title(str(train_idx + 220) + '->' + str(val_idx + 220))
            # plt.savefig('figure/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.png')
            # plt.show()
            # Save the distribution of the test predictions
            plt.figure(figsize=(5, 10))
            plt.subplot(211)
            plt.hist(test_preds, bins=500)
            plt.title("Distribution of Prediction of " + str(test_idx + 220))
            plt.subplot(212)
            plt.hist(nodes_subjects_list[test_idx].values, bins=500)
            plt.title("Distribution of Origin of " + str(test_idx + 220))
            plt.xlabel("ic=" + str(test_metrics[1]))
            plt.savefig('figure_distribution/distribution-' +
                        str(train_idx + 220) + "_" + str(test_idx + 220) +
                        '.png',
                        dpi=300)
            plt.show()

            print(str(i + 220) + "'s " + str(j + 1) + " run has finished")
            print()

        # Save the inner-loop results into the outer-loop lists
        graph_history_list_list.append(graph_history_list)
        model_list_list.append(model_list)
        train_gen_list_list.append(train_gen_list)
        time_list_list.append(time_list)
        model_weight_list_list.append(model_weight_list)

    return graph_history_list_list, model_list_list, train_gen_list_list, time_list_list, model_weight_list_list, test_result
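
`pearson_r` is used as a Keras metric above but is not defined in this snippet; a typical implementation it might correspond to (an assumption, not the original code):

from tensorflow.keras import backend as K

def pearson_r(y_true, y_pred):
    # Pearson correlation between targets and predictions, as a Keras metric
    x = y_true - K.mean(y_true)
    y = y_pred - K.mean(y_pred)
    denom = K.sqrt(K.sum(K.square(x)) * K.sum(K.square(y))) + K.epsilon()
    return K.sum(x * y) / denom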
Example #21
# on concatenated `(paper1, paper2)` node embeddings.
#
# GraphSAGE part of the model, with hidden layer sizes of 20 for both GraphSAGE layers, a bias term, and no dropout.
# (Dropout can be switched on by specifying a positive dropout rate, 0 < dropout < 1)
#
# Note that the length of layer_sizes list must be equal to the length of num_samples, as len(num_samples) defines
# the number of hops (layers) in the GraphSAGE model.

# In[17]:


layer_sizes = [20, 20]
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.5
)


# In[18]:


# Build the model and expose the input and output tensors.
x_inp, x_out = graphsage.build()


# Final link classification layer that takes a pair of node embeddings produced by graphsage, applies a binary
# operator to them to produce the corresponding link embedding ('ip' for inner product; other options for the binary
# operator can be seen by running a cell with `?link_classification` in it), and passes it through a dense layer:
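
As an illustration of the 'ip' (inner product) operator described above, the score for a candidate link is just the dot product of the two node embeddings passed through the output activation; a minimal numpy sketch (not StellarGraph code):

import numpy as np

def ip_link_score(emb_u, emb_v):
    # inner-product edge embedding followed by a sigmoid output activation
    return 1.0 / (1.0 + np.exp(-np.dot(emb_u, emb_v)))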

# In[19]:
Example #22
# Create data generator for our graph, specified by which type
# of model (GraphSAGE) and the learning task (Node) ...
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

print(train_subjects.index)

# Create an iterator for our training data; this takes the indices of the
# nodes in the graph to be used for training, as well as their respective
# one-hot encoded label vectors
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)

# Specify the graph-learning model

graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5,
    aggregator=MeanAggregator
)

# Extract the input and output tensors of the model. Set predictions
# of the model to be a softmax layer taking output tensor as its input.
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)
# To validate/test we need another generator for testing data, no shuffle needed
test_gen = generator.flow(test_subjects.index, test_targets)
history = model.fit(
Example #23
test_gen = generator.flow(test_subjects.index, test_targets)

# aggregator = MaxPoolingAggregator
# layer_sizes (list): hidden feature dimensions for each layer
# activations (list): activations applied to each layer's output


def get_dropout(input_tensor, p=0.1, mc=False):
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)


graphsage_model = GraphSAGE(layer_sizes=[64, 32, 16],
                            generator=generator,
                            activations=["relu", "relu", "linear"],
                            bias=True,
                            aggregator=MaxPoolingAggregator,
                            dropout=0.1)

x_inp, x_out = graphsage_model.in_out_tensors()
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = get_dropout(x_out, p=0.1, mc=True)  # keep dropout active at inference (MC dropout)
prediction = layers.Dense(units=train_targets.shape[1],
                          activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.summary()

##
# model.compile( optimizer=optimizers.Adam(), loss = noderankloss(), metrics=["acc"])
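
Since get_dropout(..., mc=True) keeps dropout active at prediction time, uncertainty can be estimated by repeated stochastic forward passes (Monte Carlo dropout); a sketch, with an arbitrary sample count:

import numpy as np

T = 20  # number of stochastic forward passes
mc_preds = np.stack([model.predict(test_gen) for _ in range(T)])
mean_pred = mc_preds.mean(axis=0)   # predictive mean
uncertainty = mc_preds.std(axis=0)  # per-class predictive spread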
Example #24
    def _train_model(self, gnx, train_data, test_data, all_features,
                     target_feature_name):
        subject_groups_train = Counter(train_data[target_feature_name])
        subject_groups_test = Counter(test_data[target_feature_name])

        graph = sg.StellarGraph(gnx, node_features=all_features)

        output_results = {
            'train_size': len(train_data),
            'test_size': len(test_data),
            'subject_groups_train': subject_groups_train,
            'subject_groups_test': subject_groups_test,
            'graph_info': graph.info()
        }

        num_samples = [10, 5]
        generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        train_targets = target_encoding.fit_transform(
            train_data[[target_feature_name]].to_dict('records'))
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(train_data[target_feature_name].to_list()),
            y=train_data[target_feature_name].to_list())
        class_weights = dict(enumerate(class_weights))
        test_targets = target_encoding.transform(
            test_data[[target_feature_name]].to_dict('records'))
        train_gen = generator.flow(train_data.index,
                                   train_targets,
                                   shuffle=True)
        graph_sage_model = GraphSAGE(
            layer_sizes=[80, 80],
            generator=generator,  # train_gen,
            bias=True,
            dropout=0.5,
        )
        print('building model...')

        x_inp, x_out = graph_sage_model.build()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        print('compiling model...')
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.005),
            loss=losses.categorical_crossentropy,
            metrics=['acc', metrics.categorical_accuracy],
        )
        print('testing the model...')
        test_gen = generator.flow(test_data.index, test_targets)
        history = model.fit(
            train_gen,
            epochs=self.num_epochs,
            validation_data=test_gen,
            verbose=2,
            shuffle=True,
            class_weight=class_weights,
        )
        # save test metrics
        test_metrics = model.evaluate(test_gen)
        print('Test Set Metrics:')
        output_results['test_metrics'] = []
        for name, val in zip(model.metrics_names, test_metrics):
            output_results['test_metrics'].append({'name': name, 'val': val})
            print("\t{}: {:0.4f}".format(name, val))

        test_nodes = test_data.index
        test_mapper = generator.flow(test_nodes)
        test_predictions = model.predict(test_mapper)
        node_predictions = target_encoding.inverse_transform(test_predictions)
        results = pd.DataFrame(node_predictions,
                               index=test_nodes).idxmax(axis=1)
        df = pd.DataFrame({
            'Predicted': results,
            'True': test_data[target_feature_name]
        })
        clean_result_labels = df['Predicted'].map(
            lambda x: x.replace('subject=', ''))

        # save predicted labels
        pred_labels = np.unique(clean_result_labels.values)
        precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
            df['True'].values,
            clean_result_labels.values,
            average=None,
            labels=pred_labels)
        output_results['classifier'] = []
        for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
            output_results['classifier'].append({
                'label': lbl,
                'precision': prec,
                'recall': rec,
                'fscore': fm
            })

        print(output_results['classifier'])
        print(pred_labels)
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(f1))

        output_results['history'] = {
            'epochs': history.epoch,
            'training_log': history.history,
            'training_params': history.params
        }

        return generator, model, x_inp, x_out, history, target_encoding, output_results