Ejemplo n.º 1
0
    def preprocess_train(self, node_ids):
        """
        Preprocess the training set and return a node generator for it.

        Args:
            node_ids (list): node IDs to train on (must all exist in self.df)

        Returns:
            NodeSequenceWrapper: wrapped StellarGraph node sequence yielding
            (features, one-hot targets) batches for training

        Raises:
            ValueError: if any node ID is not present in self.df
            Exception: if stellargraph is missing or older than 0.8
        """
        if not self.ids_exist(node_ids): raise ValueError('node_ids must exist in self.df')

        # subset df for training nodes
        df_tr = self.df[self.df.index.isin(node_ids)]

        # one-hot-encode target; the fitted encoder is stored on self so
        # preprocess_valid/preprocess_test can reuse the same encoding
        self.y_encoding = sklearn.feature_extraction.DictVectorizer(sparse=False)
        train_targets = self.y_encoding.fit_transform(df_tr[["target"]].to_dict('records'))

        # import stellargraph
        # NOTE: catch only ImportError -- the original bare "except:" would
        # also swallow KeyboardInterrupt/SystemExit and mask unrelated errors
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)

        # build the StellarGraph once and cache it for later calls
        G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
        self.G_sg = G_sg
        generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
        train_gen = generator.flow(df_tr.index, train_targets, shuffle=True)
        from .node_generator import NodeSequenceWrapper
        return NodeSequenceWrapper(train_gen)
Ejemplo n.º 2
0
def graphsage_pipeline(G, node_subjects, layer_sizes=None):
    """
    Train and evaluate a GraphSAGE node-classification model on graph G.

    Args:
        G: StellarGraph to train on
        node_subjects: pandas Series of node labels indexed by node ID
        layer_sizes: hidden-layer sizes for GraphSAGE (defaults to [32, 32])
    """
    # avoid a mutable default argument; keep the original default value
    if layer_sizes is None:
        layer_sizes = [32, 32]

    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    batch_size = 50
    num_samples = [10, 5]  # neighbours sampled at hop 1 and hop 2
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(train_subjects.index,
                               train_subjects.values,
                               shuffle=True)
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=generator,
        bias=True,
        dropout=0.5,
    )

    model = build_model(graphsage_model, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    # NOTE(review): "val_acc" is the TF1-era metric name; newer Keras reports
    # "val_accuracy" -- confirm against the metric names used in build_model
    es_callback = EarlyStopping(monitor="val_acc",
                                patience=50,
                                restore_best_weights=True)
    history = model.fit(train_gen,
                        epochs=200,
                        validation_data=val_gen,
                        verbose=0,
                        shuffle=False,  # ordering is handled by the generator
                        callbacks=[es_callback])

    plot_results(history)
    test_metrics(generator, model, test_subjects)
Ejemplo n.º 3
0
    def preprocess_valid(self, node_ids):
        """
        Preprocess validation nodes (transductive inference).

        Args:
            node_ids (list): list of node IDs that generator will yield
                (must all exist in self.df)

        Returns:
            NodeSequenceWrapper: node sequence yielding validation batches

        Raises:
            ValueError: if any node ID is not present in self.df
            Exception: if preprocess_train was not called first, or
                stellargraph is missing or older than 0.8
        """
        if not self.ids_exist(node_ids): raise ValueError('node_ids must exist in self.df')
        if self.y_encoding is None:
            raise Exception('Unset parameters. Are you sure you called preprocess_train first?')

        # subset df for validation nodes
        df_val = self.df[self.df.index.isin(node_ids)]

        # one-hot-encode target with the encoder fitted in preprocess_train
        val_targets = self.y_encoding.transform(df_val[["target"]].to_dict('records'))

        # import stellargraph
        # NOTE: catch only ImportError -- the original bare "except:" would
        # also swallow KeyboardInterrupt/SystemExit and mask unrelated errors
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)

        # reuse the StellarGraph cached by preprocess_train when available
        if self.G_sg is None:
            self.G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
        generator = GraphSAGENodeGenerator(self.G_sg, U.DEFAULT_BS, [self.sampsize,self.sampsize])
        val_gen = generator.flow(df_val.index, val_targets, shuffle=False)
        from .node_generator import NodeSequenceWrapper
        return NodeSequenceWrapper(val_gen)
Ejemplo n.º 4
0
def test(edgelist, node_data, model_file, batch_size, target_name="subject"):
    """
    Load the serialized model and evaluate on all nodes in the graph.

    Args:
        edgelist: Graph edgelist (pandas DataFrame of source/target pairs)
        node_data: DataFrame holding node features and the target column
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
        target_name: Name of the target (label) column in node_data
    """
    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    # NOTE(review): feature_names is assumed to be a module-level global -- confirm
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # We must also save the target encoding to convert model predictions
    # (the encoder was pickled next to the model by the training script)
    encoder_file = model_file.replace(
        "cora_example_model", "cora_example_encoding"
    ).replace(".h5", ".pkl")
    with open(encoder_file, "rb") as f:
        target_encoding = pickle.load(f)[0]

    # Encode targets with pre-trained encoder
    node_targets = target_encoding.transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_features=node_features)

    # Load Keras model; MeanAggregator is a custom layer and must be registered
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )
    print("Loaded model:")
    model.summary()

    # Get required samples from model
    # TODO: Can we move this to the library?
    # infer the per-hop sample counts from the ratios of the model's input shapes
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(len(model.input_shape) - 1)
    ]

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(
        G, batch_size, num_samples, seed=42
    )
    all_gen = generator.flow(node_ids, node_targets)

    # Evaluate and print metrics
    all_metrics = model.evaluate_generator(all_gen)

    print("\nAll-node Evaluation:")
    for name, val in zip(model.metrics_names, all_metrics):
        print("\t{}: {:0.4f}".format(name, val))
Ejemplo n.º 5
0
    def preprocess_test(self, df_te, G_te):
        """
        ```
        preprocess for inductive inference
        df_te (DataFrame): pandas dataframe containing new node attributes
        G_te (Graph):  a networkx Graph containing new nodes
        ```

        Returns a NodeSequenceWrapper over the new (test) nodes, built on the
        aggregate of the training graph and G_te.
        """
        try:
            import networkx as nx
        except ImportError:
            raise ImportError("Please install networkx:  pip install networkx")

        if self.y_encoding is None:
            raise Exception(
                "Unset parameters. Are you sure you called preprocess_train first?"
            )

        # aggregate old and new node attributes
        df_agg = pd.concat([df_te, self.df])

        # aggregate old and new graphs; the training graph must be a
        # subgraph of G_te for inductive inference to make sense
        is_subset = set(self.G.nodes()) <= set(G_te.nodes())
        if not is_subset:
            raise ValueError("Nodes in self.G must be subset of G_te")
        G_agg = nx.compose(self.G, G_te)

        # one-hot-encode target using the encoder fitted in preprocess_train;
        # unlabeled test nodes get a dummy target of -1
        if "target" in df_te.columns:
            test_targets = self.y_encoding.transform(
                df_te[["target"]].to_dict("records"))
        else:
            # BUG FIX: original was len(df_te.shape[0]), which raises
            # TypeError (len() of an int); the row count itself is needed
            test_targets = [-1] * df_te.shape[0]

        # import stellargraph
        # NOTE: catch only ImportError so unrelated failures are not masked
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse("0.8"):
            raise Exception(SG_ERRMSG)

        # return generator over the aggregated graph
        G_sg = sg.StellarGraph(G_agg, node_features=df_agg[self.feature_names])
        generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS,
                                           [self.sampsize, self.sampsize])
        test_gen = generator.flow(df_te.index, test_targets, shuffle=False)
        from .sg_wrappers import NodeSequenceWrapper

        return NodeSequenceWrapper(test_gen)
Ejemplo n.º 6
0
def test_graphsage_constructor():
    """Exercise GraphSAGE constructor validation and generator handling."""
    # direct construction from explicit dimensions
    gs = GraphSAGE(layer_sizes=[4],
                   n_samples=[2],
                   input_dim=2,
                   normalize="l2",
                   multiplicity=1)
    assert gs.dims == [2, 4]
    assert gs.n_samples == [2]
    assert gs.max_hops == 1
    assert gs.bias
    assert len(gs._aggs) == 1

    # Check incorrect normalization flag
    with pytest.raises(ValueError):
        GraphSAGE(
            layer_sizes=[4],
            n_samples=[2],
            input_dim=2,
            normalize=lambda x: x,
            multiplicity=1,
        )

    with pytest.raises(ValueError):
        GraphSAGE(
            layer_sizes=[4],
            n_samples=[2],
            input_dim=2,
            normalize="unknown",
            multiplicity=1,
        )

    # Check requirement for generator or n_samples
    with pytest.raises(KeyError):
        GraphSAGE(layer_sizes=[4])

    # Construction from generator
    G = example_graph(feature_size=3)
    gen = GraphSAGENodeGenerator(G, batch_size=2, num_samples=[2, 2])
    gs = GraphSAGE(layer_sizes=[4, 8], generator=gen, bias=True)

    # The GraphSAGE should no longer accept a Sequence
    t_gen = gen.flow([1, 2])
    with pytest.raises(TypeError):
        gs = GraphSAGE(layer_sizes=[4, 8], generator=t_gen, bias=True)

    # NOTE: the failed construction inside pytest.raises leaves gs unchanged,
    # so these assertions check the generator-built model from above
    assert gs.dims == [3, 4, 8]
    assert gs.n_samples == [2, 2]
    assert gs.max_hops == 2
    assert gs.bias
    assert len(gs._aggs) == 2
Ejemplo n.º 7
0
    def run_model(self):
        """
        Build, train and evaluate a GraphSAGE node-classification model
        using the hyperparameters stored on this instance.
        """
        graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph(
        )
        (train_targets, valid_targets, test_targets,
         train_labels, valid_labels, test_labels) = self.get_train_valid_test(
             label_series_sampled)

        params = self.hyperparams
        generator = GraphSAGENodeGenerator(graph_sampled,
                                           params["batch_size"],
                                           params["num_samples"])
        train_flow = generator.flow(train_labels.index,
                                    train_targets,
                                    shuffle=True)

        # GraphSAGE encoder followed by a softmax classification head
        graphsage_model = GraphSAGE(
            layer_sizes=params["layer_sizes"],
            generator=generator,
            bias=params["bias"],
            dropout=params["dropout"],
        )
        x_inp, x_out = graphsage_model.in_out_tensors()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=params["lr"]),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )

        valid_flow = generator.flow(valid_labels.index, valid_targets)

        history = model.fit(
            train_flow,
            epochs=params["n_epochs"],
            validation_data=valid_flow,
            verbose=params["verbose"],
            shuffle=True,
            use_multiprocessing=True,
        )

        sg.utils.plot_history(history)

        # final evaluation on the held-out test nodes
        test_flow = generator.flow(test_labels.index, test_targets)
        scores = model.evaluate(test_flow)
        print("\nTest Set Metrics:")
        for metric_name, metric_value in zip(model.metrics_names, scores):
            print("\t{}: {:0.4f}".format(metric_name, metric_value))
Ejemplo n.º 8
0
def _dispatch_generator(graph, model_name, params,
                        generator_type="node"):
    """Create a graph generator."""
    if model_name == "watchyourstep":
        return AdjacencyPowerGenerator(
            graph, num_powers=params["num_powers"])
    elif model_name in ["complex", "distmult"]:
        return KGTripleGenerator(graph, params["batch_size"])
    elif model_name == "attri2vec":
        if generator_type == "node":
            return Attri2VecNodeGenerator(
                graph, params["batch_size"])
        else:
            return Attri2VecLinkGenerator(
                graph, params["batch_size"])
    elif model_name in ["graphsage", "graphsage_dgi"]:
        if generator_type == "node":
            return GraphSAGENodeGenerator(
                graph, params["batch_size"], params["num_samples"])
        else:
            return GraphSAGELinkGenerator(
                graph, params["batch_size"], params["num_samples"])
    elif model_name in ["gcn_dgi", "gat_dgi"]:
        return FullBatchNodeGenerator(graph, sparse=False)
    elif model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
        return ClusterNodeGenerator(
            graph, clusters=params["clusters"],
            q=params["clusters_q"])
    else:
        raise ValueError(f"Unknown model name '{model_name}'")
Ejemplo n.º 9
0
    def initialize(self, **hyper_params):
        """
        Build the unsupervised GraphSAGE link-prediction model and the
        node-embedding model from self.nodes_df / self.edges_df.

        Accepted hyper_params keys (with defaults):
            batch_size (16), num_samples ([25, 10]), layer_sizes ([256, 256]),
            bias (True), dropout (0.0), lr (1e-3), num_walks (1), length (5)

        Returns:
            list: the initial weights of the compiled Keras model
        """
        # BUG FIX: the original assigned defaults only when a key was MISSING
        # and never read supplied values (NameError whenever a key was
        # present); it also swapped the num_samples/layer_sizes defaults.
        batch_size = hyper_params.get("batch_size", 16)
        num_samples = hyper_params.get("num_samples", [25, 10])
        layer_sizes = hyper_params.get("layer_sizes", [256, 256])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.0)
        lr = hyper_params.get("lr", 1e-3)
        num_walks = hyper_params.get("num_walks", 1)
        length = hyper_params.get("length", 5)

        self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        # free the source dataframes now that the graph owns the data
        del self.nodes_df
        del self.edges_df

        # random-walk sampler producing (target, context) training pairs
        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        # binary link classifier on the inner product of node embeddings
        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        # even-indexed inputs / first output correspond to the source nodes;
        # reuse them as a standalone embedding model
        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
Ejemplo n.º 10
0
    def test_gat_build_constructor_wrong_generator(self):
        """GAT must reject a generator of the wrong type (GraphSAGE's)."""
        G = example_graph(feature_size=self.F_in)
        gen = GraphSAGENodeGenerator(G, self.N, [5, 10])

        # test error where generator is of the wrong type for GAT:
        with pytest.raises(TypeError):
            gat = GAT(
                layer_sizes=self.layer_sizes,
                activations=self.activations,
                attn_heads=self.attn_heads,
                bias=True,
                generator=gen,
            )
Ejemplo n.º 11
0
def create_graphSAGE_model(graph, link_prediction=False):
    """
    Construct a small GraphSAGE model for either node classification
    (default) or link prediction on the given graph.

    Returns:
        tuple: (base_model, keras_model, generator, train_gen)
    """
    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=train_gen,
                           bias=True,
                           dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and
        # destination nodes, then interleave (source, destination) inputs
        # as expected for link inputs:
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        x_inp = [tensor for pair in zip(x_inp_src, x_inp_dst) for tensor in pair]
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)
    return base_model, keras_model, generator, train_gen
Ejemplo n.º 12
0
def _fit_deep_graph_infomax(train_graph, params, model_name):
    """Train unsupervised Deep Graph Infomax."""
    if "gcn_dgi" in model_name or "gat_dgi" in model_name:
        if "cluster" in model_name:
            generator = ClusterNodeGenerator(
                train_graph, clusters=params["clusters"],
                q=params["clusters_q"])
        else:
            generator = FullBatchNodeGenerator(train_graph, sparse=False)

        if "gcn_dgi" in model_name:
            embedding_layer = GCN(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"], generator=generator)
        elif "gat_dgi" in model_name:
            embedding_layer = GAT(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"], generator=generator, attn_heads=8)
    elif model_name == "graphsage_dgi":
        generator = GraphSAGENodeGenerator(
            train_graph, batch_size=50, num_samples=[5])
        embedding_layer = GraphSAGE(
            layer_sizes=[params["embedding_dimension"]], activations=["relu"],
            generator=generator
        )
    else:
        raise ValueError(f"Unknown mode name {model_name}")

    embedding_model = _execute_deep_graph_infomax(
        train_graph, embedding_layer, generator, params)

    # Here the models can be both inductive and transductive
    if model_name in ["gcn_dgi", "gat_dgi", "graphsage_dgi"]:
        return embedding_model.predict(
            generator.flow(train_graph.nodes()))
    else:
        return embedding_model
    labels_sampled,
    train_size=0.05,
    test_size=None,
    stratify=labels_sampled,
    random_state=42,
)

# Turn labels into one-hot encodings
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_labels)
# reuse the encoder fitted on the training labels for validation
val_targets = target_encoding.transform(val_labels)

# Create a node generator for undirected graph
batch_size = 50
num_samples = [10, 10]  # neighbours sampled at each of the two hops
generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)

# create iterator for training data
train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

# Make graphsage model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
)
# expose input/output tensors and attach a softmax classification head
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1],
                          activation="softmax")(x_out)
Ejemplo n.º 14
0
from tensorflow.keras.models import load_model
import pickle
import scipy.io as io

## ########################################### build graph ################################################
#%% ############################################################################################################

# wrap the networkx graph g (node attribute "feature" holds the features)
G = StellarGraph.from_networkx(g, node_features="feature")
print(G.info())

#%% #################################### Graphsage Model loading ###########################################
#%% ############################################################################################################

batch_size = 70
num_samples = [15, 10, 5, 5]  # neighbours sampled at each of the four hops
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

# per-node targets (column 'btw' -- presumably betweenness centrality; confirm)
targets = np.array(targetdf['btw'])

test_gen = generator.flow(targetdf.index, targets)

# index pairs used by the pairwise ranking loss defined below
indices = bf.expandy(batch_size, 2)


def noderankloss(index):
    def loss(y_true, y_pred):
        # tf.print(tf.gather(y_true, tf.constant(index[:, 0])))

        yt = tf.math.sigmoid(
            tf.gather(y_true, tf.constant(index[:, 0])) -
            tf.gather(y_true, tf.constant(index[:, 1])))
Ejemplo n.º 15
0
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
        target_name: Name of the target (label) column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    # NOTE(review): feature_names is assumed to be a module-level global -- confirm
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    # (fit_generator/evaluate_generator/predict_generator are the TF1-era
    # Keras generator APIs used by this example's pinned versions)
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    # DictVectorizer produced column names like "subject=<label>", so compare
    # the prefixed ground-truth label against the argmax column name
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model; the filename encodes the hyperparameters
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Ejemplo n.º 16
0
def train(G_list,
          nodes_subjects_list,
          run_num=1,
          start_month_id=220,
          end_month_id=264):
    """
    Train a one-layer GraphSAGE regression model per month and evaluate it.

    For each run j and each month index i (month ids start at 220), the
    model is trained on month i, validated on month i + 1 and tested on
    month i + 2.  Multiple runs are performed to reduce variance.

    Args:
        G_list: list of StellarGraph graphs, one per month
        nodes_subjects_list: list of pandas Series of per-node targets,
            aligned with G_list
        run_num: number of repeated runs
        start_month_id: first month id to train on
        end_month_id: upper bound (exclusive) for training month ids

    Returns:
        tuple: (graph_history_list_list, model_list_list,
                train_gen_list_list, time_list_list,
                model_weight_list_list, test_result) where each *_list_list
        holds one inner list per run and test_result holds the last run's
        per-month test metric.
    """
    # outer (per-run) accumulators; each holds one inner list per run
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []

    # create output folders for models, histories, figures and test results
    for folder in ('model', 'history', 'figure', 'figure_distribution',
                   'test_result'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # outer loop: repeated runs (to reduce variance)
    # inner loop: the training months
    for j in range(run_num):
        num_samples = [40]

        # per-month accumulators for this run
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []

        # i == 0 corresponds to month id 220
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()

            # train on month i, validate on the next month, test on the one after
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2

            # training generator built from train_idx's graph
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(
                list(nodes_subjects_list[train_idx].index),
                nodes_subjects_list[train_idx].values,
                shuffle=False)

            # one-layer GraphSAGE regression model (linear output)
            graphsage_model = GraphSAGE(layer_sizes=[1],
                                        generator=generator,
                                        bias=True,
                                        aggregator=sg.layer.MeanAggregator,
                                        normalize=None)

            # extract the input/output tensors to build a Keras model
            x_inp, x_out = graphsage_model.in_out_tensors()

            # validation generator built from val_idx's graph
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(list(nodes_subjects_list[val_idx].index),
                                     nodes_subjects_list[val_idx].values)

            # test generator built from test_idx's graph
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)

            # assemble and compile the Keras model with early stopping
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(monitor='val_loss',
                                    min_delta=1e-3,
                                    patience=10,
                                    verbose=2,
                                    mode='auto',
                                    restore_best_weights=True)
            model.compile(optimizer=optimizers.Adam(lr=0.05),
                          loss=losses.mean_squared_error,
                          metrics=[pearson_r])

            history = model.fit(train_gen,
                                epochs=500,
                                validation_data=val_gen,
                                verbose=0,
                                shuffle=False,
                                callbacks=[monitor])

            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            # FIX: close the JSON file handle (was json.dump(..., open(...)))
            with open(
                    'test_result/' + str(train_idx + 220) + "_" +
                    str(test_idx + 220) + '.json', 'w') as fout:
                json.dump(test_result_dict, fout)

            test_preds = model.predict(test_gen)

            end = time.time()

            # record per-month results
            graph_history_list.append(history)  # training history
            model_list.append(model)  # trained model
            train_gen_list.append(train_gen)  # kept for inspecting intermediate layers later
            time_list.append(end - start)  # wall-clock time for this month
            model_weight_list.append(model.weights)  # model parameters
            test_result.append(test_metrics[1])

            # plot the distribution of test predictions vs. the ground truth
            plt.figure(figsize=(5, 10))
            plt.subplot(211)
            plt.hist(test_preds, bins=500)
            plt.title("Distribution of Prediction of " + str(test_idx + 220))
            plt.subplot(212)
            plt.hist(nodes_subjects_list[test_idx].values, bins=500)
            plt.title("Distribution of Origin of " + str(test_idx + 220))
            plt.xlabel("ic=" + str(test_metrics[1]))
            plt.savefig('figure_distribution/distribution-' +
                        str(train_idx + 220) + "_" + str(test_idx + 220) +
                        '.png',
                        dpi=300)
            plt.show()

            print(str(i + 220) + "'s " + str(j + 1) + " run has finished")
            print()

        # record this run's results
        graph_history_list_list.append(graph_history_list)
        model_list_list.append(model_list)
        train_gen_list_list.append(train_gen_list)
        time_list_list.append(time_list)
        model_weight_list_list.append(model_weight_list)

    # BUG FIX: the original "return" sat inside the run loop, so only the
    # first of run_num runs ever executed; return after all runs finish.
    return graph_history_list_list, model_list_list, train_gen_list_list, time_list_list, model_weight_list_list, test_result
    graph = nx.node_link_graph(data)
    G = StellarGraph.from_networkx(graph, node_features="feature")
    print(G.node_types)
    G.check_graph_for_ml()
    nodes = [node for node in graph.nodes]
    shuffle(nodes)
    train_ids = nodes[:5000]
    test_ids = nodes[5000:]
    train_labels= [graph.nodes[id]["_class"] for id in train_ids]
    test_labels = [graph.nodes[id]["_class"] for id in test_ids]
    all_labels = train_labels + test_labels
    train_labels = np.array(train_labels).reshape(len(train_ids),1)
    test_labels = np.array(test_labels).reshape(len(test_ids), 1)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))
    generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10,10])
    train_data_gen = generator.flow(train_ids, train_labels)
    test_data_gen = generator.flow(test_ids, test_labels)
    all_gen = generator.flow(list(nodes), all_labels)

    print("Node Gen done!")
    base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
    x_in, x_out = base_model.build()
    prediction = layers.Dense(units=2, activation="softmax")(x_out)

    print("model building done")

    model = Model(inputs=x_in, outputs = prediction)
    model.compile(optimizer=optimizers.Adam(lr=0.005), loss=losses.categorical_crossentropy, metrics=["acc"])
    tensorboard = callbacks.TensorBoard(log_dir="logs",embeddings_freq=1, update_freq=1, histogram_freq=1)
    tboard = model.fit(train_data_gen, epochs=4, validation_data=test_data_gen, verbose=True,
Ejemplo n.º 18
0
    verbose=verbose,
    use_multiprocessing=False,
    workers=nworkers,
    shuffle=True,
)

## Get embeddings for all nodes

# Build a new node-based model
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

# The node generator feeds graph nodes to `embedding_model`. We want to evaluate node embeddings for all nodes in the graph:
node_ids = sorted(G.nodes)
node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_ids)
emb = embedding_model.predict_generator(node_gen,
                                        workers=nworkers,
                                        verbose=verbose)
node_embeddings = emb[:, 0, :]

if testtype == 'nodes':
    ## Node classification
    X = node_embeddings
    y = np.where(dataset['labels'])[1]

    # Train a Logistic Regression classifier on the training data.
    X_train, X_test, y_train, y_test = X[nodes_train, :], X[
        nodes_test, :], y[nodes_train], y[nodes_test]
    clf = LogisticRegression(verbose=verbose,
                             solver='liblinear',
    # node_data["feature"] = [g.degree(node_id), nx.average_neighbor_degree(g, nodes=[node_id])[node_id], 1, 1, 1]
    node_data["feature"] = [g.degree(node_id), 1, 1, 1]

## ############################################################################################################

# Wrap the networkx graph `g` as a StellarGraph, pulling per-node feature
# vectors from the "feature" attribute set on each node above.
G = StellarGraph.from_networkx(g, node_features="feature")
print(G.info())

# Regression/classification targets come straight from `targetdf`
# (presumably one row per node, aligned with G's node order — TODO confirm).
test_targets = np.array(targetdf)

## #################################### Graphsage Model building ###########################################
#%% ############################################################################################################

batch_size = 20
# Neighbours sampled per hop: a 4-layer GraphSAGE neighbourhood of
# 15 -> 10 -> 5 -> 5 nodes.
num_samples = [15, 10, 5, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)


def noderankloss(index):
    def loss(y_true, y_pred):
        # tf.print(tf.gather(y_true, tf.constant(index[:, 0])))

        yt = tf.math.sigmoid(
            tf.gather(y_true, tf.constant(index[:, 0])) -
            tf.gather(y_true, tf.constant(index[:, 1])))
        yp = tf.math.sigmoid(
            tf.gather(y_pred, tf.constant(index[:, 0])) -
            tf.gather(y_pred, tf.constant(index[:, 1])))
        # tf.print(tf.shape(yt))
        onetensor = tf.ones(shape=tf.shape(yt))
        # tempmatrix = (-1)*K.dot(yt, tf.math.log(tf.transpose(yp))) - K.dot((onetensor - yt),
Ejemplo n.º 20
0
def train_model(Gnx, train_data, test_data, all_features):
    """Train and evaluate a GraphSAGE node classifier on ``Gnx``.

    Parameters
    ----------
    Gnx : networkx graph whose nodes carry the features in ``all_features``.
    train_data, test_data : pandas DataFrames indexed by node id, each with
        a categorical ``subject`` column used as the prediction target.
    all_features : node-feature table passed to ``sg.StellarGraph``.

    Returns
    -------
    tuple
        ``(generator, model, x_inp, x_out, history, target_encoding,
        output_results)`` — unchanged shape, so existing callers keep working.
    """
    from collections import Counter

    output_results = {}
    # Record split sizes and per-class counts for later reporting.
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    G = sg.StellarGraph(Gnx, node_features=all_features)
    print(G.info())
    print("writing graph.dot")
    #write_dot(Gnx,"graph.dot")
    output_results['graph_info'] = G.info()
    print("building the graph generator...")

    batch_size = 50
    num_samples = [10, 5]  # neighbours sampled per hop (2-hop GraphSAGE)
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

    # One-hot encode the string targets ("subject=<class>" columns).
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    # Modern scikit-learn requires keyword arguments here, and Keras expects
    # class weights as a {class_index: weight} dict rather than an array —
    # same form as _train_model elsewhere in this file.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data["subject"].to_list()),
        y=train_data["subject"].to_list())
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(test_data[["subject"
                                                        ]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)
    graphsage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        # `learning_rate` replaces the deprecated `lr` argument.
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    # Model.fit accepts generators directly; fit_generator / evaluate_generator
    # / predict_generator were removed in TensorFlow 2.6+.
    history = model.fit(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )
    # save test metrics
    test_metrics = model.evaluate(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        # NOTE: the key is spelled 'val:' (with a colon) on purpose here —
        # existing consumers of output_results read it that way.
        output_results['test_metrics'].append({'name': name, 'val:': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  #, "program":test_data['program']})
    # DictVectorizer emits labels like 'subject=<class>'; strip the prefix.
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))
    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    # per-label precision/recall/F1 (average=None keeps one row per label)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
Ejemplo n.º 21
0
 def build_generator(self):
     """Create a GraphSAGE node generator over this object's graph.

     Uses a batch size of 50 and samples 10 first-hop and 5 second-hop
     neighbours per target node.
     """
     return GraphSAGENodeGenerator(self.SG, 50, [10, 5])
# temp_train_subjects = np.reshape(np.array(train_subjects), (train_subjects.shape[0],1))
# temp_test_subjects = np.reshape(np.array(test_subjects), (test_subjects.shape[0],1))
# train_targets = target_encoding.fit_transform(temp_train_subjects).toarray()
# test_targets = target_encoding.transform(temp_test_subjects).toarray()

# Targets are used as-is (no one-hot encoding; the DictVectorizer variant
# is kept commented out above).
train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)

## #################################### Graphsage Model building ###########################################
#%% ############################################################################################################

batch_size = 40
# number of nodes to consider for each hop
num_samples = [15, 10, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

# Flows pair node ids with their targets; only the training flow shuffles.
train_gen = generator.flow(
    train_subjects.index, train_targets,
    shuffle=True)  # train_subjects.index for selecting training nodes
test_gen = generator.flow(test_subjects.index, test_targets)

# aggregatortype = MaxPoolingAggregator(),
# layer_sizes (list): Hidden feature dimensions for each layer. activations (list): Activations applied to each layer's output;


def get_dropout(input_tensor, p=0.1, mc=False):
    """Attach a ``Dropout(p)`` layer to ``input_tensor``.

    When ``mc`` is True the layer is called with ``training=True`` so it
    stays active at inference time (Monte-Carlo dropout); otherwise the
    standard Keras train-only behaviour applies.
    """
    layer = Dropout(p)
    return layer(input_tensor, training=True) if mc else layer(input_tensor)
Ejemplo n.º 23
0
    def _train_model(self, gnx, train_data, test_data, all_features,
                     target_feature_name):
        """Train and evaluate a GraphSAGE node classifier.

        Builds a StellarGraph from ``gnx``, trains a 2-layer GraphSAGE model
        on the nodes in ``train_data`` to predict ``target_feature_name``,
        and evaluates it on ``test_data``.

        Reads ``self.batch_size`` and ``self.num_epochs``.

        Returns a tuple ``(generator, model, x_inp, x_out, history,
        target_encoding, output_results)`` where ``output_results`` collects
        split sizes, graph info, test metrics, per-label classifier scores
        and the training history.
        """
        # Per-class counts of each split, kept for reporting.
        subject_groups_train = Counter(train_data[target_feature_name])
        subject_groups_test = Counter(test_data[target_feature_name])

        graph = sg.StellarGraph(gnx, node_features=all_features)

        output_results = {
            'train_size': len(train_data),
            'test_size': len(test_data),
            'subject_groups_train': subject_groups_train,
            'subject_groups_test': subject_groups_test,
            'graph_info': graph.info()
        }

        # Neighbours sampled per hop (2-hop GraphSAGE neighbourhood).
        num_samples = [10, 5]
        generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

        # One-hot encode the categorical target column.
        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        train_targets = target_encoding.fit_transform(
            train_data[[target_feature_name]].to_dict('records'))
        # Balanced class weights, converted to the {class_index: weight}
        # dict form that Keras' fit() expects.
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(train_data[target_feature_name].to_list()),
            y=train_data[target_feature_name].to_list())
        class_weights = dict(enumerate(class_weights))
        test_targets = target_encoding.transform(
            test_data[[target_feature_name]].to_dict('records'))
        train_gen = generator.flow(train_data.index,
                                   train_targets,
                                   shuffle=True)
        graph_sage_model = GraphSAGE(
            layer_sizes=[80, 80],
            generator=generator,  # train_gen,
            bias=True,
            dropout=0.5,
        )
        print('building model...')

        # Dense softmax head on top of the GraphSAGE embeddings.
        x_inp, x_out = graph_sage_model.build()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        print('compiling model...')
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.005),
            loss=losses.categorical_crossentropy,
            metrics=['acc', metrics.categorical_accuracy],
        )
        print('testing the model...')
        # The test flow doubles as validation data during training.
        test_gen = generator.flow(test_data.index, test_targets)
        history = model.fit(
            train_gen,
            epochs=self.num_epochs,
            validation_data=test_gen,
            verbose=2,
            shuffle=True,
            class_weight=class_weights,
        )
        # save test metrics
        test_metrics = model.evaluate(test_gen)
        print('Test Set Metrics:')
        output_results['test_metrics'] = []
        for name, val in zip(model.metrics_names, test_metrics):
            # NOTE: key is spelled 'val:' (with a colon) — preserved as-is.
            output_results['test_metrics'].append({'name': name, 'val:': val})
            print("\t{}: {:0.4f}".format(name, val))

        # Predict on the test nodes and decode the one-hot output back to
        # label strings via the fitted DictVectorizer.
        test_nodes = test_data.index
        test_mapper = generator.flow(test_nodes)
        test_predictions = model.predict(test_mapper)
        node_predictions = target_encoding.inverse_transform(test_predictions)
        results = pd.DataFrame(node_predictions,
                               index=test_nodes).idxmax(axis=1)
        df = pd.DataFrame({
            'Predicted': results,
            'True': test_data[target_feature_name]
        })
        # NOTE(review): this strips the hard-coded 'subject=' prefix that
        # DictVectorizer emits; it only matches when target_feature_name is
        # 'subject' — confirm for other target columns.
        clean_result_labels = df['Predicted'].map(
            lambda x: x.replace('subject=', ''))

        # save predicted labels
        pred_labels = np.unique(clean_result_labels.values)
        # per-label precision/recall/F1 (average=None -> one row per label)
        precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
            df['True'].values,
            clean_result_labels.values,
            average=None,
            labels=pred_labels)
        output_results['classifier'] = []
        for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
            output_results['classifier'].append({
                'label': lbl,
                'precision': prec,
                'recall': rec,
                'fscore': fm
            })

        print(output_results['classifier'])
        print(pred_labels)
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(f1))

        # Keep the full Keras training record alongside the metrics.
        output_results['history'] = {
            'epochs': history.epoch,
            'training_log': history.history,
            'training_params': history.params
        }

        return generator, model, x_inp, x_out, history, target_encoding, output_results