Ejemplo n.º 1
0
def create_graphSAGE_model(graph, link_prediction=False):

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    # if link_prediction:
    #     edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
    #     train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    # else:
    #     train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=train_gen,
                           bias=True,
                           dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes:
        x_inp_src, x_out_src = base_model.default_model(flatten_output=False)
        x_inp_dst, x_out_dst = base_model.default_model(flatten_output=False)
        # re-pack into a list where (source, destination) inputs alternate, for link inputs:
        x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
        # same for outputs:
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.default_model(flatten_output=True)
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.default_model(flatten_output=True)

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Ejemplo n.º 3
0
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(
        G_train,
        batch_size,
        num_samples,
        name="train",
    ).flow(edge_ids_train, edge_labels_train)

    test_gen = GraphSAGELinkGenerator(
        G_test,
        batch_size,
        num_samples,
        name="test",
    ).flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Expose input and output sockets of the model, for source and destination nodes:
    x_inp_src, x_out_src = graphsage.default_model(flatten_output=False)
    x_inp_dst, x_out_dst = graphsage.default_model(flatten_output=False)
    # re-pack into a list where (source, target) inputs alternate, for link inputs:
    x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
    # same for outputs:
    x_out = [x_out_src, x_out_dst]

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_feature_method=args.edge_feature_method)(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_gen)
    init_test_metrics = model.evaluate_generator(test_gen)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_gen,
        epochs=num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_gen)
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")