def graphsage_pipeline(G, node_subjects, layer_sizes=[32, 32]):
    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(train_subjects.index, train_subjects.values, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=generator,
        bias=True,
        dropout=0.5,
    )

    model = build_model(graphsage_model, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

    history = model.fit(
        train_gen,
        epochs=200,
        validation_data=val_gen,
        verbose=0,
        shuffle=False,
        callbacks=[es_callback],
    )

    plot_results(history)
    test_metrics(generator, model, test_subjects)
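# Hypothetical usage sketch (not from the original source): graphsage_pipeline
# relies on helpers defined elsewhere in this module (training_split, build_model,
# plot_results, test_metrics). The Cora loader below is StellarGraph's bundled dataset.
from stellargraph import datasets

dataset = datasets.Cora()
G, node_subjects = dataset.load()  # a StellarGraph plus a pandas Series of labels
graphsage_pipeline(G, node_subjects, layer_sizes=[32, 32])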
def graph_node_classifier(name, train_data, layer_sizes=[32, 32], verbose=1):
    """
    Build and return a neural node classification model.
    Notes: Only mutually-exclusive class labels are supported.

    Args:
        name (string): one of:
                       - 'graphsage' for GraphSAGE model
                       (only GraphSAGE currently supported)
        train_data (NodeSequenceWrapper): a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import NodeSequenceWrapper

    # check arguments
    if not isinstance(train_data, NodeSequenceWrapper):
        raise Exception('train_data must be a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object')
    if len(layer_sizes) != 2:
        raise ValueError('layer_sizes must be of length 2')

    num_classes = U.nclasses_from_data(train_data)

    # determine multilabel
    multilabel = U.is_multilabel(train_data)
    if multilabel:
        raise ValueError('Multi-label classification not currently supported for graphs.')
    U.vprint("Is Multi-Label? %s" % multilabel, verbose=verbose)

    # set loss and activations
    loss_func = 'categorical_crossentropy'
    activation = 'softmax'

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE node classification model
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_data,
        bias=True,
        dropout=0.5,
    )
    # x_inp, x_out = graphsage_model.default_model(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = Dense(units=num_classes, activation=activation)(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer='adam', loss=loss_func, metrics=["accuracy"])
    U.vprint('done', verbose=verbose)
    return model
def graph_link_predictor(name, train_data, preproc, layer_sizes=[20, 20], verbose=1):
    """
    Build and return a neural link prediction model.

    Args:
        name (string): one of:
                       - 'graphsage' for GraphSAGE model
                       (only GraphSAGE currently supported)
        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc (LinkPreprocessor): a LinkPreprocessor instance
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import LinkSequenceWrapper

    # check arguments
    if not isinstance(train_data, LinkSequenceWrapper):
        raise Exception('train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object')
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError('number of layer_sizes must match len(preproc.sample_sizes)')

    num_classes = U.nclasses_from_data(train_data)

    # set loss and activations
    loss_func = 'categorical_crossentropy'
    activation = 'softmax'

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_data, bias=True, dropout=0.3)
    x_inp, x_out = graphsage.build()
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method='ip')(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT, loss='binary_crossentropy', metrics=["accuracy"])
    return model
def initialize(self, **hyper_params):
    # Fall back to defaults for any hyperparameter the caller did not supply.
    batch_size = hyper_params.get("batch_size", 16)
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    num_samples = hyper_params.get("num_samples", [25, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
    )

    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Every other input tensor belongs to the source node of a link
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

    return self.model.get_weights()
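# Illustrative follow-up (not from the original source): once initialize() has
# built self.embedding_model and self.node_gen, node embeddings can be extracted
# with a single predict() pass; 'learner' stands in for the owning object.
node_embeddings = learner.embedding_model.predict(learner.node_gen, verbose=1)
# Shape: (num_nodes, layer_sizes[-1]); rows are L2-normalised because the
# GraphSAGE layers were built with normalize="l2".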
def create_graphsage(train_gen):
    return GraphSAGE(
        layer_sizes=config.LAYER_SIZES,
        generator=train_gen,
        bias=True,
        dropout=config.DROPOUT,
        normalize="l2",
    )
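# Hypothetical usage sketch for create_graphsage: config.LAYER_SIZES and
# config.DROPOUT come from a project config module not shown here.
graphsage = create_graphsage(train_gen)
x_inp, x_out = graphsage.in_out_tensors()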
def initialize(self, **hyper_params):
    # Fall back to defaults for any hyperparameter the caller did not supply.
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )

    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # Return the number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
def train_clf(self, graph, L):
    '''
    Train a GraphSAGE model with the updated labeled pool L.
    Returns the newly trained model.
    '''
    train_targets = self.target_encoding.transform(
        self.df_targets.loc[L].to_dict("records"))
    train_gen = self.generator.flow(L, train_targets)

    gsage = GraphSAGE(layer_sizes=[32, 32],
                      generator=self.generator,
                      bias=True,
                      dropout=0.5)
    x_inp, x_out = gsage.build()
    predictions = layers.Dense(units=train_targets.shape[1],
                               activation="softmax")(x_out)

    # Weight classes by inverse support so rare labels are not ignored
    class_support = dict(Counter(self.df_targets.loc[L]["label"]))
    classes = sorted(self.data.class_labels)
    counts = [class_support[c] if c in class_support else 0 for c in classes]
    weights = np.sum(counts) / np.array(counts)
    weighted_loss = self.weighted_categorical_crossentropy(weights)

    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.2),
        # loss=losses.categorical_crossentropy,
        loss=weighted_loss,
        metrics=["acc"],
    )

    # if not os.path.isdir("model_logs"):
    #     os.makedirs("model_logs")
    # es_callback = EarlyStopping(
    #     monitor="acc", patience=50
    # )  # patience is the number of epochs to wait before early stopping in case of no further improvement
    # mc_callback = ModelCheckpoint(
    #     "model_logs/best_model.h5", monitor="acc", save_best_only=True, save_weights_only=True
    # )

    history = model.fit_generator(
        train_gen,
        epochs=50,
        verbose=0,
        shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
        # callbacks=[es_callback, mc_callback],
    )
    # model.load_weights("model_logs/best_model.h5")

    return model
def initialize(self, **hyper_params):
    # Fall back to defaults for any hyperparameter the caller did not supply.
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True)

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout)

    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip")(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
def run_model(self):
    graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph()
    train_targets, valid_targets, test_targets, train_labels, valid_labels, test_labels = self.get_train_valid_test(
        label_series_sampled)

    batch_size = self.hyperparams["batch_size"]
    num_samples = self.hyperparams["num_samples"]
    generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)
    train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=self.hyperparams["layer_sizes"],
        generator=generator,
        bias=self.hyperparams["bias"],
        dropout=self.hyperparams["dropout"],
    )
    x_inp, x_out = graphsage_model.in_out_tensors()
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=self.hyperparams["lr"]),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )

    valid_gen = generator.flow(valid_labels.index, valid_targets)
    history = model.fit(
        train_gen,
        epochs=self.hyperparams["n_epochs"],
        validation_data=valid_gen,
        verbose=self.hyperparams["verbose"],
        shuffle=True,
        use_multiprocessing=True,
    )
    sg.utils.plot_history(history)

    test_gen = generator.flow(test_labels.index, test_targets)
    test_metrics = model.evaluate(test_gen)
    print("\nTest Set Metrics:")
    for name, valid in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, valid))
def _dispatch_inductive_layer(layer_sizes, generator, model_name, params):
    if model_name == "attri2vec":
        embedding_layer = Attri2Vec(
            layer_sizes=layer_sizes, generator=generator, bias=False, normalize=None
        )
    elif model_name == "graphsage":
        embedding_layer = GraphSAGE(
            layer_sizes=layer_sizes,
            generator=generator,
            bias=True,
            dropout=0.0,
            normalize="l2",
        )
    else:
        # Fail fast instead of falling through with an unbound variable
        raise ValueError(f"Unknown model name {model_name}")
    return embedding_layer
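# Hypothetical usage sketch for _dispatch_inductive_layer; the generator is
# assumed to be a matching StellarGraph generator, and the sizes are illustrative.
embedding_layer = _dispatch_inductive_layer(
    layer_sizes=[128, 128],
    generator=generator,
    model_name="graphsage",
    params={},
)
x_inp, x_out = embedding_layer.in_out_tensors()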
def create_graphSAGE_model(graph, link_prediction=False):
    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph, batch_size=2, num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8], generator=train_gen, bias=True, dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes:
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        # re-pack into a list where (source, destination) inputs alternate, for link inputs:
        x_inp = [x for ab in zip(x_inp_src, x_inp_dst) for x in ab]
        # same for outputs:
        x_out = [x_out_src, x_out_dst]

        prediction = link_classification(
            output_dim=1, output_act="relu", edge_embedding_method="ip")(x_out)
        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)
        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
def _fit_deep_graph_infomax(train_graph, params, model_name):
    """Train unsupervised Deep Graph Infomax."""
    if "gcn_dgi" in model_name or "gat_dgi" in model_name:
        if "cluster" in model_name:
            generator = ClusterNodeGenerator(
                train_graph, clusters=params["clusters"], q=params["clusters_q"])
        else:
            generator = FullBatchNodeGenerator(train_graph, sparse=False)

        if "gcn_dgi" in model_name:
            embedding_layer = GCN(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"],
                generator=generator)
        elif "gat_dgi" in model_name:
            embedding_layer = GAT(
                layer_sizes=[params["embedding_dimension"]],
                activations=["relu"],
                generator=generator,
                attn_heads=8)
    elif model_name == "graphsage_dgi":
        generator = GraphSAGENodeGenerator(
            train_graph, batch_size=50, num_samples=[5])
        embedding_layer = GraphSAGE(
            layer_sizes=[params["embedding_dimension"]],
            activations=["relu"],
            generator=generator
        )
    else:
        raise ValueError(f"Unknown model name {model_name}")

    embedding_model = _execute_deep_graph_infomax(
        train_graph, embedding_layer, generator, params)

    # Here the models can be both inductive and transductive
    if model_name in ["gcn_dgi", "gat_dgi", "graphsage_dgi"]:
        return embedding_model.predict(generator.flow(train_graph.nodes()))
    else:
        return embedding_model
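# Hypothetical sketch of the _execute_deep_graph_infomax helper referenced above,
# based on StellarGraph's DeepGraphInfomax API; the function body and the
# "dgi_epochs" parameter are assumptions, not the original implementation.
import tensorflow as tf
from tensorflow.keras import Model, optimizers
from stellargraph.mapper import CorruptedGenerator
from stellargraph.layer import DeepGraphInfomax

def _execute_deep_graph_infomax(train_graph, embedding_layer, generator, params):
    # Pair each batch of real node features with a shuffled (corrupted) copy
    corrupted_generator = CorruptedGenerator(generator)
    train_flow = corrupted_generator.flow(train_graph.nodes())

    # DGI discriminator: distinguish real from corrupted feature batches
    infomax = DeepGraphInfomax(embedding_layer, corrupted_generator)
    x_in, x_out = infomax.in_out_tensors()

    dgi_model = Model(inputs=x_in, outputs=x_out)
    dgi_model.compile(
        loss=tf.nn.sigmoid_cross_entropy_with_logits,
        optimizer=optimizers.Adam(lr=1e-3),
    )
    dgi_model.fit(train_flow, epochs=params.get("dgi_epochs", 100), verbose=0)

    # Reuse the trained base layer as a standalone embedding model
    x_emb_in, x_emb_out = embedding_layer.in_out_tensors()
    return Model(inputs=x_emb_in, outputs=x_emb_out)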
layer_sizes = [40, 30]
learning_rate = 5e-2

unsupervisedSamples = UnsupervisedSampler(
    Gs, nodes=G.nodes(), length=length, number_of_walks=number_of_walks)
generator = GraphSAGELinkGenerator(Gs, batch_size, num_samples)
train_gen = generator.flow(unsupervisedSamples)

assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.0, normalize="l2")

x_inp, x_out = graphsage.build()
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=learning_rate),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
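# Assumed continuation of this snippet (not in the original): fit the unsupervised
# link model on the random-walk sample stream; epoch/worker counts are illustrative.
history = model.fit(
    train_gen,
    epochs=4,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)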
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G with given parameters.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout (0->1)
    """
    # Split links into train/test
    print("Using '{}' method to sample negative links".format(args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and the same number of
    # negative links, from G, and obtain the reduced graph G_test with the sampled
    # links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and the same number of
    # negative links, from G_test, and obtain the further reduced graph G_train with
    # the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ids_train, edge_labels_train will be used for model training
    # G_test, edge_ids_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to the GraphSAGE model.
    # We need two mappers: one for training and one for testing the model.
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)
    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(
        layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test sets:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters,
    evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout (0->1)
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=None, random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(
        train_gen, epochs=num_epochs, validation_data=val_gen, verbose=2, shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
train_ids = nodes[:5000]
test_ids = nodes[5000:]
train_labels = [graph.nodes[id]["_class"] for id in train_ids]
test_labels = [graph.nodes[id]["_class"] for id in test_ids]
all_labels = train_labels + test_labels
train_labels = np.array(train_labels).reshape(len(train_ids), 1)
test_labels = np.array(test_labels).reshape(len(test_ids), 1)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10, 10])
train_data_gen = generator.flow(train_ids, train_labels)
test_data_gen = generator.flow(test_ids, test_labels)
all_gen = generator.flow(list(nodes), all_labels)
print("Node Gen done!")

base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
x_in, x_out = base_model.build()
prediction = layers.Dense(units=2, activation="softmax")(x_out)
print("model building done")

model = Model(inputs=x_in, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

tensorboard = callbacks.TensorBoard(
    log_dir="logs", embeddings_freq=1, update_freq=1, histogram_freq=1)
history = model.fit(
    train_data_gen,
    epochs=4,
    validation_data=test_data_gen,
    verbose=True,
    shuffle=False,
    callbacks=[tensorboard],
)
print(history)

print("prediction done")
y_pred = model.predict(train_data_gen, verbose=1)
labels = np.argmax(y_pred, axis=1)
# The graph, together with the unsupervised sampler, will be used to generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)

unsupervised_samples = UnsupervisedSampler(
    Gtrain, nodes=actual_nodes_train, length=length_of_walks, number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=0.0, normalize="l2")

x_inp, x_out = graphsage.build(flatten_output=False)
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method='ip')(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# Train the model
history = model.fit_generator(
# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3
)

x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)
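# Illustrative continuation (not in the original snippet): the `epochs`
# hyperparameter defined above is presumably consumed by a fit() call like this.
history = model.fit(train_flow, epochs=epochs, verbose=2)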
def train(G_list, nodes_subjects_list, run_num=1, start_month_id=220, end_month_id=264):
    # Pre-define some lists to record results; the outer-loop lists hold the inner-loop lists
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []

    # Number of runs to perform
    run_num = run_num
    # Months to train on (end_month_id has an upper bound)
    start_month_id = start_month_id
    end_month_id = end_month_id

    # Create folders for saving models, histories, figures, and test results
    if not os.path.exists('model'):
        os.makedirs('model')
    if not os.path.exists('history'):
        os.makedirs('history')
    if not os.path.exists('figure'):
        os.makedirs('figure')
    if not os.path.exists('figure_distribution'):
        os.makedirs('figure_distribution')
    if not os.path.exists('test_result'):
        os.makedirs('test_result')

    # The outer loop counts runs; training multiple times reduces variance.
    # The inner loop iterates over the training months.
    for j in range(run_num):
        num_samples = [40]

        # Pre-define lists to record the inner loop's data
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []

        # i == 0 corresponds to month 220
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()

            # Train on one month, validate on the next, test on the one after
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2

            # Build the training generator from the train_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(
                list(nodes_subjects_list[train_idx].index),
                nodes_subjects_list[train_idx].values,
                shuffle=False)

            # Build the GraphSAGE model
            graphsage_model = GraphSAGE(layer_sizes=[1],
                                        generator=generator,
                                        bias=True,
                                        aggregator=sg.layer.MeanAggregator,
                                        normalize=None)

            # Extract the input/output tensors and build the model with Keras
            x_inp, x_out = graphsage_model.in_out_tensors()
            # prediction = layers.Dense(units=1)(x_out)

            # Build the validation generator from the val_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(list(nodes_subjects_list[val_idx].index),
                                     nodes_subjects_list[val_idx].values)

            # Build the test generator from the test_idx month's data
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)

            # Build the model from the input/output tensors
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(monitor='val_loss',
                                    min_delta=1e-3,
                                    patience=10,
                                    verbose=2,
                                    mode='auto',
                                    restore_best_weights=True)
            model.compile(optimizer=optimizers.Adam(lr=0.05),
                          loss=losses.mean_squared_error,
                          metrics=[pearson_r])
            history = model.fit(train_gen,
                                epochs=500,
                                validation_data=val_gen,
                                verbose=0,
                                shuffle=False,
                                callbacks=[monitor])

            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            json.dump(
                test_result_dict,
                open(
                    'test_result/' + str(train_idx + 220) + "_" +
                    str(test_idx + 220) + '.json', 'w'))

            test_preds = model.predict(test_gen)

            end = time.time()

            # Record results
            graph_history_list.append(history)  # training history
            model_list.append(model)  # the trained model
            train_gen_list.append(train_gen)  # keep train_gen for computing intermediate-layer outputs later
            time_list.append(end - start)  # elapsed time
            model_weight_list.append(model.weights)  # model weights
            test_result.append(test_metrics[1])

            # # Save the model to disk
            # model.save('model/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.h5')

            # # Save the training history
            # json.dump(history.history,
            #           open('history/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.json', 'w'))

            # # Save the training-history figure
            # sg.utils.plot_history(history)
            # plt.title(str(train_idx + 220) + '->' + str(val_idx + 220))
            # plt.savefig('figure/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.png')
            # plt.show()

            # Save the distribution of test predictions
            plt.figure(figsize=(5, 10))
            plt.subplot(211)
            plt.hist(test_preds, bins=500)
            plt.title("Distribution of Prediction of " + str(test_idx + 220))
            plt.subplot(212)
            plt.hist(nodes_subjects_list[test_idx].values, bins=500)
            plt.title("Distribution of Origin of " + str(test_idx + 220))
            plt.xlabel("ic=" + str(test_metrics[1]))
            plt.savefig('figure_distribution/distribution-' +
                        str(train_idx + 220) + "_" + str(test_idx + 220) + '.png',
                        dpi=300)
            plt.show()

            print(str(i + 220) + "'s " + str(j + 1) + " run has finished")
            print()

        # Store the inner loop's data
        graph_history_list_list.append(graph_history_list)
        model_list_list.append(model_list)
        train_gen_list_list.append(train_gen_list)
        time_list_list.append(time_list)
        model_weight_list_list.append(model_weight_list)

    return graph_history_list_list, model_list_list, train_gen_list_list, time_list_list, model_weight_list_list, test_result
def train_model(Gnx, train_data, test_data, all_features):
    output_results = {}
    from collections import Counter

    # Save the size of the dataset and the count of each subject in the splits
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    # node_features = train_data[feature_names]
    # print(feature_names)
    G = sg.StellarGraph(Gnx, node_features=all_features)

    # Save graph info
    print(G.info())
    print("writing graph.dot")
    # write_dot(Gnx, "graph.dot")
    output_results['graph_info'] = G.info()

    print("building the graph generator...")
    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    # generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    class_weights = class_weight.compute_class_weight(
        'balanced',
        np.unique(train_data["subject"].to_list()),
        train_data["subject"].to_list())
    # Keras expects class weights as a {class_index: weight} dict
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(
        test_data[["subject"]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        # graphsage_model = HinSAGE(
        # layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    # x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # Save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  # , "program": test_data['program']})
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))

    # Save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    # pred_program = np.unique(df['program'].values)

    # Save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })

    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))
    return generator, model, x_inp, x_out, history, target_encoding, output_results
# on concatenated `(paper1, paper2)` node embeddings.
#
# GraphSAGE part of the model, with hidden layer sizes of 20 for both GraphSAGE
# layers, a bias term, and no dropout. (Dropout can be switched on by specifying
# a positive dropout rate, 0 < dropout < 1.)
#
# Note that the length of the layer_sizes list must be equal to the length of
# num_samples, as len(num_samples) defines the number of hops (layers) in the
# GraphSAGE model.

# In[17]:

layer_sizes = [20, 20]
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.5
)

# In[18]:

# Build the model and expose the input and output tensors.
x_inp, x_out = graphsage.build()

# Final link classification layer that takes a pair of node embeddings produced by
# graphsage, applies a binary operator to them to produce the corresponding link
# embedding ('ip' for inner product; other options for the binary operator can be
# seen by running a cell with `?link_classification` in it), and passes it through
# a dense layer:

# In[19]:
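# A plausible body for the cell above, per the preceding comment (a sketch; the
# original cell's contents are not shown). 'ip' combines the two node embeddings
# via inner product into a link embedding.
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)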
# Create a data generator for our graph, specified by which type
# of model (GraphSAGE) and the learning task (node classification)...
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
print(train_subjects.index)

# Create an iterator for our training data; this takes the indices of the
# nodes in the graph to be used for training, as well as their respective
# one-hot encoded label vectors
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)

# Specify the graph-learning model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
    aggregator=MeanAggregator,
)

# Extract the input and output tensors of the model. Set the model's predictions
# to be a softmax layer taking the output tensor as its input.
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

# To validate/test we need another generator for the test data; no shuffle needed
test_gen = generator.flow(test_subjects.index, test_targets)

history = model.fit(
test_gen = generator.flow(test_subjects.index, test_targets)

# aggregator = MaxPoolingAggregator()
# layer_sizes (list): Hidden feature dimensions for each layer.
# activations (list): Activations applied to each layer's output.

def get_dropout(input_tensor, p=0.1, mc=False):
    # With mc=True, the Dropout layer stays active at inference time
    # (Monte Carlo dropout); otherwise it behaves as standard dropout.
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)

graphsage_model = GraphSAGE(
    layer_sizes=[64, 32, 16],
    generator=generator,
    activations=["relu", "relu", "linear"],
    bias=True,
    aggregator=MaxPoolingAggregator,
    dropout=0.1,
)

x_inp, x_out = graphsage_model.in_out_tensors()
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = get_dropout(x_out, p=0.1, mc=True)  # enable Monte Carlo dropout
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.summary()

model.compile(
    optimizer=optimizers.Adam(),
    loss=noderankloss(),
    metrics=["acc"],
)
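# Illustrative Monte Carlo dropout usage for the model above (not from the
# original source): since get_dropout builds the layer with training=True,
# each predict() pass is stochastic; T=20 forward passes is an arbitrary choice.
import numpy as np

T = 20
mc_preds = np.stack([model.predict(test_gen) for _ in range(T)], axis=0)
mean_pred = mc_preds.mean(axis=0)  # averaged class probabilities
pred_std = mc_preds.std(axis=0)    # spread serves as a rough uncertainty estimate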
def _train_model(self, gnx, train_data, test_data, all_features, target_feature_name):
    subject_groups_train = Counter(train_data[target_feature_name])
    subject_groups_test = Counter(test_data[target_feature_name])

    graph = sg.StellarGraph(gnx, node_features=all_features)

    output_results = {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'subject_groups_train': subject_groups_train,
        'subject_groups_test': subject_groups_test,
        'graph_info': graph.info()
    }

    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[[target_feature_name]].to_dict('records'))

    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data[target_feature_name].to_list()),
        y=train_data[target_feature_name].to_list())
    class_weights = dict(enumerate(class_weights))

    test_targets = target_encoding.transform(
        test_data[[target_feature_name]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graph_sage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,  # train_gen,
        bias=True,
        dropout=0.5,
    )
    print('building model...')

    x_inp, x_out = graph_sage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print('compiling model...')
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=['acc', metrics.categorical_accuracy],
    )

    print('testing the model...')
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit(
        train_gen,
        epochs=self.num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # Save test metrics
    test_metrics = model.evaluate(test_gen)
    print('Test Set Metrics:')
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)

    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        'Predicted': results,
        'True': test_data[target_feature_name]
    })
    clean_result_labels = df['Predicted'].map(
        lambda x: x.replace('subject=', ''))

    # Save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })

    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    output_results['history'] = {
        'epochs': history.epoch,
        'training_log': history.history,
        'training_params': history.params
    }

    return generator, model, x_inp, x_out, history, target_encoding, output_results