def preprocess_train(self, node_ids):
    """
    Preprocess the training set and return a node-sequence generator.

    Args:
        node_ids (list): node IDs to train on; all must exist in ``self.df``.

    Returns:
        NodeSequenceWrapper: wrapped StellarGraph node generator yielding
        (features, one-hot targets) batches for training.

    Raises:
        ValueError: if any of node_ids is missing from self.df.
        Exception: if stellargraph is not installed or is older than 0.8.
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')

    # subset df for training nodes
    df_tr = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target; keep the fitted encoder so validation/test
    # targets can be transformed consistently later
    self.y_encoding = sklearn.feature_extraction.DictVectorizer(sparse=False)
    train_targets = self.y_encoding.fit_transform(df_tr[["target"]].to_dict('records'))

    # import stellargraph (optional dependency)
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:  # was a bare except: only trap a missing package
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build the StellarGraph once and cache it for later (validation) use
    G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    self.G_sg = G_sg
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS,
                                       [self.sampsize, self.sampsize])
    train_gen = generator.flow(df_tr.index, train_targets, shuffle=True)

    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(train_gen)
def graphsage_pipeline(G, node_subjects, layer_sizes=None):
    """
    Train and evaluate a GraphSAGE node classifier on graph G.

    Args:
        G: StellarGraph to train on.
        node_subjects: per-node target labels (pandas Series).
        layer_sizes (list, optional): hidden sizes per GraphSAGE layer;
            defaults to [32, 32].
    """
    # BUG FIX: was a mutable default argument (layer_sizes=[32, 32]);
    # the default value itself is unchanged.
    if layer_sizes is None:
        layer_sizes = [32, 32]

    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    batch_size = 50
    num_samples = [10, 5]  # neighbours sampled at hop 1 and hop 2
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(train_subjects.index, train_subjects.values, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    model = build_model(graphsage_model, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    # stop early on stagnating validation accuracy, keeping the best weights
    es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

    history = model.fit(train_gen,
                        epochs=200,
                        validation_data=val_gen,
                        verbose=0,
                        shuffle=False,
                        callbacks=[es_callback])

    plot_results(history)
    test_metrics(generator, model, test_subjects)
def preprocess_valid(self, node_ids):
    """
    Preprocess validation nodes (transductive inference).

    Args:
        node_ids (list): list of node IDs that generator will yield;
            all must exist in ``self.df``.

    Returns:
        NodeSequenceWrapper: generator over the validation nodes.

    Raises:
        ValueError: if any of node_ids is missing from self.df.
        Exception: if preprocess_train was not called first, or if
            stellargraph is not installed or is older than 0.8.
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')
    if self.y_encoding is None:
        raise Exception('Unset parameters. Are you sure you called preprocess_train first?')

    # subset df for validation nodes
    df_val = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target with the encoder fitted in preprocess_train
    val_targets = self.y_encoding.transform(df_val[["target"]].to_dict('records'))

    # import stellargraph (optional dependency)
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:  # was a bare except: only trap a missing package
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # reuse the cached StellarGraph when available
    if self.G_sg is None:
        self.G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    generator = GraphSAGENodeGenerator(self.G_sg, U.DEFAULT_BS,
                                       [self.sampsize, self.sampsize])
    val_gen = generator.flow(df_val.index, val_targets, shuffle=False)

    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(val_gen)
def test(edgelist, node_data, model_file, batch_size, target_name="subject"): """ Load the serialized model and evaluate on all nodes in the graph. Args: G: NetworkX graph file target_converter: Class to give numeric representations of node targets feature_converter: CLass to give numeric representations of the node features model_file: Location of Keras model to load batch_size: Size of batch for inference """ # Extract the feature data. These are the feature vectors that the Keras model will use as input. # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication. node_features = node_data[feature_names] # Create graph from edgelist and set node features and node type Gnx = nx.from_pandas_edgelist(edgelist) # We must also save the target encoding to convert model predictions encoder_file = model_file.replace( "cora_example_model", "cora_example_encoding" ).replace(".h5", ".pkl") with open(encoder_file, "rb") as f: target_encoding = pickle.load(f)[0] # Endode targets with pre-trained encoder node_targets = target_encoding.transform( node_data[[target_name]].to_dict("records") ) node_ids = node_data.index # Convert to StellarGraph and prepare for ML G = sg.StellarGraph(Gnx, node_features=node_features) # Load Keras model model = keras.models.load_model( model_file, custom_objects={"MeanAggregator": MeanAggregator} ) print("Loaded model:") model.summary() # Get required samples from model # TODO: Can we move this to the library? 
num_samples = [ int(model.input_shape[ii + 1][1] / model.input_shape[ii][1]) for ii in range(len(model.input_shape) - 1) ] # Create mappers for GraphSAGE that input data from the graph to the model generator = GraphSAGENodeGenerator( G, batch_size, num_samples, seed=42 ) all_gen = generator.flow(node_ids, node_targets) # Evaluate and print metrics all_metrics = model.evaluate_generator(all_gen) print("\nAll-node Evaluation:") for name, val in zip(model.metrics_names, all_metrics): print("\t{}: {:0.4f}".format(name, val))
def preprocess_test(self, df_te, G_te):
    """
    Preprocess new nodes for inductive inference.

    Args:
        df_te (DataFrame): pandas dataframe containing new node attributes
        G_te (Graph): a networkx Graph containing new nodes; must contain
            all nodes of ``self.G``.

    Returns:
        NodeSequenceWrapper: generator over the new (test) nodes.

    Raises:
        ImportError: if networkx is not installed.
        ValueError: if self.G is not a subgraph of G_te.
        Exception: if preprocess_train was not called first, or if
            stellargraph is missing/older than 0.8.
    """
    try:
        import networkx as nx
    except ImportError:
        raise ImportError("Please install networkx: pip install networkx")

    if self.y_encoding is None:
        raise Exception(
            "Unset parameters. Are you sure you called preprocess_train first?"
        )

    # aggregated node attributes: new nodes first so their rows take priority
    df_agg = pd.concat([df_te, self.df])

    # aggregated graph: the training graph must be contained in the new graph
    is_subset = set(self.G.nodes()) <= set(G_te.nodes())
    if not is_subset:
        raise ValueError("Nodes in self.G must be subset of G_te")
    G_agg = nx.compose(self.G, G_te)

    # one-hot-encode target when present; otherwise use dummy -1 labels so
    # the generator can still be constructed for unlabeled nodes
    if "target" in df_te.columns:
        test_targets = self.y_encoding.transform(
            df_te[["target"]].to_dict("records"))
    else:
        # BUG FIX: was len(df_te.shape[0]) — len() of an int raises TypeError
        test_targets = [-1] * df_te.shape[0]

    # import stellargraph (optional dependency)
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:  # was a bare except: only trap a missing package
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # build generator over the aggregated graph, flowing only the new nodes
    G_sg = sg.StellarGraph(G_agg, node_features=df_agg[self.feature_names])
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS,
                                       [self.sampsize, self.sampsize])
    test_gen = generator.flow(df_te.index, test_targets, shuffle=False)

    from .sg_wrappers import NodeSequenceWrapper
    return NodeSequenceWrapper(test_gen)
def test_graphsage_constructor():
    """GraphSAGE constructor: happy path, invalid arguments, generator-based setup."""
    model = GraphSAGE(
        layer_sizes=[4], n_samples=[2], input_dim=2, normalize="l2", multiplicity=1
    )
    assert model.dims == [2, 4]
    assert model.n_samples == [2]
    assert model.max_hops == 1
    assert model.bias
    assert len(model._aggs) == 1

    # Both a callable and an unrecognised normalization flag are rejected.
    for bad_normalize in (lambda x: x, "unknown"):
        with pytest.raises(ValueError):
            GraphSAGE(
                layer_sizes=[4],
                n_samples=[2],
                input_dim=2,
                normalize=bad_normalize,
                multiplicity=1,
            )

    # Either a generator or n_samples must be supplied.
    with pytest.raises(KeyError):
        GraphSAGE(layer_sizes=[4])

    # Construction from a node generator.
    graph = example_graph(feature_size=3)
    node_gen = GraphSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
    model = GraphSAGE(layer_sizes=[4, 8], generator=node_gen, bias=True)

    # A flowed Sequence is no longer accepted as a generator.
    with pytest.raises(TypeError):
        GraphSAGE(layer_sizes=[4, 8], generator=node_gen.flow([1, 2]), bias=True)

    assert model.dims == [3, 4, 8]
    assert model.n_samples == [2, 2]
    assert model.max_hops == 2
    assert model.bias
    assert len(model._aggs) == 2
def run_model(self):
    """Train a GraphSAGE node classifier end-to-end and print test-set metrics."""
    hp = self.hyperparams

    graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph()
    (train_targets, valid_targets, test_targets,
     train_labels, valid_labels, test_labels) = self.get_train_valid_test(
        label_series_sampled)

    # Node generator shared by train/validation/test flows.
    generator = GraphSAGENodeGenerator(
        graph_sampled, hp["batch_size"], hp["num_samples"])
    train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=hp["layer_sizes"],
        generator=generator,
        bias=hp["bias"],
        dropout=hp["dropout"],
    )
    x_inp, x_out = graphsage_model.in_out_tensors()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=hp["lr"]),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )

    valid_gen = generator.flow(valid_labels.index, valid_targets)
    history = model.fit(
        train_gen,
        epochs=hp["n_epochs"],
        validation_data=valid_gen,
        verbose=hp["verbose"],
        shuffle=True,
        use_multiprocessing=True,
    )
    sg.utils.plot_history(history)

    # Final evaluation on the held-out test nodes.
    test_gen = generator.flow(test_labels.index, test_targets)
    test_metrics = model.evaluate(test_gen)
    print("\nTest Set Metrics:")
    for metric_name, metric_value in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(metric_name, metric_value))
def _dispatch_generator(graph, model_name, params, generator_type="node"): """Create a graph generator.""" if model_name == "watchyourstep": return AdjacencyPowerGenerator( graph, num_powers=params["num_powers"]) elif model_name in ["complex", "distmult"]: return KGTripleGenerator(graph, params["batch_size"]) elif model_name == "attri2vec": if generator_type == "node": return Attri2VecNodeGenerator( graph, params["batch_size"]) else: return Attri2VecLinkGenerator( graph, params["batch_size"]) elif model_name in ["graphsage", "graphsage_dgi"]: if generator_type == "node": return GraphSAGENodeGenerator( graph, params["batch_size"], params["num_samples"]) else: return GraphSAGELinkGenerator( graph, params["batch_size"], params["num_samples"]) elif model_name in ["gcn_dgi", "gat_dgi"]: return FullBatchNodeGenerator(graph, sparse=False) elif model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]: return ClusterNodeGenerator( graph, clusters=params["clusters"], q=params["clusters_q"]) else: raise ValueError(f"Unknown model name '{model_name}'")
def initialize(self, **hyper_params):
    """
    Build the unsupervised GraphSAGE link-prediction model and its
    node-embedding sub-model from ``self.nodes_df`` / ``self.edges_df``.

    Keyword Args (all optional):
        batch_size (int): default 16
        num_samples (list): default [25, 10] neighbours sampled per hop
        layer_sizes (list): default [256, 256] hidden sizes per layer
        bias (bool): default True
        dropout (float): default 0.0
        lr (float): default 1e-3
        num_walks (int): default 1 walks per node for the sampler
        length (int): default 5 walk length

    Returns:
        list: initial weights of the compiled Keras model.
    """
    # BUG FIX: the original only assigned these names when the key was
    # ABSENT (so any supplied hyper-parameter caused a NameError later),
    # and the "layer_sizes"/"num_samples" key checks were swapped.
    # dict.get handles both cases with the same defaults.
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())
    # free the source frames; the StellarGraph now owns the data
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_gen,
        bias=bias,
        dropout=dropout,
        normalize="l2",
    )
    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # even-indexed inputs / first output correspond to the source nodes;
    # reuse them as a standalone node-embedding model
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(
        self.graph, batch_size, num_samples).flow(self.nodes)

    return self.model.get_weights()
def test_gat_build_constructor_wrong_generator(self):
    """GAT must reject a GraphSAGE node generator with a TypeError."""
    graph = example_graph(feature_size=self.F_in)
    wrong_gen = GraphSAGENodeGenerator(graph, self.N, [5, 10])

    # passing a generator of the wrong type for GAT raises:
    with pytest.raises(TypeError):
        GAT(
            layer_sizes=self.layer_sizes,
            activations=self.activations,
            attn_heads=self.attn_heads,
            bias=True,
            generator=wrong_gen,
        )
def create_graphSAGE_model(graph, link_prediction=False):
    """
    Build a two-layer GraphSAGE base model with a Keras prediction head.

    With link_prediction=True a link-classification head over three fixed
    edges is built; otherwise a 2-class node-classification head over
    nodes 1 and 2.

    Returns:
        (base_model, keras_model, generator, train_gen)
    """
    if link_prediction:
        # We are going to train on the original graph's edges.
        generator = GraphSAGELinkGenerator(graph, batch_size=2, num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(
        layer_sizes=[8, 8], generator=train_gen, bias=True, dropout=0.5
    )

    if link_prediction:
        # Expose input/output sockets of graphsage for source and destination nodes:
        x_inp_src, x_out_src = base_model.node_model()
        x_inp_dst, x_out_dst = base_model.node_model()
        # interleave (source, destination) inputs for the link model,
        # and pair the two outputs:
        x_inp = [tensor for pair in zip(x_inp_src, x_inp_dst) for tensor in pair]
        x_out = [x_out_src, x_out_dst]
        prediction = link_classification(
            output_dim=1, output_act="relu", edge_embedding_method="ip"
        )(x_out)
    else:
        x_inp, x_out = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)
    return base_model, keras_model, generator, train_gen
def _fit_deep_graph_infomax(train_graph, params, model_name): """Train unsupervised Deep Graph Infomax.""" if "gcn_dgi" in model_name or "gat_dgi" in model_name: if "cluster" in model_name: generator = ClusterNodeGenerator( train_graph, clusters=params["clusters"], q=params["clusters_q"]) else: generator = FullBatchNodeGenerator(train_graph, sparse=False) if "gcn_dgi" in model_name: embedding_layer = GCN( layer_sizes=[params["embedding_dimension"]], activations=["relu"], generator=generator) elif "gat_dgi" in model_name: embedding_layer = GAT( layer_sizes=[params["embedding_dimension"]], activations=["relu"], generator=generator, attn_heads=8) elif model_name == "graphsage_dgi": generator = GraphSAGENodeGenerator( train_graph, batch_size=50, num_samples=[5]) embedding_layer = GraphSAGE( layer_sizes=[params["embedding_dimension"]], activations=["relu"], generator=generator ) else: raise ValueError(f"Unknown mode name {model_name}") embedding_model = _execute_deep_graph_infomax( train_graph, embedding_layer, generator, params) # Here the models can be both inductive and transductive if model_name in ["gcn_dgi", "gat_dgi", "graphsage_dgi"]: return embedding_model.predict( generator.flow(train_graph.nodes())) else: return embedding_model
# NOTE(review): this chunk begins mid-statement — the opening
# `... = model_selection.train_test_split(` of this call lies above this view.
labels_sampled,
    train_size=0.05,
    test_size=None,
    stratify=labels_sampled,
    random_state=42,
)

# Turn labels into one-hot encodings
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_labels)
val_targets = target_encoding.transform(val_labels)

# Create a node generator for undirected graph
batch_size = 50
num_samples = [10, 10]  # neighbours sampled at hop 1 and hop 2
generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)

# create iterator for training data
train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

# Make graphsage model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
)
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
from tensorflow.keras.models import load_model
import pickle
import scipy.io as io

## ########################################### build graph ################################################
#%% ############################################################################################################
# Convert the networkx graph `g` (built above this chunk) into a StellarGraph,
# taking per-node feature vectors from the "feature" node attribute.
G = StellarGraph.from_networkx(g, node_features="feature")
print(G.info())

#%% #################################### Graphsage Model loading ###########################################
#%% ############################################################################################################
batch_size = 70
num_samples = [15, 10, 5, 5]  # neighbours sampled at each of the 4 hops
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
# targets are the 'btw' (betweenness) column of targetdf — TODO confirm
targets = np.array(targetdf['btw'])
test_gen = generator.flow(targetdf.index, targets)
indices = bf.expandy(batch_size, 2)


def noderankloss(index):
    # Pairwise ranking loss over index pairs; body continues past this chunk.
    def loss(y_true, y_pred):
        # tf.print(tf.gather(y_true, tf.constant(index[:, 0])))
        yt = tf.math.sigmoid(
            tf.gather(y_true, tf.constant(index[:, 0])) -
            tf.gather(y_true, tf.constant(index[:, 1])))
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it,
    and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
        target_name: Name of the target column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    # NOTE(review): `feature_names` is a module-level global — confirm it is defined.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=None, random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training (rebinds `model` from the GraphSAGE object)
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    # DictVectorizer columns are named "subject=<label>", hence the prefix
    # when comparing against the ground-truth subject.
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"],
            node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model; the filename encodes samples/layers/dropout/lr
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
def train(G_list, nodes_subjects_list, run_num=1, start_month_id=220, end_month_id=264):
    """
    Train one GraphSAGE regression model per month window.

    For each month index i: train on month i, validate on month i+1 and
    test on month i+2, repeated run_num times to reduce variance.

    Args:
        G_list: list of monthly graphs; index 0 corresponds to month id 220.
        nodes_subjects_list: per-month target Series aligned with G_list.
        run_num: number of repetitions of the whole monthly loop.
        start_month_id, end_month_id: month-id range to train over.

    Returns:
        (histories, models, train generators, elapsed times, model weights)
        each as a list of per-run lists, plus the final run's test metrics.
    """
    # Pre-define outer lists; each stores one inner (per-month) list per run.
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []
    # Number of repetitions.
    run_num = run_num
    # Months to train on (end_month_id is limited by the available data).
    start_month_id = start_month_id
    end_month_id = end_month_id
    # Create folder for saved models
    if not os.path.exists('model'):
        os.makedirs('model')
    # Create folder for training histories
    if not os.path.exists('history'):
        os.makedirs('history')
    # Create folder for figures
    if not os.path.exists('figure'):
        os.makedirs('figure')
    # Create folder for prediction-distribution figures
    if not os.path.exists('figure_distribution'):
        os.makedirs('figure_distribution')
    # Create folder for test results
    if not os.path.exists('test_result'):
        os.makedirs('test_result')
    # Outer loop: repeated runs (multiple runs reduce variance).
    # Inner loop: the training months.
    for j in range(run_num):
        num_samples = [40]
        # Per-run record lists.
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []
        # i == 0 corresponds to month id 220.
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()
            # Train on one month, validate on the next, test on the one after.
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2
            # Build the training generator from month train_idx
            # (batch size == node count, i.e. full batch).
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(list(
                nodes_subjects_list[train_idx].index),
                nodes_subjects_list[train_idx].values,
                shuffle=False)
            # Build the GraphSAGE model (single size-1 layer, no normalization).
            graphsage_model = GraphSAGE(layer_sizes=[1],
                                        generator=generator,
                                        bias=True,
                                        aggregator=sg.layer.MeanAggregator,
                                        normalize=None)
            # Extract the in/out tensors to build a Keras model from.
            x_inp, x_out = graphsage_model.in_out_tensors()
            # prediction = layers.Dense(units=1)(x_out)
            # Build the validation generator from month val_idx.
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(list(nodes_subjects_list[val_idx].index),
                                     nodes_subjects_list[val_idx].values)
            # Build the test generator from month test_idx.
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)
            # Assemble the model from the in/out tensors.
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(monitor='val_loss',
                                    min_delta=1e-3,
                                    patience=10,
                                    verbose=2,
                                    mode='auto',
                                    restore_best_weights=True)
            model.compile(optimizer=optimizers.Adam(lr=0.05),
                          loss=losses.mean_squared_error,
                          metrics=[pearson_r])
            history = model.fit(train_gen,
                                epochs=500,
                                validation_data=val_gen,
                                verbose=0,
                                shuffle=False,
                                callbacks=[monitor])
            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            json.dump(
                test_result_dict,
                open(
                    'test_result/' + str(train_idx + 220) + "_" +
                    str(test_idx + 220) + '.json', 'w'))
            test_preds = model.predict(test_gen)
            end = time.time()
            # Record this month's results.
            graph_history_list.append(history)  # training history
            model_list.append(model)  # the trained model
            train_gen_list.append(train_gen)  # kept for later intermediate-layer analysis
            time_list.append(end - start)  # elapsed seconds
            model_weight_list.append(model.weights)  # model parameters
            test_result.append(test_metrics[1])
            # # Save the model
            # model.save('model/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.h5')
            # # Save the training history
            # json.dump(history.history,
            #           open('history/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.json', 'w'))
            # # Save the training-history figure
            # sg.utils.plot_history(history)
            # plt.title(str(train_idx + 220) + '->' + str(val_idx + 220))
            # plt.savefig('figure/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.png')
            # plt.show()
            # Save the distribution of the test predictions vs. the originals.
            plt.figure(figsize=(5, 10))
            plt.subplot(211)
            plt.hist(test_preds, bins=500)
            plt.title("Distribution of Prediction of " + str(test_idx + 220))
            plt.subplot(212)
            plt.hist(nodes_subjects_list[test_idx].values, bins=500)
            plt.title("Distribution of Origin of " + str(test_idx + 220))
            plt.xlabel("ic=" + str(test_metrics[1]))
            plt.savefig('figure_distribution/distribution-' +
                        str(train_idx + 220) + "_" + str(test_idx + 220) +
                        '.png',
                        dpi=300)
            plt.show()
            print(str(i + 220) + "'s " + str(j + 1) + " run has finished")
            print()
        # Append this run's records to the outer lists.
        graph_history_list_list.append(graph_history_list)
        model_list_list.append(model_list)
        train_gen_list_list.append(train_gen_list)
        time_list_list.append(time_list)
        model_weight_list_list.append(model_weight_list)
    return graph_history_list_list, model_list_list, train_gen_list_list, time_list_list, model_weight_list_list, test_result
# NOTE(review): assumes `data` (node-link JSON dict) and the model/keras
# imports are defined above this chunk.
graph = nx.node_link_graph(data)
G = StellarGraph.from_networkx(graph, node_features="feature")
print(G.node_types)
G.check_graph_for_ml()

# Shuffle node ids; first 5000 are train, the rest are test.
nodes = [node for node in graph.nodes]
shuffle(nodes)
train_ids = nodes[:5000]
test_ids = nodes[5000:]
train_labels = [graph.nodes[id]["_class"] for id in train_ids]
test_labels = [graph.nodes[id]["_class"] for id in test_ids]
all_labels = train_labels + test_labels
train_labels = np.array(train_labels).reshape(len(train_ids), 1)
test_labels = np.array(test_labels).reshape(len(test_ids), 1)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10, 10])
train_data_gen = generator.flow(train_ids, train_labels)
test_data_gen = generator.flow(test_ids, test_labels)
all_gen = generator.flow(list(nodes), all_labels)
print("Node Gen done!")

base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
x_in, x_out = base_model.build()
prediction = layers.Dense(units=2, activation="softmax")(x_out)
print("model building done")

model = Model(inputs=x_in, outputs=prediction)
model.compile(optimizer=optimizers.Adam(lr=0.005),
              loss=losses.categorical_crossentropy,
              metrics=["acc"])
tensorboard = callbacks.TensorBoard(log_dir="logs", embeddings_freq=1,
                                    update_freq=1, histogram_freq=1)
# NOTE(review): this call continues past the end of this chunk.
tboard = model.fit(train_data_gen, epochs=4, validation_data=test_data_gen, verbose=True,
# NOTE(review): this chunk begins mid-call — these are the trailing kwargs of
# a model.fit(...) started above this view.
verbose=verbose,
    use_multiprocessing=False,
    workers=nworkers,
    shuffle=True,
)

## Get embeddings for all nodes
# Build a new node-based model: even-indexed inputs / first output belong to
# the "source" half of the link model.
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

# The node generator feeds graph nodes to `embedding_model`. We want to evaluate node embeddings for all nodes in the graph:
node_ids = sorted(G.nodes)
node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_ids)
emb = embedding_model.predict_generator(node_gen, workers=nworkers, verbose=verbose)
node_embeddings = emb[:, 0, :]  # one embedding row per node

if testtype == 'nodes':
    ## Node classification
    X = node_embeddings
    y = np.where(dataset['labels'])[1]  # column index of the one-hot label

    # Train a Logistic Regression classifier on the training data.
    X_train, X_test, y_train, y_test = X[nodes_train, :], X[
        nodes_test, :], y[nodes_train], y[nodes_test]
    # NOTE(review): this call continues past the end of this chunk.
    clf = LogisticRegression(verbose=verbose, solver='liblinear',
# node_data["feature"] = [g.degree(node_id), nx.average_neighbor_degree(g, nodes=[node_id])[node_id], 1, 1, 1] node_data["feature"] = [g.degree(node_id), 1, 1, 1] ## ############################################################################################################ G = StellarGraph.from_networkx(g, node_features="feature") print(G.info()) test_targets = np.array(targetdf) ## #################################### Graphsage Model building ########################################### #%% ############################################################################################################ batch_size = 20 num_samples = [15, 10, 5, 5] generator = GraphSAGENodeGenerator(G, batch_size, num_samples) def noderankloss(index): def loss(y_true, y_pred): # tf.print(tf.gather(y_true, tf.constant(index[:, 0]))) yt = tf.math.sigmoid( tf.gather(y_true, tf.constant(index[:, 0])) - tf.gather(y_true, tf.constant(index[:, 1]))) yp = tf.math.sigmoid( tf.gather(y_pred, tf.constant(index[:, 0])) - tf.gather(y_pred, tf.constant(index[:, 1]))) # tf.print(tf.shape(yt)) onetensor = tf.ones(shape=tf.shape(yt)) # tempmatrix = (-1)*K.dot(yt, tf.math.log(tf.transpose(yp))) - K.dot((onetensor - yt),
def train_model(Gnx, train_data, test_data, all_features):
    """
    Train a GraphSAGE classifier on graph Gnx and evaluate it on test_data.

    Args:
        Gnx: networkx graph.
        train_data / test_data: DataFrames indexed by node id with a
            'subject' target column.
        all_features: node feature data passed to StellarGraph.

    Returns:
        (generator, model, x_inp, x_out, history, target_encoding, output_results)
    """
    output_results = {}
    from collections import Counter
    #TODO: save size of dataset, train_data, and test data
    #save the count of each subject in the blocks
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    #node_features = train_data[feature_names]
    #print (feature_names)
    G = sg.StellarGraph(Gnx, node_features=all_features)
    #TODO: save graph info
    print(G.info())
    print("writing graph.dot")
    #write_dot(Gnx,"graph.dot")
    output_results['graph_info'] = G.info()

    print("building the graph generator...")
    batch_size = 50
    num_samples = [10, 5]  # neighbours sampled at hop 1 and hop 2
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    #generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    # One-hot encode the 'subject' target for both splits.
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    # Balance classes via per-class weights computed from the training labels.
    class_weights = class_weight.compute_class_weight(
        'balanced', np.unique(train_data["subject"].to_list()),
        train_data["subject"].to_list())
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(test_data[["subject"
                                                        ]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        #graphsage_model = HinSAGE(
        #layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,  #train_gen,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    #x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)
    model = Model(inputs=x_inp, outputs=prediction)

    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    # NOTE(review): the test flow doubles as validation data during fit.
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )
    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val:': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  #, "program":test_data['program']})
    # Strip the DictVectorizer "subject=" prefix from predicted labels.
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))

    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    #pred_program = np.unique(df['program'].values)
    # save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
def build_generator(self):
    """Create a two-hop GraphSAGE node generator over ``self.SG``.

    Yields batches of 50 nodes, sampling 10 first-hop and 5 second-hop
    neighbours per node.
    """
    hop_samples = [10, 5]
    return GraphSAGENodeGenerator(self.SG, 50, hop_samples)
# Labels are consumed as plain arrays here (no one-hot encoding step).
train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)

# ------------------------- GraphSAGE model building -------------------------
batch_size = 40
# neighbourhood sample sizes, one entry per hop (3 hops)
num_samples = [15, 10, 5]

generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
# flow over the node ids (dataframe index) of each split
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)
test_gen = generator.flow(test_subjects.index, test_targets)


def get_dropout(input_tensor, p=0.1, mc=False):
    """Apply dropout to *input_tensor*.

    With ``mc=True`` the layer stays active at inference time
    (Monte-Carlo dropout); otherwise standard train-only dropout.
    """
    layer = Dropout(p)
    if mc:
        return layer(input_tensor, training=True)
    return layer(input_tensor)
def _train_model(self, gnx, train_data, test_data, all_features, target_feature_name):
    """Train a GraphSAGE node classifier and evaluate it on the test split.

    Args:
        gnx: networkx graph providing node structure.
        train_data (pd.DataFrame): training rows, indexed by node id, with
            the class label in column *target_feature_name*.
        test_data (pd.DataFrame): test rows, same layout as train_data.
        all_features: node feature matrix passed to StellarGraph.
        target_feature_name (str): name of the label column.

    Returns:
        tuple: (generator, model, x_inp, x_out, history, target_encoding,
        output_results) where output_results collects dataset stats, test
        metrics, per-label precision/recall/f-score and training history.

    Reads ``self.batch_size`` and ``self.num_epochs``.
    """
    subject_groups_train = Counter(train_data[target_feature_name])
    subject_groups_test = Counter(test_data[target_feature_name])
    graph = sg.StellarGraph(gnx, node_features=all_features)
    output_results = {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'subject_groups_train': subject_groups_train,
        'subject_groups_test': subject_groups_test,
        'graph_info': graph.info()
    }

    # number of neighbours sampled at each of the two hops
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

    # one-hot encode the target labels
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[[target_feature_name]].to_dict('records'))
    # balanced class weights as {class_index: weight}, as Keras expects
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data[target_feature_name].to_list()),
        y=train_data[target_feature_name].to_list())
    class_weights = dict(enumerate(class_weights))
    test_targets = target_encoding.transform(
        test_data[[target_feature_name]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graph_sage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,  # train_gen,
        bias=True,
        dropout=0.5,
    )
    print('building model...')
    x_inp, x_out = graph_sage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    print('compiling model...')
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=['acc', metrics.categorical_accuracy],
    )
    print('testing the model...')
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit(
        train_gen,
        epochs=self.num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate(test_gen)
    print('Test Set Metrics:')
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        # key fixed from the original typo 'val:' to 'val'
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        'Predicted': results,
        'True': test_data[target_feature_name]
    })
    # DictVectorizer prefixes encoded labels with 'subject='; strip it
    clean_result_labels = df['Predicted'].map(
        lambda x: x.replace('subject=', ''))

    # save predicted labels and per-label scores
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values, clean_result_labels.values,
        average=None, labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    output_results['history'] = {
        'epochs': history.epoch,
        'training_log': history.history,
        'training_params': history.params
    }
    return generator, model, x_inp, x_out, history, target_encoding, output_results