# Shared third-party dependencies assumed by the snippets in this section.
import numpy as np
import pandas as pd
import stellargraph as sg


def pgframe_to_stellargraph(pgframe, directed=True, include_type=False,
                            feature_vector_prop=None, feature_props=None,
                            edge_weight=None):
    """Convert a PGFrame to a StellarGraph object."""
    if feature_props is None:
        feature_props = []
    feature_array = None

    if include_type:
        nodes = {}
        for t in pgframe.node_types():
            index = pgframe.nodes(typed_by=t)
            if feature_vector_prop is not None:
                feature_array = np.array(
                    pgframe.get_node_property_values(
                        feature_vector_prop, typed_by=t).to_list())
            elif len(feature_props) > 0:
                feature_array = pgframe.nodes(
                    raw_frame=True, typed_by=t)[feature_props].to_numpy()
            nodes[t] = sg.IndexedArray(feature_array, index=index)
    else:
        if feature_vector_prop is not None:
            feature_array = np.array(
                pgframe.get_node_property_values(
                    feature_vector_prop).to_list())
        elif len(feature_props) > 0:
            feature_array = pgframe.nodes(
                raw_frame=True)[feature_props].to_numpy()
        nodes = sg.IndexedArray(feature_array, index=pgframe.nodes())

    if pgframe.number_of_edges() > 0:
        edges = pgframe.edges(
            raw_frame=True,
            include_index=True,
            filter_props=lambda x: (
                (x == "@type") if include_type else False) or x == edge_weight,
            rename_cols={'@source_id': 'source', "@target_id": "target"})
    else:
        edges = pd.DataFrame(columns=["source", "target"])

    if directed:
        graph = sg.StellarDiGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    else:
        graph = sg.StellarGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    return graph
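# Hedged usage sketch (not from the original code): `frame` and the property
# name "embedding" are illustrative assumptions for a PGFrame whose nodes
# carry precomputed feature vectors.
#
#   graph = pgframe_to_stellargraph(
#       frame,
#       directed=True,
#       include_type=True,
#       feature_vector_prop="embedding",
#   )
#   print(graph.info())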
def serialize_stellargraph(self, attributes: List[str],
                           node_types: List[str]) -> (sg.StellarDiGraph, bool, str, set):
    contains_fraud = False
    edges = {'source': [], 'target': []}
    nodes = {}
    nodes_index = []
    fraud_ids = set()

    for attribute_name in attributes:
        nodes[attribute_name] = []
    for type_name in node_types:
        nodes[f'type_{type_name}'] = []

    for index, node in enumerate(self._nodes):
        node_properties = node.get_properties()
        nodes_index.append(node.get_id())

        # ground truth
        if 'is_fraud' in node_properties and node_properties['is_fraud']:
            contains_fraud = True
            fraud_ids.add(node_properties['fraud_id'])

        # data
        for attribute_name in attributes:
            if attribute_name in node_properties:
                value = node_properties[attribute_name]
                if isinstance(value, bool):
                    nodes[attribute_name].append(1 if value else 0)
                elif isinstance(value, (int, float)):
                    nodes[attribute_name].append(value)
                elif isinstance(value, str) and value.isdigit():
                    nodes[attribute_name].append(float(value))
                else:
                    nodes[attribute_name].append(np.nan)
            else:
                nodes[attribute_name].append(np.nan)

        for type_name in node_types:
            nodes[f'type_{type_name}'].append(node.get_type() == type_name)

        for neighbor in node.get_neighbors():
            edges['source'].append(node.get_id())
            edges['target'].append(neighbor.get_id())

    nodes_df = pd.DataFrame(nodes, index=nodes_index)

    # normalize
    for attribute_name in attributes:
        column_data = nodes_df[attribute_name]
        nodes_df[attribute_name] = ((column_data - np.nanmean(column_data))
                                    / np.nanstd(column_data))
    nodes_df.fillna(-1, inplace=True)

    return (sg.StellarDiGraph(nodes_df, edges=pd.DataFrame(edges)),
            contains_fraud, self._name, fraud_ids)
def serialize_stellargraph_node_level(self, attributes: List[str],
                                      node_types: List[str]) -> (sg.StellarDiGraph, pd.Series, str, pd.Series):
    nodes_gt = pd.Series(dtype=object)
    nodes_fraud_id = pd.Series(dtype=object)
    edges = {'source': [], 'target': []}
    nodes = {}
    nodes_index = []

    for attribute_name in attributes:
        nodes[attribute_name] = []
    for type_name in node_types:
        nodes[f'type_{type_name}'] = []

    for index, node in enumerate(self._nodes):
        node_properties = node.get_properties()
        nodes_index.append(node.get_id())

        # ground truth
        if 'is_fraud' in node_properties:
            if node_properties['is_fraud']:
                nodes_gt._set_value(node.get_id(), 'fraud')
                nodes_fraud_id._set_value(node.get_id(), node_properties['fraud_id'])
            else:
                nodes_gt._set_value(node.get_id(), 'no_fraud')
                nodes_fraud_id._set_value(node.get_id(), None)
        else:
            nodes_gt._set_value(node.get_id(), 'irrelevant')
            nodes_fraud_id._set_value(node.get_id(), None)

        # attributes
        for attribute_name in attributes:
            if attribute_name in node_properties:
                value = node_properties[attribute_name]
                if isinstance(value, bool):
                    nodes[attribute_name].append(1 if value else 0)
                elif isinstance(value, (int, float)):
                    nodes[attribute_name].append(value)
                elif isinstance(value, str) and value.isdigit():
                    nodes[attribute_name].append(float(value))
                else:
                    nodes[attribute_name].append(np.nan)
            else:
                nodes[attribute_name].append(np.nan)

        for type_name in node_types:
            nodes[f'type_{type_name}'].append(node.get_type() == type_name)

        for neighbor in node.get_neighbors():
            edges['source'].append(node.get_id())
            edges['target'].append(neighbor.get_id())

    nodes_df = pd.DataFrame(nodes, index=nodes_index)

    # normalize
    for attribute_name in attributes:
        column_data = nodes_df[attribute_name]
        nodes_df[attribute_name] = ((column_data - np.nanmean(column_data))
                                    / np.nanstd(column_data))
    nodes_df.fillna(-1, inplace=True)

    return (sg.StellarDiGraph(nodes_df, edges=pd.DataFrame(edges)),
            nodes_gt, self._name, nodes_fraud_id)
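# Both serializers above z-score each attribute column with nanmean/nanstd and
# then replace remaining NaNs with -1. A standalone illustration of that
# normalization on toy data (the column name "amount" is only an example):
import numpy as np
import pandas as pd

toy = pd.DataFrame({"amount": [10.0, 20.0, np.nan, 40.0]})
toy["amount"] = (toy["amount"] - np.nanmean(toy["amount"])) / np.nanstd(toy["amount"])
toy.fillna(-1, inplace=True)
print(toy)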
def infer_attributes_gat(Gnx, savepred=True, plot=False):
    # Define node data
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size"
    ]
    node_type = [v for k, v in nx.get_node_attributes(Gnx, 'data').items()]
    d = {"node_type": node_type}

    # Each structural feature is normalized by its maximum value and then
    # thresholded at 0.5 into a binary indicator.
    if "in_degree" in feature_names:
        indeg = [v for k, v in Gnx.in_degree]
        indeg = np.divide(indeg, max(indeg))
        indeg[indeg >= 0.5] = 1
        indeg[indeg < 0.5] = 0
        d["in_degree"] = indeg

    if "out_degree" in feature_names:
        outdeg = [v for k, v in Gnx.out_degree]
        outdeg = np.divide(outdeg, max(outdeg))
        outdeg[outdeg >= 0.5] = 1
        outdeg[outdeg < 0.5] = 0
        d["out_degree"] = outdeg

    if "in_degree_centrality" in feature_names:
        indeg_cent = [
            v for k, v in nx.algorithms.in_degree_centrality(Gnx).items()
        ]
        indeg_cent = np.divide(indeg_cent, max(indeg_cent))
        indeg_cent[indeg_cent >= 0.5] = 1
        indeg_cent[indeg_cent < 0.5] = 0
        d["in_degree_centrality"] = indeg_cent

    if "out_degree_centrality" in feature_names:
        outdeg_cent = [
            v for k, v in nx.algorithms.out_degree_centrality(Gnx).items()
        ]
        outdeg_cent = np.divide(outdeg_cent, max(outdeg_cent))
        outdeg_cent[outdeg_cent >= 0.5] = 1
        outdeg_cent[outdeg_cent < 0.5] = 0
        d["out_degree_centrality"] = outdeg_cent

    if "closeness_centrality" in feature_names:
        close_cent = [
            v for k, v in nx.algorithms.closeness_centrality(Gnx).items()
        ]
        close_cent = np.divide(close_cent, max(close_cent))
        close_cent[close_cent >= 0.5] = 1
        close_cent[close_cent < 0.5] = 0
        d["closeness_centrality"] = close_cent

    if "betweenness_centrality" in feature_names:
        between_cent = [
            v for k, v in nx.algorithms.betweenness_centrality(Gnx).items()
        ]
        between_cent = np.divide(between_cent, max(between_cent))
        between_cent[between_cent >= 0.5] = 1
        between_cent[between_cent < 0.5] = 0
        d["betweenness_centrality"] = between_cent

    if "clustering_coefficient" in feature_names:
        clustering_co = [v for k, v in nx.algorithms.clustering(Gnx).items()]
        clustering_co = np.divide(clustering_co, max(clustering_co))
        clustering_co[clustering_co >= 0.5] = 1
        clustering_co[clustering_co < 0.5] = 0
        d["clustering_coefficient"] = clustering_co

    if "square_clustering" in feature_names:
        sq_clustering = [
            v for k, v in nx.algorithms.square_clustering(Gnx).items()
        ]
        sq_clustering = np.divide(sq_clustering, max(sq_clustering))
        sq_clustering[sq_clustering >= 0.5] = 1
        sq_clustering[sq_clustering < 0.5] = 0
        d["square_clustering"] = sq_clustering

    if "core_number" in feature_names:
        core_number = [v for k, v in nx.algorithms.core_number(Gnx).items()]
        core_number = np.divide(core_number, max(core_number))
        core_number[core_number >= 0.5] = 1
        core_number[core_number < 0.5] = 0
        d["core_number"] = core_number

    if "pagerank" in feature_names:
        pagerank = [v for k, v in nx.algorithms.pagerank(Gnx).items()]
        pagerank = np.divide(pagerank, max(pagerank))
        pagerank[pagerank >= 0.5] = 1
        pagerank[pagerank < 0.5] = 0
        d["pagerank"] = pagerank

    if "constraint" in feature_names:
        constraint = [v for k, v in nx.algorithms.constraint(Gnx).items()]
        constraint = np.divide(constraint, max(constraint))
        constraint[np.isnan(constraint)] = 0
        constraint[constraint >= 0.5] = 1
        constraint[constraint < 0.5] = 0
        d["constraint"] = constraint

    if "effective_size" in feature_names:
        effective_size = [
            v for k, v in nx.algorithms.effective_size(Gnx).items()
        ]
        effective_size = np.divide(effective_size, max(effective_size))
        effective_size[np.isnan(effective_size)] = 0
        effective_size[effective_size >= 0.5] = 1
        effective_size[effective_size < 0.5] = 0
        d["effective_size"] = effective_size

    node_data = pd.DataFrame(data=d, index=list(Gnx.nodes()))  # one row per node, indexed by node ID
    node_data = shuffle(node_data)

    # Split the data
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx)))
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx)))

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["node_type"]].to_dict('records'))
    val_targets = target_encoding.transform(
        val_data[["node_type"]].to_dict('records'))
    test_targets = target_encoding.transform(
        test_data[["node_type"]].to_dict('records'))
    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())

    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_data.index, train_targets)

    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for node
    # prediction, via the GAT.node_model() method:
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )

    val_gen = generator.flow(val_data.index, val_targets)

    if not os.path.isdir(".temp/logs"):
        os.makedirs(".temp/logs")
    if not os.path.isdir(".temp/output"):
        os.makedirs(".temp/output")

    es_callback = EarlyStopping(
        monitor="val_weighted_acc",
        patience=100,  # number of epochs to wait before early stopping if there is no further improvement
    )
    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,  # should be False, since shuffling data here would mean shuffling the whole graph
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)
    node_predictions = target_encoding.inverse_transform(all_predictions)

    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data['node_type']})
    print(df.head())

    if savepred:
        df.to_excel(".temp/output/output"
                    + str(datetime.datetime.now()).replace(':', '-')
                    + ".xlsx")

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print("Embedding layer: {}, output shape {}".format(
            emb_layer.name, emb_layer.output_shape))
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)

        X = emb
        y = np.argmax(target_encoding.transform(
            node_data.reindex(G.nodes())[["node_type"]].to_dict('records')),
            axis=1)

        if X.shape[1] > 2:
            transform = TSNE  # or PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X),
                                           index=list(G.nodes()))
            emb_transformed['label'] = y
        else:
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            emb_transformed = emb_transformed.rename(columns={'0': 0, '1': 1})

        def plot_emb(transform, emb_transformed):
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(emb_transformed[0],
                       emb_transformed[1],
                       c=emb_transformed['label'].astype("category"),
                       cmap="jet",
                       alpha=0.7)
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                '{} visualization of GAT embeddings for the fighter graph'.format(
                    transform.__name__))

        # Plot the training history
        def remove_prefix(text, prefix):
            return text[text.startswith(prefix) and len(prefix):]

        def plot_history(history):
            metrics = sorted(
                set([
                    remove_prefix(m, "val_")
                    for m in list(history.history.keys())
                ]))
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history['val_' + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel('epoch')
                plt.legend(['train', 'validation'], loc='best')

        plot_history(history)
        plot_emb(transform, emb_transformed)
        plt.show()

    return df
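# Hedged usage sketch (not part of the original module): build a small random
# directed graph with the categorical 'data' attribute that
# infer_attributes_gat expects as the node label. Graph size and label names
# are arbitrary; the call itself is left commented out because it trains a
# GAT for up to 2000 epochs.
import networkx as nx
import numpy as np

demo_graph = nx.gnp_random_graph(200, 0.05, seed=0, directed=True)
demo_labels = {n: np.random.choice(["fighter", "event"]) for n in demo_graph.nodes()}
nx.set_node_attributes(demo_graph, demo_labels, name="data")
# predictions_df = infer_attributes_gat(demo_graph, savepred=False, plot=False)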
def createEmbeddings(v_sets, e_sets, core_targets, ext_targets, v_sample, e_sample):
    print("DeepGraphInfomax embedding Starting")
    t0 = time.time()
    verbose = 1

    # Initialize stellargraph object
    G = sg.StellarDiGraph(v_sets, e_sets)
    '''
    HinSAGENodeGenerator(G, batch_size, num_samples,
                         head_node_type=None, schema=None, seed=None, name=None)

    G = graph (stellargraph object)
    batch_size = size of batch to return
    num_samples = the number of samples per layer (hop) to take
    head_node_type = the node type that will be given to the generator using
        the flow method. The model will expect this type. If not given, it
        defaults to a single node type.
    Note: HinSAGE does aggregation on multiple node types but then predicts
    on one type.
    '''

    def create_embeddings(node_type, num_samples, hinsage_layer_sizes, epochs,
                          patience, batch_size, dropout, activations):
        # Check if num_samples and layer_size are compatible
        assert len(hinsage_layer_sizes) == len(num_samples)

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples=num_samples,
                                         head_node_type=node_type)

        # HinSAGE layers
        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs, node_type):
            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=node_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)
            x_in, x_out = infomax.in_out_tensors()

            print("Starting Training")
            ttrain = time.time()

            # Train
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=patience)
            history = model.fit(gen, epochs=epochs, verbose=verbose, callbacks=[es])
            # sg.utils.plot_history(history)

            ttrain1 = time.time()
            print(
                f"Training complete in {(ttrain1-ttrain):.2f} s ({(ttrain1-ttrain)/60:.2f} min)"
            )

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            # for full batch models, squeeze out the batch dim (which is 1)
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)
            return x_emb_in, x_emb_out

        # Run Deep Graph Infomax
        x_emb_in, x_emb_out = run_deep_graph_infomax(hinsage,
                                                     generator,
                                                     epochs=epochs,
                                                     node_type=node_type)
        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
        all_embeddings = emb_model.predict(
            generator.flow(G.nodes(node_type=node_type)))

        # TSNE visualization of embeddings
        ttsne = time.time()
        print("Creating TSNE")
        embeddings_2d = pd.DataFrame(
            TSNE(n_components=2).fit_transform(all_embeddings),
            index=G.nodes(node_type=node_type))

        # draw the points (colors based on ExtendedCaseGraphID)
        node_ids = G.nodes(node_type=node_type).tolist()
        ext_targets = v_sample.loc[[int(node_id) for node_id in node_ids
                                    ]].ExtendedCaseGraphID
        label_map = {
            l: i * 10
            for i, l in enumerate(np.unique(ext_targets), start=10)
            if pd.notna(l)
        }
        node_colours = [
            label_map[target] if pd.notna(target) else 0
            for target in ext_targets
        ]

        ttsne1 = time.time()
        print(
            f"TSNE completed in {(ttsne1-ttsne):.2f} s ({(ttsne1-ttsne)/60:.2f} min)"
        )

        alpha = 0.7
        fig, ax = plt.subplots(figsize=(15, 15))
        ax.scatter(
            embeddings_2d[0],
            embeddings_2d[1],
            c=node_colours,
            cmap="jet",
            alpha=alpha,
        )
        ax.set(aspect="equal")
        plt.title(
            f'TSNE visualization of HinSAGE "{node_type}" embeddings with Deep Graph Infomax'
        )
        plt.savefig(f"./embeddings/HinSAGE_DGI_embeddings_{node_type}.pdf")

        return all_embeddings, embeddings_2d

    # Repeat the DGI HinSAGE algorithm for every node type
    # (each node type requires a training phase).
    account_embeddings, account_2d = create_embeddings(
        node_type="Account",
        epochs=75,
        patience=25,
        batch_size=250,
        dropout=0.4,
        num_samples=[8, 4],
        hinsage_layer_sizes=[32, 32],
        activations=['relu', 'softmax'])

    customer_embeddings, customer_2d = create_embeddings(
        node_type="Customer",
        epochs=100,
        patience=50,
        batch_size=400,
        dropout=0.4,
        num_samples=[12],
        hinsage_layer_sizes=[72],
        activations=['relu'])

    derEntity_embeddings, derEntity_2d = create_embeddings(
        node_type="Derived Entity",
        epochs=100,
        patience=50,
        batch_size=1200,
        dropout=0.25,
        num_samples=[12],
        hinsage_layer_sizes=[72],
        activations=['relu'])

    # Address and External Entity don't have any outgoing edges and can't be
    # used for this. Another technique specific to External Entities and
    # Addresses might be a good fit.

    # Put all the embeddings in the same map
    # TODO

    # arrays
    full_graph_embeddings = [
        account_embeddings, customer_embeddings, derEntity_embeddings
    ]
    # dataframes
    full_graph_2d_frames = [account_2d, customer_2d, derEntity_2d]
    full_graph_2d = pd.concat(full_graph_2d_frames)

    # draw all the embeddings together
    node_ids_full = np.concatenate(
        (G.nodes(node_type='Account'), G.nodes(node_type='Customer'),
         G.nodes(node_type='Derived Entity'))).tolist()
    ext_targets_full = v_sample.loc[[
        int(node_id) for node_id in node_ids_full
    ]].ExtendedCaseGraphID
    label_map_full = {
        l: i * 10
        for i, l in enumerate(np.unique(ext_targets_full), start=10)
        if pd.notna(l)
    }
    node_colours_full = [
        label_map_full[target] if pd.notna(target) else 0
        for target in ext_targets_full
    ]

    alpha = 0.7
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.scatter(
        full_graph_2d[0],
        full_graph_2d[1],
        c=node_colours_full,
        cmap="jet",
        alpha=alpha,
    )
    ax.set(aspect="equal")
    plt.title(
        'TSNE visualization of HinSAGE Full Graph embeddings with Deep Graph Infomax'
    )
    plt.savefig("./embeddings/HinSAGE_DGI_embeddings_FullGraph.pdf")

    # Train a classifier for prediction
    # TODO

    t1 = time.time()
    print(f"HinSAGE DGI completed in {(t1-t0):.2f} s ({(t1-t0)/60:.2f} min)")
    return full_graph_embeddings
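# Illustrative sketch (not from the original project) of the input shape the
# HinSAGE/DGI routines above expect: StellarDiGraph accepts a dict of node
# DataFrames keyed by node type and a dict of edge DataFrames keyed by edge
# type, with "source"/"target" columns referencing node indices. Type names,
# features, and IDs below are made up.
import pandas as pd
import stellargraph as sg

v_sets_example = {
    "Account": pd.DataFrame({"feat_1": [0.1, 0.3]}, index=[1, 2]),
    "Customer": pd.DataFrame({"feat_1": [0.7]}, index=[3]),
}
e_sets_example = {
    "owns": pd.DataFrame({"source": [3], "target": [1]}),
}
G_example = sg.StellarDiGraph(v_sets_example, e_sets_example)
print(G_example.info())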
# Inspect the class labels and their distribution in the training split
# (notebook-style statements).
set(node_data["subject"])

train_data, test_data = model_selection.train_test_split(
    node_data, train_size=0.1, test_size=None, stratify=node_data["subject"]
)
Counter(train_data["subject"])

# One-hot encode the target labels
target_encoding = feature_extraction.DictVectorizer(sparse=False)
train_targets = target_encoding.fit_transform(
    train_data[["subject"]].to_dict("records"))
test_targets = target_encoding.transform(
    test_data[["subject"]].to_dict("records"))
node_features = node_data[feature_names]

# Build the directed StellarGraph from the paper features and citation edges
G = sg.StellarDiGraph(nodes={"paper": node_features}, edges={"cites": edgelist})

# Directed GraphSAGE: separate in-neighbour and out-neighbour sample sizes per hop
batch_size = 50
in_samples = [5, 2]
out_samples = [5, 2]
generator = DirectedGraphSAGENodeGenerator(G, batch_size, in_samples, out_samples)
train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

graphsage_model = DirectedGraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=False,
    dropout=0.5,
)
x_inp, x_out = graphsage_model.build()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
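# A hedged continuation sketch (not in the original excerpt): compile and fit
# the node classification model defined above. Epoch count and learning rate
# are assumptions.
from tensorflow.keras import Model, optimizers, losses

model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)
test_gen = generator.flow(test_data.index, test_targets)
history = model.fit(train_gen, epochs=20, validation_data=test_gen, verbose=2)
print(model.evaluate(test_gen))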
def sg_DeepWalk(v_sets, e_sets, v_sample, e_sample):
    G = sg.StellarDiGraph(v_sets, e_sets)

    #### Graph embedding with NODE2VEC and WORD2VEC
    print("Running DeepWalk")
    rw = sg.data.BiasedRandomWalk(G)

    t0 = time.time()
    walks = rw.run(
        nodes=list(G.nodes()),  # root nodes
        length=10,  # maximum length of a random walk
        n=10,  # number of random walks per root node
        p=0.6,  # defines (unnormalised) probability 1/p of returning to the source node
        q=1.7,  # defines (unnormalised) probability 1/q of moving away from the source node
    )
    t1 = time.time()
    print("Number of random walks: {} in {:.2f} s".format(len(walks), (t1 - t0)))

    str_walks = [[str(n) for n in walk] for walk in walks]
    # gensim < 4.0 API (`size` and `iter` were renamed in gensim 4);
    # size is the length of the embedding vector.
    model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=8, iter=5)

    # The embedding vectors can be retrieved from model.wv using the node ID.
    # model.wv["19231"].shape

    # Retrieve node embeddings
    node_ids = model.wv.index2word  # list of node IDs
    node_embeddings = (
        model.wv.vectors
    )  # numpy.ndarray of shape (number of nodes) x (embedding dimensionality)

    # Retrieve corresponding targets
    # from training csv
    # core_targets = core_target_sample.loc[[int(node_id) for node_id in node_ids if int(node_id) in list(core_target_sample.index)]].CaseID
    # ext_targets = ext_target_sample.loc[[int(node_id) for node_id in node_ids if int(node_id) in list(ext_target_sample.index)]].CaseID
    # from vertices' data
    core_targets = v_sample.loc[[int(node_id) for node_id in node_ids]].CoreCaseGraphID
    ext_targets = v_sample.loc[[int(node_id) for node_id in node_ids]].ExtendedCaseGraphID

    t2 = time.time()
    print(f"Deepwalk complete: {(t2-t0):.2f} s")

    # Visualize embeddings with TSNE
    embs_2d = get_TSNE(node_embeddings)

    # Draw the embedding points, coloring them by the target label (CaseID)
    alpha = 0.6
    label_map = {
        l: i
        for i, l in enumerate(np.unique(ext_targets), start=10)
        if pd.notna(l)
    }
    label_map[0] = 1
    node_colours = [
        label_map[target] if pd.notna(target) else 0 for target in ext_targets
    ]

    plt.figure(figsize=(15, 15))
    plt.axes().set(aspect="equal")
    plt.scatter(
        embs_2d[:, 0],
        embs_2d[:, 1],
        c=node_colours,
        cmap="jet",
        alpha=alpha,
    )
    plt.title("TSNE visualization of node embeddings w.r.t. Extended Case ID")
    plt.show()

    return node_ids, node_embeddings, core_targets, ext_targets
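# sg_DeepWalk calls a get_TSNE helper that is not shown in this excerpt; a
# minimal sketch of what it is assumed to do (project the embedding matrix
# to 2D for plotting):
from sklearn.manifold import TSNE

def get_TSNE(embeddings, n_components=2):
    # Reduce high-dimensional node embeddings to 2D with t-SNE.
    return TSNE(n_components=n_components).fit_transform(embeddings)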
def DGIPipeline(v_sets, e_sets, v_data, e_data, core_targets, ext_targets, core_testing):
    print("HINSAGE DGI FULL PIPELINE STARTED")
    tin = time.time()

    #? Sort based on testingFlag
    # data_splits[i].iloc[INDEX].values[0]
    # where INDEX:
    # [0] testingFlag=NaN
    # [1] testingFlag=0
    # [2] testingFlag=1
    data_splits = dict()
    for i in v_sets:
        v_sets[i] = v_sets[i].sort_values('testingFlag')
        data_splits[i] = v_sets[i].testingFlag.value_counts().to_frame()
        v_sets[i] = v_sets[i].drop('testingFlag', axis=1)

    #? Removing ExtendedCaseGraphID
    for i in v_sets:
        v_sets[i] = v_sets[i].drop('ExtendedCaseGraphID', axis=1)

    #? Create the graph object
    G = sg.StellarDiGraph(v_sets, e_sets)
    '''
    Iterate through the algorithm for every node type. This is because
    HinSAGE can predict on one node type at a time, even though it uses the
    whole graph to compute the embeddings.
    '''

    # Parameters
    batch_size = 200
    dropout = 0.4
    verbose = 1
    visualize = False

    def run_for_node_type(v_type, hinsage_layer_sizes, num_samples, activations, epochs):
        nan_tflag = data_splits[v_type].iloc[0].values[0]
        train_tflag = data_splits[v_type].iloc[1].values[0]
        test_tflag = data_splits[v_type].iloc[2].values[0]

        train_cv_set = v_sets[v_type][nan_tflag:nan_tflag + train_tflag]
        train_cv_ids = train_cv_set.index.values.tolist()
        train_cv_labels = v_data.loc[[
            int(node_id) for node_id in train_cv_ids
        ]].ExtendedCaseGraphID
        test_set = v_sets[v_type][-test_tflag:]
        test_ids = test_set.index.values.tolist()

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples,
                                         head_node_type=v_type)
        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs):
            print(f"Starting training for {v_type} type: ")
            t0 = time.time()

            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=v_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)
            x_in, x_out = infomax.in_out_tensors()

            # Train with DGI
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=10)
            history = model.fit(gen, epochs=epochs, verbose=verbose, callbacks=[es])
            # sg.utils.plot_history(history)

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)

            t1 = time.time()
            print(f'Time required: {t1-t0:.2f} s ({(t1-t0)/60:.1f} min)')
            return x_emb_in, x_emb_out, model

        #? Train HinSAGE model:
        x_emb_in, x_emb_out, _model = run_deep_graph_infomax(hinsage, generator, epochs=epochs)
        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
        train_cv_embs = emb_model.predict(
            generator.flow(train_cv_set.index.values))

        #? Optional: Plot embeddings of training and CV set of current node type
        if visualize:
            train_cv_embs_2d = pd.DataFrame(
                TSNE(n_components=2).fit_transform(train_cv_embs),
                index=train_cv_set.index.values)
            label_map = {
                l: i * 10
                for i, l in enumerate(np.unique(train_cv_labels), start=10)
                if pd.notna(l)
            }
            node_colours = [
                label_map[target] if pd.notna(target) else 0
                for target in train_cv_labels
            ]

            alpha = 0.7
            fig, ax = plt.subplots(figsize=(15, 15))
            ax.scatter(
                train_cv_embs_2d[0],
                train_cv_embs_2d[1],
                c=node_colours,
                cmap="jet",
                alpha=alpha,
            )
            ax.set(aspect="equal")
            plt.title(
                f"TSNE of HinSAGE {v_type} embeddings with DGI - coloring on ExtendedCaseGraphID"
            )
            plt.show()
            # visualisation-only mode: stop after plotting the embeddings
            return 1
        #? Split training and cross-validation sets with a simple ordered 80/20 split
        n_embs = train_cv_embs.shape[0]
        train_size = int(n_embs * 0.80)
        cv_size = int(n_embs * 0.20)

        train_set = train_cv_embs[:train_size]
        train_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[:train_size]).fillna(0))
        cv_set = train_cv_embs[-cv_size:]
        cv_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[-cv_size:]).fillna(0))

        #? CLASSIFY
        print(f"Running Classifier for {v_type} type")
        classifier = DecisionTreeClassifier()
        classifier.fit(
            X=train_set,
            y=train_labels,
        )
        cv_pred = classifier.predict(cv_set)
        f1_avg = f1_score(cv_labels, cv_pred, average='weighted')
        acc = (cv_pred == cv_labels).mean()
        print(f"{v_type} CV Metrics: f1: {f1_avg:.6f} - acc: {acc:.6f}")

        #? Now run on the test set
        test_embs = emb_model.predict(generator.flow(test_set.index.values))
        test_pred = classifier.predict(test_embs)

        #? Save predictions
        outdir = './output'
        outname = f"{v_type}_predictions.csv"
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        fullname = os.path.join(outdir, outname)

        output = pd.DataFrame(test_ids)
        output = output.rename(columns={0: 'node_id'})
        output['ExtendedCaseGraphID'] = test_pred
        output = output.set_index('node_id')
        output.to_csv(fullname)

        return output

    #? Run for each node type
    full_predictions = pd.DataFrame()
    for v_type in v_sets:
        if v_type == 'Account':
            epochs = 12
            num_samples = [8, 4]
            hinsage_layer_sizes = [32, 32]
            activations = ['relu', 'relu']
        else:
            epochs = 30
            num_samples = [12]
            hinsage_layer_sizes = [72]
            activations = ['relu']

        if v_type != 'External Entity' and v_type != 'Address':
            predictions = run_for_node_type(v_type, hinsage_layer_sizes,
                                            num_samples, activations, epochs)
            # DataFrame.append was removed in pandas 2.0; concat is equivalent here
            full_predictions = pd.concat([full_predictions, predictions])

    full_predictions.to_csv("./output/full_predictions.csv")

    tout = time.time()
    print(f"HINSAGE DGI FULL PIPELINE COMPLETED: {(tout-tin)/60:.0f} min")
    return 1
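# Hedged call sketch: the argument objects (v_sets, e_sets, v_data, e_data,
# targets, and samples) are prepared by data-loading code that is not part of
# this excerpt, so the calls are shown commented out.
#
#   full_graph_embeddings = createEmbeddings(v_sets, e_sets, core_targets,
#                                            ext_targets, v_sample, e_sample)
#   DGIPipeline(v_sets, e_sets, v_data, e_data, core_targets, ext_targets,
#               core_testing)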