Example 1
# Imports inferred from usage.
import numpy as np
import pandas as pd
import stellargraph as sg


def pgframe_to_stellargraph(pgframe,
                            directed=True,
                            include_type=False,
                            feature_vector_prop=None,
                            feature_props=None,
                            edge_weight=None):
    """Convert a PGFrame to a StellarGraph object."""
    if feature_props is None:
        feature_props = []

    feature_array = None
    if include_type:
        nodes = {}
        for t in pgframe.node_types():
            index = pgframe.nodes(typed_by=t)
            if feature_vector_prop is not None:
                feature_array = np.array(
                    pgframe.get_node_property_values(feature_vector_prop,
                                                     typed_by=t).to_list())
            elif len("feature_props") > 0:
                feature_array = pgframe.nodes(
                    raw_frame=True, typed_by=t)[feature_props].to_numpy()
            nodes[t] = sg.IndexedArray(feature_array, index=index)
    else:
        if feature_vector_prop is not None:
            feature_array = np.array(
                pgframe.get_node_property_values(
                    feature_vector_prop).to_list())
        elif len("feature_props") > 0:
            feature_array = pgframe.nodes(
                raw_frame=True)[feature_props].to_numpy()
        nodes = sg.IndexedArray(feature_array, index=pgframe.nodes())

    if pgframe.number_of_edges() > 0:
        edges = pgframe.edges(raw_frame=True,
                              include_index=True,
                              filter_props=lambda x:
                              ((x == "@type")
                               if include_type else False) or x == edge_weight,
                              rename_cols={
                                  '@source_id': 'source',
                                  "@target_id": "target"
                              })
    else:
        edges = pd.DataFrame(columns=["source", "target"])

    if directed:
        graph = sg.StellarDiGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    else:
        graph = sg.StellarGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    return graph
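
For reference, a minimal standalone sketch (with made-up node IDs, feature values, and a single edge) of the StellarDiGraph construction this helper performs:

import numpy as np
import pandas as pd
import stellargraph as sg

# Two nodes with 3-dimensional feature vectors, one directed edge.
nodes = sg.IndexedArray(np.array([[1.0, 0.0, 0.5],
                                  [0.2, 1.0, 0.0]]),
                        index=["n1", "n2"])
edges = pd.DataFrame({"source": ["n1"], "target": ["n2"]})
graph = sg.StellarDiGraph(nodes=nodes, edges=edges)
print(graph.info())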
Example 2
    def serialize_stellargraph(self, attributes: List[str], node_types: List[str]) -> (sg.StellarDiGraph, bool, str, set):
        contains_fraud = False
        edges = {
            'source': [],
            'target': []
        }
        nodes = {}
        nodes_index = []
        fraud_ids = set()

        for attribute_name in attributes:
            nodes[attribute_name] = []

        for type_name in node_types:
            nodes[f'type_{type_name}'] = []

        for node in self._nodes:
            node_properties = node.get_properties()
            nodes_index.append(node.get_id())

            # ground truth
            if 'is_fraud' in node_properties and node_properties['is_fraud']:
                contains_fraud = True

                fraud_ids.add(node_properties['fraud_id'])

            # data
            for attribute_name in attributes:
                if attribute_name in node_properties:
                    value = node_properties[attribute_name]
                    if isinstance(value, bool):
                        nodes[attribute_name].append(1 if value else 0)
                    elif isinstance(value, (int, float)):
                        nodes[attribute_name].append(value)
                    elif isinstance(value, str) and value.isdigit():
                        nodes[attribute_name].append(float(value))
                    else:
                        nodes[attribute_name].append(np.nan)
                else:
                    nodes[attribute_name].append(np.nan)

            for type_name in node_types:
                nodes[f'type_{type_name}'].append(node.get_type() == type_name)

            for neighbor in node.get_neighbors():
                edges['source'].append(node.get_id())
                edges['target'].append(neighbor.get_id())

        nodes_df = pd.DataFrame(nodes, index=nodes_index)

        # NaN-aware z-score normalization per attribute column; any values
        # that are still NaN afterwards are filled with -1 below
        for attribute_name in attributes:
            column_data = nodes_df[attribute_name]
            nodes_df[attribute_name] = ((column_data - np.nanmean(column_data)) / np.nanstd(column_data))

        nodes_df.fillna(-1, inplace=True)

        return sg.StellarDiGraph(nodes_df, edges=pd.DataFrame(edges)), contains_fraud, self._name, fraud_ids
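
The normalization step above is a NaN-aware z-score followed by filling the remaining gaps with -1; a small standalone illustration with made-up values:

import numpy as np
import pandas as pd

col = pd.Series([1.0, 3.0, np.nan])
normalized = (col - np.nanmean(col)) / np.nanstd(col)
print(normalized.fillna(-1).tolist())  # [-1.0, 1.0, -1.0]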
Example 3
    def serialize_stellargraph_node_level(self, attributes: List[str], node_types: List[str]) -> (sg.StellarDiGraph, pd.Series, str, pd.Series):
        nodes_gt = pd.Series(dtype=object)
        nodes_fraud_id = pd.Series(dtype=object)
        edges = {
            'source': [],
            'target': []
        }
        nodes = {}
        nodes_index = []

        for attribute_name in attributes:
            nodes[attribute_name] = []

        for type_name in node_types:
            nodes[f'type_{type_name}'] = []

        for node in self._nodes:
            node_properties = node.get_properties()
            nodes_index.append(node.get_id())

            # ground truth
            if 'is_fraud' in node_properties:
                if node_properties['is_fraud']:
                    nodes_gt.loc[node.get_id()] = 'fraud'
                    nodes_fraud_id.loc[node.get_id()] = node_properties['fraud_id']
                else:
                    nodes_gt.loc[node.get_id()] = 'no_fraud'
                    nodes_fraud_id.loc[node.get_id()] = None
            else:
                nodes_gt.loc[node.get_id()] = 'irrelevant'
                nodes_fraud_id.loc[node.get_id()] = None

            # attributes
            for attribute_name in attributes:
                if attribute_name in node_properties:
                    value = node_properties[attribute_name]
                    if isinstance(value, bool):
                        nodes[attribute_name].append(1 if value else 0)
                    elif isinstance(value, (int, float)):
                        nodes[attribute_name].append(value)
                    elif isinstance(value, str) and value.isdigit():
                        nodes[attribute_name].append(float(value))
                    else:
                        nodes[attribute_name].append(np.nan)
                else:
                    nodes[attribute_name].append(np.nan)

            for type_name in node_types:
                nodes[f'type_{type_name}'].append(node.get_type() == type_name)

            for neighbor in node.get_neighbors():
                edges['source'].append(node.get_id())
                edges['target'].append(neighbor.get_id())

        nodes_df = pd.DataFrame(nodes, index=nodes_index)

        # NaN-aware z-score normalization per attribute column; any values
        # that are still NaN afterwards are filled with -1 below
        for attribute_name in attributes:
            column_data = nodes_df[attribute_name]
            nodes_df[attribute_name] = ((column_data - np.nanmean(column_data)) / np.nanstd(column_data))

        nodes_df.fillna(-1, inplace=True)

        return sg.StellarDiGraph(nodes_df, edges=pd.DataFrame(edges)), nodes_gt, self._name, nodes_fraud_id
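
The returned ground-truth Series lends itself to a stratified node split; a sketch assuming `nodes_gt` comes from the method above:

from sklearn import model_selection

# Keep only the labeled nodes, then split with stratification on the label.
labeled = nodes_gt[nodes_gt != 'irrelevant']
train_ids, test_ids = model_selection.train_test_split(
    labeled.index, train_size=0.8, stratify=labeled.values)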
Example 4
# Imports inferred from usage; Keras objects are assumed to come from
# tensorflow.keras.
import datetime
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg
from sklearn import feature_extraction, model_selection
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from stellargraph.layer import GAT
from stellargraph.mapper import FullBatchNodeGenerator
from tensorflow.keras import Model, losses, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


def infer_attributes_gat(Gnx, savepred=True, plot=False):
    # Define node data
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size"
    ]
    node_type = [v for k, v in nx.get_node_attributes(Gnx, 'data').items()]
    d = {"node_type": node_type}
    if "in_degree" in feature_names:
        indeg = [v for k, v in Gnx.in_degree]
        indeg = np.divide(indeg, max(indeg))
        indeg[indeg >= 0.5] = 1
        indeg[indeg < 0.5] = 0
        d["in_degree"] = indeg
    if "out_degree" in feature_names:
        outdeg = [v for k, v in Gnx.out_degree]
        outdeg = np.divide(outdeg, max(outdeg))
        outdeg[outdeg >= 0.5] = 1
        outdeg[outdeg < 0.5] = 0
        d["out_degree"] = outdeg
    if "in_degree_centrality" in feature_names:
        indeg_cent = [
            v for k, v in nx.algorithms.in_degree_centrality(Gnx).items()
        ]
        indeg_cent = np.divide(indeg_cent, max(indeg_cent))
        indeg_cent[indeg_cent >= 0.5] = 1
        indeg_cent[indeg_cent < 0.5] = 0
        d["in_degree_centrality"] = indeg_cent
    if "out_degree_centrality" in feature_names:
        outdeg_cent = [
            v for k, v in nx.algorithms.out_degree_centrality(Gnx).items()
        ]
        outdeg_cent = np.divide(outdeg_cent, max(outdeg_cent))
        outdeg_cent[outdeg_cent >= 0.5] = 1
        outdeg_cent[outdeg_cent < 0.5] = 0
        d["out_degree_centrality"] = outdeg_cent
    if "closeness_centrality" in feature_names:
        close_cent = [
            v for k, v in nx.algorithms.closeness_centrality(Gnx).items()
        ]
        close_cent = np.divide(close_cent, max(close_cent))
        close_cent[close_cent >= 0.5] = 1
        close_cent[close_cent < 0.5] = 0
        d["closeness_centrality"] = close_cent
    if "betweenness_centrality" in feature_names:
        between_cent = [
            v for k, v in nx.algorithms.betweenness_centrality(Gnx).items()
        ]
        between_cent = np.divide(between_cent, max(between_cent))
        between_cent[between_cent >= 0.5] = 1
        between_cent[between_cent < 0.5] = 0
        d["betweenness_centrality"] = between_cent
    if "clustering_coefficient" in feature_names:
        clustering_co = [v for k, v in nx.algorithms.clustering(Gnx).items()]
        clustering_co = np.divide(clustering_co, max(clustering_co))
        clustering_co[clustering_co >= 0.5] = 1
        clustering_co[clustering_co < 0.5] = 0
        d["clustering_coefficient"] = clustering_co
    if "square_clustering" in feature_names:
        sq_clustering = [
            v for k, v in nx.algorithms.square_clustering(Gnx).items()
        ]
        sq_clustering = np.divide(sq_clustering, max(sq_clustering))
        sq_clustering[sq_clustering >= 0.5] = 1
        sq_clustering[sq_clustering < 0.5] = 0
        d["square_clustering"] = sq_clustering
    if "core_number" in feature_names:
        core_number = [v for k, v in nx.algorithms.core_number(Gnx).items()]
        core_number = np.divide(core_number, max(core_number))
        core_number[core_number >= 0.5] = 1
        core_number[core_number < 0.5] = 0
        d["core_number"] = core_number
    if "pagerank" in feature_names:
        pagerank = [v for k, v in nx.algorithms.pagerank(Gnx).items()]
        pagerank = np.divide(pagerank, max(pagerank))
        pagerank[pagerank >= 0.5] = 1
        pagerank[pagerank < 0.5] = 0
        d["pagerank"] = pagerank
    if "constraint" in feature_names:
        constraint = [v for k, v in nx.algorithms.constraint(Gnx).items()]
        constraint = np.divide(constraint, max(constraint))
        constraint[np.isnan(constraint)] = 0
        constraint[constraint >= 0.5] = 1
        constraint[constraint < 0.5] = 0
        d["constraint"] = constraint
    if "effective_size" in feature_names:
        effective_size = [
            v for k, v in nx.algorithms.effective_size(Gnx).items()
        ]
        effective_size = np.divide(effective_size, max(effective_size))
        effective_size[np.isnan(effective_size)] = 0
        effective_size[effective_size >= 0.5] = 1
        effective_size[effective_size < 0.5] = 0
        d["effective_size"] = effective_size
    node_data = pd.DataFrame(data=d, index=list(Gnx.nodes()))
    node_data = shuffle(node_data)

    # Split the data
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx)))
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx)))

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)

    train_targets = target_encoding.fit_transform(
        train_data[["node_type"]].to_dict('records'))
    val_targets = target_encoding.transform(
        val_data[["node_type"]].to_dict('records'))
    test_targets = target_encoding.transform(
        test_data[["node_type"]].to_dict('records'))

    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())

    generator = FullBatchNodeGenerator(G)

    train_gen = generator.flow(train_data.index, train_targets)

    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for node prediction, via GAT.node_model() method:
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )

    val_gen = generator.flow(val_data.index, val_targets)

    if not os.path.isdir(".temp/logs"):
        os.makedirs(".temp/logs")
    if not os.path.isdir(".temp/output"):
        os.makedirs(".temp/output")

    # patience: number of epochs to wait for further improvement before
    # stopping early
    es_callback = EarlyStopping(monitor="val_weighted_acc", patience=100)

    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,  # must be False: shuffling here would shuffle the whole graph
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)

    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)

    node_predictions = target_encoding.inverse_transform(all_predictions)

    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data['node_type']})
    print(df.head())

    if savepred:
        df.to_excel(".temp/output/output" +
                    str(datetime.datetime.now()).replace(':', '-') + ".xlsx")

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print("Embedding layer: {}, output shape {}".format(
            emb_layer.name, emb_layer.output_shape))
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)

        X = emb
        y = np.argmax(target_encoding.transform(
            node_data.reindex(G.nodes())[["node_type"]].to_dict('records')),
                      axis=1)

        if X.shape[1] > 2:
            transform = TSNE  #PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X),
                                           index=list(G.nodes()))
            emb_transformed['label'] = y
        else:
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            emb_transformed = emb_transformed.rename(columns={'0': 0, '1': 1})

        def plot_emb(transform, emb_transformed):
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(emb_transformed[0],
                       emb_transformed[1],
                       c=emb_transformed['label'].astype("category"),
                       cmap="jet",
                       alpha=0.7)
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                '{} visualization of GAT embeddings for the fighter graph'.
                format(transform.__name__))

        # Plot the training history
        def remove_prefix(text, prefix):
            return text[text.startswith(prefix) and len(prefix):]

        def plot_history(history):
            metrics = sorted(
                set([
                    remove_prefix(m, "val_")
                    for m in list(history.history.keys())
                ]))
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history['val_' + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel('epoch')
                plt.legend(['train', 'validation'], loc='best')

        plot_history(history)
        plot_emb(transform, emb_transformed)
        plt.show()

    return df
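
The target encoding above relies on DictVectorizer's one-hot round trip; a standalone illustration with made-up labels:

from sklearn import feature_extraction

enc = feature_extraction.DictVectorizer(sparse=False)
onehot = enc.fit_transform([{"node_type": "a"}, {"node_type": "b"}])
print(onehot)                         # [[1. 0.], [0. 1.]]
print(enc.inverse_transform(onehot))  # [{'node_type=a': 1.0}, {'node_type=b': 1.0}]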
Example 5
# Imports inferred from usage; Keras objects are assumed to come from
# tensorflow.keras.
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stellargraph as sg
import tensorflow as tf
from sklearn.manifold import TSNE
from stellargraph.layer import DeepGraphInfomax, HinSAGE
from stellargraph.mapper import CorruptedGenerator, HinSAGENodeGenerator
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


def createEmbeddings(v_sets, e_sets, core_targets, ext_targets, v_sample,
                     e_sample):
    print("DeepGraphInfomax embedding Starting")

    t0 = time.time()

    verbose = 1

    # Initialize stellargraph object
    G = sg.StellarDiGraph(v_sets, e_sets)
    '''
    HinSAGENodeGenerator(G, batch_size, num_samples, head_node_type=None,
                         schema=None, seed=None, name=None)

    G = graph (StellarGraph object)
    batch_size = size of batch to return
    num_samples = the number of samples per layer (hop) to take
    head_node_type = the node type that will be given to the generator
                     using the flow method. The model will expect this type.
                     If not given, it defaults to a single node type.
                     Note: HinSAGE aggregates over multiple node types
                     but then predicts on one type.
    '''
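    # A hypothetical instantiation matching the description above
    # (parameter values are illustrative only):
    #   generator = HinSAGENodeGenerator(G, batch_size=250,
    #                                    num_samples=[8, 4],
    #                                    head_node_type="Account")
    #   batches = generator.flow(G.nodes(node_type="Account"))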
    def create_embeddings(node_type, num_samples, hinsage_layer_sizes, epochs,
                          patience, batch_size, dropout, activations):

        # Check if num_samples and layer_size are compatible
        assert len(hinsage_layer_sizes) == len(num_samples)

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples=num_samples,
                                         head_node_type=node_type)

        # HinSAGE layers
        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs, node_type):
            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=node_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)

            x_in, x_out = infomax.in_out_tensors()

            print("Starting Training")
            ttrain = time.time()
            # Train
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=patience)

            history = model.fit(gen,
                                epochs=epochs,
                                verbose=verbose,
                                callbacks=[es])
            # sg.utils.plot_history(history)

            ttrain1 = time.time()
            print(
                f"Training complete in {(ttrain1-ttrain):.2f} s ({(ttrain1-ttrain)/60:.2f} min)"
            )

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            # for full batch models, squeeze out the batch dim (which is 1)
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)

            return x_emb_in, x_emb_out

        # Run Deep Graph Infomax
        x_emb_in, x_emb_out = run_deep_graph_infomax(hinsage,
                                                     generator,
                                                     epochs=epochs,
                                                     node_type=node_type)

        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
        all_embeddings = emb_model.predict(
            generator.flow(G.nodes(node_type=node_type)))

        # TSNE visualization of embeddings
        ttsne = time.time()
        print("Creating TSNE")
        embeddings_2d = pd.DataFrame(
            TSNE(n_components=2).fit_transform(all_embeddings),
            index=G.nodes(node_type=node_type))

        # draw the points (colors based on ExtendedCaseGraphID)
        node_ids = G.nodes(node_type=node_type).tolist()
        ext_targets = v_sample.loc[[int(node_id) for node_id in node_ids
                                    ]].ExtendedCaseGraphID

        label_map = {
            l: i * 10
            for i, l in enumerate(np.unique(ext_targets), start=10)
            if pd.notna(l)
        }
        node_colours = [
            label_map[target] if pd.notna(target) else 0
            for target in ext_targets
        ]

        ttsne1 = time.time()
        print(
            f"TSNE completed in {(ttsne1-ttsne):.2f} s ({(ttsne1-ttsne)/60:.2f} min)"
        )

        alpha = 0.7
        fig, ax = plt.subplots(figsize=(15, 15))
        ax.scatter(
            embeddings_2d[0],
            embeddings_2d[1],
            c=node_colours,
            cmap="jet",
            alpha=alpha,
        )
        ax.set(aspect="equal")
        plt.title(
            f'TSNE visualization of HinSAGE "{node_type}" embeddings with Deep Graph Infomax'
        )
        plt.savefig(f"./embeddings/HinSAGE_DGI_embeddings_{node_type}.pdf")

        return all_embeddings, embeddings_2d

    # Repeat DGI HinSAGE algorithm for every node type
    # (each node type requires a training phase)

    account_embeddings, account_2d = create_embeddings(
        node_type="Account",
        epochs=75,
        patience=25,
        batch_size=250,
        dropout=0.4,
        num_samples=[8, 4],
        hinsage_layer_sizes=[32, 32],
        activations=['relu', 'softmax'])

    customer_embeddings, customer_2d = create_embeddings(
        node_type="Customer",
        epochs=100,
        patience=50,
        batch_size=400,
        dropout=0.4,
        num_samples=[12],
        hinsage_layer_sizes=[72],
        activations=['relu'])

    derEntity_embeddings, derEntity_2d = create_embeddings(
        node_type="Derived Entity",
        epochs=100,
        patience=50,
        batch_size=1200,
        dropout=0.25,
        num_samples=[12],
        hinsage_layer_sizes=[72],
        activations=['relu'])

    # Address and External Entity don't have any outgoing edges, so they
    # can't be used for this; a technique specific to External Entities
    # and Addresses might be a better fit.

    # Put all the embeddings in the same map
    # TODO

    # arrays
    full_graph_embeddings = [
        account_embeddings, customer_embeddings, derEntity_embeddings
    ]

    # dataframes
    full_graph_2d_frames = [account_2d, customer_2d, derEntity_2d]
    full_graph_2d = pd.concat(full_graph_2d_frames)

    # draw all the embeddings together
    node_ids_full = np.concatenate(
        (G.nodes(node_type='Account'), G.nodes(node_type='Customer'),
         G.nodes(node_type='Derived Entity'))).tolist()

    ext_targets_full = v_sample.loc[[
        int(node_id) for node_id in node_ids_full
    ]].ExtendedCaseGraphID

    label_map_full = {
        l: i * 10
        for i, l in enumerate(np.unique(ext_targets_full), start=10)
        if pd.notna(l)
    }
    node_colours_full = [
        label_map_full[target] if pd.notna(target) else 0
        for target in ext_targets_full
    ]

    alpha = 0.7
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.scatter(
        full_graph_2d[0],
        full_graph_2d[1],
        c=node_colours_full,
        cmap="jet",
        alpha=alpha,
    )
    ax.set(aspect="equal")
    plt.title(
        'TSNE visualization of HinSAGE Full Graph embeddings with Deep Graph Infomax'
    )
    plt.savefig("./embeddings/HinSAGE_DGI_embeddings_FullGraph.pdf")

    # Train a classifier for prediction
    # TODO

    t1 = time.time()
    print(f"HinSAGE DGI completed in {(t1-t0):.2f} s ({(t1-t0)/60:.2f} min)")

    return full_graph_embeddings
Example 6
set(node_data["subject"])

train_data, test_data = model_selection.train_test_split(
    node_data, train_size=0.1, test_size=None, stratify=node_data["subject"]
)

Counter(train_data["subject"])
target_encoding = feature_extraction.DictVectorizer(sparse=False)

train_targets = target_encoding.fit_transform(train_data[["subject"]].to_dict("records"))
test_targets = target_encoding.transform(test_data[["subject"]].to_dict("records"))

node_features = node_data[feature_names]

G = sg.StellarDiGraph(nodes={"paper": node_features}, edges={"cites": edgelist})


batch_size = 50
in_samples = [5, 2]
out_samples = [5, 2]

generator = DirectedGraphSAGENodeGenerator(G, batch_size, in_samples, out_samples)
train_gen = generator.flow(train_data.index, train_targets, shuffle=True)
graphsage_model = DirectedGraphSAGE(
    layer_sizes=[32, 32], generator=generator, bias=False, dropout=0.5,
)

x_inp, x_out = graphsage_model.build()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
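
The snippet stops at the prediction layer; a plausible continuation in the style of the other examples (hyperparameter values are illustrative only), compiling and training the model:

from tensorflow.keras import Model, losses, optimizers

model = Model(inputs=x_inp, outputs=prediction)
model.compile(optimizer=optimizers.Adam(lr=0.005),
              loss=losses.categorical_crossentropy,
              metrics=["acc"])
test_gen = generator.flow(test_data.index, test_targets)
history = model.fit_generator(train_gen,
                              epochs=20,
                              validation_data=test_gen,
                              verbose=2,
                              shuffle=False)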
Example 7
# Imports inferred from usage (gensim 3.x-style Word2Vec API).
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stellargraph as sg
from gensim.models import Word2Vec


def sg_DeepWalk(v_sets, e_sets, v_sample, e_sample):
    G = sg.StellarDiGraph(v_sets, e_sets)

    #### Graph embedding with NODE2VEC and WORD2VEC

    print("Running DeepWalk")

    rw = sg.data.BiasedRandomWalk(G)
    t0 = time.time()
    walks = rw.run(
        nodes=list(G.nodes()),  # root nodes
        length=10,  # maximum length of a random walk
        n=10,  # number of random walks per root node
        p=0.6,  # defines the (unnormalised) probability 1/p of returning to the source node
        q=1.7,  # defines the (unnormalised) probability 1/q of moving away from the source node
    )
    t1 = time.time()
    print("Number of random walks: {} in {:.2f} s".format(
        len(walks), (t1 - t0)))

    str_walks = [[str(n) for n in walk] for walk in walks]
    model = Word2Vec(str_walks,
                     size=128,
                     window=5,
                     min_count=0,
                     sg=1,
                     workers=8,
                     iter=5)
    # size: length of embedding vector

    # The embedding vectors can be retrieved from model.wv using the node ID.
    # model.wv["19231"].shape

    # Retrieve node embeddings
    node_ids = model.wv.index2word  # list of node IDs
    node_embeddings = (
        model.wv.vectors
    )  # numpy.ndarray of size number of nodes times embeddings dimensionality

    # Retrieve corresponding targets

    # from training csv
    # core_targets = core_target_sample.loc[[int(node_id) for node_id in node_ids if int(node_id) in list(core_target_sample.index)]].CaseID
    # ext_targets = ext_target_sample.loc[[int(node_id) for node_id in node_ids if int(node_id) in list(ext_target_sample.index)]].CaseID

    # from vertices' data
    core_targets = v_sample.loc[[int(node_id)
                                 for node_id in node_ids]].CoreCaseGraphID
    ext_targets = v_sample.loc[[int(node_id)
                                for node_id in node_ids]].ExtendedCaseGraphID

    t2 = time.time()
    print(f"Deepwalk complete: {(t2-t0):.2f} s")

    # Visualize embeddings with TSNE
    embs_2d = get_TSNE(node_embeddings)

    # Draw the embedding points, coloring them by the target label (CaseID)
    alpha = 0.6
    label_map = {
        l: i
        for i, l in enumerate(np.unique(ext_targets), start=10) if pd.notna(l)
    }
    label_map[0] = 1
    node_colours = [
        label_map[target] if pd.notna(target) else 0 for target in ext_targets
    ]

    plt.figure(figsize=(15, 15))
    plt.axes().set(aspect="equal")
    plt.scatter(
        embs_2d[:, 0],
        embs_2d[:, 1],
        c=node_colours,
        cmap="jet",
        alpha=alpha,
    )
    plt.title("TSNE visualization of node embeddings w.r.t. Extended Case ID")
    plt.show()

    return node_ids, node_embeddings, core_targets, ext_targets
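
The helper get_TSNE is not defined in this excerpt; a minimal sketch consistent with how it is used here:

from sklearn.manifold import TSNE

def get_TSNE(embeddings, n_components=2):
    # Project the embedding matrix down to 2-D for plotting.
    return TSNE(n_components=n_components).fit_transform(embeddings)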
Example 8
# Imports inferred from usage; Keras objects are assumed to come from
# tensorflow.keras.
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stellargraph as sg
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from stellargraph.layer import DeepGraphInfomax, HinSAGE
from stellargraph.mapper import CorruptedGenerator, HinSAGENodeGenerator
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


def DGIPipeline(v_sets, e_sets, v_data, e_data, core_targets, ext_targets,
                core_testing):
    print("HINSAGE DGI FULL PIPELINE STARTED")
    tin = time.time()

    #? Sort based on testingFlag
    # data_splits[i].iloc[INDEX].values[0]
    # where INDEX:
    # [0] testingFlag=NaN
    # [1] testingFlag=0
    # [2] testingFlag=1
    data_splits = dict()
    for i in v_sets:
        v_sets[i] = v_sets[i].sort_values('testingFlag')
        data_splits[i] = v_sets[i].testingFlag.value_counts().to_frame()
        v_sets[i] = v_sets[i].drop('testingFlag', axis=1)

    #? Removing ExtendedCaseGraphID
    for i in v_sets:
        v_sets[i] = v_sets[i].drop('ExtendedCaseGraphID', axis=1)

    #? Create the graph object
    G = sg.StellarDiGraph(v_sets, e_sets)
    '''
    Iterate through the algorithm for every node type. HinSAGE can predict
    on one node type at a time, even though it uses the whole graph to
    compute the embeddings.
    '''
    # Parameters
    batch_size = 200
    dropout = 0.4
    verbose = 1
    visualize = False

    def run_for_node_type(v_type, hinsage_layer_sizes, num_samples,
                          activations, epochs):
        nan_tflag = data_splits[v_type].iloc[0].values[0]
        train_tflag = data_splits[v_type].iloc[1].values[0]
        test_tflag = data_splits[v_type].iloc[2].values[0]

        train_cv_set = v_sets[v_type][nan_tflag:nan_tflag + train_tflag]
        train_cv_ids = train_cv_set.index.values.tolist()
        train_cv_labels = v_data.loc[[
            int(node_id) for node_id in train_cv_ids
        ]].ExtendedCaseGraphID

        test_set = v_sets[v_type][-test_tflag:]
        test_ids = test_set.index.values.tolist()

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples,
                                         head_node_type=v_type)

        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs):
            print(f"Starting training for {v_type} type: ")
            t0 = time.time()
            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=v_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)

            x_in, x_out = infomax.in_out_tensors()

            # Train with DGI
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=10)
            history = model.fit(gen,
                                epochs=epochs,
                                verbose=verbose,
                                callbacks=[es])
            #sg.utils.plot_history(history)

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)

            t1 = time.time()
            print(f'Time required: {t1-t0:.2f} s ({(t1-t0)/60:.1f} min)')

            return x_emb_in, x_emb_out, model

        #? Train HinSAGE model:
        x_emb_in, x_emb_out, _model = run_deep_graph_infomax(hinsage,
                                                             generator,
                                                             epochs=epochs)

        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)

        train_cv_embs = emb_model.predict(
            generator.flow(train_cv_set.index.values))

        #? Optional: plot embeddings of the training and CV set of the
        #  current node type (note: returns early, skipping classification)
        if visualize:
            train_cv_embs_2d = pd.DataFrame(
                TSNE(n_components=2).fit_transform(train_cv_embs),
                index=train_cv_set.index.values)
            label_map = {
                l: i * 10
                for i, l in enumerate(np.unique(train_cv_labels), start=10)
                if pd.notna(l)
            }
            node_colours = [
                label_map[target] if pd.notna(target) else 0
                for target in train_cv_labels
            ]

            alpha = 0.7
            fig, ax = plt.subplots(figsize=(15, 15))
            ax.scatter(
                train_cv_embs_2d[0],
                train_cv_embs_2d[1],
                c=node_colours,
                cmap="jet",
                alpha=alpha,
            )
            ax.set(aspect="equal")
            plt.title(
                f"TSNE of HinSAGE {v_type} embeddings with DGI- coloring on ExtendedCaseGraphID"
            )
            plt.show()

            return 1

        #? Split training and cross valuation set using 80% 20% simple ordered split
        n_embs = train_cv_embs.shape[0]
        train_size = int(n_embs * 0.80)
        cv_size = int(n_embs * 0.20)

        train_set = train_cv_embs[:train_size]
        train_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[:train_size]).fillna(0))

        cv_set = train_cv_embs[-cv_size:]
        cv_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[-cv_size:]).fillna(0))

        #? CLASSIFY
        print(f"Running Classifier for {v_type} type")
        classifier = DecisionTreeClassifier()
        classifier.fit(
            X=train_set,
            y=train_labels,
        )
        cv_pred = classifier.predict(cv_set)
        f1_avg = f1_score(cv_labels, cv_pred, average='weighted')
        acc = (cv_pred == cv_labels).mean()
        print(f"{v_type} CV Metrics: f1: {f1_avg:.6f} - acc: {acc:.6f}")

        #? Now Run on test set
        test_embs = emb_model.predict(generator.flow(test_set.index.values))
        test_pred = classifier.predict(test_embs)

        #? Save predictions
        outdir = './output'
        outname = f"{v_type}_predictions.csv"
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        fullname = os.path.join(outdir, outname)

        output = pd.DataFrame(test_ids)
        output = output.rename(columns={0: 'node_id'})
        output['ExtendedCaseGraphID'] = test_pred
        output = output.set_index('node_id')

        output.to_csv(fullname)

        return output

    #? Run for each node type
    full_predictions = pd.DataFrame()
    for v_type in v_sets:
        if v_type == 'Account':
            epochs = 12
            num_samples = [8, 4]
            hinsage_layer_sizes = [32, 32]
            activations = ['relu', 'relu']
        else:
            epochs = 30
            num_samples = [12]
            hinsage_layer_sizes = [72]
            activations = ['relu']

        if v_type != 'External Entity' and v_type != 'Address':
            predictions = run_for_node_type(v_type, hinsage_layer_sizes,
                                            num_samples, activations, epochs)
            full_predictions = full_predictions.append(predictions)

    full_predictions.to_csv("./output/full_predictions.csv")

    tout = time.time()
    print(f"HINSAGE DGI FULL PIPELINE COMPLETED: {(tin-tout)/60:.0f} min")
    return 1