Example #1
    def preprocess_train(self, node_ids):
        """
        preprocess training set
        """
        if not self.ids_exist(node_ids): raise ValueError('node_ids must exist in self.df')

        # subset df for training nodes
        df_tr = self.df[self.df.index.isin(node_ids)]

        # one-hot-encode target
        self.y_encoding = sklearn.feature_extraction.DictVectorizer(sparse=False)
        train_targets = self.y_encoding.fit_transform(df_tr[["target"]].to_dict('records'))



        # import stellargraph
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)



        # return generator
        G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
        self.G_sg = G_sg
        generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
        train_gen = generator.flow(df_tr.index, train_targets, shuffle=True)
        from .node_generator import NodeSequenceWrapper
        return NodeSequenceWrapper(train_gen)
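As a point of reference, here is a minimal, self-contained sketch of the pattern used above: one-hot encoding a target column with DictVectorizer and feeding node IDs plus targets to a GraphSAGENodeGenerator. The graph, feature names, batch size and sample sizes are all invented for illustration and assume a StellarGraph 1.x install:

import pandas as pd
import stellargraph as sg
from sklearn.feature_extraction import DictVectorizer
from stellargraph.mapper import GraphSAGENodeGenerator

# Toy graph: 4 nodes with 2 numeric features each, 3 undirected edges.
nodes = pd.DataFrame(
    {"f0": [1.0, 0.0, 1.0, 0.0], "f1": [0.0, 1.0, 0.0, 1.0]},
    index=["a", "b", "c", "d"],
)
edges = pd.DataFrame({"source": ["a", "a", "b"], "target": ["b", "c", "d"]})
targets = pd.DataFrame({"target": ["x", "y", "x", "y"]}, index=nodes.index)

G = sg.StellarGraph(nodes=nodes, edges=edges)

# One-hot encode the string labels, as preprocess_train does above.
y_encoding = DictVectorizer(sparse=False)
train_targets = y_encoding.fit_transform(targets[["target"]].to_dict("records"))

# Batch size and per-hop neighbour sample sizes are illustrative values.
generator = GraphSAGENodeGenerator(G, batch_size=2, num_samples=[2, 2])
train_gen = generator.flow(targets.index, train_targets, shuffle=True)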
Example #2
    def build_sgc_features(self, g, feature_dict):
        num_features = len(list(feature_dict.values())[0])
        feature_names = ["w_{}".format(ii) for ii in range(num_features)]
        column_names = feature_names + ["label"]

        features = {}
        for f in column_names:
            features[f] = []

        for s in g.nodes():
            for i in range(num_features):
                features["w_{}".format(i)].append(feature_dict[s][i])
            features['label'].append(g.nodes()[s]['label'])

        F = pd.DataFrame(data=features, index=g.nodes())
        self.df_features = F[feature_names]
        self.df_targets = F[['label']].astype(str)

        self.df_features = self.reduce_dimensions(self.df_features)

        self.SG = sg.StellarGraph(g,
                                  node_features=self.df_features,
                                  node_type_name='tag')
        self.generator = self.build_generator()

        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        self.target_encoding = target_encoding.fit(
            self.df_targets.to_dict("records"))
Example #3
    def preprocess_valid(self, node_ids):
        """
        preprocess validation nodes (transductive inference)
        node_ids (list):  list of node IDs that generator will yield
        """
        if not self.ids_exist(node_ids): raise ValueError('node_ids must exist in self.df')
        if self.y_encoding is None:
            raise Exception('Unset parameters. Are you sure you called preprocess_train first?')

        # subset df for validation nodes
        df_val = self.df[self.df.index.isin(node_ids)]


        # one-hot-encode target
        val_targets = self.y_encoding.transform(df_val[["target"]].to_dict('records'))


        # import stellargraph
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)


        # return generator
        if self.G_sg is None:
            self.G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
        generator = GraphSAGENodeGenerator(self.G_sg, U.DEFAULT_BS, [self.sampsize,self.sampsize])
        val_gen = generator.flow(df_val.index, val_targets, shuffle=False)
        from .node_generator import NodeSequenceWrapper
        return NodeSequenceWrapper(val_gen)
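The important detail in preprocess_valid is that the validation labels go through the encoder already fitted in preprocess_train (transform, not fit_transform), so the one-hot columns line up with the training targets. A tiny sklearn-only illustration with invented labels:

from sklearn.feature_extraction import DictVectorizer

y_encoding = DictVectorizer(sparse=False)

# Fit on the training labels only ...
train_y = y_encoding.fit_transform([{"target": "cat"}, {"target": "dog"}])

# ... then reuse the fitted encoder for validation labels, so "cat" and "dog"
# map to the same columns as they did during training.
val_y = y_encoding.transform([{"target": "dog"}, {"target": "cat"}])

print(y_encoding.feature_names_)  # ['target=cat', 'target=dog']
print(val_y)                      # [[0. 1.], [1. 0.]]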
Example #4
def get_graph():
    Gnx = create_graph_from_edgelist(read_edgelist())

    node_data, node_features = read_node_features()

    G = sg.StellarGraph(Gnx, node_features=node_features)
    return node_data, G
Example #5
    def preprocess_train(self, G, edge_ids, edge_labels, mode='train'):
        """
        ```
        preprocess training set
        Args:
          G (networkx graph): networkx graph
          edge_ids(list): list of tuples representing edge ids
          edge_labels(list): edge labels (1 or 0 to indicate whether it is a true edge in the original graph or not)
        ```
        """
        # import stellargraph
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGELinkGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse('0.8'):
            raise Exception(SG_ERRMSG)

        #edge_labels = to_categorical(edge_labels)
        G_sg = sg.StellarGraph(G, node_features="feature")
        #print(G_sg.info())
        shuffle = True if mode == 'train' else False
        link_seq = GraphSAGELinkGenerator(
            G_sg, U.DEFAULT_BS, self.sample_sizes).flow(edge_ids,
                                                        edge_labels,
                                                        shuffle=shuffle)
        from .sg_wrappers import LinkSequenceWrapper
        return LinkSequenceWrapper(link_seq)
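For comparison, a minimal sketch of the link-generator pattern used here, on a toy graph. Edge IDs are (source, target) tuples and labels are 1 for true edges and 0 for negative samples; all names and sizes below are invented:

import pandas as pd
import stellargraph as sg
from stellargraph.mapper import GraphSAGELinkGenerator

nodes = pd.DataFrame({"feature": [0.1, 0.2, 0.3, 0.4]}, index=["a", "b", "c", "d"])
edges = pd.DataFrame({"source": ["a", "b", "c"], "target": ["b", "c", "d"]})
G_sg = sg.StellarGraph(nodes=nodes, edges=edges)

edge_ids = [("a", "b"), ("a", "d")]  # candidate links
edge_labels = [1, 0]                 # 1 = real edge, 0 = negative sample

# Illustrative batch size and two-hop neighbour sample sizes.
link_gen = GraphSAGELinkGenerator(G_sg, batch_size=2, num_samples=[2, 2])
link_seq = link_gen.flow(edge_ids, edge_labels, shuffle=True)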
Example #6
def test(edgelist, node_data, model_file, batch_size, target_name="subject"):
    """
    Load the serialized model and evaluate on all nodes in the graph.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
        target_name: Name of the target column in node_data
    """
    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # We must also save the target encoding to convert model predictions
    encoder_file = model_file.replace(
        "cora_example_model", "cora_example_encoding"
    ).replace(".h5", ".pkl")
    with open(encoder_file, "rb") as f:
        target_encoding = pickle.load(f)[0]

    # Encode targets with pre-trained encoder
    node_targets = target_encoding.transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_features=node_features)

    # Load Keras model
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )
    print("Loaded model:")
    model.summary()

    # Get required samples from model
    # TODO: Can we move this to the library?
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(len(model.input_shape) - 1)
    ]

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(
        G, batch_size, num_samples, seed=42
    )
    all_gen = generator.flow(node_ids, node_targets)

    # Evaluate and print metrics
    all_metrics = model.evaluate_generator(all_gen)

    print("\nAll-node Evaluation:")
    for name, val in zip(model.metrics_names, all_metrics):
        print("\t{}: {:0.4f}".format(name, val))
Example #7
def form_graph(edges_path, meta_path, ids_path, meta_received):
    edges = pd.read_csv(edges_path, sep=",", index_col=0)
    ID = 111180

    idss = pd.read_csv(ids_path, index_col=0, names=["paper_id"]).iloc[1:].append(
        pd.DataFrame([111180], columns=["paper_id"])).reset_index(drop=True)
    meta = pd.read_csv(meta_path, index_col=0)
    new_meta = pd.DataFrame([meta_received], columns=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
    meta_final = meta.append(new_meta).reset_index(drop=True)
    meta_finall = meta_final.join(idss).set_index("paper_id")
    ids = pd.read_csv("data/cutted_edges_to.csv", index_col=0).iloc[1:].append(
        pd.DataFrame([ID], columns=["0"])).reset_index(drop=True)

    column_from = []
    for i in range(len(ids)):
        column_from.append([ID])
    column_from = pd.DataFrame(column_from, columns=["from"])
    edges_final = column_from.join(ids)
    edges_final.rename(columns={'from': '0', '0': '1'}, inplace=True)
    edges_final = edges.append(edges_final).reset_index(drop=True)
    edge_data = pd.DataFrame(
        {
            "source": list(edges_final["0"].astype(int)),
            "target": list(edges_final["1"].astype(int))
        })

    G = sg.StellarGraph(
        {"paper": meta_finall}, {"paper-cites": edge_data}
    )

    print(G.info())
    return G
Example #8
    def initialize(self, **hyper_params):

        # Read each hyperparameter from hyper_params, falling back to a default when the key is missing
        batch_size = hyper_params.get("batch_size", 16)
        num_samples = hyper_params.get("num_samples", [25, 10])
        layer_sizes = hyper_params.get("layer_sizes", [256, 256])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.0)
        lr = hyper_params.get("lr", 1e-3)
        num_walks = hyper_params.get("num_walks", 1)
        length = hyper_params.get("length", 5)

        self.graph = sg.StellarGraph(nodes=self.nodes_df,edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        del self.nodes_df
        del self.edges_df

        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
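The x_inp[0::2] / x_out[0] slicing above works because a GraphSAGE link model's inputs interleave the source-node and target-node tensors, one pair per sampling hop; taking every second tensor isolates the source branch that becomes the embedding model. A plain-Python sketch of that indexing (the names are placeholders, not real tensors):

# A link model's inputs alternate source/target, one pair per hop depth.
x_inp = ["src_hop0", "dst_hop0", "src_hop1", "dst_hop1", "src_hop2", "dst_hop2"]

x_inp_src = x_inp[0::2]  # every second entry -> the source-node branch only
print(x_inp_src)         # ['src_hop0', 'src_hop1', 'src_hop2']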
Example #9
def pgframe_to_stellargraph(pgframe,
                            directed=True,
                            include_type=False,
                            feature_vector_prop=None,
                            feature_props=None,
                            edge_weight=None):
    """Convert a PGFrame to a StellarGraph object."""
    if feature_props is None:
        feature_props = []

    feature_array = None
    if include_type:
        nodes = {}
        for t in pgframe.node_types():
            index = pgframe.nodes(typed_by=t)
            if feature_vector_prop is not None:
                feature_array = np.array(
                    pgframe.get_node_property_values(feature_vector_prop,
                                                     typed_by=t).to_list())
            elif len(feature_props) > 0:
                feature_array = pgframe.nodes(
                    raw_frame=True, typed_by=t)[feature_props].to_numpy()
            nodes[t] = sg.IndexedArray(feature_array, index=index)
    else:
        if feature_vector_prop is not None:
            feature_array = np.array(
                pgframe.get_node_property_values(
                    feature_vector_prop).to_list())
        elif len(feature_props) > 0:
            feature_array = pgframe.nodes(
                raw_frame=True)[feature_props].to_numpy()
        nodes = sg.IndexedArray(feature_array, index=pgframe.nodes())

    if pgframe.number_of_edges() > 0:
        edges = pgframe.edges(raw_frame=True,
                              include_index=True,
                              filter_props=lambda x:
                              ((x == "@type")
                               if include_type else False) or x == edge_weight,
                              rename_cols={
                                  '@source_id': 'source',
                                  "@target_id": "target"
                              })
    else:
        edges = pd.DataFrame(columns=["source", "target"])

    if directed:
        graph = sg.StellarDiGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    else:
        graph = sg.StellarGraph(
            nodes=nodes,
            edges=edges,
            edge_weight_column=edge_weight,
            edge_type_column="@type" if include_type else None)
    return graph
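For reference, a small stand-alone sketch of the sg.IndexedArray construction this converter relies on: each node type gets a NumPy feature matrix plus an explicit node-ID index. All IDs and values below are invented:

import numpy as np
import pandas as pd
import stellargraph as sg

# Two node types, each backed by a feature matrix and an explicit node index.
papers = sg.IndexedArray(np.array([[0.1, 0.2], [0.3, 0.4]]), index=["p1", "p2"])
authors = sg.IndexedArray(np.array([[1.0], [2.0]]), index=["a1", "a2"])

edges = pd.DataFrame({"source": ["a1", "a2"], "target": ["p1", "p2"]})

graph = sg.StellarGraph(nodes={"paper": papers, "author": authors}, edges=edges)
print(graph.info())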
Example #10
    def initialize(self, **hyper_params):

        # Read each hyperparameter from hyper_params, falling back to a default when the key is missing
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [10, 10])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.1)
        lr = hyper_params.get("lr", 1e-2)

        graph = sg.StellarGraph(nodes=self.nodes,edges=self.edges)

        # Test split
        edge_splitter_test = EdgeSplitter(graph)
        self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True, seed = 42
        )

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph_test)
        self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=0.1, method="global", keep_connected=True, seed = 42
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed = 42)
        self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

        # Test iterators
        test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed = 42)
        self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.BinaryAccuracy(),keras.metrics.Recall(),keras.metrics.AUC(),keras.metrics.Precision()],
        )

        # return number of training and testing examples
        return edge_ids_train.shape[0],edge_ids_test.shape[0]
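The two-stage EdgeSplitter pattern above (split once for a test set, then split the reduced graph again for training) is easy to see on a toy graph; the sketch below uses a small NetworkX graph and illustrative fractions:

import networkx as nx
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

# A small connected graph with enough edges to sample from.
graph = StellarGraph.from_networkx(nx.barbell_graph(10, 2))

# Hold out 10% of edges (plus an equal number of negative samples) for testing ...
edge_splitter_test = EdgeSplitter(graph)
graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=42
)

# ... then split the reduced graph again, so training edges never overlap the test edges.
edge_splitter_train = EdgeSplitter(graph_test)
graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=42
)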
Example #11
def main(graph_loc, layer_sizes, activations, dropout, learning_rate):
    edgelist = pd.read_table(
        os.path.join(graph_loc, 'cora.cites'), header=None, names=['source', 'target']
    )

    # Load node features
    # The CORA dataset contains binary attributes 'w_x' that correspond to whether the corresponding keyword
    # (out of 1433 keywords) is found in the corresponding publication.
    feature_names = ['w_{}'.format(ii) for ii in range(1433)]
    # Also, there is a "subject" column
    column_names = feature_names + ['subject']
    node_data = pd.read_table(
        os.path.join(graph_loc, 'cora.content'), header=None, names=column_names
    )

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[['subject']].to_dict("records")
    )

    node_ids = node_data.index
    node_features = node_data[feature_names]

    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids, node_targets, train_size=140, test_size=None, stratify=node_targets, random_state=55232
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=300, test_size=None, random_state=523214
    )

    generator = FullBatchNodeGenerator(G, func_opt=GCN_Aadj_feats_op, filter='localpool')

    model = train(train_nodes, train_targets, val_nodes, val_targets, generator, dropout,
        layer_sizes, learning_rate, activations)

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        "gcn", ''.join([str(x) for x in layer_sizes]), str(dropout), str(learning_rate)
    )

    model.save("cora_gcn_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gcn_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)

    test(test_nodes, test_targets, generator, "cora_gcn_model" + save_str + ".h5")
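The func_opt=GCN_Aadj_feats_op / filter='localpool' arguments above come from an early StellarGraph release; in the 1.x API the same GCN preprocessing is requested with method="gcn". A rough equivalent is sketched below, using the built-in Cora loader purely as a stand-in graph (it downloads the dataset on first use), whereas main() above builds G from cora.cites/cora.content:

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN

# Stand-in graph for illustration only.
G, node_subjects = sg.datasets.Cora().load()

generator = FullBatchNodeGenerator(G, method="gcn")  # applies the GCN adjacency normalisation
gcn = GCN(layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5)
x_inp, x_out = gcn.in_out_tensors()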
Example #12
    def preprocess_test(self, df_te, G_te):
        """
        ```
        preprocess for inductive inference
        df_te (DataFrame): pandas dataframe containing new node attributes
        G_te (Graph):  a networkx Graph containing new nodes
        ```
        """
        try:
            import networkx as nx
        except ImportError:
            raise ImportError("Please install networkx:  pip install networkx")

        if self.y_encoding is None:
            raise Exception(
                "Unset parameters. Are you sure you called preprocess_train first?"
            )

        # get aggregrated df
        # df_agg = pd.concat([df_te, self.df]).drop_duplicates(keep='last')
        df_agg = pd.concat([df_te, self.df])
        # df_te = pd.concat([self.df, df_agg]).drop_duplicates(keep=False)

        # get aggregrated graph
        is_subset = set(self.G.nodes()) <= set(G_te.nodes())
        if not is_subset:
            raise ValueError("Nodes in self.G must be subset of G_te")
        G_agg = nx.compose(self.G, G_te)

        # one-hot-encode target
        if "target" in df_te.columns:
            test_targets = self.y_encoding.transform(
                df_te[["target"]].to_dict("records"))
        else:
            test_targets = [-1] * df_te.shape[0]

        # import stellargraph
        try:
            import stellargraph as sg
            from stellargraph.mapper import GraphSAGENodeGenerator
        except ImportError:
            raise Exception(SG_ERRMSG)
        if version.parse(sg.__version__) < version.parse("0.8"):
            raise Exception(SG_ERRMSG)

        # return generator
        G_sg = sg.StellarGraph(G_agg, node_features=df_agg[self.feature_names])
        generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS,
                                           [self.sampsize, self.sampsize])
        test_gen = generator.flow(df_te.index, test_targets, shuffle=False)
        from .sg_wrappers import NodeSequenceWrapper

        return NodeSequenceWrapper(test_gen)
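The inductive step above hinges on nx.compose, which unions the training graph with the graph containing the new nodes so that their neighbourhoods become available for sampling. A tiny networkx-only illustration with invented node names:

import networkx as nx

G_train = nx.Graph([("a", "b"), ("b", "c")])  # graph seen at training time
G_new = nx.Graph([("b", "c"), ("c", "d")])    # graph that also contains the new node "d"

# compose() unions nodes and edges: old nodes keep their neighbourhoods and the
# new node "d" becomes reachable for GraphSAGE neighbour sampling.
G_agg = nx.compose(G_train, G_new)
print(sorted(G_agg.nodes()))    # ['a', 'b', 'c', 'd']
print(G_agg.number_of_edges())  # 3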
Example #13
    def initialize(self, **hyper_params):

        # Read each hyperparameter from hyper_params, falling back to a default when the key is missing
        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [20, 20])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.3)
        lr = hyper_params.get("lr", 1e-3)
        train_split = hyper_params.get("train_split", 0.2)

        self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph)
        graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=train_split, method="global", keep_connected=True)

        # Train iterators
        train_gen = GraphSAGELinkGenerator(graph_train, batch_size,
                                           num_samples)
        self.train_flow = train_gen.flow(edge_ids_train,
                                         edge_labels_train,
                                         shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(layer_sizes=layer_sizes,
                              generator=train_gen,
                              bias=bias,
                              dropout=dropout)

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=["acc"],
        )

        return self.model.get_weights()
Example #14
def CreateGraph(filename, seperator, *args):
    edges = pd.read_csv(filename, sep=seperator)

    drugs = pd.DataFrame(index=pd.unique(edges[args[0]]))
    genes = pd.DataFrame(index=pd.unique(edges[args[1]]))

    graphObj = sg.StellarGraph(
        {"drug": drugs, "gene": genes},
        edges,
        source_column=args[0],
        target_column=args[1],
    )
    return graphObj
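A CSV-free toy version of the same construction, showing the two featureless node types and the explicit source/target column names (all identifiers invented):

import pandas as pd
import stellargraph as sg

interactions = pd.DataFrame({
    "drug_id": ["d1", "d1", "d2"],
    "gene_id": ["g1", "g2", "g2"],
})

drugs = pd.DataFrame(index=pd.unique(interactions["drug_id"]))
genes = pd.DataFrame(index=pd.unique(interactions["gene_id"]))

graphObj = sg.StellarGraph(
    {"drug": drugs, "gene": genes},
    interactions,
    source_column="drug_id",
    target_column="gene_id",
)
print(graphObj.info())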
Example #15
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Load the serialized model and evaluate on a random balanced subset of all links in the graph.
    Note that the set of links the model is evaluated on may contain links from the model's training set.
    To avoid this, set the seed of the edge splitter to the same seed as used for link splitting in train()

    Args:
        G: NetworkX graph file
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator})

    # Get required input shapes from model
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(1,
                        len(model.input_shape) - 1, 2)
    ]

    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # Convert G_test to StellarGraph object (undirected, as required by GraphSAGE):
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Generator feeds data from (source, target) sampled subgraphs to GraphSAGE model
    test_gen = GraphSAGELinkGenerator(
        G_test,
        batch_size,
        num_samples,
        name="test",
    ).flow(edge_ids_test, edge_labels_test)

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTest Set Evaluation:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
Example #16
def load_graph(node_file, edge_file):
    nodes_df = pd.read_csv(node_file, sep=",", header=None, encoding='utf-8')
    edges_df = pd.read_csv(edge_file, sep=",", header=None, encoding='utf-8')

    nodes_list = []
    node_type_list = []
    output_layer = []
    nodes_and_types_list = []
    edges_source_list = []
    edges_target_list = []

    # Map each node-type tag to an integer id; unknown types fall back to 0.
    node_type_ids = {
        "[go]": 1, "[co]": 2, "[ss]": 3, "[ta]": 4, "[ti]": 5,
        "[pa]": 6, "[di]": 7, "[dr]": 8, "[se]": 9,
    }

    for index, line in nodes_df.iterrows():
        node_id = int(str(line[0]).rstrip())
        node_type = encode_node_type(str(line[1]).rstrip())
        nodes_list.append(node_id)

        node_type_id = node_type_ids.get(node_type, 0)
        if node_type_id:
            # One-hot row: [node_id, 0, ..., 1, ..., 0] with the 1 at position node_type_id.
            one_hot = [0] * 9
            one_hot[node_type_id - 1] = 1
            output_layer.append([node_id] + one_hot)

        node_type_list.append(node_type_id)
        nodes_and_types_list.append([node_id, node_type_id])

    for index, line in edges_df.iterrows():
        source = int(str(line[0]).rstrip())
        target = int(str(line[1]).rstrip())
        edges_source_list.append(source)
        edges_target_list.append(target)

    nodes_df = pd.DataFrame({"type": node_type_list}, index=nodes_list)
    edges_df = pd.DataFrame({
        "source": edges_source_list,
        "target": edges_target_list
    })

    graph = sg.StellarGraph(nodes_df, edges_df)

    return graph, nodes_and_types_list, output_layer
Example #17
def load_data(start_month_id=220, end_month_id=264):

    # # List of per-month stock lists, used to find the stocks that appear in every month of the period without missing values
    # stock_code_list_list = []
    # for month_id in range(250, 260):
    #     stock_code_list_list.append(list(pd.read_csv('csv_demo_con/' +
    #                                                  str(month_id + 1) + '.csv',index_col = 0).dropna(how='any')['stock'].values))
    # # Keep only the stocks present in every month of this period
    # common_stock_code_list = stock_code_list_list[0]
    # for i in range(260-250):
    #     common_stock_code_list = [x for x in common_stock_code_list if x in stock_code_list_list[i]]

    if not os.path.exists('data'):
        os.makedirs('data')
    '''
    G is the StellarGraph object that stores the graph information.
    nodes_subjects holds the node labels, here norm_return.
    nodes_features holds the node features, here the factor data.
    Create the lists up front so each month's data can be stored.
    '''
    G_list = []
    nodes_subjects_list = []
    nodes_features_list = []

    # The range controls which months are computed, offset by one: month_id 219 actually processes 220.csv and 264 processes 265.csv.
    # The data files run from 220.csv to 264.csv, so range(219, 264) reads them all.
    for month_id in range(start_month_id - 1, end_month_id + 1):

        # If these files do not exist this is the first run, so compute the adjacency matrix (adj) and factor matrix (factor);
        # otherwise just load them from disk.
        if not (os.path.exists("data/factor-" + str(month_id + 1) + ".csv")
                and os.path.exists("data/adj-" + str(month_id + 1) + ".npz")):
            # Industry codes run 0-30; read them into ori_df to build the industry adjacency matrix
            ori_df = pd.read_excel('monthly_indus.xlsx',
                                   index_col=0,
                                   header=None)
            stock_code_list = list(ori_df.index)

            # Dictionary mapping each stock_code to its row number
            stock_code_id_dict = {}
            for i, j in zip([i for i in range(3945)], stock_code_list):
                stock_code_id_dict[j] = i

            sub_ori_df = ori_df.iloc[:, month_id]

            # These three arrays hold the COO data for the sparse adjacency matrix
            adj_row = []
            adj_col = []
            adj_data = []

            # Industry codes range from 0 to 30
            for indus_code in range(31):
                sub_ori_df_of_indus_code = sub_ori_df[sub_ori_df == indus_code]
                index_of_sub_ori_df_of_indus_code = sub_ori_df_of_indus_code.index
                size = len(sub_ori_df_of_indus_code)
                if size > 1:
                    for i in range(size):
                        for j in range(i, size):
                            # Using range(i + 1, size) instead would leave the adjacency matrix diagonal at 0
                            stock_1_code = index_of_sub_ori_df_of_indus_code[i]
                            stock_2_code = index_of_sub_ori_df_of_indus_code[j]
                            stock_1_id = stock_code_id_dict[stock_1_code]
                            stock_2_id = stock_code_id_dict[stock_2_code]
                            adj_row.append(stock_1_id)
                            adj_col.append(stock_2_id)
                            adj_data.append(1)

            # Building adj directly as a sparse matrix is much faster, but it must be converted back to a DataFrame to intersect with the factors by stock_code
            adj_sparse = sp.coo_matrix((adj_data, (adj_row, adj_col)),
                                       shape=(3945, 3945))
            adj_df = pd.DataFrame((adj_sparse).toarray(),
                                  index=stock_code_list,
                                  columns=stock_code_list)

            # The resulting adj_df and factor_df now contain exactly the same set of stocks
            adj_df, factor_df = create_mat(
                adj_df, 'csv_demo_con/' + str(month_id + 1) + '.csv')

            # Convert adj_df back to a sparse matrix; its COO arrays become the edges used to build G
            adj_sparse = sp.coo_matrix(adj_df.values)

            # Save the adjacency and factor matrices so they do not have to be recomputed next time
            sp.save_npz('data/adj-' + str(month_id + 1) + '.npz', adj_sparse)
            factor_df.to_csv('data/factor-' + str(month_id + 1) + '.csv')

        else:
            adj_sparse = sp.load_npz('data/adj-' + str(month_id + 1) + '.npz')
            factor_df = pd.read_csv('data/factor-' + str(month_id + 1) +
                                    '.csv',
                                    index_col=0)
        '''
            For example, with
                row = [1, 3, 4, 6, 8]
                col = [3, 5, 7, 8, 10]
            node 1 is connected to node 3, node 3 to node 5, and so on.
            Edge weights are given by the "weight" list.
            Note: each node corresponds to a different stock.
        '''

        row = adj_sparse.row
        col = adj_sparse.col
        edges = pd.DataFrame({
            "source": row,
            "target": col,
            "weight": [1 for i in range(len(row))]
        })
        '''
            nodes is a table whose row 0 holds the features of node 0, row 1 those of node 1, and so on.
        '''
        nodes = factor_df.reset_index().loc[:, 'return_1m':'return_12m']
        nodes_features_list.append(nodes)

        #     # Share the first month's adjacency matrix across all months
        #     if month_id == 250:
        #         common_edges = edges

        # Build the StellarGraph object G holding the graph information
        G = sg.StellarGraph(nodes, edges)
        G_list.append(G)

        # Build each node's label, here norm_return
        node_subjects = factor_df.reset_index()['norm_return']
        nodes_subjects_list.append(node_subjects)

        print(month_id + 1, "has finished")
    return G_list, nodes_subjects_list, nodes_features_list
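The adjacency-to-edge-list conversion inside load_data leans on the COO layout of the sparse matrix: its row and col arrays are exactly the source and target node indices. A tiny self-contained illustration with an invented 3x3 adjacency matrix:

import numpy as np
import pandas as pd
import scipy.sparse as sp

# 3x3 adjacency matrix for three stocks; 1 means "same industry".
adj = np.array([[1, 1, 0],
                [0, 1, 0],
                [0, 0, 1]])
adj_sparse = sp.coo_matrix(adj)

# The COO row/col arrays become the source/target columns of the edge list.
edges = pd.DataFrame({
    "source": adj_sparse.row,
    "target": adj_sparse.col,
    "weight": [1 for _ in range(adj_sparse.nnz)],
})
print(edges)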
Example #18
triads = create_triads(X, Y, Lx, Ly, W, nx, ny, order)  ###############

X = triads[0]
Y1 = triads[1]
Y2 = triads[2]
W = triads[3]
Lx = triads[4]
Ly1 = triads[5]
Ly2 = triads[6]

#input_graphs = get_graphs()

# Create the graph network.
# convert the raw data into StellarGraph's graph format for faster operations
graph = sg.StellarGraph(nodes, edges)

generator = sg.mapper.FullBatchNodeGenerator(graph, method="gcn")

# two layers of GCN, each with hidden dimension 16
gcn = sg.layer.GCN(layer_sizes=[16, 16], generator=generator)
x_inp, x_out = gcn.in_out_tensors(
)  # create the input and output TensorFlow tensors

# use TensorFlow Keras to add a layer to compute the (one-hot) predictions
predictions = tf.keras.layers.Dense(units=len(ground_truth_targets.columns),
                                    activation="softmax")(x_out)

# use the input and output tensors to create a TensorFlow Keras model
model = tf.keras.Model(inputs=x_inp, outputs=predictions)
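To finish this fragment, the model above would typically be compiled and fitted on flows built from the same full-batch generator. The sketch below reuses the names defined in the fragment (model, generator, ground_truth_targets) and assumes ground_truth_targets is a one-hot DataFrame indexed by node ID; the optimizer, split and epoch count are illustrative:

# Illustrative continuation of the fragment above, not part of the original script.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["acc"],
)

train_ids = ground_truth_targets.index[:140]  # illustrative train split
train_targets = ground_truth_targets.loc[train_ids].to_numpy()
train_flow = generator.flow(train_ids, train_targets)

model.fit(train_flow, epochs=20, verbose=2)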
Example #19
    def train(
        self,
        layer_size,
        num_samples,
        train_size=0.7,
        batch_size: int = 200,
        num_epochs: int = 20,
        learning_rate=5e-3,
        dropout=0.0,
        use_bias=True,
    ):
        """
        Build and train the HinSAGE model for link attribute prediction on the specified graph G
        with given parameters.

        Args:
            layer_size: a list of number of hidden nodes in each layer
            num_samples: number of neighbours to sample at each layer
            batch_size: size of mini batch
            num_epochs: number of epochs to train the model (epoch = all training batches are streamed through the model once)
            learning_rate: initial learning rate
            dropout: dropout probability in the range [0, 1)
            use_bias: tells whether to use a bias terms in HinSAGE model

        Returns:

        """

        # Training and test edges
        edges = list(self.g.edges(data=True))
        edges_train, edges_test = model_selection.train_test_split(
            edges, train_size=train_size)

        #  Edgelists:
        edgelist_train = [(e[0], e[1]) for e in edges_train]
        edgelist_test = [(e[0], e[1]) for e in edges_test]

        labels_train = [e[2]["score"] for e in edges_train]
        labels_test = [e[2]["score"] for e in edges_test]

        # Our machine learning task of learning user-movie ratings can be framed as a supervised Link Attribute Inference:
        # given a graph of user-movie ratings, we train a model for rating prediction using the ratings edges_train,
        # and evaluate it using the test ratings edges_test. The model also requires the user-movie graph structure.
        # To proceed, we need to create a StellarGraph object from the ingested graph, for training the model:
        # When sampling the GraphSAGE subgraphs, we want to treat user-movie links as undirected
        self.g = sg.StellarGraph(self.g, node_features="feature")

        # Next, we create the link generators for preparing and streaming training and testing data to the model.
        # The mappers essentially sample k-hop subgraphs of G with randomly selected head nodes, as required by
        # the HinSAGE algorithm, and generate minibatches of those samples to be fed to the input layer of the HinSAGE model.
        generator = HinSAGELinkGenerator(self.g,
                                         batch_size,
                                         num_samples,
                                         head_node_types=["user", "movie"])
        train_gen = generator.flow(edgelist_train, labels_train)
        test_gen = generator.flow(edgelist_test, labels_test)

        # Build the model by stacking a two-layer HinSAGE model and a link regression layer on top.
        assert len(layer_size) == len(
            num_samples
        ), "layer_size and num_samples must be of the same length! Stopping."

        hinsage = HinSAGE(layer_sizes=layer_size,
                          generator=generator,
                          bias=use_bias,
                          dropout=dropout)

        # Define input and output sockets of hinsage:
        x_inp, x_out = hinsage.build()

        # Final estimator layer
        score_prediction = link_regression(
            edge_embedding_method=args.edge_embedding_method)(x_out)

        # Create Keras model for training
        model = Model(inputs=x_inp, outputs=score_prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=learning_rate),
            loss=losses.mean_squared_error,
            metrics=[root_mean_square_error, metrics.mae],
        )

        # Train model
        print("Training the model for {} epochs with initial learning rate {}".
              format(num_epochs, learning_rate))
        history = model.fit_generator(
            train_gen,
            validation_data=test_gen,
            epochs=num_epochs,
            verbose=2,
            shuffle=True,
            use_multiprocessing=True,
            workers=multiprocessing.cpu_count() // 2,
        )

        # Evaluate and print metrics
        test_metrics = model.evaluate_generator(test_gen)

        print("Test Evaluation:")
        for name, val in zip(model.metrics_names, test_metrics):
            print("\t{}: {:0.4f}".format(name, val))
Example #20
def main():
    with open(r"training.txt", "r") as f:
        reader = csv.reader(f)
        training = list(reader)
    # in order of training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        reader = csv.reader(f)
        testing = list(reader)
    # in order of testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))
    '''
    uncomment lines for reduced corpus with stopword removal. In future integrate stemmer here, multi-language
    '''
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        with open(corpus_path, 'rb') as f:
            corpus = pickle.load(f)
        f.close()
        with open(ids_path, 'rb') as f:
            ids = pickle.load(f)
        f.close()
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY),
                             position=0,
                             leave=True):
            with open(NODE_INFO_DIRECTORY + filename,
                      'r',
                      encoding='UTF-8',
                      errors='ignore') as f:
                doc_string = []
                for line in f:
                    [
                        doc_string.append(token.strip())
                        for token in line.lower().strip().split(" ")
                        if token != ""
                    ]
                corpus.append(' '.join(doc_string))
                ids.append(filename[:-4])
        with open(corpus_path, '+wb') as f:
            pickle.dump(corpus, f)
        f.close()
        with open(ids_path, '+wb') as f:
            pickle.dump(ids, f)
        f.close()

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if os.path.exists(stemmed_corpus_path):
        with open(stemmed_corpus_path, 'rb') as f:
            stemmed_corpus = pickle.load(f)
        f.close()
    else:
        # Without the stemmed corpus the node-info frame below cannot be built
        raise FileNotFoundError('Stemmed corpus unavailable: ' + stemmed_corpus_path)

    # in order of alphabetical text information i.e. 0, 1, 10, 100
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    train_graph_split_path = 'pickles/train_graph_split.PICKLE'

    if os.path.exists(train_graph_split_path):
        with open(train_graph_split_path, 'rb') as f:
            keep_indices = pickle.load(f)
        f.close()
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        with open(train_graph_split_path, '+wb') as f:
            pickle.dump(keep_indices, f)
        f.close()

    data_train_val = training.iloc[keep_indices]

    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if os.path.exists(lda_path):
        with open(lda_path, 'rb') as f:
            lda = pickle.load(f)
        f.close()
    else:
        # The LDA feature matrix is required below to build the node features
        raise FileNotFoundError('LDA matrix unavailable: ' + lda_path)

    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)

    all_node_features = node_data[feature_names]

    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)

    print(G_all.info())

    G_all.get_feature_for_nodes(['0'])

    ## Get DBLP Subgraph
    ### with papers published before a threshold year

    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)

    subgraph_node_ids = sorted(list(G_sub_nx.nodes))

    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)

    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)

    print(G_sub.info())

    ## Train attri2vec on the DBLP Subgraph

    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))

    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)

    batch_size = 50
    epochs = int(input('Enter number of epochs: '))

    generator = Attri2VecLinkGenerator(G_sub, batch_size)

    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec, for node pair inputs:
    x_inp, x_out = attri2vec.build()

    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    history = model.fit_generator(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=bool(int(input('Multiprocessing? 1/0: '))),
        workers=int(input('Number of workers: ')),
        shuffle=True,
    )
    print(history)
    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))
    return model
Example #21
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
Example #22
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=1000,
        random_state=523214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1],
                         activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    print(model.summary())

    # Train model
    # Callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc",
                                patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        [X, A], y_train, node_mask_train = train_gen.__getitem__(0)
        N = A.shape[0]
        # A = sparse.csr_matrix(A + np.eye(A.shape[0]))  # Add self-loops

        # Get the validation data
        [_, _], y_val, node_mask_val = val_gen.__getitem__(0)

        history = model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            shuffle=
            False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(x=[X, A],
                                     y=y_val,
                                     sample_weight=node_mask_val,
                                     batch_size=N)
    else:
        val_metrics = model.evaluate_generator(val_gen)

    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if args.interface == "fit":
        [_, _], y_test, node_mask_test = generator.flow(
            test_nodes, test_targets).__getitem__(0)
        test_metrics = model.evaluate(x=[X, A],
                                      y=y_test,
                                      sample_weight=node_mask_test,
                                      batch_size=N)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets))

    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    # Note that the `predict` or `predict_generator` function now operates differently to the `GraphSAGE` or `HinSAGE` models
    # in that if you give it less than the complete set of nodes, it will still return all predictions and in a fixed order
    # defined by the order of nodes in X and A (which is defined by the order of G.nodes()).
    if args.interface == "fit":
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions),
        index=list(G.nodes()))
    accuracy = np.mean([
        "subject=" + gt_subject == p
        for gt_subject, p in zip(node_data["subject"][list(G.nodes())],
                                 node_predictions.idxmax(axis=1))
    ])
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout,
        learning_rate)
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Example #23
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the graph built from the given edgelist, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of the number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for training and inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 -> 1)
        target_name: Name of the target (label) column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Attach the final prediction (softmax) layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
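A minimal way to drive the train() function above (an illustrative sketch, not part of the original snippet). The data directory and the CORA loading steps are assumptions modelled on the other examples on this page, and feature_names is the module-level list that train() relies on.

import os
import pandas as pd

data_dir = "data/cora"  # assumed location of the CORA files

# Edges in order 'cited-paper' <- 'citing-paper', with a constant edge label
edgelist = pd.read_csv(os.path.join(data_dir, "cora.cites"), sep="\t",
                       header=None, names=["target", "source"])
edgelist["label"] = "cites"

# Binary keyword features 'w_x' plus the "subject" target column
feature_names = ["w_{}".format(ii) for ii in range(1433)]
node_data = pd.read_csv(os.path.join(data_dir, "cora.content"), sep="\t",
                        header=None, names=feature_names + ["subject"])

train(edgelist, node_data, layer_size=[32, 32], num_samples=[10, 5],
      batch_size=50, num_epochs=10)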
Ejemplo n.º 24
0
def train_model(Gnx, train_data, test_data, all_features):
    output_results = {}
    from collections import Counter
    #TODO: save size of dataset, train_data, and test data
    #save the count of each subject in the blocks
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    #node_features = train_data[feature_names]
    #print (feature_names)
    G = sg.StellarGraph(Gnx, node_features=all_features)
    #TODO: save graph info
    print(G.info())
    print("writing graph.dot")
    #write_dot(Gnx,"graph.dot")
    output_results['graph_info'] = G.info()
    print("building the graph generator...")

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    #generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data["subject"].to_list()),
        y=train_data["subject"].to_list())
    # Keras expects class_weight as a dict mapping class index -> weight
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(
        test_data[["subject"]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)
    graphsage_model = GraphSAGE(
        #graphsage_model = HinSAGE(
        #layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,  #train_gen,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    #x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )
    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  #, "program":test_data['program']})
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))
    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    #pred_program = np.unique(df['program'].values)
    # save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
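The x_inp and x_out tensors returned by train_model() can also be reused to extract GraphSAGE node embeddings. The sketch below is an addition, not part of the original snippet; it assumes Gnx, train_data, test_data and all_features have been prepared as the function expects, and that Model is the same Keras class imported by the snippet above.

generator, model, x_inp, x_out, history, target_encoding, output_results = \
    train_model(Gnx, train_data, test_data, all_features)

# Build a second model that outputs the final GraphSAGE layer instead of the
# softmax predictions, then run it over the test nodes.
embedding_model = Model(inputs=x_inp, outputs=x_out)
node_embeddings = embedding_model.predict_generator(generator.flow(test_data.index))
print(node_embeddings.shape)  # one embedding vector per test node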
Ejemplo n.º 25
0
def main(graph_loc, layer_sizes, activations, dropout, learning_rate,
         num_epochs):
    # Load edges in order 'cited-paper' <- 'citing-paper'
    edgelist = pd.read_csv(
        os.path.join(graph_loc, "cora.cites"),
        sep="\t",
        header=None,
        names=["target", "source"],
    )

    # Load node features
    # The CORA dataset contains binary attributes 'w_x' that correspond to whether the corresponding keyword
    # (out of 1433 keywords) is found in the corresponding publication.
    feature_names = ["w_{}".format(ii) for ii in range(1433)]
    # Also, there is a "subject" column
    column_names = feature_names + ["subject"]
    node_data = pd.read_csv(
        os.path.join(graph_loc, "cora.content"),
        sep="\t",
        header=None,
        names=column_names,
    )

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[["subject"]].to_dict("records"))

    node_ids = node_data.index
    node_features = node_data[feature_names]

    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_features=node_features)

    # Split nodes into train/test using stratification.
    (
        train_nodes,
        test_nodes,
        train_targets,
        test_targets,
    ) = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=300,
        test_size=None,
        random_state=523214)

    # We specify the method='gcn' to give the pre-processing required by the GCN algorithm.
    generator = FullBatchNodeGenerator(G, method="gcn")

    model = train(
        train_nodes,
        train_targets,
        val_nodes,
        val_targets,
        generator,
        dropout,
        layer_sizes,
        learning_rate,
        activations,
        num_epochs,
    )

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        "gcn", "".join([str(x) for x in layer_sizes]), str(dropout),
        str(learning_rate))

    model.save("cora_gcn_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gcn_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)

    test(test_nodes, test_targets, generator,
         "cora_gcn_model" + save_str + ".h5")
Ejemplo n.º 26
0
else:
    assert testtype == 'edges'
    gtrain_edgelist, edges_test_true, edges_test_false = Main.Dataset.make_edges_test_set(
        to_julia_edgelist(Gnx), testprop)
    edges_test_true = edges_test_true - 1
    edges_test_false = edges_test_false - 1
    Gtrain_nx = from_julia_edgelist(gtrain_edgelist)
    # Recover nodes that are now isolated in Gtrain_nx, not seen through the edgelist
    for n in Gnx.nodes():
        if n not in Gtrain_nx.nodes():
            Gtrain_nx.add_node(n)
    nx.set_node_attributes(Gtrain_nx, "paper", "label")

## Train the embedding
G = sg.StellarGraph(Gnx, node_features=node_features)
Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features)

# The graph G together with the unsupervised sampler will be used to generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size,
                                   num_samples).flow(unsupervised_samples)

# Build the model
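In the usual unsupervised GraphSAGE recipe, the model built from this link generator is a GraphSAGE encoder with a link_classification head scoring the sampled node pairs. The sketch below is an assumption along those lines, not the original code; layer_sizes and the number of epochs are illustrative values, and the tensorflow.keras import is a guess since the snippet's own imports are not shown.

from tensorflow import keras
from stellargraph.layer import GraphSAGE, link_classification

layer_sizes = [50, 50]  # assumed hyperparameter, defined alongside batch_size / num_samples

graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_gen,
                      bias=True, dropout=0.0, normalize="l2")
x_inp, x_out = graphsage.build()

# Score each sampled (target, context) node pair with an inner-product link classifier
prediction = link_classification(output_dim=1, output_act="sigmoid",
                                 edge_embedding_method="ip")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
model.fit_generator(train_gen, epochs=4, verbose=2)  # epochs: assumed value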
Ejemplo n.º 27
0
    def get_stellargraph(self):
        return sg.StellarGraph(self.g_nx, node_type_name="ntype", node_features=self.node_features)
Ejemplo n.º 28
0
# path_weights = sys.argv[1]
# path_node_partition = sys.argv[2]
# path_edge_partition = sys.argv[3]

path_weights = "./weights/weights.npy"
path_node_partition = "./data/4_attributes_0"
path_edge_partition = "./data/4_0"

# Constructing the graph
nodes = pd.read_csv(path_node_partition, sep='\t', lineterminator='\n', header=None).loc[:, 0:1433]
nodes.set_index(0, inplace=True)

# A regex separator forces pandas' python engine, which does not support a custom
# lineterminator, so it is omitted here ('\n' is the default anyway).
edges = pd.read_csv(path_edge_partition, sep=r'\s+', header=None)
edges.columns = ["source", "target"]

G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True
)

# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
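A possible continuation of this snippet, sketched from the usual StellarGraph link-prediction recipe; it is not part of the original, and the tensorflow.keras import style is an assumption.

from tensorflow import keras
from stellargraph.layer import GraphSAGE, link_classification

# Feed the edge ids and labels produced by the splitter through the generator
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_gen,
                      bias=True, dropout=0.3)
x_inp, x_out = graphsage.in_out_tensors()  # graphsage.build() on older StellarGraph releases

prediction = link_classification(output_dim=1, output_act="sigmoid",
                                 edge_embedding_method="ip")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=keras.losses.binary_crossentropy,
              metrics=["acc"])
model.fit(train_flow, epochs=epochs, verbose=2)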
Ejemplo n.º 29
0
    def _train_model(self, gnx, train_data, test_data, all_features,
                     target_feature_name):
        subject_groups_train = Counter(train_data[target_feature_name])
        subject_groups_test = Counter(test_data[target_feature_name])

        graph = sg.StellarGraph(gnx, node_features=all_features)

        output_results = {
            'train_size': len(train_data),
            'test_size': len(test_data),
            'subject_groups_train': subject_groups_train,
            'subject_groups_test': subject_groups_test,
            'graph_info': graph.info()
        }

        num_samples = [10, 5]
        generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        train_targets = target_encoding.fit_transform(
            train_data[[target_feature_name]].to_dict('records'))
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(train_data[target_feature_name].to_list()),
            y=train_data[target_feature_name].to_list())
        class_weights = dict(enumerate(class_weights))
        test_targets = target_encoding.transform(
            test_data[[target_feature_name]].to_dict('records'))
        train_gen = generator.flow(train_data.index,
                                   train_targets,
                                   shuffle=True)
        graph_sage_model = GraphSAGE(
            layer_sizes=[80, 80],
            generator=generator,  # train_gen,
            bias=True,
            dropout=0.5,
        )
        print('building model...')

        x_inp, x_out = graph_sage_model.build()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        print('compiling model...')
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.005),
            loss=losses.categorical_crossentropy,
            metrics=['acc', metrics.categorical_accuracy],
        )
        print('testing the model...')
        test_gen = generator.flow(test_data.index, test_targets)
        history = model.fit(
            train_gen,
            epochs=self.num_epochs,
            validation_data=test_gen,
            verbose=2,
            shuffle=True,
            class_weight=class_weights,
        )
        # save test metrics
        test_metrics = model.evaluate(test_gen)
        print('Test Set Metrics:')
        output_results['test_metrics'] = []
        for name, val in zip(model.metrics_names, test_metrics):
            output_results['test_metrics'].append({'name': name, 'val': val})
            print("\t{}: {:0.4f}".format(name, val))

        test_nodes = test_data.index
        test_mapper = generator.flow(test_nodes)
        test_predictions = model.predict(test_mapper)
        node_predictions = target_encoding.inverse_transform(test_predictions)
        results = pd.DataFrame(node_predictions,
                               index=test_nodes).idxmax(axis=1)
        df = pd.DataFrame({
            'Predicted': results,
            'True': test_data[target_feature_name]
        })
        clean_result_labels = df['Predicted'].map(
            lambda x: x.replace('subject=', ''))

        # save predicted labels
        pred_labels = np.unique(clean_result_labels.values)
        precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
            df['True'].values,
            clean_result_labels.values,
            average=None,
            labels=pred_labels)
        output_results['classifier'] = []
        for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
            output_results['classifier'].append({
                'label': lbl,
                'precision': prec,
                'recall': rec,
                'fscore': fm
            })

        print(output_results['classifier'])
        print(pred_labels)
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(f1))

        output_results['history'] = {
            'epochs': history.epoch,
            'training_log': history.history,
            'training_params': history.params
        }

        return generator, model, x_inp, x_out, history, target_encoding, output_results
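One way to persist the output_results dictionary returned above (an illustrative sketch, not part of the original class): the Counter values are plain dict subclasses, but the numpy scalars in the per-label metrics need converting before json.dump() will accept them.

import json

def save_results(output_results, path="training_results.json"):
    # Recursively convert numpy scalars (e.g. precision/recall values) into
    # native Python numbers; anything else unexpected falls back to str().
    def _coerce(obj):
        if isinstance(obj, dict):
            return {k: _coerce(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [_coerce(v) for v in obj]
        if hasattr(obj, "item") and callable(obj.item):  # numpy scalar
            return obj.item()
        return obj

    with open(path, "w") as f:
        json.dump(_coerce(output_results), f, indent=2, default=str)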