Exemple #1
0
def encode_types(graph, node_types, edge_types):
    """Run the categorical type encoding over the nodes and edges of `graph`.

    The node data is processed first (against `node_types`), then the edge
    data (against `edge_types`). Both passes hand the attribute names
    'type' and 'categorical_type' to `encode_categorically` -- presumably the
    key that is read and the key that is written; confirm against that helper.

    Args:
        graph: The graph whose node and edge data are encoded.
        node_types: The collection of types passed along for the node pass.
        edge_types: The collection of types passed along for the edge pass.

    Returns:
        The same `graph` object that was passed in.
    """
    encoding_passes = (
        (multidigraph_node_data_iterator, node_types),
        (multidigraph_edge_data_iterator, edge_types),
    )
    for make_iterator, known_types in encoding_passes:
        encode_categorically(
            make_iterator(graph), known_types, 'type', 'categorical_type')
    return graph
Exemple #2
0
    def __call__(self, graph):
        """Turn `graph` into a graph whose data dicts hold only features and targets.

        Steps, in order: optional label obfuscation, attribute value
        encoding, relabelling of nodes with integers (the old label is kept
        under `self.label_attribute`), optional reverse-edge duplication and
        type encoding. Finally every node data dict is reduced to the keys
        "x" / "y" and every edge data dict to "edge_attr" / "y_edge".

        Args:
            graph: The graph to transform.

        Returns:
            The transformed graph.
        """
        if self.obfuscate:
            obfuscate_labels(graph, self.obfuscate)

        # Attribute values become numbers before the nodes are relabelled
        encoded = encode_values(graph, self.categorical, self.continuous)
        graph = nx.convert_node_labels_to_integers(
            encoded, label_attribute=self.label_attribute
        )

        if self.duplicate:
            graph = duplicate_edges_in_reverse(graph)

        # Node types first, then edge types, each encoded as an int
        graph = encode_types(graph, multidigraph_node_data_iterator, self.node_types)
        graph = encode_types(graph, multidigraph_edge_data_iterator, self.edge_types)

        # (iterator, key for the feature vector, key for the target)
        write_back_plan = (
            (multidigraph_node_data_iterator, "x", "y"),
            (multidigraph_edge_data_iterator, "edge_attr", "y_edge"),
        )
        for iterate, feature_key, target_key in write_back_plan:
            for attrs in iterate(graph):
                # Both values must be read before the dict is wiped
                vector = create_feature_vector(attrs)
                label = attrs[self.target_name]
                attrs.clear()
                attrs[feature_key] = vector
                attrs[target_key] = label

        return graph
Exemple #3
0
def encode_values(graph, categorical_attributes, continuous_attributes):
    """Store a numeric 'encoded_value' on every node and edge data dict of `graph`.

    For a node whose 'type' is a key of `categorical_attributes`, the encoded
    value is the position of the node's 'value' in that type's list of
    categories. Otherwise, if the type is a key of `continuous_attributes`,
    the 'value' is rescaled with the (min, max) pair stored for that type.
    Every other node, and every edge, gets 0.

    Args:
        graph: The graph whose data dicts are updated in place.
        categorical_attributes: Mapping of type -> sequence of categories,
            or None.
        continuous_attributes: Mapping of type -> (min, max), or None.

    Returns:
        The same `graph` object that was passed in.
    """
    for attrs in multidigraph_node_data_iterator(graph):
        kind = attrs['type']

        if categorical_attributes is not None and kind in categorical_attributes:
            # Integer position of the value within its type's categories
            encoded = categorical_attributes[kind].index(attrs['value'])
        elif continuous_attributes is not None and kind in continuous_attributes:
            lower, upper = continuous_attributes[kind]
            encoded = (attrs['value'] - lower) / (upper - lower)
        else:
            encoded = 0

        attrs['encoded_value'] = encoded

    for attrs in multidigraph_edge_data_iterator(graph):
        attrs['encoded_value'] = 0

    return graph
Exemple #4
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):
    """Encode the graphs, train a KGCN on them and return the predictions.

    Args:
        graphs: The graphs to learn from. Node data must hold a 'type' key,
            and a 'value' key for types listed in the attribute mappings.
            The data dicts of these graphs are updated in place with an
            'encoded_value' key.
        tr_ge_split: Index at which the graphs are split: graphs before it
            are the "tr" set, the remainder the "ge" set.
        node_types: The node types; its length is passed to the KGCN.
        edge_types: The edge types; its length is passed to the KGCN.
        num_processing_steps_tr: Processing steps handed to the learner for
            the "tr" graphs.
        num_processing_steps_ge: Processing steps handed to the learner for
            the "ge" graphs.
        num_training_iterations: Number of training iterations.
        continuous_attributes: Mapping of type -> (min, max) used to rescale
            node values, or None.
        categorical_attributes: Mapping of type -> sequence of categories
            used to index node values, or None.
        type_embedding_dim: Passed to the KGCN.
        attr_embedding_dim: Passed to the KGCN and the embedder configuration.
        edge_output_size: Passed to the KGCN.
        node_output_size: Passed to the KGCN.
        output_dir: Passed to the learner as `log_dir` and used as a plain
            string prefix (no path separator is inserted) for the plot files
            'learning.png' and 'graph.png'. When None, the plots are written
            with no prefix.

    Returns:
        A tuple `(ge_graphs, solveds_tr, solveds_ge)`: the "ge" graphs with
        'probabilities' and 'prediction' added to each data dict, and the
        last two entries of the learner's training info.
    """

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:
        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            if categorical_attributes is not None and typ in categorical_attributes:
                # Add the integer value of the category for each categorical attribute instance
                category_values = categorical_attributes[typ]
                node_data['encoded_value'] = category_values.index(
                    node_data['value'])

            elif continuous_attributes is not None and typ in continuous_attributes:
                min_val, max_val = continuous_attributes[typ]
                node_data['encoded_value'] = (node_data['value'] -
                                              min_val) / (max_val - min_val)

            else:
                node_data['encoded_value'] = 0

        for edge_data in multidigraph_edge_data_iterator(graph):
            edge_data['encoded_value'] = 0

    # The integer-indexed graphs are kept: the logits are applied to them at the end
    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    attr_embedders = configure_embedders(node_types, attr_embedding_dim,
                                         categorical_attributes,
                                         continuous_attributes)

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations,
        log_dir=output_dir)

    # Formatting None into the f-strings would yield 'Nonelearning.png', so
    # fall back to an empty prefix when no output directory was given
    file_prefix = '' if output_dir is None else output_dir

    plot_across_training(*tr_info, output_file=f'{file_prefix}learning.png')
    plot_predictions(ge_input_graphs,
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{file_prefix}graph.png')

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Exemple #5
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3):
    """Encode the graphs, train a KGCN on them and return the predictions.

    Args:
        graphs: The graphs to learn from. Node data must hold a 'type' key,
            and a 'value' key for types listed in `categorical_attributes`.
            The data dicts of these graphs are updated in place with an
            'encoded_value' key.
        tr_ge_split: Index at which the graphs are split: graphs before it
            are the "tr" set, the remainder the "ge" set.
        node_types: List of node types; attribute types are looked up in it
            by position.
        edge_types: The edge types; its length is passed to the KGCN.
        num_processing_steps_tr: Processing steps handed to the learner for
            the "tr" graphs.
        num_processing_steps_ge: Processing steps handed to the learner for
            the "ge" graphs.
        num_training_iterations: Number of training iterations.
        categorical_attributes: Mapping of attribute type -> sequence of
            categories. None is treated as "no categorical attributes".
        type_embedding_dim: Passed to the KGCN.
        attr_embedding_dim: Passed to the KGCN and to every embedder.
        edge_output_size: Passed to the KGCN.
        node_output_size: Passed to the KGCN.

    Returns:
        A tuple `(ge_graphs, solveds_tr, solveds_ge)`: the "ge" graphs with
        'probabilities' and 'prediction' added to each data dict, and the
        last two entries of the learner's training info.

    Raises:
        ValueError: If a categorical attribute type is not in `node_types`,
            or a node value is not among its type's categories.
    """
    # The default of None must behave like an empty mapping rather than crash
    if categorical_attributes is None:
        categorical_attributes = {}

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:

        for data in multidigraph_data_iterator(graph):
            data['encoded_value'] = 0

        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            # Add the integer value of the category for each categorical attribute instance
            if typ in categorical_attributes:
                node_data['encoded_value'] = categorical_attributes[typ].index(
                    node_data['value'])

    # The integer-indexed graphs are kept: the logits are applied to them at the end
    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    # Starts as the index of every node type; attribute types are removed below
    non_attribute_nodes = list(range(len(node_types)))

    attr_embedders = dict()

    # Construct categorical attribute embedders
    for attr_typ, category_values in categorical_attributes.items():

        # Bind the loop values as defaults: a plain closure would look them up
        # when the embedder is eventually built, i.e. after the loop finished,
        # and every embedder would then use the last attribute's values
        def make_embedder(num_categories=len(category_values),
                          attr_typ=attr_typ):
            return CategoricalAttribute(num_categories,
                                        attr_embedding_dim,
                                        name=attr_typ + '_cat_embedder')

        attr_typ_index = node_types.index(attr_typ)

        # Record the embedder, and the index of the type that it should encode
        attr_embedders[make_embedder] = [attr_typ_index]

        # Remove by value, not by position: positions shift after each removal
        non_attribute_nodes.remove(attr_typ_index)

    # All entities and relations (non-attributes) also need an embedder with matching output dimension, which does
    # nothing. This is provided as a list of their indices
    def make_blank_embedder():
        return BlankAttribute(attr_embedding_dim)

    attr_embedders[make_blank_embedder] = non_attribute_nodes

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations)

    plot_across_training(*tr_info)
    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge)

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge