Example #1
    def test_edges_are_duplicated_as_expected(self):
        graph = nx.MultiDiGraph(name=0)

        p0 = Thing('V123', 'person', 'entity')
        p1 = Thing('V456', 'person', 'entity')
        par0 = Thing('V789', 'parentship', 'relation')

        # people
        graph.add_node(p0, type='person', solution=1)
        graph.add_node(p1, type='person', solution=1)

        # parentships
        graph.add_node(par0, type='parentship', solution=1)
        graph.add_edge(par0, p0, type='parent', solution=1)
        graph.add_edge(par0, p1, type='child', solution=1)

        duplicate_edges_in_reverse(graph)

        expected_graph = nx.MultiDiGraph(name=0)

        # people
        expected_graph.add_node(p0, type='person', solution=1)
        expected_graph.add_node(p1, type='person', solution=1)

        # parentships
        expected_graph.add_node(par0, type='parentship', solution=1)
        expected_graph.add_edge(par0, p0, type='parent', solution=1)
        expected_graph.add_edge(par0, p1, type='child', solution=1)

        # Duplicates
        expected_graph.add_edge(p0, par0, type='parent', solution=1)
        expected_graph.add_edge(p1, par0, type='child', solution=1)
        self.assertGraphsEqual(expected_graph, graph)
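The test above pins down the expected behaviour: every directed edge gains a reversed counterpart carrying the same attribute data. A minimal sketch consistent with those expectations, assuming the function mutates and returns the same MultiDiGraph (this is not necessarily the library's actual implementation):

import networkx as nx

def duplicate_edges_in_reverse(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """Add a reversed copy of every existing edge, carrying over its data."""
    # Snapshot the edge list first so the newly added reverse edges are not revisited
    for sender, receiver, data in list(graph.edges(data=True)):
        graph.add_edge(receiver, sender, **data)
    return graph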
Example #2
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:
        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            if categorical_attributes is not None and typ in categorical_attributes:
                # Add the integer value of the category for each categorical attribute instance
                category_values = categorical_attributes[typ]
                node_data['encoded_value'] = category_values.index(node_data['value'])

            elif continuous_attributes is not None and typ in continuous_attributes:
                # Min-max normalise continuous attribute values into [0, 1]
                min_val, max_val = continuous_attributes[typ]
                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)

            else:
                node_data['encoded_value'] = 0

        for edge_data in multidigraph_edge_data_iterator(graph):
            edge_data['encoded_value'] = 0

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    attr_embedders = configure_embedders(node_types, attr_embedding_dim,
                                         categorical_attributes,
                                         continuous_attributes)

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations,
        log_dir=output_dir)

    plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
    plot_predictions(ge_input_graphs,
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}graph.png')

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
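A minimal, hypothetical invocation of this pipeline; graphs, node_types and edge_types are assumed to have been built elsewhere, and the attribute dictionaries and the 80/20 split below are illustrative placeholders only:

# Hypothetical usage sketch; 'name' and 'age' are placeholder attribute types.
tr_ge_split = int(len(graphs) * 0.8)  # first 80% for training, the rest for generalisation

ge_graphs, solveds_tr, solveds_ge = pipeline(
    graphs,
    tr_ge_split,
    node_types,
    edge_types,
    num_training_iterations=1000,
    categorical_attributes={'name': ['Alice', 'Bob', 'Carol']},
    continuous_attributes={'age': (0, 100)},
    output_dir='./out/')  # note: output paths are built as f'{output_dir}learning.png'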
Example #3
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    graphs = [
        encode_values(graph, categorical_attributes, continuous_attributes)
        for graph in graphs
    ]

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [
        encode_types(graph, multidigraph_node_data_iterator, node_types)
        for graph in graphs
    ]
    graphs = [
        encode_types(graph, multidigraph_edge_data_iterator, edge_types)
        for graph in graphs
    ]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    thing_embedder = ThingEmbedder(node_types, type_embedding_dim,
                                   attr_embedding_dim, categorical_attributes,
                                   continuous_attributes)

    role_embedder = RoleEmbedder(len(edge_types), type_embedding_dim)

    kgcn = KGCN(thing_embedder,
                role_embedder,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations,
        log_dir=output_dir)

    plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
    plot_predictions(graphs[tr_ge_split:],
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}graph.png')

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Example #4
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3):

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:

        for data in multidigraph_data_iterator(graph):
            data['encoded_value'] = 0

        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            # Add the integer value of the category for each categorical attribute instance
            for attr_typ, category_values in categorical_attributes.items():
                if typ == attr_typ:
                    node_data['encoded_value'] = category_values.index(
                        node_data['value'])

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    type_categories_list = [i for i, _ in enumerate(node_types)]
    non_attribute_nodes = type_categories_list.copy()

    attr_embedders = dict()

    # Construct categorical attribute embedders
    for attr_typ, category_values in categorical_attributes.items():
        num_categories = len(category_values)

        # Bind the loop variables as default arguments so each embedder keeps its
        # own category count and name (avoids closure late-binding)
        def make_embedder(num_categories=num_categories, attr_typ=attr_typ):
            return CategoricalAttribute(num_categories,
                                        attr_embedding_dim,
                                        name=attr_typ + '_cat_embedder')

        attr_typ_index = node_types.index(attr_typ)

        # Record the embedder, and the index of the type that it should encode
        attr_embedders[make_embedder] = [attr_typ_index]

        # Remove by value, not by position, so earlier removals don't shift later indices
        non_attribute_nodes.remove(attr_typ_index)

    # All entities and relations (non-attributes) also need an embedder with a matching
    # output dimension that does nothing; it is registered against the list of their type indices
    def make_blank_embedder():
        return BlankAttribute(attr_embedding_dim)

    attr_embedders[make_blank_embedder] = non_attribute_nodes

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations)

    plot_across_training(*tr_info)
    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge)

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Example #5
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None,
             do_test=False,
             save_fle="test_model.ckpt",
             reload_fle=""):

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    graphs = [
        encode_values(graph, categorical_attributes, continuous_attributes)
        for graph in graphs
    ]

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [
        encode_types(graph, multidigraph_node_data_iterator, node_types)
        for graph in graphs
    ]
    graphs = [
        encode_types(graph, multidigraph_edge_data_iterator, edge_types)
        for graph in graphs
    ]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    thing_embedder = ThingEmbedder(node_types, type_embedding_dim,
                                   attr_embedding_dim, categorical_attributes,
                                   continuous_attributes)

    role_embedder = RoleEmbedder(len(edge_types), type_embedding_dim)

    kgcn = KGCN(thing_embedder,
                role_embedder,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    # num_processing_steps_* indicate how many message-passing iterations to do
    # for every training / testing step
    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge,
                          log_dir=output_dir,
                          save_fle=f'{output_dir}/{save_fle}',
                          reload_fle=f'{output_dir}/{reload_fle}')

    # only test
    if not (Path(output_dir) / reload_fle).is_dir() and do_test is True:
        print("\n\nVALIDATION ONLY\n\n")
        test_values, tr_info = learner.infer(ge_input_graphs, ge_target_graphs)
        #,log_dir=output_dir)
    # train
    else:
        print("\n\nTRAINING\n\n")
        train_values, test_values, tr_info = learner.train(
            tr_input_graphs,  #input_graphs
            tr_target_graphs,
            ge_input_graphs,
            ge_target_graphs,
            num_training_iterations=num_training_iterations)
        #,log_dir=output_dir)

    plot_across_training(*tr_info, output_file=f'{output_dir}/learning.png')
    plot_predictions(graphs[tr_ge_split:],
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}/graph.png')

    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            # assign 0, 1 or 2 based on the argmax of the logits -> TODO: threshold
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
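For this variant, a hypothetical call that exercises the checkpointing parameters; the names below are placeholders. Per the branch in the code above, passing do_test=True together with a reload_fle that names a checkpoint file (anything that is not a directory) selects the validation-only path via learner.infer; otherwise the model is trained and saved to save_fle:

# Hypothetical usage sketch; the checkpoint file name is a placeholder.
ge_graphs, solveds_tr, solveds_ge = pipeline(
    graphs,
    tr_ge_split,
    node_types,
    edge_types,
    num_training_iterations=1000,
    categorical_attributes={'name': ['Alice', 'Bob', 'Carol']},
    output_dir='./out',
    do_test=True,
    save_fle='my_model.ckpt',
    reload_fle='my_model.ckpt')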