Ejemplo n.º 1
0
def obfuscate_labels(graph, types_and_roles_to_obfuscate):
    """Replace leaking ``type`` labels on the graph's data dicts.

    Removes label leakage: type labels that indicate candidates are changed
    into their non-candidate labels. Each data dict yielded by
    ``multidigraph_data_iterator(graph)`` is updated in place; nothing is
    returned.

    Args:
        graph: Graph handed to ``multidigraph_data_iterator``.
        types_and_roles_to_obfuscate: Mapping from a label to replace to the
            label that replaces it.
    """
    for element in multidigraph_data_iterator(graph):
        # Find the first mapping entry whose key equals the current type.
        # The hit is wrapped in a 1-tuple so that a replacement label of
        # None can be told apart from "no entry matched".
        hit = next(
            ((new_label,)
             for old_label, new_label in types_and_roles_to_obfuscate.items()
             if element['type'] == old_label),
            None,
        )
        if hit is not None:
            element['type'] = hit[0]
Ejemplo n.º 2
0
def create_input_graph(graph, features_field="features"):
    """Return a copy of ``graph`` prepared as a model input graph.

    The copy's data dicts are passed to ``augment_data_fields`` together with
    the field names ``"input"``, ``"categorical_type"`` and
    ``"encoded_value"`` and the target ``features_field`` (presumably the
    named fields are combined into ``features_field`` -- confirm against
    ``augment_data_fields``). The copy's graph-level ``features_field`` is set
    to five float32 zeros.

    Args:
        graph: Source graph; it is copied with ``graph.copy()`` first.
        features_field: Name of the field that receives the features.

    Returns:
        The prepared copy.
    """
    prepared = graph.copy()
    source_fields = ("input", "categorical_type", "encoded_value")
    element_data = multidigraph_data_iterator(prepared)
    augment_data_fields(element_data, source_fields, features_field)
    # Graph-level (global) feature vector: five float32 zeros.
    prepared.graph[features_field] = np.zeros(5, dtype=np.float32)
    return prepared
Ejemplo n.º 3
0
def create_concept_graphs(example_indices, grakn_session):
    """Build one graph per example id and strip candidate labels from it.

    For every id in ``example_indices`` the query handles are fetched from a
    ``QueryHandler``, a graph is built from them inside a read transaction
    (with ``infer=True``), candidate type labels are replaced by their plain
    counterparts, and the graph's ``name`` is set to the example id.

    Args:
        example_indices: Iterable of example ids.
        grakn_session: Session object providing ``transaction().read()``.

    Returns:
        List of the built graphs, in the order of ``example_indices``.
    """
    # Label leakage removal: candidate label -> label that replaces it.
    relabelling = {
        'candidate-diagnosis': 'diagnosis',
        'candidate-patient': 'patient',
        'candidate-diagnosed-disease': 'diagnosed-disease',
    }

    query_handler = QueryHandler()
    concept_graphs = []

    for example_id in example_indices:
        print(f'Creating graph for example {example_id}')
        handles = query_handler.get_query_handles(example_id)

        # Build a graph from the queries, samplers, and query graphs.
        with grakn_session.transaction().read() as tx:
            concept_graph = build_graph_from_queries(handles, tx, infer=True)

        for element in multidigraph_data_iterator(concept_graph):
            label = element['type']
            if label in relabelling:
                element.update(type=relabelling[label])

        concept_graph.name = example_id
        concept_graphs.append(concept_graph)

    return concept_graphs
Ejemplo n.º 4
0
def obfuscate_labels(graph, types_and_roles_to_obfuscate):
    """Swap candidate-indicating ``type`` labels for non-candidate ones.

    Taken directly from diagnosis.py from the kglib example. Removes label
    leakage by rewriting, in place, the ``"type"`` entry of each data dict
    yielded by ``multidigraph_data_iterator(graph)``. Nothing is returned.

    Args:
        graph: Graph handed to ``multidigraph_data_iterator``.
        types_and_roles_to_obfuscate: Mapping from a label to replace to the
            label that replaces it.
    """

    def _relabel(element):
        # Apply the first mapping entry whose key equals the element's type,
        # then stop looking.
        for old_label, new_label in types_and_roles_to_obfuscate.items():
            if element["type"] == old_label:
                element["type"] = new_label
                return

    for element in multidigraph_data_iterator(graph):
        _relabel(element)
Ejemplo n.º 5
0
def create_target_graph(graph):
    """Return a copy of ``graph`` whose data dicts hold only target features.

    Every data dict yielded by ``multidigraph_data_iterator`` for the copy is
    emptied and given a single ``"features"`` entry: the row of a 3x3 float32
    identity matrix selected by the dict's ``"solution"`` value. The copy's
    graph-level ``"features"`` is set to five float32 zeros.

    Args:
        graph: Source graph; it is copied with ``graph.copy()`` first.

    Returns:
        The target copy.
    """
    target = graph.copy()
    # Row i is the one-hot encoding of solution class i.
    one_hot_rows = np.eye(3, dtype=np.float32)

    for element in multidigraph_data_iterator(target):
        encoded = one_hot_rows[element["solution"]]
        # Drop every other field so only the features remain.
        element.clear()
        element["features"] = encoded

    target.graph["features"] = np.zeros(5, dtype=np.float32)
    return target
Ejemplo n.º 6
0
def create_target_graph(graph, features_field="features"):
    """Return a copy of ``graph`` prepared as a model target graph.

    The copy is passed through ``encode_solutions`` (reading ``"solution"``,
    writing ``"encoded_solution"``, using a 3x3 identity matrix as the
    encodings), then ``augment_data_fields`` is called with the single source
    field ``"encoded_solution"`` and the target ``features_field``. The
    graph-level ``features_field`` is set to five float32 zeros.

    Args:
        graph: Source graph; it is copied with ``graph.copy()`` first.
        features_field: Name of the field that receives the features.

    Returns:
        The prepared copy, as returned by ``encode_solutions``.
    """
    graph_copy = graph.copy()
    # Row i is the one-hot encoding of solution class i (float64).
    one_hot_rows = np.eye(3)
    target = encode_solutions(graph_copy,
                              solution_field="solution",
                              encoded_solution_field="encoded_solution",
                              encodings=one_hot_rows)

    element_data = multidigraph_data_iterator(target)
    augment_data_fields(element_data, ("encoded_solution", ), features_field)

    target.graph[features_field] = np.zeros(5, dtype=np.float32)
    return target
Ejemplo n.º 7
0
def create_input_graph(graph):
    """Return a copy of ``graph`` whose data dicts hold only input features.

    For every data dict yielded by ``multidigraph_data_iterator`` for the
    copy, a "preexists" flag (1 when ``"solution"`` equals 0, otherwise 0) is
    passed to ``stack_features`` together with the dict's
    ``"categorical_type"`` and ``"encoded_value"``. The dict is then emptied
    and the result stored under ``"features"``. The copy's graph-level
    ``"features"`` is set to five float32 zeros.

    Args:
        graph: Source graph; it is copied with ``graph.copy()`` first.

    Returns:
        The input copy.
    """
    prepared = graph.copy()

    for element in multidigraph_data_iterator(prepared):
        preexists = 1 if element["solution"] == 0 else 0
        stacked = stack_features(
            [preexists, element["categorical_type"], element["encoded_value"]])
        # Drop every other field so only the features remain.
        element.clear()
        element["features"] = stacked

    prepared.graph["features"] = np.zeros(5, dtype=np.float32)
    return prepared
Ejemplo n.º 8
0
def encode_solutions(graph,
                     solution_field="solution",
                     encoded_solution_field="encoded_solution",
                     encodings=np.array([[1., 0., 0.],
                                         [0., 1., 0.],
                                         [0., 0., 1.]])):
    """
    Store an encoded form of each element's solution category.

    Note: the default ``encodings`` array is created once, when the function
    is defined, and is therefore the same object on every call that relies on
    the default.

    Args:
        graph: Graph to update; its data dicts are modified in place
        solution_field: Key under which each data dict holds its solution
        encoded_solution_field: Key under which the encoded solution is stored
        encodings: Array indexed by the solution value; the selected entry
            becomes the encoded solution

    Returns: The same ``graph`` object that was passed in

    """
    for element in multidigraph_data_iterator(graph):
        # The current solution value is used directly as an index.
        element[encoded_solution_field] = encodings[element[solution_field]]
    return graph
Ejemplo n.º 9
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):
    """Encode the graphs, run a KGCN learner on them and attach predictions.

    Steps, in order:
      1. Write ``'encoded_value'`` onto every node and edge data dict of the
         given graphs (in place).
      2. Relabel nodes to integers (old label kept under ``'concept'``), then
         pass each graph through ``duplicate_edges_in_reverse`` and
         ``encode_types``.
      3. Build input and target graphs and split both lists at
         ``tr_ge_split``.
      4. Build the embedders, the ``KGCN`` and the ``KGCNLearner``, and call
         the learner.
      5. Plot the training information and the predictions.
      6. Apply the logits of the last output to the integer-indexed graphs
         after the split and add ``'probabilities'`` and ``'prediction'``.

    Args:
        graphs: List of graphs to process.
        tr_ge_split: List index separating the first (training) part from the
            second (generalisation) part.
        node_types: Sequence of node types.
        edge_types: Sequence of edge types.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and
            ``plot_predictions``.
        num_training_iterations: Forwarded to the learner call.
        continuous_attributes: Optional mapping from node type to a
            ``(min, max)`` pair used to rescale the node's ``'value'``.
        categorical_attributes: Optional mapping from node type to a sequence
            of categories; the encoded value is the index of the node's
            ``'value'`` in that sequence.
        type_embedding_dim: Forwarded to ``KGCN``.
        attr_embedding_dim: Forwarded to ``configure_embedders`` and ``KGCN``.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.
        output_dir: Passed to the learner as ``log_dir`` and prepended,
            without any separator, to the plot file names. NOTE(review): with
            the default ``None`` the file names become ``'Nonelearning.png'``
            and ``'Nonegraph.png'``.

    Returns:
        Tuple ``(ge_graphs, solveds_tr, solveds_ge)``; the last two are the
        sixth and seventh items of the learner's training info.
    """
    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for raw_graph in graphs:
        for node in multidigraph_node_data_iterator(raw_graph):
            node_type = node['type']

            if (categorical_attributes is not None
                    and node_type in categorical_attributes):
                # Integer position of the value among the type's categories
                categories = categorical_attributes[node_type]
                node['encoded_value'] = categories.index(node['value'])

            elif (continuous_attributes is not None
                  and node_type in continuous_attributes):
                # Min-max rescaling of the value
                lower, upper = continuous_attributes[node_type]
                node['encoded_value'] = (node['value'] - lower) / (upper - lower)

            else:
                node['encoded_value'] = 0

        for edge in multidigraph_edge_data_iterator(raw_graph):
            edge['encoded_value'] = 0

    indexed_graphs = []
    for raw_graph in graphs:
        indexed_graphs.append(
            nx.convert_node_labels_to_integers(raw_graph,
                                               label_attribute='concept'))

    # Each stage is a separate pass over all graphs.
    with_reverse_edges = [duplicate_edges_in_reverse(g) for g in indexed_graphs]
    encoded_graphs = [
        encode_types(g, node_types, edge_types) for g in with_reverse_edges
    ]

    input_graphs = [create_input_graph(g) for g in encoded_graphs]
    target_graphs = [create_target_graph(g) for g in encoded_graphs]

    training_part = slice(None, tr_ge_split)
    held_out_part = slice(tr_ge_split, None)

    tr_input_graphs = input_graphs[training_part]
    tr_target_graphs = target_graphs[training_part]
    ge_input_graphs = input_graphs[held_out_part]
    ge_target_graphs = target_graphs[held_out_part]

    ############################################################
    # Build and run the KGCN
    ############################################################

    attr_embedders = configure_embedders(node_types, attr_embedding_dim,
                                         categorical_attributes,
                                         continuous_attributes)

    model = KGCN(len(node_types),
                 len(edge_types),
                 type_embedding_dim,
                 attr_embedding_dim,
                 attr_embedders,
                 edge_output_size=edge_output_size,
                 node_output_size=node_output_size)

    learner = KGCNLearner(model,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    _train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations,
        log_dir=output_dir)

    plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
    plot_predictions(ge_input_graphs,
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}graph.png')

    # Only the last entry of the outputs is turned back into graphs.
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    ge_graphs = []
    for indexed_graph, logit_graph in zip(indexed_graphs[held_out_part],
                                          logit_graphs):
        ge_graphs.append(apply_logits_to_graphs(indexed_graph, logit_graph))

    for ge_graph in ge_graphs:
        for element in multidigraph_data_iterator(ge_graph):
            probabilities = softmax(element['logits'])
            element['probabilities'] = probabilities
            element['prediction'] = int(np.argmax(probabilities))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Ejemplo n.º 10
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):
    """Encode the graphs, run a KGCN learner on them and attach predictions.

    Steps, in order:
      1. Pass every graph through ``encode_values``.
      2. Relabel nodes to integers (old label kept under ``'concept'``), then
         pass each graph through ``duplicate_edges_in_reverse`` and through
         ``encode_types`` twice (once for nodes, once for edges).
      3. Build input and target graphs and split both lists at
         ``tr_ge_split``.
      4. Build the ``ThingEmbedder``, ``RoleEmbedder``, ``KGCN`` and
         ``KGCNLearner``, and call the learner.
      5. Plot the training information and the predictions.
      6. Apply the logits of the last output to the integer-indexed graphs
         after the split and add ``'probabilities'`` and ``'prediction'``.

    Args:
        graphs: List of graphs to process.
        tr_ge_split: List index separating the first (training) part from the
            second (generalisation) part.
        node_types: Sequence of node types.
        edge_types: Sequence of edge types.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and
            ``plot_predictions``.
        num_training_iterations: Forwarded to the learner call.
        continuous_attributes: Forwarded to ``encode_values`` and
            ``ThingEmbedder``.
        categorical_attributes: Forwarded to ``encode_values`` and
            ``ThingEmbedder``.
        type_embedding_dim: Forwarded to both embedders.
        attr_embedding_dim: Forwarded to ``ThingEmbedder``.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.
        output_dir: Passed to the learner as ``log_dir`` and prepended,
            without any separator, to the plot file names. NOTE(review): with
            the default ``None`` the file names become ``'Nonelearning.png'``
            and ``'Nonegraph.png'``.

    Returns:
        Tuple ``(ge_graphs, solveds_tr, solveds_ge)``; the last two are the
        sixth and seventh items of the learner's training info.
    """
    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    value_encoded = []
    for raw_graph in graphs:
        value_encoded.append(
            encode_values(raw_graph, categorical_attributes,
                          continuous_attributes))

    indexed_graphs = []
    for graph in value_encoded:
        indexed_graphs.append(
            nx.convert_node_labels_to_integers(graph,
                                               label_attribute='concept'))

    # Each stage below is a separate pass over all graphs.
    encoded_graphs = [duplicate_edges_in_reverse(g) for g in indexed_graphs]
    type_encoding_passes = (
        (multidigraph_node_data_iterator, node_types),
        (multidigraph_edge_data_iterator, edge_types),
    )
    for data_iterator, types in type_encoding_passes:
        encoded_graphs = [
            encode_types(g, data_iterator, types) for g in encoded_graphs
        ]

    input_graphs = [create_input_graph(g) for g in encoded_graphs]
    target_graphs = [create_target_graph(g) for g in encoded_graphs]

    training_part = slice(None, tr_ge_split)
    held_out_part = slice(tr_ge_split, None)

    tr_input_graphs = input_graphs[training_part]
    tr_target_graphs = target_graphs[training_part]
    ge_input_graphs = input_graphs[held_out_part]
    ge_target_graphs = target_graphs[held_out_part]

    ############################################################
    # Build and run the KGCN
    ############################################################

    thing_embedder = ThingEmbedder(node_types, type_embedding_dim,
                                   attr_embedding_dim, categorical_attributes,
                                   continuous_attributes)
    role_embedder = RoleEmbedder(len(edge_types), type_embedding_dim)

    model = KGCN(thing_embedder,
                 role_embedder,
                 edge_output_size=edge_output_size,
                 node_output_size=node_output_size)

    learner = KGCNLearner(model,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    _train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations,
        log_dir=output_dir)

    plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
    # The predictions are plotted against the encoded (not the input) graphs.
    plot_predictions(encoded_graphs[held_out_part],
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}graph.png')

    # Only the last entry of the outputs is turned back into graphs.
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    ge_graphs = []
    for indexed_graph, logit_graph in zip(indexed_graphs[held_out_part],
                                          logit_graphs):
        ge_graphs.append(apply_logits_to_graphs(indexed_graph, logit_graph))

    for ge_graph in ge_graphs:
        for element in multidigraph_data_iterator(ge_graph):
            probabilities = softmax(element['logits'])
            element['probabilities'] = probabilities
            element['prediction'] = int(np.argmax(probabilities))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Ejemplo n.º 11
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3):
    """Encode the graphs, run a KGCN learner on them and attach predictions.

    Steps, in order:
      1. Write ``'encoded_value'`` onto every data dict of the given graphs
         (in place): 0 by default, or the category index for nodes whose type
         is a key of ``categorical_attributes``.
      2. Relabel nodes to integers (old label kept under ``'concept'``), then
         pass each graph through ``duplicate_edges_in_reverse`` and
         ``encode_types``.
      3. Build input and target graphs and split both lists at
         ``tr_ge_split``.
      4. Build one embedder factory per categorical attribute type plus one
         blank factory for all remaining node types, then build the ``KGCN``
         and ``KGCNLearner`` and call the learner.
      5. Plot the training information and the predictions.
      6. Apply the logits of the last output to the integer-indexed graphs
         after the split and add ``'probabilities'`` and ``'prediction'``.

    Args:
        graphs: List of graphs to process.
        tr_ge_split: List index separating the first (training) part from the
            second (generalisation) part.
        node_types: Sequence of node types; must contain every key of
            ``categorical_attributes``.
        edge_types: Sequence of edge types.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and
            ``plot_predictions``.
        num_training_iterations: Forwarded to the learner call.
        categorical_attributes: Optional mapping from node type to a sequence
            of categories. ``None`` is treated as "no categorical attributes".
        type_embedding_dim: Forwarded to ``KGCN``.
        attr_embedding_dim: Output size of every attribute embedder.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.

    Returns:
        Tuple ``(ge_graphs, solveds_tr, solveds_ge)``; the last two are the
        sixth and seventh items of the learner's training info.

    Raises:
        ValueError: If a node's ``'value'`` is not among the categories of
            its type, or a categorical attribute type is missing from
            ``node_types``.
    """
    # The signature allows None, but everything below needs a mapping.
    if categorical_attributes is None:
        categorical_attributes = {}

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:

        for data in multidigraph_data_iterator(graph):
            data['encoded_value'] = 0

        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            # Add the integer value of the category for each categorical attribute instance
            if typ in categorical_attributes:
                category_values = categorical_attributes[typ]
                node_data['encoded_value'] = category_values.index(
                    node_data['value'])

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    # Indices of all node types; attribute types are removed as they are
    # assigned their own embedder.
    non_attribute_nodes = list(range(len(node_types)))

    attr_embedders = dict()

    # Construct categorical attribute embedders
    for attr_typ, category_values in categorical_attributes.items():

        # Bind this iteration's values as parameter defaults. A plain closure
        # looks the loop variables up when it is *called*, so every factory
        # invoked after the loop would use the last attribute's name and
        # category count.
        def make_embedder(num_categories=len(category_values),
                          attr_typ=attr_typ):
            return CategoricalAttribute(num_categories,
                                        attr_embedding_dim,
                                        name=attr_typ + '_cat_embedder')

        attr_typ_index = node_types.index(attr_typ)

        # Record the embedder, and the index of the type that it should encode
        attr_embedders[make_embedder] = [attr_typ_index]

        # Remove by value: the list holds type indices, and positions shift
        # after the first removal, so pop(attr_typ_index) would drop the
        # wrong entry (or raise IndexError).
        non_attribute_nodes.remove(attr_typ_index)

    # All entities and relations (non-attributes) also need an embedder with matching output dimension, which does
    # nothing. This is provided as a list of their indices
    def make_blank_embedder():
        return BlankAttribute(attr_embedding_dim)

    attr_embedders[make_blank_embedder] = non_attribute_nodes

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    _train_values, test_values, tr_info = learner(
        tr_input_graphs,
        tr_target_graphs,
        ge_input_graphs,
        ge_target_graphs,
        num_training_iterations=num_training_iterations)

    plot_across_training(*tr_info)
    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge)

    # Only the last entry of the outputs is turned back into graphs.
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
Ejemplo n.º 12
0
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None,
             do_test=False,
             save_fle="test_model.ckpt",
             reload_fle=""):
    """Encode the graphs, train or validate a KGCN and attach predictions.

    Steps, in order:
      1. Pass every graph through ``encode_values``.
      2. Relabel nodes to integers (old label kept under ``'concept'``), then
         pass each graph through ``duplicate_edges_in_reverse`` and through
         ``encode_types`` twice (once for nodes, once for edges).
      3. Build input and target graphs and split both lists at
         ``tr_ge_split``.
      4. Build the ``ThingEmbedder``, ``RoleEmbedder``, ``KGCN`` and
         ``KGCNLearner``.
      5. Call ``learner.infer`` on the second part only when
         ``output_dir / reload_fle`` is not a directory and ``do_test`` is
         ``True``; otherwise call ``learner.train``.
      6. Plot the training information and the predictions.
      7. Apply the logits of the last output to the integer-indexed graphs
         after the split and add ``'probabilities'`` and ``'prediction'``.

    Args:
        graphs: List of graphs to process.
        tr_ge_split: List index separating the first (training) part from the
            second (generalisation) part.
        node_types: Sequence of node types.
        edge_types: Sequence of edge types.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``; per the
            original comment, the number of message-passing iterations per
            training step.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and
            ``plot_predictions``.
        num_training_iterations: Forwarded to ``learner.train``.
        continuous_attributes: Forwarded to ``encode_values`` and
            ``ThingEmbedder``.
        categorical_attributes: Forwarded to ``encode_values`` and
            ``ThingEmbedder``.
        type_embedding_dim: Forwarded to both embedders.
        attr_embedding_dim: Forwarded to ``ThingEmbedder``.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.
        output_dir: Directory used for the learner's ``log_dir``, the
            save/reload paths and the plot files. NOTE(review): the default
            ``None`` cannot be passed to ``Path`` -- callers presumably
            always supply a value; confirm.
        do_test: Request validation only (see step 5).
        save_fle: File name joined onto ``output_dir`` for the learner's
            ``save_fle``.
        reload_fle: File name joined onto ``output_dir`` for the learner's
            ``reload_fle``.

    Returns:
        Tuple ``(ge_graphs, solveds_tr, solveds_ge)``; the last two are the
        sixth and seventh items of the info returned by the learner.
    """
    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    value_encoded = []
    for raw_graph in graphs:
        value_encoded.append(
            encode_values(raw_graph, categorical_attributes,
                          continuous_attributes))

    indexed_graphs = []
    for graph in value_encoded:
        indexed_graphs.append(
            nx.convert_node_labels_to_integers(graph,
                                               label_attribute='concept'))

    # Each stage below is a separate pass over all graphs.
    encoded_graphs = [duplicate_edges_in_reverse(g) for g in indexed_graphs]
    type_encoding_passes = (
        (multidigraph_node_data_iterator, node_types),
        (multidigraph_edge_data_iterator, edge_types),
    )
    for data_iterator, types in type_encoding_passes:
        encoded_graphs = [
            encode_types(g, data_iterator, types) for g in encoded_graphs
        ]

    input_graphs = [create_input_graph(g) for g in encoded_graphs]
    target_graphs = [create_target_graph(g) for g in encoded_graphs]

    training_part = slice(None, tr_ge_split)
    held_out_part = slice(tr_ge_split, None)

    tr_input_graphs = input_graphs[training_part]
    tr_target_graphs = target_graphs[training_part]
    ge_input_graphs = input_graphs[held_out_part]
    ge_target_graphs = target_graphs[held_out_part]

    ############################################################
    # Build and run the KGCN
    ############################################################

    thing_embedder = ThingEmbedder(node_types, type_embedding_dim,
                                   attr_embedding_dim, categorical_attributes,
                                   continuous_attributes)
    role_embedder = RoleEmbedder(len(edge_types), type_embedding_dim)

    model = KGCN(thing_embedder,
                 role_embedder,
                 edge_output_size=edge_output_size,
                 node_output_size=node_output_size)

    save_path = f'{output_dir}/{save_fle}'
    reload_path = f'{output_dir}/{reload_fle}'
    learner = KGCNLearner(model,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge,
                          log_dir=output_dir,
                          save_fle=save_path,
                          reload_fle=reload_path)

    # The directory test is evaluated first, before do_test is looked at.
    reload_location = Path(output_dir) / reload_fle
    validation_only = not reload_location.is_dir() and do_test is True

    if validation_only:
        # only test
        print("\n\nVALIDATION ONLY\n\n")
        test_values, tr_info = learner.infer(ge_input_graphs, ge_target_graphs)
    else:
        # train
        print("\n\nTRAINING\n\n")
        _train_values, test_values, tr_info = learner.train(
            tr_input_graphs,
            tr_target_graphs,
            ge_input_graphs,
            ge_target_graphs,
            num_training_iterations=num_training_iterations)

    plot_across_training(*tr_info, output_file=f'{output_dir}/learning.png')
    # The predictions are plotted against the encoded (not the input) graphs.
    plot_predictions(encoded_graphs[held_out_part],
                     test_values,
                     num_processing_steps_ge,
                     output_file=f'{output_dir}/graph.png')

    # Only the last entry of the outputs is turned back into graphs.
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    ge_graphs = []
    for indexed_graph, logit_graph in zip(indexed_graphs[held_out_part],
                                          logit_graphs):
        ge_graphs.append(apply_logits_to_graphs(indexed_graph, logit_graph))

    for ge_graph in ge_graphs:
        for element in multidigraph_data_iterator(ge_graph):
            probabilities = softmax(element['logits'])
            element['probabilities'] = probabilities
            # assign 0, 1 or 2 based on the argmax -> TODO: threshold
            element['prediction'] = int(np.argmax(probabilities))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge