def compute_rballs_tw(in_files, output_dir):
    nx_graph, uri_node_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=True)
    node_uri_map = {node: uri.replace(u",", u"[comma]").replace(u"\n", u"[new_line]") for uri, node in uri_node_map.items()}
    nodes_in_graph = nx_graph.number_of_nodes()
    print "Nodes in graph:", nodes_in_graph
    
    for d in ["in", "out", "all"]:
        rballs_with_big_tw = set()
        for r in [2, 3, 4, 5]:
            out_file = codecs.open(output_dir + "tw_r{0}_{1}".format(r, d), "w", "utf8")
            
            i = 0
            for node in nx_graph.nodes_iter():
                print "-------------------------------------"
                print u"Node {0}/{1} ({2})".format(i, nodes_in_graph, node_uri_map[node])
                print "r = {0}, d = {1}".format(r, d)
                if node in rballs_with_big_tw:
                    # don't compute treewidth for r-balls which are known to be big
                    tw = -1
                else:
                    rball = algorithms.r_ball(nx_graph, node, r, -1 if d == "in" else 1 if d == "out" else 0)
                    print "r-ball nodes:", rball.number_of_nodes()
                    tw = arnborg_proskurowski.get_treewidth(rball)
                    if tw == -1:
                        rballs_with_big_tw.add(node)
                print "Treewidth: ", tw
                line = u"{0},{1}\n".format(node_uri_map[node], tw)
                out_file.write(line)
#                 nxext.visualize_graph(rball, node_labels=True, edge_labels=False)
                i += 1
             
            out_file.close()
Beispiel #2
0
 def testRDFToNxGraphConvertionWithColoring(self):
     """The RDF-to-NetworkX conversion should yield the expected colored graph."""
     graph, _ = rdf.convert_rdf_to_nx_graph(["test_files/dummy.rdf"],
                                            test_mode=True)
     same = algorithms.isomorphic(example_graphs.gt_dummy_colored_expected,
                                  graph)
     self.assertTrue(
         same, "Problem converting RDF graph to Networkx graph with colors.")
def calculate_ch_matrix():
    in_files = helpers.datasets[dataset]["files"]

    print "Converting RDF to NetworkX graph started at", time.strftime(
        time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files,
                                                     discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Building characteristic matrix started at", time.strftime(
        time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(
        hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database,
                                     hypergraph.number_of_nodes(),
                                     wl_iterations=wl_iterations,
                                     print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving Column index to Node map started at", time.strftime(
        time_format)
    start = time.time()
    inout.save_to_file(index_node_map,
                       path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"

    return ch_matrix, hypergraph, index_node_map, node_id_map
def calculate_ch_matrix():
    """Build and persist the characteristic matrix for the configured dataset.

    NOTE(review): this is a second definition of calculate_ch_matrix in this
    file (differing only in formatting from the earlier one); being later,
    it is the one that takes effect at import time.

    Reads the module-level configuration (dataset, path, time_format, r_in,
    r_out, r_all, wl_iterations), converts the dataset's RDF files into a
    NetworkX graph, wraps it in a Hypergraph, extracts the r-ball database
    and builds the characteristic matrix, saving each intermediate artifact
    under "path".

    :return: (ch_matrix, hypergraph, index_node_map, node_id_map)
    """
    in_files = helpers.datasets[dataset]["files"]
    
    # Step 1: RDF -> NetworkX graph (classes kept, per discard_classes=False).
    print "Converting RDF to NetworkX graph started at", time.strftime(time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 2: persist the node-id map so results can be traced back to URIs.
    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 3: wrap the plain graph in a Hypergraph.
    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 4: persist the hypergraph.
    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 5: extract per-node r-balls and build the characteristic matrix.
    print "Building characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(), wl_iterations=wl_iterations, print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 6: persist the column-index -> node map.
    print "Saving Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    
    # Step 7: persist the characteristic matrix itself.
    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    
    return ch_matrix, hypergraph, index_node_map, node_id_map
def compute_rballs_tw(in_files, output_dir):
    """Compute the treewidth of the r-ball around every node of an RDF graph.

    NOTE(review): this is a second definition of compute_rballs_tw in this
    file (differing only in formatting from the earlier one); being later,
    it is the one that takes effect at import time.

    For each direction d in {"in", "out", "all"} and each radius r in
    {2, 3, 4, 5}, writes a file "tw_r{r}_{d}" into output_dir containing one
    "node_uri,treewidth" line per node; -1 marks an r-ball whose treewidth
    could not be computed (too big).

    :param in_files: RDF input files passed to rdf.convert_rdf_to_nx_graph.
    :param output_dir: output directory prefix (must end with a separator,
        since it is concatenated directly with the file name).
    """
    nx_graph, uri_node_map = rdf.convert_rdf_to_nx_graph(in_files,
                                                         discard_classes=True)
    # Escape characters that would corrupt the comma-separated output lines.
    node_uri_map = {
        node: uri.replace(u",", u"[comma]").replace(u"\n", u"[new_line]")
        for uri, node in uri_node_map.items()
    }
    nodes_in_graph = nx_graph.number_of_nodes()
    print "Nodes in graph:", nodes_in_graph

    for d in ["in", "out", "all"]:
        # Nodes whose ball was already too big at a smaller radius; the set
        # is shared across all r for the current direction.
        rballs_with_big_tw = set()
        for r in [2, 3, 4, 5]:
            out_file = codecs.open(output_dir + "tw_r{0}_{1}".format(r, d),
                                   "w", "utf8")

            i = 0
            for node in nx_graph.nodes_iter():
                print "-------------------------------------"
                print u"Node {0}/{1} ({2})".format(i, nodes_in_graph,
                                                   node_uri_map[node])
                print "r = {0}, d = {1}".format(r, d)
                if node in rballs_with_big_tw:
                    # don't compute treewidth for r-balls which are known to be big
                    tw = -1
                else:
                    # Edge direction: -1 follows in-edges, 1 out-edges, 0 both.
                    rball = algorithms.r_ball(
                        nx_graph, node, r,
                        -1 if d == "in" else 1 if d == "out" else 0)
                    print "r-ball nodes:", rball.number_of_nodes()
                    tw = arnborg_proskurowski.get_treewidth(rball)
                    if tw == -1:
                        rballs_with_big_tw.add(node)
                print "Treewidth: ", tw
                line = u"{0},{1}\n".format(node_uri_map[node], tw)
                out_file.write(line)
                #                 nxext.visualize_graph(rball, node_labels=True, edge_labels=False)
                i += 1

            out_file.close()
Beispiel #6
0
 def testRDFToNxGraphConvertionWithColoring(self):
     """The RDF-to-NetworkX conversion should yield the expected colored graph."""
     dummy_colored, _ = rdf.convert_rdf_to_nx_graph(["test_files/dummy.rdf"], test_mode=True)
     isomorphic = algorithms.isomorphic(example_graphs.gt_dummy_colored_expected, dummy_colored)
     self.assertTrue(isomorphic, "Problem converting RDF graph to Networkx graph with colors.")
Beispiel #7
0
# Cross-validation configuration for similar-nodes mining on an RDF dataset.
dataset = "drugadmin"
# Weisfeiler-Lehman iteration counts to try.
wl_iter_range = [3] # range(0, 10)
# (k, L) sketch-parameter pairs; the inflection-point comments presumably map
# each pair to the similarity threshold it approximates -- TODO confirm.
k_L_range = [
    (20, 1),    # inflection point ~0.
    (15, 5),    # inflection point 0.1
    (10, 9),    # inflection point 0.2
    (7, 12),    # inflection point 0.3
    (5, 13),    # inflection point 0.4
    (4, 16),    # inflection point 0.5
    (3, 16),    # inflection point 0.6
    (2, 11),    # inflection point 0.7
    (2, 25),    # inflection point 0.8
    (1, 10),    # inflection point 0.9
    (1, 20),    # inflection point ~1.
]
infl_point_range = [0., 0.0000001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]
p_range = [1]
# r-ball radii to try for in-edges, out-edges and undirected neighborhoods.
r_in_range = [3] # range(0, 4)
r_out_range = [2] # range(0, 4)
r_all_range = [0]

output_dir = "../output_rdf/crossval_test/"

if __name__ == '__main__':
    # Build the hypergraph for the configured dataset and run leave-one-out
    # cross-validation over the parameter ranges above (using the
    # inflection-point range; the k_L_range variant is kept commented out).
    in_files = helpers.datasets[dataset]["files"]
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    hypergraph = Hypergraph(graph)
    best_model = crossval.loo_crossval(hypergraph, wl_iter_range, r_in_range, r_out_range, r_all_range, output_dir, infl_point_range=infl_point_range)
#     best_model = crossval.loo_crossval(hypergraph, wl_iter_range, r_in_range, r_out_range, r_all_range, output_dir, k_L_range=k_L_range)
    print "Best model:", best_model
Beispiel #8
0
def prepare_rdf_chemical_data(rdf_files, compounds_targets_file, uri_prefix, process_compound_function=None,
                              compounds_and_targets=None, sort_rdf_nodes_before_processing=False,
                              rdf_colors_state=None):
    """Build a lazy chemical-compound hypergraph database from RDF files.

    Converts rdf_files to a colored NetworkX graph, removes literal nodes
    (keeping booleans), wraps the result in a Hypergraph, and yields one
    record per compound: (comp_id, [2-ball hypergraph around the compound's
    node], target_label).

    :param rdf_files: RDF input files for rdf.convert_rdf_to_nx_graph.
    :param compounds_targets_file: text file with "comp_id target" lines;
        "#" lines are skipped, a "$" line stops reading.
    :param uri_prefix: prefix prepended to comp_id to obtain the node URI.
    :param process_compound_function: optional callback invoked with each
        database record before it is yielded.
    :param compounds_and_targets: optional iterable of (comp_id, target)
        pairs; when falsy, pairs are read from compounds_targets_file.
    :param sort_rdf_nodes_before_processing: forwarded as test_mode to the
        RDF conversion (presumably for deterministic node ordering -- see
        rdf.convert_rdf_to_nx_graph).
    :param rdf_colors_state: optional {'colors': ..., 'next_color_id': ...}
        dict to continue a previous color assignment across calls.
    :return: (chem_database generator, new_rdf_colors_state dict)
    """
    def read_compounds_and_targets():
        # Lazily parse the targets file: "#" = comment, "$" = end marker,
        # anything else is "<comp_id> <target_label>".
        with open(compounds_targets_file, "r") as ct_file:
            for line in ct_file.readlines():
                if line.startswith("#"):
                    continue
                elif line.startswith("$"):
                    break
                else:
                    # line[:-1] strips the trailing newline before splitting.
                    comp_id, target_label = tuple(line[:-1].split(" "))
                    yield unicode(comp_id), int(target_label)
    
    def chem_database_generator(full_graph, uri_node_map, type_color_map, compounds_and_targets):
        # Collect the colors of XML-Schema (literal) types so literal nodes
        # can be removed from the graph below.
        literal_colors = set()
        for rdf_type in type_color_map:
            # TODO: this condition is unsafe because it may remove not only literal colors
            if rdf_type.startswith(u"http://www.w3.org/2001/XMLSchema#"):
                literal_colors.add(type_color_map[rdf_type])
        
        # Boolean literals are kept, so exclude their colors from removal.
        # NOTE: Python 2 filter/map return lists here.
        bool_colors = filter(lambda x: x.startswith(u"http://www.w3.org/2001/XMLSchema#boolean"), type_color_map)
        bool_colors = set(map(lambda x: type_color_map[x], bool_colors))
        literal_colors -= bool_colors
         
        # nodes() returns a materialized list (NetworkX 1.x), so removing
        # nodes while iterating is safe.
        for node in full_graph.nodes():
            node_labels_set = set(full_graph.node[node]["labels"])
            # remove all literals (except booleans)
            if literal_colors & node_labels_set:
                full_graph.remove_node(node)
        
        # remove the color of named individual type from all nodes where it occurs
        named_indiv_uri = u"http://www.w3.org/2002/07/owl#NamedIndividual"
        if named_indiv_uri in type_color_map:
            named_indiv_color = type_color_map[named_indiv_uri]
            for node in full_graph.nodes_iter():
                if named_indiv_color in full_graph.node[node]["labels"]:
                    full_graph.node[node]["labels"].remove(named_indiv_color)
    
        full_hypergraph = Hypergraph(full_graph)
        
#         ################
#         # INFO: use this to remove the isMutagenic property when predicting mutagenicity
#         is_mutag_color = type_color_map[u"http://dl-learner.org/carcinogenesis#isMutagenic"]
#         edges_to_remove = []
#         for edge in full_hypergraph.edges_iter():
#             if is_mutag_color in full_hypergraph.edge(edge)['labels']:
#                 edges_to_remove.append(edge)
#         for edge in edges_to_remove:
#             full_hypergraph.safe_remove_edge(edge)
#         ################
        
        if not compounds_and_targets:
            compounds_and_targets = read_compounds_and_targets()
        
        def remove_other_neighbors_of_bool_literals(hypergraph, center_node):
            # Keep boolean literal nodes attached only to the center node:
            # drop every other neighbor of each boolean literal.
            center_neighbors = hypergraph.neighbors(center_node)
            bool_literals = filter(lambda n: set(hypergraph.node[n]['labels']) & bool_colors, center_neighbors)
            for bool_literal in bool_literals:
                bool_literal_neigbors = set(hypergraph.neighbors(bool_literal))
                # exclude the center node from the removable nodes
                bool_literal_neigbors.remove(center_node)
                for neigh in bool_literal_neigbors:
                    hypergraph.safe_remove_node(neigh)
        
        for comp_id, target_label in compounds_and_targets:
            # Node ids in the hypergraph carry an "n_" prefix over the
            # integer id mapped from the compound's URI.
            node_id = u"n_{0}".format(uri_node_map[uri_prefix + comp_id])
            # 2-ball (undirected, direction argument 0) around the compound.
            comp_neighborhood_hypergraph = algorithms.r_ball_hyper(full_hypergraph, node_id, 2, 0)
            remove_other_neighbors_of_bool_literals(comp_neighborhood_hypergraph, node_id)
            ch_db_record = (comp_id, [comp_neighborhood_hypergraph], target_label)
            if process_compound_function:
                process_compound_function(ch_db_record)
#             ############
#             def get_key(value, dictionary):
#                 for key in dictionary:
#                     if dictionary[key] == value:
#                         return key
#                 return None
#             g = ch_db_record[1][0].copy()
#             for n in g.node:
#                 n_new_labels = []
#                 for n_color in g.node[n]['labels']:
#                     n_rdf_type = get_key(n_color, type_color_map)
#                     n_rdf_type = n_rdf_type[n_rdf_type.find(u"#") + 1:]
#                     n_new_labels.append(n_rdf_type)
#                 g.node[n]['labels'] = n_new_labels
#             g.visualize()
#             ############
            yield ch_db_record
    
    # Resume a previous coloring if a saved state was provided.
    if rdf_colors_state:
        rdf_base_colors = rdf_colors_state['colors']
        rdf_next_color_id = rdf_colors_state['next_color_id']
    else:
        rdf_base_colors = None
        rdf_next_color_id = None
    
    full_graph, uri_node_map, type_color_map, next_color_id = rdf.convert_rdf_to_nx_graph(rdf_files, return_colors=True,
                                                                                          test_mode=sort_rdf_nodes_before_processing,
                                                                                          base_colors=rdf_base_colors, next_color_id=rdf_next_color_id,
                                                                                          encode_boolean_value_in_color=True)
    
    # The database is a generator: no graph processing happens until iterated.
    chem_database = chem_database_generator(full_graph, uri_node_map, type_color_map, compounds_and_targets)
    new_rdf_colors_state = {'colors': type_color_map, 'next_color_id': next_color_id}
    
    return chem_database, new_rdf_colors_state