Example #1
def build_svmlight_chemical_data(in_files, wl_iterations, output_dir, format_rdf=False, compounds_targets_file=None, uri_prefix=None,
                                 shingles_type="features", window_size=5, accumulate_wl_shingles=True, fingerprints=False,
                                 sort_rdf_nodes_before_processing=True, state_input_file=None, state_output_file=None,
                                 save_just_last_wl_it=False):
    '''Build SVMLight-format data files (one file per WL iteration) from a chemical compound database.'''
    if format_rdf:
        assert type(in_files) is list
        assert bool(compounds_targets_file)
        assert bool(uri_prefix)
    else:
        if type(in_files) is list:
            in_files = in_files[0]
    
    # One output file per WL iteration (0 .. wl_iterations)
    files = []
    for i in range(wl_iterations + 1):
        files.append(open(output_dir + "svm_light_data_wl_{0}".format(i), "w"))
    
    if state_input_file:
        state = inout.load_from_file(state_input_file)
        state['files'] = files
    else:
        wl_state = {"wl_state": None}
        shingle_id_map = {}
        if not fingerprints:
            if accumulate_wl_shingles:
                wl_state["next_shingle_id"] = 1
            else:
                for i in range(wl_iterations + 1):
                    wl_state["wl_{0}_next_shingle_id".format(i)] = 1
        
        state = {
            "files": files,
            "wl_state": wl_state,
            "shingle_id_map": shingle_id_map,
            "rdf_colors": {'colors': None, 'next_color_id': None}
        }
    
    def process_compound(chem_record):
        process_record(chem_record, wl_iterations, state, binary_target_labels=True,
                       shingles_type=shingles_type, window_size=window_size, accumulate_wl_shingles=accumulate_wl_shingles,
                       fingerprints=fingerprints, save_just_last_wl_it=save_just_last_wl_it)
    
    if format_rdf:
        chem_database, state['rdf_colors'] = prepare_rdf_chemical_data(in_files, compounds_targets_file, uri_prefix, process_compound,
                                                  sort_rdf_nodes_before_processing=sort_rdf_nodes_before_processing,
                                                  rdf_colors_state=state['rdf_colors'])
    else:
        chem_database = read_chemical_compounts(in_files, process_compound)
    
    # Iterating drives the lazy processing of the database; i is just a progress counter.
    for i, _ in enumerate(chem_database):
        print i
        
    for f in files:
        f.close()
    del state['files']  # open file handles cannot be serialized with the saved state
    
    if state_output_file:
        inout.save_to_file(state, state_output_file)
    
    print "Done."
Example #2
def calculate_ch_matrix():
    in_files = helpers.datasets[dataset]["files"]

    print "Converting RDF to NetworkX graph started at", time.strftime(
        time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files,
                                                     discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Building characteristic matrix started at", time.strftime(
        time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(
        hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database,
                                     hypergraph.number_of_nodes(),
                                     wl_iterations=wl_iterations,
                                     print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving Column index to Node map started at", time.strftime(
        time_format)
    start = time.time()
    inout.save_to_file(index_node_map,
                       path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"

    return ch_matrix, hypergraph, index_node_map, node_id_map
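
The __main__ block in Example #3 references a load_ch_matrix() counterpart (shown there commented out). A plausible sketch, assuming load_from_file loaders symmetric to the save_to_file calls above; the class-level Hypergraph.load_from_file and CharacteristicMatrix.load_from_file loaders are assumptions, not confirmed by the source:

def load_ch_matrix():
    # Hedged sketch mirroring calculate_ch_matrix's save calls.
    node_id_map = inout.load_from_file(path + "{0}_node_id_map".format(dataset))
    hypergraph = Hypergraph.load_from_file(path + "{0}_hgraph".format(dataset))
    index_node_map = inout.load_from_file(path + "{0}_index_node_map".format(dataset))
    ch_matrix = CharacteristicMatrix.load_from_file(path + "{0}_ch_matrix".format(dataset))
    return ch_matrix, hypergraph, index_node_map, node_id_map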
Example #3
    # Instance method snippet (its class is not shown here); it delegates
    # serialization to the inout helper module.
    def save_to_file(self, out_file, compress=True):
        inout.save_to_file(self, out_file, compress)

if __name__ == '__main__':
    ch_matrix, hypergraph, index_node_map, node_id_map = calculate_ch_matrix()
    #     ch_matrix, hypergraph, index_node_map, node_id_map = load_ch_matrix()

    sketch_matrix = calculate_sketch_matrix(ch_matrix, hypergraph)
    #     sketch_matrix, index_node_map, node_id_map = load_sketch_matrix()

    print "Building similarity matrix started at", time.strftime(time_format)
    start = time.time()
    sim_mat = similar_nodes_mining.get_node_similarity_matrix(sketch_matrix)
    print "Building similarity matrix took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Extracting similar nodes started at", time.strftime(time_format)
    start = time.time()
    similar_nodes = similar_nodes_mining.get_all_similar_nodes(
        sim_mat, index_node_map)
    print "Extracting similar nodes took", time.time() - start, "s"
    print "-----------------------------------------"

    print "Saving similar nodes started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(similar_nodes,
                       path + "{0}_similar_nodes".format(dataset))
    print "Saving similar nodes took", time.time() - start, "s"
    print "-----------------------------------------"

    print "DONE!"
Example #4
def extract_rballs_from_rdf_server(entries, output_dir, r, edge_dir, sparql_endpoint="http://localhost:3030/ds/query",
                                   entries_count_expected=-1, sort_rdf_nodes_before_processing=True):
    '''Extract r-balls around the given entry nodes from the graph on the server using SPARQL queries.
    (A usage sketch follows the function below.)
    :param entries: the entry nodes (resources, URIs/IRIs) which will serve as center nodes of the r-balls
    :param output_dir: the directory for writing the output files
    :param r: radius of the r-balls
    :param edge_dir: the direction of edges to be considered (0 - all edges, 1 - only outgoing, -1 - only incoming)
    :param sparql_endpoint: URL of the SPARQL endpoint. Default is http://localhost:3030/ds/query (for Apache Jena Fuseki)
    :param entries_count_expected: expected number of entries to process (used only for the time-left estimate)
    :param sort_rdf_nodes_before_processing: if True, sort RDF nodes before processing so that repeated runs yield the same colors
    '''
    colors = None
    next_color_id = None
    
    nodes_count_distribution = {}
    type_distribution = {}
    def update_stats(nodes_count, target_labels, colors):
        def get_target_uri_map():
            target_uri_map = {}
            for uri in colors:
                if colors[uri] in target_labels:
                    target_uri_map[colors[uri]] = uri
                    if len(target_uri_map) == len(target_labels):
                        break
            return target_uri_map
        
        if nodes_count not in nodes_count_distribution:
            nodes_count_distribution[nodes_count] = 0
        nodes_count_distribution[nodes_count] += 1
        
        target_uri_map = get_target_uri_map()
        for target in target_uri_map:
            type_uri = target_uri_map[target]
            if type_uri not in type_distribution:
                type_distribution[type_uri] = 0
            type_distribution[type_uri] += 1
    
    start_time = time.time()
    
    for i, entry_uri in enumerate(entries):
#         # TODO: specific case of 2-in-balls
#         query_status, rdf_r_ball = rdf.quary_2_in_ball(entry_uri, sparql_endpoint)
        query_status, rdf_r_ball = rdf.quary_r_ball(entry_uri, r, edge_dir, sparql_endpoint, ignore_type_paths=True, include_types=True)
        assert not query_status
        r_ball, uri_nodes_map, colors, next_color_id = rdf.convert_rdf_graph_to_nx_graph(rdf_r_ball, test_mode=sort_rdf_nodes_before_processing,
                                                                                         return_colors=True, base_colors=colors, next_color_id=next_color_id)
        if entry_uri not in uri_nodes_map:
            # in case the r-ball is empty
            node_id = 0
            r_ball.add_node(node_id, labels=["0"])
            uri_nodes_map[entry_uri] = node_id
        
        center_node = uri_nodes_map[entry_uri]
        target_labels = list(r_ball.node[center_node]["labels"])
        # Make the center node color 0 (owl:Thing).
        # The original colors of the center node serve as target labels of the r-ball.
        r_ball.node[center_node]["labels"] = ["0"]
        hyper_r_ball = Hypergraph(r_ball)
        nodes_count = r_ball.number_of_nodes()
        if i % 10 == 0:  # print progress every 10 records
            elapsed_time = time.time() - start_time
            if entries_count_expected == -1 or i == 0:
                time_est = "Elapsed time: {0:.2f}s".format(elapsed_time)
            else:
                time_left = (elapsed_time / i) * (entries_count_expected - i) 
                time_est = "Time left: {0:.2f}s".format(time_left)
            print i, time_est, nodes_count, entry_uri, target_labels
        update_stats(nodes_count, target_labels, colors)
        graph_database_record = (entry_uri, [hyper_r_ball], target_labels)
        inout.save_to_file(graph_database_record, output_dir + "r_ball_{0}".format(i))
    
    return nodes_count_distribution, type_distribution
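
As referenced in the docstring, a minimal usage sketch. The entry URIs and the output directory are invented for illustration; the endpoint default targets a local Apache Jena Fuseki server:

# Hedged usage sketch: the entry URIs and the output directory below are
# assumptions and must resolve/exist in a real setup.
entries = ["http://example.org/resource/node1",
           "http://example.org/resource/node2"]
nodes_dist, type_dist = extract_rballs_from_rdf_server(
    entries, "output/rballs/", r=2, edge_dir=0,  # 2-balls over edges in both directions
    entries_count_expected=len(entries))
print nodes_dist  # r-ball size -> frequency, e.g. {5: 1, 12: 1}
print type_dist   # type URI -> frequency among the target labels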