def build_svmlight_chemical_data(in_files, wl_iterations, output_dir, format_rdf=False,
                                 compounds_targets_file=None, uri_prefix=None,
                                 shingles_type="features", window_size=5,
                                 accumulate_wl_shingles=True, fingerprints=False,
                                 sort_rdf_nodes_before_processing=True,
                                 state_input_file=None, state_output_file=None,
                                 save_just_last_wl_it=False):
    if format_rdf:
        assert type(in_files) is list
        assert bool(compounds_targets_file)
        assert bool(uri_prefix)
    else:
        if type(in_files) is list:
            in_files = in_files[0]
    
    # One output file per WL iteration (0 .. wl_iterations).
    files = []
    for i in range(wl_iterations + 1):
        files.append(open(output_dir + "svm_light_data_wl_{0}".format(i), "w"))
    
    if state_input_file:
        # Resume from a previously saved state; only the file handles are fresh.
        state = inout.load_from_file(state_input_file)
        state['files'] = files
    else:
        wl_state = {"wl_state": None}
        shingle_id_map = {}
        if not fingerprints:
            if accumulate_wl_shingles:
                wl_state["next_shingle_id"] = 1
            else:
                # Keep a separate shingle-ID counter per WL iteration.
                for i in range(wl_iterations + 1):
                    wl_state["wl_{0}_next_shingle_id".format(i)] = 1
        state = {
            "files": files,
            "wl_state": wl_state,
            "shingle_id_map": shingle_id_map,
            "rdf_colors": {'colors': None, 'next_color_id': None}
        }
    
    def process_compound(chem_record):
        process_record(chem_record, wl_iterations, state, binary_target_labels=True,
                       shingles_type=shingles_type, window_size=window_size,
                       accumulate_wl_shingles=accumulate_wl_shingles,
                       fingerprints=fingerprints,
                       save_just_last_wl_it=save_just_last_wl_it)
    
    if format_rdf:
        chem_database, state['rdf_colors'] = prepare_rdf_chemical_data(
            in_files, compounds_targets_file, uri_prefix, process_compound,
            sort_rdf_nodes_before_processing=sort_rdf_nodes_before_processing,
            rdf_colors_state=state['rdf_colors'])
    else:
        chem_database = read_chemical_compounts(in_files, process_compound)
    
    # Consuming the generator drives the processing of all records.
    for i, _ in enumerate(chem_database):
        print i
    
    for f in files:
        f.close()
    
    del state['files']
    
    if state_output_file:
        inout.save_to_file(state, state_output_file)
    
    print "Done."
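# Example usage (a hedged sketch, not part of the module): the file paths and
# the URI prefix below are hypothetical and stand in for a real RDF chemical
# dataset with a separate compounds/targets file.
#
# build_svmlight_chemical_data(["data/compounds.rdf"], wl_iterations=4,
#                              output_dir="output/", format_rdf=True,
#                              compounds_targets_file="data/compounds_targets",
#                              uri_prefix="http://example.org/compound/",
#                              state_output_file="output/wl_state")
#
# The call writes one SVMLight file per WL iteration (svm_light_data_wl_0 to
# svm_light_data_wl_4) into output/ and saves the shingle/WL state, so a later
# run can pass it back via state_input_file to keep feature IDs consistent.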
def calculate_ch_matrix():
    in_files = helpers.datasets[dataset]["files"]
    
    print "Converting RDF to NetworkX graph started at", time.strftime(time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Building characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(
        hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(),
                                     wl_iterations=wl_iterations, print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Saving Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    
    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    
    return ch_matrix, hypergraph, index_node_map, node_id_map
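# A minimal sketch of the load_ch_matrix counterpart referenced (commented out)
# in the __main__ block below. It assumes CharacteristicMatrix and Hypergraph
# expose load_from_file methods mirroring their save_to_file methods; if they
# do not, the saved files can be read back with inout.load_from_file instead.
def load_ch_matrix():
    # Reload the artifacts written to disk by calculate_ch_matrix().
    ch_matrix = CharacteristicMatrix.load_from_file(path + "{0}_ch_matrix".format(dataset))
    hypergraph = Hypergraph.load_from_file(path + "{0}_hgraph".format(dataset))
    index_node_map = inout.load_from_file(path + "{0}_index_node_map".format(dataset))
    node_id_map = inout.load_from_file(path + "{0}_node_id_map".format(dataset))
    return ch_matrix, hypergraph, index_node_map, node_id_map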
def save_to_file(self, out_file, compress=True):
    inout.save_to_file(self, out_file, compress)
print "-----------------------------------------" return sketch_matrix, index_node_map, node_id_map if __name__ == '__main__': ch_matrix, hypergraph, index_node_map, node_id_map = calculate_ch_matrix() # ch_matrix, hypergraph, index_node_map, node_id_map = load_ch_matrix() sketch_matrix = calculate_sketch_matrix(ch_matrix, hypergraph) # sketch_matrix, index_node_map, node_id_map = load_sketch_matrix() print "Building similarity matrix started at", time.strftime(time_format) start = time.time() sim_mat = similar_nodes_mining.get_node_similarity_matrix(sketch_matrix) print "Building similarity matrix took", time.time() - start, "s" print "-----------------------------------------" print "Extracting similar nodes started at", time.strftime(time_format) start = time.time() similar_nodes = similar_nodes_mining.get_all_similar_nodes(sim_mat, index_node_map) print "Extracting similar nodes took", time.time() - start, "s" print "-----------------------------------------" print "Saving similar nodes started at", time.strftime(time_format) start = time.time() inout.save_to_file(similar_nodes, path + "{0}_similar_nodes".format(dataset)) print "Saving similar nodes took", time.time() - start, "s" print "-----------------------------------------" print "DONE!"
def extract_rballs_from_rdf_server(entries, output_dir, r, edge_dir,
                                   sparql_endpoint="http://localhost:3030/ds/query",
                                   entries_count_expected=-1,
                                   sort_rdf_nodes_before_processing=True):
    '''Extract r-balls around the given entry nodes from the graph on the server
    using SPARQL queries.
    
    :param entries: the entry nodes (resources, URIs/IRIs) which will serve as
    center nodes of the r-balls
    :param output_dir: the directory for writing the output files
    :param r: radius of the r-balls
    :param edge_dir: the direction of edges to be considered (0 - all edges,
    1 - only outgoing, -1 - only incoming)
    :param sparql_endpoint: URL of the SPARQL endpoint. Default is
    http://localhost:3030/ds/query (for Apache Jena Fuseki)
    :param entries_count_expected: expected number of entries to process (used
    only for estimating the remaining time)
    :param sort_rdf_nodes_before_processing: sort RDF nodes before processing,
    to yield the same colors in multiple runs
    '''
    colors = None
    next_color_id = None
    
    nodes_count_distribution = {}
    type_distribution = {}
    
    def update_stats(nodes_count, target_labels, colors):
        def get_target_uri_map():
            # Invert the colors map to recover the type URI for each target label.
            target_uri_map = {}
            for uri in colors:
                if colors[uri] in target_labels:
                    target_uri_map[colors[uri]] = uri
                    if len(target_uri_map) == len(target_labels):
                        break
            return target_uri_map
        
        if nodes_count not in nodes_count_distribution:
            nodes_count_distribution[nodes_count] = 0
        nodes_count_distribution[nodes_count] += 1
        
        target_uri_map = get_target_uri_map()
        for target in target_uri_map:
            type_uri = target_uri_map[target]
            if type_uri not in type_distribution:
                type_distribution[type_uri] = 0
            type_distribution[type_uri] += 1
    
    start_time = time.time()
    
    for i, entry_uri in enumerate(entries):
        # # TODO: specific case of 2-in-balls
        # query_status, rdf_r_ball = rdf.quary_2_in_ball(entry_uri, sparql_endpoint)
        query_status, rdf_r_ball = rdf.quary_r_ball(entry_uri, r, edge_dir, sparql_endpoint,
                                                    ignore_type_paths=True, include_types=True)
        assert not query_status
        r_ball, uri_nodes_map, colors, next_color_id = rdf.convert_rdf_graph_to_nx_graph(
            rdf_r_ball, test_mode=sort_rdf_nodes_before_processing, return_colors=True,
            base_colors=colors, next_color_id=next_color_id)
        if entry_uri not in uri_nodes_map:
            # In case the r-ball is empty, create a single center node.
            node_id = 0
            r_ball.add_node(node_id, labels=["0"])
            uri_nodes_map[entry_uri] = node_id
        center_node = uri_nodes_map[entry_uri]
        target_labels = list(r_ball.node[center_node]["labels"])
        # Make the center node of color 0 (owl:Thing).
        # The original colors of the center node serve as target labels of the r-ball.
        r_ball.node[center_node]["labels"] = ["0"]
        hyper_r_ball = Hypergraph(r_ball)
        nodes_count = r_ball.number_of_nodes()
        if i % 10 == 0:
            # Print progress every 10 records.
            elapsed_time = time.time() - start_time
            if entries_count_expected == -1 or i == 0:
                time_est = "Elapsed time: {0:.2f}s".format(elapsed_time)
            else:
                time_left = (elapsed_time / i) * (entries_count_expected - i)
                time_est = "Time left: {0:.2f}s".format(time_left)
            print i, time_est, nodes_count, entry_uri, target_labels
        update_stats(nodes_count, target_labels, colors)
        graph_database_record = (entry_uri, [hyper_r_ball], target_labels)
        inout.save_to_file(graph_database_record, output_dir + "r_ball_{0}".format(i))
    
    return nodes_count_distribution, type_distribution
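# Example usage (a hedged sketch): the entry URIs and output directory are
# hypothetical; the endpoint is the Fuseki default from the signature. With
# r=2 and edge_dir=-1 this extracts 2-balls over incoming edges, as in the
# 2-in-balls case mentioned in the TODO above.
#
# entries = inout.load_from_file("data/entry_uris")
# nodes_dist, type_dist = extract_rballs_from_rdf_server(
#     entries, "output/rballs/", r=2, edge_dir=-1,
#     sparql_endpoint="http://localhost:3030/ds/query",
#     entries_count_expected=len(entries))
# print nodes_dist
# print type_dist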