def loo_crossval_naive(graph_database, wl_iter_range, param_2_range, quality_function, output_dir, base_model={}, shingles_type="features", window_size=5, accumulate_wl_shingles=True): '''Similar to loo_crossval_sketch but computes directly the Jaccard similarities between the columns in the characteristic matrix, without using a sketch matrix. Not applicable for big datasets. ''' best_model = model_p(-1, -1, -1, base_model=base_model) cols_count = len(graph_database) models_file = open(output_dir + "models_naive", "a") for wl_iterations in wl_iter_range: ch_matrix = CharacteristicMatrix(graph_database, cols_count, wl_iterations=wl_iterations, shingles_type=shingles_type, window_size=window_size, accumulate_wl_shingles=accumulate_wl_shingles) jaccard_similarity_matrix = ch_matrix.compute_jaccard_similarity_matrix() for p in param_2_range: avg_quality = 0. for i in range(cols_count): avg_quality += float(quality_function(i, jaccard_similarity_matrix, p)) avg_quality /= cols_count current_model = model_p(avg_quality, wl_iterations, p, base_model=base_model) print current_model models_file.write(str(current_model) + ",\n") models_file.flush() if avg_quality > best_model["quality"]: best_model = current_model if not base_model: # print best model when there are no outer parameters models_file.write("Best model: " + str(best_model) + "\n") models_file.close() return best_model
def testCharacteristicMatrix_JaccardSimMatrix(self):
    """The exact Jaccard similarity matrix computed from the dummy graph's
    r-balls must match the stored expected matrix."""
    hgraph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(hgraph, r_in=3, r_out=2, r_all=0)
    char_matrix = CharacteristicMatrix(database, hgraph.number_of_nodes(), wl_iterations=0)
    computed_sim = char_matrix.compute_jaccard_similarity_matrix()
    matrices_match = (self.ch_matrix_jaccard_sim_exp == computed_sim).all()
    self.assertTrue(matrices_match, "The computed Jaccard similarity matrix is wrong.")
def testCharacteristicMatrix_ReadWrite(self):
    """Round-trip a characteristic matrix through a file and compare."""
    file_name = "test_files/characteristic_matrix.tmp"
    source_graph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(source_graph, r_in=2, r_out=2, r_all=0)
    original = CharacteristicMatrix(database, source_graph.number_of_nodes(), wl_iterations=4)
    original.save_to_file(file_name)
    restored = CharacteristicMatrix.load_from_file(file_name)
    self.assertEqual(restored, original, "The read characteristic matrix is different from the saved one.")
def calculate_ch_matrix():
    # Build the hypergraph and characteristic matrix for the configured
    # dataset from its RDF input files, timing and logging every step and
    # persisting each intermediate artifact (NodeID map, hypergraph,
    # index->node map, characteristic matrix) under `path`.
    # NOTE(review): relies on module-level globals: dataset, path,
    # time_format, r_in, r_out, r_all, wl_iterations -- confirm they are
    # defined before this is called.
    in_files = helpers.datasets[dataset]["files"]
    print "Converting RDF to NetworkX graph started at", time.strftime(time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Building characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    # Extract the r-ball neighborhood database; presumably each r-ball maps
    # to one matrix column (cols_count = number of nodes) -- TODO confirm.
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(), wl_iterations=wl_iterations, print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    return ch_matrix, hypergraph, index_node_map, node_id_map
def testSimilarNodesMining(self):
    """The sketch-based node similarity matrix should agree with the exact
    Jaccard similarities thresholded at 0.8."""
    hgraph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(hgraph, r_in=3, r_out=2, r_all=0)
    char_matrix = CharacteristicMatrix(database, hgraph.number_of_nodes(), wl_iterations=0)
    exact_jaccard = char_matrix.compute_jaccard_similarity_matrix()
    expected = np.array(exact_jaccard >= 0.8, dtype=np.float32)
    sketch = SketchMatrix(25, 265, char_matrix)
    actual = similar_nodes_mining.get_node_similarity_matrix(sketch)
    self.assertTrue((expected == actual).all(), "The computed similarity matrix is wrong (Keep in mind that the sketch_matrix is probabilistic, therefore, it may not be always correct. The test may pass in another run.).")
def testCharacteristicMatrix_JaccardSimMatrix(self):
    """Exact Jaccard similarity matrix of the dummy graph must equal the
    precomputed expected matrix."""
    graph = Hypergraph(example_graphs.snm_dummy_graph)
    rballs, _ = similar_nodes_mining.extract_rballs_database(graph, r_in=3, r_out=2, r_all=0)
    total_nodes = graph.number_of_nodes()
    cm = CharacteristicMatrix(rballs, total_nodes, wl_iterations=0)
    result = cm.compute_jaccard_similarity_matrix()
    self.assertTrue((self.ch_matrix_jaccard_sim_exp == result).all(), "The computed Jaccard similarity matrix is wrong.")
def loo_crossval_naive(graph_database, wl_iter_range, param_2_range, quality_function, output_dir, base_model={}, shingles_type="features", window_size=5, accumulate_wl_shingles=True): '''Similar to loo_crossval_sketch but computes directly the Jaccard similarities between the columns in the characteristic matrix, without using a sketch matrix. Not applicable for big datasets. ''' best_model = model_p(-1, -1, -1, base_model=base_model) cols_count = len(graph_database) models_file = open(output_dir + "models_naive", "a") for wl_iterations in wl_iter_range: ch_matrix = CharacteristicMatrix( graph_database, cols_count, wl_iterations=wl_iterations, shingles_type=shingles_type, window_size=window_size, accumulate_wl_shingles=accumulate_wl_shingles) jaccard_similarity_matrix = ch_matrix.compute_jaccard_similarity_matrix( ) for p in param_2_range: avg_quality = 0. for i in range(cols_count): avg_quality += float( quality_function(i, jaccard_similarity_matrix, p)) avg_quality /= cols_count current_model = model_p(avg_quality, wl_iterations, p, base_model=base_model) print current_model models_file.write(str(current_model) + ",\n") models_file.flush() if avg_quality > best_model["quality"]: best_model = current_model if not base_model: # print best model when there are no outer parameters models_file.write("Best model: " + str(best_model) + "\n") models_file.close() return best_model
def testCharacteristicMatrix_ReadWrite(self):
    """Saving then loading a characteristic matrix must yield an equal one."""
    tmp_path = "test_files/characteristic_matrix.tmp"
    graph = Hypergraph(example_graphs.snm_dummy_graph)
    rballs, _ = similar_nodes_mining.extract_rballs_database(graph, r_in=2, r_out=2, r_all=0)
    written = CharacteristicMatrix(rballs, graph.number_of_nodes(), wl_iterations=4)
    written.save_to_file(tmp_path)
    loaded = CharacteristicMatrix.load_from_file(tmp_path)
    self.assertEqual(loaded, written, "The read characteristic matrix is different from the saved one.")
def load_ch_matrix():
    # Load the artifacts previously persisted by calculate_ch_matrix for the
    # configured dataset: NodeID map, hypergraph, characteristic matrix and
    # column-index-to-node map, timing and logging each read.
    # NOTE(review): relies on module-level globals: dataset, path,
    # time_format -- confirm they are defined before this is called.
    print "Reading NodeID map started at", time.strftime(time_format)
    start = time.time()
    node_id_map = inout.load_from_file(path + "{0}_node_id_map".format(dataset))
    print "Reading NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph.load_from_file(path + "{0}_hgraph".format(dataset))
    print "Reading hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix = CharacteristicMatrix.load_from_file(path + "{0}_ch_matrix".format(dataset))
    print "Reading characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    index_node_map = inout.load_from_file(path + "{0}_index_node_map".format(dataset))
    print "Reading Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    return ch_matrix, hypergraph, index_node_map, node_id_map
def load_ch_matrix():
    # Inverse of calculate_ch_matrix: read back the persisted NodeID map,
    # hypergraph, characteristic matrix and index->node map for `dataset`
    # from `path`, printing start times and durations for each step.
    # NOTE(review): uses module-level globals dataset, path, time_format.
    print "Reading NodeID map started at", time.strftime(time_format)
    start = time.time()
    node_id_map = inout.load_from_file(path + "{0}_node_id_map".format(dataset))
    print "Reading NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph.load_from_file(path + "{0}_hgraph".format(dataset))
    print "Reading hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix = CharacteristicMatrix.load_from_file(path + "{0}_ch_matrix".format(dataset))
    print "Reading characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Reading Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    index_node_map = inout.load_from_file(path + "{0}_index_node_map".format(dataset))
    print "Reading Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    return ch_matrix, hypergraph, index_node_map, node_id_map
def testGetSimilarNodesToQueryNode(self):
    """Querying node "n_7" must return the expected similar-node indices."""
    hgraph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(hgraph, r_in=3, r_out=2, r_all=0)
    char_matrix = CharacteristicMatrix(database, hgraph.number_of_nodes(), wl_iterations=0)
    sketch = SketchMatrix(25, 265, char_matrix)
    expected_nodes = np.array([0, 5, 7])
    found_nodes, _ = similar_nodes_mining.get_similar_nodes("n_7", hgraph, sketch, 0, [], r_in=3, r_out=2, r_all=0)
    outcome = expected_nodes == found_nodes
    if not isinstance(outcome, bool):
        # An elementwise numpy comparison must be reduced to one bool.
        outcome = outcome.all()
    self.assertTrue(outcome, "Wrong similar nodes were extracted (Keep in mind that the sketch_matrix is probabilistic, therefore, it may not be always correct. The test may pass in another run.).")
def calculate_ch_matrix():
    # End-to-end build pipeline for the configured dataset: RDF files ->
    # NetworkX graph -> hypergraph -> r-ball database -> characteristic
    # matrix, saving each artifact to `path` and printing per-step timings.
    # NOTE(review): depends on module-level globals dataset, path,
    # time_format, r_in, r_out, r_all, wl_iterations.
    in_files = helpers.datasets[dataset]["files"]
    print "Converting RDF to NetworkX graph started at", time.strftime(time_format)
    start = time.time()
    graph, node_id_map = rdf.convert_rdf_to_nx_graph(in_files, discard_classes=False)
    print "Converting RDF to NetworkX graph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving NodeID map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(node_id_map, path + "{0}_node_id_map".format(dataset))
    print "Saving NodeID map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Building hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph = Hypergraph(graph)
    print "Building hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving hypergraph started at", time.strftime(time_format)
    start = time.time()
    hypergraph.save_to_file(path + "{0}_hgraph".format(dataset))
    print "Saving hypergraph took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Building characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    rballs_database, index_node_map = similar_nodes_mining.extract_rballs_database(hypergraph, r_in=r_in, r_out=r_out, r_all=r_all)
    ch_matrix = CharacteristicMatrix(rballs_database, hypergraph.number_of_nodes(), wl_iterations=wl_iterations, print_progress=True)
    print "Building characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving Column index to Node map started at", time.strftime(time_format)
    start = time.time()
    inout.save_to_file(index_node_map, path + "{0}_index_node_map".format(dataset))
    print "Saving Column index to Node map took", time.time() - start, "s"
    print "-----------------------------------------"
    print "Saving characteristic matrix started at", time.strftime(time_format)
    start = time.time()
    ch_matrix.save_to_file(path + "{0}_ch_matrix".format(dataset))
    print "Saving characteristic matrix took", time.time() - start, "s"
    print "-----------------------------------------"
    return ch_matrix, hypergraph, index_node_map, node_id_map
def testCharacteristicMatrix(self):
    """The raw sparse characteristic matrix of the dummy graph must equal
    the precomputed expectation."""
    hgraph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(hgraph, r_in=3, r_out=2, r_all=0)
    char_matrix = CharacteristicMatrix(database, hgraph.number_of_nodes(), wl_iterations=0)
    self.assertEqual(self.raw_ch_matrix_exp, char_matrix.sparse_matrix, "The computed characteristic matrix is wrong.")
def testSimilarNodesMining(self):
    """Sketch-derived similarity matrix should match the >= 0.8 threshold of
    the exact Jaccard similarity matrix."""
    graph = Hypergraph(example_graphs.snm_dummy_graph)
    rballs, _ = similar_nodes_mining.extract_rballs_database(graph, r_in=3, r_out=2, r_all=0)
    cm = CharacteristicMatrix(rballs, graph.number_of_nodes(), wl_iterations=0)
    jaccard = cm.compute_jaccard_similarity_matrix()
    reference = np.array(jaccard >= 0.8, dtype=np.float32)
    sk = SketchMatrix(25, 265, cm)
    produced = similar_nodes_mining.get_node_similarity_matrix(sk)
    all_equal = (reference == produced).all()
    self.assertTrue(all_equal, "The computed similarity matrix is wrong (Keep in mind that the sketch_matrix is probabilistic, therefore, it may not be always correct. The test may pass in another run.).")
def testSketchMatrix_ReadWrite(self):
    """Round-trip a sketch matrix through a file and compare raw matrices."""
    tmp_path = "test_files/sketch_matrix.tmp"
    hgraph = Hypergraph(example_graphs.snm_dummy_graph)
    database, _ = similar_nodes_mining.extract_rballs_database(hgraph, r_in=2, r_out=2, r_all=0)
    char_matrix = CharacteristicMatrix(database, hgraph.number_of_nodes(), wl_iterations=4)
    saved = SketchMatrix(5, 20, char_matrix)
    saved.save_to_file(tmp_path)
    loaded = SketchMatrix.load_from_file(tmp_path)
    self.assertTrue((loaded.matrix == saved.matrix).all(), "The read sketch matrix is different from the saved one.")
def loo_crossval_sketch(graph_database, wl_iter_range, k_L_range, output_dir, base_model={}, cols_count=None, shingles_type="features", window_size=5): '''Leave-one-out cross-validation. :param graph_database: Defined the same way as for CharacteristicMatrix constructor (but cannot be a generator). :param wl_iter_range: Range of Weisfeiler-Lehman iterations to be considered in the cross-validation. :param k_L_range: A range of (k, L) tuples for the sketch matrix to be considered in the cross-validation. # :param quality_function: a function with signature (G, sketch_matrix), where G is a list of graphs # representing a single entity in the database and sketch_matrix is a sketch matrix. The function # should return a real value. The cross-validation will find the model that maximizes this function. :param output_dir: A local directory, that will be used to save the sketch matrices of all models. :param base_model: A base model that is going to be extended by the new parameters. :return The best model as a dictionary. ''' def quality(i, sketch_matrix): col_i = sketch_matrix.get_column(i) similar_cols = list(sketch_matrix.get_similar_columns(col_i)) if i in similar_cols: similar_cols.remove(i) similar_targets = map(lambda c: graph_database[c][2], similar_cols) true_target_i = graph_database[i][2] estimated_target_i = statistics.predict_target_majority( similar_targets) # print "Col:", i, ", Target:", true_target_i, ", Est. target: ", estimated_target_i # print "Similar cols:", similar_cols # print "Similar targets:", similar_targets # print "--------------------------------------" # fp = open(output_dir + "classification_sketch", "a") # fp.write("Col: {0}, Target: {1}, Est. 
target: {2}\n".format(i, true_target_i, estimated_target_i)) # fp.write("Similar cols: {0}\n".format(similar_cols)) # fp.write("Similar targets: {0}\n".format(similar_targets)) # fp.write("--------------------------------------\n") # fp.close() if type(true_target_i) is list: return int(estimated_target_i in true_target_i) # zero-one loss else: return int(true_target_i == estimated_target_i) # zero-one loss best_model = model_infl_point(-1, -1, base_model=base_model) if not cols_count: cols_count = len(graph_database) models_file = open(output_dir + "models_sketch", "a") for wl_iterations in wl_iter_range: # start = time.time() ch_matrix = CharacteristicMatrix(graph_database, cols_count, wl_iterations=wl_iterations, print_progress=True, shingles_type=shingles_type, window_size=window_size) # print "Building characteristic matrix for wl_iter =", wl_iterations, "took:", time.time() - start for k, L in k_L_range: # start = time.time() sketch_matrix = SketchMatrix(k, L, ch_matrix) # print "Building sketch matrix for k={0} and L={1} took:".format(k, L), time.time() - start # sketch_matrix.save_to_file(output_dir + "sketch_matrix_wl{0}_k{1}_L{2}".format(wl_iterations, k, L)) # start = time.time() avg_quality = 0. for i in range(cols_count): avg_quality += float(quality(i, sketch_matrix)) avg_quality /= cols_count # print "Classification took:", time.time() - start current_model = model_infl_point(avg_quality, wl_iterations, k, L, base_model=base_model) print current_model models_file.write(str(current_model) + ",\n") models_file.flush() if avg_quality > best_model["quality"]: best_model = current_model if not base_model: # print best model when there are no outer parameters models_file.write("Best model: " + str(best_model) + "\n") models_file.close() return best_model
def d_fold_crossval(data, cols_count, d, k_L_range, output_dir, base_model={},
                    multilabel=False, multilabel_prediction_threshold=0.4):
    '''Cross-validation in d-folds.
    :param data: Input data, where each record is a tuple of the form
    (target, props), where props is a sparse vector.
    :param cols_count: Number of records in data.
    :param d: Number of cross-validation folds.
    :param k_L_range: A range of (k, L) tuples for the sketch matrix to be
    considered in the cross-validation.
    :param output_dir: A local directory, that will be used to save the
    sketch matrices of all models.
    :param base_model: A base model that is going to be extended by the new
    parameters.
    :param multilabel: (default False) If True, a record may have multiple
    different integer target labels. If False, the target labels are binary.
    :param multilabel_prediction_threshold: Minimum proportion a label must
    reach among similar columns to be predicted (multilabel mode only).
    :return: The best model as a dictionary.
    '''
    # NOTE(review): base_model={} is a shared mutable default argument --
    # harmless only as long as no callee mutates it; confirm.
    def quality(sketch_matrix, train_sketch, test_sketch, train_targets, test_targets):
        # Score one fold: predict each test column's target(s) from its
        # similar training columns; returns a 5-tuple of scores
        # (first element is used as "auc" by the caller below).
        k = sketch_matrix.k
        L = sketch_matrix.L
        train_cols_count = np.shape(train_sketch)[1]
        if multilabel:
            test_targets_pred = []
        else:
            test_targets_proba = np.empty(len(test_targets))
        for i in range(np.shape(test_sketch)[1]):
            # i-th test column kept as a 2-D slice (matrix column).
            col_i = test_sketch[:, i:i + 1]
            similar_cols = SketchMatrix._get_similar_columns(col_i, train_sketch, k, L, train_cols_count)
            # Flatten the target lists of all similar training columns.
            similar_targets = itertools.chain(*map(lambda c: train_targets[c], similar_cols))
            if multilabel:
                # Predict every label whose share among similar columns
                # exceeds the prediction threshold.
                target_proportions = statistics.get_multilabel_target_proportions(similar_targets, np.shape(similar_cols)[0])
                targets_pred = filter(lambda target: target_proportions[target] > multilabel_prediction_threshold, target_proportions)
                test_targets_pred.append(targets_pred)
            else:
                estimated_target_proba_i = statistics.predict_binary_target_proba(similar_targets)
                test_targets_proba[i] = estimated_target_proba_i
        if multilabel:
            # TODO: compute also AUC?
            acc, prec, recall, f1 = statistics.multi_label_scores(test_targets, test_targets_pred)
            # AUC slot is -1 since it is not computed in multilabel mode.
            return -1., acc, prec, recall, f1
        else:
            return all_scores(test_targets, test_targets_proba)

    best_model = model_score([-1., -1., -1., -1., -1.], base_model=base_model)
    models_file = open(output_dir + "models_sketch", "a")
    start = time.time()
    ch_matrix = CharacteristicMatrix(records=data, cols_count=cols_count, print_progress=True)
    targets = ch_matrix.target_values
    print "Building characteristic matrix took:", time.time() - start
    for k, L in k_L_range:
        start = time.time()
        sketch_matrix = SketchMatrix(k, L, ch_matrix)
        print "Building sketch matrix for k={0} and L={1} took:".format(k, L), time.time() - start
        start = time.time()
        avg_score = d_folds(d, sketch_matrix, cols_count, quality, targets)
        print "Classification took:", time.time() - start
        current_model = model_score(avg_score, k, L, base_model=base_model)
        print current_model
        models_file.write(str(current_model) + ",\n")
        models_file.flush()
        # Models are ranked by avg_score[0], stored as "auc" in the model.
        if avg_score[0] > best_model["auc"]:
            best_model = current_model
    if not base_model:
        # print best model when there are no outer parameters
        models_file.write("Best model: " + str(best_model) + "\n")
    models_file.close()
    return best_model