def testShingleExtraction(self):
    """Verify that shingle extraction on the dummy feature graph yields
    exactly the expected shingle strings, in order.

    The fixture below is the full expected output of
    shingle_extraction.extract_shingles for
    example_graphs.snm_dummy_feature.
    """
    shingles_exp = [
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),2)"
    ]
    shingles = shingle_extraction.extract_shingles(
        example_graphs.snm_dummy_feature)
    # extract_shingles may return a lazy iterable; materialize it so the
    # list comparison is well-defined.
    self.assertEqual(shingles_exp, list(shingles),
                     "Wrong shingles were extracted from feature.")
def testShingleExtraction(self):
    """Verify that shingle extraction on the dummy feature graph yields
    exactly the expected shingle strings, in order.

    The fixture below is the full expected output of
    shingle_extraction.extract_shingles for
    example_graphs.snm_dummy_feature.
    """
    shingles_exp = [
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),2)"
    ]
    shingles = shingle_extraction.extract_shingles(example_graphs.snm_dummy_feature)
    # extract_shingles may return a lazy iterable; materialize it so the
    # list comparison is well-defined.
    self.assertEqual(shingles_exp, list(shingles),
                     "Wrong shingles were extracted from feature.")
def inner(query_features):
    """Generator: flatten a nested iterable of features and yield the
    fingerprint of every shingle extracted from each feature.
    """
    for feature_group in query_features:
        for single_feature in feature_group:
            feature_shingles = shingle_extraction.extract_shingles(single_feature)
            for fprint in fingerprint.get_fingerprints(feature_shingles):
                yield fprint
def get_minhash_fingerprint_naive(feature, h, cached_shingles_dict=None):
    '''Get naively the fingerprint of the shingle which has minimal index
    (w.r.t. the permutation defined by h) among all shingles contained in
    the feature.

    :param feature: a Networkx graph.
    :param h: a hash function defining a permutation of fingerprints
        (used as the min() key).
    :param cached_shingles_dict: (optional) a read/write cache of the form
        {feature_id: set_of_fingerprints_of_shingles}, keyed by the
        feature's canonical representation id; populated on cache miss.
    :return: an integer fingerprint of a shingle.
    '''
    # isinstance is the idiomatic type check and also accepts dict
    # subclasses (e.g. OrderedDict), unlike the former `type(...) is dict`.
    if isinstance(cached_shingles_dict, dict):
        _, feature_id = arnborg_proskurowski.get_canonical_representation(feature)
        if feature_id in cached_shingles_dict:
            fingerprints = cached_shingles_dict[feature_id]
        else:
            shingles = shingle_extraction.extract_shingles(feature)
            # Materialize as a set so the cache entry is reusable.
            fingerprints = set(get_fingerprints(shingles))
            cached_shingles_dict[feature_id] = fingerprints
    else:
        shingles = shingle_extraction.extract_shingles(feature)
        fingerprints = get_fingerprints(shingles)
    return min(fingerprints, key=h)
def get_minhash_fingerprint_naive(feature, h, cached_shingles_dict=None):
    '''Get naively the fingerprint of the shingle which has minimal index
    (w.r.t. the permutation defined by h) among all shingles contained in
    the feature.

    :param feature: a Networkx graph.
    :param h: a hash function defining a permutation of fingerprints
        (used as the min() key).
    :param cached_shingles_dict: (optional) a read/write cache of the form
        {feature_id: set_of_fingerprints_of_shingles}, keyed by the
        feature's canonical representation id; populated on cache miss.
    :return: an integer fingerprint of a shingle.
    '''
    # isinstance is the idiomatic type check and also accepts dict
    # subclasses (e.g. OrderedDict), unlike the former `type(...) is dict`.
    if isinstance(cached_shingles_dict, dict):
        _, feature_id = arnborg_proskurowski.get_canonical_representation(
            feature)
        if feature_id in cached_shingles_dict:
            fingerprints = cached_shingles_dict[feature_id]
        else:
            shingles = shingle_extraction.extract_shingles(feature)
            # Materialize as a set so the cache entry is reusable.
            fingerprints = set(get_fingerprints(shingles))
            cached_shingles_dict[feature_id] = fingerprints
    else:
        shingles = shingle_extraction.extract_shingles(feature)
        fingerprints = get_fingerprints(shingles)
    return min(fingerprints, key=h)
def build(self, feature_lists): self.sparse_matrix = {} i = -1 for _, record_features, _ in feature_lists: i += 1 if self.print_progress: print "Ch.Mat.: Processing column", i, "of", self.cols_count for feature in record_features: shingles = shingle_extraction.extract_shingles(feature) fingerprints = fingerprint.get_fingerprints(shingles) for fp in fingerprints: if not self.sparse_matrix.has_key(fp): self.sparse_matrix[fp] = set() self.sparse_matrix[fp].add(i)
def compute_column_fingerprints(self, record_graphs):
    """Extract WL features from every graph of a record, then return the
    sorted set of shingle fingerprints over all those features.

    Side effect: advances self.wl_state through feature_extraction.
    """
    assert self.wl_state
    all_features = []
    for graph in record_graphs:
        extracted, self.wl_state = feature_extraction.extract_features(
            graph, self.wl_iterations, self.wl_state)
        all_features.extend(extracted)
    fingerprint_column = set()
    for feat in all_features:
        feat_shingles = shingle_extraction.extract_shingles(feat)
        fingerprint_column.update(fingerprint.get_fingerprints(feat_shingles))
    return sorted(fingerprint_column)
def process_record(record, wl_iterations, state, binary_target_labels=True,
                   shingles_type="no-shingles", window_size=5,
                   accumulate_wl_shingles=True, fingerprints=True,
                   save_just_last_wl_it=False):
    """Process one record into per-WL-iteration data lines written to
    state["files"].

    :param record: a triple (record_id, graphs, target) — indices 0/1/2 are
        used below.
    :param wl_iterations: number of Weisfeiler-Lehman iterations.
    :param state: dict with keys "files" (one writable file per WL
        iteration), "wl_state" (itself holding a nested "wl_state" plus
        shingle-id counters), and "shingle_id_map".
    :param binary_target_labels: if True, targets are mapped to +target/-1;
        otherwise record[2] is joined with commas.
    :param shingles_type: one of 'all', 'w-shingles', 'features',
        'no-shingles' (default).
    :param window_size: window size for w-shingle extraction.
    :param accumulate_wl_shingles: if True, shingles accumulate across WL
        iterations (one global shingle-id counter); otherwise each WL
        iteration has its own counter and vector.
    :param fingerprints: if True, shingles are hashed to 24-bit
        fingerprints; otherwise they get sequential ids via shingle_id_map.
    :param save_just_last_wl_it: if True, only the final data_instance is
        written (to files[0]).
    """
    def get_sh_type(shingles_type):
        # Map the string option to a small integer code used below.
        if shingles_type == 'all':  # includes both w-shingles and features
            return 3
        elif shingles_type == 'w-shingles':
            return 2
        elif shingles_type == 'features':
            return 1
        else:  # default 'no-shingles', which means that the whole canonical representations will be returned
            return 0
    sh_type = get_sh_type(shingles_type)

    files = state["files"]
    wl_state = state["wl_state"]
    shingle_id_map = state["shingle_id_map"]

    def process_shingles(shingles, record_data_vector, wl_it):
        # Counter key is global when accumulating, per-iteration otherwise.
        next_shingle_id_key = "next_shingle_id" if accumulate_wl_shingles else "wl_{0}_next_shingle_id".format(wl_it)
        if not fingerprints:
            # Assign stable sequential ids to previously unseen shingles.
            for shingle in shingles:
                if shingle not in shingle_id_map:
                    shingle_id_map[shingle] = wl_state[next_shingle_id_key]
                    wl_state[next_shingle_id_key] += 1
                record_data_vector.add((shingle_id_map[shingle], 1))
        else:
            # Hash shingles to 24-bit fingerprints; value is always 1.
            shingle_ids = set(fingerprint.get_fingerprints(shingles, size=24))
            record_data_vector |= set(map(lambda shingle_id: (shingle_id, 1), shingle_ids))

    if sh_type < 2:
        print "Record ID: {0}, Target: {1}".format(record[0], record[2])
    else:
        print "Record ID: {0}, Target: {1}, Window-Size: {2}".format(record[0], record[2], window_size)

    if sh_type != 0:
        record_data_wl_vectors = {i: set() for i in range(wl_iterations + 1)}
    else:  # for 'no-shingles'
        record_canon_repr = []  # a list containing the canonical representations for each WL iteration

    for record_graph in record[1]:
        if sh_type == 1 or sh_type == 3:  # for 'features' or 'all'
            # NOTE(review): when sh_type == 3 ('all') this branch matches,
            # so the `elif` w-shingles branch below never runs for 'all' —
            # confirm whether that elif was meant to be a plain `if`.
            if accumulate_wl_shingles:
                record_data_vector = set()
            fea_ext_iter = feature_extraction.extract_features_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"])
            # Unpacks directly into wl_state["wl_state"], threading the WL
            # state through successive iterations.
            for wl_it, new_features, wl_state["wl_state"] in fea_ext_iter:
                if not accumulate_wl_shingles:
                    record_data_vector = set()
                for feature in new_features:
                    shingles = shingle_extraction.extract_shingles(feature)
                    process_shingles(shingles, record_data_vector, wl_it)
                record_data_wl_vectors[wl_it] |= record_data_vector
        elif sh_type == 2 or sh_type == 3:  # for 'w-shingles' and 'all'
            # TODO: should we exclude records with tree-width > 3?
            if accumulate_wl_shingles:
                record_data_vector = set()
            w_shingles_ext_iter = shingle_extraction.extract_w_shingles_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"], window_size=window_size)
            for wl_it, new_w_shingles, wl_state["wl_state"] in w_shingles_ext_iter:
                if not accumulate_wl_shingles:
                    record_data_vector = set()
                process_shingles(new_w_shingles, record_data_vector, wl_it)
                record_data_wl_vectors[wl_it] |= record_data_vector
        elif sh_type == 0:  # for 'no-shingles'
            record_canon_representations_iter = shingle_extraction.extract_canon_repr_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"])
            for _, canon_repr, wl_state["wl_state"] in record_canon_representations_iter:
                # just collect the canonical representations for each WL iteration
                record_canon_repr.append('"' + canon_repr + '"')

    # Emit one data line per WL iteration (unless save_just_last_wl_it).
    for wl_it in range(wl_iterations + 1):
        if sh_type != 0:
            if binary_target_labels:
                data_instance = (record[2] if record[2] > 0 else -1, sorted(record_data_wl_vectors[wl_it], key=lambda x: x[0]))
            else:
                data_instance = (",".join(record[2]), sorted(record_data_wl_vectors[wl_it], key=lambda x: x[0]))
            if not save_just_last_wl_it:
                # libsvm-style line: "<target> <id>:<value> ..."
                files[wl_it].write("{0} {1}\n".format(data_instance[0], " ".join(["{0}:{1}".format(f, v) for f, v in data_instance[1]])))
                files[wl_it].flush()
        else:  # for 'no-shingles'
            if binary_target_labels:
                target = record[2] if record[2] > 0 else -1
            else:
                target = ",".join(record[2])
            data = '[' + ','.join(record_canon_repr[:wl_it + 1]) + ']'
            if not save_just_last_wl_it:
                files[wl_it].write("{0} {1}\n".format(target, data))
                files[wl_it].flush()
    if save_just_last_wl_it:
        # NOTE(review): data_instance is only bound when sh_type != 0; with
        # 'no-shingles' + save_just_last_wl_it this would raise NameError —
        # confirm that combination is never used by callers.
        files[0].write("{0} {1}\n".format(data_instance[0], " ".join(["{0}:{1}".format(f, v) for f, v in data_instance[1]])))
        files[0].flush()