def testShingleExtraction(self):
    """Verify that shingle extraction on the dummy feature reproduces
    the expected canonical shingle strings, in order."""
    expected = [
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),1)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),1)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),4),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),4),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),5),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),5),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),6),2)",
        "(0.1;(1.2;(7,((0,1),(1,0))),6),(1.2;(8,((0,1))),3),2)",
        "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),6),2)"
    ]
    extracted = shingle_extraction.extract_shingles(
        example_graphs.snm_dummy_feature)
    self.assertEqual(expected, list(extracted),
                     "Wrong shingles were extracted from feature.")
 def testShingleExtraction(self):
     """Verify that shingle extraction on the dummy feature reproduces
     the expected canonical shingle strings, in order."""
     # Every expected shingle is one of four templates with the two
     # varying numeric fields filled in: {0} runs over (4, 5, 6) and
     # {1} over (1, 2), in the same nesting order as the original listing.
     templates = (
         "(0.1;(1.2;(7,((0,1))),3),(1.2;(7,((0,1),(1,0))),{0}),{1})",
         "(0.1;(1.2;(7,((0,1))),3),(1.2;(8,((1,0))),{0}),{1})",
         "(0.1;(1.2;(7,((0,1),(1,0))),{0}),(1.2;(8,((0,1))),3),{1})",
         "(0.1;(1.2;(8,((0,1))),3),(1.2;(8,((1,0))),{0}),{1})",
     )
     shingles_exp = [template.format(first, second)
                     for second in (1, 2)
                     for first in (4, 5, 6)
                     for template in templates]
     shingles = shingle_extraction.extract_shingles(
         example_graphs.snm_dummy_feature)
     self.assertEqual(shingles_exp, list(shingles),
                      "Wrong shingles were extracted from feature.")
Ejemplo n.º 3
0
 def inner(query_features):
     # Flatten the nested structure: for every feature inside every
     # feature list, yield all fingerprints of the feature's shingles.
     for feature_list in query_features:
         for single_feature in feature_list:
             feature_shingles = shingle_extraction.extract_shingles(single_feature)
             for one_fingerprint in fingerprint.get_fingerprints(feature_shingles):
                 yield one_fingerprint
Ejemplo n.º 4
0
 def inner(query_features):
     """Yield the fingerprint of every shingle of every feature in
     every list contained in query_features."""
     for feats in query_features:
         for feat in feats:
             for fp in fingerprint.get_fingerprints(
                     shingle_extraction.extract_shingles(feat)):
                 yield fp
Ejemplo n.º 5
0
def get_minhash_fingerprint_naive(feature, h, cached_shingles_dict=None):
    '''Get naively the fingerprint of the shingle which has minimal
    index (wrt the permutation defined by h) among all shingles
    contained in the feature.
    :param feature: a Networkx graph.
    :param h: a hash function defining a permutation of fingerprints.
    :param cached_shingles_dict (optional): A dictionary of the form
        {feature_id : set_of_fingerprints_of_shingles}; filled in place so
        repeated calls for the same feature skip shingle re-extraction.
    :return An integer fingerprint of a shingle.
    '''
    # isinstance is the idiomatic type check and, unlike `type(...) is dict`,
    # also accepts dict subclasses (e.g. collections.defaultdict) as caches.
    if isinstance(cached_shingles_dict, dict):
        _, feature_id = arnborg_proskurowski.get_canonical_representation(feature)
        if feature_id in cached_shingles_dict:
            fingerprints = cached_shingles_dict[feature_id]
        else:
            # Cache miss: extract once and remember the fingerprint set.
            shingles = shingle_extraction.extract_shingles(feature)
            fingerprints = set(get_fingerprints(shingles))
            cached_shingles_dict[feature_id] = fingerprints
    else:
        # No cache supplied: compute fingerprints for this call only.
        shingles = shingle_extraction.extract_shingles(feature)
        fingerprints = get_fingerprints(shingles)
    return min(fingerprints, key=h)
Ejemplo n.º 6
0
def get_minhash_fingerprint_naive(feature, h, cached_shingles_dict=None):
    '''Get naively the fingerprint of the shingle which has minimal
    index (wrt the permutation defined by h) among all shingles
    contained in the feature.
    :param feature: a Networkx graph.
    :param h: a hash function defining a permutation of fingerprints.
    :param cached_shingles_dict (optional): A dictionary of the form
        {feature_id : set_of_fingerprints_of_shingles}; filled in place so
        repeated calls for the same feature skip shingle re-extraction.
    :return An integer fingerprint of a shingle.
    '''
    # isinstance is the idiomatic type check and, unlike `type(...) is dict`,
    # also accepts dict subclasses (e.g. collections.defaultdict) as caches.
    if isinstance(cached_shingles_dict, dict):
        _, feature_id = arnborg_proskurowski.get_canonical_representation(
            feature)
        if feature_id in cached_shingles_dict:
            fingerprints = cached_shingles_dict[feature_id]
        else:
            # Cache miss: extract once and remember the fingerprint set.
            shingles = shingle_extraction.extract_shingles(feature)
            fingerprints = set(get_fingerprints(shingles))
            cached_shingles_dict[feature_id] = fingerprints
    else:
        # No cache supplied: compute fingerprints for this call only.
        shingles = shingle_extraction.extract_shingles(feature)
        fingerprints = get_fingerprints(shingles)
    return min(fingerprints, key=h)
 def build(self, feature_lists):
     """Build the characteristic sparse matrix mapping each shingle
     fingerprint to the set of column indices in which it occurs.

     :param feature_lists: iterable of 3-tuples whose middle element is
         the list of features for one record (column); the other two
         elements are ignored here.
     """
     self.sparse_matrix = {}
     # enumerate replaces the original manual `i = -1; i += 1` counter.
     for i, (_, record_features, _) in enumerate(feature_lists):
         if self.print_progress:
             # Single parenthesized argument: prints the identical line
             # under both Python 2's print statement and Python 3's
             # print function.
             print("Ch.Mat.: Processing column {0} of {1}".format(i, self.cols_count))
         for feature in record_features:
             shingles = shingle_extraction.extract_shingles(feature)
             fingerprints = fingerprint.get_fingerprints(shingles)
             for fp in fingerprints:
                 # setdefault replaces dict.has_key(), which was removed
                 # in Python 3, and avoids the double lookup.
                 self.sparse_matrix.setdefault(fp, set()).add(i)
 def compute_column_fingerprints(self, record_graphs):
     """Return the sorted set of shingle fingerprints of all features
     extracted (over self.wl_iterations WL iterations) from the given
     record graphs; self.wl_state is updated in place per graph."""
     assert self.wl_state
     all_features = []
     for hypergraph in record_graphs:
         extracted, self.wl_state = feature_extraction.extract_features(
             hypergraph, self.wl_iterations, self.wl_state)
         all_features.extend(extracted)

     fingerprint_column = set()
     for feat in all_features:
         feat_shingles = shingle_extraction.extract_shingles(feat)
         fingerprint_column.update(fingerprint.get_fingerprints(feat_shingles))

     return sorted(fingerprint_column)
Ejemplo n.º 9
0
def process_record(record, wl_iterations, state, binary_target_labels=True, shingles_type="no-shingles", window_size=5, accumulate_wl_shingles=True, fingerprints=True, save_just_last_wl_it=False):
    '''Process one record and append one data line per WL iteration to the
    already-open files in state["files"].

    :param record: 3-tuple; record[0] is an id (printed), record[1] is an
        iterable of graphs, record[2] is the target label.
    :param wl_iterations: iterations 0..wl_iterations are processed, one
        output file per iteration.
    :param state: dict with keys "files", "wl_state" and "shingle_id_map";
        mutated in place.
    :param binary_target_labels: if True, the written target is record[2]
        when positive and -1 otherwise; if False, record[2] is comma-joined.
    :param shingles_type: 'all', 'w-shingles', 'features' or the default
        'no-shingles' (mapped to a numeric code by get_sh_type below).
    :param window_size: passed to w-shingle extraction (sh_type >= 2 only).
    :param accumulate_wl_shingles: if True, one shared shingle-id counter and
        one accumulated data vector across WL iterations; otherwise each
        iteration gets its own counter key and a fresh vector.
    :param fingerprints: if True, shingles are mapped to hashed fingerprints
        (size=24); otherwise to sequential ids via shingle_id_map.
    :param save_just_last_wl_it: if True, per-iteration lines are skipped and
        a single line is written to files[0] at the end.
    '''
    def get_sh_type(shingles_type):
        # Map the textual option to an internal numeric code (0..3).
        if shingles_type == 'all':
            # includes both w-shingles and features
            return 3
        elif shingles_type == 'w-shingles':
            return 2
        elif shingles_type == 'features':
            return 1
        else: # default 'no-shingles', which means that the whole canonical representations will be returned
            return 0
    
    sh_type = get_sh_type(shingles_type)
    files = state["files"]
    wl_state = state["wl_state"]
    shingle_id_map = state["shingle_id_map"]
    
    def process_shingles(shingles, record_data_vector, wl_it):
        # Fold the given shingles into record_data_vector as (id, 1) pairs.
        # The id counter key depends on whether shingles accumulate across
        # WL iterations or are kept per iteration.
        next_shingle_id_key = "next_shingle_id" if accumulate_wl_shingles else "wl_{0}_next_shingle_id".format(wl_it)
        if not fingerprints:
            # Sequential-id mode: assign a fresh id to unseen shingles.
            for shingle in shingles:
                if shingle not in shingle_id_map:
                    shingle_id_map[shingle] = wl_state[next_shingle_id_key]
                    wl_state[next_shingle_id_key] += 1
                record_data_vector.add((shingle_id_map[shingle], 1))
        else:
            # Fingerprint mode: hash each shingle (size=24) instead.
            shingle_ids = set(fingerprint.get_fingerprints(shingles, size=24))
            record_data_vector |= set(map(lambda shingle_id: (shingle_id, 1), shingle_ids))
    
    # NOTE: Python 2 print statements (single formatted argument each).
    if sh_type < 2:
        print "Record ID: {0}, Target: {1}".format(record[0], record[2])
    else:
        print "Record ID: {0}, Target: {1}, Window-Size: {2}".format(record[0], record[2], window_size)
    
    if sh_type != 0:
        # One (shingle_id, 1) set per WL iteration.
        record_data_wl_vectors = {i: set() for i in range(wl_iterations + 1)}
    else: # for 'no-shingles'
        record_canon_repr = [] # a list containing the canonical representations for each WL iteration

    for record_graph in record[1]:
        if sh_type == 1 or sh_type == 3:
            # for 'features' or 'all'
            if accumulate_wl_shingles:
                record_data_vector = set()
            fea_ext_iter = feature_extraction.extract_features_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"])
            # Tuple unpacking assigns the updated WL state directly into
            # wl_state["wl_state"] on every iteration.
            for wl_it, new_features, wl_state["wl_state"] in fea_ext_iter:
                if not accumulate_wl_shingles:
                    record_data_vector = set()
                for feature in new_features:
                    shingles = shingle_extraction.extract_shingles(feature)
                    process_shingles(shingles, record_data_vector, wl_it)
                record_data_wl_vectors[wl_it] |= record_data_vector
        
        elif sh_type == 2 or sh_type == 3:
            # for 'w-shingles' and 'all'
            # NOTE(review): because this is an `elif`, sh_type == 3 ('all')
            # is consumed by the features branch above and never reaches
            # this branch — so 'all' never extracts w-shingles, contrary to
            # the get_sh_type comment. Looks like a bug; confirm intent.
            # TODO: should we exclude records with tree-width > 3?
            if accumulate_wl_shingles:
                record_data_vector = set()
            w_shingles_ext_iter = shingle_extraction.extract_w_shingles_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"], window_size=window_size)
            for wl_it, new_w_shingles, wl_state["wl_state"] in w_shingles_ext_iter:
                if not accumulate_wl_shingles:
                    record_data_vector = set()
                process_shingles(new_w_shingles, record_data_vector, wl_it)
                record_data_wl_vectors[wl_it] |= record_data_vector
        
        elif sh_type == 0:
            # for 'no-shingles'
            record_canon_representations_iter = shingle_extraction.extract_canon_repr_for_each_wl_iter(record_graph, wl_iterations, wl_state["wl_state"])
            for _, canon_repr, wl_state["wl_state"] in record_canon_representations_iter:
                # just collect the canonical representations for each WL iteration
                record_canon_repr.append('"' + canon_repr + '"')
    
    # Write one line per WL iteration (unless save_just_last_wl_it).
    for wl_it in range(wl_iterations + 1):
        if sh_type != 0:
            if binary_target_labels:
                data_instance = (record[2] if record[2] > 0 else -1, sorted(record_data_wl_vectors[wl_it], key=lambda x: x[0]))
            else:
                data_instance = (",".join(record[2]), sorted(record_data_wl_vectors[wl_it], key=lambda x: x[0]))
        
            if not save_just_last_wl_it:
                files[wl_it].write("{0} {1}\n".format(data_instance[0], " ".join(["{0}:{1}".format(f, v) for f, v in data_instance[1]])))
                files[wl_it].flush()
        
        else: # for 'no-shingles'
            if binary_target_labels:
                target = record[2] if record[2] > 0 else -1
            else:
                target = ",".join(record[2])
            data = '[' + ','.join(record_canon_repr[:wl_it + 1]) + ']'
            
            if not save_just_last_wl_it:
                files[wl_it].write("{0} {1}\n".format(target, data))
                files[wl_it].flush()
    
    if save_just_last_wl_it:
        # NOTE(review): data_instance is only bound in the sh_type != 0
        # branch above; with shingles_type='no-shingles' and
        # save_just_last_wl_it=True this raises NameError — confirm.
        files[0].write("{0} {1}\n".format(data_instance[0], " ".join(["{0}:{1}".format(f, v) for f, v in data_instance[1]])))
        files[0].flush()