def clustering(dataset_folder, dataset, window_size=10, hash_module=1024, threshold=26, input_limit=None):
    """Run the three-step clustering pipeline and write the result to CSV.

    The pages are loaded through ``Loader.load_pages``, optionally truncated,
    and then fed through ``Algoritmo.passo1``, ``passo2`` and ``passo3`` in
    that order.  Progress and intermediate tables are reported through the
    shared ``Logger`` instance.  The final clusters are written to
    ``prediction.csv`` (relative to the current working directory), one line
    per page in the form ``<page name>, <cluster index>``.

    Args:
        dataset_folder: Forwarded to ``Loader.load_pages``.
        dataset: Forwarded to ``Loader.load_pages``.
        window_size: Forwarded to ``passo1`` and ``passo3``.
        hash_module: Forwarded to ``passo1`` and ``passo3``.
        threshold: Forwarded to ``passo2``.
        input_limit: Keep only the first ``input_limit`` loaded pages;
            ``None`` keeps all of them.

    Returns:
        None.  The result is the ``prediction.csv`` file.
    """
    loader = Loader()
    pages = loader.load_pages(dataset_folder, dataset)
    # Slicing with None as the upper bound keeps the whole sequence.
    pages = pages[:input_limit]

    logger = Logger.get_instance()
    algoritmo = Algoritmo()

    logger.print('############### INIZIO PASSO 1 ####################', 1)
    hash_table = algoritmo.passo1(pages, window_size, hash_module)
    logger.print('############### FINE PASSO 1 ####################', 1)
    logger.print(hash_table, 3)

    logger.print('############### INIZIO PASSO 2 ####################', 1)
    hash_table = algoritmo.passo2(hash_table, threshold)
    # TODO: test passo1
    # TODO: test passo2
    # TODO: test passo3
    # TODO: revisit how the hashes are computed; for now they depend heavily
    #       on the modulus we choose, also in light of the results we obtain.
    logger.print('############### FINE PASSO 2 ####################', 1)
    logger.print(hash_table, 3)

    logger.print('############### INIZIO PASSO 3 ####################', 1)
    cluster = algoritmo.passo3(hash_table, pages, hash_module, window_size)
    logger.print('################ FINE PASSO 3 ####################', 1)

    logger.print('Numero cluster ' + str(len(cluster)), 2)
    logger.print('\nClusters: \n', 2)
    logger.print(cluster, 2)

    # The context manager guarantees the file is closed even if a write or
    # an attribute access on a page raises part-way through.
    with open("prediction.csv", "w") as output:
        # Each cluster gets one consecutive index, shared by all its pages.
        for index_cluster, key in enumerate(cluster):
            logger.print("\ncluster\n", 3)
            for page in cluster[key]:
                output.write(page.name + ", " + str(index_cluster) + "\n")
def passo1(self, pages, window_size=10, hash_module=256):
    """Count how often each masked shingle vector occurs across ``pages``.

    For every page the shingle set is extracted with ``extract_shingle_set``,
    turned into a shingle vector with ``create_shingle_vector`` and expanded
    into masked vectors with ``k_shingle_cover``.  The content of each masked
    vector (``getContent()``) is used as a dictionary key and its occurrences
    are tallied over all pages.

    Args:
        pages: Iterable of page objects exposing a ``name`` attribute.
        window_size: Forwarded to ``extract_shingle_set``.
        hash_module: Forwarded to ``create_shingle_vector``.

    Returns:
        dict: Maps each masked-vector content to the number of times it was
        produced.  Empty when ``pages`` is empty.
    """
    logger = Logger.get_instance()
    hash_table = {}
    for page in pages:
        logger.print("Processing page: " + page.name, 2)
        shingle_set = extract_shingle_set(page, window_size)
        shingle_vector = create_shingle_vector(shingle_set, hash_module)
        # NOTE(review): 6 is passed straight to k_shingle_cover; presumably
        # the size of the cover -- confirm against that helper.
        masked_shingle_vectors = k_shingle_cover(shingle_vector, 6)
        for masked_shingle_vector in masked_shingle_vectors:
            # Fetch the key once instead of calling getContent() for the
            # membership test, the read and the write separately.
            content = masked_shingle_vector.getContent()
            hash_table[content] = hash_table.get(content, 0) + 1
    return hash_table