Example #1
0
def clustering(dataset_folder,
               dataset,
               window_size=10,
               hash_module=1024,
               threshold=26,
               input_limit=None):
    """Run the three-step clustering pipeline and write the result to disk.

    Pages are loaded through ``Loader.load_pages``, then passed through
    ``Algoritmo.passo1``, ``passo2`` and ``passo3`` in that order. The
    resulting clusters are logged and written to ``prediction.csv`` in the
    current working directory, one ``"<page name>, <cluster index>"`` line
    per page.

    Args:
        dataset_folder: Forwarded to ``Loader.load_pages``.
        dataset: Forwarded to ``Loader.load_pages``.
        window_size: Forwarded to ``passo1`` and ``passo3``.
        hash_module: Forwarded to ``passo1`` and ``passo3``.
        threshold: Forwarded to ``passo2``.
        input_limit: Upper bound used to slice the loaded pages
            (``pages[:input_limit]``); ``None`` keeps every page.

    Returns:
        None. The outcome is the ``prediction.csv`` file and the log output.
    """
    loader = Loader()
    pages = loader.load_pages(dataset_folder, dataset)
    pages = pages[:input_limit]

    logger = Logger.get_instance()
    logger.print('############### INIZIO PASSO 1 ####################', 1)
    algoritmo = Algoritmo()
    hash_table = algoritmo.passo1(pages, window_size, hash_module)

    logger.print('############### FINE PASSO 1 ####################', 1)
    logger.print(hash_table, 3)

    logger.print('############### INIZIO PASSO 2 ####################', 1)
    hash_table = algoritmo.passo2(hash_table, threshold)

    # TODO: test passo1
    # TODO: test passo2
    # TODO: test passo3
    # TODO: rethink how the hashes are computed; for now they depend heavily
    #       on the chosen modulus, also in light of the results obtained.

    logger.print('############### FINE PASSO 2 ####################', 1)
    logger.print(hash_table, 3)

    logger.print('############### INIZIO PASSO 3 ####################', 1)
    cluster = algoritmo.passo3(hash_table, pages, hash_module, window_size)

    logger.print('################ FINE PASSO 3 ####################', 1)
    logger.print('Numero cluster ' + str(len(cluster)), 2)
    logger.print('\nClusters: \n', 2)
    logger.print(cluster, 2)

    # The context manager guarantees the file is closed even if a page lacks
    # a usable ``name`` or a write fails part-way through.
    with open("prediction.csv", "w") as prediction_file:
        # Cluster indices are assigned in iteration order of ``cluster``.
        for index_cluster, key in enumerate(cluster):
            logger.print("\ncluster\n", 3)
            for page in cluster[key]:
                prediction_file.write(f"{page.name}, {index_cluster}\n")
    def passo1(self, pages, window_size=10, hash_module=256):
        """Count occurrences of every masked shingle vector over ``pages``.

        For each page a shingle set is extracted, turned into a shingle
        vector, and expanded into masked shingle vectors through
        ``k_shingle_cover``. The ``getContent()`` value of each masked vector
        is used as a key whose counter is incremented.

        Args:
            pages: Iterable of page objects; each needs a string ``name``
                (used in the log message) and is passed to
                ``extract_shingle_set``.
            window_size: Forwarded to ``extract_shingle_set``.
            hash_module: Forwarded to ``create_shingle_vector``.

        Returns:
            dict mapping each masked vector content to the number of times
            it was produced across all pages.
        """
        logger = Logger.get_instance()
        hash_table = {}
        for page in pages:
            logger.print("Processing page: " + page.name, 2)
            shingle_set = extract_shingle_set(page, window_size)
            shingle_vector = create_shingle_vector(shingle_set, hash_module)
            # NOTE(review): 6 is presumably the "k" of the k-shingle cover;
            # confirm against k_shingle_cover before changing it.
            masked_shingle_vectors = k_shingle_cover(shingle_vector, 6)
            for masked_shingle_vector in masked_shingle_vectors:
                # Fetch the content once and use it as the counter key.
                content = masked_shingle_vector.getContent()
                hash_table[content] = hash_table.get(content, 0) + 1

        return hash_table