Example 1
def _best_cluster(x):
    """Score one record against each cluster's ward/phash unions and its
    distance to each centroid, returning every candidate best cluster.
    Uses pw, kPoints, np, and flatten_hist_cen from the enclosing scope."""
    counts = []
    for ward_or_phash, unions in pw:
        counts.append([])
        for u in unions:
            # Total hits of this record's hashes in the cluster's union.
            p = 0
            for item in x[1][ward_or_phash]:
                p += u.get(item, 0)
            counts[-1].append(p)
    best = list(map(np.argmax, counts))
    distances = [np.sum((kPoints[i] - flatten_hist_cen(x[1])) ** 2)
                 for i in range(len(kPoints))]
    best.append(np.argmin(distances))
    # Each candidate reports its own centroid distance; the original read
    # distances[best[-1]], which a TODO flagged as a likely typo.
    return [(x[0], (b, 'self', distances[b], x[1]['ward'], x[1]['phash']))
            for b in best]
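
The function above leans on names from its enclosing scope: pw (a list of ('ward'/'phash', per-cluster union dicts) pairs), kPoints (the centroid vectors), np, and flatten_hist_cen. A minimal sketch of how it might be invoked, assuming an RDD of (id, measures_dict) records; measures_rdd and the two union lists are illustrative placeholders, not names from the project:

# Hypothetical wiring; measures_rdd, ward_unions_list, and
# phash_unions_list are placeholder names.
pw = [('ward', ward_unions_list), ('phash', phash_unions_list)]

# Each record emits one candidate cluster per scoring method in pw,
# plus the nearest-centroid candidate that _best_cluster appends.
candidates = measures_rdd.flatMap(_best_cluster)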
Example 2
import sys
from functools import partial

import numpy as np

# sc, hdfs_path, flatten_hist_cen, km_map, reduce_dist, flat_map_indicators,
# and options_template are provided by the surrounding module.


def kmeans(config):
    """K-means with merging and counting of perceptual hashes and
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, "map_each_image", "measures"))
    data = measures.map(lambda x: (x[1]["id"], flatten_hist_cen(x[1]),
                                   x[1]["phash"], x[1]["ward"])).cache()
    K = config["n_clusters_group"]
    convergeDist = config["kmeans_group_converge"]
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    tempDist = 10 * convergeDist
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config["in_memory_set_len"] // K
        ward_max_len = int(0.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        # Value layout: (vector sum, count, phash union, ward union).
        pts_hash_union = pointStats.map(
            lambda kv: (kv[0], (kv[1][0] / kv[1][1], kv[1][2], kv[1][3])))
        tempDist = pts_hash_union.map(
            lambda kv: np.sum((kPoints[kv[0]] - kv[1][0]) ** 2)).sum()
        newPoints = pts_hash_union.map(
            lambda kv: (kv[0], np.array(kv[1][0], dtype="int32"))).collect()
        idx += 1
        if idx > config["max_iter_group"]:
            break
        print("kmeans did iteration: ", idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda kv: kv[1][1])
    phash_unions.saveAsPickleFile(hdfs_path(config, "km", "phash_unions"))
    ward_unions = pts_hash_union.map(lambda kv: kv[1][2])
    ward_unions.saveAsPickleFile(hdfs_path(config, "km", "ward_unions"))
    # The rest of the function deals with writing various lookup tables.

    # Save the fitted centers and the meta stats as a single pickled list.
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, "km", "cluster_center_meta"))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config["phash_chunk_len"],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, "km", field_to_field))

    options = options_template.copy()
    options.update(config["kmeans_output"])
    for k, v in options.items():
        if v:
            flat(k)
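
kmeans calls two helpers that are not shown in this example: km_map and reduce_dist. From the call sites, km_map must turn one (id, vector, phash, ward) record into (nearest_cluster_index, (vector, 1, phash_counts, ward_counts)), and reduce_dist must merge two such values while keeping each hash counter under its size cap. A minimal sketch of that contract, assuming phash and ward are iterables of hash chunks (as Example 1's iteration over x[1][ward_or_phash] suggests); the project's real helpers may differ:

from collections import Counter

import numpy as np


def km_map(kPoints, rec):
    # rec is assumed to be (id, vector, phashes, wards); emit the index
    # of the nearest centroid with a one-record partial summary.
    _, vec, phashes, wards = rec
    dists = [np.sum((np.asarray(c) - vec) ** 2) for c in kPoints]
    best = int(np.argmin(dists))
    # Value layout: (vector sum, record count, phash counts, ward counts).
    return (best, (np.asarray(vec), 1,
                   dict(Counter(phashes)), dict(Counter(wards))))


def _merge_counts(a, b, max_len):
    # Merge two hash-count dicts, keeping only the most frequent keys
    # so each cluster's union stays within the memory budget.
    merged = dict(a)
    for key, cnt in b.items():
        merged[key] = merged.get(key, 0) + cnt
    if len(merged) > max_len:
        top = sorted(merged.items(), key=lambda kv: kv[1], reverse=True)
        merged = dict(top[:int(max_len)])
    return merged


def reduce_dist(ward_max_len, phash_max_len, v1, v2):
    # Vector sums and counts add; hash unions merge with a size cap.
    vec = v1[0] + v2[0]
    n = v1[1] + v2[1]
    phashes = _merge_counts(v1[2], v2[2], phash_max_len)
    wards = _merge_counts(v1[3], v2[3], ward_max_len)
    return (vec, n, phashes, wards)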
Example 3

A variant of Example 2 that also handles a convergeDist of None, meaning no convergence threshold and iteration until max_iter_group is reached.
def kmeans(config):
    """K-means with merging and counting of perceptual hashes and
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    data = measures.map(lambda x: (x[1]['id'], flatten_hist_cen(x[1]),
                                   x[1]['phash'], x[1]['ward'])).cache()
    K = config['n_clusters_group']
    convergeDist = config['kmeans_group_converge']
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    if convergeDist is not None:
        tempDist = 10 * convergeDist
    else:
        tempDist = 1e12
    idx = 0
    within_set_sse = []
    # When convergeDist is None, iterate until max_iter_group breaks the loop.
    while convergeDist is None or tempDist > convergeDist:
        max_len = config['in_memory_set_len'] // K
        ward_max_len = int(0.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(
            lambda kv: (kv[0], (kv[1][0] / kv[1][1], kv[1][2], kv[1][3])))
        if convergeDist is not None:
            tempDist = pts_hash_union.map(
                lambda kv: np.sum((kPoints[kv[0]] - kv[1][0]) ** 2)).sum()
        newPoints = pts_hash_union.map(
            lambda kv: (kv[0], np.array(kv[1][0], dtype='int32'))).collect()
        idx += 1
        if idx > config['max_iter_group']:
            break
        print('kmeans did iteration: ', idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda kv: kv[1][1])
    phash_unions.saveAsPickleFile(hdfs_path(config, 'km', 'phash_unions'))
    ward_unions = pts_hash_union.map(lambda kv: kv[1][2])
    ward_unions.saveAsPickleFile(hdfs_path(config, 'km', 'ward_unions'))
    # The rest of the function deals with writing various lookup tables.

    # Save the fitted centers and the meta stats as a single pickled list.
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)
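
Both versions persist their results as pickle files under the km directory. A short sketch of reading them back for downstream use, assuming the same config and SparkContext sc; note that cluster_center_meta was parallelized as a three-element list, so collect() returns those three items in order:

# Hypothetical consumer of the output written by kmeans(config).
kPoints, tempDist, within_set_sse = sc.pickleFile(
    hdfs_path(config, 'km', 'cluster_center_meta')).collect()
phash_unions = sc.pickleFile(hdfs_path(config, 'km', 'phash_unions')).collect()
ward_unions = sc.pickleFile(hdfs_path(config, 'km', 'ward_unions')).collect()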