Example no. 1
0
def kmeans(config):
    """ Kmeans with merging and counting of perceptive hashes and 
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, "map_each_image", "measures"))
    data = measures.map(lambda x: (x[1]["id"], flatten_hist_cen(x[1]), x[1]["phash"], x[1]["ward"])).cache()
    K = config["n_clusters_group"]
    convergeDist = config["kmeans_group_converge"]
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    tempDist = 10 * convergeDist
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config["in_memory_set_len"] / K
        ward_max_len = int(0.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(lambda (x, (y, z, u, w)): (x, (y / z, u, w)))
        tempDist = pts_hash_union.map(lambda (x, (y, u, w)): np.sum((kPoints[x] - y) ** 2)).sum()
        newPoints = pts_hash_union.map(lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config["max_iter_group"]:
            break
        print("kmeans did iteration: ", idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, "km", "phash_unions"))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, "km", "ward_unions"))
    # The rest of the function deals with writing various lookup tables.

    # save the fit data and the meta stats as a single item in list
    kpsave = sc.parallelize([kPoints, tempDist, within_set_sse])
    kpsave.saveAsPickleFile(hdfs_path(config, "km", "cluster_center_meta"))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))

    options = options_template.copy()
    options.update(config["kmeans_output"])
    for k, v in options.items():
        if v:
            flat(k)
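The loop above depends on the helpers km_map and reduce_dist, which are not included in these examples. A minimal sketch of what they plausibly do, with signatures and Counter-based hash unions inferred from how their results are consumed later (an assumption, not the project's actual code):

from collections import Counter

import numpy as np


def km_map(kPoints, rec):
    # Sketch only: assign one record, assumed to be
    # (id, feature_vector, phash_list, ward_list) as built in `data` above,
    # to its nearest center and emit a partial per-cluster summary keyed by
    # cluster index: (idx, (vector_sum, count, phash_counts, ward_counts)).
    _id, vec, phash, ward = rec
    vec = np.asarray(vec, dtype="float64")
    idx = int(np.argmin([np.sum((np.asarray(c) - vec) ** 2) for c in kPoints]))
    return (idx, (vec, 1, Counter(phash), Counter(ward)))


def reduce_dist(ward_max_len, phash_max_len, a, b):
    # Sketch only: merge two partial summaries for the same cluster by summing
    # vectors and counts, merging the hash counters, and keeping only the most
    # common hashes so each cluster stays under the configured in-memory cap.
    vec_sum = a[0] + b[0]
    count = a[1] + b[1]
    phash = Counter(dict((a[2] + b[2]).most_common(phash_max_len)))
    ward = Counter(dict((a[3] + b[3]).most_common(ward_max_len)))
    return (vec_sum, count, phash, ward)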
Example no. 2
0
def find_similar(sc, config):
    """Use cluster to hash and hash to key 
    joins to find_similar images.

    TODO: more rounds of search, and have an 
    option to do ward OR perceptive hash OR both.
    Ward is more expansive (false positives) than 
    perceptive hashes, so the join can get slow with many
    matches.  Maybe ward hashes should be a second try."""
    kmeans_meta = sc.pickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))
    kmeans_meta = kmeans_meta.map(lambda x: x).collect()
    kPoints, tempDist, within_set_sse = kmeans_meta
    phash_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'phash_unions')
    ).map(lambda x: x).collect()
    ward_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'ward_unions')
    ).map(lambda x: x).collect()
    if not config.get('candidate_has_mapped'):
        scores = map_each_image(sc,
                                config,
                                config['candidate_spec'],
                                config['candidate_measures_spec'])
    else:
        scores = sc.pickleFile(config['candidate_measures_spec'])
    scores.cache()
    for net_round in range(config['search_rounds']):
        samples = join_nearest(sc,
                               config,
                               kPoints,
                               phash_unions,
                               ward_unions,
                               scores)
        # TODO: logic here for more rounds of sampling
    return samples
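For orientation, the candidate-search keys that find_similar reads from config look roughly like this; every value below is an illustrative placeholder, not a project default:

candidate_config = {
    'candidate_has_mapped': False,   # True if candidate measures already exist on HDFS
    'candidate_spec': '/tmp/candidates/raw',             # hypothetical input path
    'candidate_measures_spec': '/tmp/candidates/meas',   # hypothetical measures path
    'search_rounds': 1,              # passes of join_nearest over the candidates
    'search_sample_step': 100,       # sample size pulled back by join_nearest
    'candidate_batch': 'batch_001',  # used by join_nearest when writing results
}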
def fuzzify(config, fname, hdfs_name):
    from PIL import Image
    img = Image.open(fname)
    n = np.array(img)
    for i in range(0, n.shape[0] - filterx, filterx):
        for j in range(0, n.shape[1] - filtery, filtery):
            if random.uniform(0, 1) < change_perc:
                for z in range(3):
                    n[i:i + filterx, j:j + filtery,
                      z] = np.median(n[i:i + filterx, j:j + filtery, z])
    new = Image.fromarray(np.array(np.round(n), dtype=np.uint8))
    loc_name = fname + 'fuz'
    new.save(loc_name, format="png")
    print(
        sp.Popen([
            'hadoop', 'fs', '-put', loc_name,
            hdfs_path(config, config['fuzzy_example_data'], hdfs_name)
        ]).communicate())
def fuzzify(config, fname, hdfs_name):
    from PIL import Image
    img = Image.open(fname)
    n = np.array(img)
    for i in range(0, n.shape[0] - filterx, filterx):
        for j in range(0, n.shape[1] - filtery, filtery):
            if random.uniform(0, 1) < change_perc:
                for z in range(3):
                    n[i:i + filterx, j:j + filtery, z] = \
                        np.median(n[i:i + filterx, j:j + filtery, z])
    new = Image.fromarray(np.array(np.round(n), dtype=np.uint8))
    loc_name = fname + 'fuz'
    new.save(loc_name, format="png")
    print(sp.Popen(['hadoop', 'fs', '-put', loc_name,
                    hdfs_path(config, config['fuzzy_example_data'],
                              hdfs_name)]).communicate())
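Both copies of fuzzify rely on module-level names (np, random, sp, filterx, filtery, change_perc) that are not shown in these examples. A minimal setup sketch that would let the function run locally; the block size and change probability are assumed values, not the project's settings:

import random
import subprocess as sp

import numpy as np

filterx = 8        # assumed height of each block considered for blurring
filtery = 8        # assumed width of each block
change_perc = 0.2  # assumed probability that a block is replaced by its median

# Hypothetical call: writes "<fname>fuz" locally, then pushes it to HDFS under
# hdfs_path(config, config['fuzzy_example_data'], hdfs_name).
# fuzzify(config, 'sample.png', 'sample_fuzzed.png')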
Example no. 5
0
def join_nearest(sc,
                 config,
                 kPoints,
                 phash_unions,
                 ward_unions,
                 scores):
    """Use candidates' scores to assign them to best clusters based
    on euclidean distance and number of matching hashes, ward or perceptive.
    Join those assigned clusters to perceptive and ward hashes from training
    and then join hashes to keys."""
    pw = tuple(zip(('phash', 'ward'), (phash_unions, ward_unions)))
    def _best_cluster(x):
        counts = []
        for ward_or_phash, unions in pw:
            counts.append([])
            for u in unions:
                p = 0
                for item in x[1][ward_or_phash]:
                    p += u.get(item, 0)
                counts[-1].append(p)
        best = list(map(np.argmax, counts))
        distances = [np.sum((kPoints[i] - flatten_hist_cen(x[1])) ** 2)
                     for i in range(len(kPoints))]
        best.append(np.argmin(distances))
        # TODO: the following line may have a typo; distances[b] is probably what is meant.
        return [(x[0], (b, 'self', distances[best[-1]], x[1]['ward'], x[1]['phash']))
                for b in best]
    best_clusters = scores.flatMap(_best_cluster)
    best_clusters.sortBy(lambda x: x[1][2]).cache()
    phash_c = best_clusters.flatMap(partial(cluster_chunk, config, 'phash'))
    phash_c_id = phash_c.map(lambda x: x[1])
    ward_c = best_clusters.flatMap(partial(cluster_chunk, config, 'ward'))
    ward_c_id = ward_c.map(lambda x: x[1])
    cluster_to_phash = sc.pickleFile(hdfs_path(config, 'km', 'cluster_to_phash'))
    cluster_to_ward = sc.pickleFile(hdfs_path(config, 'km', 'cluster_to_ward'))
    rdds = (ward_c, phash_c)
    rdds2 = (ward_c_id, phash_c_id)
    table_names = ('ward_matches', 'phash_matches')
    labels = ('ward_to_key', 'phash_to_key')
    out = {}
    to_join = []
    for table, rdd, rdd2, label in zip(table_names, rdds, rdds2, labels):
        join_on_cluster = rdd.join(
            cluster_to_phash if table == 'phash_matches' else cluster_to_ward
        )
        map_ward_or_phash = join_on_cluster.map(lambda x:(x[1][0][0], x))
        to_key = sc.pickleFile(hdfs_path(config, 'km', label))
        hash_joined = map_ward_or_phash.join(
            to_key
        )
        hash_joined2 = rdd2.join(to_key)
        
        # pulling the two image keys out into pairs
        cand_key_to_key = hash_joined.map(
            lambda x: (x[1][0][1][0][1], x[1][-1])
        )
        samp = cand_key_to_key.take(config['search_sample_step'])
        out[table] = samp
        as_key_counts = cand_key_to_key.groupByKey().map(count_keys)
        as_key_counts.cache()
        as_key_counts.saveAsPickleFile(
            hdfs_path(config, 'candidates', config['candidate_batch'], "%s_counts" % label)
        )
        to_join.append(as_key_counts)
    # map the candidate id with best match of a hash with indicators of fit
    def map_best(x):
        """The key, (best agreeing key, vote count for agreeing, total votes) """
        (key, ((best_match, agree_count), dict_)) = x
        return (key, (best_match, agree_count, sum(dict_.values())))
    
    # join the ward best key with phash best key
    joined_final_matches = to_join[0].map(map_best).join(
        to_join[1].map(map_best))
    joined_final_matches.saveAsPickleFile(
            hdfs_path(config, 'candidates', config['candidate_batch'], 'joined_final_matches')
        )
    out['joined'] = joined_final_matches.take(config['search_sample_step'])
    return out
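count_keys is not included in these examples; from the way map_best unpacks its output, it presumably turns a grouped (candidate_key, matched_training_keys) pair into (candidate_key, ((best_match, agree_count), counts_dict)). A sketch under that assumption:

from collections import Counter


def count_keys(kv):
    # Sketch only, inferred from map_best's unpacking above; not the project's
    # actual helper.  kv comes from groupByKey(): (candidate_key, iterable of
    # training keys that matched the candidate on some hash).
    key, matched = kv
    counts = Counter(matched)
    best_match, agree_count = counts.most_common(1)[0]
    return (key, ((best_match, agree_count), dict(counts)))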
Example no. 6
0
 def flat(field_to_field):
     flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
     data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))
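flat_map_indicators is not shown in these examples; flat passes it config['phash_chunk_len'], and cluster_chunk in Example no. 5 receives the same config, which suggests hashes are split into fixed-length pieces so two images can join on any shared piece rather than only on an exact hash. A small sketch of that chunking idea (the helper name is hypothetical):

def hash_chunks(hash_str, chunk_len):
    # Hypothetical helper illustrating the chunking idea implied by
    # config['phash_chunk_len']: break a hash into fixed-length pieces.
    return [hash_str[i:i + chunk_len] for i in range(0, len(hash_str), chunk_len)]


# e.g. hash_chunks('a9f03c77b2d145e0', 4) -> ['a9f0', '3c77', 'b2d1', '45e0']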
Example no. 7
0
    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(hdfs_path(config, "km", field_to_field))

    options = options_template.copy()
    options.update(config["kmeans_output"])
    for k, v in options.items():
        if v:
            flat(k)


if __name__ == "__main__":
    if config.get("random_state"):
        config["random_state"] = np.random.RandomState(config["random_state"])
    else:
        config["random_state"] = np.random.RandomState(None)
    import datetime

    started = datetime.datetime.now()
    print("started at:::", started)
    actions = config["actions"]
    make_hdfs_dirs(config)
    if "map_each_image" in actions:
        map_each_image(sc, config, config["input_spec"], hdfs_path(config, "map_each_image", "measures"))
    if "kmeans" in actions:
        kmeans(config)
    if "find_similar" in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print("Elapsed Time (seconds):::", (ended - started).total_seconds(), "\nAt", ended.isoformat())
Example no. 8
0
 def flat(field_to_field):
     flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                        kPoints, {field_to_field: True})
     data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
         hdfs_path(config, 'km', field_to_field))
Example no. 9
0
def kmeans(config):
    """ Kmeans with merging and counting of perceptive hashes and 
    ward hashes among clusters."""
    measures = sc.pickleFile(hdfs_path(config, 'map_each_image', 'measures'))
    data = measures.map(lambda x: (x[1]['id'], flatten_hist_cen(x[1]), x[1][
        'phash'], x[1]['ward'])).cache()
    K = config['n_clusters_group']
    convergeDist = config['kmeans_group_converge']
    sample = data.takeSample(False, K, 1)
    kPoints = [k[1] for k in sample]
    if convergeDist is not None:
        tempDist = 10 * convergeDist
    else:
        tempDist = 1e12
    idx = 0
    within_set_sse = []
    while tempDist > convergeDist:
        max_len = config['in_memory_set_len'] / K
        ward_max_len = int(.5 * max_len)
        phash_max_len = int(max_len - ward_max_len)
        closest = data.map(partial(km_map, kPoints))
        pointStats = closest.reduceByKey(
            partial(reduce_dist, ward_max_len, phash_max_len))
        pts_hash_union = pointStats.map(lambda (x, (y, z, u, w)):
                                        (x, (y / z, u, w)))
        if convergeDist is not None:
            tempDist = pts_hash_union.map(lambda (x, (y, u, w)): np.sum(
                (kPoints[x] - y)**2)).sum()
        newPoints = pts_hash_union.map(
            lambda (x, (y, u, w)): (x, np.array(y, dtype="int32"))).collect()
        idx += 1
        if idx > config['max_iter_group']:
            break
        print('kmeans did iteration: ', idx, file=sys.stderr)
    for (x, y) in newPoints:
        kPoints[x] = y
    phash_unions = pts_hash_union.map(lambda (x, (y, u, w)): u)
    phash_unions.saveAsPickleFile(hdfs_path(config, 'km', 'phash_unions'))
    ward_unions = pts_hash_union.map(lambda (x, (y, u, w)): w)
    ward_unions.saveAsPickleFile(hdfs_path(config, 'km', 'ward_unions'))
    # The rest of the function deals with writing various lookup tables.

    # save the fit data and the meta stats as a single item in list
    kpsave = sc.parallelize([
        kPoints,
        tempDist,
        within_set_sse,
    ])
    kpsave.saveAsPickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))

    def flat(field_to_field):
        flat_map = partial(flat_map_indicators, config['phash_chunk_len'],
                           kPoints, {field_to_field: True})
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)
Example no. 10
0
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    options = options_template.copy()
    options.update(config['kmeans_output'])
    for k, v in options.items():
        if v:
            flat(k)


if __name__ == "__main__":
    if config.get('random_state'):
        config['random_state'] = np.random.RandomState(config['random_state'])
    else:
        config['random_state'] = np.random.RandomState(None)
    import datetime
    started = datetime.datetime.now()
    print('started at:::', started)
    actions = config['actions']
    make_hdfs_dirs(config)
    if 'map_each_image' in actions:
        map_each_image(sc, config, config['input_spec'],
                       hdfs_path(config, 'map_each_image', 'measures'))
    if 'kmeans' in actions:
        kmeans(config)
    if 'find_similar' in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print('Elapsed Time (seconds):::', (ended - started).total_seconds(),
          '\nAt', ended.isoformat())