Example #1
0
def find_similar(sc, config):
    """Use cluster-to-hash and hash-to-key joins to find similar images.

    Loads the pickled cluster-center metadata and the phash / ward union
    collections from the 'km' HDFS location, obtains the candidate scores
    (computed with ``map_each_image`` unless ``config['candidate_has_mapped']``
    is truthy, in which case they are read from
    ``config['candidate_measures_spec']``), and calls ``join_nearest`` once
    per search round.

    Args:
        sc: Object providing ``pickleFile``; presumably the active
            SparkContext -- confirm against the caller.
        config: Mapping read for the keys 'candidate_has_mapped' (optional),
            'candidate_spec', 'candidate_measures_spec' and 'search_rounds';
            it is also forwarded to ``hdfs_path``, ``map_each_image`` and
            ``join_nearest``.

    Returns:
        The result of the last ``join_nearest`` call, or ``None`` when
        ``config['search_rounds']`` is 0.

    TODO: more rounds of search, and have an option to do ward OR
    perceptive hash OR both.  Ward is more expansive (false positives)
    than perceptive hashes, so the join can get slow with many matches.
    Maybe ward hashes should be a second try.
    """
    def load_km(name):
        # collect() already materialises every record on the driver; an
        # identity map() in front of it would only add a pass over the data.
        return sc.pickleFile(hdfs_path(config, 'km', name)).collect()

    # The metadata holds exactly three entries; only the first is used here.
    kPoints, _temp_dist, _within_set_sse = load_km('cluster_center_meta')
    phash_unions = load_km('phash_unions')
    ward_unions = load_km('ward_unions')

    if config.get('candidate_has_mapped'):
        scores = sc.pickleFile(config['candidate_measures_spec'])
    else:
        scores = map_each_image(sc,
                                config,
                                config['candidate_spec'],
                                config['candidate_measures_spec'])
    # The same scores are joined in every round, so keep them cached.
    scores.cache()

    # Bound up front so that zero search rounds returns None instead of
    # raising UnboundLocalError at the return statement.
    samples = None
    for _ in range(config['search_rounds']):
        samples = join_nearest(sc,
                               config,
                               kPoints,
                               phash_unions,
                               ward_unions,
                               scores)
        # TODO logic here for more rounds of sampling
    return samples
Example #2
0
    def flat(field_to_field):
        """Flat-map ``data`` for one output field and pickle the result.

        ``flat_map_indicators`` is pre-bound with the configured phash chunk
        length, ``kPoints`` and a one-entry dict marking ``field_to_field``
        as enabled; each element of ``data`` is unpacked into its remaining
        positional arguments.  The output is saved under the 'km' HDFS
        location named after ``field_to_field``.
        """
        enabled_field = {field_to_field: True}
        expand = partial(flat_map_indicators, config["phash_chunk_len"], kPoints, enabled_field)
        expanded = data.flatMap(lambda record: expand(*record))
        expanded.saveAsPickleFile(hdfs_path(config, "km", field_to_field))

    # Start from the template values, then let config["kmeans_output"]
    # override them.
    options = options_template.copy()
    options.update(config["kmeans_output"])
    # Write one flattened output per key whose merged value is truthy.
    for field_name, enabled in options.items():
        if not enabled:
            continue
        flat(field_name)


if __name__ == "__main__":
    # Script entry point: build the random state, then run every stage named
    # in config["actions"] (map_each_image, kmeans, find_similar -- always in
    # that order) and print the elapsed time.
    import datetime

    # 0 is a legitimate seed, so it must not be treated like "no seed".
    # Every other falsy setting (missing, None, False, "") keeps meaning
    # "unseeded"; bool is excluded explicitly because False == 0.
    seed = config.get("random_state")
    is_zero_seed = isinstance(seed, int) and not isinstance(seed, bool) and seed == 0
    if not seed and not is_zero_seed:
        seed = None
    config["random_state"] = np.random.RandomState(seed)

    started = datetime.datetime.now()
    print("started at:::", started)
    actions = config["actions"]
    make_hdfs_dirs(config)
    if "map_each_image" in actions:
        map_each_image(sc, config, config["input_spec"], hdfs_path(config, "map_each_image", "measures"))
    if "kmeans" in actions:
        kmeans(config)
    if "find_similar" in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print("Elapsed Time (seconds):::", (ended - started).total_seconds(), "\nAt", ended.isoformat())
Example #3
0
        data.flatMap(lambda x: flat_map(*x)).saveAsPickleFile(
            hdfs_path(config, 'km', field_to_field))

    # Template values first; entries from config['kmeans_output'] win.
    options = options_template.copy()
    options.update(config['kmeans_output'])
    # Call flat() for each key whose merged value is truthy.
    for field_name in options:
        if options[field_name]:
            flat(field_name)


if __name__ == "__main__":
    # Script entry point: build the random state, then run every stage named
    # in config['actions'] (map_each_image, kmeans, find_similar -- always in
    # that order) and print the elapsed time.
    import datetime

    # 0 is a legitimate seed, so it must not be treated like "no seed".
    # Every other falsy setting (missing, None, False, '') keeps meaning
    # "unseeded"; bool is excluded explicitly because False == 0.
    seed = config.get('random_state')
    is_zero_seed = (isinstance(seed, int)
                    and not isinstance(seed, bool)
                    and seed == 0)
    if not seed and not is_zero_seed:
        seed = None
    config['random_state'] = np.random.RandomState(seed)

    started = datetime.datetime.now()
    print('started at:::', started)
    actions = config['actions']
    make_hdfs_dirs(config)
    if 'map_each_image' in actions:
        map_each_image(sc, config, config['input_spec'],
                       hdfs_path(config, 'map_each_image', 'measures'))
    if 'kmeans' in actions:
        kmeans(config)
    if 'find_similar' in actions:
        search.find_similar(sc, config)
    ended = datetime.datetime.now()
    print('Elapsed Time (seconds):::', (ended - started).total_seconds(),
          '\nAt', ended.isoformat())