def find_similar(sc, config):
    """Find images similar to the candidate set via cluster-to-hash joins.

    Loads the k-means cluster metadata and the per-cluster phash/ward hash
    unions produced by the clustering step, measures (or loads) the candidate
    images, then joins candidates against the nearest clusters.

    TODO: more rounds of search, and an option to do ward OR perceptive hash
    OR both.  Ward is more expansive (false positives) than perceptive
    hashes, so the join can get slow with many matches.  Maybe ward hashes
    should be a second try.
    """
    # Cluster centers plus convergence stats saved by the k-means step.
    meta_rdd = sc.pickleFile(hdfs_path(config, 'km', 'cluster_center_meta'))
    kPoints, tempDist, within_set_sse = meta_rdd.map(lambda rec: rec).collect()

    # Per-cluster unions of perceptive hashes and ward hashes.
    phash_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'phash_unions')
    ).map(lambda rec: rec).collect()
    ward_unions = sc.pickleFile(
        hdfs_path(config, 'km', 'ward_unions')
    ).map(lambda rec: rec).collect()

    # Use the pre-computed candidate measures when available; otherwise
    # run the measurement step on the candidate images now.
    if config.get('candidate_has_mapped'):
        scores = sc.pickleFile(config['candidate_measures_spec'])
    else:
        scores = map_each_image(sc, config,
                                config['candidate_spec'],
                                config['candidate_measures_spec'])
    scores.cache()

    for net_round in range(config['search_rounds']):
        samples = join_nearest(sc, config, kPoints,
                               phash_unions, ward_unions, scores)
        # TODO: logic here for more rounds of sampling
    return samples
def flat(field_to_field):
    """Flat-map `data` through the indicator hasher for one output field
    mapping and persist the result under km/<field_to_field>."""
    mapper = partial(flat_map_indicators,
                     config["phash_chunk_len"],
                     kPoints,
                     {field_to_field: True})
    flattened = data.flatMap(lambda pair: mapper(*pair))
    flattened.saveAsPickleFile(hdfs_path(config, "km", field_to_field))

# Run the indicator flat-map for every output field enabled in config.
options = options_template.copy()
options.update(config["kmeans_output"])
for field_name, enabled in options.items():
    if enabled:
        flat(field_name)

if __name__ == "__main__":
    # Seed the shared RandomState from config, or randomize when unset.
    seed = config.get("random_state")
    config["random_state"] = np.random.RandomState(seed if seed else None)

    import datetime
    started = datetime.datetime.now()
    print("started at:::", started)

    actions = config["actions"]
    make_hdfs_dirs(config)
    # Each pipeline stage runs only when named in config["actions"].
    if "map_each_image" in actions:
        map_each_image(sc, config,
                       config["input_spec"],
                       hdfs_path(config, "map_each_image", "measures"))
    if "kmeans" in actions:
        kmeans(config)
    if "find_similar" in actions:
        search.find_similar(sc, config)

    ended = datetime.datetime.now()
    print("Elapsed Time (seconds):::",
          (ended - started).total_seconds(),
          "\nAt", ended.isoformat())
# NOTE(review): this span repeats the tail of ``flat`` plus the __main__
# driver that also appears just above (single-quoted variant) — looks like
# an accidental duplication; confirm which copy is canonical before
# removing either.
result = data.flatMap(lambda pair: flat_map(*pair))
result.saveAsPickleFile(hdfs_path(config, 'km', field_to_field))

# Run the flat-map for every enabled output field mapping.
options = options_template.copy()
options.update(config['kmeans_output'])
for field_name, wanted in options.items():
    if wanted:
        flat(field_name)

if __name__ == "__main__":
    # Seed the shared RandomState from config, or randomize when unset.
    if config.get('random_state'):
        config['random_state'] = np.random.RandomState(config['random_state'])
    else:
        config['random_state'] = np.random.RandomState(None)

    import datetime
    started = datetime.datetime.now()
    print('started at:::', started)

    actions = config['actions']
    make_hdfs_dirs(config)
    # Each pipeline stage runs only when named in config['actions'].
    if 'map_each_image' in actions:
        map_each_image(sc, config,
                       config['input_spec'],
                       hdfs_path(config, 'map_each_image', 'measures'))
    if 'kmeans' in actions:
        kmeans(config)
    if 'find_similar' in actions:
        search.find_similar(sc, config)

    ended = datetime.datetime.now()
    print('Elapsed Time (seconds):::',
          (ended - started).total_seconds(),
          '\nAt', ended.isoformat())