def featurize_candidates(bucket, prefix, batch_size, source_filename):
    """Compute VGG16 (fc7) features for candidate images and upload them to S3.

    Reads the candidate id list from ``../data/metadata/fc7_candidates.json``.
    For every candidate whose feature file ``<prefix>/<id>.npy`` does not yet
    exist in ``bucket``, the original image is loaded, resized to
    ``FEATURIZE_SIZE``, batch-featurized with VGG16, and the resulting feature
    vector is written back to S3 under that key.

    Args:
        bucket: S3 bucket checked for existing features and written to.
        prefix: key prefix under which per-candidate ``.npy`` files live.
        batch_size: upper bound on the featurization batch size (clamped to
            the number of images that actually need featurizing).
        source_filename: unused; kept for interface compatibility.
    """
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json',
    ]
    # NOTE(review): mturk is never read below; constructing MTurkData
    # presumably loads/caches assignment data -- confirm before removing.
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        if not utils.key_exists(bucket, key_name):
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE,
                                           preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            # NOTE(review): counter placement reconstructed from collapsed
            # source; it appears to count candidates queued for featurization.
            i = i + 1
            print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end-start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i] + ".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            # BUG FIX: the bucket argument was previously omitted here, so
            # features were written to the helper's default bucket even though
            # the existence check above consulted `bucket`.
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name,
                                                   bucket=bucket)
def flatten_tarball(tarball_name, prefix, bucket="imagenet2datav2", verbose=False):
    """Expand an S3-hosted tarball into individual S3 objects.

    Downloads the tarball, then re-uploads every regular file it contains as
    its own S3 object keyed ``prefix + member.name``. Directories and other
    non-file members are skipped.

    Args:
        tarball_name: S3 key of the tarball to download.
        prefix: string prepended to each member name to form its S3 key.
        bucket: destination S3 bucket.
        verbose: forwarded to the S3 download helper.
    """
    tarball_bytes = utils.get_s3_file_bytes(tarball_name,
                                            cache_on_local_disk=False,
                                            verbose=verbose)
    # BUG FIX: the TarFile was previously never closed; use a context manager
    # so the archive is released deterministically even if an upload raises.
    with tarfile.open(fileobj=io.BytesIO(tarball_bytes)) as tf:
        for member in tf.getmembers():
            if member.isfile():
                file_bytes = tf.extractfile(member).read()
                key = prefix + member.name
                utils.put_s3_object_bytes_with_backoff(file_bytes, key,
                                                       bucket=bucket,
                                                       delay_factor=10)
    return
def compute_nearest_neighbors(distance_measures, candidate_filenames, reference_filenames, top_k, window_size, cache, cache_root):
    """Compute top-k nearest references for each candidate under the given distance measures.

    Supported measures: 'l2' (raw pixels), 'dssim' (structural dissimilarity),
    'fc7' (VGG16 fc7 features). Results are optionally cached in S3 under a
    hash of all arguments.

    Returns:
        (result, timing_info) where timing_info records load/compute timer
        values and whether the result came from the cache.
    """
    cache_key = compute_hash(distance_measures, candidate_filenames, reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"
    timing_info = {}
    if cache:
        if utils.key_exists(BUCKET, full_key):
            # Cache hit: deserialize the stored result and return immediately.
            # NOTE(review): pickle.loads on S3 data -- safe only if the bucket
            # contents are trusted.
            load_start = timer()
            ret_value = pickle.loads(
                utils.get_s3_object_bytes_with_backoff(full_key)[0])
            load_end = timer()
            # No computation happened; record a zero-length compute window.
            compute_start = compute_end = timer()
            timing_info['load_start'] = load_start
            timing_info['load_end'] = load_end
            timing_info['compute_start'] = compute_start
            timing_info['compute_end'] = compute_end
            timing_info['cached'] = True
            return ret_value, timing_info
    # Cache miss: set up data access with on-disk caching under /tmp.
    imgnt = imagenet.ImageNetData(cache_on_local_disk=True, verbose=False, cache_root_path='/tmp/imagenet2_cache')
    cds = candidate_data.CandidateData(cache_on_local_disk=True, load_metadata_from_s3=True, verbose=False, cache_root_path='/tmp/imagenet2_cache')
    loader = image_loader.ImageLoader(imgnt, cds, cache_on_local_disk=True, num_tries=4, cache_root_path='/tmp/imagenet2_cache')
    load_start = timer()
    # Pixel-based measures need the images; fc7 needs precomputed features.
    if ('l2' in distance_measures) or ('dssim' in distance_measures):
        candidate_image_dict = loader.load_image_batch(candidate_filenames, size='scaled_256', force_rgb=True, verbose=False)
        reference_image_dict = loader.load_image_batch(reference_filenames, size='scaled_256', force_rgb=True, verbose=False)
    if 'fc7' in distance_measures:
        candidate_feature_dict = loader.load_features_batch(
            candidate_filenames, verbose=False)
        reference_feature_dict = loader.load_features_batch(
            reference_filenames, verbose=False)
    load_end = timer()
    compute_start = timer()
    result = {}
    for distance_measure in distance_measures:
        if distance_measure == 'l2':
            # 196608 = 256*256*3, presumably the flattened scaled_256 RGB
            # image size -- TODO confirm against compute_l2_distances.
            result['l2'] = compute_l2_distances(candidate_image_dict, reference_image_dict, 196608)
        elif distance_measure == 'dssim':
            result['dssim'] = compute_dssim_distances(candidate_image_dict, reference_image_dict, window_size)
        elif distance_measure == 'fc7':
            # 4096 is the VGG16 fc7 feature dimension.
            result['fc7'] = compute_l2_distances(candidate_feature_dict, reference_feature_dict, 4096)
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()
    timing_info = {}
    timing_info['load_start'] = load_start
    timing_info['load_end'] = load_end
    timing_info['compute_start'] = compute_start
    timing_info['compute_end'] = compute_end
    timing_info['cached'] = False
    # Reduce full distance matrices to the top_k nearest neighbors.
    res = compute_top_k(result, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)
    return res, timing_info
def s3_cp(dest):
    """Copy the S3 object at `key` to `dest` within `bucket` and return `dest`.

    NOTE(review): `key` and `bucket` are free variables resolved from the
    enclosing scope -- confirm they are defined where this is used.
    """
    payload, _ = utils.get_s3_object_bytes_with_backoff(key, bucket=bucket)
    utils.put_s3_object_bytes_with_backoff(payload, dest, bucket=bucket)
    return dest
# NOTE(review): this fragment references `bucket`, `num_replicas`, and
# `use_pywren_for_replicas`, none of which are defined in this chunk --
# it is presumably the body of an enclosing function; confirm context.
json_dir, json_data, blacklist = candidate_data.load_data()
# Timestamped key so each snapshot of the candidate metadata is kept
# separately in S3.
time_string = datetime.now(timezone.utc).strftime('%Y-%m-%d_%H-%M-%S_%Z')
key = 'metadata/candidate_metadata_' + time_string + '.pickle'
pickle_dict = {}
pickle_dict['json_data'] = json_data
pickle_dict['json_dir'] = json_dir
pickle_dict['blacklist'] = blacklist
pickle_dict['username'] = getpass.getuser()
pickle_dict['time_string'] = time_string
pickle_bytes = pickle.dumps(pickle_dict)
utils.put_s3_object_bytes_with_backoff(pickle_bytes, key, bucket=bucket)
if num_replicas > 1:
    destinations = []
    # Zero-pad the replica counter to the width of num_replicas,
    # producing suffixes like "_replica03-10".
    replicas_counter_len = len(str(num_replicas))
    format_string = '_replica{{:0{}d}}-{{}}'.format(replicas_counter_len)
    for ii in range(num_replicas):
        destinations.append(key + format_string.format(ii + 1, num_replicas))
    if use_pywren_for_replicas:
        # Helper to copy the freshly written snapshot to one replica key;
        # presumably mapped over `destinations` via pywren further down
        # (past the end of this chunk).
        def s3_cp(dest):
            data, _ = utils.get_s3_object_bytes_with_backoff(key, bucket=bucket)
            utils.put_s3_object_bytes_with_backoff(data, dest, bucket=bucket)
            return dest