Example #1
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    #candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    # only featurize candidates whose features are not already stored in S3
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k)+".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            # if i > 250:
            #     break
            i += 1
            print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end-start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i,f in enumerate(features):
            key_name = os.path.join(prefix, str(to_featurize_keys[i]) + ".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(),
                                                   key_name,
                                                   bucket=bucket)
    print(f"Took {end-start} seconds to get remaining candidates.")
Example #2
def flatten_tarball(tarball_name,
                    prefix,
                    bucket="imagenet2datav2",
                    verbose=False):
    tarball_bytes = utils.get_s3_file_bytes(tarball_name,
                                            cache_on_local_disk=False,
                                            verbose=verbose)
    tf = tarfile.open(fileobj=io.BytesIO(tarball_bytes))
    # re-upload each regular file in the tarball as an individual S3 object
    for member in tf.getmembers():
        if member.isfile():
            file_bytes = tf.extractfile(member).read()
            key = prefix + member.name
            utils.put_s3_object_bytes_with_backoff(file_bytes,
                                                   key,
                                                   bucket=bucket,
                                                   delay_factor=10)
    return
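
A minimal usage sketch (the tarball key and destination prefix are illustrative
values, not from the original source):

flatten_tarball('tarballs/candidates_batch_01.tar',  # illustrative S3 key
                prefix='flat/candidates_batch_01/',  # illustrative destination prefix
                bucket='imagenet2datav2',
                verbose=True)
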
Example #3
def compute_nearest_neighbors(distance_measures, candidate_filenames,
                              reference_filenames, top_k, window_size, cache,
                              cache_root):
    cache_key = compute_hash(distance_measures, candidate_filenames,
                             reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"
    timing_info = {}
    if cache:
        if utils.key_exists(BUCKET, full_key):
            load_start = timer()
            ret_value = pickle.loads(
                utils.get_s3_object_bytes_with_backoff(full_key)[0])
            load_end = timer()
            compute_start = compute_end = timer()  # cache hit: no compute time
            timing_info['load_start'] = load_start
            timing_info['load_end'] = load_end
            timing_info['compute_start'] = compute_start
            timing_info['compute_end'] = compute_end
            timing_info['cached'] = True
            return ret_value, timing_info

    imgnt = imagenet.ImageNetData(cache_on_local_disk=True,
                                  verbose=False,
                                  cache_root_path='/tmp/imagenet2_cache')
    cds = candidate_data.CandidateData(cache_on_local_disk=True,
                                       load_metadata_from_s3=True,
                                       verbose=False,
                                       cache_root_path='/tmp/imagenet2_cache')
    loader = image_loader.ImageLoader(imgnt,
                                      cds,
                                      cache_on_local_disk=True,
                                      num_tries=4,
                                      cache_root_path='/tmp/imagenet2_cache')
    load_start = timer()
    if ('l2' in distance_measures) or ('dssim' in distance_measures):
        candidate_image_dict = loader.load_image_batch(candidate_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
        reference_image_dict = loader.load_image_batch(reference_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
    if 'fc7' in distance_measures:
        candidate_feature_dict = loader.load_features_batch(
            candidate_filenames, verbose=False)
        reference_feature_dict = loader.load_features_batch(
            reference_filenames, verbose=False)
    load_end = timer()

    compute_start = timer()
    result = {}
    for distance_measure in distance_measures:
        if distance_measure == 'l2':
            result['l2'] = compute_l2_distances(candidate_image_dict,
                                                reference_image_dict,
                                                196608)  # 256 * 256 * 3 flattened pixels
        elif distance_measure == 'dssim':
            result['dssim'] = compute_dssim_distances(candidate_image_dict,
                                                      reference_image_dict,
                                                      window_size)
        elif distance_measure == 'fc7':
            result['fc7'] = compute_l2_distances(candidate_feature_dict,
                                                 reference_feature_dict,
                                                 4096)  # fc7 feature dimension
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()
    timing_info = {}
    timing_info['load_start'] = load_start
    timing_info['load_end'] = load_end
    timing_info['compute_start'] = compute_start
    timing_info['compute_end'] = compute_end
    timing_info['cached'] = False

    res = compute_top_k(result, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)

    return res, timing_info
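
A hedged usage sketch (all filenames and the cache root are placeholders, not
real dataset keys):

neighbors, timing = compute_nearest_neighbors(
    distance_measures=['l2', 'fc7'],
    candidate_filenames=['candidate_0001.jpg'],             # placeholder
    reference_filenames=['ref_0001.jpg', 'ref_0002.jpg'],   # placeholders
    top_k=10,
    window_size=7,          # only consulted by the 'dssim' measure
    cache=True,
    cache_root='nn_cache')  # placeholder cache prefix
print('served from cache:', timing['cached'])
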
Example #4
json_dir, json_data, blacklist = candidate_data.load_data()

time_string = datetime.now(timezone.utc).strftime('%Y-%m-%d_%H-%M-%S_%Z')
key = 'metadata/candidate_metadata_' + time_string + '.pickle'

pickle_dict = {}
pickle_dict['json_data'] = json_data
pickle_dict['json_dir'] = json_dir
pickle_dict['blacklist'] = blacklist
pickle_dict['username'] = getpass.getuser()
pickle_dict['time_string'] = time_string

pickle_bytes = pickle.dumps(pickle_dict)

utils.put_s3_object_bytes_with_backoff(pickle_bytes, key, bucket=bucket)

if num_replicas > 1:
    destinations = []
    replicas_counter_len = len(str(num_replicas))
    format_string = '_replica{{:0{}d}}-{{}}'.format(replicas_counter_len)
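    # e.g. num_replicas = 12 gives replicas_counter_len = 2 and the format
    # string '_replica{:02d}-{}', so destinations look like
    # key + '_replica01-12', key + '_replica02-12', ...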
    for ii in range(num_replicas):
        destinations.append(key + format_string.format(ii + 1, num_replicas))

    if use_pywren_for_replicas:

        def s3_cp(dest):
            data, _ = utils.get_s3_object_bytes_with_backoff(key,
                                                             bucket=bucket)
            utils.put_s3_object_bytes_with_backoff(data, dest, bucket=bucket)
            return dest
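
The snippet ends inside the use_pywren_for_replicas branch. A minimal sketch of
how such a closure is typically dispatched with pywren (this continuation is an
assumption, not part of the original source; s3_cp and destinations come from
the snippet above):

import pywren

pwex = pywren.default_executor()
futures = pwex.map(s3_cp, destinations)    # fan out one copy task per replica
results = pywren.get_all_results(futures)  # block until every replica is written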