def featurize_test_images(bucket, prefix, batch_size):
    imgnt = imagenet.ImageNetData()
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    start = timer()
    num_batches = 0
    for k in imgnt.test_filenames:
        key_name = os.path.join(prefix, f"{k}.npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            # Only featurize images that don't already have features in S3.
            img = imgnt.load_image(k, size='scaled_256', force_rgb=True, verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            if len(to_featurize) >= batch_size:
                num_batches += 1
                featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                           batch_size, bucket, prefix, client)
                end = timer()
                print('processing batch {} (size {}) took {} seconds'.format(
                    num_batches, len(to_featurize), end - start))
                start = timer()
                to_featurize = []
                to_featurize_keys = []
    # Flush the final partial batch, if any.
    if len(to_featurize) > 0:
        featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                   batch_size, bucket, prefix, client)
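# The loop above accumulates images until a full batch and then flushes it.
# Below is a minimal standalone sketch of the same accumulate-and-flush
# pattern; `flush_in_batches` and `process_batch` are hypothetical names used
# for illustration only, not part of this codebase.
def flush_in_batches(items, batch_size, process_batch):
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= batch_size:
            process_batch(buffer)
            buffer = []
    if buffer:
        # Flush the final partial batch, mirroring the trailing check above.
        process_batch(buffer)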
def create_and_store_batch(batch_ids):
    # Derive a stable key from the batch ids, bundle the raw jpg bytes into a
    # dict, and upload the pickled dict to S3 as a single object.
    # Note: relies on `prefix` and `bucket` from the enclosing scope.
    batch_key = hash_ids(batch_ids)
    full_key = os.path.join(prefix, batch_key) + '.pickle'
    data = {}
    for cur_id in batch_ids:
        cur_key = 'imagenet2candidates_scaled/' + cur_id + '.jpg'
        data[cur_id], _ = utils.get_s3_object_bytes_with_backoff(cur_key, bucket=bucket)
    client = utils.get_s3_client()
    client.put_object(Key=full_key, Bucket=bucket, Body=pickle.dumps(data))
    return (batch_key, batch_ids)
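# Hedged sketch of reading one of these pickled batches back from S3. The
# helper name `load_batch` is ours, not part of this codebase; it assumes the
# same bucket/prefix layout that create_and_store_batch uses above.
def load_batch(batch_key, bucket, prefix):
    client = utils.get_s3_client()
    full_key = os.path.join(prefix, batch_key) + '.pickle'
    body = client.get_object(Key=full_key, Bucket=bucket)["Body"].read()
    # Recovers the dict mapping candidate id -> raw jpg bytes written above.
    return pickle.loads(body)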
def write_test_output(test_keys, features, bucket="imagenet2datav2"):
    client = utils.get_s3_client()
    for idx in range(features.shape[0]):
        filename = test_keys[idx].split('.')[0].split('/')[1]
        key = 'imagenet-test-featurized-2/' + filename + '.npy'
        bio = io.BytesIO()
        np.save(bio, features[idx])
        bstream = bio.getvalue()
        client.put_object(Bucket=bucket, Key=key, Body=bstream,
                          ACL="bucket-owner-full-control")
    print('Done writing features')
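# Hedged sketch of loading one of these feature files back. The helper name
# `load_test_feature` is ours, not part of this codebase; it assumes the same
# bucket and key layout that write_test_output uses above.
def load_test_feature(filename, bucket="imagenet2datav2"):
    client = utils.get_s3_client()
    key = 'imagenet-test-featurized-2/' + filename + '.npy'
    body = client.get_object(Bucket=bucket, Key=key)["Body"].read()
    return np.load(io.BytesIO(body))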
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    # candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    # Collect every candidate image that has not been featurized yet.
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
        # if i > 250:
        #     break
        i = i + 1
        print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end - start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        start = timer()
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i] + ".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
        end = timer()
        # The original final print repeated the candidate-gathering message
        # with stale timer values; report the featurization time instead.
        print(f"Took {end - start} seconds to featurize and upload.")
def featurize_s3_tarball(tarball_key, bucket="imagenet2datav2", batch_size=32):
    client = utils.get_s3_client()
    read_bytes = client.get_object(Key=tarball_key, Bucket=bucket)["Body"].read()
    tarball = tarfile.open(fileobj=io.BytesIO(read_bytes))
    images = []
    image_filenames = []
    for member in tarball.getmembers():
        f = tarball.extractfile(member)
        if f is not None:
            im = skimage.transform.resize(imageio.imread(f), (224, 224),
                                          preserve_range=True)
            # Replicate grayscale images across three channels.
            if len(im.shape) == 2:
                im = np.stack((im, im, im), axis=2)
            image_filenames.append(member.name)
            images.append(im)
    images = np.stack(images, axis=0)
    features = vgg16_features(images, batch_size=batch_size)
    write_output(tarball_key, features, image_filenames)
    return features, tarball_key
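# Minimal standalone demo of the tarfile member iteration used above: build a
# tiny in-memory tar, then walk its members the same way. Stdlib only and
# runnable as-is; the member name and payload are made-up examples.
import io
import tarfile

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w") as tf:
    payload = b"hello"
    info = tarfile.TarInfo(name="dir/img0.jpg")
    info.size = len(payload)
    tf.addfile(info, io.BytesIO(payload))
buf.seek(0)
demo_tarball = tarfile.open(fileobj=buf)
for member in demo_tarball.getmembers():
    f = demo_tarball.extractfile(member)
    if f is not None:
        print(member.name, len(f.read()))  # dir/img0.jpg 5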
def write_output(tarball_key, features, image_filenames, bucket="imagenet2datav2"):
    client = utils.get_s3_client()
    dir_name, file_key = tarball_key.split('/')
    file_key = file_key.replace('-scaled.tar', '-fc7.pkl')
    key = dir_name + '-featurized/' + file_key
    results = {}
    for idx in range(features.shape[0]):
        filename = image_filenames[idx].split('.')[0].split('/')[1]
        results[filename] = features[idx]
    tmp = pickle.dumps(results)
    print('Uploading {} to s3 '.format(key))
    client.put_object(Bucket=bucket, Key=key, Body=tmp,
                      ACL="bucket-owner-full-control")
import os
import tempfile

import requests
from airflow.exceptions import AirflowException

import values
from utils import format_date_scrape, get_s3_client

client = get_s3_client()


def upload_visitors_data(prev_ds, *args, **kwargs):
    # Scrape the visits endpoint for the previous schedule date and upload
    # the raw response body to the data lake bucket.
    formatted_date = format_date_scrape(prev_ds)
    VISITS_URL = values.VISITS_URL
    data = {'valorCaja1': formatted_date}
    response = requests.post(VISITS_URL, data=data)
    if response.ok:
        key_name = f"{prev_ds}_visits"
        with tempfile.NamedTemporaryFile(mode='w+b') as temp:
            temp.write(response.content)
            temp.seek(0)
            client.upload_file(temp.name,
                               os.environ.get('AWS_DATA_LAKE_BUCKET'),
                               key_name)
    else:
        raise AirflowException("Request failed.")
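# Hedged sketch of wiring this callable into a DAG. The dag_id, schedule, and
# start_date are illustrative assumptions, and the PythonOperator import path
# matches Airflow 1.x, where provide_context=True passes `prev_ds` in the
# task context automatically.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('visitors_scrape', schedule_interval='@daily',
          start_date=datetime(2020, 1, 1))

upload_task = PythonOperator(task_id='upload_visitors_data',
                             python_callable=upload_visitors_data,
                             provide_context=True,
                             dag=dag)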
def write_image_to_s3(img_dict, bucket, prefix):
    t = time.time()
    url = img_dict['url']
    ext = url.split(".")[-1]
    key = img_dict["id_ours"]
    mturk_key = "{2}_mturk/{0}.{1}".format(key, ext, prefix)
    # Skip images that were already uploaded.
    if utils.key_exists(bucket, mturk_key):
        return img_dict
    gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    img_bytes = urllib.request.urlopen(url, context=gcontext).read()
    pil_image = Image.open(io.BytesIO(img_bytes))
    rotated_img, _ = fix_orientation(pil_image)
    np_image = np.array(rotated_img)
    np_image = utils.make_rgb(np_image)
    try:
        image_ndc = skimage.transform.resize(np_image, (NDC_SIZE, NDC_SIZE),
                                             preserve_range=True)
    except MemoryError:
        raise Exception(f"Image {img_dict} memory error")
    # Rescale so the longer side matches MTURK_RESCALE_SIZE.
    bigger_side = max(np_image.shape)
    scale_fac = MTURK_RESCALE_SIZE / bigger_side
    image_mturk = skimage.transform.rescale(np_image, scale=scale_fac,
                                            preserve_range=True)
    bio_mturk = io.BytesIO()
    bio_orig = io.BytesIO()
    bio_ndc = io.BytesIO()
    imageio.imwrite(uri=bio_orig, im=np_image, format="jpg", quality=90)
    try:
        imageio.imwrite(uri=bio_mturk, im=image_mturk, format="jpg", quality=90)
    except Exception:
        raise Exception(f"Image {img_dict} error")
    imageio.imwrite(uri=bio_ndc, im=image_ndc, format="jpg", quality=90)
    client = utils.get_s3_client()
    ext = "jpg"
    # Upload all three variants, retrying with exponential backoff on failure.
    backoff = 1
    while True:
        try:
            client.put_object(Key="{2}_scaled/{0}.{1}".format(key, ext, prefix),
                              Bucket=bucket, Body=bio_ndc.getvalue())
            client.put_object(Key="{2}_original/{0}.{1}".format(key, ext, prefix),
                              Bucket=bucket, Body=bio_orig.getvalue())
            client.put_object(Key="{2}_mturk/{0}.{1}".format(key, ext, prefix),
                              Bucket=bucket, Body=bio_mturk.getvalue())
            break
        except Exception:
            time.sleep(backoff)
            backoff *= 2
    e = time.time()
    print("One image took ", e - t)
    img_dict["width"] = np_image.shape[1]
    img_dict["height"] = np_image.shape[0]
    return img_dict
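# The upload loop above hand-rolls exponential backoff. A minimal standalone
# sketch of the same pattern; `retry_with_backoff` is a hypothetical helper,
# not part of this codebase, and the max_backoff cap is our addition.
import time


def retry_with_backoff(fn, max_backoff=64):
    """Call fn() until it succeeds, doubling the sleep after each failure."""
    backoff = 1
    while True:
        try:
            return fn()
        except Exception:
            time.sleep(backoff)
            backoff = min(backoff * 2, max_backoff)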
def test(top_k=5, seed=586724699, extra_pairs=[]):
    # Note: relies on `references`, `control_images`, `num_extra_images`,
    # `prefix`, `size`, `exact`, `mod_fn`, `metrics`, and `bucket` from the
    # enclosing scope.
    im_data = imagenet.ImageNetData()
    np.random.seed(seed)
    images = np.random.choice(references, control_images + num_extra_images,
                              replace=False)
    extra_images = images[control_images:]
    images = images[:control_images]
    image_ids = []
    img_info = []
    test_dataset = []
    client = utils.get_s3_client()
    true_dict = {}
    to_featurize = []
    to_featurize_keys = []
    for im_name in images:
        im_meta, img = make_test_img(im_data, im_name, prefix=prefix,
                                     size=size, exact=exact)
        true_dict[im_meta['id_ours']] = im_name
        img_info.append(im_meta)
        img_orig = img
        if not exact:
            img = mod_fn(img)
            img = resize(img, (256, 256), preserve_range=True)
        else:
            im_bytes = img
            img = imageio.imread(img)
        if 'fc7' in metrics:
            key_name = os.path.join("imagenet2candidates_featurized",
                                    f"{im_meta['id_ours']}.npy")
            im_resize = resize(img_orig, (224, 224), preserve_range=True)
            to_featurize.append(im_resize.astype('float32'))
            to_featurize_keys.append(key_name)
        bio = io.BytesIO()
        if not exact:
            imageio.imwrite(uri=bio, im=img, format="jpg", quality=100)
            bstream = bio.getvalue()
        else:
            print("Exact bytes..")
            bstream = im_bytes
        key = "imagenet2candidates_scaled/{0}.jpg".format(im_meta['id_ours'])
        print("uploading.. to {0}".format(key))
        client.put_object(Bucket=bucket, Key=key, Body=bstream)
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        batch_size = min(len(to_featurize), 32)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size,
                                            use_gpu=False)
        for i, f in enumerate(features):
            key_name = to_featurize_keys[i]
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing features key {0}".format(key_name))
            bstream = bio.getvalue()
            print("feature hash ", hashlib.sha1(bstream).hexdigest())
            client.put_object(Key=key_name, Bucket=bucket, Body=bstream)
    with open("../data/search_results/test_{0}_results.json".format(prefix),
              "w+") as f:
        f.write(json.dumps(img_info))
    candidates = [x['id_ours'] for x in img_info]
    extra_images = list(extra_images)
    print("extra pairs", extra_pairs)
    print("len extra_images", len(extra_images))
    for e, v in extra_pairs:
        true_dict[e] = v
        candidates.append(e)
        extra_images.append(v)
    print("len after append extra_images", len(extra_images))
    for e in extra_images:
        true_dict[e] = e
    for e in images:
        true_dict[e] = e
    reference_names = list(images) + list(extra_images)
    print(f"running near duplicate check on {candidates} vs {reference_names}")
    print(f"num references {len(references)}")
    res, t_info = near_duplicate_checker.get_near_duplicates(
        candidates, reference_names, top_k=top_k, dssim_window_size=35,
        use_pywren=False, ref_chunk_size=100, cd_chunk_size=100,
        distance_metrics=metrics)
    for m, val in res.items():
        for k, v in val.items():
            true_answer = true_dict[k]
            result = v[0][0]
            # Print diagnostics before the assertion fails.
            if true_answer != result:
                print(m, val, k, v)
                print(f"expected {true_answer}, got {result}")
                print(v)
            assert true_answer == result
            print("Passed NDC for metric {0} for test {1}".format(m, prefix))
            if exact:
                # Exact copies must have distance (numerically) zero.
                if not np.isclose(v[0][1], 0):
                    print(m, val, k, v)
                assert np.isclose(v[0][1], 0)
    return res