Example #1
def featurize_test_images(bucket, prefix, batch_size):
    # Featurize ImageNet test images whose features are not yet in S3,
    # uploading them in batches of `batch_size`.
    imgnt = imagenet.ImageNetData()
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    start = timer()
    num_batches = 0
    for k in imgnt.test_filenames:
        key_name = os.path.join(prefix, f"{k}.npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = imgnt.load_image(k,
                                   size='scaled_256',
                                   force_rgb=True,
                                   verbose=False)
            img = skimage.transform.resize(img,
                                           FEATURIZE_SIZE,
                                           preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            if len(to_featurize) >= batch_size:
                num_batches += 1
                featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                           batch_size, bucket, prefix, client)
                end = timer()
                print('processing batch {} (size {}) took {} seconds'.format(
                    num_batches, len(to_featurize), end - start))
                start = timer()
                to_featurize = []
                to_featurize_keys = []
    if len(to_featurize) > 0:
        featurize_and_upload_batch(to_featurize, to_featurize_keys, batch_size,
                                   bucket, prefix, client)
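Example #1 calls featurize_and_upload_batch, which is not shown on this page. A minimal sketch of what such a helper could look like, modeled on the inline featurize-and-upload logic in Example #4 below (the name and signature come from the call site above; the body is an assumption):

def featurize_and_upload_batch(to_featurize, to_featurize_keys,
                               batch_size, bucket, prefix, client):
    # Stack the batch and run it through VGG16, as Example #4 does inline.
    batch = np.stack(to_featurize, axis=0)
    features = featurize.vgg16_features(batch, batch_size=batch_size)
    # Upload one .npy blob per image under the prefix; `client` is accepted
    # to match the call site, but this sketch uses the backoff helper from
    # Example #4 instead.
    for key, feature in zip(to_featurize_keys, features):
        key_name = os.path.join(prefix, f"{key}.npy")
        bio = io.BytesIO()
        np.save(bio, feature)
        utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)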
Example #2
def create_and_store_batch(batch_ids):
    # `prefix` and `bucket` are not parameters; they come from an enclosing
    # scope in the original source. This helper bundles a batch of candidate
    # images into a single pickle object on S3.
    batch_key = hash_ids(batch_ids)
    full_key = os.path.join(prefix, batch_key) + '.pickle'
    data = {}
    for cur_id in batch_ids:
        cur_key = 'imagenet2candidates_scaled/' + cur_id + '.jpg'
        data[cur_id], _ = utils.get_s3_object_bytes_with_backoff(cur_key, bucket=bucket)
    client = utils.get_s3_client()
    client.put_object(Key=full_key, Bucket=bucket, Body=pickle.dumps(data))
    return (batch_key, batch_ids)
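hash_ids is likewise not defined here. A plausible stand-in (an assumption, not the original code) derives a deterministic batch key from the ids, so re-running the same batch maps to the same S3 object:

import hashlib

def hash_ids(batch_ids):
    # Sort so the key is independent of the order the ids arrive in.
    joined = ','.join(sorted(batch_ids))
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()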
Example #3
def write_test_output(test_keys, features, bucket="imagenet2datav2"):
    client = utils.get_s3_client()
    for idx in range(features.shape[0]):
        # Strip the directory prefix and file extension from the key,
        # e.g. 'dir/name.npy' -> 'name'.
        filename = test_keys[idx].split('.')[0].split('/')[1]
        key = 'imagenet-test-featurized-2/' + filename + '.npy'
        bio = io.BytesIO()
        np.save(bio, features[idx])
        bstream = bio.getvalue()
        client.put_object(Bucket=bucket,
                          Key=key,
                          Body=bstream,
                          ACL="bucket-owner-full-control")
    print('Done writing features')
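The split chain in write_test_output assumes keys of the form 'dir/name.ext' with no dot in the directory part; it raises IndexError otherwise. A worked example with a hypothetical key:

key = 'imagenet-test/ILSVRC2012_test_00000001.JPEG'  # hypothetical key
name = key.split('.')[0].split('/')[1]  # -> 'ILSVRC2012_test_00000001'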
Example #4
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True, load_assignments=True, source_filenames_to_ignore=filenames_to_ignore, verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    #candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k)+".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            #if i > 250:
            #    break
            i += 1
            print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end-start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i]+".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
    print(f"Took {end-start} seconds to get remaining candidates.")
Example #5
def featurize_s3_tarball(tarball_key, bucket="imagenet2datav2", batch_size=32):
    client = utils.get_s3_client()
    read_bytes = client.get_object(Key=tarball_key,
                                   Bucket=bucket)["Body"].read()
    tarball = tarfile.open(fileobj=io.BytesIO(read_bytes))
    images = []
    image_filenames = []
    for member in tarball.getmembers():
        f = tarball.extractfile(member)
        if f is not None:
            im = skimage.transform.resize(imageio.imread(f), (224, 224),
                                          preserve_range=True)
            if len(im.shape) == 2:
                # Grayscale image: replicate the single channel to get RGB.
                im = np.stack((im, im, im), axis=2)
            image_filenames.append(member.name)
            images.append(im)
    images = np.stack(images, axis=0)
    features = vgg16_features(images, batch_size=batch_size)
    write_output(tarball_key, features, image_filenames)
    return features, tarball_key
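Note that write_output (Example #6) expects the tarball key to contain exactly one '/' and to end in '-scaled.tar'. A hypothetical invocation (the key name is made up for illustration):

features, key = featurize_s3_tarball("candidates/batch_0-scaled.tar",
                                     bucket="imagenet2datav2",
                                     batch_size=32)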
Example #6
def write_output(
    tarball_key,
    features,
    image_filenames,
    bucket="imagenet2datav2",
):
    client = utils.get_s3_client()
    # Expect keys of the form 'dir/file-scaled.tar'.
    dir_name, file_key = tarball_key.split('/')
    file_key = file_key.replace('-scaled.tar', '-fc7.pkl')
    key = dir_name + '-featurized/' + file_key
    results = {}
    for idx in range(features.shape[0]):
        filename = image_filenames[idx].split('.')[0].split('/')[1]
        results[filename] = features[idx]
    tmp = pickle.dumps(results)
    print('Uploading {} to s3 '.format(key))
    client.put_object(Bucket=bucket,
                      Key=key,
                      Body=tmp,
                      ACL="bucket-owner-full-control")
Example #7
import os
import requests
import tempfile

from airflow.exceptions import AirflowException

import values
from utils import format_date_scrape, get_s3_client

client = get_s3_client()


def upload_visitors_data(prev_ds, *args, **kwargs):
    formatted_date = format_date_scrape(prev_ds)
    VISITS_URL = values.VISITS_URL
    data = {'valorCaja1': formatted_date}
    response = requests.post(VISITS_URL, data=data)
    if response.ok:
        key_name = f"{prev_ds}_visits"
        with tempfile.NamedTemporaryFile(mode='w+b') as temp:
            temp.write(response.content)
            temp.seek(0)
            client.upload_file(temp.name,
                               os.environ.get('AWS_DATA_LAKE_BUCKET'),
                               key_name)
    else:
        raise AirflowException("Request failed.")
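Since the payload already sits in memory as response.content, the temporary file is avoidable; an equivalent upload with the same boto3 client would be a single put_object call (a sketch, assuming the same bucket environment variable):

client.put_object(Bucket=os.environ.get('AWS_DATA_LAKE_BUCKET'),
                  Key=key_name,
                  Body=response.content)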
Example #8
def write_image_to_s3(img_dict, bucket, prefix):

    t = time.time()
    url = img_dict['url']
    ext = url.split(".")[-1]
    key = img_dict["id_ours"]
    mturk_key = "{2}_mturk/{0}.{1}".format(key, ext, prefix)
    if utils.key_exists(bucket, mturk_key):
        return img_dict
    gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    img_bytes = urllib.request.urlopen(url, context=gcontext).read()
    pil_image = Image.open(io.BytesIO(img_bytes))
    rotated_img, _ = fix_orientation(pil_image)
    np_image = np.array(rotated_img)
    np_image = utils.make_rgb(np_image)
    try:
        image_ndc = skimage.transform.resize(np_image, (NDC_SIZE, NDC_SIZE),
                                             preserve_range=True)
    except MemoryError:
        raise Exception(f"Image {img_dict} memory error")

    bigger_side = max(np_image.shape)
    scale_fac = MTURK_RESCALE_SIZE / bigger_side
    image_mturk = skimage.transform.rescale(np_image,
                                            scale=scale_fac,
                                            preserve_range=True)

    bio_mturk = io.BytesIO()
    bio_orig = io.BytesIO()
    bio_ndc = io.BytesIO()

    imageio.imwrite(uri=bio_orig, im=np_image, format="jpg", quality=90)
    try:
        imageio.imwrite(uri=bio_mturk,
                        im=image_mturk,
                        format="jpg",
                        quality=90)
    except Exception:
        raise Exception(f"Image {img_dict} error")

    imageio.imwrite(uri=bio_ndc, im=image_ndc, format="jpg", quality=90)

    client = utils.get_s3_client()
    ext = "jpg"
    backoff = 1
    # Retry all three uploads with exponential backoff until they succeed.
    while True:
        try:
            client.put_object(Key="{2}_scaled/{0}.{1}".format(
                key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_ndc.getvalue())
            client.put_object(Key="{2}_original/{0}.{1}".format(
                key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_orig.getvalue())
            client.put_object(Key="{2}_mturk/{0}.{1}".format(key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_mturk.getvalue())
            break
        except Exception:
            time.sleep(backoff)
            backoff *= 2
    e = time.time()
    print("One image took ", e - t)
    img_dict["width"] = np_image.shape[1]
    img_dict["height"] = np_image.shape[0]
    return img_dict
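The retry loop above re-uploads all three renditions whenever any single put fails, and it never gives up. A bounded, per-object alternative could look like this (a sketch; put_with_retry is a hypothetical helper, not part of the original code):

def put_with_retry(client, bucket, key, body, max_tries=5):
    # Exponential backoff, giving up after max_tries attempts.
    backoff = 1
    for attempt in range(max_tries):
        try:
            client.put_object(Key=key, Bucket=bucket, Body=body)
            return
        except Exception:
            if attempt == max_tries - 1:
                raise
            time.sleep(backoff)
            backoff *= 2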
Example #9
    def test(top_k=5, seed=586724699, extra_pairs=[]):
        # Nested helper from the original source: `references`, `control_images`,
        # `num_extra_images`, `prefix`, `size`, `exact`, `mod_fn`, `metrics`, and
        # `bucket` are captured from the enclosing scope.
        im_data = imagenet.ImageNetData()
        np.random.seed(seed)
        images = np.random.choice(references,
                                  control_images + num_extra_images,
                                  replace=False)
        extra_images = images[control_images:]
        images = images[:control_images]
        image_ids = []
        img_info = []
        test_dataset = []
        client = utils.get_s3_client()
        true_dict = {}
        to_featurize = []
        to_featurize_keys = []
        for im_name in images:
            im_meta, img = make_test_img(im_data,
                                         im_name,
                                         prefix=prefix,
                                         size=size,
                                         exact=exact)
            true_dict[im_meta['id_ours']] = im_name
            img_info.append(im_meta)
            img_orig = img
            if not exact:
                img = mod_fn(img)
                img = resize(img, (256, 256), preserve_range=True)
            else:
                im_bytes = img
                img = imageio.imread(img)
            if 'fc7' in metrics:
                key_name = os.path.join("imagenet2candidates_featurized",
                                        f"{im_meta['id_ours']}.npy")
                im_resize = resize(img_orig, (224, 224), preserve_range=True)
                to_featurize.append(im_resize.astype('float32'))
                to_featurize_keys.append(key_name)
            bio = io.BytesIO()
            if not exact:
                imageio.imwrite(uri=bio, im=img, format="jpg", quality=100)
                bstream = bio.getvalue()

            else:
                print("Exact bytes..")
                bstream = im_bytes
            key = "imagenet2candidates_scaled/{0}.jpg".format(
                im_meta['id_ours'])
            print("uploading.. to {0}".format(key))
            client.put_object(Bucket=bucket, Key=key, Body=bstream)
        if len(to_featurize) > 0:
            to_featurize = np.stack(to_featurize, axis=0)
            batch_size = min(len(to_featurize), 32)
            features = featurize.vgg16_features(to_featurize,
                                                batch_size=batch_size,
                                                use_gpu=False)
            for i, f in enumerate(features):
                key_name = to_featurize_keys[i]
                bio = io.BytesIO()
                np.save(bio, f)
                print("writing features key {0}".format(key_name))
                bstream = bio.getvalue()
                print("feature hash ", hashlib.sha1(bstream).hexdigest())
                client.put_object(Key=key_name, Bucket=bucket, Body=bstream)

        with open(
                "../data/search_results/test_{0}_results.json".format(prefix),
                "w+") as f:
            f.write(json.dumps(img_info))
        candidates = [x['id_ours'] for x in img_info]
        extra_images = list(extra_images)
        print("extra pairs", extra_pairs)
        print("len extra_images", len(extra_images))
        for e, v in extra_pairs:
            true_dict[e] = v
            candidates.append(e)
            extra_images.append(v)
            print("len after append extra_images", len(extra_images))

        for e in extra_images:
            true_dict[e] = e

        for e in images:
            true_dict[e] = e

        reference_names = list(images) + list(extra_images)
        print(
            f"running near duplicate check on {candidates} vs {reference_names}"
        )
        print(f"num references {len(references)}")
        res, t_info = near_duplicate_checker.get_near_duplicates(
            candidates,
            reference_names,
            top_k=top_k,
            dssim_window_size=35,
            use_pywren=False,
            ref_chunk_size=100,
            cd_chunk_size=100,
            distance_metrics=metrics)
        for m, val in res.items():
            for k, v in val.items():
                true_answer = true_dict[k]
                result = v[0][0]
                if true_answer != result:
                    print(m, val, k, v)
                print(f"expected {true_answer}, got {result}")
                print(v)
                assert true_answer == result
                print("Passed NDC for metric {0} for test {1}".format(
                    m, prefix))
                if exact:
                    if not np.isclose(v[0][1], 0):
                        print(m, val, k, v)
                    assert np.isclose(v[0][1], 0)
        return res