Example #1
import sys

import imageio
import numpy as np
import skimage.transform

import utils

# vgg16_features and write_test_output are defined elsewhere in this module.


def featurize_test(test_keys, batch_size=64):
    images = []
    for img_name in test_keys:
        try:
            file_bytes = utils.get_s3_file_bytes(img_name, verbose=False)
            image = imageio.imread(file_bytes)
            if len(image.shape) == 3:
                if image.shape[2] == 4:
                    print('Removing alpha channel for image', img_name)
                    image = image[:, :, :3]
            elif len(image.shape) == 2:
                # Replicate a grayscale image across three channels.
                image = np.stack((image, image, image), axis=2)
            if image.size != 196608:  # expected 256 * 256 * 3 input
                print(img_name)
                raise ValueError('Unexpected size for image {}'.format(img_name))
            image = skimage.transform.resize(image, (224, 224),
                                             preserve_range=True)
        except Exception:
            print('Exception: ' + str(img_name) + str(sys.exc_info()[0]))
            raise
        images.append(image)
    images = np.stack(images, axis=0)
    print('Beginning featurization')
    features = vgg16_features(images, batch_size=batch_size)
    write_test_output(test_keys, features)
    return features
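
All of the examples on this page call the same helper, so for orientation here
is a minimal sketch of what utils.get_s3_file_bytes plausibly does. The
parameter names come from the call sites in these examples; the boto3-based
body, the cache layout, and the defaults are assumptions, not the repo's
actual implementation.

import pathlib

import boto3


def get_s3_file_bytes(key,
                      bucket='imagenet2datav2',
                      cache_on_local_disk=True,
                      cache_root_path=None,
                      verbose=True,
                      num_replicas=1):
    # Sketch only: serve the object from a local cache when possible,
    # otherwise download it from S3 (num_replicas handling is omitted).
    cached = None
    if cache_on_local_disk and cache_root_path is not None:
        cached = pathlib.Path(cache_root_path) / bucket / key
        if cached.is_file():
            if verbose:
                print('Reading {} from the local cache'.format(key))
            return cached.read_bytes()
    if verbose:
        print('Downloading s3://{}/{}'.format(bucket, key))
    body = boto3.client('s3').get_object(Bucket=bucket, Key=key)['Body'].read()
    if cached is not None:
        cached.parent.mkdir(parents=True, exist_ok=True)
        cached.write_bytes(body)
    return body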
Example #2
    def get_s3_file_bytes(self, remote_filename, verbose=True, num_replicas=1):
        return utils.get_s3_file_bytes(remote_filename,
                                       bucket=self.bucket,
                                       cache_on_local_disk=self.cache_on_local_disk,
                                       cache_root_path=self.cache_root_path,
                                       verbose=verbose,
                                       num_replicas=num_replicas)
Example #3
    def __init__(self,
                 *,
                 imgnet=None,
                 candidates=None,
                 mturk_data=None,
                 load_review_thresholds=False,
                 verbose=True,
                 bucket='imagenet2datav2'):
        assert imgnet is not None
        self.imgnet = imgnet
        self.imagenet_filenames = set(imgnet.get_all_image_names())
        assert candidates is not None
        assert not candidates.blacklist_excluded
        self.cds = candidates
        assert mturk_data is not None
        self.mturk = mturk_data

        review_data_filepath = (
            pathlib.Path(__file__).parent /
            '../data/metadata/nearest_neighbor_reviews_v2.json').resolve()
        with open(review_data_filepath, 'r') as f:
            self.review_data = json.load(f)
        check_review_data(self.review_data,
                          self.cds,
                          self.imgnet,
                          imgnet_filenames=self.imagenet_filenames)
        if verbose:
            print('Loaded review data from {}'.format(review_data_filepath))
            print('    Review data covers {} candidates'.format(
                len(self.review_data)))

        ndc_resolution_override_filepath = (
            pathlib.Path(__file__).parent /
            '../data/metadata/near_duplicate_resolution_override.json'
        ).resolve()
        with open(ndc_resolution_override_filepath, 'r') as f:
            self.ndc_resolution_override_set = set(json.load(f))
        if verbose:
            print('Loaded near duplicate resolution override data from {}'.
                  format(ndc_resolution_override_filepath))
            print('    {} resolution overrides'.format(
                len(self.ndc_resolution_override_set)))

        if load_review_thresholds:
            key = 'review_thresholds/data_2018-12-13_06-33-26_UTC.pickle'
            pickle_bytes = utils.get_s3_file_bytes(key, verbose=verbose)
            pickle_dict = pickle.loads(pickle_bytes)
            data_source = 's3://' + bucket + '/' + key
            self.review_threshold = pickle_dict['review_thresholds']
            if verbose:
                print(f'Loaded review thresholds from {data_source}')

        self.reload_near_duplicate_data(verbose)
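
Construction of this class might look like the sketch below. The class name
ReviewData and the candidate / MTurk constructors are assumptions (the excerpt
shows only the __init__ body); imagenet.ImageNetData does appear elsewhere on
this page.

# Hypothetical usage; the candidate data must keep blacklisted candidates
# (the __init__ above asserts blacklist_excluded is False).
imgnet = imagenet.ImageNetData()
cds = candidate_data.CandidateData()     # assumed module and class name
mturk = mturk_data.MTurkData(live=True)  # assumed module and class name
review_data = ReviewData(imgnet=imgnet, candidates=cds, mturk_data=mturk)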
Example #4
import io
import tarfile

import utils


def flatten_tarball(tarball_name,
                    prefix,
                    bucket="imagenet2datav2",
                    verbose=False):
    tarball_bytes = utils.get_s3_file_bytes(tarball_name,
                                            cache_on_local_disk=False,
                                            verbose=verbose)
    tf = tarfile.open(fileobj=io.BytesIO(tarball_bytes))
    for member in tf.getmembers():
        if member.isfile():
            file_bytes = tf.extractfile(member).read()
            key = prefix + member.name
            utils.put_s3_object_bytes_with_backoff(file_bytes,
                                                   key,
                                                   bucket=bucket,
                                                   delay_factor=10)
    return
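
A hypothetical invocation, copying every file in a tarball into individual S3
objects under a prefix (both key names below are made up):

flatten_tarball('tarballs/val_images.tar',
                prefix='flat/val_images/',
                verbose=True)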
Example #5
import os
import urllib.request

import imagenet
import utils

imgnet = imagenet.ImageNetData(load_class_info=False)


def lookup_wnid(wnid):
    url = 'http://www.image-net.org/api/text/wordnet.synset.getwords?wnid={0}'.format(
        wnid)
    return urllib.request.urlopen(url).read().decode().strip().split('\n')
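
For example, looking up the synset words for the tench class (assuming the
image-net.org endpoint is still reachable):

print(lookup_wnid('n01440764'))  # expected: ['tench', 'Tinca tinca']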


gloss_bytes = utils.get_s3_file_bytes('metadata/gloss.txt',
                                      cache_on_local_disk=False)
gloss_string = gloss_bytes.decode('utf-8')
gloss_lines = gloss_string.split('\n')
gloss = {}
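# Each line is assumed to be 'wnid<sep>gloss': the wnid fills the first nine
# characters (e.g. 'n01440764') and a single separator character sits between
# it and the gloss text.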
for line in gloss_lines:
    wnid = line[:9]
    cur_gloss = line[10:]
    gloss[wnid] = cur_gloss

tmpci2 = []
wnids = sorted(imgnet.train_imgs_by_wnid.keys())

for ii, wnid in enumerate(wnids):
    cur_dict = {}
    cur_dict['cid'] = ii
    cur_dict['wnid'] = wnid
Example #6
    def __init__(self,
                 *,
                 live=False,
                 load_assignments=True,
                 assignment_source='s3',
                 source_filenames_to_ignore=None,
                 include_blacklisted_hits=False,
                 verbose=True,
                 cache_on_local_disk=True,
                 cache_root_path=None,
                 bucket='imagenet2datav2'):
        # Avoid a mutable default argument for the ignore list.
        if source_filenames_to_ignore is None:
            source_filenames_to_ignore = []
        self.source_filenames_to_ignore = source_filenames_to_ignore
        self.include_blacklisted_hits = include_blacklisted_hits
        self.bucket = bucket
        self.cache_on_local_disk = cache_on_local_disk

        self.live = live
        if self.cache_on_local_disk:
            if cache_root_path is None:
                self.cache_root_path = pathlib.Path(
                    __file__).parent / '../data/cache'
            else:
                self.cache_root_path = pathlib.Path(cache_root_path)
            self.cache_root_path = self.cache_root_path.resolve()
        else:
            assert cache_root_path is None
            self.cache_root_path = None

        self.hits, self.mturk_ids_to_uuid, _, _, self.blacklisted_hits = mturk_utils.load_local_hit_data(
            live=self.live,
            verbose=verbose,
            source_filenames_to_ignore=source_filenames_to_ignore,
            include_blacklisted_hits=self.include_blacklisted_hits)
        self.hits_of_image = {}
        self.hits_for_wnid = {}
        for hit in self.hits.values():
            cur_wnid = hit['wnid']
            if cur_wnid not in self.hits_for_wnid:
                self.hits_for_wnid[cur_wnid] = []
            self.hits_for_wnid[cur_wnid].append(hit)
            for img in hit['images_all']:
                if img not in self.hits_of_image:
                    self.hits_of_image[img] = []
                self.hits_of_image[img].append(hit)
        if load_assignments:
            assert assignment_source in ['s3', 'mturk']
            if assignment_source == 'mturk':
                raise NotImplementedError
            else:
                key = 'mturk_results/data_live_2018-12-13_04-59-25_UTC.pickle'
                pickle_bytes = utils.get_s3_file_bytes(key, verbose=verbose)
                pickle_dict = pickle.loads(pickle_bytes)
                data_source = 's3://' + bucket + '/' + key
                all_assignments = pickle_dict['assignments']
                self.assignments = {}
                num_assignment_hits_ignored = 0
                for uuid, assignment_dict in all_assignments.items():
                    if uuid in self.hits:
                        self.assignments[uuid] = assignment_dict
                    else:
                        num_assignment_hits_ignored += 1
                num_hits_without_assignments = 0
                for uuid in self.hits.keys():
                    if uuid not in self.assignments:
                        self.assignments[uuid] = {}
                        num_hits_without_assignments += 1
                assert len(self.assignments) == len(self.hits)
                if verbose:
                    print(
                        'Using pickled JSON data stored by {} from {} locally'.
                        format(pickle_dict['username'],
                               pickle_dict['json_dir']))
                    print('    S3 source: {}'.format(data_source))
                    print('    Ignored assignment data for {} HITs'.format(
                        num_assignment_hits_ignored))
                    print('    {} HITs do not have assignment data'.format(
                        num_hits_without_assignments))
                self.assignment_time_string = pickle_dict['time_string']
            self.num_assignments = 0
            for cur_assignments in self.assignments.values():
                self.num_assignments += len(cur_assignments)
            self.image_fraction_selected = {}
            self.image_num_assignments = {}
            for img, cur_hits in self.hits_of_image.items():
                self.image_fraction_selected[img] = {}
                self.image_num_assignments[img] = {}
                for ch in cur_hits:
                    cur_wnid = ch['wnid']
                    if cur_wnid not in self.image_fraction_selected[img]:
                        self.image_fraction_selected[img][cur_wnid] = 0
                        assert cur_wnid not in self.image_num_assignments[img]
                        self.image_num_assignments[img][cur_wnid] = 0
                    cur_assignments = self.assignments[ch['uuid']]
                    for a in cur_assignments.values():
                        if a['AssignmentStatus'] in ['Submitted', 'Approved']:
                            self.image_num_assignments[img][cur_wnid] += 1
                            if img in a['Answer']:
                                self.image_fraction_selected[img][
                                    cur_wnid] += 1
            for img, wnid_dict in self.image_fraction_selected.items():
                for wnid in wnid_dict:
                    if self.image_num_assignments[img][wnid] == 0:
                        self.image_fraction_selected[img][wnid] = 0.0
                    else:
                        self.image_fraction_selected[img][
                            wnid] /= self.image_num_assignments[img][wnid]

            self.num_valid_assignments_by_hit = {}
            for uuid, hit in self.hits.items():
                self.num_valid_assignments_by_hit[uuid] = 0
                cur_assignments = self.assignments[uuid]
                for a in cur_assignments.values():
                    if a['AssignmentStatus'] in ['Submitted', 'Approved']:
                        self.num_valid_assignments_by_hit[uuid] += 1

            self.hit_image_num_selected = {}
            self.hit_image_fraction_selected = {}
            for uuid, hit in self.hits.items():
                self.hit_image_num_selected[uuid] = {}
                self.hit_image_fraction_selected[uuid] = {}
                for img in hit['images_all']:
                    assert img not in self.hit_image_num_selected[uuid]
                    assert img not in self.hit_image_fraction_selected[uuid]
                    self.hit_image_num_selected[uuid][img] = 0
                    self.hit_image_fraction_selected[uuid][img] = 0
                    cur_assignments = self.assignments[uuid]
                    for a in cur_assignments.values():
                        if (a['AssignmentStatus'] in ['Submitted', 'Approved']
                                and img in a['Answer']):
                            self.hit_image_num_selected[uuid][img] += 1
            for uuid, imgs in self.hit_image_num_selected.items():
                for img in imgs.keys():
                    if self.num_valid_assignments_by_hit[uuid] == 0:
                        self.hit_image_fraction_selected[uuid][img] = 0.0
                    else:
                        self.hit_image_fraction_selected[uuid][
                            img] = self.hit_image_num_selected[uuid][
                                img] / self.num_valid_assignments_by_hit[uuid]

            self.hits_by_worker = {}
            for uuid, cur_assignments in self.assignments.items():
                for ca in cur_assignments.values():
                    cur_worker = ca['WorkerId']
                    if cur_worker not in self.hits_by_worker:
                        self.hits_by_worker[cur_worker] = []
                    self.hits_by_worker[cur_worker].append(self.hits[uuid])

            self.assignments_flat = {}
            for cur_assignments in self.assignments.values():
                for ca_id, ca in cur_assignments.items():
                    assert ca_id not in self.assignments_flat
                    self.assignments_flat[ca_id] = ca
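
Construction might look like the sketch below; the class name MTurkData is an
assumption, since the excerpt starts at __init__.

mturk = MTurkData(live=True, load_assignments=True)  # assumed class name
print(len(mturk.hits), 'HITs /', mturk.num_assignments, 'assignments')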
Example #7
import pickle

import mturk_utils
import utils

# `live` is assumed to be defined earlier in the script (e.g. a flag choosing
# between the MTurk sandbox and the live marketplace).

bucket = 'imagenet2datav2'

print('Running consistency check:')
num_errors, num_warnings, local_hit_ids_missing_remotely = mturk_utils.mturk_vs_local_consistency_check(
    live=live)
assert num_errors == 0
assert num_warnings == len(local_hit_ids_missing_remotely)

# TODO: handle the blacklist correctly (do not include in the HITs)
hits, mturk_ids_to_uuid, json_dir, json_filenames, blacklisted_hits = mturk_utils.load_local_hit_data(
    live=live, verbose=True, include_blacklisted_hits=True)
client = mturk_utils.get_mturk_client(live=live)

backup_s3_key = 'mturk_results/data_live_2018-12-04_17-24-42_UTC.pickle'
backup_bytes = utils.get_s3_file_bytes(backup_s3_key, verbose=True)
backup_data = pickle.loads(backup_bytes)

backup_assignments = {}
for hit_id in local_hit_ids_missing_remotely:
    cur_uuid = mturk_ids_to_uuid[hit_id]
    backup_assignments[cur_uuid] = backup_data['assignments'][cur_uuid]
print(
    f'Took assignment data for {len(backup_assignments)} HITs from the backup {backup_s3_key}'
)

assignments = mturk_utils.get_all_hit_assignments(
    live=live,
    hit_source='local',
    local_mturk_ids_to_uuid=mturk_ids_to_uuid,
    verbose=True,