import sys

import imageio
import numpy as np
import skimage.transform

import utils


def featurize_test(test_keys, batch_size=64):
    images = []
    for img_name in test_keys:
        try:
            file_bytes = utils.get_s3_file_bytes(img_name, verbose=False)
            image = imageio.imread(file_bytes)
            if len(image.shape) == 3:
                if image.shape[2] == 4:
                    print('Removing alpha channel for image', img_name)
                    image = image[:, :, :3]
            elif len(image.shape) == 2:
                # Replicate grayscale images into three channels.
                image = np.stack((image, image, image), axis=2)
            if image.size != 196608:  # expect 256 x 256 x 3 pixels
                print(img_name)
                raise ValueError('Unexpected image size for ' + img_name)
            image = skimage.transform.resize(image, (224, 224),
                                             preserve_range=True)
        except:
            print('Exception: ' + str(img_name) + str(sys.exc_info()[0]))
            raise
        images.append(image)
    images = np.stack(images, axis=0)
    print('Beginning featurization')
    features = vgg16_features(images, batch_size=batch_size)
    write_test_output(test_keys, features)
    return features
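# Usage sketch (hedged): the keys below are hypothetical placeholders; real
# keys come from the project's test-set listing on S3, and vgg16_features /
# write_test_output are assumed to be defined elsewhere in this module.
example_keys = ['test_images/img_0001.jpg', 'test_images/img_0002.jpg']
example_features = featurize_test(example_keys, batch_size=32)
print('Feature matrix shape:', example_features.shape)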
def get_s3_file_bytes(self, remote_filename, verbose=True, num_replicas=1):
    return utils.get_s3_file_bytes(remote_filename,
                                   bucket=self.bucket,
                                   cache_on_local_disk=self.cache_on_local_disk,
                                   cache_root_path=self.cache_root_path,
                                   verbose=verbose,
                                   num_replicas=num_replicas)
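# Usage sketch (hedged): assuming this method sits on a data-access class
# whose instance is called imgnet here, callers fetch raw object bytes
# without repeating the bucket and cache arguments. The key is hypothetical.
raw_bytes = imgnet.get_s3_file_bytes('metadata/some_file.json', verbose=False)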
def __init__(self, *, imgnet=None, candidates=None, mturk_data=None,
             load_review_thresholds=False, verbose=True,
             bucket='imagenet2datav2'):
    assert imgnet is not None
    self.imgnet = imgnet
    self.imagenet_filenames = set(imgnet.get_all_image_names())
    assert candidates is not None
    assert not candidates.blacklist_excluded
    self.cds = candidates
    assert mturk_data is not None
    self.mturk = mturk_data

    review_data_filepath = (
        pathlib.Path(__file__).parent /
        '../data/metadata/nearest_neighbor_reviews_v2.json').resolve()
    with open(review_data_filepath, 'r') as f:
        self.review_data = json.load(f)
    check_review_data(self.review_data, self.cds, self.imgnet,
                      imgnet_filenames=self.imagenet_filenames)
    if verbose:
        print('Loaded review data from {}'.format(review_data_filepath))
        print('    Review info for {} candidates'.format(
            len(self.review_data)))

    ndc_resolution_override_filepath = (
        pathlib.Path(__file__).parent /
        '../data/metadata/near_duplicate_resolution_override.json').resolve()
    with open(ndc_resolution_override_filepath, 'r') as f:
        self.ndc_resolution_override_set = set(json.load(f))
    if verbose:
        print('Loaded near duplicate resolution override data from {}'.format(
            ndc_resolution_override_filepath))
        print('    {} resolution overrides'.format(
            len(self.ndc_resolution_override_set)))

    if load_review_thresholds:
        key = 'review_thresholds/data_2018-12-13_06-33-26_UTC.pickle'
        pickle_bytes = utils.get_s3_file_bytes(key, verbose=verbose)
        pickle_dict = pickle.loads(pickle_bytes)
        data_source = 's3://' + bucket + '/' + key
        self.review_threshold = pickle_dict['review_thresholds']
        print(f'Loaded review thresholds from {data_source}')

    self.reload_near_duplicate_data(verbose)
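# Construction sketch (hedged): this __init__ appears to belong to the
# near-duplicate review class; the class name NearDuplicateData and the
# upstream loader classes below are assumptions about the surrounding code.
imgnet = imagenet.ImageNetData()
cds = candidate_data.CandidateData()  # must not exclude blacklisted candidates
mturk = mturk_data.MTurkData(live=True, load_assignments=True)
ndd = NearDuplicateData(imgnet=imgnet, candidates=cds, mturk_data=mturk,
                        load_review_thresholds=True)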
import io
import tarfile

import utils


def flatten_tarball(tarball_name, prefix, bucket="imagenet2datav2",
                    verbose=False):
    # Download the tarball from S3 and re-upload every regular file it
    # contains as an individual object under the given key prefix.
    tarball_bytes = utils.get_s3_file_bytes(tarball_name,
                                            cache_on_local_disk=False,
                                            verbose=verbose)
    tf = tarfile.open(fileobj=io.BytesIO(tarball_bytes))
    for member in tf.getmembers():
        if member.isfile():
            file_bytes = tf.extractfile(member).read()
            key = prefix + member.name
            utils.put_s3_object_bytes_with_backoff(file_bytes, key,
                                                   bucket=bucket,
                                                   delay_factor=10)
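# Example invocation (hedged): the tarball key and target prefix are
# hypothetical placeholders. Since prefix is concatenated directly with each
# member name, it should normally end with '/'.
flatten_tarball('tarballs/candidates.tar', prefix='flat/candidates/',
                verbose=True)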
import os
import urllib.request

import imagenet
import utils

imgnet = imagenet.ImageNetData(load_class_info=False)


def lookup_wnid(wnid):
    # Query the ImageNet web API for the words belonging to a synset.
    url = 'http://www.image-net.org/api/text/wordnet.synset.getwords?wnid={0}'.format(
        wnid)
    return urllib.request.urlopen(url).read().decode().strip().split('\n')


# Parse gloss.txt: each line starts with a nine-character wnid, followed by
# a separator character and the gloss text.
gloss_bytes = utils.get_s3_file_bytes('metadata/gloss.txt',
                                      cache_on_local_disk=False)
gloss_string = gloss_bytes.decode('utf-8')
gloss_lines = gloss_string.split('\n')
gloss = {}
for line in gloss_lines:
    if not line:
        continue  # skip a possible empty trailing line
    wnid = line[:9]
    cur_gloss = line[10:]
    gloss[wnid] = cur_gloss

tmpci2 = []
wnids = sorted(imgnet.train_imgs_by_wnid.keys())
for ii, wnid in enumerate(wnids):
    cur_dict = {}
    cur_dict['cid'] = ii
    cur_dict['wnid'] = wnid
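# Usage sketch (hedged): n01440764 is the standard ImageNet synset id for
# 'tench'; gloss maps each wnid to its definition string and lookup_wnid
# fetches the synset's words from the ImageNet web API.
print(gloss.get('n01440764', '<wnid not found in gloss.txt>'))
print(lookup_wnid('n01440764'))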
def __init__(self, *, live=False, load_assignments=True,
             assignment_source='s3', source_filenames_to_ignore=[],
             include_blacklisted_hits=False, verbose=True,
             cache_on_local_disk=True, cache_root_path=None,
             bucket='imagenet2datav2'):
    self.source_filenames_to_ignore = source_filenames_to_ignore
    self.include_blacklisted_hits = include_blacklisted_hits
    self.bucket = bucket
    self.cache_on_local_disk = cache_on_local_disk
    self.live = live
    if self.cache_on_local_disk:
        if cache_root_path is None:
            self.cache_root_path = (pathlib.Path(__file__).parent /
                                    '../data/cache')
        else:
            self.cache_root_path = pathlib.Path(cache_root_path)
        self.cache_root_path = self.cache_root_path.resolve()
    else:
        assert cache_root_path is None
        self.cache_root_path = None

    self.hits, self.mturk_ids_to_uuid, _, _, self.blacklisted_hits = \
        mturk_utils.load_local_hit_data(
            live=self.live,
            verbose=verbose,
            source_filenames_to_ignore=source_filenames_to_ignore,
            include_blacklisted_hits=self.include_blacklisted_hits)

    # Index the HITs by wnid and by image.
    self.hits_of_image = {}
    self.hits_for_wnid = {}
    for hit in self.hits.values():
        cur_wnid = hit['wnid']
        if cur_wnid not in self.hits_for_wnid:
            self.hits_for_wnid[cur_wnid] = []
        self.hits_for_wnid[cur_wnid].append(hit)
        for img in hit['images_all']:
            if img not in self.hits_of_image:
                self.hits_of_image[img] = []
            self.hits_of_image[img].append(hit)

    if load_assignments:
        assert assignment_source in ['s3', 'mturk']
        if assignment_source == 'mturk':
            raise NotImplementedError
        else:
            key = 'mturk_results/data_live_2018-12-13_04-59-25_UTC.pickle'
            pickle_bytes = utils.get_s3_file_bytes(key, verbose=verbose)
            pickle_dict = pickle.loads(pickle_bytes)
            data_source = 's3://' + bucket + '/' + key
            all_assignments = pickle_dict['assignments']
            # Keep only assignment data for HITs we know about locally.
            self.assignments = {}
            num_assignment_hits_ignored = 0
            for uuid, assignment_dict in all_assignments.items():
                if uuid in self.hits:
                    self.assignments[uuid] = assignment_dict
                else:
                    num_assignment_hits_ignored += 1
            num_hits_without_assignments = 0
            for uuid in self.hits.keys():
                if uuid not in self.assignments:
                    self.assignments[uuid] = {}
                    num_hits_without_assignments += 1
            assert len(self.assignments) == len(self.hits)
            if verbose:
                print('Using pickled JSON data stored by {} from {} locally'.format(
                    pickle_dict['username'], pickle_dict['json_dir']))
                print('    S3 source: {}'.format(data_source))
                print('    Ignored assignment data for {} HITs'.format(
                    num_assignment_hits_ignored))
                print('    {} HITs do not have assignment data'.format(
                    num_hits_without_assignments))
            self.assignment_time_string = pickle_dict['time_string']

        self.num_assignments = 0
        for cur_assignments in self.assignments.values():
            self.num_assignments += len(cur_assignments)

        # Per image and wnid: how many valid assignments saw the image and
        # what fraction of them selected it.
        self.image_fraction_selected = {}
        self.image_num_assignments = {}
        for img, cur_hits in self.hits_of_image.items():
            self.image_fraction_selected[img] = {}
            self.image_num_assignments[img] = {}
            for ch in cur_hits:
                cur_wnid = ch['wnid']
                if cur_wnid not in self.image_fraction_selected[img]:
                    self.image_fraction_selected[img][cur_wnid] = 0
                    assert cur_wnid not in self.image_num_assignments[img]
                    self.image_num_assignments[img][cur_wnid] = 0
                cur_assignments = self.assignments[ch['uuid']]
                for a in cur_assignments.values():
                    if a['AssignmentStatus'] in ['Submitted', 'Approved']:
                        self.image_num_assignments[img][cur_wnid] += 1
                        if img in a['Answer']:
                            self.image_fraction_selected[img][cur_wnid] += 1
        for img, wnid_dict in self.image_fraction_selected.items():
            for wnid in wnid_dict:
                if self.image_num_assignments[img][wnid] == 0:
                    self.image_fraction_selected[img][wnid] = 0.0
                else:
                    self.image_fraction_selected[img][wnid] /= \
                        self.image_num_assignments[img][wnid]

        self.num_valid_assignments_by_hit = {}
        for uuid, hit in self.hits.items():
            self.num_valid_assignments_by_hit[uuid] = 0
            cur_assignments = self.assignments[uuid]
            for a in cur_assignments.values():
                if a['AssignmentStatus'] in ['Submitted', 'Approved']:
                    self.num_valid_assignments_by_hit[uuid] += 1

        # Per HIT and image: how often the image was selected.
        self.hit_image_num_selected = {}
        self.hit_image_fraction_selected = {}
        for uuid, hit in self.hits.items():
            self.hit_image_num_selected[uuid] = {}
            self.hit_image_fraction_selected[uuid] = {}
            for img in hit['images_all']:
                assert img not in self.hit_image_num_selected[uuid]
                assert img not in self.hit_image_fraction_selected[uuid]
                self.hit_image_num_selected[uuid][img] = 0
                self.hit_image_fraction_selected[uuid][img] = 0
                cur_assignments = self.assignments[uuid]
                for a in cur_assignments.values():
                    if (a['AssignmentStatus'] in ['Submitted', 'Approved']
                            and img in a['Answer']):
                        self.hit_image_num_selected[uuid][img] += 1
        for uuid, imgs in self.hit_image_num_selected.items():
            for img in imgs.keys():
                if self.num_valid_assignments_by_hit[uuid] == 0:
                    self.hit_image_fraction_selected[uuid][img] = 0.0
                else:
                    self.hit_image_fraction_selected[uuid][img] = \
                        self.hit_image_num_selected[uuid][img] / \
                        self.num_valid_assignments_by_hit[uuid]

        self.hits_by_worker = {}
        for uuid, cur_assignments in self.assignments.items():
            for ca in cur_assignments.values():
                cur_worker = ca['WorkerId']
                if cur_worker not in self.hits_by_worker:
                    self.hits_by_worker[cur_worker] = []
                self.hits_by_worker[cur_worker].append(self.hits[uuid])

        # Flat view of all assignments, keyed by assignment id.
        self.assignments_flat = {}
        for cur_assignments in self.assignments.values():
            for ca_id, ca in cur_assignments.items():
                assert ca_id not in self.assignments_flat
                self.assignments_flat[ca_id] = ca
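# Usage sketch (hedged): the class name MTurkData and the image/wnid keys
# below are assumptions; they illustrate how the aggregates built above are
# read after construction.
mturk = MTurkData(live=True, load_assignments=True, assignment_source='s3')
frac = mturk.image_fraction_selected['example_image.jpg']['n01440764']
print('Fraction of valid assignments selecting the image:', frac)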
import pickle

import mturk_utils
import utils

# 'live' is assumed to be defined earlier in this script.
bucket = 'imagenet2datav2'

print('Running consistency check:')
num_errors, num_warnings, local_hit_ids_missing_remotely = \
    mturk_utils.mturk_vs_local_consistency_check(live=live)
assert num_errors == 0
# Every warning should correspond to a local HIT that is missing remotely.
assert num_warnings == len(local_hit_ids_missing_remotely)

# TODO: handle the blacklist correctly (do not include in the HITs)
hits, mturk_ids_to_uuid, json_dir, json_filenames, blacklisted_hits = \
    mturk_utils.load_local_hit_data(live=live, verbose=True,
                                    include_blacklisted_hits=True)
client = mturk_utils.get_mturk_client(live=live)

# Recover assignment data for the locally known HITs that MTurk no longer
# reports, using an earlier backup pickle on S3.
backup_s3_key = 'mturk_results/data_live_2018-12-04_17-24-42_UTC.pickle'
backup_bytes = utils.get_s3_file_bytes(backup_s3_key, verbose=True)
backup_data = pickle.loads(backup_bytes)
backup_assignments = {}
for hit_id in local_hit_ids_missing_remotely:
    cur_uuid = mturk_ids_to_uuid[hit_id]
    backup_assignments[cur_uuid] = backup_data['assignments'][cur_uuid]
print(f'Took assignment data for {len(backup_assignments)} HITs '
      f'from the backup {backup_s3_key}')

assignments = mturk_utils.get_all_hit_assignments(
    live=live,
    hit_source='local',
    local_mturk_ids_to_uuid=mturk_ids_to_uuid,
    verbose=True)