def main():
    imgnt = imagenet.ImageNetData()
    wnids = list(imgnt.train_imgs_by_wnid.keys())
    # Train tarball names are prepared as well, but this run only flattens
    # the validation tarballs.
    train_tarball_names = get_tarball_names(wnids, 'imagenet-train/')
    val_tarball_names = get_tarball_names(wnids, 'imagenet-validation/val-')

    def flatten_train_tarball(tarball_name):
        return flatten_tarball(tarball_name, prefix="imagenet-train-individual/")

    def flatten_val_tarball(tarball_name):
        return flatten_tarball(tarball_name, prefix="imagenet-validation-individual/")

    pwex = pywren.default_executor()
    futures = pwex.map(flatten_val_tarball, val_tarball_names)
    failed_wnids = []
    for future, wnid in zip(futures, wnids):
        try:
            future.result()
        except Exception:
            failed_wnids.append(wnid)
            print('wnid failed', wnid)
    print(failed_wnids)
    results = pywren.get_all_results(futures)
def generate_top_k_wnids_json():
    top_k = 21
    imgnet = imagenet.ImageNetData()
    wnids = list(imgnet.train_imgs_by_wnid.keys())
    result = get_all_top_k_wnids(wnids, top_k)
    with open('../data/metadata/wnid_to_most_similar_wnids.json', 'w') as fp:
        json.dump(result, fp, indent=2)
def featurize_test_images(bucket, prefix, batch_size):
    imgnt = imagenet.ImageNetData()
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    start = timer()
    num_batches = 0
    for k in imgnt.test_filenames:
        key_name = os.path.join(prefix, f"{k}.npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = imgnt.load_image(k, size='scaled_256', force_rgb=True, verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            if len(to_featurize) >= batch_size:
                num_batches += 1
                featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                           batch_size, bucket, prefix, client)
                end = timer()
                print('processing batch {} (size {}) took {} seconds'.format(
                    num_batches, len(to_featurize), end - start))
                start = timer()
                to_featurize = []
                to_featurize_keys = []
    if len(to_featurize) > 0:
        featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                   batch_size, bucket, prefix, client)
def above_threshold(dataset_size, selection_frequency_threshold, min_num_annotations,
                    seed, output_filename, starting_from, wnid_thresholds_filename):
    output_filepath = pathlib.Path(__file__).parent / '../data/datasets' / output_filename
    output_filepath = output_filepath.resolve()
    assert not output_filepath.is_file()
    if starting_from is not None:
        starting_from_filepath = pathlib.Path(__file__).parent / '../data/datasets' / starting_from
        starting_from_filepath = starting_from_filepath.resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)
    else:
        # Without this branch, starting_from_loaded would be undefined below.
        starting_from_loaded = None
    if wnid_thresholds_filename is not None:
        wnid_thresholds_filepath = pathlib.Path(wnid_thresholds_filename)
        wnid_thresholds_filepath = wnid_thresholds_filepath.resolve()
        assert wnid_thresholds_filepath.is_file()
        with open(wnid_thresholds_filepath, 'r') as f:
            wnid_thresholds = json.load(f)
    else:
        wnid_thresholds = None
    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}
    success, result, sampling_candidates, exclusions, carried_over_from_prev = dataset_sampling.sample_above_threshold(
        dataset_size=dataset_size,
        selection_frequency_threshold=selection_frequency_threshold,
        min_num_annotations=min_num_annotations,
        near_duplicate_review_targets=review_targets,
        seed=seed,
        starting_from=starting_from_loaded,
        wnid_thresholds=wnid_thresholds)
    if not success:
        imgnet = imagenet.ImageNetData()
        num_per_class = dataset_size // 1000
        print('Failed to sample a valid dataset.')
        print('The following wnids have fewer than {} candidates above threshold {} with at least {} annotations'
              .format(num_per_class, selection_frequency_threshold, min_num_annotations))
        for wnid, cur_candidates in sampling_candidates.items():
            if len(cur_candidates) < num_per_class - len(carried_over_from_prev[wnid]):
                print('    {}: {} sampling candidates, plus {} carried over from the previous dataset ({})'
                      .format(wnid, len(cur_candidates),
                              len(carried_over_from_prev[wnid]),
                              ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for reason, excluded_candidates in exclusions[wnid].items():
                    print('        {}: {} candidates'.format(reason, len(excluded_candidates)))
    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('Wrote dataset to {}'.format(output_filepath))
def generate_hit_html(hit_data, html_template_path, html_style_path, add_question_header=True):
    imagenet_data = imagenet.ImageNetData()
    with open(html_template_path, "r") as f:
        html_text = f.read()
    with open(html_style_path, "r") as f:
        style_text = f.read()
    htmls = {}
    for hit in hit_data:
        out_html = ''
        if add_question_header:
            out_html += QUESTION_HEADER
        wnid = hit["wnid"]
        class_info = imagenet_data.class_info_by_wnid[wnid]
        synset = class_info.synset
        gloss = class_info.gloss
        wikipedia_pages = class_info.wikipedia_pages
        wikipedia_page = ", ".join(
            ['<a href="{0}">{1}</a>'.format(x, x) for x in wikipedia_pages])
        synset = " or ".join(synset)
        image_html = ''
        for i, image in enumerate(hit["images_all"]):
            # Labeled images and control images are handled identically here
            # (the original had two duplicate branches); `image` is an image id.
            encrypted_image = utils.encrypt_string_with_magic(image)
            image_decrypted = utils.decrypt_string_with_magic(encrypted_image)
            assert image_decrypted == image
            encrypted_image_quoted = quote(encrypted_image)
            s3_link = "https://s3-us-west-2.amazonaws.com/imagenet2datav2/encrypted/{0}.jpg".format(
                encrypted_image_quoted)
            html = HTML_TEMPLATE.format(img=encrypted_image, url=s3_link, checkboxnum=i)
            image_html += html
            image_html += "\n"
        html_body = html_text.format(image_data=image_html,
                                     synset=synset,
                                     gloss=gloss,
                                     wiki=wikipedia_page)
        out_html += html_body
        out_html += style_text
        if add_question_header:
            out_html += QUESTION_FOOTER
        htmls[hit["uuid"]] = out_html
    return htmls
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    # Alternative candidate source:
    # candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(
    #     imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
        i = i + 1
        print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end - start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i] + ".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
def get_similarity_sorted_wnids(q_wnid):
    """Returns a list of (wnid, dist) tuples sorted from most similar
    to least similar to the query wnid."""
    q_synset = wn.synset_from_pos_and_offset(q_wnid[0], int(q_wnid[1:]))
    imgnet = imagenet.ImageNetData()
    wnids = list(imgnet.train_imgs_by_wnid.keys())
    similarity_dict = {}
    for wnid in wnids:
        cur_synset = wn.synset_from_pos_and_offset(wnid[0], int(wnid[1:]))
        similarity_dict[wnid] = q_synset.path_similarity(cur_synset)
    sorted_wnids = [(k, similarity_dict[k]) for k in sorted(
        similarity_dict, key=similarity_dict.get, reverse=True)]
    return sorted_wnids
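# Hedged usage sketch (not part of the original module): how the
# wnid -> synset lookup above behaves. It assumes NLTK's WordNet corpus is
# installed; the offsets are illustrative. path_similarity scores two synsets
# by the shortest hypernym path between them, returning a value in (0, 1].
def _path_similarity_demo():
    from nltk.corpus import wordnet as wn
    dog = wn.synset_from_pos_and_offset('n', 2084071)  # wnid 'n02084071'
    cat = wn.synset_from_pos_and_offset('n', 2121620)  # wnid 'n02121620'
    assert dog.path_similarity(dog) == 1.0  # identical synsets score 1.0
    print(dog.path_similarity(cat))         # related nouns score below 1.0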
def main():
    with open('../data/metadata/unprocessed_wnids.json', 'r') as f:
        bad_wnids = json.load(f)
    # with open('../data/metadata/wnid_to_parent_2.json', 'r') as f:
    #     wnid_to_parent = json.load(f)
    imgnt = imagenet.ImageNetData()
    wnid_to_parent = {}
    wnids_with_additional_search_terms = []
    wnids_with_no_additional_search_terms = []
    for wnid in bad_wnids:
        if wnid not in wnid_to_parent:
            wnid_to_parent[wnid] = []
        synset = imgnt.class_info_by_wnid[wnid].synset
        gloss = imgnt.class_info_by_wnid[wnid].gloss
        cur_synset = wn.synset_from_pos_and_offset(wnid[0], int(wnid[1:]))
        gloss_list = gloss.split()
        for parent in cur_synset.hypernyms():
            inherited_hypernym = parent.hypernyms()
            for inherited_parent in inherited_hypernym:
                inherited_hypernym_list = inherited_parent.lemma_names()
                parent_list = parent.lemma_names()
                intersect = intersection(gloss_list, parent_list)
                if len(intersect) > 0:
                    wnid_to_parent[wnid].extend(intersect)
                    wnid_to_parent[wnid] = list(set(wnid_to_parent[wnid]))
                    wnids_with_additional_search_terms.append(wnid)
                    print('Wnid: ', wnid)
                    print('Synset: ', synset)
                    print('Gloss: ', gloss)
                    print('Parent: ', parent_list)
                    print("Parent's parent: ", inherited_hypernym_list)
                    print('Intersection', intersect)
                    print()
    with open('../data/metadata/wnid_to_parent_3.json', 'w') as f:
        json.dump(wnid_to_parent, f, indent=2)
    with open('../data/metadata/unprocessed_wnids_with_additional_search_terms.json', 'w') as f:
        json.dump(list(set(wnids_with_additional_search_terms)), f, indent=2)
def sample_val_dummy(dataset_size, seed):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    num_per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    dataset_images = []
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    for wnid in all_wnids:
        images_for_wnid = sorted(imgnet.val_imgs_by_wnid[wnid])
        cur_images = rng.sample(images_for_wnid, num_per_class)
        dataset_images.extend(sorted([(x, wnid) for x in cur_images]))
    assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_val_dummy'
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    return result
def sample_val_annotated(dataset_size, min_num_annotations, seed):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    num_per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    dataset_images = []
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    for wnid in all_wnids:
        valid_images_for_wnid = []
        for img in imgnet.val_imgs_by_wnid[wnid]:
            if (img in mturk.image_num_assignments
                    and wnid in mturk.image_num_assignments[img]
                    and mturk.image_num_assignments[img][wnid] >= min_num_annotations):
                valid_images_for_wnid.append(img)
        valid_images_for_wnid = sorted(valid_images_for_wnid)
        assert len(valid_images_for_wnid) >= num_per_class
        cur_images = rng.sample(valid_images_for_wnid, num_per_class)
        dataset_images.extend(sorted([(x, wnid) for x in cur_images]))
    rng.shuffle(dataset_images)
    assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_val_annotated'
    result['min_num_annotations'] = min_num_annotations
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    return result
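# Side note (illustrative, not from the original file): the samplers above
# construct a fresh random.Random(seed) instead of touching the global
# random state, so a given seed always reproduces the same dataset draw.
def _seeded_sampling_demo():
    import random
    draw_a = random.Random(0).sample(range(1000), 10)
    draw_b = random.Random(0).sample(range(1000), 10)
    assert draw_a == draw_b  # identical seeds -> identical samples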
def download_images(datasets, include_val):
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000
    for dataset in datasets.split(','):
        print(f'Downloading images for dataset {dataset} ...')
        dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (dataset + '.json')
        dataset_filepath = dataset_filepath.resolve()
        assert dataset_filepath.is_file()
        with open(dataset_filepath, 'r') as f:
            data = json.load(f)
        dataset_by_wnid = {x: [] for x in all_wnids}
        for img, wnid in data['image_filenames']:
            dataset_by_wnid[wnid].append(img)
        for cur_wnid in tqdm.tqdm(all_wnids):
            images_to_download = dataset_by_wnid[cur_wnid]
            loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)
    if include_val:
        print('Downloading all validation images ...')
        for cur_wnid in tqdm.tqdm(all_wnids):
            images_to_download = imgnet.val_imgs_by_wnid[cur_wnid]
            loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)
import json
import pathlib

import click
import tqdm

import candidate_data
import image_loader
import imagenet


def download_images():
    # Wrapped in a function so the download_images() call in the
    # __main__ guard below actually resolves.
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000
    print('Downloading all candidate images ...')
    for cur_wnid in tqdm.tqdm(all_wnids):
        images_to_download = cds.candidates_by_wnid[cur_wnid]
        images_to_download = [x['id_ours'] for x in images_to_download]
        loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)


if __name__ == "__main__":
    download_images()
def main(args):
    imgnt = imagenet.ImageNetData(verbose=False)
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json'
    ]
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    with open(args.input_filename, 'rb') as fp:
        nn_results = pickle.load(fp)
    print('Current nearest neighbor statistics')
    print_nn_stats(nn_results)
    print()
    metric_to_cd, cd_to_dist_counter = select_test_candidates(
        imgnt, cds, mturk, nn_results, args)
    print('Remaining candidate distances to compute')
    for d, num_cds in cd_to_dist_counter.items():
        print('{} cds left for metric {}'.format(num_cds, d))
    print()
    print('Computing neighbors for')
    for d in distance_metrics:
        print('{} candidates in metric {}.'.format(len(metric_to_cd[d]), d))
    print()
    for metric in args.metrics:
        if metric == 'l2' or metric == 'fc7':
            candidates = metric_to_cd[metric]
            result, _ = compute_distances_for_all_references(
                candidates, metric, imgnt, cds, mturk, args)
            if len(result) != len(candidates):
                print('WARNING: len(result) {} len(candidates) {}'.format(
                    len(result), len(candidates)))
            # assert len(result) == len(candidates)
            nn_results = save_ndc_result(result, nn_results, metric, args)
    for metric in args.metrics:
        if metric == 'dssim':
            print('Computing distances for dssim')
            candidates = metric_to_cd[metric]
            result = compute_distances_for_wnid_references(
                candidates, metric, imgnt, cds, mturk, args)
            if len(result) != len(candidates):
                print('WARNING: len(result) {} len(candidates) {}'.format(
                    len(result), len(candidates)))
            # assert len(result) == len(candidates)
            print('Saving results')
            start = timer()
            nn_results = save_ndc_result(result, nn_results, metric, args)
            end = timer()
            print('Saving the results took {} seconds'.format(end - start))
    num_candidates_left = {}
    for d in args.metrics:
        num_candidates_left[d] = cd_to_dist_counter[d]
    return num_candidates_left
def compute_nearest_neighbors(distance_measures, candidate_filenames,
                              reference_filenames, top_k, window_size,
                              cache, cache_root):
    cache_key = compute_hash(distance_measures, candidate_filenames,
                             reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"
    timing_info = {}
    if cache:
        if utils.key_exists(BUCKET, full_key):
            load_start = timer()
            ret_value = pickle.loads(utils.get_s3_object_bytes_with_backoff(full_key)[0])
            load_end = timer()
            compute_start = compute_end = timer()
            timing_info['load_start'] = load_start
            timing_info['load_end'] = load_end
            timing_info['compute_start'] = compute_start
            timing_info['compute_end'] = compute_end
            timing_info['cached'] = True
            return ret_value, timing_info
    imgnt = imagenet.ImageNetData(cache_on_local_disk=True,
                                  verbose=False,
                                  cache_root_path='/tmp/imagenet2_cache')
    cds = candidate_data.CandidateData(cache_on_local_disk=True,
                                       load_metadata_from_s3=True,
                                       verbose=False,
                                       cache_root_path='/tmp/imagenet2_cache')
    loader = image_loader.ImageLoader(imgnt, cds,
                                      cache_on_local_disk=True,
                                      num_tries=4,
                                      cache_root_path='/tmp/imagenet2_cache')
    load_start = timer()
    if ('l2' in distance_measures) or ('dssim' in distance_measures):
        candidate_image_dict = loader.load_image_batch(candidate_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
        reference_image_dict = loader.load_image_batch(reference_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
    if 'fc7' in distance_measures:
        candidate_feature_dict = loader.load_features_batch(candidate_filenames, verbose=False)
        reference_feature_dict = loader.load_features_batch(reference_filenames, verbose=False)
    load_end = timer()
    compute_start = timer()
    result = {}
    for distance_measure in distance_measures:
        if distance_measure == 'l2':
            # 196608 = 256 * 256 * 3, the flattened size of a scaled_256 RGB image
            result['l2'] = compute_l2_distances(candidate_image_dict,
                                                reference_image_dict, 196608)
        elif distance_measure == 'dssim':
            result['dssim'] = compute_dssim_distances(candidate_image_dict,
                                                      reference_image_dict, window_size)
        elif distance_measure == 'fc7':
            # 4096 is the dimensionality of the VGG fc7 feature vector
            result['fc7'] = compute_l2_distances(candidate_feature_dict,
                                                 reference_feature_dict, 4096)
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()
    timing_info = {}
    timing_info['load_start'] = load_start
    timing_info['load_end'] = load_end
    timing_info['compute_start'] = compute_start
    timing_info['compute_end'] = compute_end
    timing_info['cached'] = False
    res = compute_top_k(result, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)
    return res, timing_info
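# compute_hash is defined elsewhere in this repo. A minimal sketch of a
# deterministic cache key over the same inputs (an assumption about its
# intent, not the repo's actual implementation) could look like this:
def _cache_key_sketch(distance_measures, candidate_filenames,
                      reference_filenames, top_k, window_size):
    import hashlib
    import json
    payload = json.dumps([sorted(distance_measures),
                          sorted(candidate_filenames),
                          sorted(reference_filenames),
                          top_k, window_size]).encode('utf-8')
    return hashlib.sha256(payload).hexdigest()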
def carry_over_reviews(dataset_filename, starting_from):
    dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / dataset_filename
    dataset_filepath = dataset_filepath.resolve()
    assert dataset_filepath.is_file()
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)
    prev_dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / starting_from
    prev_dataset_filepath = prev_dataset_filepath.resolve()
    assert prev_dataset_filepath.is_file()
    with open(prev_dataset_filepath, 'r') as f:
        prev_dataset = json.load(f)
    assert dataset['starting_from'] == prev_dataset['output_filename']
    assert starting_from.endswith('.json')
    prev_review_filename = starting_from[:-5] + '_review.json'
    prev_review_filepath = pathlib.Path(__file__).parent / '../data/dataset_reviews' / prev_review_filename
    prev_review_filepath = prev_review_filepath.resolve()
    assert prev_review_filepath.is_file()
    with open(prev_review_filepath, 'r') as f:
        prev_review = json.load(f)
    imgnet = imagenet.ImageNetData()
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000
    prev_dataset_by_wnid = {}
    dataset_by_wnid = {}
    for wnid in all_wnids:
        prev_dataset_by_wnid[wnid] = []
        dataset_by_wnid[wnid] = []
    for img, wnid in dataset['image_filenames']:
        dataset_by_wnid[wnid].append(img)
    for img, wnid in prev_dataset['image_filenames']:
        prev_dataset_by_wnid[wnid].append(img)
    new_review = {}
    for wnid in all_wnids:
        new_review[wnid] = {}
        new_review[wnid]['problematic'] = False
        if prev_review[wnid]['problematic']:
            new_review[wnid]['reviewed'] = False
        else:
            if images_are_same(dataset_by_wnid[wnid], prev_dataset_by_wnid[wnid]):
                new_review[wnid]['reviewed'] = prev_review[wnid]['reviewed']
            else:
                new_review[wnid]['reviewed'] = False
    assert dataset_filename.endswith('.json')
    new_review_filename = dataset_filename[:-5] + '_review.json'
    new_review_filepath = pathlib.Path(__file__).parent / '../data/dataset_reviews' / new_review_filename
    new_review_filepath = new_review_filepath.resolve()
    assert not new_review_filepath.is_file()
    with open(new_review_filepath, 'w') as f:
        json.dump(new_review, f, indent=2, sort_keys=True)
    print('Wrote new review data to {}'.format(new_review_filepath))
    num_reviewed = len([x for x in new_review.items() if x[1]['reviewed']])
    num_problematic = len([x for x in new_review.items() if x[1]['problematic']])
    print('    {} reviewed wnids'.format(num_reviewed))
    print('    {} problematic wnids'.format(num_problematic))
def best(dataset_size, min_num_annotations, seed, output_filename, starting_from):
    output_filepath = pathlib.Path(__file__).parent / '../data/datasets' / output_filename
    output_filepath = output_filepath.resolve()
    assert not output_filepath.is_file()
    if starting_from is not None:
        starting_from_filepath = pathlib.Path(__file__).parent / '../data/datasets' / starting_from
        starting_from_filepath = starting_from_filepath.resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)
    else:
        starting_from_loaded = None
    imgnet = imagenet.ImageNetData()
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}
    success, result, sampling_candidates, exclusions, carried_over_from_prev = dataset_sampling.sample_best(
        dataset_size=dataset_size,
        min_num_annotations=min_num_annotations,
        near_duplicate_review_targets=review_targets,
        seed=seed,
        starting_from=starting_from_loaded)
    if not success:
        num_per_class = dataset_size // 1000
        print('Failed to sample a valid dataset.')
        print('The following wnids have fewer than {} candidates with at least {} annotations'
              .format(num_per_class, min_num_annotations))
        for wnid, cur_candidates in sampling_candidates.items():
            if len(cur_candidates) < num_per_class - len(carried_over_from_prev[wnid]):
                print('    {}: {} sampling candidates, plus {} carried over from the previous dataset ({})'
                      .format(wnid, len(cur_candidates),
                              len(carried_over_from_prev[wnid]),
                              ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for reason, excluded_candidates in exclusions[wnid].items():
                    print('        {}: {} candidates'.format(reason, len(excluded_candidates)))
    avg_selection_frequency = 0.0
    for img, wnid in result['image_filenames']:
        avg_selection_frequency += mturk.image_fraction_selected[img][wnid]
    avg_selection_frequency /= len(result['image_filenames'])
    print(f'\nAverage selection frequency: {avg_selection_frequency:.2}')
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    selection_frequencies_by_wnid = {x: [] for x in all_wnids}
    for img, wnid in result['image_filenames']:
        selection_frequencies_by_wnid[wnid].append(mturk.image_fraction_selected[img][wnid])
    min_selection_frequency_by_wnid = {
        x: min(selection_frequencies_by_wnid[x]) for x in all_wnids
    }
    avg_selection_frequency_by_wnid = {
        x: statistics.mean(selection_frequencies_by_wnid[x]) for x in all_wnids
    }
    show_worst_k = 20
    print('\nwnids with the smallest minimum selection frequencies:')
    for wnid, sel_freq in sorted(min_selection_frequency_by_wnid.items(),
                                 key=lambda x: (x[1], x[0]))[:show_worst_k]:
        synset = ', '.join(imgnet.class_info_by_wnid[wnid].synset)
        print(f'    {wnid}: {sel_freq:.3f} ({synset})')
    print('\nwnids with the smallest average selection frequencies:')
    for wnid, sel_freq in sorted(avg_selection_frequency_by_wnid.items(),
                                 key=lambda x: (x[1], x[0]))[:show_worst_k]:
        synset = ', '.join(imgnet.class_info_by_wnid[wnid].synset)
        print(f'    {wnid}: {sel_freq:.3f} ({synset})')
    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('\nWrote dataset to {}'.format(output_filepath))
def main(args):
    imgnt = imagenet.ImageNetData()
    with open(args.flickr_api_key_filename, 'r') as f:
        flickr_api_keys = json.load(f)
    api_key = flickr_api_keys[0]
    api_secret = flickr_api_keys[1]
    with open(args.wnids, 'r') as f:
        wnids = json.load(f)
    print('processing {} wnids'.format(len(wnids)))
    if not args.parallel:
        all_results = []
        for wnid in wnids:
            print("Flickr search for wnid {}".format(wnid))
            res = flickr_search_synset(imgnt, [wnid], api_key, api_secret, args)
            all_results += res
    else:
        pywren_config = wc.default()
        pywren_config["runtime"]["s3_bucket"] = "imagenet2datav2"
        pywren_config["runtime"]["s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2.tar.gz"
        pwex = pywren.default_executor(config=pywren_config)
        pywren_func = lambda x: flickr_search_synset(imgnt, x, api_key, api_secret, args)
        pywren_args = list(utils.chunks(wnids, int(np.ceil(len(wnids) / args.num_serial_tasks))))
        progress_filename = ('../data/metadata/flickr_' + args.min_date_uploaded
                             + '_' + args.max_date_uploaded + '.json')
        with open(progress_filename, 'r') as fp:
            num_images_per_wnid = json.load(fp)
        # Accumulate across chunks; resetting this inside the loop would
        # silently drop the results of all but the last chunk.
        all_results = []
        for ii, lst in enumerate(pywren_args):
            print("Map {} over {} wnids".format(ii, len(lst)))
            unfinished_wnids = []
            for wnid in lst:
                if wnid not in num_images_per_wnid:
                    unfinished_wnids.append(wnid)
            print("Executing pywren call for {} wnids".format(len(unfinished_wnids)))
            futures = pwex.map(pywren_func, [[x] for x in unfinished_wnids])
            pywren.wait(futures)
            results = [f.result()[0] for f in futures]
            num_images = [f.result()[1] for f in futures]
            for jj, wnid in enumerate(unfinished_wnids):
                num_images_per_wnid[wnid] = num_images[jj]
            for res in results:
                all_results += res
            with open(progress_filename, 'w') as fp:
                json.dump(num_images_per_wnid, fp, indent=2)
    print('Got {} results'.format(len(all_results)))
    current_date = str(datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'))
    out_file = '../data/search_results/' + current_date + '_' + getpass.getuser() + '.json'
    with open(out_file, 'w+') as fp:
        json.dump(all_results, fp, indent=2)
print(f"expected {true_answer}, got {result}") print(v) assert true_answer == result print("Passed NDC for metric {0} for test {1}".format( m, prefix)) if (exact): if (not np.isclose(v[0][1], 0)): print(m, val, k, v) assert np.isclose(v[0][1], 0) return res return test if __name__ == "__main__": im_data = imagenet.ImageNetData() #image_names = im_data.get_all_val_image_names() + im_data.get_all_train_image_names() + im_data #image_names = im_data.get_all_val_image_names() + im_data.get_all_train_image_names() + im_data references = im_data.get_all_val_image_names()[:100] custom_test = make_test(references, mod_fn=lambda x: x + np.random.randn(*x.shape), metrics=['fc7'], size='scaled_256', num_extra_images=10, exact=False) custom_test(top_k=10, extra_pairs=[("n02085936_7394.JPEG", "n02085936_10397.JPEG")]) custom_test(top_k=10) references = im_data.get_all_train_image_names()[:100] custom_test = make_test(references,
import json
import os
import urllib.request

import imagenet
import utils

imgnet = imagenet.ImageNetData(load_class_info=False)


def lookup_wnid(wnid):
    url = 'http://www.image-net.org/api/text/wordnet.synset.getwords?wnid={0}'.format(wnid)
    return urllib.request.urlopen(url).read().decode().strip().split('\n')


gloss_bytes = utils.get_s3_file_bytes('metadata/gloss.txt', cache_on_local_disk=False)
gloss_string = gloss_bytes.decode('utf-8')
gloss_lines = gloss_string.split('\n')
gloss = {}
for line in gloss_lines:
    wnid = line[:9]
    cur_gloss = line[10:]
    gloss[wnid] = cur_gloss

tmpci2 = []
wnids = sorted(imgnet.train_imgs_by_wnid.keys())
for ii, wnid in enumerate(wnids):
    cur_dict = {}
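# The fixed-width slicing above assumes each gloss.txt line starts with a
# 9-character wnid followed by one separator character, e.g. (illustrative):
#   n01440764 freshwater dace-like game fish of Europe and western Asia ...
# so line[:9] yields 'n01440764' and line[10:] yields the gloss text.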
def eval(dataset, models, batch_size):
    dataset_filename = dataset
    if models == 'all':
        models = all_models
    else:
        models = models.split(',')
        for model in models:
            assert model in all_models
    dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (dataset_filename + '.json')
    print('Reading dataset from {} ...'.format(dataset_filepath))
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)
    cur_imgs = [x[0] for x in dataset['image_filenames']]
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    pbar = tqdm(total=len(cur_imgs), desc='Dataset download')
    img_data = loader.load_image_bytes_batch(cur_imgs,
                                             size='scaled_500',
                                             verbose=False,
                                             download_callback=lambda x: pbar.update(x))
    pbar.close()
    for model in tqdm(models, desc='Model evaluations'):
        if model not in extra_models:
            tqdm.write('Evaluating {}'.format(model))
            resize_size = 256
            center_crop_size = 224
            if model == 'inception_v3':
                resize_size = 299
                center_crop_size = 299
            data_loader = eval_utils.get_data_loader(cur_imgs,
                                                     imgnet,
                                                     cds,
                                                     image_size='scaled_500',
                                                     resize_size=resize_size,
                                                     center_crop_size=center_crop_size,
                                                     batch_size=batch_size)
            pt_model = getattr(torchvision.models, model)(pretrained=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()
            tqdm.write('    Number of trainable parameters: {}'.format(
                sum(p.numel() for p in pt_model.parameters() if p.requires_grad)))
            predictions, top1_acc, top5_acc, total_time, num_images = eval_utils.evaluate_model(
                pt_model, data_loader, show_progress_bar=True)
            tqdm.write('    Evaluated {} images'.format(num_images))
            tqdm.write('    Top-1 accuracy: {:.2f}'.format(100.0 * top1_acc))
            tqdm.write('    Top-5 accuracy: {:.2f}'.format(100.0 * top5_acc))
            tqdm.write('    Total time: {:.1f} (average time per image: {:.2f} ms)'.format(
                total_time, 1000.0 * total_time / num_images))
            npy_out_filepath = pathlib.Path(__file__).parent / '../data/predictions' / dataset_filename / (model + '.npy')
            npy_out_filepath = npy_out_filepath.resolve()
            directory = os.path.dirname(npy_out_filepath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            if os.path.exists(npy_out_filepath):
                old_preds = np.load(npy_out_filepath)
                np.save(f'{npy_out_filepath}.{int(time.time())}', old_preds)
                print('checking old preds is same as new preds')
                if not np.allclose(old_preds, predictions):
                    diffs = np.round(old_preds - predictions, 4)
                    print('old preds != new preds')
                else:
                    print('old preds == new_preds!')
            np.save(npy_out_filepath, predictions)
            tqdm.write('    Saved predictions to {}'.format(npy_out_filepath))
        else:
            tqdm.write('Evaluating extra model {}'.format(model))
            if model in {"dpn68b", "dpn92", "dpn107"}:
                pt_model = pretrainedmodels.__dict__[model](num_classes=1000,
                                                            pretrained='imagenet+5k')
            else:
                pt_model = pretrainedmodels.__dict__[model](num_classes=1000,
                                                            pretrained='imagenet')
            tf_img = pretrained_utils.TransformImage(pt_model)
            load_img = pretrained_utils.LoadImage()
            tqdm.write('    Number of trainable parameters: {}'.format(
                sum(p.numel() for p in pt_model.parameters() if p.requires_grad)))
            dataset = eval_utils.ImageLoaderDataset(cur_imgs, imgnet, cds,
                                                    'scaled_500', transform=tf_img)
            data_loader = torch.utils.data.DataLoader(dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      pin_memory=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()
            predictions, top1_acc, top5_acc, total_time, num_images = eval_utils.evaluate_model(
                pt_model, data_loader, show_progress_bar=True)
            tqdm.write('    Evaluated {} images'.format(num_images))
            tqdm.write('    Top-1 accuracy: {:.2f}'.format(100.0 * top1_acc))
            tqdm.write('    Top-5 accuracy: {:.2f}'.format(100.0 * top5_acc))
            tqdm.write('    Total time: {:.1f} (average time per image: {:.2f} ms)'.format(
                total_time, 1000.0 * total_time / num_images))
            npy_out_filepath = pathlib.Path(__file__).parent / '../data/predictions' / dataset_filename / (model + '.npy')
            npy_out_filepath = npy_out_filepath.resolve()
            directory = os.path.dirname(npy_out_filepath)
            if not os.path.exists(directory):
                os.makedirs(directory)
            if os.path.exists(npy_out_filepath):
                old_preds = np.load(npy_out_filepath)
                np.save(f'{npy_out_filepath}.{int(time.time())}', old_preds)
                print('checking old preds is same as new preds')
                if not np.allclose(old_preds, predictions):
                    diffs = np.round(old_preds - predictions, 4)
                    print('old preds != new preds')
                else:
                    print('old preds == new_preds!')
            np.save(npy_out_filepath, predictions)
            tqdm.write('    Saved predictions to {}'.format(npy_out_filepath))
def _generate_hits(candidates, images_per_hit=25, pos_control=0, neg_control=0, seed=0):
    '''Generates a list of dictionaries fully specifying the HITs.'''
    c_data = candidate_data.CandidateData()
    imagenet_data = imagenet.ImageNetData()
    with open("../data/metadata/wnid_to_most_similar_wnids.json") as f:
        neg_ids = json.loads(f.read())
    grouped_by_class = defaultdict(list)
    np.random.seed(seed)
    hits = []
    print("Num Candidates ", len(candidates))
    for c in candidates:
        c_json = c_data.all_candidates[c]
        c_wnid = c_json["wnid"]
        grouped_by_class[c_wnid].append(c_json)
    wiki_fail = False
    for k, v in grouped_by_class.items():
        class_info = imagenet_data.class_info_by_wnid[k]
        if len(class_info.wikipedia_pages) == 0:
            print(f"no wikipedia page for {k}")
            wiki_fail = True
        hit_lines = list(utils.chunks(v, images_per_hit - pos_control - neg_control))
        tail_len = len(hit_lines[-1])
        if (tail_len != len(hit_lines[0])
                and tail_len < (images_per_hit - pos_control - neg_control)):
            idxs = np.random.choice(len(v) - tail_len,
                                    images_per_hit - tail_len - pos_control - neg_control,
                                    replace=False)
            for i in idxs:
                hit_lines[-1].append(v[i])
        for hit_line in hit_lines:
            hit_data = {}
            hit_data["wnid"] = k
            # lists of image ids
            hit_data["images_to_label"] = []
            hit_data["images_pos_control"] = []
            hit_data["images_neg_control"] = []
            hit_data["images_all"] = []
            hit_data["user"] = getpass.getuser()
            hit_data["uuid"] = str(uuid.uuid4())
            hit_data["time"] = str(datetime.now(tzlocal()))
            hit_data["submitted"] = False
            hit_data["hit_id"] = ''
            hit_data["hit_type_id"] = ''
            val_imgs_dict = imagenet_data.val_imgs_by_wnid
            wnid = k
            pos_class = val_imgs_dict[wnid]
            pos_extra = int(np.ceil(
                (images_per_hit - pos_control - neg_control - len(hit_line)) / 2))
            neg_extra = int(np.floor(
                (images_per_hit - pos_control - neg_control - len(hit_line)) / 2))
            if len(hit_line) == images_per_hit - pos_control - neg_control:
                assert pos_extra == 0
                assert neg_extra == 0
            idxs_pos = np.random.choice(len(pos_class), pos_control + pos_extra, replace=False)
            assert wnid in neg_ids
            neg_wnid = neg_ids[wnid][1]
            neg_class = val_imgs_dict[neg_wnid]
            idxs_neg = np.random.choice(len(neg_class), neg_control + neg_extra, replace=False)
            pos_control_list = []
            neg_control_list = []
            for i in idxs_pos:
                pos_control_list.append(pos_class[i])
            for i in idxs_neg:
                neg_control_list.append(neg_class[i])
            for i, image in enumerate(hit_line):
                hit_data["images_to_label"].append(image['id_ours'])
                hit_data["images_all"].append(image['id_ours'])
            # right now this won't work
            for i, image in enumerate(pos_control_list):
                hit_data["images_pos_control"].append(image)
                hit_data["images_all"].append(image)
            for i, image in enumerate(neg_control_list):
                hit_data["images_neg_control"].append(image)
                hit_data["images_all"].append(image)
            np.random.shuffle(hit_data["images_all"])
            hits.append(hit_data)
    assert not wiki_fail
    return hits
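# _generate_hits relies on utils.chunks to split each class's candidate list
# into HIT-sized groups. A minimal sketch of such a helper (an assumption;
# the repo's utils.chunks may differ in details):
def _chunks_sketch(lst, n):
    """Yield successive n-sized chunks from lst; the last one may be shorter."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]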
def generate_opposite_class_json():
    imgnet = imagenet.ImageNetData()
    wnids = list(imgnet.train_imgs_by_wnid.keys())
    result = get_all_negative_wnids(wnids)
    with open('../data/metadata/wnid_to_farthest_wnid.json', 'w') as fp:
        json.dump(result, fp, indent=2)
def test(top_k=5, seed=586724699, extra_pairs=[]):
    im_data = imagenet.ImageNetData()
    np.random.seed(seed)
    images = np.random.choice(references, control_images + num_extra_images, replace=False)
    extra_images = images[control_images:]
    images = images[:control_images]
    image_ids = []
    img_info = []
    test_dataset = []
    client = utils.get_s3_client()
    true_dict = {}
    to_featurize = []
    to_featurize_keys = []
    for im_name in images:
        im_meta, img = make_test_img(im_data, im_name, prefix=prefix, size=size, exact=exact)
        true_dict[im_meta['id_ours']] = im_name
        img_info.append(im_meta)
        img_orig = img
        if not exact:
            img = mod_fn(img)
            img = resize(img, (256, 256), preserve_range=True)
        else:
            im_bytes = img
            img = imageio.imread(img)
        if 'fc7' in metrics:
            key_name = os.path.join("imagenet2candidates_featurized",
                                    f"{im_meta['id_ours']}.npy")
            im_resize = resize(img_orig, (224, 224), preserve_range=True)
            to_featurize.append(im_resize.astype('float32'))
            to_featurize_keys.append(key_name)
        bio = io.BytesIO()
        if not exact:
            imageio.imwrite(uri=bio, im=img, format="jpg", quality=100)
            bstream = bio.getvalue()
        else:
            print("Exact bytes..")
            bstream = im_bytes
        key = "imagenet2candidates_scaled/{0}.jpg".format(im_meta['id_ours'])
        print("uploading.. to {0}".format(key))
        client.put_object(Bucket=bucket, Key=key, Body=bstream)
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        batch_size = min(len(to_featurize), 32)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size, use_gpu=False)
        for i, f in enumerate(features):
            key_name = to_featurize_keys[i]
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing features key {0}".format(key_name))
            bstream = bio.getvalue()
            print("feature hash ", hashlib.sha1(bstream).hexdigest())
            client.put_object(Key=key_name, Bucket=bucket, Body=bstream)
    with open("../data/search_results/test_{0}_results.json".format(prefix), "w+") as f:
        f.write(json.dumps(img_info))
    candidates = [x['id_ours'] for x in img_info]
    extra_images = list(extra_images)
    print("extra pairs", extra_pairs)
    print("len extra_images", len(extra_images))
    for e, v in extra_pairs:
        true_dict[e] = v
        candidates.append(e)
        extra_images.append(v)
    print("len after append extra_images", len(extra_images))
    for e in extra_images:
        true_dict[e] = e
    for e in images:
        true_dict[e] = e
    reference_names = list(images) + list(extra_images)
    print(f"running near duplicate check on {candidates} vs {reference_names}")
    print(f"num references {len(references)}")
    res, t_info = near_duplicate_checker.get_near_duplicates(candidates,
                                                             reference_names,
                                                             top_k=top_k,
                                                             dssim_window_size=35,
                                                             use_pywren=False,
                                                             ref_chunk_size=100,
                                                             cd_chunk_size=100,
                                                             distance_metrics=metrics)
    for m, val in res.items():
        for k, v in val.items():
            true_answer = true_dict[k]
            result = v[0][0]
            if true_answer != result:
                print(m, val, k, v)
                print(f"expected {true_answer}, got {result}")
                print(v)
            assert true_answer == result
            print("Passed NDC for metric {0} for test {1}".format(m, prefix))
            if exact:
                if not np.isclose(v[0][1], 0):
                    print(m, val, k, v)
                assert np.isclose(v[0][1], 0)
    return res
def wnid_histogram(dataset_size, min_num_annotations_candidates,
                   min_num_annotations_val, min_num_val_images_per_wnid, seed,
                   output_filename, starting_from, allow_upward_sampling):
    output_filepath = pathlib.Path(__file__).parent / '../data/datasets' / output_filename
    output_filepath = output_filepath.resolve()
    assert not output_filepath.is_file()
    if starting_from is not None:
        starting_from_filepath = pathlib.Path(__file__).parent / '../data/datasets' / starting_from
        starting_from_filepath = starting_from_filepath.resolve()
        assert starting_from_filepath.is_file()
        with open(starting_from_filepath, 'r') as f:
            starting_from_loaded = json.load(f)
    else:
        starting_from_loaded = None
    review_targets = {'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4}
    histogram_bins = [0.2, 0.4, 0.6, 0.8]
    num_bins = len(histogram_bins) + 1
    success, result, results_metadata = dataset_sampling.sample_wnid_histogram(
        dataset_size=dataset_size,
        histogram_bins=histogram_bins,
        min_num_annotations_candidates=min_num_annotations_candidates,
        min_num_annotations_val=min_num_annotations_val,
        min_num_val_images_per_wnid=min_num_val_images_per_wnid,
        near_duplicate_review_targets=review_targets,
        seed=seed,
        starting_from=starting_from_loaded,
        allow_upward_sampling=allow_upward_sampling)
    imgnet = imagenet.ImageNetData()
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    if not success:
        total_num_problematic_bins = 0
        print(f'Failed to sample a valid dataset '
              f'({len(result["image_filenames"])} instead of {dataset_size} images).')
        print('The following wnid bins have insufficient images (before potential upward sampling):')
        for wnid in all_wnids:
            cur_histogram = results_metadata['wnid_histograms'][wnid]
            cur_sampling_candidates = results_metadata['sampling_candidates'][wnid]
            cur_carried_over_from_prev = results_metadata['carried_over_from_prev'][wnid]
            cur_exclusions = results_metadata['exclusions'][wnid]
            problematic_bins = []
            for cur_bin in range(num_bins):
                if cur_histogram[cur_bin] > (len(cur_sampling_candidates[cur_bin])
                                             + len(cur_carried_over_from_prev[cur_bin])):
                    problematic_bins.append(cur_bin)
            total_num_problematic_bins += len(problematic_bins)
            if len(problematic_bins) > 0:
                print('wnid {} ({})'.format(
                    wnid, ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for cur_bin in problematic_bins:
                    cur_low, cur_high = dataset_sampling.get_bin_boundaries(histogram_bins, cur_bin)
                    cur_valid = (len(cur_sampling_candidates[cur_bin])
                                 + len(cur_carried_over_from_prev[cur_bin]))
                    print('    bin ({} {}): target {}, currently have {}'.format(
                        cur_low, cur_high, cur_histogram[cur_bin], cur_valid))
                    print('        {} sampling candidates'.format(
                        len(cur_sampling_candidates[cur_bin])))
                    print('        {} carried over from previous dataset'.format(
                        len(cur_carried_over_from_prev[cur_bin])))
                    for reason, excluded_candidates in cur_exclusions[cur_bin].items():
                        print('        {}: {} excluded candidates'.format(
                            reason, len(excluded_candidates)))
                print()
        print('{} problematic bins in total'.format(total_num_problematic_bins))
    if allow_upward_sampling:
        num_upward_sampled = 0
        print('\nUpward sampled the following images:')
        for wnid in all_wnids:
            upward_sampled_for_wnid = results_metadata['upward_sampled'][wnid]
            has_upsampled_bins = False
            for cur_bin in range(num_bins):
                if len(upward_sampled_for_wnid[cur_bin]) > 0:
                    has_upsampled_bins = True
                    break
            if has_upsampled_bins:
                print('wnid {} ({})'.format(
                    wnid, ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
                for cur_bin in range(num_bins):
                    cur_upward_sampled = upward_sampled_for_wnid[cur_bin]
                    for cid, to_bin in cur_upward_sampled:
                        original_low, original_high = dataset_sampling.get_bin_boundaries(
                            histogram_bins, cur_bin)
                        to_low, to_high = dataset_sampling.get_bin_boundaries(
                            histogram_bins, to_bin)
                        print(f'    sampled {cid} belonging to bin ({original_low} {original_high}) '
                              f'from bin ({to_low} {to_high}) instead')
                        num_upward_sampled += 1
                print()
        print(f'\nUpward sampled {num_upward_sampled} images in total')
        if not success:
            print('The following wnids have insufficient images even after upward sampling:')
            num_per_class = dataset_size // 1000
            images_by_wnid = {}
            for img, wnid in result['image_filenames']:
                if wnid not in images_by_wnid:
                    images_by_wnid[wnid] = []
                images_by_wnid[wnid].append(img)
            for wnid in all_wnids:
                if len(images_by_wnid[wnid]) < num_per_class:
                    print('    wnid {}: {} / {} images ({})'.format(
                        wnid, len(images_by_wnid[wnid]), num_per_class,
                        ', '.join(imgnet.class_info_by_wnid[wnid].synset)))
    result['output_filename'] = output_filename
    with open(output_filepath, 'w') as f:
        json.dump(result, f, indent=2)
    print('Wrote dataset to {}'.format(output_filepath))
def sample_wnid_histogram(*, dataset_size, histogram_bins,
                          min_num_annotations_candidates, min_num_annotations_val,
                          min_num_val_images_per_wnid, near_duplicate_review_targets,
                          seed, starting_from=None, allow_upward_sampling=False):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    num_bins = len(histogram_bins) + 1
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    success = True
    histograms_success, wnid_histograms, usable_val_imgs_by_wnid = compute_wnid_histograms(
        imgnet=imgnet,
        mturk=mturk,
        min_num_annotations_val=min_num_annotations_val,
        min_num_val_images_per_wnid=min_num_val_images_per_wnid,
        histogram_bins=histogram_bins,
        num_per_class=num_per_class)
    if not histograms_success:
        success = False
    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from, dataset_size,
                                                    all_wnids, cds)

    def is_cid_ok(cid, wnid):
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][wnid] < min_num_annotations_candidates:
            return False, 'few_assignments'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        if cid not in ndc.review_threshold:
            sufficiently_reviewed = False
        else:
            for metric in near_duplicate_data.metric_names:
                if metric not in ndc.review_threshold[cid]:
                    sufficiently_reviewed = False
                elif ndc.review_threshold[cid][metric] <= near_duplicate_review_targets[metric]:
                    sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    carried_over_from_prev = {}
    upward_sampled = {}
    for wnid in all_wnids:
        cur_target = wnid_histograms[wnid]
        exclusions[wnid] = {}
        for x in range(num_bins):
            exclusions[wnid][x] = OrderedDict([('blacklisted', []),
                                               ('few_assignments', []),
                                               ('below_threshold', []),
                                               ('near_duplicate', []),
                                               ('unreviewed', [])])
        carried_over_from_prev[wnid] = {x: [] for x in range(num_bins)}
        sampled_images_by_bin = {x: [] for x in range(num_bins)}
        prev_by_bin = {x: [] for x in range(num_bins)}
        for cid in prev_dataset_by_wnid[wnid]:
            cur_freq = mturk.image_fraction_selected[cid][wnid]
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                prev_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)
        for cur_bin in range(num_bins):
            if len(prev_by_bin[cur_bin]) <= cur_target[cur_bin]:
                sampled_images_by_bin[cur_bin].extend(prev_by_bin[cur_bin])
                carried_over_from_prev[wnid][cur_bin].extend(prev_by_bin[cur_bin])
            else:
                cur_sample = rng.sample(prev_by_bin[cur_bin], cur_target[cur_bin])
                sampled_images_by_bin[cur_bin].extend(cur_sample)
                carried_over_from_prev[wnid][cur_bin].extend(cur_sample)
        sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        unmodified_sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            if cid in mturk.image_fraction_selected and wnid in mturk.image_fraction_selected[cid]:
                cur_freq = mturk.image_fraction_selected[cid][wnid]
            else:
                cur_freq = 0.0
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                already_used = False
                for tmp_bin in range(num_bins):
                    if cid in carried_over_from_prev[wnid][tmp_bin]:
                        already_used = True
                if not already_used:
                    sample_candidates_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)
        for cur_bin in range(num_bins):
            sample_candidates_by_bin[cur_bin] = list(sorted(sample_candidates_by_bin[cur_bin]))
            unmodified_sample_candidates_by_bin[cur_bin] = copy.deepcopy(
                sample_candidates_by_bin[cur_bin])
            num_remaining_to_sample = cur_target[cur_bin] - len(sampled_images_by_bin[cur_bin])
            if num_remaining_to_sample > len(sample_candidates_by_bin[cur_bin]):
                if not allow_upward_sampling:
                    success = False
                cur_sample = sample_candidates_by_bin[cur_bin]
                sample_candidates_by_bin[cur_bin] = []
            else:
                cur_sample = rng.sample(sample_candidates_by_bin[cur_bin],
                                        num_remaining_to_sample)
                sample_candidates_by_bin[cur_bin] = list(
                    set(sample_candidates_by_bin[cur_bin]) - set(cur_sample))
            sampled_images_by_bin[cur_bin].extend(cur_sample)
        if allow_upward_sampling:
            upward_sampled[wnid] = []
            for cur_bin in range(num_bins):
                cur_upward_sampled = []
                num_remaining_to_sample = cur_target[cur_bin] - len(sampled_images_by_bin[cur_bin])
                if num_remaining_to_sample > 0:
                    assert len(sample_candidates_by_bin[cur_bin]) == 0
                    for _ in range(num_remaining_to_sample):
                        found_bin = False
                        for next_bin in range(cur_bin + 1, num_bins):
                            if len(sample_candidates_by_bin[next_bin]) > 0:
                                sample_candidates_from_prev = (
                                    set(sample_candidates_by_bin[next_bin])
                                    & set(prev_dataset_by_wnid[wnid]))
                                if len(sample_candidates_from_prev) > 0:
                                    cur_sample = [list(sample_candidates_from_prev)[0]]
                                    print(f'    upward sampled {cur_sample[0]} from the prev dataset')
                                else:
                                    cur_sample = rng.sample(sample_candidates_by_bin[next_bin], 1)
                                    print(f'    upward sampled {cur_sample[0]} randomly')
                                assert len(cur_sample) == 1
                                sampled_images_by_bin[cur_bin].extend(cur_sample)
                                sample_candidates_by_bin[next_bin] = list(
                                    set(sample_candidates_by_bin[next_bin]) - set(cur_sample))
                                cur_upward_sampled.append((cur_sample[0], next_bin))
                                found_bin = True
                                break
                        if not found_bin:
                            success = False
                upward_sampled[wnid].append(cur_upward_sampled)
        for cur_bin in range(num_bins):
            dataset_images.extend([x, wnid] for x in sampled_images_by_bin[cur_bin])
        sampling_candidates[wnid] = unmodified_sample_candidates_by_bin
    rng.shuffle(dataset_images)
    if len(dataset_images) > dataset_size:
        print(len(dataset_images), dataset_size)
    assert len(dataset_images) <= dataset_size
    if success:
        assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_wnid_histogram'
    result['target_size'] = dataset_size
    result['histogram_bins'] = histogram_bins
    result['min_num_annotations_candidates'] = min_num_annotations_candidates
    result['min_num_annotations_val'] = min_num_annotations_val
    result['min_num_val_images_per_wnid'] = min_num_val_images_per_wnid
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    result['allow_upward_sampling'] = allow_upward_sampling
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']
    result_metadata = {}
    result_metadata['wnid_histograms'] = wnid_histograms
    result_metadata['usable_val_imgs_by_wnid'] = usable_val_imgs_by_wnid
    result_metadata['sampling_candidates'] = sampling_candidates
    result_metadata['exclusions'] = exclusions
    result_metadata['carried_over_from_prev'] = carried_over_from_prev
    result_metadata['upward_sampled'] = upward_sampled
    return success, result, result_metadata
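# get_histogram_bin and get_bin_boundaries are defined elsewhere in this
# module. A minimal sketch consistent with how they are used above (an
# assumption; with histogram_bins = [0.2, 0.4, 0.6, 0.8] there are
# len(bins) + 1 = 5 buckets):
def _get_histogram_bin_sketch(selection_frequency, histogram_bins):
    import bisect
    # bucket 0 covers [0, bins[0]); bucket i covers [bins[i-1], bins[i]);
    # bucket len(bins) covers [bins[-1], 1.0]. Interval closure is a guess.
    return bisect.bisect_right(histogram_bins, selection_frequency)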
def sample_above_threshold(*, dataset_size, selection_frequency_threshold,
                           min_num_annotations, near_duplicate_review_targets,
                           seed, starting_from=None, wnid_thresholds=None):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)

    def is_cid_ok(cid, wnid):
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][wnid] < min_num_annotations:
            return False, 'few_assignments'
        # Guard against the default wnid_thresholds=None before the
        # membership test ('wnid in None' would raise a TypeError).
        if wnid_thresholds is not None and wnid in wnid_thresholds:
            cur_threshold = wnid_thresholds[wnid]
        else:
            cur_threshold = selection_frequency_threshold
        if mturk.image_fraction_selected[cid][wnid] < cur_threshold:
            return False, 'below_threshold'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        for metric in near_duplicate_data.metric_names:
            if cid not in ndc.review_threshold or metric not in ndc.review_threshold[cid]:
                sufficiently_reviewed = False
            elif ndc.review_threshold[cid][metric] <= near_duplicate_review_targets[metric]:
                sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    if wnid_thresholds is not None:
        for wnid in wnid_thresholds.keys():
            assert wnid in all_wnids
    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from, dataset_size,
                                                    all_wnids, cds)
    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    success = True
    carried_over_from_prev = {}
    for wnid in all_wnids:
        sampling_candidates[wnid] = []
        exclusions[wnid] = OrderedDict([('blacklisted', []),
                                        ('few_assignments', []),
                                        ('below_threshold', []),
                                        ('near_duplicate', []),
                                        ('unreviewed', [])])
        carried_over_from_prev[wnid] = []
        if wnid in prev_dataset_by_wnid:
            for cid in prev_dataset_by_wnid[wnid]:
                if is_cid_ok(cid, wnid)[0]:
                    carried_over_from_prev[wnid].append(cid)
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                if cid not in carried_over_from_prev[wnid]:
                    sampling_candidates[wnid].append(cid)
            else:
                exclusions[wnid][cur_reason].append(cid)
        sampling_candidates[wnid] = list(sorted(sampling_candidates[wnid]))
        remaining_to_sample = num_per_class - len(carried_over_from_prev[wnid])
        if len(sampling_candidates[wnid]) < remaining_to_sample:
            success = False
            tmp_images = ([(x, wnid) for x in carried_over_from_prev[wnid]]
                          + [(x, wnid) for x in sampling_candidates[wnid]])
            dataset_images.extend(tmp_images)
        else:
            new_images = rng.sample(sampling_candidates[wnid], remaining_to_sample)
            tmp_images = ([(x, wnid) for x in carried_over_from_prev[wnid]]
                          + [(x, wnid) for x in new_images])
            dataset_images.extend(tmp_images)
    rng.shuffle(dataset_images)
    if success:
        assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_above_threshold'
    result['target_size'] = dataset_size
    result['selection_frequency_threshold'] = selection_frequency_threshold
    result['min_num_annotations'] = min_num_annotations
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']
    if wnid_thresholds is not None:
        result['wnid_thresholds'] = wnid_thresholds
    return success, result, sampling_candidates, exclusions, carried_over_from_prev
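# Hedged call sketch (values illustrative, not from the original file):
# this is roughly how the above_threshold CLI command drives the sampler,
# using the same near-duplicate review targets it defines.
#
# success, result, candidates, exclusions, carried_over = sample_above_threshold(
#     dataset_size=10000,
#     selection_frequency_threshold=0.7,
#     min_num_annotations=10,
#     near_duplicate_review_targets={'l2': 1.2e8, 'dssim': 0.2205, 'fc7': 1.32e4},
#     seed=0)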