def filter_by_wnids(cd_lst, num_wnids):
    # Restrict a candidate list to the first num_wnids WNIDs that occur in it.
    cd_data = candidate_data.CandidateData()
    wnids = {cd_data.all_candidates[x]['wnid'] for x in cd_lst}
    # Sort for a deterministic selection; slicing an unordered set would make
    # the chosen WNIDs depend on hash ordering.
    wnids = sorted(wnids)
    print('Filtering to the first {} WNIDs'.format(num_wnids))
    good_wnids = set(wnids[:num_wnids])
    good_cd_lst = [x for x in cd_lst
                   if cd_data.all_candidates[x]['wnid'] in good_wnids]
    return good_cd_lst
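# A minimal usage sketch (hypothetical variable values; assumes the candidate
# metadata is available locally):
#
#   all_ids = list(candidate_data.CandidateData().all_candidates.keys())
#   subset = filter_by_wnids(all_ids, num_wnids=50)
#   print('{} candidates from the first 50 WNIDs'.format(len(subset)))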
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    #candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(
    #    imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE, preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
        i = i + 1
        print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end - start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        start = timer()
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i] + ".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
        end = timer()
        print(f"Took {end - start} seconds to featurize and upload.")
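# For reference, a self-contained sketch of the in-memory .npy serialization
# used above (np.save into a BytesIO buffer, so the bytes can be uploaded to
# S3 and later recovered with np.load). Illustration only, not project code.
def _npy_bytes_roundtrip_example():
    import io
    import numpy as np
    feat = np.random.rand(4096).astype(np.float32)  # e.g. one fc7 feature vector
    bio = io.BytesIO()
    np.save(bio, feat)                              # serialize to .npy bytes
    restored = np.load(io.BytesIO(bio.getvalue()))  # what a reader would do
    assert np.array_equal(feat, restored)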
def main(args):
    mturk = mturk_data.MTurkData(live=True, verbose=True, load_assignments=True)
    cds = candidate_data.CandidateData()
    cd_filenames = list(cds.all_candidates.keys())
    mturk_images = list(mturk.hits_of_image.keys())
    print('Current number of mturk images: {}'.format(len(mturk_images)))

    with open('../data/metadata/nearest_neighbor_results.pickle', 'rb') as f:
        nn_results = pickle.load(f)
    print('Current nearest neighbor statistics: ')
    print_nn_stats(nn_results)
    print()

    # Load existing reviews
    with open('../data/metadata/nearest_neighbor_reviews_v2.json', 'r') as f:
        reviews = json.load(f)
    print('Current review statistics: ')
    print_nn_stats(reviews)
    print()

    # Load existing near duplicates
    with open('../data/metadata/near_duplicates.json', 'r') as f:
        near_duplicates = json.load(f)
    print('Number of candidates with duplicates: {}'.format(len(near_duplicates)))
    print()

    # Load blacklisted search keywords
    with open('../data/metadata/blacklisted_search_keywords.json', 'r') as f:
        blacklisted_search_keywords = json.load(f)
    print('Blacklisted search keywords: {}'.format(blacklisted_search_keywords))

    # Load blacklisted candidates
    with open('../data/metadata/candidate_blacklist.json', 'r') as f:
        candidate_blacklist = json.load(f)

    candidates_for_hit = list(set(cd_filenames))
    candidates_for_hit = remove_sufficiently_labeled_candidates(candidates_for_hit, cds, mturk)
    print('{} possible unhit candidates'.format(len(candidates_for_hit)))
    candidates_for_hit = remove_blacklist_candidates(candidates_for_hit, candidate_blacklist)
    deduplicated_cds = remove_near_duplicates(candidates_for_hit, near_duplicates, cd_filenames)
    final_cds = remove_blacklisted_search_keys(deduplicated_cds, blacklisted_search_keywords, cds)
    final_cds = filter_by_wnids(final_cds, args.num_wnids)
    print('Saving {} candidates for hit'.format(len(final_cds)))
    current_date = str(datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'))
    with open('../data/hit_candidates/candidates_for_hit_' + current_date + '.json', 'w') as f:
        json.dump(final_cds, f, indent=2)
def download_images(datasets, include_val):
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000
    for dataset in datasets.split(','):
        print(f'Downloading images for dataset {dataset} ...')
        dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (dataset + '.json')
        dataset_filepath = dataset_filepath.resolve()
        assert dataset_filepath.is_file()
        with open(dataset_filepath, 'r') as f:
            data = json.load(f)
        dataset_by_wnid = {x: [] for x in all_wnids}
        for img, wnid in data['image_filenames']:
            dataset_by_wnid[wnid].append(img)
        for cur_wnid in tqdm.tqdm(all_wnids):
            images_to_download = dataset_by_wnid[cur_wnid]
            loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)
    if include_val:
        print('Downloading all validation images ...')
        for cur_wnid in tqdm.tqdm(all_wnids):
            images_to_download = imgnet.val_imgs_by_wnid[cur_wnid]
            loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)
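# Example invocation (hypothetical dataset name; the dataset JSON must exist
# under ../data/datasets/):
#
#   download_images('mydataset_v1', include_val=True)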
import json
import pathlib

import click
import tqdm

import candidate_data
import image_loader
import imagenet


@click.command()  # assumed click entry point (click is imported for the CLI)
def download_images():
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    assert len(all_wnids) == 1000
    print('Downloading all candidate images ...')
    for cur_wnid in tqdm.tqdm(all_wnids):
        images_to_download = cds.candidates_by_wnid[cur_wnid]
        images_to_download = [x['id_ours'] for x in images_to_download]
        loader.load_image_bytes_batch(images_to_download, size='scaled_500', verbose=False)


if __name__ == "__main__":
    download_images()
def main(args):
    imgnt = imagenet.ImageNetData(verbose=False)
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json'
    ]
    mturk = mturk_data.MTurkData(live=True,
                                 load_assignments=True,
                                 source_filenames_to_ignore=filenames_to_ignore,
                                 verbose=False)
    with open(args.input_filename, 'rb') as fp:
        nn_results = pickle.load(fp)
    print('Current nearest neighbor statistics')
    print_nn_stats(nn_results)
    print()
    metric_to_cd, cd_to_dist_counter = select_test_candidates(
        imgnt, cds, mturk, nn_results, args)
    print('Remaining candidate distances to compute')
    for d, num_cds in cd_to_dist_counter.items():
        print('{} cds left for metric {}'.format(num_cds, d))
    print()
    print('Computing neighbors for')
    for d in distance_metrics:
        print('{} candidates in metric {}.'.format(len(metric_to_cd[d]), d))
    print()
    for metric in args.metrics:
        if metric in ('l2', 'fc7'):
            candidates = metric_to_cd[metric]
            result, _ = compute_distances_for_all_references(
                candidates, metric, imgnt, cds, mturk, args)
            if len(result) != len(candidates):
                print('WARNING: len(result) {} len(candidates) {}'.format(
                    len(result), len(candidates)))
            #assert len(result) == len(candidates)
            nn_results = save_ndc_result(result, nn_results, metric, args)
    for metric in args.metrics:
        if metric == 'dssim':
            print('Computing distances for dssim')
            candidates = metric_to_cd[metric]
            result = compute_distances_for_wnid_references(
                candidates, metric, imgnt, cds, mturk, args)
            if len(result) != len(candidates):
                print('WARNING: len(result) {} len(candidates) {}'.format(
                    len(result), len(candidates)))
            #assert len(result) == len(candidates)
            print('Saving results')
            start = timer()
            nn_results = save_ndc_result(result, nn_results, metric, args)
            end = timer()
            print('Saving the results took {} seconds'.format(end - start))
    num_candidates_left = {d: cd_to_dist_counter[d] for d in args.metrics}
    return num_candidates_left
def compute_nearest_neighbors(distance_measures, candidate_filenames, reference_filenames,
                              top_k, window_size, cache, cache_root):
    cache_key = compute_hash(distance_measures, candidate_filenames,
                             reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"
    timing_info = {}
    if cache:
        if utils.key_exists(BUCKET, full_key):
            load_start = timer()
            ret_value = pickle.loads(
                utils.get_s3_object_bytes_with_backoff(full_key)[0])
            load_end = timer()
            compute_start = compute_end = timer()
            timing_info['load_start'] = load_start
            timing_info['load_end'] = load_end
            timing_info['compute_start'] = compute_start
            timing_info['compute_end'] = compute_end
            timing_info['cached'] = True
            return ret_value, timing_info
    imgnt = imagenet.ImageNetData(cache_on_local_disk=True,
                                  verbose=False,
                                  cache_root_path='/tmp/imagenet2_cache')
    cds = candidate_data.CandidateData(cache_on_local_disk=True,
                                       load_metadata_from_s3=True,
                                       verbose=False,
                                       cache_root_path='/tmp/imagenet2_cache')
    loader = image_loader.ImageLoader(imgnt, cds,
                                      cache_on_local_disk=True,
                                      num_tries=4,
                                      cache_root_path='/tmp/imagenet2_cache')
    load_start = timer()
    if ('l2' in distance_measures) or ('dssim' in distance_measures):
        candidate_image_dict = loader.load_image_batch(candidate_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
        reference_image_dict = loader.load_image_batch(reference_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
    if 'fc7' in distance_measures:
        candidate_feature_dict = loader.load_features_batch(candidate_filenames, verbose=False)
        reference_feature_dict = loader.load_features_batch(reference_filenames, verbose=False)
    load_end = timer()
    compute_start = timer()
    result = {}
    for distance_measure in distance_measures:
        if distance_measure == 'l2':
            # 196608 = 256 * 256 * 3 flattened pixel values
            result['l2'] = compute_l2_distances(candidate_image_dict,
                                                reference_image_dict, 196608)
        elif distance_measure == 'dssim':
            result['dssim'] = compute_dssim_distances(candidate_image_dict,
                                                      reference_image_dict, window_size)
        elif distance_measure == 'fc7':
            # fc7 features are 4096-dimensional
            result['fc7'] = compute_l2_distances(candidate_feature_dict,
                                                 reference_feature_dict, 4096)
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()
    timing_info = {'load_start': load_start,
                   'load_end': load_end,
                   'compute_start': compute_start,
                   'compute_end': compute_end,
                   'cached': False}
    res = compute_top_k(result, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)
    return res, timing_info
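# For reference, a standalone numpy sketch of the pairwise-L2 / top-k pattern
# that compute_l2_distances and compute_top_k (project code) presumably
# implement. The function name and exact output format are assumptions.
def l2_top_k_sketch(candidates, references, top_k):
    """candidates/references: dicts mapping filename -> flat numpy array."""
    import numpy as np
    ref_ids = list(references.keys())
    ref_mat = np.stack([references[r].ravel() for r in ref_ids])
    nearest = {}
    for cid, arr in candidates.items():
        dists = np.linalg.norm(ref_mat - arr.ravel(), axis=1)  # L2 to every reference
        order = np.argsort(dists)[:top_k]                      # indices of the k closest
        nearest[cid] = [(ref_ids[i], float(dists[i])) for i in order]
    return nearest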
import pywren
from pywren import wrenconfig as wc

import candidate_data
import utils

pywren_config = wc.default()
pywren_config["runtime"]["s3_bucket"] = "imagenet2pywren"
pywren_config["runtime"]["s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2pywren.meta.json"
pwex = pywren.default_executor(config=pywren_config)
print("pywren config", pwex.config)

c_data = candidate_data.CandidateData()
all_cs = c_data.all_candidates
chunked_cs = list(utils.chunks(list(all_cs.keys()), 100))


def return_not_exists(lst):
    ret_lst = []
    for e in lst:
        key = "{0}/{1}.jpg".format("imagenet2candidates_scaled", e)
        exists = utils.key_exists(bucket="imagenet2datav2", key=key)
        print(exists, key)
        if not exists:
            ret_lst.append(e)
    return ret_lst


def return_not_exists_encrypted(lst):
        model_ft.fc = nn.Linear(2048, 2)
    elif (args.model == "resnet152"):
        model_ft = models.resnet152(pretrained=args.pretrained)
        # torchvision's resnet152 also has a 2048-dimensional final feature layer
        model_ft.fc = nn.Linear(2048, 2)
    else:
        raise Exception("Unsupported model")

    dataset_filename = args.dataset
    dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (dataset_filename + '.json')
    print('Reading dataset from {} ...'.format(dataset_filepath))
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)
    imgs = [x[0] for x in dataset['image_filenames']]

    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    pbar = tqdm(total=len(imgs), desc='New Dataset download')
    img_data = loader.load_image_bytes_batch(imgs,
                                             size='scaled_256',
                                             verbose=False,
                                             download_callback=lambda x: pbar.update(x))
    pbar.close()
    torch_dataset = ImageLoaderDataset(imgs, imgnet, cds, 'scaled_256',
                                       transform=transforms.ToTensor())

    control_dataset_filename = CONTROL_NAME
    control_dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (control_dataset_filename + '.json')
    print('Reading dataset from {} ...'.format(control_dataset_filepath))
    with open(control_dataset_filepath, 'r') as f:
        control_dataset = json.load(f)
    control_imgs = [x[0] for x in control_dataset['image_filenames']]
    control_torch_dataset = ImageLoaderDataset(control_imgs, imgnet, cds, 'scaled_256',
                                               transform=transforms.ToTensor())
    pbar = tqdm(total=len(control_imgs), desc='Control Dataset download')
    img_data = loader.load_image_bytes_batch(control_imgs,
                                             size='scaled_256',
                                             verbose=False,
                                             download_callback=lambda x: pbar.update(x))
    pbar.close()
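    # Standalone sketch of the head-replacement pattern above: torchvision
    # resnets expose their final classifier as `.fc`, so binary fine-tuning
    # swaps it for a 2-way linear layer. Using `model.fc.in_features` avoids
    # hard-coding the 2048 feature width (illustration only):
    #
    #   model = models.resnet50(pretrained=True)
    #   model.fc = nn.Linear(model.fc.in_features, 2)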
import copy
from datetime import datetime, timezone
import hashlib
import json
import pickle
import time

import candidate_data

cds = candidate_data.CandidateData(exclude_blacklisted_candidates=False, verbose=True)

all_cds = list(cds.all_candidates.values()) + cds.duplicates
for c in all_cds:
    assert c['search_engine'] == 'flickr'

cds_by_flickr_id = {}
for c in cds.all_candidates.values():
    cur_flickr_id = c['id_search_engine']
    if cur_flickr_id not in cds_by_flickr_id:
        cds_by_flickr_id[cur_flickr_id] = []
    cds_by_flickr_id[cur_flickr_id].append(c)

relevant_duplicates_by_flickr_id_wnid = {}
num_duplicates_skipped = 0
num_duplicates_proposed = 0
for c in cds.duplicates:
    cur_flickr_id = c['id_search_engine']
    cur_wnid = c['wnid']
def eval(dataset, models, batch_size):
    dataset_filename = dataset
    if models == 'all':
        models = all_models
    else:
        models = models.split(',')
        for model in models:
            assert model in all_models
    dataset_filepath = pathlib.Path(__file__).parent / '../data/datasets' / (dataset_filename + '.json')
    print('Reading dataset from {} ...'.format(dataset_filepath))
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)
    cur_imgs = [x[0] for x in dataset['image_filenames']]

    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    loader = image_loader.ImageLoader(imgnet, cds)
    pbar = tqdm(total=len(cur_imgs), desc='Dataset download')
    img_data = loader.load_image_bytes_batch(cur_imgs,
                                             size='scaled_500',
                                             verbose=False,
                                             download_callback=lambda x: pbar.update(x))
    pbar.close()

    for model in tqdm(models, desc='Model evaluations'):
        if model not in extra_models:
            tqdm.write('Evaluating {}'.format(model))
            resize_size = 256
            center_crop_size = 224
            if model == 'inception_v3':
                resize_size = 299
                center_crop_size = 299
            data_loader = eval_utils.get_data_loader(cur_imgs,
                                                     imgnet,
                                                     cds,
                                                     image_size='scaled_500',
                                                     resize_size=resize_size,
                                                     center_crop_size=center_crop_size,
                                                     batch_size=batch_size)
            pt_model = getattr(torchvision.models, model)(pretrained=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()
            tqdm.write('    Number of trainable parameters: {}'.format(
                sum(p.numel() for p in pt_model.parameters() if p.requires_grad)))
            predictions, top1_acc, top5_acc, total_time, num_images = eval_utils.evaluate_model(
                pt_model, data_loader, show_progress_bar=True)
            tqdm.write('    Evaluated {} images'.format(num_images))
            tqdm.write('    Top-1 accuracy: {:.2f}'.format(100.0 * top1_acc))
            tqdm.write('    Top-5 accuracy: {:.2f}'.format(100.0 * top5_acc))
            tqdm.write('    Total time: {:.1f} (average time per image: {:.2f} ms)'.format(
                total_time, 1000.0 * total_time / num_images))
        else:
            tqdm.write('Evaluating extra model {}'.format(model))
            if model in {"dpn68b", "dpn92", "dpn107"}:
                pt_model = pretrainedmodels.__dict__[model](num_classes=1000,
                                                            pretrained='imagenet+5k')
            else:
                pt_model = pretrainedmodels.__dict__[model](num_classes=1000,
                                                            pretrained='imagenet')
            tf_img = pretrained_utils.TransformImage(pt_model)
            load_img = pretrained_utils.LoadImage()
            tqdm.write('    Number of trainable parameters: {}'.format(
                sum(p.numel() for p in pt_model.parameters() if p.requires_grad)))
            dataset = eval_utils.ImageLoaderDataset(cur_imgs, imgnet, cds, 'scaled_500',
                                                    transform=tf_img)
            data_loader = torch.utils.data.DataLoader(dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      pin_memory=True)
            if torch.cuda.is_available():
                pt_model = pt_model.cuda()
            pt_model.eval()
            predictions, top1_acc, top5_acc, total_time, num_images = eval_utils.evaluate_model(
                pt_model, data_loader, show_progress_bar=True)
            tqdm.write('    Evaluated {} images'.format(num_images))
            tqdm.write('    Top-1 accuracy: {:.2f}'.format(100.0 * top1_acc))
            tqdm.write('    Top-5 accuracy: {:.2f}'.format(100.0 * top5_acc))
            tqdm.write('    Total time: {:.1f} (average time per image: {:.2f} ms)'.format(
                total_time, 1000.0 * total_time / num_images))
        # Save predictions, backing up any existing file first (identical in
        # both branches above, so hoisted out of the if/else).
        npy_out_filepath = pathlib.Path(__file__).parent / '../data/predictions' / dataset_filename / (model + '.npy')
        npy_out_filepath = npy_out_filepath.resolve()
        directory = os.path.dirname(npy_out_filepath)
        if not os.path.exists(directory):
            os.makedirs(directory)
        if os.path.exists(npy_out_filepath):
            old_preds = np.load(npy_out_filepath)
            np.save(f'{npy_out_filepath}.{int(time.time())}', old_preds)
            print('checking old preds is same as new preds')
            if not np.allclose(old_preds, predictions):
                print('old preds != new preds')
            else:
                print('old preds == new_preds!')
        np.save(npy_out_filepath, predictions)
        tqdm.write('    Saved predictions to {}'.format(npy_out_filepath))
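# For reference, a self-contained sketch of the backup-before-overwrite save
# pattern used above (hypothetical path argument; illustration, not project code):
def save_predictions_with_backup(path, preds):
    import os
    import time
    import numpy as np
    if os.path.exists(path):
        old = np.load(path)
        np.save(f'{path}.{int(time.time())}', old)  # timestamped backup copy
        if not np.allclose(old, preds):
            print('old preds != new preds')
    np.save(path, preds)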
def _generate_hits(candidates, images_per_hit=25, pos_control=0, neg_control=0, seed=0):
    '''Generates a list of dictionaries fully specifying the HITs.'''
    c_data = candidate_data.CandidateData()
    imagenet_data = imagenet.ImageNetData()
    with open("../data/metadata/wnid_to_most_similar_wnids.json") as f:
        neg_ids = json.loads(f.read())
    grouped_by_class = defaultdict(list)
    np.random.seed(seed)
    hits = []
    print("Num Candidates ", len(candidates))
    for c in candidates:
        c_json = c_data.all_candidates[c]
        c_wnid = c_data.all_candidates[c]["wnid"]
        grouped_by_class[c_wnid].append(c_json)
    wiki_fail = False
    for k, v in grouped_by_class.items():
        class_info = imagenet_data.class_info_by_wnid[k]
        if len(class_info.wikipedia_pages) == 0:
            print(f"no wikipedia page for {k}")
            wiki_fail = True
        hit_lines = list(utils.chunks(v, images_per_hit - pos_control - neg_control))
        # If the last chunk is short, pad it with candidates sampled from
        # earlier in this class list so every HIT has the same size.
        tail_len = len(hit_lines[-1])
        if (tail_len != len(hit_lines[0])
                and tail_len < (images_per_hit - pos_control - neg_control)):
            idxs = np.random.choice(len(v) - tail_len,
                                    images_per_hit - tail_len - pos_control - neg_control,
                                    replace=False)
            for i in idxs:
                hit_lines[-1].append(v[i])
        for hit_line in hit_lines:
            hit_data = {}
            hit_data["wnid"] = k
            # lists of image ids
            hit_data["images_to_label"] = []
            hit_data["images_pos_control"] = []
            hit_data["images_neg_control"] = []
            hit_data["images_all"] = []
            hit_data["user"] = getpass.getuser()
            hit_data["uuid"] = str(uuid.uuid4())
            hit_data["time"] = str(datetime.now(tzlocal()))
            hit_data["submitted"] = False
            hit_data["hit_id"] = ''
            hit_data["hit_type_id"] = ''
            val_imgs_dict = imagenet_data.val_imgs_by_wnid
            wnid = k
            pos_class = val_imgs_dict[wnid]
            # Short HIT lines are topped up with extra validation images,
            # split roughly evenly between positive and negative controls.
            pos_extra = int(np.ceil(
                (images_per_hit - pos_control - neg_control - len(hit_line)) / 2))
            neg_extra = int(np.floor(
                (images_per_hit - pos_control - neg_control - len(hit_line)) / 2))
            if len(hit_line) == images_per_hit - pos_control - neg_control:
                assert pos_extra == 0
                assert neg_extra == 0
            idxs_pos = np.random.choice(len(pos_class), pos_control + pos_extra, replace=False)
            assert wnid in neg_ids
            # Use the second-most-similar wnid as the negative control class.
            neg_wnid = neg_ids[wnid][1]
            neg_class = val_imgs_dict[neg_wnid]
            idxs_neg = np.random.choice(len(neg_class), neg_control + neg_extra, replace=False)
            pos_control_list = [pos_class[i] for i in idxs_pos]
            neg_control_list = [neg_class[i] for i in idxs_neg]
            for image in hit_line:
                hit_data["images_to_label"].append(image['id_ours'])
                hit_data["images_all"].append(image['id_ours'])
            # right now this won't work
            for image in pos_control_list:
                hit_data["images_pos_control"].append(image)
                hit_data["images_all"].append(image)
            for image in neg_control_list:
                hit_data["images_neg_control"].append(image)
                hit_data["images_all"].append(image)
            np.random.shuffle(hit_data["images_all"])
            hits.append(hit_data)
    assert not wiki_fail
    return hits
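# Example usage (hypothetical candidate_ids variable and parameter values):
#
#   hits = _generate_hits(candidate_ids, images_per_hit=25,
#                         pos_control=2, neg_control=2, seed=0)
#   print('{} HITs generated'.format(len(hits)))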
def sample_above_threshold(*,
                           dataset_size,
                           selection_frequency_threshold,
                           min_num_annotations,
                           near_duplicate_review_targets,
                           seed,
                           starting_from=None,
                           wnid_thresholds=None):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)

    def is_cid_ok(cid, wnid):
        # Returns (ok, reason): ok is True iff the candidate can be sampled;
        # otherwise reason names the exclusion category.
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][wnid] < min_num_annotations:
            return False, 'few_assignments'
        if wnid_thresholds is not None and wnid in wnid_thresholds:
            cur_threshold = wnid_thresholds[wnid]
        else:
            cur_threshold = selection_frequency_threshold
        if mturk.image_fraction_selected[cid][wnid] < cur_threshold:
            return False, 'below_threshold'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        for metric in near_duplicate_data.metric_names:
            if cid not in ndc.review_threshold or metric not in ndc.review_threshold[cid]:
                sufficiently_reviewed = False
            elif ndc.review_threshold[cid][metric] <= near_duplicate_review_targets[metric]:
                sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    if wnid_thresholds is not None:
        for wnid in wnid_thresholds.keys():
            assert wnid in all_wnids
    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from, dataset_size,
                                                    all_wnids, cds)
    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    success = True
    carried_over_from_prev = {}
    for wnid in all_wnids:
        sampling_candidates[wnid] = []
        exclusions[wnid] = OrderedDict([('blacklisted', []),
                                        ('few_assignments', []),
                                        ('below_threshold', []),
                                        ('near_duplicate', []),
                                        ('unreviewed', [])])
        # Keep images from the previous dataset version that still pass all checks.
        carried_over_from_prev[wnid] = []
        if wnid in prev_dataset_by_wnid:
            for cid in prev_dataset_by_wnid[wnid]:
                if is_cid_ok(cid, wnid)[0]:
                    carried_over_from_prev[wnid].append(cid)
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                if cid not in carried_over_from_prev[wnid]:
                    sampling_candidates[wnid].append(cid)
            else:
                exclusions[wnid][cur_reason].append(cid)
        sampling_candidates[wnid] = list(sorted(sampling_candidates[wnid]))
        remaining_to_sample = num_per_class - len(carried_over_from_prev[wnid])
        if len(sampling_candidates[wnid]) < remaining_to_sample:
            # Not enough candidates above the threshold for this class.
            success = False
            tmp_images = ([(x, wnid) for x in carried_over_from_prev[wnid]]
                          + [(x, wnid) for x in sampling_candidates[wnid]])
            dataset_images.extend(tmp_images)
        else:
            new_images = rng.sample(sampling_candidates[wnid], remaining_to_sample)
            tmp_images = ([(x, wnid) for x in carried_over_from_prev[wnid]]
                          + [(x, wnid) for x in new_images])
            dataset_images.extend(tmp_images)
    rng.shuffle(dataset_images)
    if success:
        assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_above_threshold'
    result['target_size'] = dataset_size
    result['selection_frequency_threshold'] = selection_frequency_threshold
    result['min_num_annotations'] = min_num_annotations
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']
    if wnid_thresholds is not None:
        result['wnid_thresholds'] = wnid_thresholds
    return success, result, sampling_candidates, exclusions, carried_over_from_prev
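# Example invocation (hypothetical threshold and target values):
#
#   success, result, cands, excl, carried = sample_above_threshold(
#       dataset_size=10000,
#       selection_frequency_threshold=0.7,
#       min_num_annotations=10,
#       near_duplicate_review_targets={m: 1 for m in near_duplicate_data.metric_names},
#       seed=0)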
def sample_wnid_histogram(*,
                          dataset_size,
                          histogram_bins,
                          min_num_annotations_candidates,
                          min_num_annotations_val,
                          min_num_val_images_per_wnid,
                          near_duplicate_review_targets,
                          seed,
                          starting_from=None,
                          allow_upward_sampling=False):
    num_classes = 1000
    assert dataset_size % num_classes == 0
    for metric in near_duplicate_data.metric_names:
        assert metric in near_duplicate_review_targets
    assert len(near_duplicate_review_targets) == len(near_duplicate_data.metric_names)
    num_per_class = dataset_size // num_classes
    num_bins = len(histogram_bins) + 1
    rng = random.Random(seed)
    imgnet = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(load_metadata_from_s3=False,
                                       exclude_blacklisted_candidates=False)
    mturk = mturk_data.MTurkData(
        live=True,
        load_assignments=True,
        source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore)
    ndc = near_duplicate_data.NearDuplicateData(imgnet=imgnet,
                                                candidates=cds,
                                                mturk_data=mturk,
                                                load_review_thresholds=True)
    all_wnids = sorted(imgnet.class_info_by_wnid.keys())
    success = True
    histograms_success, wnid_histograms, usable_val_imgs_by_wnid = compute_wnid_histograms(
        imgnet=imgnet,
        mturk=mturk,
        min_num_annotations_val=min_num_annotations_val,
        min_num_val_images_per_wnid=min_num_val_images_per_wnid,
        histogram_bins=histogram_bins,
        num_per_class=num_per_class)
    if not histograms_success:
        success = False
    prev_dataset_by_wnid = get_prev_dataset_by_wnid(starting_from, dataset_size,
                                                    all_wnids, cds)

    def is_cid_ok(cid, wnid):
        if cid in cds.blacklist:
            return False, 'blacklisted'
        if cid not in mturk.image_num_assignments:
            return False, 'few_assignments'
        if wnid not in mturk.image_num_assignments[cid]:
            return False, 'few_assignments'
        if mturk.image_num_assignments[cid][wnid] < min_num_annotations_candidates:
            return False, 'few_assignments'
        if ndc.is_near_duplicate[cid]:
            return False, 'near_duplicate'
        sufficiently_reviewed = True
        if cid not in ndc.review_threshold:
            sufficiently_reviewed = False
        else:
            for metric in near_duplicate_data.metric_names:
                if metric not in ndc.review_threshold[cid]:
                    sufficiently_reviewed = False
                elif ndc.review_threshold[cid][metric] <= near_duplicate_review_targets[metric]:
                    sufficiently_reviewed = False
        if not sufficiently_reviewed:
            return False, 'unreviewed'
        return True, None

    dataset_images = []
    sampling_candidates = {}
    exclusions = {}
    carried_over_from_prev = {}
    upward_sampled = {}
    for wnid in all_wnids:
        cur_target = wnid_histograms[wnid]
        exclusions[wnid] = {}
        for x in range(num_bins):
            exclusions[wnid][x] = OrderedDict([('blacklisted', []),
                                               ('few_assignments', []),
                                               ('below_threshold', []),
                                               ('near_duplicate', []),
                                               ('unreviewed', [])])
        carried_over_from_prev[wnid] = {x: [] for x in range(num_bins)}
        sampled_images_by_bin = {x: [] for x in range(num_bins)}

        # First, carry over as many images from the previous dataset version
        # as the per-bin targets allow.
        prev_by_bin = {x: [] for x in range(num_bins)}
        for cid in prev_dataset_by_wnid[wnid]:
            cur_freq = mturk.image_fraction_selected[cid][wnid]
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                prev_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)
        for cur_bin in range(num_bins):
            if len(prev_by_bin[cur_bin]) <= cur_target[cur_bin]:
                sampled_images_by_bin[cur_bin].extend(prev_by_bin[cur_bin])
                carried_over_from_prev[wnid][cur_bin].extend(prev_by_bin[cur_bin])
            else:
                cur_sample = rng.sample(prev_by_bin[cur_bin], cur_target[cur_bin])
                sampled_images_by_bin[cur_bin].extend(cur_sample)
                carried_over_from_prev[wnid][cur_bin].extend(cur_sample)

        # Then bucket the remaining candidates by selection frequency.
        sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        unmodified_sample_candidates_by_bin = {x: [] for x in range(num_bins)}
        for cand in cds.candidates_by_wnid[wnid]:
            cid = cand['id_ours']
            if cid in mturk.image_fraction_selected and wnid in mturk.image_fraction_selected[cid]:
                cur_freq = mturk.image_fraction_selected[cid][wnid]
            else:
                cur_freq = 0.0
            cur_bin = get_histogram_bin(cur_freq, histogram_bins)
            cur_ok, cur_reason = is_cid_ok(cid, wnid)
            if cur_ok:
                already_used = False
                for tmp_bin in range(num_bins):
                    if cid in carried_over_from_prev[wnid][tmp_bin]:
                        already_used = True
                if not already_used:
                    sample_candidates_by_bin[cur_bin].append(cid)
            else:
                exclusions[wnid][cur_bin][cur_reason].append(cid)
        for cur_bin in range(num_bins):
            sample_candidates_by_bin[cur_bin] = list(sorted(sample_candidates_by_bin[cur_bin]))
            unmodified_sample_candidates_by_bin[cur_bin] = copy.deepcopy(
                sample_candidates_by_bin[cur_bin])
            num_remaining_to_sample = cur_target[cur_bin] - len(sampled_images_by_bin[cur_bin])
            if num_remaining_to_sample > len(sample_candidates_by_bin[cur_bin]):
                if not allow_upward_sampling:
                    success = False
                cur_sample = sample_candidates_by_bin[cur_bin]
                sample_candidates_by_bin[cur_bin] = []
            else:
                cur_sample = rng.sample(sample_candidates_by_bin[cur_bin],
                                        num_remaining_to_sample)
                sample_candidates_by_bin[cur_bin] = list(
                    set(sample_candidates_by_bin[cur_bin]) - set(cur_sample))
            sampled_images_by_bin[cur_bin].extend(cur_sample)

        if allow_upward_sampling:
            # If a bin is underfull, fill it with images from the next
            # higher-frequency bin that still has candidates, preferring
            # images that were in the previous dataset version.
            upward_sampled[wnid] = []
            for cur_bin in range(num_bins):
                cur_upward_sampled = []
                num_remaining_to_sample = cur_target[cur_bin] - len(sampled_images_by_bin[cur_bin])
                if num_remaining_to_sample > 0:
                    assert len(sample_candidates_by_bin[cur_bin]) == 0
                    for _ in range(num_remaining_to_sample):
                        found_bin = False
                        for next_bin in range(cur_bin + 1, num_bins):
                            if len(sample_candidates_by_bin[next_bin]) > 0:
                                sample_candidates_from_prev = (
                                    set(sample_candidates_by_bin[next_bin])
                                    & set(prev_dataset_by_wnid[wnid]))
                                if len(sample_candidates_from_prev) > 0:
                                    cur_sample = [list(sample_candidates_from_prev)[0]]
                                    print(f'    upward sampled {cur_sample[0]} from the prev dataset')
                                else:
                                    cur_sample = rng.sample(sample_candidates_by_bin[next_bin], 1)
                                    print(f'    upward sampled {cur_sample[0]} randomly')
                                assert len(cur_sample) == 1
                                sampled_images_by_bin[cur_bin].extend(cur_sample)
                                sample_candidates_by_bin[next_bin] = list(
                                    set(sample_candidates_by_bin[next_bin]) - set(cur_sample))
                                cur_upward_sampled.append((cur_sample[0], next_bin))
                                found_bin = True
                                break
                        if not found_bin:
                            success = False
                upward_sampled[wnid].append(cur_upward_sampled)

        for cur_bin in range(num_bins):
            dataset_images.extend([x, wnid] for x in sampled_images_by_bin[cur_bin])
        sampling_candidates[wnid] = unmodified_sample_candidates_by_bin

    rng.shuffle(dataset_images)
    if len(dataset_images) > dataset_size:
        print(len(dataset_images), dataset_size)
    assert len(dataset_images) <= dataset_size
    if success:
        assert len(dataset_images) == dataset_size
    result = {}
    result['sampling_function'] = 'sample_wnid_histogram'
    result['target_size'] = dataset_size
    result['histogram_bins'] = histogram_bins
    result['min_num_annotations_candidates'] = min_num_annotations_candidates
    result['min_num_annotations_val'] = min_num_annotations_val
    result['min_num_val_images_per_wnid'] = min_num_val_images_per_wnid
    result['near_duplicate_review_targets'] = near_duplicate_review_targets
    result['time_string'] = get_time_string()
    result['username'] = getpass.getuser()
    result['seed'] = seed
    result['image_filenames'] = dataset_images
    result['is_valid'] = success
    result['allow_upward_sampling'] = allow_upward_sampling
    if starting_from is not None:
        result['starting_from'] = starting_from['output_filename']
    result_metadata = {}
    result_metadata['wnid_histograms'] = wnid_histograms
    result_metadata['usable_val_imgs_by_wnid'] = usable_val_imgs_by_wnid
    result_metadata['sampling_candidates'] = sampling_candidates
    result_metadata['exclusions'] = exclusions
    result_metadata['carried_over_from_prev'] = carried_over_from_prev
    result_metadata['upward_sampled'] = upward_sampled
    return success, result, result_metadata
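# get_histogram_bin is project code; given the sorted bin edges in
# histogram_bins it presumably maps a selection frequency to one of
# len(histogram_bins) + 1 bins (matching num_bins above). A plausible
# standalone version using bisect:
def get_histogram_bin_sketch(freq, histogram_bins):
    import bisect
    # e.g. edges [0.2, 0.4] -> bin 0: freq < 0.2, bin 1: 0.2 <= freq < 0.4,
    # bin 2: freq >= 0.4 (whether edges are inclusive is an assumption)
    return bisect.bisect_right(histogram_bins, freq)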