def fewer_bias(questions, test_concepts, ratio, logger):
    """Drop a random share of the questions that involve each test concept.

    For every concept in `test_concepts`, a `1 - ratio` fraction of the
    questions whose keywords mention that concept is removed; questions not
    involving any test concept are kept untouched.
    """
    logger('Biasing a set of questions')
    with logger.levelup():
        logger(f'Original size: {len(questions)}')
        # Map each test concept to the indexes of the questions mentioning it.
        involved_indexes = {}
        for concept in test_concepts:
            involved_indexes[concept] = [
                ind for ind, qst in questions.questions.items()
                if concept in qst['keywords']
            ]
        # Randomly sample a (1 - ratio) share to drop, per concept.
        dropped_per_concept = [
            random_choice_ratio(indexes, 1 - ratio)
            for indexes in involved_indexes.values()
        ]
        remove = union(*dropped_per_concept, as_set=True)
        full = union(*involved_indexes.values(), as_set=True)
        logger(f'Removed: {len(remove)} out of {len(full)}', resume=True)
        output = questions.copy()
        output.set_indexes(difference(output.indexes, remove))
        return output
def split_train_val(visual_dataset, logger, args):
    """Split using only the train & val parts of the visual dataset.

    The original train split is re-divided into new train (6/7) and val
    (1/7) portions, while the original val split is used as the test set.
    """
    logger('Splitting with train-val parts')

    def in_split(name):
        # Predicate factory: keeps only scenes belonging to split `name`.
        return lambda scene: scene['split'] == name

    train_dataset = visual_dataset.copy().filter(in_split('train'))
    test_dataset = visual_dataset.copy().filter(in_split('val'))
    val_dataset = train_dataset.copy()
    # 6/7 of the original train split goes to training ...
    train_dataset.set_indexes(
        random_choice_ratio(train_dataset.indexes, 6 / 7))
    # ... and the remaining 1/7 to validation.
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
def split_by_visual_bias(visual_dataset, logger, args, visual_bias):
    """Split the visual dataset as specified in `visual_bias`.

    Scenes are routed into train/test by `get_split_by_visual_bias`; the
    resulting train part is then re-divided into train (6/7) and val (1/7).
    """
    logger('Splitting by visual bias')
    logger(visual_bias, resume=True, pretty=True)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        return get_split_by_visual_bias(
            scene, visual_bias[args.task], isinstanceof_stats)

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    test_dataset = resplited['test']
    # Copy before shrinking train, so val starts from the full train part.
    val_dataset = train_dataset.copy()
    train_dataset.set_indexes(
        random_choice_ratio(train_dataset.indexes, 6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
def cub_split(visual_dataset, logger, args):
    """Split the CUB dataset per species by the ratios in `args.split_ratio`.

    Each bird species is split independently, so train / val / test each
    receive (approximately) the configured share of every species.

    NOTE(review): the original assigned the result of the fluent chain
    `visual_dataset.copy().set_indexes(...)`. Everywhere else in this file
    `set_indexes` is used as an in-place mutator whose return value is
    discarded, so relying on it returning the dataset is fragile (if it
    follows the stdlib in-place convention of returning None, all three
    datasets would be None). Copy and index assignment are now separate
    statements, which is correct under either convention.
    """
    logger('Special splitting function for CUB dataset\n'
           'Splitting by ratio specified in args')

    def get_species(scene):
        # Object '0' carries the species name — assumes one bird per scene;
        # TODO confirm against the CUB scene-graph format.
        return scene['objects']['0']['name']

    species_by_index = {
        image_id: get_species(scene)
        for image_id, scene in visual_dataset.sceneGraphs.items()
    }
    index_by_species = {
        species: [
            image_id for image_id, name in species_by_index.items()
            if name == species
        ]
        for species in set(species_by_index.values())
    }
    # Sample the train share independently within every species.
    train_indexes = union(
        *tuple(
            random_choice_ratio(indexes, args.split_ratio['train'])
            for indexes in index_by_species.values()),
        as_set=True,
    )
    # From the remainder of each species, sample the val share; the ratio is
    # rescaled because the train part has already been removed.
    val_indexes = union(
        *tuple(
            random_choice_ratio(
                difference(indexes, train_indexes),
                args.split_ratio['val'] / (1 - args.split_ratio['train']))
            for indexes in index_by_species.values()),
        as_set=True,
    )
    # Everything not in train or val becomes test.
    test_indexes = difference(visual_dataset.indexes,
                              union(train_indexes, val_indexes))

    # Copy + in-place set_indexes, matching the mutator-style usage in the
    # sibling splitting functions.
    train_dataset = visual_dataset.copy()
    train_dataset.set_indexes(train_indexes)
    val_dataset = visual_dataset.copy()
    val_dataset.set_indexes(val_indexes)
    test_dataset = visual_dataset.copy()
    test_dataset.set_indexes(test_indexes)
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
def get_testConcepts_zeroshot(source, args, logger):
    """Select held-out test concepts for the zero-shot setting.

    Each synonym set is intersected with `source`; from a random half of
    those groups, every group with more than one member contributes one
    randomly chosen concept.
    """
    synonym_stats = load_knowledge(args.task, 'synonym')
    # Only the synset values are needed; the exemplar keys were unused in
    # the original .items() iteration.
    syn_groups = [
        intersection(source, synset)
        for synset in synonym_stats.values()
    ]
    test_concepts = [
        random_one(synset)
        for synset in random_choice_ratio(syn_groups, 0.5)
        if len(synset) > 1
    ]
    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts
def split_by_visual_bias_leaked(visual_dataset, logger, args, visual_bias):
    """Split the visual dataset as specified in `visual_bias`, then leak a
    few unbiased (test-distribution) samples back into the train split.

    The number of leaked samples is controlled by `args.debiasing_leak`
    (passed to `random_choice` over the test indexes).
    """
    logger('Splitting by visual bias, with a few unbiased samples')
    logger(visual_bias, resume=True, pretty=True)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        # A dead, commented-out per-scene random leak was removed here;
        # leaking is performed in bulk after the re-split below.
        return get_split_by_visual_bias(scene, visual_bias[args.task],
                                        isinstanceof_stats)

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    # Copy before shrinking train, so val starts from the full train part.
    val_dataset = train_dataset.copy()
    test_dataset = resplited['test']
    train_dataset.set_indexes(
        random_choice_ratio(train_dataset.indexes, 6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))
    # Move a sample of test scenes into the train split ("leak" them).
    leaked_indexes = random_choice(test_dataset.indexes, args.debiasing_leak)
    train_dataset.add_indexes(leaked_indexes)
    test_dataset.remove_indexes(leaked_indexes)
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
def get_testConcepts(source, args, logger):
    """Randomly pick `args.generalization_ratio` of `source` as test concepts."""
    ratio = args.generalization_ratio
    test_concepts = random_choice_ratio(source, ratio)
    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts