Beispiel #1
0
def split_by_visual_bias(visual_dataset, logger, args, visual_bias):
    """
    Re-split the visual dataset according to `visual_bias`.

    Each scene is routed to a split by `get_split_by_visual_bias`, which
    receives the bias entry for `args.task` and the 'isinstanceof'
    knowledge of that task. The resulting 'train' split is then divided:
    its indexes are replaced by `random_choice_ratio(indexes, 6 / 7)` and
    the indexes that were not chosen become the 'val' split. The 'test'
    split is returned as produced by the resplit.

    Returns a dict holding the 'train', 'val' and 'test' datasets.
    """

    logger('Splitting by visual bias')
    logger(visual_bias, resume=True, pretty=True)

    category_stats = load_knowledge(args.task, 'isinstanceof')

    # the bias entry is looked up per scene, exactly when the dataset asks
    by_split = visual_dataset.resplit(
        lambda scene: get_split_by_visual_bias(
            scene, visual_bias[args.task], category_stats))

    train_part = by_split['train']
    # copy taken before train is narrowed, so it still holds every index
    val_part = train_part.copy()
    test_part = by_split['test']

    kept_for_train = random_choice_ratio(train_part.indexes, 6 / 7)
    train_part.set_indexes(kept_for_train)

    held_out = difference(val_part.indexes, train_part.indexes)
    val_part.set_indexes(held_out)

    visual_datasets = dict(train=train_part, val=val_part, test=test_part)
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
Beispiel #2
0
def register_visual_concepts(
    visual_dataset,
    concepts,
    args,
    register_synonyms,
    register_hypernyms,
    register_meronyms,
    forbidden_concepts,
    logger,
    experiment_name,
):
    """
    Populate `concepts` from the visual dataset and from task knowledge.

    Steps, in order:
      1. if `forbidden_concepts` is not None, filter them out of `concepts`;
      2. let `visual_dataset` register its concepts;
      3. unless `args.most_frequent` is -1, keep only the most frequent
         concepts according to the 'most_common' knowledge;
      4. optionally register synonyms, hypernyms and meronyms loaded via
         `load_knowledge(args.task, ...)`.

    `experiment_name` is accepted but not used by this function.

    Returns the same `concepts` object that was passed in.
    """

    # --- concepts coming from the visual / conceptual datasets ---
    if forbidden_concepts is not None:
        logger('Filtering out forbidden concepts:')
        logger(forbidden_concepts, resume=True, pretty=True)
        concepts.filter_out(forbidden_concepts)

    logger('Registering visual dataset')
    visual_dataset.register_concepts(concepts)

    if args.most_frequent != -1:
        frequency_table = load_knowledge(args.task, 'most_common', logger)
        concepts.filter_most_frequent(frequency_table, args.most_frequent)
        # NOTE: the visual dataset itself is not re-filtered here.

    # --- concepts coming from knowledge ---
    related_sources = (
        (register_synonyms, 'Registering synonyms', 'synonym'),
        (register_hypernyms, 'Registering hypernyms', 'hypernym'),
    )
    for enabled, message, relation in related_sources:
        if not enabled:
            continue
        logger(message)
        with logger.levelup():
            concepts.register_related(load_knowledge(args.task, relation))

    if register_meronyms:
        logger('Registering meronyms')
        with logger.levelup():
            meronym_stats = load_knowledge(args.task, 'meronym')
            concepts.register(
                all_secondary_elements(meronym_stats),
                multiple=True,
            )

    return concepts
Beispiel #3
0
def get_testConcepts_zeroshot(source, args, logger):
    """
    Pick test concepts for the zero-shot setting.

    Every synset of the task's 'synonym' knowledge is intersected with
    `source`. `random_choice_ratio(..., 0.5)` then selects among those
    groups, and from each selected group with more than one member a
    single concept is drawn with `random_one`.

    The selection and its size are logged; the list of chosen concepts is
    returned.
    """
    synonym_stats = load_knowledge(args.task, 'synonym')

    # only the synset values are used; the dictionary key is ignored
    syn_groups = []
    for _, synset in synonym_stats.items():
        syn_groups.append(intersection(source, synset))

    test_concepts = []
    for candidates in random_choice_ratio(syn_groups, 0.5):
        if len(candidates) <= 1:
            # a group of size 0 or 1 has no alternative name to hold out
            continue
        test_concepts.append(random_one(candidates))

    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts
Beispiel #4
0
def select_by_visual_bias(visual_dataset, visual_bias):
    """
    Group the image ids of `visual_dataset` by their visual-bias split.

    Each scene in `visual_dataset.sceneGraphs` is passed to
    `split_by_visual_bias(scene, visual_bias, iio_stats)`; the returned
    value is compared against 'train', 'val' and 'test'.

    Returns a dict mapping each of those three split names to the list of
    image ids assigned to it. Ids whose split is none of the three are
    dropped.

    NOTE(review): `args` is not a parameter of this function; it has to be
    resolvable from an enclosing or module scope - confirm.
    NOTE(review): `split_by_visual_bias` is called here with a per-scene
    signature (scene, bias, stats) - confirm which definition is in scope.
    """
    iio_stats = load_knowledge(args.task, 'isinstanceof')

    assigned = {}
    for image_id, scene in visual_dataset.sceneGraphs.items():
        assigned[image_id] = split_by_visual_bias(
            scene, visual_bias, iio_stats)

    resplit = {}
    for split in ('train', 'val', 'test'):
        members = []
        for image_id, this_split in assigned.items():
            if this_split == split:
                members.append(image_id)
        resplit[split] = members
    return resplit
Beispiel #5
0
def split_by_visual_bias_leaked(visual_dataset, logger, args, visual_bias):
    """
    Re-split the visual dataset according to `visual_bias`, then leak a
    few test samples into the training split.

    The split itself works like the plain visual-bias split: scenes are
    routed by `get_split_by_visual_bias` (bias entry of `args.task`,
    'isinstanceof' knowledge), the 'train' indexes are replaced by
    `random_choice_ratio(indexes, 6 / 7)` and the rest form 'val'.
    Afterwards `random_choice(test.indexes, args.debiasing_leak)` selects
    indexes that are added to 'train' and removed from 'test'.

    Returns a dict holding the 'train', 'val' and 'test' datasets.
    """

    logger('Splitting by visual bias, with a few unbiased samples')
    logger(visual_bias, resume=True, pretty=True)

    category_stats = load_knowledge(args.task, 'isinstanceof')

    def assign_split(scene):
        # A per-scene random leak ('test' -> 'train') is disabled here;
        # leaking is done on indexes further below instead.
        return get_split_by_visual_bias(
            scene, visual_bias[args.task], category_stats)

    by_split = visual_dataset.resplit(assign_split)
    train_part = by_split['train']
    # copy taken before train is narrowed, so it still holds every index
    val_part = train_part.copy()
    test_part = by_split['test']

    kept_for_train = random_choice_ratio(train_part.indexes, 6 / 7)
    train_part.set_indexes(kept_for_train)

    held_out = difference(val_part.indexes, train_part.indexes)
    val_part.set_indexes(held_out)

    # val was fixed above, so leaked samples only ever reach train
    leaked = random_choice(test_part.indexes, args.debiasing_leak)
    train_part.add_indexes(leaked)
    test_part.remove_indexes(leaked)

    visual_datasets = dict(train=train_part, val=val_part, test=test_part)
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
Beispiel #6
0
def build_dataset(args):
    """
    Main function for building the question dataset.

    Pipeline, in order:
      1. create the output directory `<args.dataset_dir>/<args.name>` and a
         logger writing there;
      2. load the visual dataset;
      3. split it twice: first with the task-level split function, whose
         result is marked on the dataset, then with the experiment-level
         split function, whose result is the one used afterwards;
      4. register visual concepts and build the exist-checktable;
      5. build conceptual and visual questions;
      6. register the tokens of all questions;
      7. save everything to the output directory.

    Reads from `args`: dataset_dir, name, dataset_config, task, mode,
    experiment (and whatever the called helpers read).

    The function has no return statement; its outputs are the files
    written by `save` and the log.
    """

    # output directory and logger for this dataset build
    local_dir = os.path.join(args.dataset_dir, args.name)
    make_dir(local_dir)
    logger = Logger(local_dir, is_main=True)
    prepare.print_args(args, logger)
    # task-level settings vs. experiment-level settings
    dataset_config = args.dataset_config[args.task]
    config = lazy_import()[args.mode, args.task, args.experiment].config

    # loading visual dataset
    logger('Loading dataset')
    with logger.levelup():
        visual_dataset = prepare.load_visual_dataset(
            args, logger, dataset_config['scene_process'])

    logger('Splittinng dataset')
    with logger.levelup():
        # task-specific visual split; only used to mark splits on the dataset
        visual_splits = dataset_config['visual_split_fn'](visual_dataset,
                                                          logger, args)
        visual_dataset.mark_splits(get_split_indexes(visual_splits))
        # experiment-specific visual split; overrides `visual_splits` and is
        # the one used for question building and saving below
        visual_splits = config['visual_split_fn'](visual_dataset, logger, args,
                                                  **config['split_kwarg'])
        split_indexes = get_split_indexes(visual_splits)

    # Registering concepts, building exist-checktable
    tools = register.init_word2index(logger)
    logger('Registering visual concepts')
    with logger.levelup():
        # the return value is ignored; `tools.concepts` is read again below
        register.register_visual_concepts(
            visual_dataset, tools.concepts, args, config['register_synonyms'],
            config['register_hypernyms'], config['register_meronyms'],
            load_knowledge(args.task, 'forbidden'), logger, args.experiment)
        logger(f'Num of concepts: {len(tools.concepts)}')
    logger('Building exist-checktable')
    with logger.levelup():
        exist_checktable = misc.exist_checktable(tools.concepts, args, logger)

    # building conceptual and visual questions
    builders = register_builders(args, tools.concepts, config)

    logger('Building conceptual questions')
    with logger.levelup():
        conceptual_questions = build_all_conceptual_questions(
            args, builders, tools.concepts,
            config['conceptual_question_types'], logger)

    logger('Building visual questions')
    with logger.levelup():
        visual_questions = build_all_visual_questions(
            args, config, builders, tools.concepts, visual_splits,
            config['visual_question_types'], exist_checktable, logger)

    # registering question tokens
    # conceptual questions are nested one level (values are question lists),
    # visual questions two levels (values are dicts of question lists)
    iter_conceptual = list(q for questions in conceptual_questions.values()
                           for q in questions)
    iter_visual = list(q for one_split in visual_questions.values()
                       for questions in one_split.values() for q in questions)
    register.register_question_token(iter_conceptual, tools, logger)
    register.register_question_token(iter_visual, tools, logger)

    # save
    logger('Saving')
    with logger.levelup():
        save(local_dir, logger, conceptual_questions, visual_questions,
             visual_dataset.sceneGraphs, tools, split_indexes)

    # NOTE(review): presumably IPython's `embed`, which would drop into an
    # interactive shell once the build is done - confirm this is intended
    # outside of debugging sessions.
    embed()
Beispiel #7
0
 def task_knowledge(name):
     """Load the knowledge entry `name` for the task given by `args.task`.

     `args` is not a parameter; it is taken from the surrounding scope.
     """
     current_task = args.task
     return load_knowledge(current_task, name)
Beispiel #8
0
def _expand_isinstanceof(isinstanceof_stats, names):
    """
    Merge `names` into the one isinstanceof category they belong to.

    The category of every name is looked up with `belongs_to`; names
    without a category (None) are ignored. If exactly one category
    remains, that entry of `isinstanceof_stats` is replaced by its union
    with `names` and True is returned. Otherwise (no category at all, or
    several different ones) nothing is changed and False is returned.
    """
    categories = {belongs_to(isinstanceof_stats, name) for name in names}
    categories.discard(None)
    if len(categories) != 1:
        return False
    (cat,) = categories
    isinstanceof_stats[cat] = union(isinstanceof_stats[cat], names)
    return True


def exist_checktable(all_concepts, args, logger):
    """
    Build a look-up table for determining the entailment and
    mutual-exclusion among concepts.

    Three knowledge sources of `args.task` are loaded and processed in
    this order; each one is skipped when it is None:

    * 'synonym': every member of a synonym group (the key plus its
      synset) that is a known concept gets the whole group added to its
      True sets in both tables.
    * 'hierarchy' (loaded with `from_source=True`): for each node and
      each name on the path from its root down to the node itself, the
      node is added to `cues[ancestor][True]` and the ancestor to
      `results[node][True]`, whenever the table owner is a known concept.
    * 'isinstanceof': two known concepts of the same category that are
      not already linked through the True sets of the first one are
      recorded in the False sets.

    While handling synonyms and hierarchies, `isinstanceof_stats` is
    extended in place with group members whose category is unambiguous.
    NOTE(review): if `load_knowledge` hands out a shared/cached object,
    this expansion is visible to later callers - confirm that is intended.

    Args:
        all_concepts: iterable of concept names; converted to a set.
        args: only `args.task` is read here.
        logger: callable used for progress messages.

    Returns:
        {'cues': cues, 'results': results}, where both values map every
        concept of `all_concepts` to {True: set, False: set}. The sets
        may contain names that are not in `all_concepts`.
    """
    synonym_stats = load_knowledge(args.task, 'synonym', logger)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof', logger)
    hierarchy = load_knowledge(
        args.task, 'hierarchy', logger, from_source=True)

    all_concepts = set(all_concepts)
    # every concept starts out related (True) to itself only
    cues = {
        concept: {True: {concept}, False: set()}
        for concept in all_concepts
    }
    results = copy.deepcopy(cues)

    # Dealing with synonyms first
    if synonym_stats is not None:
        logger('Dealing with synonyms first')
        if isinstanceof_stats is not None:
            logger('expand the isinstanceof stats', resume=True)

        ambiguous = 0
        for examplar, synset in synonym_stats.items():
            group = set(synset)
            group.add(examplar)
            # synonymy is symmetric: each known member is linked to the
            # whole group in both tables
            for member in group & all_concepts:
                cues[member][True].update(group)
                results[member][True].update(group)

            # expanding isinstanceof knowledge; groups with no category at
            # all are counted as ambiguous too
            if isinstanceof_stats is not None and \
                    not _expand_isinstanceof(isinstanceof_stats, group):
                ambiguous += 1
        logger(f'{ambiguous} out of {len(synonym_stats)} synsets '
               'are ambiguous', resume=True)

    # Dealing with hierarchy information then, by walking through the forest
    def trace_down(forest, ancestors=()):
        # `ancestors` is the path from the root to the parent of `forest`.
        # It is passed down as an immutable tuple rather than kept in a
        # shared set, so a name that re-appears below itself cannot remove
        # its ancestor's entry and trigger a KeyError while unwinding.
        for sub_root, sub_forest in forest.items():
            lineage = ancestors + (sub_root,)
            for hyper in lineage:
                if hyper in all_concepts:
                    cues[hyper][True].add(sub_root)
            if sub_root in all_concepts:
                results[sub_root][True].update(lineage)
            if sub_forest is not None:
                trace_down(sub_forest, lineage)

    if hierarchy is not None:
        logger('Dealing with hierarchy information')
        if isinstanceof_stats is not None:
            logger('expand the isinstanceof_stats', resume=True)

        ambiguous = 0
        for root, forest in hierarchy.items():
            tree = {root: forest}
            trace_down(tree)
            # expand the isinstanceof stats with all nodes of this tree
            if isinstanceof_stats is not None:
                all_nodes = all_in_hierarchy(tree)
                if not _expand_isinstanceof(isinstanceof_stats, all_nodes):
                    ambiguous += 1

        logger(f'{ambiguous} out of {len(hierarchy)} hierarchies '
               'are ambiguous', resume=True)

    # Dealing with the isinstanceof information.
    if isinstanceof_stats is not None:
        logger('Dealing with the isinstanceof information')
        for group in isinstanceof_stats.values():
            known = [name for name in group if name in all_concepts]
            for x in known:
                # only the False sets are written in this phase, so the
                # True sets of x can be read once per x
                linked_to_x = cues[x][True] | results[x][True]
                for y in known:
                    if y in linked_to_x:
                        continue
                    cues[x][False].add(y)
                    results[y][False].add(x)

    return {'cues': cues, 'results': results}