Esempio n. 1
0
def build_dictionary(args: Args, task: str) -> Dictionary:
    """Return the word dictionary for ``task``, building it if not cached.

    The dictionary is cached at ``<args.output_path>/<task>_dictionary.pkl``.
    When that file already exists it is loaded and returned as-is; otherwise
    a fresh dictionary is populated from the 'Train' and 'Test' splits (in
    that order), written to the cache path, and returned.

    Args:
        args: Run arguments; only ``args.output_path`` is read here.
        task: Task name, used for the cache file name, for loading the CSV
            splits, and to decide whether answer columns are tokenized.

    Returns:
        The loaded or newly built ``Dictionary``.
    """
    cache_file = os.path.join(args.output_path, f'{task}_dictionary.pkl')

    if os.path.exists(cache_file):
        print(f'{cache_file} exists, load cache')
        return Dictionary.load_from_file(cache_file)

    vocab = Dictionary()

    def add_split(split: str):
        """Tokenize every text column of one split into ``vocab``."""
        frame = load_csv_from_dataset(task, split)

        # Only the 'Test' split is tokenized with the GloVe vocabulary as
        # ``extra_dict``; 'Train' passes None.
        # NOTE(review): presumably this restricts which unseen test words
        # get added -- confirm against Dictionary.tokenize.
        extra = glove.stoi if split == 'Test' else None

        # Column order is kept (question first, then a1..a5) because the
        # order in which words are first seen may determine their indices.
        columns = ['question']
        if task in MULTIPLE_CHOICE_TASKS:
            columns.extend(('a1', 'a2', 'a3', 'a4', 'a5'))

        for column in columns:
            for text in frame[column]:
                vocab.tokenize(text, add_word=True, extra_dict=extra)

    for split_name in ('Train', 'Test'):
        add_split(split_name)

    vocab.dump_to_file(cache_file)
    return vocab
Esempio n. 2
0
if __name__ == "__main__":
    # Script entry point: configure logging and seeding, prepare the
    # module-level state (TASK, best_meters, optional youtube2text lookup
    # tables, TensorBoard writer), then run main().
    #
    # NOTE(review): `args` and `config` are not defined in this block; they
    # are presumably created at module level before this point -- confirm.
    # The names bound below are module globals and are presumably read by
    # main(), so they must keep their exact names.

    set_default_logger(args.experiment_path, debug=args.debug)
    # config = ConfigFactory.parse_file(args.config)

    fix_seed(config)

    pprint(config)

    TASK = config.get_string('task')

    best_meters = dict()

    if TASK == 'youtube2text':
        # Map the dictionary index of each question-type word back to the
        # word itself.
        youtube2text_dictionary = Dictionary.load_from_file(
            os.path.join(config.get_string('cache_path'),
                         'youtube2text_dictionary.pkl'))
        youtube2text_qtype_dict = dict()
        for qtype in ['what', 'who']:
            qtype_id = youtube2text_dictionary.word2idx[qtype]
            youtube2text_qtype_dict[qtype_id] = qtype

    if args.experiment_path is not None:
        writer = SummaryWriter(log_dir=args.experiment_path)
    else:
        # writer: SummaryWriter = FakeObj()
        raise Exception('No exp path for tensorboard')

    # Close the writer even when main() raises, so it is not left open on
    # the failure path.
    try:
        main()
    finally:
        writer.close()