Example #1
def get_tasks(args: config.Params,
              cuda_device: Any) -> (List[Task], List[str], List[str]):
    """Get and save tasks:

    1. Set up task storage file paths
    2. Parse config for task names
    3. Load (or build and save) task objects
    4. Call counting methods on task objects
    5. Log example-count stats for tasks.

    Parameters
    ----------
    args : config.Params
        config map.
    cuda_device : Any
        CUDA device specification (not used directly in this function).

    Returns
    -------
    List[Task]
        list of all loaded Tasks.
    List[str]
        pretrain task names.
    List[str]
        target task names.

    """
    data_path = args.data_dir
    scratch_path = args.exp_dir

    pretrain_task_names = parse_task_list_arg(args.pretrain_tasks)
    target_task_names = parse_task_list_arg(args.target_tasks)
    # TODO: We don't want diagnostic tasks in train_task_names
    # but want to support glue/superglue task macros.
    pretrain_task_names = list(
        filter(lambda x: x not in ALL_DIAGNOSTICS, pretrain_task_names))

    task_names = sorted(set(pretrain_task_names + target_task_names))
    assert data_path is not None
    scratch_path = scratch_path or data_path
    log.info("Writing pre-preprocessed tasks to %s", scratch_path)

    tasks = []
    for name in task_names:
        task = _get_task(name,
                         args,
                         data_path=data_path,
                         scratch_path=scratch_path)
        tasks.append(task)

        # Count examples, store in example_counts.
        if task.example_counts is None:
            task.count_examples()
        log.info(
            "\tTask '%s': %s",
            task.name,
            " ".join(("|%s|=%d" % kv for kv in task.example_counts.items())),
        )

    log.info("\tFinished loading tasks: %s.",
             " ".join([task.name for task in tasks]))
    return tasks, pretrain_task_names, target_task_names
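A minimal calling sketch for get_tasks above, assuming the surrounding module is importable. The SimpleNamespace stand-in carries only the attributes that get_tasks reads directly; a real config.Params object (and the _get_task calls it triggers) needs many more fields, and the paths and task names here are placeholders.

from types import SimpleNamespace

# Placeholder config: only the attributes read directly by get_tasks are set.
args = SimpleNamespace(
    data_dir="/path/to/data",
    exp_dir="/path/to/experiment",
    pretrain_tasks="sst,mnli",
    target_tasks="sst",
)
tasks, pretrain_names, target_names = get_tasks(args, cuda_device=-1)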
Example #2
def get_tasks(args):
    """ Actually build or load (from pickles) the tasks. """
    data_path = args.data_dir
    scratch_path = args.exp_dir

    pretrain_task_names = parse_task_list_arg(args.pretrain_tasks)
    target_task_names = parse_task_list_arg(args.target_tasks)
    # TODO: We don't want diagnostic tasks in train_task_names
    # but want to support glue/superglue task macros.
    pretrain_task_names = list(
        filter(lambda x: x not in ALL_DIAGNOSTICS, pretrain_task_names))

    task_names = sorted(set(pretrain_task_names + target_task_names))
    assert data_path is not None
    scratch_path = scratch_path or data_path
    log.info("Writing pre-preprocessed tasks to %s", scratch_path)

    tasks = []
    for name in task_names:
        task = _get_task(name,
                         args,
                         data_path=data_path,
                         scratch_path=scratch_path)
        tasks.append(task)

        # Count examples, store in example_counts.
        if task.example_counts is None:
            task.count_examples()
        log.info(
            "\tTask '%s': %s",
            task.name,
            " ".join(("|%s|=%d" % kv for kv in task.example_counts.items())),
        )

    log.info("\tFinished loading tasks: %s.",
             " ".join([task.name for task in tasks]))
    return tasks, pretrain_task_names, target_task_names
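Both examples above rely on parse_task_list_arg, which is not shown here. The sketch below is a hypothetical minimal version, included only to illustrate the comma-separated task-list convention and the glue macro idea mentioned in the TODO; the macro list and function name are assumptions, not the library's actual implementation.

# Hypothetical task-list parser; GLUE_TASK_NAMES and the macro handling are
# illustrative assumptions.
GLUE_TASK_NAMES = ["cola", "sst", "mrpc", "qqp", "sts-b", "mnli", "qnli", "rte", "wnli"]

def parse_task_list_arg_sketch(task_list: str):
    """Split a comma-separated task string, expanding a 'glue' macro."""
    names = []
    for name in task_list.split(","):
        name = name.strip()
        if not name:
            continue
        if name == "glue":
            names.extend(GLUE_TASK_NAMES)  # macro expansion, for illustration only
        else:
            names.append(name)
    return names

# parse_task_list_arg_sketch("glue,ccg") -> the nine GLUE task names plus "ccg"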
Example #3
    def parameter_setup(self, args):
        # Set trainability of this module.
        for param in self.model.parameters():
            param.requires_grad = bool(args.transfer_paradigm == "finetune")

        self.num_layers = self.model.config.num_hidden_layers
        if args.pytorch_transformers_max_layer >= 0:
            self.max_layer = args.pytorch_transformers_max_layer
            assert self.max_layer <= self.num_layers
        else:
            self.max_layer = self.num_layers

        if args.transfer_paradigm == "frozen":
            if isinstance(
                self, (OpenAIGPTEmbedderModule, GPT2EmbedderModule, TransfoXLEmbedderModule)
            ):
                log.warning(
                    "NOTE: OpenAI GPT, GPT-2 and Transformer-XL add new tokens for "
                    "classification tasks; under the 'frozen' transfer_paradigm, "
                    "their embeddings will not be trained."
                )

        # Configure scalar mixing, ELMo-style.
        if self.output_mode == "mix":
            if args.transfer_paradigm == "frozen":
                log.warning(
                    "NOTE: pytorch_transformers_output_mode='mix', so scalar "
                    "mixing weights will be fine-tuned even if BERT "
                    "model is frozen."
                )
            # TODO: if doing multiple target tasks, allow for multiple sets of
            # scalars. See the ELMo implementation here:
            # https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L115
            assert len(parse_task_list_arg(args.target_tasks)) <= 1, (
                "pytorch_transformers_output_mode='mix' only supports a single set of "
                "scalars (but if you need this feature, see the TODO in "
                "the code!)"
            )
            # Always have one more mixing weight, for lexical layer.
            self.scalar_mix = scalar_mix.ScalarMix(self.max_layer + 1, do_layer_norm=False)
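The scalar mixing configured above follows the ELMo recipe: a softmax-normalized weighted sum over layer outputs, scaled by a learned gamma, with one extra weight for the lexical layer. The sketch below is a stripped-down illustration of that idea in plain PyTorch; it is not AllenNLP's scalar_mix.ScalarMix, which the code above actually uses.

import torch
import torch.nn as nn

class ScalarMixSketch(nn.Module):
    """Illustrative ELMo-style scalar mix over a fixed number of layers."""

    def __init__(self, num_layers: int):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_layers))  # one weight per layer
        self.gamma = nn.Parameter(torch.ones(1))  # global output scale

    def forward(self, layer_outputs):
        # layer_outputs: list of tensors, each of shape [batch, seq_len, hidden]
        coeffs = torch.softmax(self.weights, dim=0)
        mixed = sum(c * h for c, h in zip(coeffs, layer_outputs))
        return self.gamma * mixed

# e.g. mix = ScalarMixSketch(num_layers=max_layer + 1)  # +1 for the lexical layer
#      output = mix(list_of_layer_activations)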
Example #4
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return task data lazy-loaders in phase-specific lists, along with the vocab and word embeddings

    Parameters
    ----------
    args : config.Params
        config map.
    cuda_device : Any
        CUDA device specification, passed through to get_tasks.

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(
        args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) <= 1, (
        "Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names}")

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as emb_fh:
                word_embs = pkl.load(emb_fh)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.set_instance_iterable(
            split_name="val",
            instance_iterable=_get_instance_generator(task.name, "val",
                                                      preproc_dir),
        )
        task.set_instance_iterable(
            split_name="test",
            instance_iterable=_get_instance_generator(task.name, "test",
                                                      preproc_dir),
        )
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.pretrain_data_fraction),
                phase="pretrain",
            )
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.target_train_data_fraction),
                phase="target_train",
            )
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
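A short sketch of how the four return values of build_tasks above might be consumed by a driver. The cuda_device value and the follow-up logging are placeholders for illustration, assuming args is a fully populated config.Params object.

# Placeholder driver; cuda_device=-1 means CPU here.
pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args, cuda_device=-1)
log.info(
    "Prepared %d pretraining task(s) and %d target task(s); token vocab size %d",
    len(pretrain_tasks),
    len(target_tasks),
    vocab.get_vocab_size("tokens"),
)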
Example #5
def build_tasks(args):
    """Main logic for preparing tasks, doing so by
    1) creating / loading the tasks
    2) building / loading the vocabulary
    3) building / loading the word vectors
    4) indexing each task's data
    5) initializing lazy loaders (streaming iterators)
    """

    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) == 1, (
        "Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names}")

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as emb_fh:
                word_embs = pkl.load(emb_fh)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test",
                                                 preproc_dir)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.pretrain_data_fraction)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.target_train_data_fraction)
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
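The _get_instance_generator helper used above is not shown. The sketch below illustrates the lazy-loading idea it stands for: streaming serialized instances from the preproc directory one at a time instead of holding a whole split in memory. The file naming is an assumption, and the fraction-based trimming used above is omitted.

import os
import pickle as pkl

def instance_stream(task_name, split, preproc_dir):
    """Hypothetical lazy loader: yield pickled instances one at a time."""
    record_file = os.path.join(preproc_dir, "%s__%s_data" % (task_name, split))
    with open(record_file, "rb") as fh:
        while True:
            try:
                yield pkl.load(fh)  # one object per pkl.dump call in the record
            except EOFError:
                return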