Example #1
def build_tasks(args):
    """Main logic for preparing tasks, doing so by
    1) creating / loading the tasks
    2) building / loading the vocabulary
    3) building / loading the word vectors
    4) indexing each task's data
    5) initializing lazy loaders (streaming iterators)
    """

    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) == 1, (
        "Error: mixing tasks with different tokenizers! "
        f"Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as fd:
                word_embs = pkl.load(fd)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        "Flag reload_indexing was set, but no tasks are set to reindex "
        "(use -o 'args.reindex_tasks = \"task1,task2,...\"')",
    )

    # Set up boundary_token_fn, which applies SOS/EOS/SEP/CLS delimiters
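    # For example, BERT-style inputs are wrapped as [CLS] A [SEP] B [SEP], while XLNet-style
    # inputs get the special tokens appended at the end (A <sep> B <sep> <cls>); every other
    # input module falls back to plain SOS/EOS wrapping.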
    if args.input_module.startswith("bert"):
        from jiant.pytorch_transformers_interface.modules import BertEmbedderModule

        boundary_token_fn = BertEmbedderModule.apply_boundary_tokens
    elif args.input_module.startswith("xlnet"):
        from jiant.pytorch_transformers_interface.modules import XLNetEmbedderModule

        boundary_token_fn = XLNetEmbedderModule.apply_boundary_tokens
    else:
        boundary_token_fn = utils.apply_standard_boundary_tokens

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             boundary_token_fn)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None

    log.info("\tFinished indexing tasks")

    # 5) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test",
                                                 preproc_dir)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.pretrain_data_fraction)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.target_train_data_fraction)
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
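
The lazy-loading idea from step 5 can be sketched in isolation: indexed records are written to disk once and then re-read through a generator, so nothing has to stay resident in memory. The serialize_records/stream_records names below are illustrative placeholders, not jiant helpers (jiant's _get_instance_generator wraps its own serialization format).

import pickle
from typing import Any, Iterable, Iterator


def serialize_records(records: Iterable[Any], path: str) -> None:
    # Write records one at a time so the full dataset never has to be materialized.
    with open(path, "wb") as fd:
        for record in records:
            pickle.dump(record, fd)


def stream_records(path: str) -> Iterator[Any]:
    # Lazily yield records back from disk; callers iterate instead of loading a list.
    with open(path, "rb") as fd:
        while True:
            try:
                yield pickle.load(fd)
            except EOFError:
                return


# Usage, mirroring `task.train_data = _get_instance_generator(...)` above:
serialize_records(({"idx": i} for i in range(3)), "/tmp/example_records.pkl")
for record in stream_records("/tmp/example_records.pkl"):
    print(record)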
Example #2
def build_tasks(
    args: config.Params, cuda_device: Any
) -> Tuple[List[Task], List[Task], Vocabulary, Union[np.ndarray, float]]:
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return the phase-specific lists of tasks (with lazy data loaders), the vocab, and the word embeddings

    Parameters
    ----------
    args : config.Params
        config map
    cuda_device : Any
        CUDA device(s), passed through to get_tasks()

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings if the input module is GloVe or fastText, otherwise None.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(
        args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert not len(set(tokenizer_names.values())) > 1, (
        "Error: mixing tasks with different tokenizers! "
        f"Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as fd:
                word_embs = pkl.load(fd)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
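    # The MPI bundles the model-specific boundary-token handling that the older build_tasks
    # selected by hand (BERT vs. XLNet vs. standard SOS/EOS) and is passed to _index_split
    # below in place of a bare boundary_token_fn.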
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        "Flag reload_indexing was set, but no tasks are set to reindex "
        "(use -o 'args.reindex_tasks = \"task1,task2,...\"')",
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.set_instance_iterable(
            split_name="val",
            instance_iterable=_get_instance_generator(task.name, "val",
                                                      preproc_dir),
        )
        task.set_instance_iterable(
            split_name="test",
            instance_iterable=_get_instance_generator(task.name, "test",
                                                      preproc_dir),
        )
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.pretrain_data_fraction),
                phase="pretrain",
            )
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.target_train_data_fraction),
                phase="target_train",
            )
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
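
The setter-based API used above can be illustrated with a minimal stand-in for a Task-like container. This sketch only mirrors the call shape seen in this example (split_name, instance_iterable, optional phase); it is not jiant's actual Task class.

from typing import Iterable, Optional


class TaskSketch:
    # Hypothetical container mirroring the set_instance_iterable(...) calls above.

    def __init__(self, name: str):
        self.name = name
        self._iterables = {}  # keyed by (split_name, phase)

    def set_instance_iterable(self, split_name: str, instance_iterable: Iterable,
                              phase: Optional[str] = None) -> None:
        # Store one iterable per split, optionally specialized per training phase.
        self._iterables[(split_name, phase)] = instance_iterable

    def get_instance_iterable(self, split_name: str, phase: Optional[str] = None) -> Iterable:
        # Fall back to the phase-agnostic iterable if no phase-specific one was set.
        return self._iterables.get((split_name, phase),
                                   self._iterables.get((split_name, None)))


# Usage mirroring the loop above:
task = TaskSketch("mnli")
task.set_instance_iterable(split_name="val", instance_iterable=iter(range(3)))
task.set_instance_iterable(split_name="train", instance_iterable=iter(range(10)), phase="pretrain")
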
def check_configurations(args, pretrain_tasks, target_tasks):
    """
    Checks configurations for any obvious logical flaws
    and that necessary parameters are set for each step -
    throws asserts and exits if found.

    Parameters
    ----------------
    args: Params object
    pretrain_tasks: list of pretraining Task objects
    target_tasks: list of target task training Task objects

    Returns
    ----------------
    None
    """
    steps_log = io.StringIO()
    if any(t.val_metric_decreases for t in pretrain_tasks) and any(
        not t.val_metric_decreases for t in pretrain_tasks
    ):
        log.warning(
            "\tMixing training tasks with increasing and decreasing val metrics!"
        )

    if args.load_target_train_checkpoint != "none":
        assert_for_log(
            not args.do_pretrain,
            "Error: Attempting to train a model and then replace that model with one from "
            "a checkpoint.",
        )
        steps_log.write("Loading model from path: %s \n" %
                        args.load_target_train_checkpoint)

    assert_for_log(
        args.transfer_paradigm in ["finetune", "frozen"],
        "Transfer paradigm %s not supported!" % args.transfer_paradigm,
    )

    if args.do_pretrain:
        assert_for_log(
            args.pretrain_tasks != "none",
            "Error: Must specify at least one pretraining task: [%s]" %
            args.pretrain_tasks,
        )
        steps_log.write("Training model on tasks: %s \n" % args.pretrain_tasks)

    if args.do_target_task_training:
        assert_for_log(
            args.target_tasks != "none",
            "Error: Must specify at least one target task: [%s]" %
            args.target_tasks,
        )
        steps_log.write("Re-training model for individual target tasks \n")
        assert_for_log(
            len(set(pretrain_tasks).intersection(target_tasks)) == 0
            or args.allow_reuse_of_pretraining_parameters
            or args.do_pretrain == 0,
            "If you're pretraining on a task you plan to reuse as a target task, set\n"
            "allow_reuse_of_pretraining_parameters = 1 (risky), or train in two steps:\n"
            "train with do_pretrain = 1, do_target_task_training = 0, stop, and restart with\n"
            "do_pretrain = 0 and do_target_task_training = 1.",
        )
    if args.do_full_eval:
        assert_for_log(
            args.target_tasks != "none",
            "Error: Must specify at least one target task: [%s]" %
            args.target_tasks,
        )
        if not args.do_target_task_training:
            untrained_tasks = set(
                config.get_task_attr(
                    args, task.name, "use_classifier", default=task.name)
                for task in target_tasks)
            if args.do_pretrain:
                untrained_tasks -= set(
                    config.get_task_attr(
                        args, task.name, "use_classifier", default=task.name)
                    for task in pretrain_tasks)
            if len(untrained_tasks) > 0:
                assert (
                    args.load_model
                    or args.load_target_train_checkpoint not in ["none", ""]
                    or args.allow_untrained_encoder_parameters
                ), (
                    f"Evaluating a target task model on tasks {untrained_tasks} "
                    "without training it on this run or loading a checkpoint. "
                    "Set `allow_untrained_encoder_parameters` if you really want to use "
                    "an untrained task model."
                )
                log.warning(
                    f"Evaluating a target task model on tasks {untrained_tasks} without training "
                    "it in this run. It's up to you to ensure that you are loading parameters "
                    "that were sufficiently trained for this task.")
        steps_log.write("Evaluating model on tasks: %s \n" % args.target_tasks)

    log.info("Will run the following steps for this experiment:\n%s",
             steps_log.getvalue())
    steps_log.close()
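
A minimal sketch of how these two functions might be wired together in a driver. The run_preprocessing wrapper is hypothetical; it assumes `args` has already been parsed into a config.Params object and only uses the signatures shown above.

def run_preprocessing(args, cuda_device):
    # Illustrative driver only; jiant's real entry point lives elsewhere.
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args, cuda_device)
    # Validate the experiment configuration before any training starts.
    check_configurations(args, pretrain_tasks, target_tasks)
    return pretrain_tasks, target_tasks, vocab, word_embs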
Example #4
def _get_task_attr(attr_name, default=None):
    return config.get_task_attr(args, task_name, attr_name, default)
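
This helper only makes sense as a closure: args and task_name come from the enclosing scope. A hypothetical enclosing function might look like the following; the surrounding names and attribute lookups are illustrative, not taken from jiant.

def configure_task(args, task_name):
    # The closure captures `args` and `task_name`, so per-task config overrides
    # can be fetched with a single argument at each call site.
    def _get_task_attr(attr_name, default=None):
        return config.get_task_attr(args, task_name, attr_name, default)

    # Illustrative lookups; the attribute names are placeholders.
    classifier_name = _get_task_attr("use_classifier", default=task_name)
    max_seq_len = _get_task_attr("max_seq_len", default=512)
    return classifier_name, max_seq_len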