def _get_task(name: str, args: config.Params, data_path: str,
              scratch_path: str) -> Task:
    """Get task object from disk if available. Else construct, prepare and save a new task object.

    The cached pickle lives at ``<scratch_path>/tasks/<name>.<tokenizer>.pkl``.
    It is reused unless ``args.reload_tasks`` is set or the file is missing.

    Parameters
    ----------
    name : str
        task name to load; must be a key of TASKS_REGISTRY.
    args : config.Params
        param handler object.
    data_path : str
        base data directory; the task's registered relative path is joined
        onto it.
    scratch_path : str
        where to save Task objects.

    Returns
    -------
    Task
        loaded task object.

    Raises
    ------
    AssertionError
        If ``name`` is not registered in TASKS_REGISTRY.

    """
    assert name in TASKS_REGISTRY, f"Task '{name:s}' not found!"
    task_cls, rel_path, task_kw = TASKS_REGISTRY[name]
    pkl_path = os.path.join(scratch_path, "tasks",
                            f"{name:s}.{args.tokenizer:s}.pkl")
    # TODO: refactor to always read from disk, even if task is constructed
    # here. This should avoid subtle bugs from deserialization issues.
    if os.path.isfile(pkl_path) and not args.reload_tasks:
        # NOTE(review): unpickling executes arbitrary code; this assumes the
        # scratch directory only contains files written by this function.
        with open(pkl_path, "rb") as pkl_file:
            task = pkl.load(pkl_file)
        log.info("\tLoaded existing task %s", name)
    else:
        log.info("\tCreating task %s from scratch.", name)
        # Work on a copy so the extra kwargs added below do not leak into the
        # shared registry entry.
        task_kw = dict(task_kw)
        # These tasks take an additional kwarg.
        if name == "nli-prob" or name == "nli-alt":
            # TODO: remove special case, replace with something general
            # to pass custom loader args to task.
            task_kw["probe_path"] = args["nli-prob"].probe_path
        if name in ALL_SEQ2SEQ_TASKS:
            task_kw["max_targ_v_size"] = args.max_targ_word_v_size
        task_src_path = os.path.join(data_path, rel_path)
        task = task_cls(
            task_src_path,
            max_seq_len=args.max_seq_len,
            name=name,
            tokenizer_name=args.tokenizer,
            **task_kw,
        )
        log.info('testing: %s', str(args))
        # if the user requires to calculate the online code of an edge probing task
        # NOTE(review): both settings are tested for truthiness, so a seed of
        # 0 would take the plain load_data() path - confirm this is intended.
        if args.get("online_code_preshuffle_seed", False) and args.get(
                "online_code_data_split", False):
            task.load_data(args)
            log.info('testing, flags detected; preprocess.py')
        else:
            task.load_data()
        utils.maybe_make_dir(os.path.dirname(pkl_path))
        with open(pkl_path, "wb") as pkl_file:
            pkl.dump(task, pkl_file)

    return task
def check_arg_name(args: config.Params):
    """Check for obsolete params in config, throw exceptions if obsolete params are found.

    Three groups of names are rejected via ``assert_for_log``:

    1. any key matching ``^<task>_`` for a GLUE or SuperGLUE task name
       (old task-specific arguments),
    2. renamed arguments listed in the old-name -> new-name table below,
    3. old input-module switches that were replaced by ``input_module``.

    Parameters
    ----------
    args: config.Params
        config map

    Raises
    ------
    AssertionError
        If obsolete parameter names are present in config

    """
    # Mapping - key: old name, value: new name
    name_dict = {
        "task_patience": "lr_patience",
        "do_train": "do_pretrain",
        "train_for_eval": "do_target_task_training",
        "do_eval": "do_full_eval",
        "train_tasks": "pretrain_tasks",
        "eval_tasks": "target_tasks",
        "eval_data_fraction": "target_train_data_fraction",
        "eval_val_interval": "target_train_val_interval",
        "eval_max_vals": "target_train_max_vals",
    }
    for task in task_modules.ALL_GLUE_TASKS + task_modules.ALL_SUPERGLUE_TASKS:
        assert_for_log(
            not args.regex_contains("^{}_".format(task)),
            "Error: Attempting to load old task-specific args for task %s, please refer to the "
            "master branch's default configs for the most recent task specific argument "
            "structures." % task,
        )
    for old_name, new_name in name_dict.items():
        assert_for_log(
            old_name not in args,
            "Error: Attempting to load old arg name %s, please update to new name %s."
            % (old_name, new_name),
        )
    old_input_module_vals = [
        "elmo",
        "elmo_chars_only",
        "bert_model_name",
        "openai_transformer",
        "word_embs",
    ]
    for input_type in old_input_module_vals:
        assert_for_log(
            input_type not in args,
            "Error: Attempting to load old arg name %s, please use input_module config "
            "parameter and refer to master branch's default configs for current way to specify %s."
            % (input_type, input_type),
        )
# Example #3
# 0
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map. ``max_word_v_size`` and ``max_char_v_size`` are overwritten
        with the sizes of the loaded vocabulary.
    cuda_device : Any
        forwarded unchanged to ``get_tasks``.

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings; None unless ``args.input_module`` is "glove" or
        "fastText".

    Raises
    ------
    AssertionError
        If the tasks do not all share the same tokenizer.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(
        args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        # Fall back to the task's own name when no shared classifier is set.
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    # The f-prefix must sit on the literal that holds the placeholder,
    # otherwise the mapping is never interpolated into the message.
    assert not len(set(tokenizer_names.values())) > 1, (
        "Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names}")

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as emb_fh:
                word_embs = pkl.load(emb_fh)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                # Drop an existing symlink first so the new records are
                # written to a fresh file at this path.
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.set_instance_iterable(
            split_name="val",
            instance_iterable=_get_instance_generator(task.name, "val",
                                                      preproc_dir),
        )
        task.set_instance_iterable(
            split_name="test",
            instance_iterable=_get_instance_generator(task.name, "test",
                                                      preproc_dir),
        )
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.pretrain_data_fraction),
                phase="pretrain",
            )
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.target_train_data_fraction),
                phase="target_train",
            )
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
def initial_setup(args: config.Params, cl_args: argparse.Namespace) -> (config.Params, int):
    """Perform setup steps:

    1. create project, exp, and run dirs if they don't already exist
    2. create log formatter
    3. configure GCP remote logging
    4. set up email notifier
    5. log git info
    6. write the config out to file
    7. log diff between default and experiment's configs
    8. choose torch's and random's random seed
    9. if config specifies a single GPU, then set the GPU's random seed (doesn't cover multi-GPU)
    10. resolve "auto" settings for tokenizer and pool_type parameters

    Parameters
    ----------
    args : config.Params
        config map. May be modified in place: ``cuda`` is reset to -1 when
        GPU setup fails, and "auto" values of ``tokenizer`` / ``pool_type``
        are replaced by concrete choices.
    cl_args : argparse.Namespace
        mapping named arguments to parsed values

    Returns
    -------
    args : config.Params
        config map
    seed : int
        random's and pytorch's random seed

    """
    # Declared up front: the notifier may be (re)assigned below, and a
    # `global` statement applies to the whole function body anyway.
    global EMAIL_NOTIFIER

    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)  # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)  # e.g. <project_dir>/jiant-demo/sst
    log_fh = log.FileHandler(args.local_log_path)
    log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p")
    log_fh.setFormatter(log_fmt)
    log.getLogger().addHandler(log_fh)

    if cl_args.remote_log:
        # Imported lazily so the dependency is only needed when requested.
        from jiant.utils import gcp

        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from jiant.utils import emails

        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _log_git_info()
    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)

    print_args = select_relevant_print_args(args)
    log.info("Parsed args: \n%s", print_args)

    log.info("Saved config to %s", config_file)

    # A negative random_seed means "pick one at random".
    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if isinstance(args.cuda, int) and args.cuda >= 0:
        # If only running on one GPU.
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected" " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            # Deliberate best-effort: any GPU setup failure falls back to CPU.
            log.warning(
                "GPU access failed. You might be using a CPU-only installation of PyTorch. "
                "Falling back to CPU."
            )
            args.cuda = -1

    if args.tokenizer == "auto":
        args.tokenizer = tokenizers.select_tokenizer(args)
    if args.pool_type == "auto":
        args.pool_type = select_pool_type(args)

    return args, seed