def _get_task(name: str, args: config.Params, data_path: str, scratch_path: str) -> Task:
    """Get task object from disk if available. Else construct, prepare and save a new task object.

    Parameters
    ----------
    name : str
        task name to load.
    args : config.Params
        param handler object.
    data_path : str
        base data directory.
    scratch_path : str
        where to save Task objects.

    Returns
    -------
    Task
        loaded task object.

    """
    assert name in TASKS_REGISTRY, f"Task '{name:s}' not found!"
    task_cls, rel_path, task_kw = TASKS_REGISTRY[name]
    pkl_path = os.path.join(scratch_path, "tasks", f"{name:s}.{args.tokenizer:s}.pkl")
    # TODO: refactor to always read from disk, even if task is constructed
    # here. This should avoid subtle bugs from deserialization issues.
    if os.path.isfile(pkl_path) and not args.reload_tasks:
        task = pkl.load(open(pkl_path, "rb"))
        log.info("\tLoaded existing task %s", name)
    else:
        log.info("\tCreating task %s from scratch.", name)
        # These tasks take an additional kwarg.
        if name == "nli-prob" or name == "nli-alt":
            # TODO: remove special case, replace with something general
            # to pass custom loader args to task.
            task_kw["probe_path"] = args["nli-prob"].probe_path
        if name in ALL_SEQ2SEQ_TASKS:
            task_kw["max_targ_v_size"] = args.max_targ_word_v_size
        task_src_path = os.path.join(data_path, rel_path)
        task = task_cls(
            task_src_path,
            max_seq_len=args.max_seq_len,
            name=name,
            tokenizer_name=args.tokenizer,
            **task_kw,
        )
        log.info("testing: %s", str(args))
        # If the user requested computing the online code of an edge probing task,
        # pass the full args through so the loader can see the online-coding flags.
        if args.get("online_code_preshuffle_seed", False) and args.get(
            "online_code_data_split", False
        ):
            task.load_data(args)
            log.info("testing, flags detected; preprocess.py")
        else:
            task.load_data()
        utils.maybe_make_dir(os.path.dirname(pkl_path))
        pkl.dump(task, open(pkl_path, "wb"))

    return task
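
# Usage sketch (illustrative only, not executed here): how _get_task is typically
# driven per task name from get_tasks(). The task name "sst" and the directory
# arguments below are hypothetical placeholders, not values defined in this module.
#
#   task = _get_task(
#       name="sst",
#       args=args,                   # config.Params carrying tokenizer, max_seq_len, ...
#       data_path=args.data_dir,     # base directory holding the raw task data
#       scratch_path=args.exp_dir,   # cached pickle lands at <exp_dir>/tasks/sst.<tokenizer>.pkl
#   )
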
def check_arg_name(args: config.Params):
    """Check for obsolete params in the config, and raise if any are found.

    Parameters
    ----------
    args: config.Params
        config map

    Raises
    ------
    AssertionError
        If obsolete parameter names are present in config

    """
    # Mapping - key: old name, value: new name
    name_dict = {
        "task_patience": "lr_patience",
        "do_train": "do_pretrain",
        "train_for_eval": "do_target_task_training",
        "do_eval": "do_full_eval",
        "train_tasks": "pretrain_tasks",
        "eval_tasks": "target_tasks",
        "eval_data_fraction": "target_train_data_fraction",
        "eval_val_interval": "target_train_val_interval",
        "eval_max_vals": "target_train_max_vals",
    }
    for task in task_modules.ALL_GLUE_TASKS + task_modules.ALL_SUPERGLUE_TASKS:
        assert_for_log(
            not args.regex_contains("^{}_".format(task)),
            "Error: Attempting to load old task-specific args for task %s, please refer to the "
            "master branch's default configs for the most recent task specific argument "
            "structures." % task,
        )
    for old_name, new_name in name_dict.items():
        assert_for_log(
            old_name not in args,
            "Error: Attempting to load old arg name %s, please update to new name %s."
            % (old_name, new_name),
        )
    old_input_module_vals = [
        "elmo",
        "elmo_chars_only",
        "bert_model_name",
        "openai_transformer",
        "word_embs",
    ]
    for input_type in old_input_module_vals:
        assert_for_log(
            input_type not in args,
            "Error: Attempting to load old arg name %s, please use input_module config "
            "parameter and refer to master branch's default configs for current way to specify %s."
            % (input_type, input_type),
        )
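
# Usage sketch (illustrative only, not executed here): what check_arg_name guards
# against. The way the Params object is constructed below is a hypothetical
# stand-in for however the config is actually parsed in the entry point.
#
#   args = config.params_from_file("my_experiment.conf")  # config still uses old "do_train"
#   check_arg_name(args)  # -> AssertionError pointing at the new name "do_pretrain"
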
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map
    cuda_device : Any
        device specification forwarded to get_tasks().

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name", task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) <= 1, (
        f"Error: mixing tasks with different tokenizers! Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(task.name, split, "preproc")
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path, log_prefix=log_prefix
            )
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(
                    task, split, indexers, vocab, record_file, model_preprocessing_interface
                )

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?
log.info("\tFinished indexing tasks") # 6) Initialize tasks with data iterators. pretrain_tasks = [] target_tasks = [] for task in tasks: # Replace lists of instances with lazy generators from disk. task.set_instance_iterable( split_name="val", instance_iterable=_get_instance_generator(task.name, "val", preproc_dir), ) task.set_instance_iterable( split_name="test", instance_iterable=_get_instance_generator(task.name, "test", preproc_dir), ) # When using pretrain_data_fraction, we need modified iterators for use # only on training datasets at pretraining time. if task.name in pretrain_task_names: log.info("\tCreating trimmed pretraining-only version of " + task.name + " train.") task.set_instance_iterable( split_name="train", instance_iterable=_get_instance_generator( task.name, "train", preproc_dir, fraction=args.pretrain_data_fraction), phase="pretrain", ) pretrain_tasks.append(task) # When using target_train_data_fraction, we need modified iterators # only for training datasets at do_target_task_training time. if task.name in target_task_names: log.info("\tCreating trimmed target-only version of " + task.name + " train.") task.set_instance_iterable( split_name="train", instance_iterable=_get_instance_generator( task.name, "train", preproc_dir, fraction=args.target_train_data_fraction), phase="target_train", ) target_tasks.append(task) log.info("\t Training on %s", ", ".join(pretrain_task_names)) log.info("\t Evaluating on %s", ", ".join(target_task_names)) return pretrain_tasks, target_tasks, vocab, word_embs
def initial_setup(args: config.Params, cl_args: argparse.Namespace) -> (config.Params, int):
    """Perform setup steps:

    1. create project, exp, and run dirs if they don't already exist
    2. create log formatter
    3. configure GCP remote logging
    4. set up email notifier
    5. log git info
    6. write the config out to file
    7. log diff between default and experiment's configs
    8. choose torch's and random's random seed
    9. if config specifies a single GPU, then set the GPU's random seed (doesn't cover multi-GPU)
    10. resolve "auto" settings for tokenizer and pool_type parameters

    Parameters
    ----------
    args : config.Params
        config map
    cl_args : argparse.Namespace
        mapping named arguments to parsed values

    Returns
    -------
    args : config.Params
        config map
    seed : int
        random's and pytorch's random seed

    """
    output = io.StringIO()
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)  # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)  # e.g. <project_dir>/jiant-demo/sst
    log_fh = log.FileHandler(args.local_log_path)
    log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p")
    log_fh.setFormatter(log_fmt)
    log.getLogger().addHandler(log_fh)

    if cl_args.remote_log:
        from jiant.utils import gcp

        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from jiant.utils import emails

        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _log_git_info()
    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)

    print_args = select_relevant_print_args(args)
    log.info("Parsed args: \n%s", print_args)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if isinstance(args.cuda, int) and args.cuda >= 0:
        # If only running on one GPU.
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning(
                "GPU access failed. You might be using a CPU-only installation of PyTorch. "
                "Falling back to CPU."
            )
            args.cuda = -1

    if args.tokenizer == "auto":
        args.tokenizer = tokenizers.select_tokenizer(args)
    if args.pool_type == "auto":
        args.pool_type = select_pool_type(args)

    return args, seed
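
# Usage sketch (illustrative only, not executed here): the typical ordering in a
# main() entry point. handle_arguments and config.params_from_file are hypothetical
# helper names for command-line parsing and config loading; only initial_setup and
# build_tasks above are defined in this module, and passing args.cuda directly as
# the cuda_device is a simplification.
#
#   cl_args = handle_arguments(sys.argv[1:])
#   args = config.params_from_file(cl_args.config_file, cl_args.overrides)
#   args, seed = initial_setup(args, cl_args)
#   pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args, args.cuda)
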