def build_vocab_from_args(args: argparse.Namespace):
    """
    Construct a vocabulary from the experiment config in `args.param_path` and
    archive it as a gzipped tarball at `args.output_path`.

    Raises `ValueError` for a non-'.tar.gz' output path and `RuntimeError` when
    the target already exists and `--force` was not given.
    """
    if not args.output_path.endswith(".tar.gz"):
        raise ValueError("param 'output_path' should end with '.tar.gz'")
    if os.path.exists(args.output_path) and not args.force:
        raise RuntimeError(f"{args.output_path} already exists. Use --force to overwrite.")

    # Make sure the destination directory exists before we try to write there.
    out_dir = os.path.dirname(args.output_path)
    os.makedirs(out_dir, exist_ok=True)

    experiment_params = Params.from_file(args.param_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Serializes the vocab to 'scratch_dir/vocabulary'.
        make_vocab_from_params(experiment_params, scratch_dir)

        # The CacheFile context manager gives us a temporary file to write to.
        # On a successful exit from the context, it will rename the temp file to
        # the target `output_path`.
        with CacheFile(args.output_path, suffix=".tar.gz") as cache_file:
            logger.info("Archiving vocabulary to %s", args.output_path)
            with tarfile.open(cache_file.name, "w:gz") as tar:
                vocab_path = os.path.join(scratch_dir, "vocabulary")
                for entry in os.listdir(vocab_path):
                    # Skip stale lock files left behind by the vocab writer.
                    if not entry.endswith(".lock"):
                        tar.add(os.path.join(vocab_path, entry), arcname=entry)

    print(f"Success! Vocab saved to {args.output_path}")
    print('You can now set the "vocabulary" entry of your training config to:')
    print(json.dumps({"type": "from_files", "directory": os.path.abspath(args.output_path)}))
def test_raise_error_if_directory_non_empty(self):
    """`make_vocab_from_params` must refuse to overwrite a non-empty 'vocabulary' dir."""
    config = Params(
        {
            "dataset_reader": {"type": "train-util-test-reader"},
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
        }
    )
    vocab_dir = self.TEST_DIR / "vocabulary"
    os.makedirs(vocab_dir)
    # Drop an arbitrary file into the target directory so it is non-empty.
    with open(vocab_dir / "blah", "w") as fh:
        fh.write("BLAH!")
    with pytest.raises(ConfigurationError, match="The 'vocabulary' directory in the provided"):
        make_vocab_from_params(config, str(self.TEST_DIR))
def test_invalid_datasets_for_vocab_creation(self):
    """Listing a dataset with no configured data path raises ConfigurationError."""
    config = Params(
        {
            "dataset_reader": {"type": "train-util-test-reader"},
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
            # "test" has no matching `test_data_path`, so it is invalid here.
            "datasets_for_vocab_creation": ["train", "validation", "test"],
        }
    )
    with pytest.raises(ConfigurationError, match="invalid 'datasets_for_vocab_creation' test"):
        make_vocab_from_params(config, str(self.TEST_DIR))
def test_no_instances_read_for_vocab(self, caplog, params):
    """When the vocab requires no instances, no dataset should be read at all."""
    _ = make_vocab_from_params(params, str(self.TEST_DIR))
    captured = "\n".join(record.message for record in caplog.records)
    # None of the reader / data-loading messages may show up in the logs.
    for absent in (
        "...train-util-test-reader reading from",
        "Reading training data",
        "Reading validation data",
        "Reading test data",
    ):
        assert absent not in captured
# NOTE(review): "seperate" in the method name is a typo, but renaming would change
# the pytest test id, so it is kept as-is.
def test_using_seperate_validation_reader(self, caplog):
    """A distinct `validation_dataset_reader` should be detected and logged."""
    config = Params(
        {
            "dataset_reader": {"type": "train-util-test-reader"},
            "validation_dataset_reader": {"type": "train-util-test-reader"},
            "train_data_path": "path-to-training-file",
            "validation_data_path": "path-to-validation-file",
        }
    )
    _ = make_vocab_from_params(config, str(self.TEST_DIR))
    captured = "\n".join(record.message for record in caplog.records)
    assert "Using a separate dataset reader to load validation and test data" in captured
def test_only_train_read_for_vocab(self, caplog):
    """With only `train_data_path` configured, only the training data is read."""
    config = Params(
        {
            "dataset_reader": {"type": "train-util-test-reader"},
            "train_data_path": "path-to-training-file",
        }
    )
    _ = make_vocab_from_params(config, str(self.TEST_DIR))
    captured = "\n".join(record.message for record in caplog.records)
    # The training file is read...
    assert "...train-util-test-reader reading from path-to-training-file" in captured
    assert "Reading training data" in captured
    # ...but neither validation nor test data is touched.
    assert "...train-util-test-reader reading from path-to-validation-file" not in captured
    assert "...train-util-test-reader reading from path-to-test-file" not in captured
    assert "Reading validation data" not in captured
    assert "Reading test data" not in captured
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Fixed: the two adjacent f-strings previously concatenated without a
        # separator, rendering "...configuredMaster is at:". Add the " | "
        # delimiter used by the rest of the message.
        logging.info(
            f"Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using
    the data and training parameters also specified in that object, and saves the results in
    `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    # Global flag consumed elsewhere by tqdm-based logging helpers.
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    # Persist the config so the run can be recovered/archived later.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    # NOTE: `pop` mutates `params` — the "distributed" section must not leak
    # into the worker configs.
    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )
        if not dry_run:
            archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        # One worker process per configured device on this node.
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            # On recovery, reuse the vocabulary written by the original run.
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run
            )
        # Point the workers at the on-disk vocab, preserving special tokens.
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Each spawned worker re-reads its own copy of the params.
        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
            ),
            nprocs=num_procs,
        )
        if dry_run:
            return None
        else:
            archive_model(serialization_dir)
            model = Model.load(params, serialization_dir)
            return model
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    node_rank: int = 0,
    include_package: List[str] = None,
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            recover=recover,
            cache_directory=cache_directory,
            cache_prefix=cache_prefix,
            include_package=include_package,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Expose the rendezvous info to the worker processes via the
        # environment, as expected by torch.distributed.
        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        # Fixed: the two adjacent f-strings previously concatenated without a
        # separator, rendering "...configuredMaster is at:". Add the " | "
        # delimiter used by the rest of the message.
        logging.info(
            f"Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        model = Model.load(params, serialization_dir)
        return model