Example #1
def build_vocab_from_args(args: argparse.Namespace):
    if not args.output_path.endswith(".tar.gz"):
        raise ValueError("param 'output_path' should end with '.tar.gz'")

    if os.path.exists(args.output_path) and not args.force:
        raise RuntimeError(f"{args.output_path} already exists. Use --force to overwrite.")

    output_directory = os.path.dirname(args.output_path)
    os.makedirs(output_directory, exist_ok=True)

    params = Params.from_file(args.param_path)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Serializes the vocab to 'tempdir/vocabulary'.
        make_vocab_from_params(params, temp_dir)

        # The CacheFile context manager gives us a temporary file to write to.
        # On a successful exit from the context, it will rename the temp file to
        # the target `output_path`.
        with CacheFile(args.output_path, suffix=".tar.gz") as temp_archive:
            logger.info("Archiving vocabulary to %s", args.output_path)

            with tarfile.open(temp_archive.name, "w:gz") as archive:
                vocab_dir = os.path.join(temp_dir, "vocabulary")
                for fname in os.listdir(vocab_dir):
                    if fname.endswith(".lock"):
                        continue
                    archive.add(os.path.join(vocab_dir, fname), arcname=fname)

    print(f"Success! Vocab saved to {args.output_path}")
    print('You can now set the "vocabulary" entry of your training config to:')
    print(json.dumps({"type": "from_files", "directory": os.path.abspath(args.output_path)}))
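
For orientation, a minimal usage sketch of the entry point above. The attribute names mirror exactly what build_vocab_from_args reads; the file paths and the idea of calling it directly with a hand-built Namespace are illustrative assumptions, not part of the original command-line wiring.

import argparse

# Hypothetical invocation: only the attributes read by build_vocab_from_args are populated.
args = argparse.Namespace(
    param_path="experiment.jsonnet",     # placeholder config path
    output_path="vocab/vocab.tar.gz",    # must end with ".tar.gz"
    force=False,                         # refuse to overwrite an existing archive
)
build_vocab_from_args(args)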
Example #2
def test_raise_error_if_directory_non_empty(self):
    params = Params({
        "dataset_reader": {
            "type": "train-util-test-reader"
        },
        "train_data_path": "path-to-training-file",
        "validation_data_path": "path-to-validation-file",
    })
    os.makedirs(self.TEST_DIR / "vocabulary")
    with open(self.TEST_DIR / "vocabulary" / "blah", "w") as random_file:
        random_file.write("BLAH!")
    with pytest.raises(ConfigurationError,
                       match="The 'vocabulary' directory in the provided"):
        make_vocab_from_params(params, str(self.TEST_DIR))
Example #3
def test_invalid_datasets_for_vocab_creation(self):
    params = Params({
        "dataset_reader": {
            "type": "train-util-test-reader"
        },
        "train_data_path": "path-to-training-file",
        "validation_data_path": "path-to-validation-file",
        "datasets_for_vocab_creation": ["train", "validation", "test"],
    })
    with pytest.raises(ConfigurationError,
                       match="invalid 'datasets_for_vocab_creation' test"):
        make_vocab_from_params(params, str(self.TEST_DIR))
Example #4
def test_no_instances_read_for_vocab(self, caplog, params):
    _ = make_vocab_from_params(params, str(self.TEST_DIR))
    log_messages = "\n".join([rec.message for rec in caplog.records])
    assert "...train-util-test-reader reading from" not in log_messages
    assert "Reading training data" not in log_messages
    assert "Reading validation data" not in log_messages
    assert "Reading test data" not in log_messages
Example #5
def test_using_seperate_validation_reader(self, caplog):
    params = Params({
        "dataset_reader": {
            "type": "train-util-test-reader"
        },
        "validation_dataset_reader": {
            "type": "train-util-test-reader"
        },
        "train_data_path": "path-to-training-file",
        "validation_data_path": "path-to-validation-file",
    })
    _ = make_vocab_from_params(params, str(self.TEST_DIR))
    log_messages = "\n".join([rec.message for rec in caplog.records])
    assert "Using a separate dataset reader to load validation and test data" in log_messages
Example #6
def test_only_train_read_for_vocab(self, caplog):
    params = Params({
        "dataset_reader": {
            "type": "train-util-test-reader"
        },
        "train_data_path": "path-to-training-file",
    })
    _ = make_vocab_from_params(params, str(self.TEST_DIR))
    log_messages = "\n".join([rec.message for rec in caplog.records])
    assert "...train-util-test-reader reading from path-to-training-file" in log_messages
    assert "...train-util-test-reader reading from path-to-validation-file" not in log_messages
    assert "...train-util-test-reader reading from path-to-test-file" not in log_messages
    assert "Reading training data" in log_messages
    assert "Reading validation data" not in log_messages
    assert "Reading test data" not in log_messages
Example #7
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover,
                                           force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
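
Which branch runs above is decided solely by a top-level "distributed" block in the config. Below is a minimal illustrative fragment using only the keys the code pops; the device IDs are placeholders, and the address and port shown are the defaults the code falls back to.

from allennlp.common import Params

# Illustrative config fragment that sends train_model down the mp.spawn branch:
# a list of more than one device makes `multi_device` true.
distributed_params = Params({
    "distributed": {
        "cuda_devices": [0, 1],         # required; placeholder device IDs
        "num_nodes": 1,                 # single machine
        "master_address": "127.0.0.1",  # optional, default shown
        "master_port": 29500,           # optional, default shown
    }
})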
Example #8
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
    and training parameters also specified in that object, and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover,
                                           force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )

        if not dry_run:
            archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(),
                serialization_dir,
                print_statistics=dry_run)
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
            ),
            nprocs=num_procs,
        )
        if dry_run:
            return None
        else:
            archive_model(serialization_dir)
            model = Model.load(params, serialization_dir)
            return model
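
A small usage sketch of the dry-run path in this version: per the docstring above, dry_run=True builds and serializes the vocabulary, prints dataset statistics, and returns None instead of a trained model. The config path and serialization directory are placeholders.

from allennlp.common import Params

params = Params.from_file("experiment.jsonnet")       # hypothetical config
result = train_model(
    params=params,
    serialization_dir="/tmp/experiment_dry_run",      # placeholder directory
    dry_run=True,
)
assert result is None                                 # nothing is trained or archived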
Example #9
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    node_rank: int = 0,
    include_package: List[str] = None,
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            recover=recover,
            cache_directory=cache_directory,
            cache_prefix=cache_prefix,
            include_package=include_package,
        )
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since the data iterators
        # in each worker will yield only `rank` specific instances. Hence it is safe to construct
        # the vocabulary and write it to disk before initializing the distributed context. The workers
        # will load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        model = Model.load(params, serialization_dir)
        return model
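
For completeness, a usage sketch of this older variant; the config path, cache directory, and package name are placeholders, while the keyword names match the signature above.

from allennlp.common import Params

params = Params.from_file("experiment.jsonnet")   # hypothetical config
best_model = train_model(
    params=params,
    serialization_dir="/tmp/experiment",          # placeholder directory
    cache_directory="/tmp/dataset_cache",         # optional preprocessing cache
    include_package=["my_project"],               # extra imports for distributed workers
)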