Example 1
def test_reset_tqdm_logger_handlers(self):
    serialization_dir_a = os.path.join(self.TEST_DIR, "test_a")
    os.makedirs(serialization_dir_a, exist_ok=True)
    prepare_global_logging(serialization_dir_a)
    serialization_dir_b = os.path.join(self.TEST_DIR, "test_b")
    os.makedirs(serialization_dir_b, exist_ok=True)
    prepare_global_logging(serialization_dir_b)
    # Use range(1) so that only 2 lines (0% and 100%) should be written to the file
    for _ in Tqdm.tqdm(range(1)):
        pass
    with open(os.path.join(serialization_dir_a, "out.log"), "r") as f:
        assert len(f.readlines()) == 0
    with open(os.path.join(serialization_dir_b, "out.log"), "r") as f:
        assert len(f.readlines()) == 2
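
The behaviour the test checks is that a second call to `prepare_global_logging` resets the handlers installed by the first call, so tqdm progress output lands only in the most recently prepared directory. A minimal standalone sketch of the same behaviour (import paths as in recent AllenNLP versions; directory names are made up):

import os

from allennlp.common.logging import prepare_global_logging
from allennlp.common.tqdm import Tqdm

os.makedirs("run_a", exist_ok=True)
os.makedirs("run_b", exist_ok=True)
prepare_global_logging("run_a")  # handlers now write to run_a/out.log
prepare_global_logging("run_b")  # handlers are reset to write to run_b/out.log
for _ in Tqdm.tqdm(range(1)):
    pass  # the 0% and 100% progress lines end up in run_b/out.log only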
Example 2
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: Union[str, PathLike],
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the `Model` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `Union[str, PathLike]`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    node_rank : `int`, optional
        Rank of the node.
    master_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the master node for distributed training.
    master_port : `int`, optional (default=`29500`)
        Port of the master node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training.
    distributed_device_ids : `List[int]`, optional
        IDs of the devices involved in distributed training.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in distributed training or in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    common_logging.prepare_global_logging(
        serialization_dir,
        rank=process_rank,
        world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used when initializing
        # the process group via `init_process_group`.
        global_rank = node_rank * num_procs_per_node + process_rank
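        # e.g. with 4 processes per node, worker 2 on node 1 gets global rank 1 * 4 + 2 = 6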

        # The number of processes per node is needed to determine whether a process
        # is the master of its local node (the node on which it is running).
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Until this point, "cuda_device" might not have been set in the trainer params,
        # but a worker trainer only needs to know about its own GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

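        # NCCL is used as the backend for GPU workers; CPU-only workers fall back to Gloo.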
        if gpu_id >= 0:
            torch.cuda.set_device(int(gpu_id))
            dist.init_process_group(
                backend="nccl",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        else:
            dist.init_process_group(
                backend="gloo",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
    )

    if dry_run:
        return None

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None
Example 3
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
    and training parameters also specified in that object, and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `Union[str, PathLike]`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training.
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    include_in_archive = params.pop("include_in_archive", None)
    verify_include_in_archive(include_in_archive)

    distributed_params = params.params.pop("distributed", None)
    # If there is no "distributed" section in the config, we just run a single
    # training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )

        if not dry_run:
            archive_model(serialization_dir, include_in_archive=include_in_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        common_logging.prepare_global_logging(
            serialization_dir,
            rank=0,
            world_size=1,
        )

        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        if master_addr in ("127.0.0.1", "0.0.0.0", "localhost"):
            # If running locally, we can automatically find an open port if one is not specified.
            master_port = (
                distributed_params.pop("master_port", None) or common_util.find_open_port()
            )
        else:
            # Otherwise we require that the port be specified.
            master_port = distributed_params.pop("master_port")

        num_procs = len(device_ids)
        world_size = num_nodes * num_procs
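        # e.g. 2 nodes with 4 configured devices each gives num_procs = 4 and world_size = 8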

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run
            )
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
                include_in_archive,
            ),
            nprocs=num_procs,
        )
        if dry_run:
            return None
        else:
            archive_model(serialization_dir, include_in_archive=include_in_archive)
            model = Model.load(params, serialization_dir)
            return model
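
For reference, a hypothetical invocation sketch (the reader, model, and paths below are made up) showing the shape of the "distributed" block that `train_model` pops apart before spawning workers:

from allennlp.common.params import Params

params = Params({
    "dataset_reader": {"type": "my_reader"},  # hypothetical component name
    "train_data_path": "data/train.jsonl",    # hypothetical path
    "model": {"type": "my_model"},            # hypothetical component name
    "data_loader": {"batch_size": 8},
    "trainer": {"num_epochs": 5, "optimizer": {"type": "adam"}},
    "distributed": {
        "cuda_devices": [0, 1],  # with a single node, more than one device is required
        "num_nodes": 1,          # optional, defaults to 1
        # "master_address" and "master_port" default to 127.0.0.1 and a free local port
    },
})
best_model = train_model(params, serialization_dir="runs/demo")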
Example 4
def main(args: argparse.Namespace):

    for package_name in args.include_package:
        import_module_and_submodules(package_name)

    params = Params.from_file(args.param_path, args.overrides)

    random_seed, numpy_seed, pytorch_seed = 41, 11, 302
    if not args.fix:
        random_seed = random.randint(0, 999999999)
        numpy_seed = random.randint(0, 999999999)
        pytorch_seed = random.randint(0, 999999999)

    params["random_seed"] = random_seed
    params["numpy_seed"] = numpy_seed
    params["pytorch_seed"] = pytorch_seed
    prepare_environment(params)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, args.recover,
                             args.force)
    prepare_global_logging(serialization_dir, args.file_friendly_logging)

    hyperparams = list(
        get_hyperparams(params.as_dict(infer_type_and_cast=True)))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    test_file = params.params.get("test_data_path", None)
    validation_data_path = params.get("validation_data_path", None)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    test_command = None
    if evaluate_on_test:
        test_command = BaseEvaluationCommand.from_params(
            params.pop("test_command"))

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    train_model = TrainPipelineModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=0)

    trainer = train_model.trainer

    if trainer.validation_command is not None:
        trainer.validation_command.maybe_set_gold_file(validation_data_path)

    params.assert_empty('base train command')

    if args.comet is not None:
        experiment = Experiment(api_key=args.comet,
                                workspace=args.workspace,
                                project_name=args.project,
                                parse_args=False,
                                auto_output_logging=None)
        if args.tags:
            experiment.add_tags(args.tags)
        with open(args.param_path) as fil:
            code = "".join(fil.readlines())
        code += "\n\n#=============Full details=============\n\n"
        full_details = _jsonnet.evaluate_file(args.param_path)
        code += full_details
        code += "\n\n#=============IMPORTANT: overwritten options============\n\n"
        code += args.overrides
        experiment.set_code(code, overwrite=True)

        for key, val in hyperparams:
            experiment.log_parameter(key, val)

        experiment.log_parameter("model_directory", serialization_dir)
        experiment.log_parameter("cuda_device", cuda_device)
        experiment.log_parameter("hostname", socket.gethostname())
        experiment.log_parameter("random_seed", random_seed)
        experiment.log_parameter("numpy_seed", numpy_seed)
        experiment.log_parameter("pytorch_seed", pytorch_seed)
    else:
        experiment = None

    try:
        metrics = trainer.train(experiment)
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    # Evaluate
    if test_file and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights (see pred_test.txt)."
        )
        trainer.annotator.annotate_file(
            trainer.model, test_file,
            os.path.join(serialization_dir, "pred_test.txt"))

        if test_command:
            logger.info("Comparing against gold standard.")
            test_command.maybe_set_gold_file(test_file)
            test_metrics = test_command.evaluate(
                os.path.join(serialization_dir, "pred_test.txt"))
            if experiment:
                with experiment.test():
                    experiment.log_metrics({
                        k: v
                        for k, v in test_metrics.items() if np.isscalar(v)
                    })
            metrics = merge_dicts(metrics, "test", test_metrics)

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    if not args.no_archive:
        # Now tar up results
        archive_model(serialization_dir)