예제 #1
0
    def finish(self, metrics: Dict[str, Any]) -> None:
        """
        Optionally evaluate on the test set, write the final metrics to disk and
        mirror them into the Weights & Biases run summary.

        # Parameters

        metrics : `Dict[str, Any]`
            Final metrics of the run. Mutated in place: when test evaluation runs,
            every test metric is added under a `"test_"`-prefixed key.
        """
        # Import wandb here to be sure that it was initialized
        # before this line was executed.
        import wandb  # noqa

        if self.evaluation_data_loader is not None and self.evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = training_util.evaluate(
                self.model,
                self.evaluation_data_loader,  # type:ignore
                cuda_device=self.trainer.cuda_device,  # type: ignore
                batch_weight_key=self.batch_weight_key,
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif self.evaluation_data_loader is not None:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        common_util.dump_metrics(
            os.path.join(self.serialization_dir, "metrics.json"),
            metrics,
            log=True,
        )

        # `wandb.run` is None when there is no active run. The metrics file has
        # already been written at this point, so skip the summary update instead
        # of failing with an AttributeError.
        run = wandb.run
        if run is None:
            logger.warning(
                "No active wandb run; skipping the wandb summary update.")
            return

        # Update the summary with all metrics.
        run.summary.update(metrics)
예제 #2
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Evaluate an archived model on the data in ``args.input_file``.

    The archive's configuration supplies the dataset reader and data loader
    (the ``validation_*`` entries are preferred when present). The resulting
    metrics are written through ``dump_metrics`` to ``args.output_file`` and
    returned.
    """
    # Quieten the most verbose loggers.
    for noisy_name in ("allennlp.common.params", "allennlp.nn.initializers"):
        logging.getLogger(noisy_name).disabled = True
    embedding_logger = logging.getLogger(
        "allennlp.modules.token_embedders.embedding")
    embedding_logger.setLevel(logging.INFO)

    # Restore the model from its archive and switch it to evaluation mode.
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Prefer the validation dataset reader; otherwise fall back to the reader
    # shared by training and validation.
    reader_params = config.pop("validation_dataset_reader", None)
    if reader_params is None:
        reader_params = config.pop("dataset_reader")
    dataset_reader = DatasetReader.from_params(reader_params)

    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    if args.embedding_sources_mapping:
        embedding_sources = json.loads(args.embedding_sources_mapping)
    else:
        embedding_sources = {}

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)

    # Same preference order for the data loader; a command-line batch size
    # overrides the configured one.
    loader_params = config.pop("validation_data_loader", None)
    if loader_params is None:
        loader_params = config.pop("data_loader")
    if args.batch_size:
        loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device,
                       args.batch_weight_key)
    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)
    return metrics
예제 #3
0
def main(cuda_device, testing=False, testing_vocab=False, experiments=None):
    """
    Train the model configured in ``training_config/coref.jsonnet``.

    Parameters
    ----------
    cuda_device :
        Value written to ``trainer.cuda_device`` in the loaded configuration.
    testing : bool, optional (default=False)
        Sets the active-learning ``epoch_interval`` to 0 and replaces the
        ``tokens`` token embedder with a 300-dimensional ``embedding``.
        Only consulted when ``experiments`` is not given.
    testing_vocab : bool, optional (default=False)
        Sets the active-learning ``epoch_interval`` to 0 only.
        Only consulted when ``experiments`` is not given.
    experiments : str, optional
        Directory for an experiment sweep. When given, the config is copied
        there, one training run is made for each label percentage (10, 5, 0),
        and each run's metrics are dumped to ``<experiments>/<percent>.json``.
    """
    # Local import: nothing else in this block's visible context needs shutil.
    import shutil

    if experiments:
        save_dir = experiments
        # Copy and delete through shutil instead of shelling out, so paths with
        # spaces or shell metacharacters are handled literally and a failed
        # copy raises instead of being silently ignored.
        shutil.copy('training_config/coref.jsonnet', save_dir)
        serialization_dir = os.path.join(save_dir, "temp_" + str(cuda_device))
        for x in [10, 5, 0]:
            print("Running with " + str(x) + "% of labels")
            # Start every run from a clean scratch directory; a missing
            # directory is fine (mirrors the forgiving `rm -rf`).
            shutil.rmtree(serialization_dir, ignore_errors=True)
            params = Params.from_file(os.path.join(save_dir, 'coref.jsonnet'))
            params.params['trainer']['cuda_device'] = cuda_device
            params.params['trainer']['active_learning']['use_percent'] = True
            params.params['trainer']['active_learning']['num_labels'] = round(
                0.01 * x, 2)
            best_model, metrics = train_model(params, serialization_dir)
            dump_metrics(os.path.join(save_dir,
                                      str(x) + ".json"),
                         metrics,
                         log=True)
    else:
        params = Params.from_file('training_config/coref.jsonnet')
        if testing or testing_vocab:
            params.params['trainer']['active_learning']['epoch_interval'] = 0
            if testing:
                params.params['model']['text_field_embedder'][
                    'token_embedders']['tokens'] = {
                        'type': 'embedding',
                        'embedding_dim': 300
                    }
        serialization_dir = tempfile.mkdtemp()
        params.params['trainer']['cuda_device'] = cuda_device
        best_model, metrics = train_model(params, serialization_dir)
예제 #4
0
    def end_of_epoch(self, trainer: "CallbackTrainer"):
        """
        Refresh ``trainer.metrics`` with the results of the epoch that just ended.

        Records timing and epoch bookkeeping, copies the training and validation
        metrics under ``training_`` / ``validation_`` prefixes, updates the
        ``best_*`` entries when the metric tracker reports a new best, and dumps
        the dict to ``metrics_epoch_<n>.json`` when the trainer has a
        serialization directory.
        """
        summary = trainer.metrics
        current_epoch = trainer.epoch_number

        # Overall bookkeeping for the run so far.
        elapsed_seconds = time.time() - trainer.training_start_time
        summary.update({
            "training_duration": str(datetime.timedelta(seconds=elapsed_seconds)),
            "training_start_epoch": self.starting_epoch,
            "training_epochs": current_epoch - self.starting_epoch + 1,
            "epoch": current_epoch,
        })

        summary.update({
            "training_" + name: score
            for name, score in trainer.train_metrics.items()
        })
        summary.update({
            "validation_" + name: score
            for name, score in trainer.val_metrics.items()
        })

        if self.metric_tracker.is_best_so_far():
            # New best epoch: refresh every best_ entry (otherwise they keep
            # their previous values).
            summary["best_epoch"] = current_epoch
            summary.update({
                "best_validation_" + name: score
                for name, score in trainer.val_metrics.items()
            })
            # Store an independent copy of the validation metrics on the tracker.
            self.metric_tracker.best_epoch_metrics = copy.deepcopy(
                trainer.val_metrics)

        output_dir = trainer._serialization_dir
        if output_dir:
            dump_metrics(
                os.path.join(output_dir, f"metrics_epoch_{current_epoch}.json"),
                summary,
            )
예제 #5
0
    def finish(self, metrics: Dict[str, Any]) -> None:
        """
        Finalize the run: optional test evaluation, metrics dump, wandb summary.

        When an evaluation data loader is configured and ``evaluate_on_test`` is
        set, the model is evaluated and the results are added to ``metrics``
        under ``"test_"``-prefixed keys. The metrics are then written to
        ``metrics.json`` in the serialization directory and pushed to the wandb
        run summary, re-opening the run from environment settings if no run is
        active.
        """
        # wandb is imported here so that it has been initialized by the time
        # this method executes.
        import wandb  # noqa

        loader = self.evaluation_data_loader
        if loader is not None:
            if self.evaluate_on_test:
                logger.info(
                    "The model will be evaluated using the best epoch weights.")
                test_metrics = training_util.evaluate(
                    self.model,
                    loader,  # type:ignore
                    cuda_device=self.trainer.cuda_device,  # type: ignore
                    batch_weight_key=self.batch_weight_key,
                )
                for name, score in test_metrics.items():
                    metrics["test_" + name] = score
            else:
                logger.info(
                    "To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
                )

        metrics_path = os.path.join(self.serialization_dir, "metrics.json")
        common_util.dump_metrics(metrics_path, metrics, log=True)

        # Pick the run whose summary receives the metrics.
        run = wandb.run
        if run is not None:
            logger.info(
                "There is an active wandb run. Using that to update summary.")
        else:
            logger.info("wandb run was closed. Resuming to update summary.")
            run = wandb.init(
                id=read_from_env("WANDB_RUN_ID"),
                project=read_from_env("WANDB_PROJECT"),
                entity=read_from_env("WANDB_ENTITY"),
                resume="must",
            )

        if run is None:
            return
        logger.info("Updating summary on wandb.")
        run.summary.update(metrics)
예제 #6
0
    def finish(self, metrics: Dict[str, Any]):
        """
        Optionally evaluate on the test set, then write the final metrics.

        With an evaluation data loader and ``evaluate_on_test`` set, the model
        is evaluated and each result is stored in ``metrics`` under a
        ``"test_"`` prefix. With a loader but no flag, only a hint is logged.
        In every case ``metrics`` is dumped to ``metrics.json`` inside
        ``self.serialization_dir``.
        """
        loader = self.evaluation_data_loader
        if loader is not None:
            if self.evaluate_on_test:
                logger.info("The model will be evaluated using the best epoch weights.")
                test_metrics = training_util.evaluate(
                    self.model,
                    loader,
                    cuda_device=self.trainer.cuda_device,
                    batch_weight_key=self.batch_weight_key,
                )
                for name, score in test_metrics.items():
                    metrics["test_" + name] = score
            else:
                logger.info(
                    "To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
                )

        metrics_path = os.path.join(self.serialization_dir, "metrics.json")
        common_util.dump_metrics(metrics_path, metrics, log=True)
예제 #7
0
def train_one_epoch(
        trainer: Trainer,
        epoch_count: int) -> Tuple[Dict[str, float], Dict[str, float]]:
    """
    Run a single training epoch, plus validation when validation data exists.

    The prefixed training/validation metrics are dumped to
    ``metrics_epoch_<epoch_count>.json`` when the trainer has a serialization
    directory, and the learning-rate and momentum schedulers (if any) are
    stepped. No checkpoint is saved here.

    Returns the pair ``(train_metrics, val_metrics)``; ``val_metrics`` is empty
    when the trainer has no validation data.
    """
    train_metrics: Dict[str, float] = trainer._train_epoch(epoch_count)
    val_metrics: Dict[str, float] = {}
    this_epoch_val_metric: float = None

    if trainer._validation_data is not None:
        with torch.no_grad():
            # A validation set is available, so compute every metric on it.
            val_loss, num_batches = trainer._validation_loss()
            val_metrics = training_util.get_metrics(trainer.model,
                                                    val_loss,
                                                    num_batches,
                                                    reset=True)
            this_epoch_val_metric = val_metrics[trainer._validation_metric]

    # Merge both metric sets into one dict, prefixed by their origin.
    combined: Dict[str, float] = {}
    for name, score in train_metrics.items():
        combined["training_" + name] = score
    for name, score in val_metrics.items():
        combined["validation_" + name] = score

    output_dir = trainer._serialization_dir
    if output_dir:
        metrics_file = os.path.join(output_dir,
                                    f"metrics_epoch_{epoch_count}.json")
        dump_metrics(metrics_file, combined)

    # The scheduler API does not care whether a schedule needs a validation
    # metric; schedules that do not need it ignore the value passed here.
    lr_scheduler = trainer._learning_rate_scheduler
    if lr_scheduler:
        lr_scheduler.step(this_epoch_val_metric, epoch_count)
    momentum_scheduler = trainer._momentum_scheduler
    if momentum_scheduler:
        momentum_scheduler.step(this_epoch_val_metric, epoch_count)

    return train_metrics, val_metrics
예제 #8
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ValueError
        If ``evaluate_on_test`` is set in ``params`` while a non-default trainer is configured.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir,
                                            file_friendly_logging)

    # Everything between setting up and tearing down global logging runs inside
    # try/finally, so the teardown also happens when configuration, training or
    # evaluation raises (including the re-raised KeyboardInterrupt below).
    try:
        prepare_environment(params)

        cuda_device = params.params.get('trainer').get('cuda_device', -1)
        check_for_gpu(cuda_device)

        params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

        evaluate_on_test = params.pop_bool("evaluate_on_test", False)

        trainer_type = params.get("trainer", {}).get("type", "default")

        if trainer_type == "default":
            # Special logic to instantiate backward-compatible trainer.
            pieces = TrainerPieces.from_params(
                params,  # pylint: disable=no-member
                serialization_dir,
                recover,
                cache_directory,
                cache_prefix)
            trainer = Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)

            evaluation_iterator = pieces.validation_iterator or pieces.iterator
            evaluation_dataset = pieces.test_dataset

        else:
            if evaluate_on_test:
                raise ValueError(
                    "--evaluate-on-test only works with the default Trainer. "
                    "If you're using the CallbackTrainer you can use a callback "
                    "to evaluate at Events.TRAINING_END; otherwise you'll have "
                    "to run allennlp evaluate separately.")

            trainer = TrainerBase.from_params(params, serialization_dir,
                                              recover, cache_directory,
                                              cache_prefix)
            # Bind both names on this path so neither is ever undefined below.
            evaluation_iterator = None
            evaluation_dataset = None

        params.assert_empty('base train command')

        try:
            metrics = trainer.train()
        except KeyboardInterrupt:
            # if we have completed an epoch, try to create a model archive.
            if os.path.exists(os.path.join(serialization_dir,
                                           _DEFAULT_WEIGHTS)):
                # Use the module logger, like every other message in this function.
                logger.info(
                    "Training interrupted by the user. Attempting to create "
                    "a model archive using the current best epoch weights.")
                archive_model(serialization_dir,
                              files_to_archive=params.files_to_archive)
            raise

        # Evaluate
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="")

            for key, value in test_metrics.items():
                metrics["test_" + key] = value

        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")
    finally:
        cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
예제 #9
0
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.

        Runs from the epoch returned by ``self._restore_checkpoint()`` up to
        ``self._num_epochs``, stopping early when the metric tracker says so.
        After each completed epoch the aggregated metrics are dumped to
        ``metrics_epoch_<n>.json`` (when a serialization directory is set) and a
        checkpoint is saved. Before returning, the best model state reported by
        the checkpointer, if any, is loaded into ``self.model``.

        Returns the metrics dict of the last fully processed epoch (empty if no
        epoch completed).

        Raises ``ConfigurationError`` when restoring the checkpoint fails with a
        ``RuntimeError``.
        """

        def format_hms(seconds: float) -> str:
            # ``time.strftime("%H:%M:%S", time.gmtime(s))`` wraps around after
            # 24 hours because %H only covers 00-23. Compute the fields by hand
            # so long runs report e.g. "27:03:10"; below 24 hours the output is
            # identical to the strftime version.
            minutes, secs = divmod(int(seconds), 60)
            hours, minutes = divmod(minutes, 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"

        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        train_metrics: Dict[str, float] = {}
        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            # get peak of memory usage
            if 'cpu_memory_MB' in train_metrics:
                metrics['peak_cpu_memory_MB'] = max(
                    metrics.get('peak_cpu_memory_MB', 0),
                    train_metrics['cpu_memory_MB'])
            for key, value in train_metrics.items():
                if key.startswith('gpu_'):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0),
                                                 value)

            if self._validation_data is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(self.model,
                                                            val_loss,
                                                            num_batches,
                                                            reset=True)

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[
                        self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    if self._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            self._tensorboard.log_metrics(train_metrics,
                                          val_metrics=val_metrics,
                                          log_to_console=True)

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            metrics["training_duration"] = format_hms(training_elapsed_time)
            metrics["training_start_epoch"] = epoch_counter
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            for key, value in train_metrics.items():
                metrics["training_" + key] = value
            for key, value in val_metrics.items():
                metrics["validation_" + key] = value

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                metrics['best_epoch'] = epoch
                for key, value in val_metrics.items():
                    metrics["best_validation_" + key] = value

            if self._serialization_dir:
                dump_metrics(
                    os.path.join(self._serialization_dir,
                                 f'metrics_epoch_{epoch}.json'), metrics)

            if self._learning_rate_scheduler:
                # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
                # if it doesn't, the validation metric passed here is ignored.
                self._learning_rate_scheduler.step(this_epoch_val_metric,
                                                   epoch)

            self._save_checkpoint(epoch)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s", format_hms(epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                # Extrapolate from the average time per epoch in this run.
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * \
                    ((self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            epochs_trained += 1

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
예제 #10
0
    def train(self) -> Dict[str, Any]:
        """
        Run the training loop and return the metrics of the last processed epoch.

        Training resumes at the epoch returned by ``self._restore_checkpoint()``
        and runs up to ``self._num_epochs`` unless the metric tracker requests
        early stopping. The returned dict is seeded with the tracker's best
        epoch and best validation metrics. Per-epoch metric files and
        checkpoints are only written when ``is_master_rank()`` is true. Before
        returning, the checkpointer's best model state, if any, is loaded into
        ``self.model``.

        Raises ``ConfigurationError`` when restoring the checkpoint fails with a
        ``RuntimeError``.
        """
        try:
            first_epoch = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        run_started_at = time.time()

        # Seed the result with the best values the tracker already holds.
        metrics: Dict[str, Any] = {'best_epoch': self._metric_tracker.best_epoch}
        for name, score in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + name] = score

        # `epochs_trained` counts the epochs fully completed before this one.
        for epochs_trained, epoch in enumerate(
                range(first_epoch, self._num_epochs)):
            epoch_started_at = time.time()
            train_metrics = self._train_epoch(epoch)

            if self._validation_data is not None:
                with torch.no_grad():
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(self.get_model(),
                                                            val_loss,
                                                            num_batches,
                                                            reset=True)
                    this_epoch_val_metric = val_metrics[self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    # Early stopping leaves the loop before this epoch's
                    # metrics are recorded or a checkpoint is written.
                    if self._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            # Assemble the overall metrics for this epoch.
            elapsed = time.time() - run_started_at
            metrics.update({
                "training_duration": str(datetime.timedelta(seconds=elapsed)),
                "training_start_epoch": first_epoch,
                "training_epochs": epochs_trained,
                "epoch": epoch,
            })
            metrics.update({
                "training_" + name: score
                for name, score in train_metrics.items()
            })
            metrics.update({
                "validation_" + name: score
                for name, score in val_metrics.items()
            })

            if self._metric_tracker.is_best_so_far():
                metrics['best_epoch'] = epoch
                metrics.update({
                    "best_validation_" + name: score
                    for name, score in val_metrics.items()
                })
                self._metric_tracker.best_epoch_metrics = val_metrics

            if self._serialization_dir and is_master_rank():
                metrics_file = os.path.join(self._serialization_dir,
                                            f'metrics_epoch_{epoch}.json')
                dump_metrics(metrics_file, metrics)

            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric,
                                                   epoch)
            if is_master_rank():
                self._save_checkpoint(epoch)

            epoch_elapsed = time.time() - epoch_started_at
            logger.info("Epoch duration: %s",
                        datetime.timedelta(seconds=epoch_elapsed))

            if epoch < self._num_epochs - 1:
                # Extrapolate the remaining time from the average epoch time.
                elapsed = time.time() - run_started_at
                epochs_planned = self._num_epochs - first_epoch
                epochs_finished = float(epoch - first_epoch + 1)
                remaining_seconds = elapsed * (epochs_planned / epochs_finished - 1)
                logger.info(
                    "Estimated training time remaining: %s",
                    str(datetime.timedelta(seconds=int(remaining_seconds))))

        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
                                          len(reader.alltags))

# Final training run: everything is serialized into a "final" sub-directory of
# the base serialization directory.
ser_dir_iter = serialization_dir + "/final"
prepare_global_logging(ser_dir_iter, False)

# Train on the first two folds combined and validate on the separate validation
# set. NOTE(review): presumably `folds` holds lists of instances, so `+`
# concatenates them; confirm against the code that builds `folds`.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=folds[0] + folds[1],
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=45,
                  validation_metric="+f1-measure-overall",
                  cuda_device=cuda_device,
                  num_serialized_models_to_keep=3,
                  serialization_dir=ser_dir_iter)

# NOTE(review): the value returned by train() is discarded here; `metrics` used
# below has to come from code outside this excerpt. Confirm that is intended.
trainer.train()

# Evaluate the trained model on the test set, reusing the training iterator.
test_metrics = util.evaluate(
    trainer.model,
    test_dataset,
    iterator,
    cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
    batch_weight_key="")

# Add the test results to `metrics` under "test_"-prefixed keys.
for key, value in test_metrics.items():
    metrics["test_" + key] = value

# Write the combined metrics next to the final run's other outputs.
dump_metrics(os.path.join(ser_dir_iter, "metrics.json"), metrics, log=True)
예제 #12
0
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    include_package: List[str] = None,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    node_rank : ``int``, optional
        Rank of the node
    master_addr : ``str``, optional (default="127.0.0.1")
        Host part of the ``tcp://`` init method used to initialize the process group.
    master_port : ``int``, optional (default=29500)
        Port part of the ``tcp://`` init method used to initialize the process group.
    world_size : ``int``, optional
        The number of processes involved in distributed training.
    distributed_device_ids : ``List[str]``, optional
        Device ids available on this node, indexed by ``process_rank``. Required (non-empty)
        whenever ``world_size > 1``.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.

    # Raises

    ValueError
        If ``world_size > 1`` but ``distributed_device_ids`` is missing or empty, or if
        ``evaluate_on_test`` is set together with a non-default trainer.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Fail with a clear message instead of the TypeError/IndexError that
        # `len(None)` or indexing an empty list would produce further down.
        if not distributed_device_ids:
            raise ValueError(
                "Distributed training (world_size > 1) requires a non-empty "
                "`distributed_device_ids`.")

        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        # Use the module logger, like the rest of this function.
        logger.info(
            "Process group of world size %s initialized "
            "for distributed training in worker %s", world_size, global_rank)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover,
                                          cache_directory, cache_prefix)
        # Bind both names on this path so neither is ever undefined below.
        evaluation_iterator = None
        evaluation_dataset = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Only the rank-0 process evaluates and writes the final metrics.
    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key="",
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
예제 #13
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                debate_mode: List[str] = ('f'),
                judge_filename: str = None,
                update_judge: bool = False,
                eval_mode: bool = False,
                reward_method: str = None,
                detach_value_head: bool = False,
                breakpoint_level: int = 0,
                search_outputs_path: str = None,
                accumulation_steps: int = 1,
                multi_gpu: bool = False,
                choice_mode: str = None,
                qa_loss_weight: float = 0.,
                influence_reward: bool = False,
                theory_of_mind: bool = False,
                num_pred_rounds: int = -1,
                x_order_prob: float = 0.,
                require_action: bool = False,
                single_shot: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    With the ``"default"`` trainer type the debate-aware ``TrainerPieces`` / ``Trainer`` are built
    and most keyword arguments below are forwarded to them. Any other trainer type falls back to
    ``TrainerBase.from_params`` and is only accepted when ``debate_mode`` is the single turn ``'f'``.
    After training, the test set is optionally evaluated and the metrics are dumped into
    ``serialization_dir``. In ``eval_mode`` no model archive is created and the metrics are written
    to ``metrics.eval.d=<debate turns joined by '-'>.json`` instead of ``metrics.json``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    debate_mode : ``List[str]``, optional (default='f')
        List of debate turns (e.g. aa, ar, rr, Ar) => capitalization implies search agent.
        NOTE: the default ``('f')`` is the plain string ``'f'`` (there is no trailing comma), not a
        one-element tuple; iterating it still yields the single turn ``'f'``.
    judge_filename : ``str``, optional (default=None)
        Path to judge config or pre-trained judge model. If config, judge trained during debate. Necessary parameter
        if running in debate mode.
    update_judge : ``bool``, optional (default=False)
        Boolean whether or not to update Judge model during debate training.
    eval_mode : ``bool``, optional (default=False)
        Boolean whether or not to run in eval-only mode, on test data. Does not update/train any of the models.
    reward_method : ``str``, optional (default=None)
        Choice of reward function (RL) or loss function (Supervised Learning) for training debate agents
    detach_value_head : ``bool``, optional (default=False)
        Boolean whether or not to detach value function gradient updates from the policy network. This prevents
        value function gradients from affecting policy network parameters.
    breakpoint_level : ``int`` optional (default=0)
        Debugging option to set breakpoint sensitivity (0 - no breakpoints).
    search_outputs_path : ``str`` optional (default=None)
        Path to file with search predictions for each agent - necessary for supervised training
    accumulation_steps : ``int`` (default=1)
        Number of gradient steps to accumulate over before performing an update. Poor-man's batching for instances where
        number of examples per batch is small (limited GPU memory)
    multi_gpu : ``bool`` (default=False)
        Boolean whether or not to run models/training in model parallel mode. Requires specifying GPU allocations for
        trainer, judge, and debaters in the training config file (see training_config/bidaf.race.size=0.5.gpu=2.jsonnet
        for example usage). The ``gpu_allocations`` config entry must hold exactly three items, keyed ``debate``,
        ``judge`` and ``trainer``, each naming a device listed in the trainer's ``cuda_device``.
    choice_mode : ``str``, optional (default=None)
        Forwarded unchanged to ``Trainer.from_params``; its meaning is defined by the trainer.
    qa_loss_weight : ``float``, optional (default=0.)
        Forwarded to ``TrainerPieces.from_params``. A warning is issued when it is positive but no debate turn
        contains ``'l'`` or ``'w'``, because it is then unused.
    influence_reward : ``bool``, optional (default=False)
        Forwarded unchanged to ``TrainerPieces.from_params``.
    theory_of_mind : ``bool``, optional (default=False)
        Forwarded unchanged to ``TrainerPieces.from_params``.
    num_pred_rounds : ``int``, optional (default=-1)
        Forwarded unchanged to ``Trainer.from_params``. Must stay at ``-1`` when ``single_shot`` is set.
    x_order_prob : ``float``, optional (default=0.)
        Forwarded unchanged to ``Trainer.from_params``.
    require_action : ``bool``, optional (default=False)
        Forwarded unchanged to ``Trainer.from_params``.
    single_shot : ``bool``, optional (default=False)
        Forwarded to ``Trainer.from_params``. Only accepted together with ``eval_mode`` and with
        ``num_pred_rounds == -1``.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    # single_shot is only supported for evaluation and without a fixed number of prediction rounds.
    assert (
        not single_shot
    ) or eval_mode, 'Using single shot prediction outside eval_mode not yet supported.'
    assert (not single_shot) or (num_pred_rounds == -1), \
        'Using single shot prediction for a specific number of rounds is not yet supported.'
    # Count the debate turns containing 'l' or 'w'; when there are none, a positive
    # qa_loss_weight goes unused, so warn the caller about it.
    num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn))
                           for debate_turn in debate_mode])
    if (qa_loss_weight > 0) and (num_no_qa_turns == 0):
        warnings.warn(
            'Unused argument qa_loss_weight in debate mode ' +
            str(debate_mode) +
            '. If this was unintentional, please remove the -q flag.',
            UserWarning)
    # True when no turn contains any of the listed characters
    # (presumably the codes of trained debaters - TODO confirm against the trainer).
    not_using_trained_debater = len(
        set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0
    if (judge_filename is not None) and not_using_trained_debater:
        warnings.warn(
            'Unnecessary to have debaters in debate mode ' + str(debate_mode) +
            '. If this was unintentional, please remove the -j flag.',
            UserWarning)

    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Check that all Desired CUDA Devices exist => trainer => cuda_devices should contain list of required devices
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Build Allocation Dictionary (to be passed to all future functions)
    if multi_gpu:
        gpu_allocations, allocation_dict = params.params.pop(
            'gpu_allocations', {}), {}
        assert len(gpu_allocations
                   ) == 3, 'Must set gpu_allocations in config if multi-gpu'
        # NOTE: the ``in`` test below needs cuda_device to be a container of device ids;
        # a plain int (such as the -1 default) would raise TypeError here.
        for k in ['debate', 'judge', 'trainer']:
            assert gpu_allocations[
                k] in cuda_device, "Desired GPU not available... current: %s" % str(
                    cuda_device)
            allocation_dict[k] = gpu_allocations[k]
    else:
        allocation_dict = {}

    # Save the config before keys are popped from params below.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        params['dataset_reader'][
            'debate_mode'] = debate_mode  # If debate_mode requires sample duplicates
        pieces = TrainerPieces.from_params(params,
                                           serialization_dir,
                                           cuda_device,
                                           recover,
                                           judge_filename=judge_filename,
                                           update_judge=update_judge,
                                           eval_mode=eval_mode,
                                           reward_method=reward_method,
                                           detach_value_head=detach_value_head,
                                           allocation_dict=allocation_dict,
                                           qa_loss_weight=qa_loss_weight,
                                           influence_reward=influence_reward,
                                           theory_of_mind=theory_of_mind)  # pylint: disable=no-member
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            debate_mode=debate_mode,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            eval_mode=eval_mode,
            breakpoint_level=breakpoint_level,
            search_outputs_path=search_outputs_path,
            accumulation_steps=accumulation_steps,
            allocation_dict=allocation_dict,
            choice_mode=choice_mode,
            num_pred_rounds=num_pred_rounds,
            x_order_prob=x_order_prob,
            require_action=require_action,
            single_shot=single_shot)
        # Test-set evaluation reuses the validation iterator when one is configured.
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        assert (len(debate_mode)
                == 1) and (debate_mode[0]
                           == 'f'), 'TrainerBase untested for debate training.'
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # No test-set evaluation for non-default trainers.
        evaluation_iterator = evaluation_dataset = None

    # Every config key must have been consumed by now.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir,
                                       _DEFAULT_WEIGHTS)) and not eval_mode:
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            batch_weight_key="")

        # Test metrics are stored next to the training metrics under a "test_" prefix.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    if not eval_mode:
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)
    else:
        # Eval-only runs write to a debate-mode-specific file and create no archive.
        dump_metrics(os.path.join(
            serialization_dir,
            "metrics.eval.d=" + '-'.join(debate_mode) + ".json"),
                     metrics,
                     log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
예제 #14
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                del_models: bool = False,
                del_vocab: bool = False,
                convert: bool = False) -> Model:
    """
    Builds a trainer from the given :class:`Params` object, runs it, optionally evaluates on the
    test set, and writes the model archive and ``metrics.json`` into ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Passed on to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Passed on to the trainer factories so that an interrupted run in ``serialization_dir``
        can be resumed.
    del_models : ``bool``, optional (default=False)
        If ``True``, every regular file directly inside ``serialization_dir`` whose path does not
        end with ``config.yaml`` is removed, together with the ``log`` sub-directory.
    del_vocab : ``bool``, optional (default=False)
        If ``True``, the ``vocabulary`` sub-directory of ``serialization_dir`` is removed.
    convert : ``bool``, optional (default=False)
        If ``True``, no training happens: a checkpoint for epoch 0 is saved, a model archive is
        created and the process exits with status 0.

    Returns
    -------
    best_model: ``Model``
        The model held by the trainer after training (expected to carry the best epoch weights).
    """
    device = params.params.get('trainer').get('cuda_device', -1)
    if device >= 0:
        check_for_gpu(device)
        torch.cuda.set_device(device)

    # The serialization directory is deliberately not re-validated against the config here:
    # sometimes the config is changed a bit while training should still continue.
    if del_models:
        for entry in glob(f'{serialization_dir}/*'):
            if not os.path.isfile(entry) or entry.endswith('config.yaml'):
                continue
            os.remove(entry)
        log_dir = f'{serialization_dir}/log'
        if os.path.isdir(log_dir):
            shutil.rmtree(log_dir)
    if del_vocab:
        vocab_dir = f'{serialization_dir}/vocabulary'
        if os.path.isdir(vocab_dir):
            shutil.rmtree(vocab_dir)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    # NOTE(review): the device is looked up and checked a second time here, now unconditionally.
    device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(device)

    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    params.to_file(config_path)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type in ('default', 'trainer_fp16_single'):
        wants_fp16 = trainer_type != 'default'
        if wants_fp16:
            # The fp16 trainer must not see its own "type" key.
            params.get("trainer").pop('type')

        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        shared_kwargs = dict(model=pieces.model,
                             serialization_dir=serialization_dir,
                             iterator=pieces.iterator,
                             train_data=pieces.corpus.train,
                             validation_data=pieces.corpus.valid,
                             params=pieces.params,
                             validation_iterator=pieces.validation_iterator)
        if wants_fp16:
            trainer = TrainerF16SingleTask.from_params(
                files_to_archive=params.files_to_archive,
                **shared_kwargs)
        else:
            trainer = Trainer.from_params(**shared_kwargs)

        # Test-set evaluation reuses the validation iterator when one is configured.
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.corpus.test
        batch_weight_key = pieces.batch_weight_key
    else:
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # TODO(joelgrus): handle evaluation in the general case
        evaluation_iterator = evaluation_dataset = None

    params.assert_empty('base train command')

    if convert:
        logging.info('In conversion mode.')
        trainer._save_checkpoint(epoch=0)
        create_model_archive(serialization_dir, params)
        sys.exit(0)

    try:
        metrics = trainer.train()
    except (KeyboardInterrupt, RuntimeError):
        # Try to salvage an archive from whatever has been trained so far, then re-raise.
        logging.info("Training stopped. Attempting to create "
                     "a model archive using the current best epoch weights.")
        create_model_archive(serialization_dir, params)
        raise

    # Evaluate
    if evaluation_dataset:
        if evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                batch_weight_key=batch_weight_key)
            # Test metrics live next to the training metrics under a "test_" prefix.
            metrics.update({"test_" + name: score
                            for name, score in test_metrics.items()})
        else:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    dump_metrics(metrics_path, metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
예제 #15
0
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.

        Training resumes from the epoch returned by ``_restore_checkpoint``. Every epoch runs
        ``_train_epoch``; unless ``self._early_stopping_by_batch`` is set, the epoch is followed
        by validation through ``self._estimator``, metric logging, a per-epoch metrics dump,
        scheduler steps and a checkpoint. Callbacks (if any) are invoked under ``torch.no_grad()``
        at the start of training and at the start and end of each completed epoch. Before
        returning, the best model state known to the checkpointer is loaded into ``self.model``.

        Returns
        -------
        metrics : ``Dict[str, Any]``
            Training, validation and ``best_*`` metrics accumulated over the run.

        Raises
        ------
        ConfigurationError
            If restoring the checkpoint fails with a ``RuntimeError``.
        """
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError("Could not recover training from the checkpoint.  Did you mean to output to "
                                     "a different serialization directory or delete the existing serialization "
                                     "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        train_metrics: Dict[str, float] = {}
        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        # Seed the "best" entries from the metric tracker so a resumed run keeps them.
        metrics['best_epoch'] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + key] = value

        if self.callbacks is not None:
            with torch.no_grad():
                for callback in self.callbacks:
                    callback.on_train_begin()

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()

            if self.callbacks is not None:
                with torch.no_grad():
                    for callback in self.callbacks:
                        callback.on_epoch_begin(epoch)

            train_metrics = self._train_epoch(epoch)
            if not self._early_stopping_by_batch:
                # get peak of memory usage
                if 'cpu_memory_MB' in train_metrics:
                    metrics['peak_cpu_memory_MB'] = max(metrics.get('peak_cpu_memory_MB', 0),
                                                        train_metrics['cpu_memory_MB'])
                for key, value in train_metrics.items():
                    if key.startswith('gpu_'):
                        metrics["peak_"+key] = max(metrics.get("peak_"+key, 0), value)

                if self._validation_data is not None:
                    with torch.no_grad():
                        val_metrics_temp = self._estimator.estimate(self._validation_data)
                        # Start from the raw estimator output (with a default loss of 0) ...
                        val_metrics = {'loss': 0}
                        val_metrics.update(val_metrics_temp)
                        # ... and only then write the flattened scalar aliases. Merging the raw
                        # output last would overwrite 'category_f1' with the nested dict again.
                        if 'sentiment_acc' in val_metrics_temp:
                            val_metrics['accuracy'] = val_metrics_temp['sentiment_acc']
                        if 'category_f1' in val_metrics_temp:
                            val_metrics['category_f1'] = val_metrics_temp['category_f1']['fscore']
                        if 'other_metrics' in val_metrics_temp and 'merge_micro_f1' in val_metrics_temp['other_metrics']:
                            val_metrics['merge_micro_f1'] = val_metrics_temp['other_metrics']['merge_micro_f1']

                        # Check validation metric for early stopping
                        this_epoch_val_metric = val_metrics[self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.should_stop_early():
                            logger.info("Ran out of patience.  Stopping training.")
                            break

                self._tensorboard.log_metrics(train_metrics,
                                              val_metrics=val_metrics,
                                              log_to_console=True,
                                              epoch=epoch + 1)  # +1 because tensorboard doesn't like 0

                # Create overall metrics dict
                training_elapsed_time = time.time() - training_start_time
                metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time))
                metrics["training_start_epoch"] = epoch_counter
                metrics["training_epochs"] = epochs_trained
                metrics["epoch"] = epoch

                for key, value in train_metrics.items():
                    metrics["training_" + key] = value
                for key, value in val_metrics.items():
                    metrics["validation_" + key] = value

                if self._metric_tracker.is_best_so_far():
                    # Update all the best_ metrics.
                    # (Otherwise they just stay the same as they were.)
                    metrics['best_epoch'] = epoch
                    for key, value in val_metrics.items():
                        metrics["best_validation_" + key] = value

                    self._metric_tracker.best_epoch_metrics = val_metrics

                if self._serialization_dir:
                    dump_metrics(os.path.join(self._serialization_dir, f'metrics_epoch_{epoch}.json'), metrics)

                # The Scheduler API is agnostic to whether your schedule requires a validation metric -
                # if it doesn't, the validation metric passed here is ignored.
                if self._learning_rate_scheduler:
                    self._learning_rate_scheduler.step(this_epoch_val_metric, epoch)
                if self._momentum_scheduler:
                    self._momentum_scheduler.step(this_epoch_val_metric, epoch)

                self._save_checkpoint(epoch)
            else:
                # Validation is not run per epoch in this mode; only the tracker's
                # early-stopping verdict is consulted here.
                if self._metric_tracker.should_stop_early():
                    logger.info("Ran out of patience.  Stopping training.")
                    break

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                # Extrapolate the remaining time from the average duration of the epochs run so far.
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * \
                    ((self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s", formatted_time)

            if self.callbacks is not None:
                with torch.no_grad():
                    for callback in self.callbacks:
                        callback.on_epoch_end(epoch)
            epochs_trained += 1

        # NOTE: the tensorboard writer is not closed here (the close() call is disabled).

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
예제 #16
0
    def train(self) -> Dict[str, Any]:
        """
        Runs the training loop and returns the metrics collected along the way.

        Training resumes from the state returned by ``_restore_checkpoint``. After every epoch the
        model is scored with ``self.predictor`` (when one is set), metrics are logged and dumped,
        the learning rate is adjusted and a checkpoint is saved. The loop ends early when
        ``_should_stop_early`` reports that patience has run out.

        Returns
        -------
        metrics : ``Dict[str, Any]``
            Training, validation and ``best_*`` metrics as of the last fully processed epoch.

        Raises
        ------
        ConfigurationError
            If restoring the checkpoint fails with a ``RuntimeError``.
        """
        try:
            first_epoch, val_history = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        self._enable_gradient_clipping()
        self._enable_activation_logging()

        logger.info("Beginning training.")

        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        run_started = time.time()

        for epoch in range(first_epoch, self._num_epochs):
            epoch_started = time.time()
            train_metrics = self._train_epoch(epoch)

            if self.predictor is None:
                # Nothing to validate with, so every epoch counts as the best so far.
                is_best = True
                val_metrics = {}
                epoch_score = None
            else:
                val_metrics = self.predictor.evaluate(self.model)
                epoch_score = val_metrics[self._validation_metric]

                # Compare against the earlier scores before this epoch's score is recorded.
                is_best = self._is_best_so_far(
                    epoch_score, self._validation_metric_per_interval)
                val_history.append(epoch_score)
                self._validation_metric_per_interval.append(epoch_score)

                if self._should_stop_early(val_history):
                    logger.info("Ran out of patience.  Stopping training.")
                    break

            self._metrics_to_tensorboard(epoch,
                                         train_metrics,
                                         val_metrics=val_metrics)
            self._metrics_to_console(train_metrics, val_metrics)

            # Assemble the overall metrics dict for this epoch.
            elapsed = time.time() - run_started
            metrics["training_duration"] = time.strftime("%H:%M:%S",
                                                         time.gmtime(elapsed))
            metrics["training_start_epoch"] = first_epoch
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            metrics.update({"training_" + name: score
                            for name, score in train_metrics.items()})
            metrics.update({"validation_" + name: score
                            for name, score in val_metrics.items()})

            if is_best:
                # Refresh the best_ entries; otherwise they keep their previous values.
                metrics['best_epoch'] = epoch
                metrics.update({"best_validation_" + name: score
                                for name, score in val_metrics.items()})

            if self._serialization_dir:
                epoch_metrics_path = os.path.join(self._serialization_dir,
                                                  f'metrics_epoch_{epoch}.json')
                dump_metrics(epoch_metrics_path, metrics)

            if self._learning_rate_scheduler:
                # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
                # if it doesn't, the validation metric passed here is ignored.
                self._learning_rate_scheduler.step(epoch_score, epoch)

            if self.learning_rate_decay:
                # Multiplicative decay, applied to the first parameter group only.
                first_group = self.optimizer.param_groups[0]
                first_group['lr'] *= self.learning_rate_decay

            self._save_checkpoint(epoch, val_history, is_best=is_best)

            epoch_duration = time.time() - epoch_started
            logger.info("Epoch duration: %s",
                        time.strftime("%H:%M:%S", time.gmtime(epoch_duration)))

            if epoch < self._num_epochs - 1:
                # Extrapolate the remaining time from the average duration of the epochs run so far.
                elapsed = time.time() - run_started
                epochs_planned = self._num_epochs - first_epoch
                epochs_done = float(epoch - first_epoch + 1)
                remaining = elapsed * (epochs_planned / epochs_done - 1)
                logger.info("Estimated training time remaining: %s",
                            str(datetime.timedelta(seconds=int(remaining))))

            epochs_trained += 1

        return metrics
예제 #17
0
    def _try_train(self) -> Tuple[Dict[str, Any], int]:
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?"
            )

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        val_metrics: Dict[str, float] = {}
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        metrics["best_epoch"] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + key] = value

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            # Back up the model now, in case something goes wrong later with the evaluation
            if self._primary and self._checkpointer is not None:
                self._checkpointer.shelve_model(epoch, self)
            # Wait for the primary process to finish saving the model checkpoint
            if self._distributed:
                dist.barrier()

            # get peak of memory usage
            for key, value in train_metrics.items():
                if key.startswith("gpu_") and key.endswith("_memory_MB"):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value)
                elif key.startswith("worker_") and key.endswith("_memory_MB"):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value)

            this_epoch_val_metric: float = 0.0
            if self._validation_data_loader is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, val_reg_loss, num_batches = self._validation_loss(epoch)

                    # It is safe again to wait till the validation is done. This is
                    # important to get the metrics right.
                    if self._distributed:
                        dist.barrier()

                    val_metrics = training_util.get_metrics(
                        self.model,
                        val_loss,
                        val_reg_loss,
                        batch_loss=None,
                        batch_reg_loss=None,
                        num_batches=num_batches,
                        reset=True,
                        world_size=self._world_size,
                        cuda_device=self.cuda_device,
                    )

                    # Check validation metric for early stopping
                    this_epoch_val_metric = self._metric_tracker.combined_score(val_metrics)
                    self._metric_tracker.add_metrics(val_metrics)

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time))
            metrics["training_start_epoch"] = epoch_counter
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            for key, value in train_metrics.items():
                metrics["training_" + key] = value
            for key, value in val_metrics.items():
                metrics["validation_" + key] = value

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                metrics["best_epoch"] = epoch
                for key, value in val_metrics.items():
                    metrics["best_validation_" + key] = value

                self._metric_tracker.best_epoch_metrics = val_metrics

            if self._serialization_dir and self._primary:
                common_util.dump_metrics(
                    os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"),
                    metrics,
                )

            # The Scheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric)
            if self._momentum_scheduler:
                self._momentum_scheduler.step(this_epoch_val_metric)

            # The checkpointer saves state from the learning rate scheduler and the momentum
            # scheduler, so we have to make sure those are updated before we save the checkpoint here.
            if self._primary and self._checkpointer is not None:
                self._checkpointer.save_checkpoint(
                    epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far()
                )
            # Wait for the primary process to finish saving the checkpoint
            if self._distributed:
                dist.barrier()

            for callback in self._callbacks:
                callback.on_epoch(self, metrics=metrics, epoch=epoch, is_primary=self._primary)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * (
                        (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1
                )
                formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s", formatted_time)

            epochs_trained += 1

            if self._metric_tracker.should_stop_early():
                logger.info("Ran out of patience. Stopping training.")
                break
        else:
            epoch = self._num_epochs - 1

        # Load the best model state before returning
        best_model_state = (
            None if self._checkpointer is None else self._checkpointer.best_model_state()
        )
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics, epoch
예제 #18
0
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.

        Starting from the epoch returned by `self._restore_checkpoint()`, each epoch
        runs `self._train_epoch`, optionally computes validation metrics, logs to
        tensorboard, writes a per-epoch metrics file, steps the schedulers, saves a
        checkpoint and invokes the epoch callbacks. After the loop, the best model
        state reported by the checkpointer (if any) is loaded into `self.model`.

        Returns
        -------
        metrics : `Dict[str, Any]`
            The accumulated metrics dict: `best_*` entries, `peak_*` memory entries,
            timing / epoch bookkeeping, and the most recent `training_*` and
            `validation_*` values.

        Raises
        ------
        ConfigurationError
            If `self._restore_checkpoint()` raises a `RuntimeError`.
        """
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            # Print the original failure before replacing it with a configuration error.
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        # Stays empty for the whole run when there is no validation data loader.
        val_metrics: Dict[str, float] = {}
        # Stays None when no validation is run; it is still passed to the schedulers below.
        this_epoch_val_metric: float = None
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        # Seed the "best" entries from the metric tracker's current state.
        metrics["best_epoch"] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + key] = value

        # Callbacks are invoked once before the first epoch, with epoch=-1 and empty metrics.
        for callback in self._epoch_callbacks:
            callback(self, metrics={}, epoch=-1)

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            # get peak of memory usage
            if "cpu_memory_MB" in train_metrics:
                metrics["peak_cpu_memory_MB"] = max(
                    metrics.get("peak_cpu_memory_MB", 0),
                    train_metrics["cpu_memory_MB"])
            for key, value in train_metrics.items():
                if key.startswith("gpu_"):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0),
                                                 value)

            if self._validation_data_loader is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, val_reg_loss, num_batches = self._validation_loss(
                        epoch)

                    # It is safe again to wait till the validation is done. This is
                    # important to get the metrics right.
                    if self._distributed:
                        dist.barrier()

                    val_metrics = training_util.get_metrics(
                        self.model,
                        val_loss,
                        val_reg_loss,
                        num_batches,
                        reset=True,
                        world_size=self._world_size,
                        cuda_device=[self.cuda_device],
                    )

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[
                        self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    # NOTE: breaking here leaves the epoch loop immediately, so the
                    # tensorboard logging, metrics update/dump, scheduler steps,
                    # checkpoint save and callbacks below are skipped for this epoch.
                    if self._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            # Only logged when `self._master` is set.
            if self._master:
                self._tensorboard.log_metrics(
                    train_metrics,
                    val_metrics=val_metrics,
                    log_to_console=True,
                    epoch=epoch + 1)  # +1 because tensorboard doesn't like 0

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            metrics["training_duration"] = str(
                datetime.timedelta(seconds=training_elapsed_time))
            metrics["training_start_epoch"] = epoch_counter
            # `epochs_trained` is incremented at the end of the loop body, so this is
            # the number of epochs completed *before* the current one.
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            for key, value in train_metrics.items():
                metrics["training_" + key] = value
            for key, value in val_metrics.items():
                metrics["validation_" + key] = value

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                metrics["best_epoch"] = epoch
                for key, value in val_metrics.items():
                    metrics["best_validation_" + key] = value

                self._metric_tracker.best_epoch_metrics = val_metrics

            # One metrics file per epoch, written only when `self._master` is set.
            if self._serialization_dir and self._master:
                common_util.dump_metrics(
                    os.path.join(self._serialization_dir,
                                 f"metrics_epoch_{epoch}.json"), metrics)

            # The Scheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric)
            if self._momentum_scheduler:
                self._momentum_scheduler.step(this_epoch_val_metric)

            # The checkpoint is saved after the schedulers have been stepped.
            if self._master:
                self._checkpointer.save_checkpoint(
                    epoch,
                    self,
                    is_best_so_far=self._metric_tracker.is_best_so_far())

            # Wait for the master to finish saving the checkpoint
            if self._distributed:
                dist.barrier()

            for callback in self._epoch_callbacks:
                callback(self, metrics=metrics, epoch=epoch)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s",
                        datetime.timedelta(seconds=epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                # Linear extrapolation: elapsed * (epochs planned for this run /
                # epochs finished in this run - 1).
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * (
                    (self._num_epochs - epoch_counter) /
                    float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            epochs_trained += 1

        # make sure pending events are flushed to disk and files are closed properly
        self._tensorboard.close()

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
예제 #19
0
    def train(self) -> Dict[str, Any]:
        """
        Runs cross-validation and returns the per-fold and averaged metrics.

        The training set (concatenated with the validation set when one is present)
        is split by ``self.cross_validation_splitter``. For every fold a deep copy of
        ``self.model`` is trained by a sub-trainer built with
        ``self._build_subtrainer``, then evaluated on the fold's test instances; the
        fold's metrics are dumped to ``<serialization_dir>/fold_<i>/metrics.json``.
        The ``question_id`` of every instance in each split is also written to
        ``data/folds/fold<i>_<split>_ids`` so the folds can be checked afterwards.

        If ``self.leave_model_trained`` is set, ``self.model`` is finally trained on
        the original train/validation datasets.

        Returns
        -------
        metrics : ``Dict[str, Any]``
            ``fold<i>_<key>`` for every key of the first fold's metrics, plus
            ``average_<key>`` for the keys whose first-fold value is a ``float``.
        """
        if self.validation_dataset:
            logger.info(
                "Using the concatenation of the training and the validation datasets for"
                " cross-validation.")
            dataset = self.train_dataset + self.validation_dataset
        else:
            dataset = self.train_dataset

        groups = self._get_groups(dataset)
        n_splits = self.cross_validation_splitter.get_n_splits(dataset,
                                                               groups=groups)

        # TODO: make it generic as a "fold consistency checking", in which the folder and field key is specified.
        fold_ids_dir = 'data/folds'
        # The ID files are opened for writing below; without this the first open()
        # raises FileNotFoundError whenever the directory does not exist yet.
        os.makedirs(fold_ids_dir, exist_ok=True)

        def write_question_ids(fold: int, split_name: str, instances) -> None:
            # One question id per line for the given split of the given fold.
            with open(f'{fold_ids_dir}/fold{fold}_{split_name}_ids', 'w') as file:
                file.writelines(
                    f'{instance["question_id"].as_tensor({}).item()}\n'
                    for instance in instances)

        metrics_by_fold = []
        splits = self.cross_validation_splitter(dataset, groups=groups)
        for fold_index, (train_indices, validation_indices,
                         test_indices) in enumerate(splits):
            logger.info("Fold %d/%d", fold_index, n_splits - 1)
            serialization_dir = os.path.join(self._serialization_dir,
                                             f'fold_{fold_index}')
            os.makedirs(serialization_dir, exist_ok=True)

            train_dataset = [dataset[i] for i in train_indices]
            # An empty validation split is passed to the sub-trainer as None.
            validation_dataset = [dataset[i]
                                  for i in validation_indices] or None
            test_dataset = [dataset[i] for i in test_indices]

            write_question_ids(fold_index, 'train', train_dataset)
            if validation_dataset:
                write_question_ids(fold_index, 'validation', validation_dataset)
            write_question_ids(fold_index, 'test', test_dataset)

            # Every fold starts from a fresh copy so folds do not share weights.
            model = copy.deepcopy(self.model)
            subtrainer = self._build_subtrainer(serialization_dir, model,
                                                train_dataset,
                                                validation_dataset)

            # TODO: on KeyboardInterrupt, archive the current best weights of this
            #  fold (if any were saved) before re-raising, and archive the model of
            #  every fold once its training finishes.
            fold_metrics = subtrainer.train()

            test_metrics = training_util.evaluate(
                model,
                test_dataset,
                self.iterator,
                cuda_device=self._cuda_devices[0],
                batch_weight_key='')
            for metric_key, metric_value in test_metrics.items():
                # Prefix with "test_" only when the name would otherwise clash with
                # a metric already reported by the sub-trainer.
                if metric_key in fold_metrics:
                    fold_metrics[f'test_{metric_key}'] = metric_value
                else:
                    fold_metrics[metric_key] = metric_value

            dump_metrics(os.path.join(serialization_dir, 'metrics.json'),
                         fold_metrics,
                         log=True)

            metrics_by_fold.append(fold_metrics)

        metrics = {}

        # The keys (and the float-ness of each value) are taken from the first fold.
        for metric_key, fold_0_metric_value in metrics_by_fold[0].items():
            if isinstance(fold_0_metric_value, float):
                average = Average()
                for fold_index, fold_metrics in enumerate(metrics_by_fold):
                    metric_value = fold_metrics[metric_key]
                    metrics[f'fold{fold_index}_{metric_key}'] = metric_value
                    average(metric_value)
                metrics[f'average_{metric_key}'] = average.get_metric()
            else:
                for fold_index, fold_metrics in enumerate(metrics_by_fold):
                    metrics[f'fold{fold_index}_{metric_key}'] = fold_metrics[
                        metric_key]

        if self.leave_model_trained:
            subtrainer = self._build_subtrainer(self._serialization_dir,
                                                self.model, self.train_dataset,
                                                self.validation_dataset)
            subtrainer.train()

        return metrics
예제 #20
0
def main(args):
    """
    Train active-learning coreference model(s) as configured by ``args``.

    Attributes read from ``args``: ``selector`` (``entropy``, ``score``, ``random``
    or ``qbc<N>`` where ``<N>`` is the number of ensemble models), exactly one of
    ``labels_to_query`` (comma-separated label budgets) or ``query_time_file``
    (colon-separated file paths), and ``cuda_device``, ``save_al_queries``,
    ``pairwise``, ``no_clusters``; optionally ``experiments``, ``testing`` and
    ``testing_vocab``.

    When ``args.experiments`` is set, one model is trained per entry of the
    label/time list and its metrics and query info are saved under that directory.
    Otherwise a single run is made for the first entry inside a temporary directory.

    Raises
    ------
    AssertionError
        If the selector is not recognised, or if not exactly one of
        ``labels_to_query`` / ``query_time_file`` is given.
    """
    # validate inputs
    num_ensemble_models = None
    selector = args.selector
    if selector[:3] == 'qbc':
        # 'qbc<N>': the ensemble size follows the prefix.
        assert len(selector) > 3, "qbc selector must be followed by the number of models"
        num_ensemble_models = int(selector[3:])
        selector = 'qbc'
    assert selector in ('entropy', 'score', 'random', 'qbc'), \
        "unknown selector: {}".format(selector)

    labels_to_query = getattr(args, 'labels_to_query', None)
    query_time_file = getattr(args, 'query_time_file', None)
    # 1 and only 1 specified
    assert labels_to_query or query_time_file
    assert not labels_to_query or not query_time_file

    # parse inputs
    if labels_to_query:
        label_times_list = labels_to_query.split(",")
    else:
        label_times_list = query_time_file.split(":")

    # import submodule
    import_submodules('discrete_al_coref_module')

    query_type = "pairwise" if args.pairwise else "discrete"
    save_dir = getattr(args, 'experiments', None)

    if save_dir:
        # Default (experimental) mode
        # create save dir
        os.makedirs(save_dir, exist_ok=True)

        for x in label_times_list:
            if labels_to_query:
                x = int(x)
                assert x >= 0
                print(f"Running with {x} labels per doc")
                save_fn = x
            else:
                assert os.path.exists(x)
                print(f"Running with equivalent annotation time to {x}")
                # Flatten the file path into a single file-name-safe token.
                save_fn = x.replace('/', '%').replace('_query_info.json', '').replace(
                    '.json', '').replace('.', '')
            serialization_dir = os.path.join(save_dir, f"checkpoint_{save_fn}")

            print(f"Saving in directory: {serialization_dir}")
            if os.path.exists(serialization_dir):
                print("Deleting existing directory found in same location.")
                shutil.rmtree(serialization_dir)

            # modify parameters according to passed-in arguments
            params = Params.from_file("training_config/coref.jsonnet")
            trainer_params = params.params['trainer']
            al_params = trainer_params['active_learning']
            trainer_params['cuda_device'] = args.cuda_device
            al_params['save_al_queries'] = args.save_al_queries
            al_params['query_type'] = query_type
            if selector:
                al_params['selector']['type'] = selector
            al_params['selector']['use_clusters'] = not args.no_clusters
            if labels_to_query:
                al_params['num_labels'] = x
            else:
                al_params['use_equal_annot_time'] = True
                al_params['equal_annot_time_file'] = x

            # train model
            best_model, metrics, query_info = train_model(
                params, serialization_dir, selector, num_ensemble_models, recover=False)
            dump_metrics(os.path.join(save_dir, f"{save_fn}.json"), metrics, log=True)
            with open(os.path.join(save_dir, f"{save_fn}_query_info.json"),
                      'w', encoding='utf-8') as f:
                json.dump(query_info, f)
    else:
        # Test mode: a single run for the first entry only.
        params = Params.from_file('training_config/coref.jsonnet')
        trainer_params = params.params['trainer']
        al_params = trainer_params['active_learning']
        if labels_to_query:
            # Converted to int so the value has the same type as in experiments mode.
            al_params['num_labels'] = int(label_times_list[0])
        else:
            al_params['use_equal_annot_time'] = True
            al_params['equal_annot_time_file'] = label_times_list[0]
        al_params['save_al_queries'] = args.save_al_queries
        if getattr(args, 'testing', None) or getattr(args, 'testing_vocab', None):
            al_params['epoch_interval'] = 0
            del params.params['test_data_path']
            # Uncomment if necessary:
            # params.params['train_data_path'] = "/checkpoint/belindali/active_learning_coref/coref_ontonotes/dev.english.v4_gold_conll"
            # params.params['dataset_reader']['fully_labelled_threshold'] = 100
            if getattr(args, 'testing', None):
                params.params['model']['text_field_embedder']['token_embedders']['tokens'] = {
                    'type': 'embedding', 'embedding_dim': 300}
        with TemporaryDirectory() as serialization_dir:
            print("temp file path: " + str(serialization_dir))
            trainer_params['cuda_device'] = args.cuda_device
            al_params['query_type'] = query_type
            al_params['selector']['type'] = selector if selector else "entropy"
            al_params['selector']['use_clusters'] = not args.no_clusters
            best_model, metrics, query_info = train_model(
                params, serialization_dir, selector, num_ensemble_models)
            # Written inside the temporary directory, so it is removed on exit.
            with open(os.path.join(serialization_dir, "query_info.json"),
                      'w', encoding='utf-8') as f:
                json.dump(query_info, f)
예제 #21
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Besides training, this writes the config and vocabulary into ``serialization_dir``,
    archives the model, reloads the ``best.th`` weights, optionally evaluates on the test
    data, and dumps the final metrics to ``metrics.json``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        Passed through to ``create_serialization_dir`` (presumably allows overwriting an
        existing serialization directory -- confirm against that helper).

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # `cuda_device` may be a single device id or a list of them.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    # A separate validation iterator is optional.
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the `no_grad` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Every key must have been consumed by now.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger like the rest of this function, not the root logger.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # The best weights are loaded into the same model object that was trained.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
예제 #22
0
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.
        """
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        logger.info("Beginning training.")

        train_metrics: Dict[str, float] = {}
        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        metrics['best_epoch'] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            metrics["best_validation_" + key] = value

        ####################################################################################################
        if self.visdom:

            def create_plot_window(vis, xlabel, ylabel, title):
                return vis.line(X=np.array([1]),
                                Y=np.array([np.nan]),
                                opts=dict(xlabel=xlabel,
                                          ylabel=ylabel,
                                          title=title))

            self.train_loss_window = create_plot_window(
                self.visdom, '#Iterations', 'Loss', 'Training Loss')
            self.consume_time_window = create_plot_window(
                self.visdom, "#Epochs", "Seconds", "Consuming time")
            self.left_time_window = self.visdom.text(
                "Waiting for training.......")
            metric_window = {}
        ##########################################################################################

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            # get peak of memory usage
            if 'cpu_memory_MB' in train_metrics:
                metrics['peak_cpu_memory_MB'] = max(
                    metrics.get('peak_cpu_memory_MB', 0),
                    train_metrics['cpu_memory_MB'])
            for key, value in train_metrics.items():
                if key.startswith('gpu_'):
                    metrics["peak_" + key] = max(metrics.get("peak_" + key, 0),
                                                 value)

            if self._validation_data is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(self.model,
                                                            val_loss,
                                                            num_batches,
                                                            reset=True)

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[
                        self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    if self._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            self._tensorboard.log_metrics(
                train_metrics,
                val_metrics=val_metrics,
                log_to_console=True,
            )  # +1 because tensorboard doesn't like 0

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            metrics["training_duration"] = time.strftime(
                "%H:%M:%S", time.gmtime(training_elapsed_time))
            metrics["training_start_epoch"] = epoch_counter
            metrics["training_epochs"] = epochs_trained
            metrics["epoch"] = epoch

            # print(train_metrics.keys())
            # print(val_metrics.keys())

            ###############################################################################################
            if self.visdom:
                for key in train_metrics.keys():

                    newkey = 'training_' + key
                    if newkey in metric_window:
                        continue
                    else:
                        metric_window[newkey] = create_plot_window(
                            self.visdom, '#Epochs', key, newkey)

                for key in val_metrics.keys():

                    newkey = 'validation_' + key
                    if newkey in metric_window:
                        continue
                    else:
                        metric_window[newkey] = create_plot_window(
                            self.visdom, '#Epochs', key, newkey)
            #################################################################################################

            for key, value in train_metrics.items():
                metrics["training_" + key] = value

                ##########################################################
                if self.visdom:
                    self.visdom.line(X=np.array([epoch]),
                                     Y=np.array([value]),
                                     win=metric_window["training_" + key],
                                     update='append')
                #########################################################

            for key, value in val_metrics.items():
                metrics["validation_" + key] = value

                ##########################################################
                if self.visdom:
                    self.visdom.line(X=np.array([epoch]),
                                     Y=np.array([value]),
                                     win=metric_window["validation_" + key],
                                     update='append')
                ############################################################

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                metrics['best_epoch'] = epoch
                for key, value in val_metrics.items():
                    metrics["best_validation_" + key] = value

                self._metric_tracker.best_epoch_metrics = val_metrics

            if self._serialization_dir:
                dump_metrics(
                    os.path.join(self._serialization_dir,
                                 f'metrics_epoch_{epoch}.json'), metrics)

            # The Scheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric,
                                                   epoch)
            if self._momentum_scheduler:
                self._momentum_scheduler.step(this_epoch_val_metric, epoch)

            self._save_checkpoint(epoch)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info(
                "Epoch duration: %s",
                time.strftime("%H:%M:%S", time.gmtime(epoch_elapsed_time)))
            #######################################################################################
            if self.visdom:
                self.visdom.line(X=np.array([epoch]),
                                 Y=np.array([epoch_elapsed_time / 60]),
                                 win=self.consume_time_window,
                                 update='append')
            ############################################################################################
            if epoch < self._num_epochs - 1:
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * \
                    ((self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)
                #######################################################################################
                if self.visdom:
                    self.visdom.text(
                        "Estimated training time remaining: {}".format(
                            formatted_time),
                        win=self.left_time_window,
                        append=True)
                ############################################################################################
            epochs_trained += 1

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        return metrics
예제 #23
0
파일: util.py 프로젝트: jbrry/allennlp
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int = -1,
    batch_weight_key: str = None,
    output_file: str = None,
    predictions_output_file: str = None,
) -> Dict[str, Any]:
    """
    Run `model` in evaluation mode over every batch of `data_loader` and return its metrics.

    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    output_file : `str`, optional (default=`None`)
        Optional path to write the final metrics to.
    predictions_output_file : `str`, optional (default=`None`)
        Optional path to write the predictions to.  One JSON document is written per batch,
        each on its own line.

    # Returns

    `Dict[str, Any]`
        The final metrics.  When the model produced a loss, a `"loss"` entry holding the
        weighted average loss over all batches is included.

    # Raises

    `RuntimeError`
        If the model produced a loss for some batches but not for all of them.
    """
    check_for_gpu(cuda_device)
    data_loader.set_target_device(int_to_device(cuda_device))
    predictions_file = (None if predictions_output_file is None else open(
        predictions_output_file, "w"))

    # Everything that can raise after the file was opened runs inside this try, so the
    # predictions file is closed even when a batch fails half-way through the dataset.
    try:
        with torch.no_grad():
            model.eval()

            iterator = iter(data_loader)
            logger.info("Iterating over dataset")
            generator_tqdm = Tqdm.tqdm(iterator)

            # Number of batches in instances.
            batch_count = 0
            # Number of batches where the model produces a loss.
            loss_count = 0
            # Cumulative weighted loss
            total_loss = 0.0
            # Cumulative weight across all batches.
            total_weight = 0.0

            for batch in generator_tqdm:
                batch_count += 1
                batch = nn_util.move_to_device(batch, cuda_device)
                output_dict = model(**batch)
                loss = output_dict.get("loss")

                # Running (not reset) metrics, used only for the progress bar.
                metrics = model.get_metrics()

                if loss is not None:
                    loss_count += 1
                    if batch_weight_key:
                        weight = output_dict[batch_weight_key].item()
                    else:
                        weight = 1.0

                    total_weight += weight
                    total_loss += loss.item() * weight
                    # Report the average loss so far.
                    metrics["loss"] = total_loss / total_weight

                # Warn only once per process about metrics hidden from the progress bar.
                if not HasBeenWarned.tqdm_ignores_underscores and any(
                        metric_name.startswith("_") for metric_name in metrics):
                    logger.warning('Metrics with names beginning with "_" will '
                                   "not be logged to the tqdm progress bar.")
                    HasBeenWarned.tqdm_ignores_underscores = True
                description = (", ".join([
                    "%s: %.2f" % (name, value)
                    for name, value in metrics.items()
                    if not name.startswith("_")
                ]) + " ||")
                generator_tqdm.set_description(description, refresh=False)

                if predictions_file is not None:
                    predictions = json.dumps(
                        sanitize(model.make_output_human_readable(output_dict)))
                    predictions_file.write(predictions + "\n")

            final_metrics = model.get_metrics(reset=True)
            if loss_count > 0:
                # Sanity check
                if loss_count != batch_count:
                    raise RuntimeError(
                        "The model you are trying to evaluate only sometimes produced a loss!"
                    )
                final_metrics["loss"] = total_loss / total_weight

            if output_file is not None:
                dump_metrics(output_file, final_metrics, log=True)

            return final_metrics
    finally:
        if predictions_file is not None:
            predictions_file.close()
def train_model(params: Params,
                serialization_dir: str,
                results_fn: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Tuple[Model, Dict[str, Any]]:
    """
    Train the model described by ``params`` and return it together with its metrics.

    The function prepares the serialization directory and logging, builds the vocabulary,
    model, iterators and trainer from ``params``, runs training, archives the result,
    reloads the ``best.th`` weights and optionally evaluates them on the test set.

    Parameters
    ----------
    params : ``Params``
        The experiment configuration.  Its sections are popped as they are consumed and
        it must be empty once the trainer has been built.
    serialization_dir : ``str``
        Directory that receives the config file, the vocabulary, the weights and the
        model archive.
    results_fn : ``str``
        File name, joined onto ``results_dir``, that the final metrics are dumped to.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.
    force : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    ``Tuple[Model, Dict[str, Any]]``
        The model loaded with the best epoch weights, and the metrics dictionary
        (including ``test_*`` entries when the test set was evaluated).

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # "cuda_device" may be a single id or a list of ids; every id is validated.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    # The validation and held-out iterators are optional config sections.
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the "no_grad" regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        held_out_train_data=held_out_train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator,
        held_out_iterator=held_out_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Log through the module logger, like every other message in this function,
            # instead of the root `logging` module.
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # `best_model` is the same object as `model`; the weights are loaded in place.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model,
            test_data,
            validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    # NOTE(review): `results_dir` is not defined inside this function; presumably it is a
    # module-level name - confirm it exists, otherwise this line raises NameError.
    dump_metrics(os.path.join(results_dir, results_fn), metrics, log=True)

    return best_model, metrics
    tempdir = tempfile.mkdtemp()
    with tarfile.open(resolved_archive_file, "r:gz") as archive:
        archive.extractall(tempdir)
    atexit.register(_cleanup_archive_dir, tempdir)
    serialization_dir = tempdir

# Read the experiment configuration from the serialization directory and rebuild the model.
config_path = os.path.join(serialization_dir, "config.json")
config = Params.from_file(config_path, "")
model = SemanticRoleLabeler.from_archive(args.archive_file)
archive = Archive(model=model, config=config)

prepare_environment(config)
model.eval()

# Use the validation dataset reader when one is configured; "dataset_reader" is only
# popped from the config when it is actually needed.
validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                              None)
if validation_dataset_reader_params is None:
    dataset_reader = DatasetReader.from_params(config.pop("dataset_reader"))
else:
    dataset_reader = DatasetReader.from_params(
        validation_dataset_reader_params)

instances = dataset_reader.read(args.evaluation_data_path)
instances.index_with(model.vocab)

# Same fallback for the data loader: validation settings first, then the training ones.
data_loader_params = config.pop("validation_data_loader", None)
if data_loader_params is None:
    data_loader_params = config.pop("data_loader")
data_loader = DataLoader.from_params(dataset=instances,
                                     params=data_loader_params)

# Evaluate with cuda device -1 and an empty batch weight key, then write the metrics out.
metrics = evaluate(model, data_loader, -1, "")
dump_metrics(args.output_file, metrics)
예제 #26
0
            params.to_file(serialize_config_file)
    dist.barrier()
    params = ConstParams.from_file(serialize_config_file)

    log_dir = os.path.join(serialization_dir, str(dist.get_rank()))
    os.makedirs(log_dir, exist_ok=True)
    stdout_handler = prepare_global_logging(log_dir,
                                            file_friendly_logging=False)
    prepare_environment(params)

    cuda_device = params.trainer.get('cuda_device', -1)
    check_for_gpu(cuda_device)

    trainer_type = params.trainer.type

    trainer = TrainerBase.from_params(params, serialization_dir, recover)
    params_cnt, params_trainable_cnt = count_parameters(trainer.model)
    print("all params cnt: ", params_cnt)
    print("all trainable params cnt: ", params_trainable_cnt)

    metrics = trainer.train()

    cleanup_global_logging(stdout_handler)

    if is_master_rank:
        archive_model(serialization_dir,
                      files_to_archive=params.files_to_archive)
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)
예제 #27
0
    def custom_train(self) -> Dict[str, Any]:
        """
        Run the training loop on the wrapped ``self.trainer`` and return the metrics.

        Training resumes from the epoch returned by the trainer's checkpoint restore.
        Each epoch calls ``self.semi_train_epoch``; validation runs only on the epochs
        selected by ``self.calc_valid_freq`` and feeds the metric tracker used for early
        stopping.  Per-epoch metrics are dumped to the serialization directory, the
        trainer checkpoint is saved and, when a constraints model is present, it is saved
        too and copied to ``best_dd_checkpoint.pth`` on a best epoch.  After the loop the
        best model state from the checkpointer, if any, is loaded into the model.

        Returns
        -------
        ``Dict[str, Any]``
            The metrics gathered during training (``training_*``, ``validation_*``,
            ``best_validation_*``, peak memory figures and bookkeeping entries).

        Raises
        ------
        ConfigurationError
            If restoring the checkpoint raises a ``RuntimeError``.
        """
        logger.info("GAN TRAINER HM START")
        try:
            first_epoch = self.trainer._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        # TODO - gradient clipping?
        training_util.enable_gradient_clipping(self.trainer.model,
                                               self.trainer._grad_clipping)
        logger.info("Beginning training.")

        # Validation results persist across epochs: on epochs where validation is skipped
        # the values from the last validated epoch are reused below.
        val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        epochs_trained = 0
        training_start_time = time.time()

        # Seed the result with whatever the metric tracker already knows (e.g. after a restore).
        metrics: Dict[str, Any] = {
            'best_epoch': self.trainer._metric_tracker.best_epoch
        }
        metrics.update({
            "best_validation_" + name: score
            for name, score in
            self.trainer._metric_tracker.best_epoch_metrics.items()
        })

        for epoch in range(first_epoch, self.trainer._num_epochs):
            epoch_start_time = time.time()
            train_metrics = self.semi_train_epoch(epoch)

            # Track the peak CPU / GPU memory usage seen in any epoch.
            if 'cpu_memory_MB' in train_metrics:
                metrics['peak_cpu_memory_MB'] = max(
                    metrics.get('peak_cpu_memory_MB', 0),
                    train_metrics['cpu_memory_MB'])
            for name, amount in train_metrics.items():
                if not name.startswith('gpu_'):
                    continue
                peak_name = "peak_" + name
                metrics[peak_name] = max(metrics.get(peak_name, 0), amount)

            # Validate only on every `calc_valid_freq`-th epoch counted from the start epoch.
            epochs_since_start = epoch - first_epoch
            validation_due = (
                self.trainer._validation_data is not None
                and epochs_since_start % self.calc_valid_freq
                == self.calc_valid_freq - 1)
            if validation_due:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self.trainer._validation_loss()
                    val_metrics = training_util.get_metrics(
                        self.trainer.model,
                        val_loss,
                        num_batches,
                        reset=True)

                    # Feed the tracked validation metric to the early-stopping logic.
                    this_epoch_val_metric = val_metrics[
                        self.trainer._validation_metric]
                    self.trainer._metric_tracker.add_metric(
                        this_epoch_val_metric)

                    if self.trainer._metric_tracker.should_stop_early():
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            self.trainer._tensorboard.log_metrics(train_metrics,
                                                  val_metrics=val_metrics,
                                                  log_to_console=True)

            # Assemble the overall metrics dictionary for this epoch.
            training_elapsed_time = time.time() - training_start_time
            metrics.update({
                "training_duration":
                time.strftime("%H:%M:%S", time.gmtime(training_elapsed_time)),
                "training_start_epoch": first_epoch,
                "training_epochs": epochs_trained,
                "epoch": epoch,
            })
            metrics.update({
                "training_" + name: score
                for name, score in train_metrics.items()
            })
            metrics.update({
                "validation_" + name: score
                for name, score in val_metrics.items()
            })

            is_best_so_far = bool(
                self.trainer._metric_tracker.is_best_so_far())
            if is_best_so_far:
                # Refresh the best_ entries; otherwise they keep their previous values.
                metrics['best_epoch'] = epoch
                metrics.update({
                    "best_validation_" + name: score
                    for name, score in val_metrics.items()
                })
                self.trainer._metric_tracker.best_epoch_metrics = val_metrics

            if self.trainer._serialization_dir:
                epoch_metrics_path = os.path.join(
                    self.trainer._serialization_dir,
                    f'metrics_epoch_{epoch}.json')
                dump_metrics(epoch_metrics_path, metrics)

            if self.trainer._learning_rate_scheduler:
                # The LRScheduler API is agnostic to whether your schedule requires a
                # validation metric - if it doesn't, the value passed here is ignored.
                self.trainer._learning_rate_scheduler.step(
                    this_epoch_val_metric, epoch)

            self.trainer._save_checkpoint(epoch)
            if self.constraints_model is not None:
                saved_path = self.save_constraints_model(epoch)
                if is_best_so_far:
                    best_copy_path = os.path.join(
                        self.trainer._serialization_dir,
                        'best_dd_checkpoint.pth')
                    shutil.copyfile(saved_path, best_copy_path)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info(
                "Epoch duration: %s",
                time.strftime("%H:%M:%S", time.gmtime(epoch_elapsed_time)))

            if epoch < self.trainer._num_epochs - 1:
                # Extrapolate the remaining time from the average time per finished epoch.
                training_elapsed_time = time.time() - training_start_time
                planned_epochs = self.trainer._num_epochs - first_epoch
                finished_epochs = epoch - first_epoch + 1
                estimated_time_remaining = training_elapsed_time * \
                    (planned_epochs / float(finished_epochs) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            # Put the model back into training mode before the next epoch.
            self.trainer.model.train()
            epochs_trained += 1

        # Load the best model state before returning
        best_model_state = self.trainer._checkpointer.best_model_state()
        if best_model_state:
            self.trainer.model.load_state_dict(best_model_state)

        return metrics
예제 #28
0
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    # A separate validation iterator is optional.
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the "no_grad" regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Log through the module logger, like every other message in this function,
            # instead of the root `logging` module.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # `best_model` is the same object as `model`; the weights are loaded in place.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
예제 #29
0
    def train(self, experiment: Optional[Experiment] = None) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.
        """
        try:
            epoch_counter = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        training_util.enable_gradient_clipping(self.model, self._grad_clipping)

        self.experiment = experiment

        logger.info("Beginning training.")

        self.val_metrics: Dict[str, float] = {}
        this_epoch_val_metric: float = None
        self.metrics: Dict[str, Any] = {}
        epochs_trained = 0
        training_start_time = time.time()

        self.metrics["best_epoch"] = self._metric_tracker.best_epoch
        for key, value in self._metric_tracker.best_epoch_metrics.items():
            self.metrics["best_validation_" + key] = value

        for callback in self._epoch_callbacks:
            callback(self, metrics={}, epoch=-1, is_master=self._master)

        for epoch in range(epoch_counter, self._num_epochs):
            self.epoch = epoch
            epoch_start_time = time.time()
            train_metrics = self._train_epoch(epoch)

            if experiment:
                with experiment.train():
                    experiment.log_metrics(
                        {
                            k: v
                            for k, v in train_metrics.items() if np.isscalar(v)
                        },
                        step=epoch)

            # get peak of memory usage
            for key, value in train_metrics.items():
                if key.startswith("gpu_") and key.endswith("_memory_MB"):
                    self.metrics["peak_" + key] = max(
                        self.metrics.get("peak_" + key, 0), value)
                elif key.startswith("worker_") and key.endswith("_memory_MB"):
                    self.metrics["peak_" + key] = max(
                        self.metrics.get("peak_" + key, 0), value)

            if self._validation_data_loader is not None and epoch >= self.epochs_before_validate:
                with torch.no_grad():
                    try:
                        if self.external_callbacks:
                            self.external_callbacks.call_if_registered(
                                CallbackName.BEFORE_VALIDATION,
                                annotator=self.annotator,
                                model=self.model,
                                trainer=self,
                                experiment=experiment)

                        # We have a validation set, so compute all the metrics on it.
                        val_loss, val_reg_loss, num_batches, preds = self._validation_loss(
                            epoch)

                        # It is safe again to wait till the validation is done. This is
                        # important to get the metrics right.
                        if self._distributed:
                            dist.barrier()

                        self.val_metrics = training_util.get_metrics(
                            self.model,
                            val_loss,
                            val_reg_loss,
                            num_batches,
                            reset=True,
                            world_size=self._world_size,
                            cuda_device=self.cuda_device,
                        )

                        if self.dataset_writer:
                            if self.decoder:
                                preds = self.decoder.decode_batch(
                                    self.model.vocab, preds)
                            filename = self._serialization_dir + f"/pred_epoch_{epoch}.txt"
                            with open(filename, "w") as f:
                                self.dataset_writer.write_to_file(
                                    self.model.vocab,
                                    OrderedDatasetReader.restore_order(preds),
                                    f)

                            if self.validation_command:
                                self.val_metrics.update(
                                    self.validation_command.evaluate(filename))

                        if self.external_callbacks:
                            self.external_callbacks.call_if_registered(
                                CallbackName.AFTER_VALIDATION,
                                annotator=self.annotator,
                                model=self.model,
                                trainer=self,
                                experiment=experiment)

                        # Check validation metric for early stopping
                        this_epoch_val_metric = self.val_metrics[
                            self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.should_stop_early():
                            logger.info(
                                "Ran out of patience.  Stopping training.")
                            break

                    except Exception as ex:
                        print("An exception occured:")
                        print(ex)
                        self._checkpointer.save_checkpoint("validation-failed",
                                                           trainer=self)
                        raise

            if self._master:
                self._tensorboard.log_metrics(
                    train_metrics,
                    val_metrics=self.val_metrics,
                    log_to_console=True,
                    epoch=epoch + 1)  # +1 because tensorboard doesn't like 0

            # Create overall metrics dict
            training_elapsed_time = time.time() - training_start_time
            self.metrics["training_duration"] = str(
                datetime.timedelta(seconds=training_elapsed_time))
            self.metrics["training_start_epoch"] = epoch_counter
            self.metrics["training_epochs"] = epochs_trained
            self.metrics["epoch"] = epoch

            for key, value in train_metrics.items():
                self.metrics["training_" + key] = value
            for key, value in self.val_metrics.items():
                self.metrics["validation_" + key] = value

            if experiment:
                with experiment.validate():
                    experiment.log_metrics(
                        {
                            k: v
                            for k, v in self.metrics.items() if np.isscalar(v)
                        },
                        step=epoch)

            if self._metric_tracker.is_best_so_far():
                # Update all the best_ metrics.
                # (Otherwise they just stay the same as they were.)
                self.metrics["best_epoch"] = epoch
                for key, value in self.val_metrics.items():
                    self.metrics["best_validation_" + key] = value

                self._metric_tracker.best_epoch_metrics = self.val_metrics

            if self._serialization_dir and self._master:
                common_util.dump_metrics(
                    os.path.join(self._serialization_dir,
                                 f"metrics_epoch_{epoch}.json"), self.metrics)

            # The Scheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step(this_epoch_val_metric)
            if self._momentum_scheduler:
                self._momentum_scheduler.step(this_epoch_val_metric)

            if self._master:
                self._checkpointer.save_checkpoint(
                    epoch,
                    self,
                    is_best_so_far=self._metric_tracker.is_best_so_far())

            # Wait for the master to finish saving the checkpoint
            if self._distributed:
                dist.barrier()

            for callback in self._epoch_callbacks:
                callback(self,
                         metrics=self.metrics,
                         epoch=epoch,
                         is_master=self._master)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s",
                        datetime.timedelta(seconds=epoch_elapsed_time))

            if epoch < self._num_epochs - 1:
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * (
                    (self._num_epochs - epoch_counter) /
                    float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            epochs_trained += 1

        # make sure pending events are flushed to disk and files are closed properly
        self._tensorboard.close()

        # Load the best model state before returning
        best_model_state = self._checkpointer.best_model_state()
        if best_model_state:
            self.model.load_state_dict(best_model_state)

        if self.external_callbacks:
            self.external_callbacks.call_if_registered(
                CallbackName.AFTER_TRAINING,
                annotator=self.annotator,
                model=self.model,
                trainer=self,
                experiment=experiment)

        return self.metrics
예제 #30
0
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """Evaluate an archived model on ``args.input_file`` and dump the metrics.

    The model and its configuration are loaded from ``args.archive_file``. The
    evaluation data is read with the ``validation_dataset_reader`` from the
    config when present, otherwise with ``dataset_reader``; likewise the
    ``validation_data_loader`` params are preferred over ``data_loader``.

    If the model exposes a ``"tokens"`` token embedder with a truthy
    ``iter_norm`` attribute, ``iter_norm`` passes of
    ``get_iter_norm_mean_eval`` are run over the evaluation data first and the
    collected means are stored on the embedder's ``_matched_embedder`` before
    the actual evaluation. Models without such an embedder skip this step.

    # Parameters

    args : `argparse.Namespace`
        Reads ``archive_file``, ``weights_file``, ``cuda_device``,
        ``overrides``, ``input_file``, ``embedding_sources_mapping``,
        ``extend_vocab``, ``batch_size``, ``batch_weight_key`` and
        ``output_file``.

    # Returns

    `Dict[str, Any]`
        The metrics returned by ``evaluate``; they are also written to
        ``args.output_file`` via ``dump_metrics``.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in (
        "allennlp.common.params",
        "allennlp.nn.initializers",
        "transformers.modeling_utils",
        "transformers.tokenization_utils",
        "transformers.configuration_utils",
    ):
        logging.getLogger(noisy_logger).disabled = True
    logging.basicConfig(level=logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader", None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (
        json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {}
    )

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances, params=data_loader_params)

    # Iterative normalization is optional. Resolve the "tokens" embedder once and
    # tolerate models that have no text field embedder or no "tokens" entry, so
    # that such models can still be evaluated instead of crashing here.
    text_field_embedder = getattr(model, "text_field_embedder", None)
    token_embedders = getattr(text_field_embedder, "_token_embedders", None) or {}
    tokens_embedder = token_embedders["tokens"] if "tokens" in token_embedders else None
    iter_num = getattr(tokens_embedder, "iter_norm", None)

    if iter_num:
        # Obtain evaluation info for iterative normalization: each pass receives
        # the means collected by the previous passes.
        iter_mean_eval = []
        for iter_norm_i in range(iter_num):
            logger.info(
                "This is the %s time during iterative normalization for evaluation", iter_norm_i
            )
            mean, embeddings = get_iter_norm_mean_eval(
                model, data_loader, iter_mean_eval, args.cuda_device
            )
            # NOTE(review): str.format is kept on purpose - if degree_anisotropy
            # returns a tensor, "{}" and "%s" may render it differently.
            logger.info(
                "The degree of isotropy of vectors is {} ".format(
                    degree_anisotropy(embeddings.t(), args.cuda_device)
                )
            )
            iter_mean_eval.append(mean)

        # Hand the collected means to the embedder and clear `iter_norm` before
        # the real evaluation pass.
        tokens_embedder.iter_norm = None
        matched_embedder = tokens_embedder._matched_embedder
        matched_embedder.mean_emb_eval = iter_mean_eval
        matched_embedder.is_train = False

    metrics = evaluate(model, data_loader, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
예제 #31
0
                       prob_diff_weight=args.prob_diff_weight,
                       learning_rate=args.learning_rate,
                       num_updates=args.num_updates,
                       num_labels=class_model_args['num_classes'],
                       device=args.cuda)

    data = pd.read_csv(args.csv_path)
    sequences = data['sequences'].tolist()[:args.sample]
    labels = data['labels'].tolist()[:args.sample]
    maskers = [args.maskers.split(',')] * len(sequences)

    results_path = Path(
        args.results_path) / datetime.now().strftime('%Y%m%d_%H%M%S')
    results_path.mkdir(exist_ok=True, parents=True)
    path_to_results_file = results_path / 'results.csv'
    dump_metrics(results_path / 'args.json', args.__dict__)
    with open(path_to_results_file, 'w', newline='') as csv_write:
        fieldnames = list(AttackerOutput.__annotations__.keys())
        writer = csv.DictWriter(csv_write, fieldnames=fieldnames)
        writer.writeheader()
        for seq, lab, mask_tokens in tqdm(zip(sequences, labels, maskers)):

            attacker.set_label_to_attack(lab)
            attacker.set_input(sequence=seq, mask_tokens=mask_tokens)

            output = attacker.sample_until_label_is_changed(
                max_steps=args.max_steps,
                early_stopping=args.early_stopping).__dict__

            attacker.empty_history()