Code example #1
 class Config(ConfigBase):
     data: Data.Config = Data.Config()
     model: Model.Config
     trainer: NewTaskTrainer.Config = NewTaskTrainer.Config()
     optimizer: Optimizer.Config = Adam.Config()
     scheduler: Scheduler.Config = Scheduler.Config()
     exporter: Optional[ModelExporter.Config] = None
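
This Config illustrates PyText's declarative config tree: each component contributes its own nested Config, fields with defaults are optional, and `model` (which has no default) must always be supplied. A standalone sketch of the same pattern using plain dataclasses (an illustration, not PyText's actual ConfigBase):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class OptimizerConfig:
    lr: float = 0.001

@dataclass
class TaskConfig:
    model_name: str  # required: no default, like `model` above
    optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
    exporter: Optional[str] = None  # optional component, like `exporter` above

config = TaskConfig(model_name="doc_classifier")
print(config.optimizer.lr)  # 0.001, taken from the nested default
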
Code example #2
 def from_config(cls,
                 config: Config,
                 unused_metadata=None,
                 unused_model_state=None):
     tensorizers = {
         name: create_component(ComponentType.TENSORIZER, tensorizer)
         for name, tensorizer in config.model.inputs._asdict().items()
     }
     # This initializes the tensorizers
     data = create_component(ComponentType.DATA_HANDLER, config.data,
                             cls.DATA_SCHEMA, tensorizers)
     # Initialized tensorizers can be used to create the model
     model = create_component(ComponentType.MODEL2, config.model,
                              tensorizers)
     # This is the only place right now that the task actually cares about which
     # features and tensors are being used. This is a strong tie between
     # the implementation of the model and the metric reporter.
     metric_reporter = cls.create_metric_reporter(config, tensorizers)
     trainer = create_component(ComponentType.TRAINER, config.trainer)
     optimizer = create_component(ComponentType.OPTIMIZER, config.optimizer,
                                  model)
     scheduler = Scheduler(optimizer, config.scheduler,
                           metric_reporter.lower_is_better)
     if config.exporter:
         exporter = create_component(ComponentType.EXPORTER,
                                     config.exporter)
     else:
         exporter = None
     return cls(data, model, metric_reporter, trainer, optimizer, scheduler,
                exporter)
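
Every component above is built through `create_component`, which dispatches on the component type and the config's class, so a component can be swapped by changing only its config. A minimal, runnable sketch of that registry pattern (an illustration, not PyText's actual implementation):

from enum import Enum

class ComponentType(Enum):
    TRAINER = "trainer"

_REGISTRY = {}  # (component type, config class) -> component class

def register(component_type, config_class):
    def decorator(cls):
        _REGISTRY[(component_type, config_class)] = cls
        return cls
    return decorator

def create_component(component_type, config, *args):
    cls = _REGISTRY[(component_type, type(config))]
    return cls.from_config(config, *args)

class TrainerConfig:
    epochs = 10

@register(ComponentType.TRAINER, TrainerConfig)
class Trainer:
    def __init__(self, epochs):
        self.epochs = epochs

    @classmethod
    def from_config(cls, config):
        return cls(config.epochs)

trainer = create_component(ComponentType.TRAINER, TrainerConfig())
print(trainer.epochs)  # 10
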
Code example #3
File: trainer.py Project: clercrobin/pytext
    def __init__(self, config: Config, model: torch.nn.Module):
        if config.early_stop_after > 0:
            assert config.do_eval, "can't do early stopping when not running evaluation"

        if precision.FP16_ENABLED:
            self.optimizer: torch.optim.Optimizer = create_optimizer(
                config.fp16_args,
                model,
                config.optimizer,
                config.num_accumulated_batches,
            )
        else:
            self.optimizer: torch.optim.Optimizer = create_optimizer(
                config.optimizer, model
            )

        self.scheduler: torch.optim.lr_scheduler = (
            create_scheduler(config.scheduler, self.optimizer)
            if config.scheduler
            else Scheduler()
        )
        self.sparsifier: Sparsifier = (
            create_sparsifier(config.sparsifier) if config.sparsifier else Sparsifier()
        )
        self.config = config
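
The FP16 branch exists because mixed-precision training needs loss scaling and an optimizer wrapper, which `create_optimizer` hides behind `config.fp16_args`. For reference, a minimal mixed-precision step in plain PyTorch with `torch.cuda.amp` (an assumption for illustration only; this is not PyText's fp16 path, and the snippet needs a CUDA device):

import torch

model = torch.nn.Linear(8, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

inputs = torch.randn(4, 8, device="cuda")
targets = torch.randint(0, 2, (4,), device="cuda")

optimizer.zero_grad()
with torch.cuda.amp.autocast():
    loss = torch.nn.functional.cross_entropy(model(inputs), targets)
scaler.scale(loss).backward()  # scale the loss so fp16 grads don't underflow
scaler.step(optimizer)         # unscales grads; skips the step on inf/NaN
scaler.update()
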
Code example #4
 def __init__(self, config: Config, model: torch.nn.Module):
     optimizer: torch.optim.Optimizer = create_optimizer(
         config.optimizer, model)
     self.scheduler: torch.optim.lr_scheduler = (create_scheduler(
         config.scheduler, optimizer) if config.scheduler else Scheduler())
     model, self.optimizer = precision.initialize(model, optimizer)
     self.config = config
Code example #5
    def from_config(cls, task_config, metadata=None, model_state=None):
        """
        Create the task from config, and optionally load metadata/model_state
        This function will create components including :class:`~DataHandler`,
        :class:`~Trainer`, :class:`~Optimizer`, :class:`~Scheduler`,
        :class:`~MetricReporter`, :class:`~Exporter`, and wire them up.

        Args:
            task_config (Task.Config): the config of the current task
            metadata: saved global context of this task, e.g. the vocabulary;
                will be generated by :class:`~DataHandler` if it is None
            model_state: saved model parameters; will be loaded into the model
                when given
        """
        print("Task parameters:\n")
        pprint(config_to_json(type(task_config), task_config))
        featurizer = create_featurizer(task_config.featurizer, task_config.features)
        # load data
        data_handler = create_data_handler(
            task_config.data_handler,
            task_config.features,
            task_config.labels,
            featurizer=featurizer,
        )
        print("\nLoading data...")
        if metadata:
            data_handler.load_metadata(metadata)
        else:
            data_handler.init_metadata()

        metadata = data_handler.metadata

        model = create_model(task_config.model, task_config.features, metadata)
        if model_state:
            model.load_state_dict(model_state)
        if cuda_utils.CUDA_ENABLED:
            model = model.cuda()
        metric_reporter = create_metric_reporter(task_config.metric_reporter, metadata)
        optimizer = create_optimizer(task_config.optimizer, model)
        exporter = (
            create_exporter(
                task_config.exporter,
                task_config.features,
                task_config.labels,
                data_handler.metadata,
                task_config.model,
            )
            if task_config.exporter
            else None
        )
        return cls(
            trainer=create_trainer(task_config.trainer),
            data_handler=data_handler,
            model=model,
            metric_reporter=metric_reporter,
            optimizer=optimizer,
            lr_scheduler=Scheduler(
                optimizer, task_config.scheduler, metric_reporter.lower_is_better
            ),
            exporter=exporter,
        )
Code example #6
    def __init__(self, config: Config, model: torch.nn.Module):
        self.optimizer: torch.optim.Optimizer = create_optimizer(
            config.optimizer, model)
        self.scheduler: torch.optim.lr_scheduler = (
            create_scheduler(config.scheduler, self.optimizer)
            if config.scheduler
            else Scheduler()
        )

        self.config = config
Code example #7
 class Config(ConfigBase):
     features: FeatureConfig = FeatureConfig()
     featurizer: Featurizer.Config = SimpleFeaturizer.Config()
     data_handler: DataHandler.Config
     trainer: Trainer.Config = Trainer.Config()
     optimizer: Optimizer.Config = Adam.Config()
     scheduler: Optional[Scheduler.Config] = Scheduler.Config()
     exporter: Optional[ModelExporter.Config] = None
Code example #8
    def from_config(cls, task_config, metadata=None, model_state=None):
        print("Task parameters:\n")
        pprint(config_to_json(type(task_config), task_config))

        data_handlers = OrderedDict()
        exporters = OrderedDict()
        for name, task in task_config.tasks.items():
            featurizer = create_featurizer(task.featurizer, task.features)
            data_handlers[name] = create_data_handler(task.data_handler,
                                                      task.features,
                                                      task.labels,
                                                      featurizer=featurizer)
        data_handler = DisjointMultitaskDataHandler(task_config.data_handler,
                                                    data_handlers)
        print("\nLoading data...")
        if metadata:
            data_handler.load_metadata(metadata)
        else:
            data_handler.init_metadata()
        metadata = data_handler.metadata
        exporters = {
            name: (create_exporter(
                task.exporter,
                task.features,
                task.labels,
                data_handler.data_handlers[name].metadata,
                task.model,
            ) if task.exporter else None)
            for name, task in task_config.tasks.items()
        }
        metric_reporter = DisjointMultitaskMetricReporter(
            OrderedDict(
                (name,
                 create_metric_reporter(task.metric_reporter, metadata[name]))
                for name, task in task_config.tasks.items()),
            target_task_name=task_config.metric_reporter.target_task_name,
        )

        model = DisjointMultitaskModel(
            OrderedDict(
                (name, create_model(task.model, task.features, metadata[name]))
                for name, task in task_config.tasks.items()))
        if model_state:
            model.load_state_dict(model_state)
        if cuda_utils.CUDA_ENABLED:
            model = model.cuda()

        optimizers = create_optimizer(model, task_config.optimizer)
        return cls(
            exporters=exporters,
            trainer=create_trainer(task_config.trainer),
            data_handler=data_handler,
            model=model,
            metric_reporter=metric_reporter,
            optimizers=optimizers,
            lr_scheduler=Scheduler(optimizers, task_config.scheduler,
                                   metric_reporter.lower_is_better),
        )
Code example #9
        def test_load_checkpoint(self):
            with tempfile.NamedTemporaryFile() as checkpoint_file:
                train_data = tests_module.test_file("train_data_tiny.tsv")
                eval_data = tests_module.test_file("test_data_tiny.tsv")
                config = PyTextConfig(
                    task=DocumentClassificationTask.Config(data=Data.Config(
                        source=TSVDataSource.Config(
                            train_filename=train_data,
                            eval_filename=eval_data,
                            field_names=["label", "slots", "text"],
                        ))),
                    version=LATEST_VERSION,
                    save_snapshot_path=checkpoint_file.name,
                )
                task = create_task(config.task)
                model = task.model
                # test checkpoint saving and loading
                optimizer = create_optimizer(Adam.Config(), model)
                scheduler = create_scheduler(Scheduler.Config(), optimizer)
                training_state = TrainingState(
                    model=model,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    start_time=0,
                    epoch=0,
                    rank=0,
                    stage=Stage.TRAIN,
                    epochs_since_last_improvement=0,
                    best_model_state=None,
                    best_model_metric=None,
                    tensorizers=None,
                )

                checkpoint_path = checkpoint_file.name
                save(
                    config,
                    model,
                    None,
                    task.data.tensorizers,
                    training_state,
                    checkpoint_file,
                )
                task_restored, config_restored, training_state_restored = load(
                    checkpoint_path)
                optimizer_restored = training_state_restored.optimizer
                scheduler_restored = training_state_restored.scheduler
                self.assertOptimizerEqual(optimizer, optimizer_restored)
                self.assertIsNotNone(scheduler_restored)
                self.assertEqual(config, config_restored)
                self.assertModulesEqual(model, task_restored.model)
                model.eval()
                task_restored.model.eval()

                inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
                self.assertEqual(
                    model(*inputs).tolist(),
                    task_restored.model(*inputs).tolist())
Code example #10
 def __init__(self, config: Config, model: torch.nn.Module):
     if config.early_stop_after > 0:
         assert config.do_eval, "can't do early stopping when not running evaluation"
     optimizer: torch.optim.Optimizer = create_optimizer(
         config.optimizer, model)
     self.scheduler: torch.optim.lr_scheduler = (create_scheduler(
         config.scheduler, optimizer) if config.scheduler else Scheduler())
     self.sparsifier: Sparsifier = (create_sparsifier(config.sparsifier)
                                    if config.sparsifier else Sparsifier())
     model, self.optimizer = precision.initialize(model, optimizer)
     self.config = config
Code example #11
    def test_load_checkpoint_in_dist_training(self):
        with tempfile.NamedTemporaryFile() as checkpoint_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(data=Data.Config(
                    source=BlockShardedTSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    ))),
                version=LATEST_VERSION,
                save_snapshot_path=checkpoint_file.name,
            )
            task = create_task(config.task)
            model = task.model
            # test checkpoint saving and loading
            optimizer = create_optimizer(Adam.Config(), model)
            scheduler = create_scheduler(Scheduler.Config(), optimizer)
            training_state = TrainingState(
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                start_time=0,
                epoch=0,
                rank=0,
                stage=Stage.TRAIN,
                epochs_since_last_improvement=0,
                best_model_state=None,
                best_model_metric=None,
                tensorizers=task.data.tensorizers,
            )

            id = "epoch-1"
            saved_path = save(config, model, None, task.data.tensorizers,
                              training_state, id)
            new_rank = 2
            new_world_size = 4
            task_restored, config_restored, training_state_restored = load(
                saved_path, rank=new_rank, world_size=new_world_size)
            self.assertCheckpointEqual(
                model,
                config,
                training_state,
                task_restored.model,
                config_restored,
                training_state_restored,
            )
            self.assertEqual(task_restored.data.data_source.rank, new_rank)
            self.assertEqual(task_restored.data.data_source.world_size,
                             new_world_size)
Code example #12
File: trainer.py Project: wwjiang007/pytext
    def __init__(self, config: Config, model: torch.nn.Module):
        if config.early_stop_after > 0:
            assert config.do_eval, "can't do early stopping when not running evaluation"

        if (config.discriminative_lr is not None
                or config.freeze_params_pattern is not None):
            optimizer_grouped_parameters = []
            optimizer_parameters_covered = []
            if config.freeze_params_pattern is not None:
                tmp_param = {
                    n: p
                    for n, p in model.named_parameters()
                    if any(nd in n for nd in config.freeze_params_pattern)
                }
                if len(tmp_param) > 0:
                    optimizer_parameters_covered.extend(list(tmp_param.keys()))
                    optimizer_grouped_parameters.append(
                        {"params": list(tmp_param.values()), "lr": 0.0}
                    )
            if config.discriminative_lr is not None:
                assert (
                    config.discriminative_lr_params_pattern
                    is not None), "Missing discriminative_lr_params_pattern"
                tmp_param = {
                    n: p
                    for n, p in model.named_parameters()
                    if any(nd in n
                           for nd in config.discriminative_lr_params_pattern)
                    and n not in optimizer_parameters_covered
                }
                if len(tmp_param) > 0:
                    optimizer_parameters_covered.extend(list(tmp_param.keys()))
                    optimizer_grouped_parameters.append(
                        {"params": list(tmp_param.values()),
                         "lr": config.discriminative_lr}
                    )
            optimizer_grouped_parameters.append({
                "params": [
                    p for n, p in model.named_parameters()
                    if n not in optimizer_parameters_covered
                ]
            })
            if precision.FP16_ENABLED:
                self.optimizer: torch.optim.Optimizer = create_optimizer(
                    config.fp16_args,
                    model,
                    config.optimizer,
                    config.num_accumulated_batches,
                    optimizer_grouped_parameters,
                )
            else:
                self.optimizer: torch.optim.Optimizer = create_optimizer(
                    config.optimizer, model, optimizer_grouped_parameters)
        else:
            if precision.FP16_ENABLED:
                self.optimizer: torch.optim.Optimizer = create_optimizer(
                    config.fp16_args,
                    model,
                    config.optimizer,
                    config.num_accumulated_batches,
                )
            else:
                self.optimizer: torch.optim.Optimizer = create_optimizer(
                    config.optimizer, model)

        self.scheduler: torch.optim.lr_scheduler = (
            create_scheduler(config.scheduler, self.optimizer)
            if config.scheduler
            else Scheduler()
        )
        self.sparsifier: Sparsifier = (create_sparsifier(config.sparsifier)
                                       if config.sparsifier else Sparsifier())
        self.config = config
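
The grouped-parameters logic above reduces to torch's per-group optimizer options: frozen parameters get lr=0.0, discriminative parameters get their own lr, and the remainder inherits the optimizer default. A runnable standalone sketch (the toy model and name patterns are assumptions):

import torch

model = torch.nn.Sequential(
    torch.nn.Linear(4, 4),  # parameters named "0.*" -> frozen
    torch.nn.Linear(4, 2),  # parameters named "1.*" -> discriminative lr
)
frozen = [p for n, p in model.named_parameters() if n.startswith("0.")]
slow = [p for n, p in model.named_parameters() if n.startswith("1.")]
optimizer = torch.optim.Adam(
    [{"params": frozen, "lr": 0.0}, {"params": slow, "lr": 1e-5}],
    lr=1e-3,  # default for any group without an explicit "lr"
)
print([g["lr"] for g in optimizer.param_groups])  # [0.0, 1e-05]
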
Code example #13
File: trainer.py Project: LinHR000/pytext
    def train(
        self,
        train_iter: BatchIterator,
        eval_iter: BatchIterator,
        model: Model,
        metric_reporter: MetricReporter,
        train_config: PyTextConfig,
        optimizer: torch.optim.Optimizer,
        scheduler: Scheduler = None,
        rank: int = 0,
    ) -> Tuple[torch.nn.Module, Any]:
        """
        Train and evaluate a model; the model's state will be modified. This
        function iterates over the number of epochs specified in the config,
        and for each epoch:

            1. Train model using training data, aggregate and report training results
            2. Adjust learning rate if scheduler is specified
            3. Evaluate model using evaluation data
            4. Calculate metrics based on evaluation results and select best model

        Args:
            train_iter (BatchIterator): batch iterator of training data
            eval_iter (BatchIterator): batch iterator of evaluation data
            model (Model): model to be trained
            metric_reporter (MetricReporter): computes metrics from the training
                output and reports results to the console, a file, etc.
            train_config (PyTextConfig): training config
            optimizer (torch.optim.Optimizer): torch optimizer to be used
            scheduler (Scheduler): learning rate scheduler, default is None
            rank (int): only used in distributed training, the rank of the current
                training thread, evaluation will only be done in rank 0

        Returns:
            model, best_metric: the trained model together with the best metric
        """
        with time_utils.time("pre-training"):
            world_size = 1
            if cuda_utils.CUDA_ENABLED:
                model = model.cuda()
                world_size = cuda_utils.DISTRIBUTED_WORLD_SIZE
                if world_size > 1:
                    device_id = torch.cuda.current_device()
                    model = DistributedModel(
                        module=model,
                        device_ids=[device_id],
                        output_device=device_id,
                        broadcast_buffers=False,
                    )

            best_metric = None
            last_best_epoch = 0
            if scheduler:
                scheduler.prepare(train_iter, self.config.epochs)
            optimizer = precision_utils.wrap_optimizer(optimizer)

        def training_pre_batch_callback():
            if world_size > 1:
                # replace optimizer.zero_grad() here to work with DDP
                # in cases where some parameters don't receive grads at each step
                # loss.backward will set grad for params in the computation graph
                # we can thus follow which params are left out and call .backward
                # on them manually
                for p in model.parameters():
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad = None
            else:
                optimizer.zero_grad()

        def training_backprop(loss):
            with time_utils.time("loss.backward"):
                precision_utils.backward(optimizer, loss)
                if world_size > 1:
                    # DDP fix when some parameters don't receive grads
                    for p in model.parameters():
                        if p.requires_grad and p.grad is None:
                            p.backward(torch.zeros_like(p.data))

            if scheduler:
                scheduler.step_batch()

            if self.config.max_clip_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.config.max_clip_norm)
            else:
                grad_norm = None

            with time_utils.time("optimizer.step"):
                optimizer.step()
            # grad_norm could be used to check grads sync in distributed training
            return grad_norm

        time_start = time.time()
        best_model_state = None
        for epoch in range(1, self.config.epochs + 1):
            sys.stdout.flush()
            if self.config.target_time_limit_seconds > 0 and epoch > 1:
                time_elapsed = time.time() - time_start
                mean_epoch_time = time_elapsed / float(epoch - 1)
                expected_next_epoch_time = time_elapsed + mean_epoch_time
                if expected_next_epoch_time > self.config.target_time_limit_seconds:
                    print(
                        f"Training stopped after {epoch - 1} epochs and "
                        f"{int(time_elapsed)} seconds, due to the target max training "
                        f"time of {self.config.target_time_limit_seconds} seconds."
                    )
                    break

            print(f"Rank {rank} worker: Starting epoch #{epoch}")
            model.train()
            lrs = (str(lr) for lr in learning_rates(optimizer))
            print(f"Learning rate(s): {', '.join(lrs)}")

            with time_utils.time("epoch train"):
                self._run_epoch(
                    Stage.TRAIN,
                    epoch,
                    train_iter,
                    model,
                    metric_reporter,
                    pre_batch=training_pre_batch_callback,
                    backprop=training_backprop,
                    rank=rank,
                    num_samples_to_log_progress=self.config.num_samples_to_log_progress,
                )

            if not self.config.do_eval:
                continue

            with time_utils.time("epoch eval"):
                model.eval(Stage.EVAL)
                with torch.no_grad():
                    eval_metric = self._run_epoch(
                        Stage.EVAL,
                        epoch,
                        eval_iter,
                        model,
                        metric_reporter,
                        rank=rank,
                        num_samples_to_log_progress=(
                            self.config.num_samples_to_log_progress),
                    )

            # Step the learning rate scheduler(s)
            if scheduler:
                assert eval_metric is not None
                scheduler.step_epoch(
                    metrics=metric_reporter.get_model_select_metric(
                        eval_metric),
                    epoch=epoch,
                )

            # choose best model.
            if metric_reporter.compare_metric(eval_metric, best_metric):
                with time_utils.time("save checkpoint model"):
                    last_best_epoch = epoch
                    best_metric = eval_metric
                    # Only rank = 0 trainer saves modules.
                    if train_config.save_module_checkpoints and rank == 0:
                        model.save_modules(
                            base_path=train_config.modules_save_dir,
                            suffix=f"-ep{epoch}",
                        )

                    if rank == 0:
                        print(f"Rank {rank} worker: Found a better model!")
                        model_state = model.state_dict()
                        # save to cpu to avoid multiple model copies in gpu memory
                        if cuda_utils.CUDA_ENABLED:
                            for key, state in model_state.items():
                                model_state[key] = state.cpu()
                        best_model_state = model_state

            if self.config.early_stop_after > 0 and (
                    epoch - last_best_epoch == self.config.early_stop_after):
                print(f"Rank {rank} worker: Eval metric hasn't changed for " +
                      f"{self.config.early_stop_after} epochs. Stopping now.")
                break
            sys.stdout.flush()

        if rank == 0 and best_model_state is not None:
            if cuda_utils.CUDA_ENABLED:
                for key, state in best_model_state.items():
                    best_model_state[key] = state.cuda()
            model.load_state_dict(best_model_state)

        return model, best_metric
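
Buried in the distributed-training details above is a compact model-selection and early-stopping scheme: remember the epoch of the best eval metric, checkpoint on improvement, and stop once no improvement has been seen for early_stop_after epochs. A runnable standalone skeleton of just that bookkeeping (the metric values are stand-ins, with higher treated as better):

best_metric, last_best_epoch = None, 0
early_stop_after = 2

def is_better(new, best):
    return best is None or new > best

for epoch, eval_metric in enumerate([0.71, 0.74, 0.73, 0.72, 0.70], start=1):
    if is_better(eval_metric, best_metric):
        best_metric, last_best_epoch = eval_metric, epoch  # checkpoint here
    if early_stop_after > 0 and epoch - last_best_epoch == early_stop_after:
        print(f"No improvement for {early_stop_after} epochs; stopping at epoch {epoch}.")
        break

print(best_metric)  # 0.74, found at epoch 2
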