Example #1
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.hparams = context.get_hparams()
        self.criterion = torch.nn.functional.cross_entropy
        # The last epoch is only used for logging.
        self._last_epoch = -1
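Example #1 (and #4 below) reads whole config sections at once: get_hparams() returns the hyperparameters: section of the experiment config as a plain dict, and get_data_config() returns the data: section, while get_hparam(name) in Example #2 fetches a single entry. A purely illustrative sketch of the relationship (all values hypothetical):

    # Hypothetical values standing in for the experiment-config sections.
    hparams = {"num_classes": 10, "label_smoothing_rate": 0.1}  # context.get_hparams()
    data_config = {"data_download_dir": "/tmp/data"}            # context.get_data_config()

    # context.get_hparam("num_classes") is equivalent to:
    num_classes = hparams["num_classes"]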
Example #2
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.criterion = CrossEntropyLabelSmooth(
            context.get_hparam("num_classes"),
            context.get_hparam("label_smoothing_rate"),
        )
        self.last_epoch_idx = -1
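CrossEntropyLabelSmooth is defined elsewhere in that repository; a minimal sketch of the standard label-smoothing loss it presumably implements (the original class may differ in detail):

    import torch
    import torch.nn as nn

    class CrossEntropyLabelSmooth(nn.Module):
        def __init__(self, num_classes: int, epsilon: float) -> None:
            super().__init__()
            self.num_classes = num_classes
            self.epsilon = epsilon

        def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
            # targets: (N,) integer class indices.
            log_probs = torch.log_softmax(logits, dim=1)
            one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1.0)
            # Mix (1 - eps) of the one-hot target with a uniform eps / num_classes.
            smoothed = (1.0 - self.epsilon) * one_hot + self.epsilon / self.num_classes
            return (-smoothed * log_probs).sum(dim=1).mean()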
Example #3
    def from_trial(
        trial_inst: det.Trial,
        context: det.TrialContext,
        env: det.EnvContext,
        workloads: workload.Stream,
        load_path: Optional[pathlib.Path],
        rendezvous_info: det.RendezvousInfo,
        hvd_config: horovod.HorovodContext,
    ) -> det.TrialController:
        check.is_instance(
            context, keras.TFKerasTrialContext,
            "TFKerasTrialController needs a TFKerasTrialContext")
        context = cast(keras.TFKerasTrialContext, context)

        check.is_instance(trial_inst, TFKerasTrial,
                          "TFKerasTrialController needs a TFKerasTrial")
        trial = cast(TFKerasTrial, trial_inst)

        session = TFKerasTrialController._configure_session(
            env, hvd_config, trial.session_config())

        training_data = keras._adapt_data_from_data_loader(
            input_data=trial.build_training_data_loader(),
            batch_size=context.get_per_slot_batch_size(),
        )

        validation_data = keras._adapt_data_from_data_loader(
            input_data=trial.build_validation_data_loader(),
            batch_size=context.get_per_slot_batch_size(),
        )

        trial.build_model()
        check.is_not_none(context.model, "Please call wrap_model(...).")

        check.is_not_none(context.compile_args,
                          "Please call model.compile(...).")
        compile_args = cast(inspect.BoundArguments, context.compile_args)

        TFKerasTrialController.compile_model(context=context,
                                             compile_args=compile_args,
                                             env=env,
                                             hvd_config=hvd_config)

        tf_keras_callbacks = trial.keras_callbacks()

        return TFKerasTrialController(
            context.model,
            session,
            keras.TFKerasTrainConfig(training_data, validation_data,
                                     tf_keras_callbacks),
            context,
            env,
            workloads,
            load_path,
            rendezvous_info,
            hvd_config,
        )
Example #4
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.hparams = context.get_hparams()
        self.criterion = torch.nn.functional.cross_entropy
        # The last epoch is only used for logging.
        self._last_epoch = -1
        self.results = {
            "loss": float("inf"),
            "top1_accuracy": 0,
            "top5_accuracy": 0,
        }
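self.results seeds each metric with its worst possible value, so later evaluations can be folded in with plain min/max comparisons, e.g. (hypothetical helper, not from the source):

    def update_best(results: dict, loss: float, top1: float, top5: float) -> None:
        # Lower is better for loss; higher is better for the accuracies.
        results["loss"] = min(results["loss"], loss)
        results["top1_accuracy"] = max(results["top1_accuracy"], top1)
        results["top5_accuracy"] = max(results["top5_accuracy"], top5)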
Example #5
    def from_trial(
        cls: Type["TFKerasTrialController"],
        trial_inst: det.Trial,
        context: det.TrialContext,
        env: det.EnvContext,
        workloads: Optional[workload.Stream] = None,
    ) -> det.TrialController:
        check.is_instance(
            context, keras.TFKerasTrialContext, "TFKerasTrialController needs a TFKerasTrialContext"
        )
        context = cast(keras.TFKerasTrialContext, context)

        check.is_instance(trial_inst, TFKerasTrial, "TFKerasTrialController needs a TFKerasTrial")
        trial = cast(TFKerasTrial, trial_inst)

        # Keras supports only the Horovod backend for distributed training.
        session = cls._configure_session(
            env, trial.session_config(), use_horovod=context.distributed.size > 1
        )

        training_data = keras._adapt_data_from_data_loader(
            input_data=trial.build_training_data_loader(),
            batch_size=context.get_per_slot_batch_size(),
        )

        validation_data = keras._adapt_data_from_data_loader(
            input_data=trial.build_validation_data_loader(),
            batch_size=context.get_per_slot_batch_size(),
        )

        trial.build_model()
        check.is_not_none(context.model, "Please call wrap_model(...).")

        check.is_not_none(context.compile_args, "Please call model.compile(...).")
        compile_args = cast(inspect.BoundArguments, context.compile_args)

        cls.compile_model(context=context, compile_args=compile_args, env=env)

        tf_keras_callbacks = trial.keras_callbacks()

        return cls(
            context.model,
            session,
            keras.TFKerasTrainConfig(training_data, validation_data, tf_keras_callbacks),
            trial,
            context,
            env,
            workloads,
        )
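Both from_trial variants (Examples #3 and #5) assume the user's TFKerasTrial wraps its model with wrap_model(...) and calls model.compile(...) inside build_model(); the check.is_not_none calls enforce exactly that. A minimal trial satisfying those checks might look like the sketch below (dataset, layer sizes, and hparam names are placeholders, and the exact TFKerasTrial API varies across Determined versions):

    import tensorflow as tf
    from determined import keras as det_keras

    class MNISTTrial(det_keras.TFKerasTrial):
        def __init__(self, context: det_keras.TFKerasTrialContext) -> None:
            self.context = context

        def build_model(self) -> tf.keras.Model:
            model = tf.keras.Sequential([
                tf.keras.layers.Flatten(input_shape=(28, 28)),
                tf.keras.layers.Dense(self.context.get_hparam("hidden_size"), activation="relu"),
                tf.keras.layers.Dense(10),
            ])
            # wrap_model(...) sets context.model, which from_trial checks.
            model = self.context.wrap_model(model)
            # compile(...) records context.compile_args, the other check in from_trial.
            model.compile(
                optimizer=tf.keras.optimizers.SGD(self.context.get_hparam("lr")),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=["accuracy"],
            )
            return model

        def build_training_data_loader(self):
            (x, y), _ = tf.keras.datasets.mnist.load_data()
            return x / 255.0, y

        def build_validation_data_loader(self):
            _, (x, y) = tf.keras.datasets.mnist.load_data()
            return x / 255.0, y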
Example #6
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.hparams = AttrDict(context.get_hparams())

        # Create a unique download directory for each rank so they don't overwrite each other.
        self.download_directory = self.data_config["data_download_dir"]
        data.download_data(self.download_directory)
        corpus = data_util.Corpus(self.download_directory)
        self.corpus = corpus
        self.ntokens = len(corpus.dictionary)
        self.hidden = None

        # Eval history is stored here; training switches to ASGD
        # once validation perplexity stops improving.
        self._last_loss = None
        self._eval_history = []
        self._last_epoch = -1
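The _eval_history / _last_loss bookkeeping supports the AWD-LSTM non-monotonic trigger: training switches from SGD to ASGD once validation loss stops improving over a window. A standalone sketch of that check (the window size and names are hypothetical):

    def should_switch_to_asgd(eval_history: list, nonmono: int = 5) -> bool:
        # Trigger once the newest validation loss is no better than the best
        # loss seen before the most recent `nonmono` evaluations.
        if len(eval_history) <= nonmono:
            return False
        return eval_history[-1] > min(eval_history[:-nonmono])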
Example #7
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.hparams = context.get_hparams()
        self.data_config = context.get_data_config()
        self.cfg = Config.fromfile(self.hparams["config_file"])

        self.cfg.data.train.ann_file = self.data_config["train_ann_file"]
        self.cfg.data.val.ann_file = self.data_config["val_ann_file"]
        self.cfg.data.val.test_mode = True
        self.cfg.data.workers_per_gpu = self.data_config["workers_per_gpu"]

        if self.data_config["backend"] in ["gcs", "fake"]:
            sub_backend(self.data_config["backend"], self.cfg)

        print(self.cfg)

        self.model = self.context.wrap_model(
            build_detector(self.cfg.model,
                           train_cfg=self.cfg.train_cfg,
                           test_cfg=self.cfg.test_cfg))

        self.optimizer = self.context.wrap_optimizer(
            build_optimizer(self.model, self.cfg.optimizer))

        scheduler_cls = WarmupWrapper(MultiStepLR)
        scheduler = scheduler_cls(
            self.hparams["warmup"],  # warmup schedule
            self.hparams["warmup_iters"],  # warmup_iters
            self.hparams["warmup_ratio"],  # warmup_ratio
            self.optimizer,
            [self.hparams["step1"], self.hparams["step2"]],  # milestones
            self.hparams["gamma"],  # gamma
        )
        self.scheduler = self.context.wrap_lr_scheduler(
            scheduler, step_mode=LRScheduler.StepMode.MANUAL_STEP)

        # None disables clipping; otherwise clip gradients to the configured norm.
        # (The explicit grouping matters: without it, the conditional becomes part
        # of the lambda body and clip_grads_fn is never None.)
        self.clip_grads_fn = (
            (lambda x: torch.nn.utils.clip_grad_norm_(x, self.hparams["clip_grads_norm"]))
            if self.hparams["clip_grads"]
            else None
        )
Example #8
    def __init__(self, context: det.TrialContext) -> None:
        super().__init__()
        # Set hyperparameters that influence the model architecture.
        self.n_filters1 = context.get_hparam("n_filters1")
        self.n_filters2 = context.get_hparam("n_filters2")
        self.dropout = context.get_hparam("dropout")

        # Define the central model.
        self.model = nn.Sequential(
            nn.Conv2d(1, self.n_filters1, kernel_size=5),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(self.n_filters1, self.n_filters2, kernel_size=5),
            nn.MaxPool2d(2),
            nn.ReLU(),
            Flatten(),
            nn.Linear(16 * self.n_filters2, 50),
            nn.ReLU(),
            nn.Dropout2d(self.dropout),
        )  # type: nn.Sequential
        # Predict digit labels from self.model.
        self.digit = nn.Sequential(nn.Linear(50, 10), nn.Softmax(dim=1))
        # Predict binary labels from self.model.
        self.binary = nn.Sequential(nn.Linear(50, 1), nn.Sigmoid(), Squeeze())
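Flatten and Squeeze are small helper modules defined outside this snippet; plausible minimal definitions consistent with how they are used above (the originals may differ):

    import torch
    import torch.nn as nn

    class Flatten(nn.Module):
        # Collapse everything after the batch dimension into one vector.
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x.view(x.size(0), -1)

    class Squeeze(nn.Module):
        # Drop the trailing singleton dimension so the binary head emits shape (N,).
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x.squeeze(-1)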
Example #9
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.num_classes = {
            "train": context.get_hparam("num_classes_train"),
            "val": context.get_hparam("num_classes_val"),
        }
        self.num_support = {
            "train": context.get_hparam("num_support_train"),
            "val": context.get_hparam("num_support_val"),
        }
        self.num_query = {
            "train": context.get_hparam("num_query_train"),
            "val": None,  # Use all available examples for val at meta-test time.
        }
        self.get_train_valid_splits()
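These hyperparameters describe episodic few-shot training: each episode samples num_classes ways, with num_support support and num_query query examples per class. The arithmetic, as a small standalone helper (names hypothetical):

    def episode_size(num_classes: int, num_support: int, num_query: int) -> int:
        # One episode holds (support + query) examples for each sampled class.
        return num_classes * (num_support + num_query)

    # e.g. a 5-way, 5-shot episode with 15 query examples per class:
    assert episode_size(5, 5, 15) == 100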
Example #10
    def from_trial(
        trial_inst: det.Trial,
        context: det.TrialContext,
        env: det.EnvContext,
        workloads: workload.Stream,
        load_path: Optional[pathlib.Path],
        rendezvous_info: det.RendezvousInfo,
        hvd_config: horovod.HorovodContext,
    ) -> det.TrialController:
        check.is_instance(
            context,
            keras.TFKerasTrialContext,
            "TFKerasTrialController needs a TFKerasTrialContext",
        )
        context = cast(keras.TFKerasTrialContext, context)

        check.is_instance(trial_inst, TFKerasTrial,
                          "TFKerasTrialController needs a TFKerasTrial")
        trial = cast(TFKerasTrial, trial_inst)

        session = TFKerasTrialController._configure_session(
            env, hvd_config, trial.session_config())

        training_x, training_y, training_sample_weight = keras._get_x_y_and_sample_weight(
            input_data=trial.build_training_data_loader())
        training_data = keras._adapt_keras_data(
            x=training_x,
            y=training_y,
            sample_weight=training_sample_weight,
            batch_size=context.get_per_slot_batch_size(),
            drop_leftovers=True,
        )

        val_x, val_y, val_sample_weight = keras._get_x_y_and_sample_weight(
            input_data=trial.build_validation_data_loader())
        validation_data = keras._adapt_keras_data(
            x=val_x,
            y=val_y,
            sample_weight=val_sample_weight,
            batch_size=context.get_per_slot_batch_size(),
            drop_leftovers=False,
        )

        trial.build_model()
        check.is_not_none(context.model, "Please call wrap_model(...).")

        check.is_not_none(context.compile_args,
                          "Please call model.compile(...).")
        compile_args = cast(inspect.BoundArguments, context.compile_args)

        (
            context.model,
            compile_args.arguments["optimizer"],
        ) = keras._get_multi_gpu_model_and_optimizer(
            pre_compiled_model=context.model,
            optimizer=compile_args.arguments["optimizer"],
            env=env,
            hvd_config=hvd_config,
            profile_frequency=env.experiment_config.profile_frequency(),
            profile_filename=DeterminedProfiler.OUTPUT_FILENAME,
        )

        if hvd_config.use and version.parse(tf.__version__) >= version.parse("2.0.0"):
            logging.info(
                "Calling `model.compile(...)` with `experimental_run_tf_function=False` to ensure "
                "TensorFlow calls `optimizer.get_gradients()` to compute gradients."
            )
            context.model.compile(*compile_args.args,
                                  **compile_args.kwargs,
                                  experimental_run_tf_function=False)
        else:
            context.model.compile(*compile_args.args, **compile_args.kwargs)

        tf_keras_callbacks = trial.keras_callbacks()

        return TFKerasTrialController(
            context.model,
            session,
            keras.TFKerasTrainConfig(training_data, validation_data,
                                     tf_keras_callbacks),
            context,
            env,
            workloads,
            load_path,
            rendezvous_info,
            hvd_config,
        )
Example #11
    def from_trial(
        trial_inst: det.Trial,
        context: det.TrialContext,
        env: det.EnvContext,
        workloads: workload.Stream,
        load_path: Optional[pathlib.Path],
        rendezvous_info: det.RendezvousInfo,
        hvd_config: horovod.HorovodContext,
    ) -> det.TrialController:
        check.is_instance(
            context, keras.TFKerasTrialContext,
            "TFKerasTrialController needs a TFKerasTrialContext")
        context = cast(keras.TFKerasTrialContext, context)

        check.is_instance(trial_inst, TFKerasTrial,
                          "TFKerasTrialController needs a TFKerasTrial")
        trial = cast(TFKerasTrial, trial_inst)

        session = TFKerasTrialController._configure_session(
            env, hvd_config, trial.session_config())

        training_data_loader = trial.build_training_data_loader()
        validation_data_loader = trial.build_validation_data_loader()

        trial.build_model()
        check.is_not_none(context.model, "Please call wrap_model(...).")

        training_x, training_y, training_sample_weight = keras._get_x_y_and_sample_weight(
            input_data=training_data_loader)
        training_data = keras._adapt_keras_data(
            x=training_x,
            y=training_y,
            sample_weight=training_sample_weight,
            batch_size=context.get_per_slot_batch_size(),
            use_multiprocessing=context._fit_use_multiprocessing,
            workers=context._fit_workers,
            max_queue_size=context._fit_max_queue_size,
            drop_leftovers=True,
        )

        val_x, val_y, val_sample_weight = keras._get_x_y_and_sample_weight(
            input_data=validation_data_loader)
        validation_data = keras._adapt_keras_data(
            x=val_x,
            y=val_y,
            sample_weight=val_sample_weight,
            batch_size=context.get_per_slot_batch_size(),
            use_multiprocessing=context._fit_use_multiprocessing,
            workers=context._fit_workers,
            max_queue_size=context._fit_max_queue_size,
            drop_leftovers=False,
        )

        check.is_not_none(context.compile_args,
                          "Please call model.compile(...).")
        compile_args = cast(inspect.BoundArguments, context.compile_args)

        TFKerasTrialController.compile_model(context=context,
                                             compile_args=compile_args,
                                             env=env,
                                             hvd_config=hvd_config)

        tf_keras_callbacks = trial.keras_callbacks()

        return TFKerasTrialController(
            context.model,
            session,
            keras.TFKerasTrainConfig(training_data, validation_data,
                                     tf_keras_callbacks),
            context,
            env,
            workloads,
            load_path,
            rendezvous_info,
            hvd_config,
        )