Example #1
    def build(self):
        """Build the trainer by assembling the necessary components."""
        super().build()
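        # Build a dynamic learning-rate sequence from the configured scheduler and pass it to the optimizer; otherwise use the optimizer's default learning rate.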
        if self.config.lr_scheduler.params:
            self.lr_scheduler = LrScheduler()
            dynamic_lr = self.lr_scheduler()(
                base_lr=self.config.optimizer.params["lr"],
                global_step=self.config.epochs * len(self.train_loader),
                total_epoch=self.config.epochs)
            self.optimizer = Optimizer()(model=self.model,
                                         dynamic_lr=dynamic_lr)
        else:
            self.optimizer = Optimizer()(model=self.model)
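        # Models that implement add_loss() register the configured loss and report an overall loss; otherwise use the configured loss directly.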
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric.type

        # Some trainers use a different train batch size than the validation batch size.
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()

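        # Wrap network, loss, optimizer and the validation metric in a MindSpore Model.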
        self.ms_model = MsModel(
            network=self.model,
            loss_fn=self.loss,
            optimizer=self.optimizer,
            metrics={self.metric_name: self.valid_metrics()})
Example #2
    def build(self):
        """Build the trainer by assembling the necessary components."""
        super().build()

        self.optimizer = Optimizer()(model=self.model,
                                     distributed=self.distributed)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.lr_scheduler = LrScheduler()(self.optimizer)
        if self.actions_list is not None:
            self.total_optimizer = self.optimizer
            self.total_loss = self.loss
            self.total_lr_scheduler = self.lr_scheduler
        # Some trainers use a different train batch size than the validation batch size.
        self.train_metrics = self._init_metrics()
        self.valid_metrics = self._init_metrics()
        self._init_horovod_setting()
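        # Optionally enable NVIDIA Apex automatic mixed precision at optimization level O1.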
        if self.use_amp:
            from apex import amp
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
Example #3
    def _default_model_fn(self, features, labels, mode):
        """Define model_fn used by TensorFlow Estimator.

        :param features: input features
        :type features: tensorflow tensors
        :param labels: label data
        :type labels: tensorflow tensors
        :param mode: mode of estimator
        :type mode: tf.estimator.ModeKeys
        :return: tensorflow EstimatorSpec
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('model function action')
        self.model.training = mode == tf.estimator.ModeKeys.TRAIN
        logits = self.model(features)
        assign_ops = self.model.pretrained()
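        # Ensure the pretrained-weight assignment ops run before the loss and training ops built below.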
        with tf.control_dependencies(assign_ops):
            logits = tf.cast(logits, tf.float32)
            if hasattr(self.model, 'add_loss'):
                loss_cls = Loss()()
                self.model.add_loss(loss_cls)
                self.loss = self.model.overall_loss()
            else:
                self.loss = Loss()()
            loss = self.loss(logits, labels)
            train_op = None
            if mode == tf.estimator.ModeKeys.TRAIN:
                global_step = tf.compat.v1.train.get_or_create_global_step()
                epoch = tf.cast(global_step, tf.float32) / tf.cast(
                    len(self.train_loader), tf.float32)
                self.optimizer = Optimizer()(distributed=self.distributed)
                self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
                self.lr_scheduler.step(epoch)
                if self.distributed:
                    self.optimizer = Optimizer.set_distributed(self.optimizer)

                update_ops = tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.UPDATE_OPS)
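                # Scale the loss only under AMP, then group the optimizer step with UPDATE_OPS (e.g. batch-norm moving averages).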
                loss_scale = self.config.loss_scale if self.use_amp else 1
                minimize_op = self.optimizer.step(loss, loss_scale,
                                                  global_step)
                train_op = tf.group(minimize_op, update_ops)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.valid_metrics(logits, labels)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)
Example #4
    def build(self):
        """Build the trainer by assembling the necessary components."""
        super().build()

        self.optimizer = Optimizer()(model=self.model)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric.type

        # Some trainers use a different train batch size than the validation batch size.
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()
Example #5
    def model_fn(self, features, labels, mode):
        """Define cars model_fn used by TensorFlow Estimator."""
        logging.info('Cars model function action')
        self.trainer.loss = Loss()()

        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.compat.v1.train.get_global_step()
            epoch = tf.cast(global_step, tf.float32) / tf.cast(
                len(self.trainer.train_loader), tf.float32)
            self.trainer.optimizer = Optimizer()(
                distributed=self.trainer.distributed)
            self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
            self.trainer.lr_scheduler.step(epoch)
            self.trainer.model.training = True
            alphas = tf.convert_to_tensor(self.alphas)
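            # Accumulate gradients over several sampled candidate architectures; during warm-up a single random path is sampled instead.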
            for j in range(self.alg_policy.num_individual_per_iter):
                i = np.random.randint(0, self.alg_policy.num_individual, 1)[0]
                if self.epoch < self.alg_policy.warmup:
                    alpha = tf.convert_to_tensor(
                        self.search_alg.random_sample_path())
                else:
                    alpha = alphas[i]
                logits = self.trainer.model(features, alpha=alpha)
                logits = tf.cast(logits, tf.float32)
                loss = self.trainer.loss(logits=logits, labels=labels)
                loss = self.trainer.optimizer.regularize_loss(loss)
                grads, vars = zip(
                    *self.trainer.optimizer.compute_gradients(loss))
                if j == 0:
                    accum_grads = [
                        tf.Variable(tf.zeros_like(grad), trainable=False)
                        for grad in grads
                    ]
                accum_grads = [
                    accum_grads[k] + grads[k] for k in range(len(grads))
                ]
                if self.epoch < self.alg_policy.warmup:
                    break
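            # Clip the accumulated gradients by global norm and apply them as a single update, grouped with UPDATE_OPS.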
            clipped_grads, _ = tf.clip_by_global_norm(
                accum_grads, self.trainer.config.grad_clip)
            minimize_op = self.trainer.optimizer.apply_gradients(
                list(zip(clipped_grads, vars)), global_step)
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            train_op = tf.group(minimize_op, update_ops)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            alpha = tf.convert_to_tensor(self.trainer.valid_alpha)
            self.trainer.model.training = False
            logits = self.trainer.model(features, alpha=alpha)
            logits = tf.cast(logits, tf.float32)
            loss = self.trainer.loss(logits=logits, labels=labels)
            eval_metric_ops = self.trainer.valid_metrics(logits, labels)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)
Example #6
    def _build_train_op(self):
        self.inputs = self._create_tensor(self.loss_input['inputs'])
        self.labels = self._create_tensor(self.loss_input['labels'])

        self.input = self.inputs[0]
        logits = self.model(self.input)
        self.logits = logits
        self.actor_var = TFVariables(logits, self.sess)

        loss = Loss()()
        self.loss = loss(logits, self.labels)

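        # Compute gradients with the configured (optionally distributed) optimizer and build the training op.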
        self.optimizer = Optimizer()(distributed=self.distributed)
        grads_and_var = self.optimizer.compute_gradients(self.loss)
        grads, var = zip(*grads_and_var)
        grads_and_var = list(zip(grads, var))
        self.train_op = self.optimizer.apply_gradients(grads_and_var)
        self.sess.run(tf.initialize_all_variables())
Example #7
    def model_fn(self, features, labels, mode):
        """Darts model_fn used by TensorFlow Estimator."""
        logging.info('Darts model function action')
        global_step = tf.compat.v1.train.get_global_step()
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            features, valid_features = features['train'], features['valid']
            labels, valid_labels = labels['train'], labels['valid']
            # update arch
            epoch = tf.cast(global_step, tf.float32) / tf.cast(
                len(self.trainer.train_loader), tf.float32)
            self.trainer.optimizer = Optimizer()(
                distributed=self.trainer.distributed)
            self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
            self.trainer.lr_scheduler.step(epoch)
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            arch_minimize_op = self.search_alg.step(
                valid_x=valid_features,
                valid_y=valid_labels,
                lr=self.trainer.lr_scheduler.get_lr()[0])
            train_op = tf.group(arch_minimize_op, update_ops)
        self.model.training = mode == tf.estimator.ModeKeys.TRAIN
        logits = self.model(features)
        logits = tf.cast(logits, tf.float32)
        self.trainer.loss = Loss()()
        loss = self.trainer.loss(logits=logits, labels=labels)

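        # With the architecture update as a dependency, take the weight-update step (loss-scaled under AMP).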
        if mode == tf.estimator.ModeKeys.TRAIN:
            with tf.control_dependencies([train_op]):
                weight_ops = self.model.get_weight_ops()
                loss_scale = self.trainer.config.loss_scale if self.trainer.use_amp else 1
                train_op = self.trainer.optimizer.step(loss, loss_scale,
                                                       global_step, weight_ops)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.trainer.valid_metrics(logits, labels)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)
Example #8
    def _build_multigpu_train_op(self, num_gpus):
        with self.graph.as_default(), tf.device('/gpu:0'):
            tower_grads = []
            self.inputs = []
            self.labels = []
            opt = Optimizer()()
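            # Build one tower per GPU: each tower gets its own input/label tensors, forward pass, loss and gradients.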
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i):
                        # tf.get_variable_scope().reuse_variables()
                        inputs = self._create_tensor(self.loss_input['inputs'])
                        labels = self._create_tensor(self.loss_input['labels'])
                        input = inputs[0]
                        model_output = self.model(input)

                        loss = Loss()()
                        loss = loss(model_output, labels)

                        # Calculate the gradients for the batch of data on this tower.
                        varlist = [
                            x for x in tf.trainable_variables()
                            if x.name.startswith('tower_%d' % i)
                        ]
                        grads = opt.compute_gradients(loss, varlist)

                        tower_grads.append(grads)
                        if i == 0:
                            self.actor_var = TFVariables(
                                model_output, self.sess)
                            self.input = input
                            self.logits = model_output
                            self.loss = loss

                        self.inputs.append(inputs)
                        self.labels.append(labels)

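            # Average the per-tower gradients and apply them to create the training op.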
            grads = self._average_gradients(tower_grads)
            self.train_op = opt.apply_gradients(grads)
            self.sess.run(tf.initialize_all_variables())
Example #9
    def _default_model_fn(self, features, labels, mode):
        """Define model_fn used by TensorFlow Estimator.

        :param features: input features
        :type features: tensorflow tensors
        :param labels: label data
        :type labels: tensorflow tensors
        :param mode: mode of estimator
        :type mode: tf.estimator.ModeKeys
        :return: tensorflow EstimatorSpec
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('model function action')

        self.model.training = mode == tf.estimator.ModeKeys.TRAIN
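        # With mixup enabled during training, mix inputs with a ratio drawn from Beta(0.1, 0.1) and keep both label sets for the mixed loss.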
        if self.config.mixup and mode == tf.estimator.ModeKeys.TRAIN:
            mixup_ratio = tf.compat.v1.distributions.Beta(0.1, 0.1).sample()
            mixed_x, y_a, y_b = self._mixup_batch(features, labels,
                                                  mixup_ratio)
            logits = self.model(mixed_x)
        else:
            logits = self.model(features)
        logits = tf.cast(logits, tf.float32)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        # loss
        if self.config.mixup and mode == tf.estimator.ModeKeys.TRAIN:
            loss = self._mixup_loss(self.loss, logits, y_a, y_b, mixup_ratio)
        else:
            loss = self.loss(logits, labels)
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.compat.v1.train.get_or_create_global_step()
            epoch = tf.cast(global_step, tf.float32) / tf.cast(
                len(self.train_loader), tf.float32)
            self.optimizer = Optimizer()(distributed=self.distributed)
            self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
            self.lr_scheduler.step(epoch)
            if self.distributed:
                self.optimizer = Optimizer.set_distributed(self.optimizer)

            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            loss_scale = self.config.loss_scale if self.use_amp else 1
            minimize_op = self.optimizer.step(loss, loss_scale, global_step)
            train_op = tf.group(minimize_op, update_ops)
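            # Attach a hook that logs the current learning rate every 10 steps.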
            logging_hook = list()
            logging_hook.append(
                tf.train.LoggingTensorHook(
                    tensors={"learning rate": self.lr_scheduler.get_lr()[0]},
                    every_n_iter=10))

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.valid_metrics(logits, labels)
        if mode == tf.estimator.ModeKeys.TRAIN:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              training_hooks=logging_hook)
        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops)