Example #1
0
    def build(self):
        """Set up optimizer, loss and metric attributes after the parent build.

        The loss object comes from ``Loss()()``. When the model exposes an
        ``add_loss`` attribute, that loss is registered on the model and
        ``self.loss`` is taken from ``self.model.overall_loss()``; otherwise
        the loss object is used directly.
        """
        super().build()

        self.optimizer = Optimizer()(model=self.model)

        model_owns_loss = hasattr(self.model, 'add_loss')
        base_loss = Loss()()
        if model_owns_loss:
            self.model.add_loss(base_loss)
            self.loss = self.model.overall_loss()
        else:
            self.loss = base_loss

        self.metric_name = self.config.metric.type

        # Only validation metrics are initialised here; the train metrics
        # attribute is explicitly left empty.
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()
Example #2
0
    def model_fn(self, features, labels, mode):
        """Define cars model_fn used by TensorFlow Estimator.

        In TRAIN mode, gradients are computed for up to
        ``alg_policy.num_individual_per_iter`` alphas, summed, clipped by
        global norm and applied in a single step. In EVAL mode the model is
        run with ``trainer.valid_alpha`` and the validation metrics are built.

        :param features: input features fed to the model
        :param labels: label data fed to the loss and the metrics
        :param mode: estimator mode, compared against tf.estimator.ModeKeys
        :return: spec built from loss, train_op and eval_metric_ops
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('Cars model function action')
        self.trainer.loss = Loss()()

        # Only the TRAIN and EVAL branches assign a loss. Starting from None
        # means any other mode reaches EstimatorSpec (and its own validation)
        # instead of failing with UnboundLocalError at the return statement.
        loss = None
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.compat.v1.train.get_global_step()
            # Fractional epoch: global step divided by the loader length.
            epoch = tf.cast(global_step, tf.float32) / tf.cast(
                len(self.trainer.train_loader), tf.float32)
            self.trainer.optimizer = Optimizer()(
                distributed=self.trainer.distributed)
            self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
            self.trainer.lr_scheduler.step(epoch)
            self.trainer.model.training = True
            alphas = tf.convert_to_tensor(self.alphas)
            in_warmup = self.epoch < self.alg_policy.warmup
            for j in range(self.alg_policy.num_individual_per_iter):
                # The index is drawn on every iteration, even during warmup,
                # so the NumPy random stream advances as it did before.
                i = np.random.randint(0, self.alg_policy.num_individual, 1)[0]
                if in_warmup:
                    alpha = tf.convert_to_tensor(
                        self.search_alg.random_sample_path())
                else:
                    alpha = alphas[i]
                logits = self.trainer.model(features, alpha=alpha)
                logits = tf.cast(logits, tf.float32)
                loss = self.trainer.loss(logits=logits, labels=labels)
                loss = self.trainer.optimizer.regularize_loss(loss)
                # `variables` rather than `vars`, to avoid shadowing the
                # builtin of the same name.
                grads, variables = zip(
                    *self.trainer.optimizer.compute_gradients(loss))
                if j == 0:
                    # Zero-valued, non-trainable accumulators shaped like the
                    # first set of gradients.
                    accum_grads = [
                        tf.Variable(tf.zeros_like(grad), trainable=False)
                        for grad in grads
                    ]
                accum_grads = [
                    accum + grad for accum, grad in zip(accum_grads, grads)
                ]
                # During warmup a single randomly sampled path is trained.
                if in_warmup:
                    break
            clipped_grads, _ = tf.clip_by_global_norm(
                accum_grads, self.trainer.config.grad_clip)
            minimize_op = self.trainer.optimizer.apply_gradients(
                list(zip(clipped_grads, variables)), global_step)
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            train_op = tf.group(minimize_op, update_ops)

        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            alpha = tf.convert_to_tensor(self.trainer.valid_alpha)
            self.trainer.model.training = False
            logits = self.trainer.model(features, alpha=alpha)
            logits = tf.cast(logits, tf.float32)
            loss = self.trainer.loss(logits=logits, labels=labels)
            eval_metric_ops = self.trainer.valid_metrics(logits, labels)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)
Example #3
0
    def _build_multigpu_train_op(self, num_gpus):
        """Build a multi-tower training op and initialise the graph variables.

        For each of the ``num_gpus`` devices a tower is built inside the name
        scope ``tower_<i>``: input and label tensors are created, the model is
        applied to the first input, a loss is computed and gradients are taken
        with a shared optimizer. The per-tower gradients are combined by
        ``self._average_gradients`` and applied in ``self.train_op``.

        Tower 0 also provides ``self.actor_var``, ``self.input``,
        ``self.logits`` and ``self.loss``. ``self.inputs`` and ``self.labels``
        collect the tensors of every tower.

        :param num_gpus: number of towers to build, one per ``/gpu:<i>`` device
        :type num_gpus: int
        """
        with self.graph.as_default(), tf.device('/gpu:0'):
            tower_grads = []
            self.inputs = []
            self.labels = []
            opt = Optimizer()()
            for i in range(num_gpus):
                tower_name = 'tower_%d' % i
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(tower_name):
                        # NOTE(review): variable reuse across towers is not
                        # enabled here (the original carried a commented-out
                        # reuse_variables() call) -- confirm this is intended.
                        inputs = self._create_tensor(self.loss_input['inputs'])
                        labels = self._create_tensor(self.loss_input['labels'])
                        # Named `tower_input` to avoid shadowing the builtin
                        # `input`.
                        tower_input = inputs[0]
                        model_output = self.model(tower_input)

                        loss = Loss()()
                        loss = loss(model_output, labels)

                        # Calculate the gradients for the batch of data on
                        # this tower.
                        # NOTE(review): this is a plain prefix match, so
                        # 'tower_1' also matches names starting with
                        # 'tower_10'; only matters for more than 10 towers.
                        varlist = [
                            x for x in tf.trainable_variables()
                            if x.name.startswith(tower_name)
                        ]
                        grads = opt.compute_gradients(loss, varlist)

                        tower_grads.append(grads)
                        if i == 0:
                            self.actor_var = TFVariables(model_output, self.sess)
                            self.input = tower_input
                            self.logits = model_output
                            self.loss = loss

                        self.inputs.append(inputs)
                        self.labels.append(labels)

            grads = self._average_gradients(tower_grads)
            self.train_op = opt.apply_gradients(grads)
            # tf.initialize_all_variables() is deprecated in favour of
            # tf.global_variables_initializer().
            self.sess.run(tf.global_variables_initializer())
Example #4
0
    def _default_model_fn(self, features, labels, mode):
        """Build the default EstimatorSpec for the TensorFlow Estimator.

        The model is applied to ``features`` and the loss is computed for
        every mode. A train op is added in TRAIN mode and metric ops in EVAL
        mode.

        :param features: input features
        :type features: tensorflow tensors
        :param labels: label data
        :type labels: tensorflow tensors
        :param mode: mode of estimator
        :type mode: tf.estimator.ModeKeys
        :return: tensorflow EstimatorSpec
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('model function action')
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        self.model.training = is_training
        logits = tf.cast(self.model(features), tf.float32)

        # Models exposing `add_loss` combine the configured loss themselves.
        model_owns_loss = hasattr(self.model, 'add_loss')
        base_loss = Loss()()
        if model_owns_loss:
            self.model.add_loss(base_loss)
            self.loss = self.model.overall_loss()
        else:
            self.loss = base_loss
        loss = self.loss(logits, labels)

        train_op = None
        if is_training:
            global_step = tf.compat.v1.train.get_or_create_global_step()
            # Fractional epoch: global step divided by the loader length.
            steps_per_epoch = tf.cast(len(self.train_loader), tf.float32)
            epoch = tf.cast(global_step, tf.float32) / steps_per_epoch
            self.optimizer = Optimizer()(distributed=self.distributed)
            self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
            self.lr_scheduler.step(epoch)
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            if self.use_amp:
                loss_scale = self.config.loss_scale
            else:
                loss_scale = 1
            minimize_op = self.optimizer.step(loss, loss_scale, global_step)
            train_op = tf.group(minimize_op, update_ops)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.valid_metrics(logits, labels)
        else:
            eval_metric_ops = None

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
        )
Example #5
0
    def build(self):
        """Create optimizer, loss, LR scheduler and metrics after the parent build.

        When the model exposes ``add_loss``, the configured loss is registered
        on the model and ``self.loss`` is ``self.model.overall_loss()``;
        otherwise the configured loss is used directly. With ``self.use_amp``
        set, model and optimizer are passed through ``amp.initialize``.
        """
        super().build()

        self.optimizer = Optimizer()(
            model=self.model,
            distributed=self.distributed,
        )

        model_owns_loss = hasattr(self.model, 'add_loss')
        base_loss = Loss()()
        if model_owns_loss:
            self.model.add_loss(base_loss)
            self.loss = self.model.overall_loss()
        else:
            self.loss = base_loss

        self.lr_scheduler = LrScheduler()(self.optimizer)

        # Train and validation metrics are separate objects because some
        # trainers use a different batch size for each.
        self.train_metrics = self._init_metrics()
        self.valid_metrics = self._init_metrics()
        self._init_horovod_setting()

        if not self.use_amp:
            return
        amp_model, amp_optimizer = amp.initialize(
            self.model, self.optimizer, opt_level='O1')
        self.model = amp_model
        self.optimizer = amp_optimizer
Example #6
0
    def build(self):
        """Assemble loaders, optimizer, loss and metrics for the active backend.

        Optimizer and loss are only created on the torch and MindSpore
        backends. The LR scheduler, train metrics, SyncBN conversion and AMP
        initialisation are torch-only; ``metric_name`` is MindSpore-only.
        """
        self._init_hps(self.hps)
        logging.debug("Trainer Config: {}".format(self.config))
        self.do_validation = self.config.with_valid
        self.use_syncbn = self.config.syncbn
        if self.use_syncbn and zeus.is_torch_backend():
            self.model = apex.parallel.convert_syncbn_model(self.model)

        self.train_loader = self._init_dataloader(mode='train')
        self.valid_loader = self._init_dataloader(mode='val')
        # MindSpore loaders report their size via get_dataset_size().
        if zeus.is_ms_backend():
            self.batch_num_train = self.train_loader.get_dataset_size()
        else:
            self.batch_num_train = len(self.train_loader)
        if zeus.is_ms_backend():
            self.batch_num_valid = self.valid_loader.get_dataset_size()
        else:
            self.batch_num_valid = len(self.valid_loader)

        on_torch = zeus.is_torch_backend()
        on_ms = (not on_torch) and zeus.is_ms_backend()

        if on_torch:
            self.optimizer = Optimizer()(model=self.model,
                                         distributed=self.distributed)
        elif on_ms:
            self.optimizer = Optimizer()(model=self.model)

        if on_torch or on_ms:
            # Loss wiring is the same for both backends.
            model_owns_loss = hasattr(self.model, 'add_loss')
            base_loss = Loss()()
            if model_owns_loss:
                self.model.add_loss(base_loss)
                self.loss = self.model.overall_loss()
            else:
                self.loss = base_loss

        if on_torch:
            self.lr_scheduler = LrScheduler()(self.optimizer)
        elif on_ms:
            self.metric_name = self.config.metric().type

        # Train and validation metrics are separate objects because some
        # trainers use a different batch size for each.
        if on_torch:
            self.train_metrics = self._init_metrics()
        else:
            self.train_metrics = None
        self.valid_metrics = self._init_metrics()
        self._init_horovod_setting()

        if self.use_amp and on_torch:
            amp_model, amp_optimizer = amp.initialize(
                self.model, self.optimizer, opt_level='O1')
            self.model = amp_model
            self.optimizer = amp_optimizer
Example #7
0
    def _init_train_op(self):
        """Build the single-device training op and initialise graph variables.

        Creates input and label tensors from ``self.loss_input``, applies the
        model to the first input, computes the loss, and builds
        ``self.train_op`` from the optimizer's gradients. Finally the
        variable initializer is run in ``self.sess``.
        """
        self.inputs = self._create_tensor(self.loss_input['inputs'])
        self.labels = self._create_tensor(self.loss_input['labels'])

        self.input = self.inputs[0]
        logits = self.model(self.input)
        self.logits = logits
        self.actor_var = TFVariables(logits, self.sess)

        loss_fn = Loss()()
        self.loss = loss_fn(logits, self.labels)

        self.optimizer = Optimizer()(distributed=self.distributed)
        # The (gradient, variable) pairs are applied as returned; the previous
        # unzip/re-zip round trip produced the same pairs.
        grads_and_vars = self.optimizer.compute_gradients(self.loss)
        self.train_op = self.optimizer.apply_gradients(grads_and_vars)
        # tf.initialize_all_variables() is deprecated in favour of
        # tf.global_variables_initializer().
        self.sess.run(tf.global_variables_initializer())
Example #8
0
    def model_fn(self, features, labels, mode):
        """Darts model_fn used by TensorFlow Estimator.

        In TRAIN mode ``features`` and ``labels`` are mappings with 'train'
        and 'valid' entries: the 'valid' part drives the architecture step of
        the search algorithm, and the 'train' part drives the weight update,
        which is built under a control dependency on the architecture step.

        :param features: input features (mapping with 'train'/'valid' in TRAIN)
        :param labels: label data (mapping with 'train'/'valid' in TRAIN)
        :param mode: estimator mode, compared against tf.estimator.ModeKeys
        :return: spec built from loss, train_op and eval_metric_ops
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('Darts model function action')
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        global_step = tf.compat.v1.train.get_global_step()

        train_op = None
        if is_training:
            valid_features = features['valid']
            features = features['train']
            valid_labels = labels['valid']
            labels = labels['train']
            # Architecture update.
            steps_per_epoch = tf.cast(
                len(self.trainer.train_loader), tf.float32)
            epoch = tf.cast(global_step, tf.float32) / steps_per_epoch
            self.trainer.optimizer = Optimizer()(
                distributed=self.trainer.distributed)
            self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
            self.trainer.lr_scheduler.step(epoch)
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            current_lr = self.trainer.lr_scheduler.get_lr()[0]
            arch_minimize_op = self.search_alg.step(
                valid_x=valid_features,
                valid_y=valid_labels,
                lr=current_lr)
            train_op = tf.group(arch_minimize_op, update_ops)

        self.model.training = is_training
        logits = tf.cast(self.model(features), tf.float32)
        self.trainer.loss = Loss()()
        loss = self.trainer.loss(logits=logits, labels=labels)

        if is_training:
            # The weight update is created under a control dependency on the
            # architecture op built above.
            with tf.control_dependencies([train_op]):
                weight_ops = self.model.get_weight_ops()
                if self.trainer.use_amp:
                    loss_scale = self.trainer.config.loss_scale
                else:
                    loss_scale = 1
                train_op = self.trainer.optimizer.step(
                    loss, loss_scale, global_step, weight_ops)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.trainer.valid_metrics(logits, labels)
        else:
            eval_metric_ops = None

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
        )