def build(self):
    """Assemble the trainer: optimizer, loss, metrics and the MsModel wrapper.

    When the configured lr scheduler carries parameters, a per-step dynamic
    learning-rate sequence is generated and handed to the optimizer;
    otherwise the optimizer is built with its own static settings.
    """
    super().build()
    scheduler_params = self.config.lr_scheduler.params
    if scheduler_params:
        self.lr_scheduler = LrScheduler()
        # Derive the full lr sequence over all training steps up front.
        dynamic_lr = self.lr_scheduler()(
            base_lr=self.config.optimizer.params["lr"],
            global_step=self.config.epochs * len(self.train_loader),
            total_epoch=self.config.epochs)
        self.optimizer = Optimizer()(model=self.model, dynamic_lr=dynamic_lr)
    else:
        self.optimizer = Optimizer()(model=self.model)
    if hasattr(self.model, 'add_loss'):
        # The model aggregates its own losses; register the configured one.
        self.model.add_loss(Loss()())
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    self.metric_name = self.config.metric.type
    # Some trainer has different train batch size from valid batch,
    # so only the validation metrics are created here.
    self.train_metrics = None
    self.valid_metrics = self._init_metrics()
    self.ms_model = MsModel(
        network=self.model,
        loss_fn=self.loss,
        optimizer=self.optimizer,
        metrics={self.metric_name: self.valid_metrics()})
def build(self):
    """Assemble the trainer: optimizer, loss, lr scheduler, metrics, AMP.

    Also snapshots the full-model components when an actions list is
    configured, and applies horovod / apex-AMP initialization when enabled.
    """
    super().build()
    self.optimizer = Optimizer()(model=self.model,
                                 distributed=self.distributed)
    if hasattr(self.model, 'add_loss'):
        # Model-managed loss aggregation: register and use overall loss.
        self.model.add_loss(Loss()())
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    self.lr_scheduler = LrScheduler()(self.optimizer)
    if self.actions_list is not None:
        # Keep handles to the whole-model components so that later
        # per-action rebuilds can restore them.
        self.total_optimizer = self.optimizer
        self.total_loss = self.loss
        self.total_lr_scheduler = self.lr_scheduler
    # Some trainer has different train batch size from valid batch;
    # build independent metric instances for each phase.
    self.train_metrics = self._init_metrics()
    self.valid_metrics = self._init_metrics()
    self._init_horovod_setting()
    if self.use_amp:
        from apex import amp
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level='O1')
def _default_model_fn(self, features, labels, mode):
    """Define model_fn used by TensorFlow Estimator.

    :params features: input features
    :type features: tensorflow tensors
    :params labels: label data
    :type labels: tensorflow tensors
    :params mode: mode of estimator
    :type mode: tf.estimator.ModeKeys
    :return: tensorflow EstimatorSpec
    :rtype: tf.estimator.EstimatorSpec
    """
    logging.info('model function action')
    self.model.training = mode == tf.estimator.ModeKeys.TRAIN
    logits = self.model(features)
    # Force the pretrained-weight assign ops to run before logits are
    # consumed; the cast below carries the control dependency, and all
    # downstream ops depend on the cast result transitively.
    assign_ops = self.model.pretrained()
    with tf.control_dependencies(assign_ops):
        logits = tf.cast(logits, tf.float32)
    if hasattr(self.model, 'add_loss'):
        # Model-managed loss: register the configured loss and use the
        # model's aggregated overall loss.
        loss_cls = Loss()()
        self.model.add_loss(loss_cls)
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    loss = self.loss(logits, labels)
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_or_create_global_step()
        # Fractional epoch from the step counter drives the lr schedule.
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.train_loader), tf.float32)
        self.optimizer = Optimizer()(distributed=self.distributed)
        self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
        self.lr_scheduler.step(epoch)
        if self.distributed:
            self.optimizer = Optimizer.set_distributed(self.optimizer)
        # Group batch-norm style UPDATE_OPS with the minimize op.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        loss_scale = self.config.loss_scale if self.use_amp else 1
        minimize_op = self.optimizer.step(loss, loss_scale, global_step)
        train_op = tf.group(minimize_op, update_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = self.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)
def build(self):
    """Set up optimizer, loss and validation metrics for this trainer."""
    super().build()
    self.optimizer = Optimizer()(model=self.model)
    if hasattr(self.model, 'add_loss'):
        # Model aggregates its own losses; register the configured one.
        self.model.add_loss(Loss()())
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    self.metric_name = self.config.metric.type
    # Some trainer has different train batch size from valid batch,
    # so train metrics are left unset here.
    self.train_metrics = None
    self.valid_metrics = self._init_metrics()
def model_fn(self, features, labels, mode):
    """Define cars model_fn used by TensorFlow Estimator.

    In TRAIN mode, samples several candidate architectures per step,
    accumulates their gradients, and applies one clipped update.
    In EVAL mode, evaluates with the fixed validation architecture.
    """
    logging.info('Cars model function action')
    self.trainer.loss = Loss()()
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_global_step()
        # Fractional epoch derived from the global step drives the lr.
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.trainer.train_loader), tf.float32)
        self.trainer.optimizer = Optimizer()(
            distributed=self.trainer.distributed)
        self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
        self.trainer.lr_scheduler.step(epoch)
        self.trainer.model.training = True
        alphas = tf.convert_to_tensor(self.alphas)
        for j in range(self.alg_policy.num_individual_per_iter):
            # Pick a random individual from the population each pass.
            i = np.random.randint(0, self.alg_policy.num_individual, 1)[0]
            if self.epoch < self.alg_policy.warmup:
                # During warmup, train on a randomly sampled path instead
                # of a population member.
                alpha = tf.convert_to_tensor(
                    self.search_alg.random_sample_path())
            else:
                alpha = alphas[i]
            logits = self.trainer.model(features, alpha=alpha)
            logits = tf.cast(logits, tf.float32)
            loss = self.trainer.loss(logits=logits, labels=labels)
            loss = self.trainer.optimizer.regularize_loss(loss)
            grads, vars = zip(
                *self.trainer.optimizer.compute_gradients(loss))
            if j == 0:
                # Zero-initialized accumulators, one per variable.
                accum_grads = [
                    tf.Variable(tf.zeros_like(grad), trainable=False)
                    for grad in grads
                ]
            # Accumulate gradients across sampled individuals.
            accum_grads = [
                accum_grads[k] + grads[k] for k in range(len(grads))
            ]
            if self.epoch < self.alg_policy.warmup:
                # Warmup uses a single sampled path per step.
                break
        # Apply one clipped update from the accumulated gradients.
        clipped_grads, _ = tf.clip_by_global_norm(
            accum_grads, self.trainer.config.grad_clip)
        minimize_op = self.trainer.optimizer.apply_gradients(
            list(zip(clipped_grads, vars)), global_step)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        # Evaluate with the fixed validation architecture.
        alpha = tf.convert_to_tensor(self.trainer.valid_alpha)
        self.trainer.model.training = False
        logits = self.trainer.model(features, alpha=alpha)
        logits = tf.cast(logits, tf.float32)
        loss = self.trainer.loss(logits=logits, labels=labels)
        eval_metric_ops = self.trainer.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)
def _build_train_op(self):
    """Build the single-device training graph.

    Creates input/label tensors, wires the model forward pass and loss,
    and registers ``self.train_op`` that computes and applies gradients.
    Side effects: sets ``self.inputs``, ``self.labels``, ``self.input``,
    ``self.logits``, ``self.actor_var``, ``self.loss``, ``self.optimizer``
    and ``self.train_op``, then initializes all graph variables.
    """
    self.inputs = self._create_tensor(self.loss_input['inputs'])
    self.labels = self._create_tensor(self.loss_input['labels'])
    # Only the first input tensor feeds the model forward pass.
    self.input = self.inputs[0]
    logits = self.model(self.input)
    self.logits = logits
    self.actor_var = TFVariables(logits, self.sess)
    loss = Loss()()
    self.loss = loss(logits, self.labels)
    self.optimizer = Optimizer()(distributed=self.distributed)
    # compute_gradients already yields (grad, var) pairs; the previous
    # unzip/re-zip round-trip (zip(*...) then zip(...) again) was a no-op
    # and has been removed.
    grads_and_var = self.optimizer.compute_gradients(self.loss)
    self.train_op = self.optimizer.apply_gradients(grads_and_var)
    # NOTE(review): tf.initialize_all_variables is deprecated in favor of
    # tf.compat.v1.global_variables_initializer; kept as-is for
    # consistency with the sibling multi-GPU builder.
    self.sess.run(tf.initialize_all_variables())
def model_fn(self, features, labels, mode):
    """Darts model_fn used by TensorFlow Estimator.

    In TRAIN mode the batch carries both 'train' and 'valid' splits: the
    architecture parameters are updated from the valid split first, then
    the network weights are updated from the train split, with control
    dependencies enforcing that ordering.
    """
    logging.info('Darts model function action')
    global_step = tf.compat.v1.train.get_global_step()
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # The input fn packs train/valid pairs into one batch.
        features, valid_features = features['train'], features['valid']
        labels, valid_labels = labels['train'], labels['valid']
        # update arch
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.trainer.train_loader), tf.float32)
        self.trainer.optimizer = Optimizer()(
            distributed=self.trainer.distributed)
        self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
        self.trainer.lr_scheduler.step(epoch)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        # Architecture-parameter update driven by the validation split.
        arch_minimize_op = self.search_alg.step(
            valid_x=valid_features,
            valid_y=valid_labels,
            lr=self.trainer.lr_scheduler.get_lr()[0])
        train_op = tf.group(arch_minimize_op, update_ops)
    self.model.training = mode == tf.estimator.ModeKeys.TRAIN
    logits = self.model(features)
    logits = tf.cast(logits, tf.float32)
    self.trainer.loss = Loss()()
    loss = self.trainer.loss(logits=logits, labels=labels)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Weight update must run only after the arch update has completed.
        with tf.control_dependencies([train_op]):
            weight_ops = self.model.get_weight_ops()
            loss_scale = self.trainer.config.loss_scale if self.trainer.use_amp else 1
            train_op = self.trainer.optimizer.step(
                loss, loss_scale, global_step, weight_ops)
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = self.trainer.valid_metrics(logits, labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)
def _build_multigpu_train_op(self, num_gpus):
    """Build a data-parallel training graph across ``num_gpus`` devices.

    Each GPU tower gets its own input/label tensors, forward pass, loss
    and gradients; gradients are averaged and applied by one optimizer.
    Tower 0's tensors are exposed on ``self`` for inference/weight sync.
    """
    with self.graph.as_default(), tf.device('/gpu:0'):
        tower_grads = []
        self.inputs = []
        self.labels = []
        opt = Optimizer()()
        for gpu_id in range(num_gpus):
            with tf.device('/gpu:%d' % gpu_id):
                with tf.name_scope('tower_%d' % gpu_id):
                    tower_inputs = self._create_tensor(
                        self.loss_input['inputs'])
                    tower_labels = self._create_tensor(
                        self.loss_input['labels'])
                    first_input = tower_inputs[0]
                    model_output = self.model(first_input)
                    loss = Loss()()(model_output, tower_labels)
                    # Restrict this tower's gradients to its own variables.
                    tower_vars = [
                        v for v in tf.trainable_variables()
                        if v.name.startswith('tower_%d' % gpu_id)
                    ]
                    tower_grads.append(
                        opt.compute_gradients(loss, tower_vars))
                    if gpu_id == 0:
                        # Keep tower-0 handles for later use.
                        self.actor_var = TFVariables(model_output, self.sess)
                        self.input = first_input
                        self.logits = model_output
                        self.loss = loss
                    self.inputs.append(tower_inputs)
                    self.labels.append(tower_labels)
        averaged = self._average_gradients(tower_grads)
        self.train_op = opt.apply_gradients(averaged)
        self.sess.run(tf.initialize_all_variables())
def _default_model_fn(self, features, labels, mode):
    """Define model_fn used by TensorFlow Estimator.

    :params features: input features
    :type features: tensorflow tensors
    :params labels: label data
    :type labels: tensorflow tensors
    :params mode: mode of estimator
    :type mode: tf.estimator.ModeKeys
    :return: tensorflow EstimatorSpec
    :rtype: tf.estimator.EstimatorSpec
    """
    logging.info('model function action')
    self.model.training = mode == tf.estimator.ModeKeys.TRAIN
    if self.config.mixup and mode == tf.estimator.ModeKeys.TRAIN:
        # Mixup augmentation: sample a Beta(0.1, 0.1) mixing ratio and
        # blend pairs of inputs/labels.
        mixup_ratio = tf.compat.v1.distributions.Beta(0.1, 0.1).sample()
        mixed_x, y_a, y_b = self._mixup_batch(features, labels, mixup_ratio)
        logits = self.model(mixed_x)
    else:
        logits = self.model(features)
    logits = tf.cast(logits, tf.float32)
    if hasattr(self.model, 'add_loss'):
        # Model-managed loss aggregation path.
        loss_cls = Loss()()
        self.model.add_loss(loss_cls)
        self.loss = self.model.overall_loss()
    else:
        self.loss = Loss()()
    # loss
    if self.config.mixup and mode == tf.estimator.ModeKeys.TRAIN:
        # Mixup loss blends the losses against both original label sets.
        loss = self._mixup_loss(self.loss, logits, y_a, y_b, mixup_ratio)
    else:
        loss = self.loss(logits, labels)
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.compat.v1.train.get_or_create_global_step()
        # Fractional epoch from the step counter drives the lr schedule.
        epoch = tf.cast(global_step, tf.float32) / tf.cast(
            len(self.train_loader), tf.float32)
        self.optimizer = Optimizer()(distributed=self.distributed)
        self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
        self.lr_scheduler.step(epoch)
        if self.distributed:
            self.optimizer = Optimizer.set_distributed(self.optimizer)
        # Group batch-norm style UPDATE_OPS with the minimize op.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        loss_scale = self.config.loss_scale if self.use_amp else 1
        minimize_op = self.optimizer.step(loss, loss_scale, global_step)
        train_op = tf.group(minimize_op, update_ops)
        # Log the current learning rate every 10 steps.
        logging_hook = list()
        logging_hook.append(
            tf.train.LoggingTensorHook(
                tensors={"learning rate": self.lr_scheduler.get_lr()[0]},
                every_n_iter=10))
    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = self.valid_metrics(logits, labels)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Training hooks are only valid on the TRAIN spec.
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops,
                                          training_hooks=logging_hook)
    else:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)