def optimizer(self):
    # Wrap the base optimizer with quantizer-specific gradient processors
    # and post-update ops, depending on the configured quantizer.
    opt = get_optimizer(self.optimizer_config)
    # W_opts values are stored as strings in the config, hence eval().
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['stop_grad']):
        self.add_stop_grad()
        opt = optimizer.apply_grad_processors(
            opt, [gradproc.MapGradient(self.stop_grad)])
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['centralized']):
        self.add_centralizing_update()
        opt = optimizer.PostProcessOptimizer(opt, self.centralizing)
    if self.quantizer_config['name'] == 'cent':
        self.add_centralizing_update()
        opt = optimizer.PostProcessOptimizer(opt, self.centralizing)
    if self.quantizer_config['name'] == 'cluster' and eval(self.load_config['clustering']):
        opt = optimizer.apply_grad_processors(
            opt, [gradproc.MapGradient(self.clustering)])
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['pruning']):
        self.add_masking_update()
        opt = optimizer.PostProcessOptimizer(opt, self.masking)
    if int(self.quantizer_config['BITA']) != 32 and self.quantizer_config['name'] == 'intQ':
        self.add_new_cs_update()
        opt = optimizer.PostProcessOptimizer(opt, self.ema)
    return opt
def optimizer(self):
    opt = get_optimizer(self.optimizer_config)
    # Disabled experiment: optionally switch to a second optimizer at runtime.
    # if self.optimizer_config['second'] is not None:
    #     temp = {'name': self.optimizer_config['second']}
    #     opt2 = get_optimizer(temp)
    #     choose = tf.get_variable('select_opt', initializer=False, dtype=tf.bool)
    #     opt = tf.cond(choose, opt2, opt)
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['stop_grad']):
        self.add_stop_grad()
        opt = optimizer.apply_grad_processors(
            opt, [gradproc.MapGradient(self.stop_grad)])
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['centralized']):
        self.add_centralizing_update()
        opt = optimizer.PostProcessOptimizer(opt, self.centralizing)
    if self.quantizer_config['name'] == 'cent':
        self.add_centralizing_update()
        opt = optimizer.PostProcessOptimizer(opt, self.centralizing)
    if self.quantizer_config['name'] == 'cluster' and eval(self.load_config['clustering']):
        opt = optimizer.apply_grad_processors(
            opt, [gradproc.MapGradient(self.clustering)])
    if self.quantizer_config['name'] == 'linear' and eval(self.quantizer_config['W_opts']['pruning']):
        self.add_masking_update()
        opt = optimizer.PostProcessOptimizer(opt, self.masking)
    return opt
def optimizer(self):
    opt = tf.train.AdamOptimizer(self.cfg.learning_rate)
    return optimizer.apply_grad_processors(opt, [
        gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.3)),
        gradproc.SummaryGradient()
    ])
def optimizer(self):
    lr = tf.get_variable('learning_rate', initializer=self.learning_rate, trainable=False)
    # opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
    opt = tf.train.AdamOptimizer(lr)
    return optimizer.apply_grad_processors(
        opt, [
            # gradproc.GlobalNormClip(2.0),
            gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.5)),
            gradproc.SummaryGradient()])
def optimizer(self):
    lr = tf.get_variable('learning_rate', initializer=1e-3, trainable=False)
    # This will also put the summary in tensorboard, stat.json and print in terminal,
    # but this time without moving average
    tf.summary.scalar('lr', lr)
    # opt = tf.train.MomentumOptimizer(lr, 0.9)
    opt = tf.train.AdamOptimizer(lr)
    return optimizer.apply_grad_processors(
        opt, [gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.5)),
              gradproc.SummaryGradient()])
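# --- Added sketch (not taken from the snippets above) -----------------------
# The optimizer() variants above look like tensorpack ModelDesc methods; the
# imports and surrounding class they rely on are not shown. Below is a minimal,
# self-contained sketch of that assumed context, using the tensorpack 1.x / TF1
# API. The class name, input spec, and loss are illustrative assumptions only.
import tensorflow as tf
from tensorpack import ModelDesc
from tensorpack.tfutils import optimizer, gradproc


class ExampleModel(ModelDesc):  # hypothetical class name
    def inputs(self):
        # dummy input spec; real models declare their actual inputs here
        return [tf.TensorSpec([None, 10], tf.float32, 'x')]

    def build_graph(self, x):
        # trivial quadratic loss so the sketch runs end to end
        w = tf.get_variable('w', [10, 1])
        cost = tf.reduce_mean(tf.square(tf.matmul(x, w)))
        return tf.identity(cost, name='cost')

    def optimizer(self):
        # same pattern as above: Adam wrapped with per-gradient clipping
        # and gradient summaries via apply_grad_processors
        lr = tf.get_variable('learning_rate', initializer=1e-3, trainable=False)
        tf.summary.scalar('lr', lr)
        opt = tf.train.AdamOptimizer(lr)
        return optimizer.apply_grad_processors(
            opt,
            [gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.5)),
             gradproc.SummaryGradient()])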
def _get_optimizer(self):
    gradprocs = [
        FilterGradientVariables('.*net2.*', verbose=False),
        gradproc.MapGradient(
            lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)),
        gradproc.GlobalNormClip(hp.train2.clip_norm),
        # gradproc.PrintGradient(),
        # gradproc.CheckGradient(),
    ]
    lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=lr)
    return optimizer.apply_grad_processors(opt, gradprocs)
def _get_optimizer(self):
    gradprocs = [
        tensorpack_extension.FilterGradientVariables('.*net2.*', verbose=False),
        gradproc.MapGradient(
            lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)),
        gradproc.GlobalNormClip(hp.train2.clip_norm),
        # gradproc.PrintGradient(),
        # gradproc.CheckGradient(),
    ]
    # global_step is only needed if one of the decay schedules below is re-enabled
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # self.lr = self.learning_rate_decay(global_step, hp.train2.lr)
    # lr = learning_rate_decay(initial_lr=hp.train2.lr, global_step=global_step)
    lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=lr)
    return optimizer.apply_grad_processors(opt, gradprocs)
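# --- Added sketch (not taken from the snippet above) ------------------------
# The last snippet keeps two commented-out calls to an undefined
# learning_rate_decay() helper. One plausible stand-in, assuming a standard
# exponential schedule (the decay_steps / decay_rate values here are made up),
# is TF1's built-in tf.train.exponential_decay:
import tensorflow as tf

def learning_rate_decay(initial_lr, global_step, decay_steps=10000, decay_rate=0.96):
    # lr = initial_lr * decay_rate ** (global_step // decay_steps) with staircase=True
    return tf.train.exponential_decay(
        initial_lr, global_step,
        decay_steps=decay_steps, decay_rate=decay_rate,
        staircase=True, name='learning_rate')

# In tensorpack it is usually simpler to keep 'learning_rate' as the
# non-trainable variable above and schedule it from the training loop with a
# ScheduledHyperParamSetter callback rather than building the decay into the graph.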