def get_config():
    M = Model()
    dataflow = data_io

    from tensorpack.callbacks.base import Callback

    class CBSyncWeight(Callback):
        def _before_run(self, ctx):
            if self.local_step % 10 == 0:
                return [M._sync_op_pred]

    import functools
    from tensorpack.train.config import TrainConfig
    from tensorpack.callbacks.saver import ModelSaver
    from tensorpack.callbacks.graph import RunOp
    from tensorpack.callbacks.param import ScheduledHyperParamSetter, HumanHyperParamSetter, HyperParamSetterWithFunc
    from tensorpack.tfutils import sesscreate
    from tensorpack.tfutils.common import get_default_sess_config
    import tensorpack.tfutils.symbolic_functions as symbf

    # create the sigma_beta variables so the ScheduledHyperParamSetters below can find them by name
    sigma_beta_steering = symbf.get_scalar_var('actor/sigma_beta_steering', 0.3, summary=True, trainable=False)
    sigma_beta_accel = symbf.get_scalar_var('actor/sigma_beta_accel', 0.3, summary=True, trainable=False)

    return TrainConfig(
        model=M,
        data=dataflow,
        callbacks=[
            ModelSaver(),
            HyperParamSetterWithFunc(
                'learning_rate/actor',
                functools.partial(M._calc_learning_rate, 'actor')),
            HyperParamSetterWithFunc(
                'learning_rate/critic',
                functools.partial(M._calc_learning_rate, 'critic')),
            # ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            # HumanHyperParamSetter('learning_rate'),
            # HumanHyperParamSetter('entropy_beta'),
            ScheduledHyperParamSetter('actor/sigma_beta_accel', [(1, 0.2), (2, 0.01)]),
            ScheduledHyperParamSetter('actor/sigma_beta_steering', [(1, 0.1), (2, 0.01)]),
            CBSyncWeight(),
            data_io,
            # PeriodicTrigger(Evaluator(
            #     EVAL_EPISODE, ['state'], ['policy'], get_player),
            #     every_k_epochs=3),
        ] + evaluators,
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )

def get_config(model, algorithm_name):
    logger.auto_set_dir()
    dataset = model.get_data()
    steps_per_epoch = dataset.size()
    lr = symbf.get_scalar_var('learning_rate', 1e-4, summary=True)

    extra_display = ["cost"]
    if algorithm_name != "cosine":
        extra_display = extra_display + ["loss/pos-dist", "loss/neg-dist"]

    return TrainConfig(
        dataflow=dataset,
        model=model(),
        optimizer=tf.train.GradientDescentOptimizer(lr),
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-5), (20, 1e-6)])
        ],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(extra_display),
            StatPrinter()
        ],
        steps_per_epoch=steps_per_epoch,
        max_epoch=20,
    )

def _get_optimizer(self, name):
    from tensorpack.tfutils import optimizer
    from tensorpack.tfutils.gradproc import SummaryGradient, GlobalNormClip, MapGradient
    import tensorpack.tfutils.symbolic_functions as symbf

    init_lr = INIT_LEARNING_RATE_A if name == 'actor' else INIT_LEARNING_RATE_C
    lr = symbf.get_scalar_var('learning_rate/' + name, init_lr, summary=True)
    opt = tf.train.AdamOptimizer(lr)
    logger.info("create opt {}".format(name))
    if name == 'critic':
        gradprocs = [
            MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.05),
                        regex='^critic/.*')
        ]
    elif name == 'actor':
        gradprocs = [
            MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1),
                        regex='^actor/.*')
        ]
    else:
        raise ValueError("unknown optimizer name: {}".format(name))
    gradprocs.append(SummaryGradient())
    opt = optimizer.apply_grad_processors(opt, gradprocs)
    return opt

def get_config():
    logger.auto_set_dir()
    M = Model()
    master = MySimulatorMaster(namec2s, names2c, M)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    lr = symbf.get_scalar_var('learning_rate', 0.0001, summary=True)
    return TrainConfig(
        dataset=dataflow,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(), ModelSaver(),
            HumanHyperParamSetter('learning_rate', 'hyper.txt'),
            HumanHyperParamSetter('entropy_beta', 'hyper.txt'),
            HumanHyperParamSetter('explore_factor', 'hyper.txt'),
            master,
            StartProcOrThread(master),
            # PeriodicCallback(Evaluator(EVAL_EPISODE, ['state'], ['logits']), 1),
            GlobalStepSetter(),
        ]),
        session_config=get_default_sess_config(0.5),
        model=M,
        step_per_epoch=STEP_PER_EPOCH,
        max_epoch=1000,
    )

def get_config(cifar_classnum):
    logger.auto_set_dir()

    # prepare dataset
    dataset_train = get_data('train', cifar_classnum)
    step_per_epoch = dataset_train.size()
    dataset_test = get_data('test', cifar_classnum)

    sess_config = get_default_sess_config(0.5)
    lr = symbf.get_scalar_var('learning_rate', 1e-2)

    def lr_func(lr):
        if lr < 3e-5:
            raise StopTraining()
        return lr * 0.31

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(), ModelSaver(),
            InferenceRunner(dataset_test, ClassificationError()),
            StatMonitorParamSetter('learning_rate', 'val_error', lr_func,
                                   threshold=0.001, last_k=10),
        ]),
        session_config=sess_config,
        model=Model(cifar_classnum),
        step_per_epoch=step_per_epoch,
        max_epoch=150,
    )

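# A small standalone check (not part of the config above) of the schedule implied by
# lr_func: StatMonitorParamSetter calls it roughly whenever 'val_error' has not improved
# by more than 0.001 over the last 10 epochs, multiplying the rate by 0.31, and lr_func
# raises StopTraining once the rate falls below 3e-5. Starting from 1e-2 that allows
# five reductions; the next trigger would stop training.
lr, reductions = 1e-2, 0
while lr >= 3e-5:
    lr *= 0.31
    reductions += 1
print(reductions, lr)  # -> 5 reductions, final rate ~2.9e-05
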
def get_config():
    logger.auto_set_dir()
    M = Model()

    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, M)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    lr = symbf.get_scalar_var('learning_rate', 0.001, summary=True)
    return TrainConfig(
        dataset=dataflow,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(), ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(80, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            ScheduledHyperParamSetter('explore_factor', [(80, 2), (100, 3), (120, 4), (140, 5)]),
            master,
            StartProcOrThread(master),
            PeriodicCallback(Evaluator(EVAL_EPISODE, ['state'], ['logits']), 2),
        ]),
        session_config=get_default_sess_config(0.5),
        model=M,
        step_per_epoch=STEP_PER_EPOCH,
        max_epoch=1000,
    )

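# A minimal sketch of how a config like the one above would be launched, assuming the
# older tensorpack API used throughout these snippets, where a trainer takes the
# TrainConfig in its constructor. The trainer choice and checkpoint path are
# illustrative only, not part of the original code.
if __name__ == '__main__':
    config = get_config()
    # config.session_init = SaverRestore('/path/to/checkpoint')  # optional: resume training
    QueueInputTrainer(config).train()
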
def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 5e-4, summary=True)
    opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
    return optimizer.apply_grad_processors(opt, [
        gradproc.ScaleGradient(('STN.*', 0.1)),
        gradproc.SummaryGradient()
    ])

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 0.001, summary=True)
    opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
    gradprocs = [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
                 SummaryGradient()]
    opt = optimizer.apply_grad_processors(opt, gradprocs)
    return opt

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 0.003, summary=True)
    factor = get_batch_factor()
    if factor != 1:
        lr = lr / float(factor)
        opt = tf.train.MomentumOptimizer(lr, 0.9)
        opt = optimizer.AccumGradOptimizer(opt, factor)
    else:
        opt = tf.train.MomentumOptimizer(lr, 0.9)
    # double the gradient of bias variables
    return optimizer.apply_grad_processors(
        opt, [gradproc.ScaleGradient(('.*/b', 2))])

def get_config():
    logger.auto_set_dir()
    dataset = get_data()
    lr = symbf.get_scalar_var('learning_rate', 2e-4, summary=True)
    return TrainConfig(
        dataflow=dataset,
        optimizer=tf.train.AdamOptimizer(lr, beta1=0.5, epsilon=1e-6),
        callbacks=[ModelSaver()],
        session_config=get_default_sess_config(0.5),
        model=Model(),
        steps_per_epoch=500,
        max_epoch=100,
    )

def _get_opt(name, init_lr):
    lr = symbf.get_scalar_var('learning_rate/' + name, init_lr, summary=True)
    opt = tf.train.AdamOptimizer(lr)
    logger.info("create opt {}".format(name))
    gradprocs = [
        # MapGradient(lambda grad: tf.Print(grad, [grad], 'grad {}='.format(grad.op.name), summarize=4)),
        MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1), regex='^actor/.*'),
        MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.05), regex='^critic/.*'),
        # GlobalNormClip(40.),
        SummaryGradient(),
    ]
    opt = optimizer.apply_grad_processors(opt, gradprocs)
    return opt

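# Hypothetical usage of the helper above: one Adam optimizer per sub-network, each with
# its own 'learning_rate/<name>' scalar variable. The initial rates here are example
# values only. Both regex-filtered clippers are attached to each optimizer, but each
# only touches the variables whose names match its regex.
opt_actor = _get_opt('actor', 1e-4)
opt_critic = _get_opt('critic', 3e-4)
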
def get_config(model):
    logger.auto_set_dir()
    dataset = model.get_data()
    steps_per_epoch = dataset.size()
    lr = symbf.get_scalar_var('learning_rate', 1e-4, summary=True)
    return TrainConfig(
        dataflow=dataset,
        model=model(),
        optimizer=tf.train.GradientDescentOptimizer(lr),
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-5), (20, 1e-6)])
        ],
        steps_per_epoch=steps_per_epoch,
        max_epoch=20,
    )

def get_config():
    logger.auto_set_dir()
    dataset_train, dataset_test = get_data(True), get_data(False)
    step_per_epoch = dataset_train.size() * 5
    lr = symbf.get_scalar_var('learning_rate', 5e-4, summary=True)
    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(), ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
        ]),
        session_config=get_default_sess_config(0.5),
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=500,
    )

def get_config():
    logger.auto_set_dir()
    M = Model()

    dataset_train = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(train=True),
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        exploration=INIT_EXPLORATION,
        end_exploration=END_EXPLORATION,
        exploration_epoch_anneal=EXPLORATION_EPOCH_ANNEAL,
        update_frequency=4,
        reward_clip=(-1, 1),
        history_len=FRAME_HISTORY)

    lr = symbf.get_scalar_var('learning_rate', 1e-3, summary=True)
    return TrainConfig(
        dataflow=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(), ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(150, 4e-4), (250, 1e-4), (350, 5e-5)]),
            RunOp(lambda: M.update_target_param()),
            dataset_train,
            PeriodicCallback(Evaluator(EVAL_EPISODE, ['state'], ['Qvalue']), 3),
            # HumanHyperParamSetter('learning_rate', 'hyper.txt'),
            # HumanHyperParamSetter(ObjAttrParam(dataset_train, 'exploration'), 'hyper.txt'),
        ]),
        # save memory for multiprocess evaluator
        session_config=get_default_sess_config(0.6),
        model=M,
        step_per_epoch=STEP_PER_EPOCH,
    )

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 1e-4, summary=True)
    return tf.train.GradientDescentOptimizer(lr)

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 1e-2, summary=True)
    return tf.train.AdamOptimizer(lr, epsilon=1e-3)

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 0.003, summary=True)
    opt = tf.train.MomentumOptimizer(lr, 0.9)
    return optimizer.apply_grad_processors(
        opt, [gradproc.ScaleGradient(('.*/b', 2))])

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 2e-4, summary=True)
    opt = tf.train.AdamOptimizer(lr, beta1=0.5, epsilon=1e-6)
    # generator learns 5 times faster
    return optimizer.apply_grad_processors(
        opt, [gradproc.ScaleGradient(('gen/.*', 5), log=True)])

def _get_optimizer(self):
    lr = symbolic_functions.get_scalar_var('learning_rate', 2e-3, summary=True)
    opt = tf.train.AdamOptimizer(lr)
    return optimizer.apply_grad_processors(opt, [GlobalNormClip(5)])

def _setup(self):
    from tensorpack.tfutils import symbolic_functions
    self._v_epoch_num = symbolic_functions.get_scalar_var('epoch_num', 0, summary=True)
    import multiprocessing as mp
    self._epoch_shared = mp.Value('i', 0)

    super(SyncMultiGPUTrainerParameterServer, self)._setup()
    raw_devices = ['/device:GPU:{}'.format(k) for k in self.config.tower]
    # raw_devices = ['/gpu:{}'.format(k) for k in self.config.tower]
    if self._ps_device == 'gpu':
        devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
    else:
        devices = [
            tf.train.replica_device_setter(
                worker_device=d, ps_device='/cpu:0', ps_tasks=1)
            for d in raw_devices
        ]

    from ..model.base import ModelBase
    model = self.model  # type: ModelBase
    assert isinstance(model, ModelBase)
    logger.info("Building graph ...")
    model.build_graph(None)

    from tensorpack.callbacks.summary import MergeAllSummaries_RunWithOp, MovingAverageSummary
    train_ops_main = []
    train_ops_aux = {}
    for lname, loss in model._losses.items():
        logger.info("Building opt for {} loss {} ...".format(
            'main' if loss._isMainLoss else 'aux ', lname))
        opt = model.get_optimizer() if loss._opt is None else loss._opt
        grads_array = []
        for l in loss._losses:
            grads = opt.compute_gradients(
                l,
                gate_gradients=tf.train.Optimizer.GATE_NONE,
                colocate_gradients_with_ops=True)
            grads = [(g, v) for g, v in grads if g is not None]
            grads_array.append(grads)
        grads = self._average_grads(grads_array)
        train_op = opt.apply_gradients(grads)

        summary_callbacks = []
        if isinstance(loss._summary_collection, str):
            c_vars = tf.get_collection(loss._summary_collection + '-ema_op')
            if len(c_vars) > 0:
                summary_callbacks.append(
                    MovingAverageSummary(loss._summary_collection + '-ema_op'))
            summary_callbacks.append(
                MergeAllSummaries_RunWithOp(0, loss._summary_collection))

        if loss._isMainLoss:
            train_ops_main.append(train_op)
            for c in summary_callbacks:
                self.register_callback(c)
            if loss._tensor_io:
                loss._tensor_io._is_main = True
                self.register_callback(loss._tensor_io)
        elif loss._trainOpGroup is not None:
            if loss._trainOpGroup not in train_ops_aux:
                train_ops_aux[loss._trainOpGroup] = _AuxTrainOp(loss._trainOpGroup)
            auxTrainOp = train_ops_aux[loss._trainOpGroup]
            auxTrainOp._train_ops.append(train_op)
            auxTrainOp._callbacks += summary_callbacks
            if loss._tensor_io:
                auxTrainOp._callbacks.append(loss._tensor_io)
        else:
            auxTrainOp = _AuxTrainOp(lname)
            auxTrainOp._train_ops = [train_op]
            auxTrainOp._callbacks += summary_callbacks
            if loss._tensor_io:
                auxTrainOp._callbacks.append(loss._tensor_io)
            train_ops_aux[lname] = auxTrainOp

    for n, auxTrainOp in train_ops_aux.items():
        assert len(auxTrainOp._train_ops) > 0
        auxTrainOp._train_op = tf.group(*auxTrainOp._train_ops, name=n + '/train_op')
        for c in auxTrainOp._callbacks:
            c.setup_graph(self)

    # for rname, rop in model._run_ops.items():
    #     train_ops_aux.append(tf.group(*rop._run_ops, name=rname + '/run-op'))

    var_lists = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
    var_lists[:] = [v for v in var_lists if not v.name.startswith('evaluate/')]
    self.train_op = tf.group(*train_ops_main, name='train_op')
    self._train_ops_aux = train_ops_aux

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 1e-3, summary=True)
    opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
    return optimizer.apply_grad_processors(
        opt, [gradproc.GlobalNormClip(10), gradproc.SummaryGradient()])

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 0.1, summary=True)
    return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 3e-5, summary=True)
    opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
    return optimizer.apply_grad_processors(opt, [
        gradproc.ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])
    ])

def _get_optimizer(self):
    lr = symbf.get_scalar_var('learning_rate', 2e-4, summary=True)
    return tf.train.AdamOptimizer(lr, beta1=0.5, epsilon=1e-3)

def _build_ad_nn(self, tensor_io):
    from drlutils.dataflow.tensor_io import TensorIO
    assert isinstance(tensor_io, TensorIO)
    from drlutils.model.base import get_current_nn_context
    from tensorpack.tfutils.common import get_global_step_var
    import tensorpack.tfutils.symbolic_functions as symbf

    global_step = get_global_step_var()
    nnc = get_current_nn_context()
    is_training = nnc.is_training
    i_state = tensor_io.getInputTensor('state')
    i_agentIdent = tensor_io.getInputTensor('agentIdent')
    i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
    i_resetRNN = tensor_io.getInputTensor('resetRNN')
    l = i_state
    # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
    # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
    # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
    # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')

    with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

        def _get_cell():
            cell = tf.nn.rnn_cell.BasicLSTMCell(256)
            # if is_training:
            #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
            return cell

        cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
        rnn_outputs = self._buildRNN(
            l, cell, tensor_io.batchSize,
            i_agentIdent=i_agentIdent,
            i_sequenceLength=i_sequenceLength,
            i_resetRNN=i_resetRNN,
        )
        rnn_outputs = tf.reshape(
            rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
        l = rnn_outputs
        from ad_cur.autodrive.model.selu import fc_selu
        for lidx in range(2):
            l = fc_selu(
                l, 200,
                keep_prob=1.,  # we train on sensor input only, so no key information may be dropped
                is_training=is_training,
                name='fc-{}'.format(lidx))
        value = tf.layers.dense(l, 1, name='fc-value')
        value = tf.squeeze(value, [1], name="value")
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
        l = tf.stop_gradient(l)
        mu_steering = 1. * tf.layers.dense(
            l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(l, 1, activation=tf.nn.tanh, name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)
        # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
        # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
        # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)

        def saturating_sigmoid(x):
            """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
            with tf.name_scope("saturating_sigmoid", [x]):
                y = tf.sigmoid(x)
                return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

        sigma_steering_ = 1. * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 1. * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')
        sigma_beta_steering = symbf.get_scalar_var(
            'sigma_beta_steering', 0.3, summary=True, trainable=False)
        sigma_beta_accel = symbf.get_scalar_var(
            'sigma_beta_accel', 0.3, summary=True, trainable=False)
        if not nnc.is_evaluating:
            pass
            # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
            # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
        else:
            sigma_beta_steering = tf.constant(1e-4)
            sigma_beta_accel = tf.constant(1e-4)
        sigma_steering = (sigma_steering_ + sigma_beta_steering)
        sigma_accel = (sigma_accel_ + sigma_beta_accel)  # * 0.1
        # sigma_steering = tf.minimum(sigma_steering_ + sigma_beta_steering, 0.5)
        # sigma_accel = tf.minimum(sigma_accel_ + sigma_beta_accel, 0.2)
        # sigma_steering = sigma_steering_
        # sigma_accel = sigma_accel_
        sigmas = tf.clip_by_value(
            tf.concat([sigma_steering, sigma_accel], axis=-1), 0., 1.)
        # sigma_steering = tf.clip_by_value(sigma_steering, 0.1, 0.5)
        # sigma_accel = tf.clip_by_value(sigma_accel, 0.1, 0.5)
        # sigmas = sigmas_orig + 0.001
        # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
        # sigma_beta = tf.get_variable('sigma_beta', shape=[], dtype=tf.float32,
        #                              initializer=tf.constant_initializer(.5), trainable=False)
        # if is_training:
        #     pass
        #     # Without sigma_beta, convergence is slow and unstable. Likely reasons:
        #     # 1. Exploring as widely as possible early in training keeps the network
        #     #    from getting stuck in a local optimum.
        #     # 2. A too-small sigma early on makes the log_prob of the normal
        #     #    distribution too large, so gradient updates become too large and the
        #     #    network is deformed from the start and hard to recover.
        # if is_training:
        #     sigmas += sigma_beta_steering
        # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
        # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
        # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
        # sigmas_orig = sigmas
        # sigmas = sigmas + sigma_beta_steering
        # sigmas = tf.minimum(sigmas + 0.1, 100)
        # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
        # sigma_steering += sigma_beta_steering
        # sigma_accel += sigma_beta_accel
        # mus = tf.concat([mu_steering, mu_accel], axis=-1)

        from tensorflow.contrib.distributions import Normal
        dists = Normal(mus, sigmas)
        policy = tf.squeeze(dists.sample([1]), [0])
        # clip the sampled action to within two standard deviations of the mean
        policy = tf.clip_by_value(policy, mus - 2 * sigmas, mus + 2 * sigmas)
        if is_training:
            self._addMovingSummary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                sigma_beta_accel,
                sigma_beta_steering,
            )
        # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
        #                    'mu/sigma/sigma.orig/act=', summarize=4)
        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    if not is_training:
        tensor_io.setOutputTensors(policy, value, mus, sigmas)
        return

    i_actions = tensor_io.getInputTensor("action")
    i_actions = tf.reshape(i_actions, [-1] + i_actions.get_shape().as_list()[2:])
    log_probs = dists.log_prob(i_actions)
    # exp_v = tf.transpose(
    #     tf.multiply(tf.transpose(log_probs), advantage))
    # exp_v = tf.multiply(log_probs, advantage)
    i_advantage = tensor_io.getInputTensor("advantage")
    i_advantage = tf.reshape(i_advantage, [-1] + i_advantage.get_shape().as_list()[2:])
    exp_v = log_probs * tf.expand_dims(i_advantage, -1)
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    exp_v = entropy_beta * entropy + exp_v
    loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy')

    i_futurereward = tensor_io.getInputTensor("futurereward")
    i_futurereward = tf.reshape(i_futurereward,
                                [-1] + i_futurereward.get_shape().as_list()[2:])
    loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))
    loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')

    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')
    # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])

    self._addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(value, name='predict_reward')
    advantage = symbf.rms(i_advantage, name='rms_advantage')
    self._addMovingSummary(
        loss_policy, loss_value, loss_entropy, pred_reward, advantage,
        loss_l2_regularizer,
        tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
        tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
    )
    return loss_policy, loss_value

def _get_optimizer(self):
    lr = symbolic_functions.get_scalar_var('learning_rate', 2e-4, summary=True)
    return tf.train.AdamOptimizer(lr, beta1=0.5, epsilon=1e-3)