def __call__(self, inputs, index, initial_state=None, recurrent_weights_initializer=None):
  """
  :param tf.Tensor inputs: shape (time,batch,n_hidden)
  :param tf.Tensor index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :param ()->tf.Tensor recurrent_weights_initializer:
  :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
  :rtype: (tf.Tensor, tf.Tensor)
  """
  W = tf.get_variable(
    name="W_re", shape=(self.n_hidden, self.n_hidden * 4), initializer=recurrent_weights_initializer)
  TFUtil.set_param_axes_split_info(W, [[self.n_hidden], [self.n_hidden] * 4])
  if self.rec_weight_dropout:
    from TFUtil import dropout
    W = dropout(
      W, keep_prob=1.0 - self.rec_weight_dropout, cond_on_train=True, seed=TFUtil.get_random_seed())
  out, _, _, final_cell_state = self.op(
    *self.map_layer_inputs_to_op(X=inputs, W=W, i=index, initial_state=initial_state))
  from tensorflow.python.ops.nn import rnn_cell
  return out, rnn_cell.LSTMStateTuple(h=out[-1], c=final_cell_state)
def __call__(self, inputs, index, initial_state=None, recurrent_weights_initializer=None):
  """
  :param tf.Tensor inputs: shape (time,batch,n_input_dim)
  :param tf.Tensor index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :param ()->tf.Tensor recurrent_weights_initializer:
  :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
  :rtype: (tf.Tensor, tf.Tensor)
  """
  W = tf.get_variable(
    name="W", shape=(self.n_input_dim + self.n_hidden, self.n_hidden * 4),
    initializer=recurrent_weights_initializer)
  b = tf.get_variable(name="b", shape=(self.n_hidden * 4,), initializer=tf.zeros_initializer())
  TFUtil.set_param_axes_split_info(W, [[self.n_input_dim, self.n_hidden], [self.n_hidden] * 4])
  TFUtil.set_param_axes_split_info(b, [[self.n_hidden] * 4])
  out, _, final_state = self.op(
    *self.map_layer_inputs_to_op(X=inputs, W=W, b=b, i=index, initial_state=initial_state))
  return out, final_state
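# Hedged usage sketch (not from the original source): assuming `cell` is an
# instance of a cell class like the ones above, with `n_input_dim` and
# `n_hidden` already set, the call takes time-major tensors and returns
# time-major outputs plus the final state. Placeholder names and sizes here
# are illustrative only.
def _example_cell_call(cell, n_input_dim=40):
  x = tf.placeholder(tf.float32, shape=(None, None, n_input_dim))  # (time, batch, n_input_dim)
  index = tf.placeholder(tf.float32, shape=(None, None))  # (time, batch), per-frame sequence mask
  out, final_state = cell(inputs=x, index=index)
  return out, final_state  # out: (time, batch, n_hidden)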
def have_blocksparse_requirements():
  import TFUtil
  if not TFUtil.is_gpu_available():
    return False
  min_compute_capability = TFUtil.get_available_gpu_min_compute_capability()
  if min_compute_capability < 3.5:
    return False
  return True
def __call__(self, inputs, index, initial_state=None, recurrent_weights_initializer=None):
  """
  :param tf.Tensor inputs: shape (time,batch,n_hidden*4)
  :param tf.Tensor index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :param ()->tf.Tensor recurrent_weights_initializer:
  :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
  :rtype: (tf.Tensor, tf.Tensor)
  """
  from tensorflow.python.ops.nn import rnn_cell
  W = tf.get_variable(
    name="W_re", shape=(self.n_hidden, self.n_hidden * 4), initializer=recurrent_weights_initializer)
  TFUtil.set_param_axes_split_info(W, [[self.n_hidden], [self.n_hidden] * 4])
  if self.rec_weight_dropout:
    from TFUtil import dropout
    W = dropout(
      W, keep_prob=1.0 - self.rec_weight_dropout, cond_on_train=True, seed=TFUtil.get_random_seed())
  inputs.set_shape(tf.TensorShape([None, None, self.n_hidden * 4]))
  W.set_shape(tf.TensorShape([self.n_hidden, self.n_hidden * 4]))
  index.set_shape(tf.TensorShape([None, None]))
  from TFUtil import to_float32
  index = to_float32(index)
  n_batch = tf.shape(inputs)[1]
  if initial_state is None:
    c0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_c")
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
    c0 = initial_state.c
    y0 = initial_state.h
  else:
    c0 = initial_state
    y0 = tf.zeros((n_batch, self.n_hidden), dtype=tf.float32, name="initial_h")
  start = tf.constant(0, name="start")
  step = tf.constant(self.step or 1, name="step")
  out, _, _, final_cell_state = self.op(inputs, W, y0, c0, index, start, step)
  if out.get_shape().as_list()[0] is None or out.get_shape().as_list()[0] > 0:
    final_output = out[-1]
  else:
    final_output = y0
  return out, rnn_cell.LSTMStateTuple(h=final_output, c=final_cell_state)
def init_blocksparse():
  import TFUtil
  assert TFUtil.is_gpu_available(), "we currently need a GPU"
  min_compute_capability = TFUtil.get_available_gpu_min_compute_capability()
  assert min_compute_capability and min_compute_capability >= 3.5, "we need at least compute capability 3.5"
  path = os.path.dirname(__file__) + "/extern/blocksparse"
  assert os.path.exists(path), "maybe submodule not checked out?"
  import sys
  if path not in sys.path:
    # At the beginning, to make sure we find it first.
    sys.path.insert(0, path)
  # test it
  from blocksparse import op_module
  op_module.get_module()
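# Hedged sketch (assumption, not from the source): how the two helpers above
# would typically be combined by a caller before constructing any blocksparse
# ops, so that the hard asserts in init_blocksparse() are only hit when the
# requirements check already passed.
def _maybe_init_blocksparse():
  if not have_blocksparse_requirements():
    raise NotImplementedError("blocksparse requires a GPU with compute capability >= 3.5")
  init_blocksparse()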
def __call__(self, inputs, index, initial_state=None, recurrent_weights_initializer=None):
  """
  :param tf.Tensor inputs: shape (time,batch,n_hidden*4)
  :param tf.Tensor index: shape (time,batch)
  :param tf.Tensor|None initial_state: shape (batch,n_hidden)
  :param ()->tf.Tensor recurrent_weights_initializer:
  :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
  :rtype: (tf.Tensor, tf.Tensor)
  """
  W_re = tf.get_variable(
    name="W_re", shape=(self.n_hidden, self.n_hidden * 4), initializer=recurrent_weights_initializer)
  TFUtil.set_param_axes_split_info(W_re, [[self.n_hidden], [self.n_hidden] * 4])
  out, _, final_state = self.op(
    *self.map_layer_inputs_to_op(Z=inputs, V_h=W_re, i=index, initial_state=initial_state))
  return out, final_state
def test(self, model_save_path, check_point, use_gpu, gpu_id=None):
    # build network
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.device(device_str):
        self._init_acn()
    # initialize all variables from the model
    self.load(model_save_path, check_point)
    # start new episode
    episode_idx = 0
    total_rewards = 0
    state = self._request_new_episode()
    # perform testing
    while episode_idx <= ACConfig.max_iterations:
        # sample and perform action, store history
        action = self._select_action(state, episode_idx, test_mode=True)
        next_state, reward, done = self._perform_action(state, action)
        total_rewards += reward
        state = next_state
        if done:
            episode_idx += 1
            print('total_reward received: {0}'.format(total_rewards))
            self._history_buffer.clean_up()
            state = self._request_new_episode()
            total_rewards = 0
def test(self, check_point, use_gpu, gpu_id=None):
    assert len(envs) == 1
    env = envs[0]
    history_buffer = A3CUtil.HistoryBuffer()
    # build network
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.device(device_str):
        #self._init_network()
        # initialize all variables from the model
        self.load(self._model_save_path, check_point)
    # start new episode
    episode_idx = 0
    total_rewards = 0
    state, target = self._request_new_episode(env)
    # perform testing
    while episode_idx <= A3CConfig.max_iterations:
        # sample and perform action, store history
        action = self._select_action(env, state, target, test_mode=True)
        next_state, reward, done = self._perform_action(env, state, target, action, history_buffer)
        total_rewards += reward
        state = next_state
        if done:
            episode_idx += 1
            print('total_reward received: {0}'.format(total_rewards))
            history_buffer.clean_up()
            state = self._request_new_episode(env)
            total_rewards = 0
def get_consumer_device(self):
  """
  :return: e.g. "/device:GPU:0"
  :rtype: str
  """
  # TODO this is probably incomplete
  import TFUtil
  if TFUtil.is_gpu_available():
    return "/device:GPU:0"
  return "/device:CPU:0"
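# Hedged usage note (assumption, standard TF usage): the returned device
# string is meant to be used as a tf.device() context, e.g.:
#   with tf.device(self.get_consumer_device()):
#     y = tf.matmul(x, w)  # placed on GPU 0 if available, else CPU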
def __init__(self):
  self.hyps = TFCompat.v1.placeholder(tf.string, [None])
  self.refs = TFCompat.v1.placeholder(tf.string, [None])
  self.wer, self.ref_num_words = TFUtil.string_words_calc_wer(hyps=self.hyps, refs=self.refs)
  self.total_wer_var = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)
  self.total_ref_num_words_var = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)
  self.update_total_wer = self.total_wer_var.assign_add(tf.reduce_sum(self.wer))
  self.update_ref_num_words = self.total_ref_num_words_var.assign_add(tf.reduce_sum(self.ref_num_words))
  self.updated_normalized_wer = (
    tf.cast(self.update_total_wer, tf.float32) / tf.cast(self.update_ref_num_words, tf.float32))
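# Hedged usage sketch for the WER computer above (the class name is assumed;
# only its __init__ is shown here): feed batches of hypothesis/reference
# strings, and fetch updated_normalized_wer, which both updates the running
# totals and returns the normalized WER over everything fed so far.
#   wer_computer = WerComputer()  # hypothetical name
#   with tf.Session() as session:
#     session.run(tf.global_variables_initializer())
#     wer = session.run(
#       wer_computer.updated_normalized_wer,
#       feed_dict={wer_computer.hyps: ["a b c"], wer_computer.refs: ["a c c"]})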
def learn(self, check_point, use_gpu, gpu_id=None):
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.Graph().as_default():
        with tf.device(device_str):
            config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
            config.gpu_options.allow_growth = True
            with tf.Session(config=config) as sess:
                # initialize a global agent
                self._global_agent = A3CAgent(sess=sess, scope="global", feature_mode=self._feature_mode)
                self._global_agent._init_network(check_point=check_point)
                self._global_agent.create_global_summary_ops()
                # start training
                # create learner threads
                learner_threads = []
                for i in xrange(self._num_agents):
                    local_scope = "agent_{0:03d}".format(i)
                    local_agent = A3CAgent(sess=sess, scope=local_scope, feature_mode=self._feature_mode)
                    local_agent._init_network()
                    learner_threads.append(
                        threading.Thread(target=local_agent.learn, args=(use_gpu, gpu_id)))
                # initialize all variables
                sess.run(tf.global_variables_initializer())
                # load resnet variables
                A3CAgent.tf_resnet_saver.restore(sess, A3CConfig.resnet_pretrain_model)
                # initialize or load network variables
                if check_point:
                    self._global_agent.load(self._model_save_path, check_point)
                    A3CConfig.num_frames = check_point
                else:
                    A3CConfig.num_frames = 0
                learner_threads.append(
                    threading.Thread(
                        target=self._global_agent.save_model_monitor,
                        args=(A3CConfig.num_frames, self._model_save_path, self._model_save_interval)))
                print('Training started, please open Tensorboard to monitor the training process.')
                for t in learner_threads:
                    t.start()
                for t in learner_threads:
                    t.join()
def _make_mod(self):
  if self.cache_key in self.mod_cache:
    return self.mod_cache[self.cache_key]
  comp = TFUtil.OpCodeCompiler(
    base_name=self.name, code_version=self.description.code_version, code=self._make_code(),
    include_deps=[self.support_native_op_cpp_filename], ld_flags=["-lblas"],
    **dict(self.compiler_opts))
  mod = comp.load_module()
  self.mod_cache[self.cache_key] = mod
  return mod
def learn(self, check_point, use_gpu, gpu_id=None):
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.Graph().as_default():
        with tf.device(device_str):
            # create resnet if no extracted feature
            if not self._feature_mode:
                resnet_saver = tf.train.import_meta_graph(A3CConfig.resnet_meta_graph)
                graph = tf.get_default_graph()
                self._tf_resnet_input = graph.get_tensor_by_name("images:0")
                self._tf_resnet_output = graph.get_tensor_by_name("avg_pool:0")
            # build network
            self._init_network(scope="global")
            self._saver = tf.train.Saver(
                var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global'))
            # create initializer
            init = tf.global_variables_initializer()
            # create auxiliary operations: summary and saver
            self._tf_summary_op = tf.summary.merge_all()
            self._summary_writer = tf.summary.FileWriter(A3CConfig.summary_folder, self._tf_sess.graph)
            # initialize or load network variables
            if check_point is None:
                # initialize all variables
                self._tf_sess.run(init)
                if not self._feature_mode:
                    # load pretrained resnet
                    resnet_saver.restore(self._tf_sess, A3CConfig.resnet_pretrain_model)
                self._iter_idx = 0
            else:
                if not self._feature_mode:
                    # load pretrained resnet
                    resnet_saver.restore(self._tf_sess, A3CConfig.resnet_pretrain_model)
                self.load(self._model_save_path, check_point)
                self._iter_idx = check_point
            # start training
            # create learner threads
            learner_threads = [threading.Thread(target=self.learner_thread, args=(thread_id,))
                               for thread_id in xrange(self._num_threads)]
            for t in learner_threads:
                t.start()
            for t in learner_threads:
                t.join()
            print('Training started, please open Tensorboard to monitor the training process.')
            # Show the agents training and write summary statistics
def grad_wrapper(fwd_op, *bwd_grads):
  """
  :param tf.Operation fwd_op: for fwd_op.inputs and fwd_op.outputs
  :param list[tf.Tensor] bwd_grads:
  :return: list of tensors of gradients for each input
  :rtype: list[tf.Tensor]
  """
  assert len(bwd_grads) == len(fwd_op.outputs)
  grad_inputs = list(fwd_op.inputs) + list(fwd_op.outputs) + list(bwd_grads)
  grad_inputs = self.description._filter_grad_inputs(grad_inputs)
  grad_outputs = TFUtil.make_var_tuple(grad_op(*grad_inputs))
  if grad_description.num_dummy_outs > 0:
    grad_outputs = grad_outputs[:-grad_description.num_dummy_outs]
  grad_outputs = self.description.make_results_of_gradient(grad_outputs)
  return grad_outputs
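# Context sketch (an assumption about the surrounding code, which is not shown
# here): a closure like grad_wrapper above gets registered for the forward
# op's type via tf.RegisterGradient, so tf.gradients() routes through the
# compiled gradient op. A minimal self-contained analogue of that mechanism,
# with a hypothetical op type name:
#
#   from tensorflow.python.framework import ops
#
#   @ops.RegisterGradient("MyForwardOp")  # hypothetical registered op type
#   def _my_forward_op_grad(fwd_op, *bwd_grads):
#     # receives the forward tf.Operation and one gradient per output;
#     # must return one gradient tensor (or None) per forward input
#     return [bwd_grads[0] * fwd_op.inputs[0]]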
def _make_mod(self):
  if self.cache_key in self.mod_cache:
    return self.mod_cache[self.cache_key]
  from Util import find_lib
  # Note about BLAS linkage:
  # TensorFlow (or its Eigen lib) likely has linked against some BLAS lib itself.
  # For our CPU code, we directly call some BLAS functions such as `sgemm_`.
  # On platforms where there is a flat namespace (e.g. Mac),
  # it probably is not needed to explicitly link it again for this module.
  # In other cases, it's probably needed, but it's not so clear which lib has the
  # right symbols (e.g. the `sgemm_` symbol).
  ld_flags = []
  if self.search_for_numpy_blas:
    # Find related Numpy libs.
    # Numpy usually comes with OpenBlas, and Numpy is probably loaded anyway.
    # Even do this before the other libs below, as it is likely
    # that this OpenBlas lib is correctly initialized already.
    import numpy
    numpy_dir = os.path.dirname(numpy.__file__)
    if os.path.exists("%s/.libs" % numpy_dir):
      ld_flags += ["-L%s/.libs" % numpy_dir]
      from glob import glob
      for f in glob("%s/.libs/*.so" % numpy_dir):
        f = os.path.basename(f)
        if f.startswith("lib"):
          f = f[3:]
        if f.endswith(".so"):
          f = f[:-3]
        ld_flags += ["-l%s" % f]
  if self.search_for_system_blas:
    # Try to just link against blas/f77blas
    # (both can potentially have the symbol) if it finds the lib.
    if find_lib("blas"):
      ld_flags += ["-lblas"]
    if find_lib("f77blas"):
      ld_flags += ["-lf77blas"]
  comp = TFUtil.OpCodeCompiler(
    base_name=self.name, code_version=self.description.code_version, code=self._make_code(),
    include_deps=[self.support_native_op_cpp_filename], ld_flags=ld_flags,
    use_cuda_if_available=self.with_cuda,
    **dict(self.compiler_opts))
  mod = comp.load_tf_module()
  self.mod_cache[self.cache_key] = mod
  return mod
def learn(self, check_point, use_gpu, gpu_id=None):
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.Graph().as_default():
        with tf.device(device_str):
            # build network
            self._init_network()
            # initialize all variables
            init = tf.global_variables_initializer()
            # create auxiliary operations: summary and saver
            self._tf_summary_op = tf.summary.merge_all()
            self._summary_writer = tf.summary.FileWriter(A3CConfig.summary_folder, self._tf_sess.graph)
            # initialize or load network variables
            if check_point is None:
                self._tf_sess.run(init)
                self._iter_idx = 0
            else:
                self.load(self._model_save_path, check_point)
                self._iter_idx = check_point
            # start training
            # create learner threads
            learner_threads = [threading.Thread(target=self.learner_thread, args=(thread_id,))
                               for thread_id in xrange(self._num_threads)]
            for t in learner_threads:
                t.start()
            print('Training started, please open Tensorboard to monitor the training process.')
            # Show the agents training and write summary statistics
            """
            last_summary_time = 0
            while True:
                now = time.time()
                if now - last_summary_time > SUMMARY_INTERVAL:
                    summary_str = session.run(summary_op)
                    writer.add_summary(summary_str, float(T))
                    last_summary_time = now
            """
            for t in learner_threads:
                t.join()
def test(self, dueling_dqn, model_load_path, use_gpu, gpu_id, summary_folder):
    # build network
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.device(device_str):
        self._init_dqn(dueling_dqn)
    # initialize all variables from the model
    self.load(model_load_path)
    # start new episode
    self._request_new_episode()
    # perform testing
    for i in range(config.max_iterations):
        # select and perform action
        state = self._get_current_state()
        state = state[np.newaxis]
        target = self._cur_target[np.newaxis]
        Q = self._evaluate_q(state, target)
        a = self._select_action(Q, i, test_mode=True)
        self._perform_action(a)
        if self._env.episode_done():
            print('total_reward received: {0}'.format(self._env.get_total_episode_reward()))
            print('total_steps: {0}'.format(self._env.get_steps_count()))
            self._request_new_episode()
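# Hedged sketch of what a _select_action() like the one used above typically
# does for a DQN (an assumption; the real implementation is not shown here):
# epsilon-greedy over the predicted Q values, with epsilon forced low when
# test_mode is set.
def _example_select_action(Q, epsilon, test_mode=False):
    import numpy as np
    if test_mode:
        epsilon = 0.05  # illustrative test-time exploration rate
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[1])  # explore: random action
    return int(np.argmax(Q[0]))  # exploit: Q has shape (1, num_action) here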
class OpMaker(object):
  """
  https://www.tensorflow.org/versions/master/how_tos/adding_an_op/
  """
  with_cuda = None  # type: None|bool
  # https://github.com/tensorflow/tensorflow/issues/6602
  tf_blas_gemm_workaround = TFUtil.tf_version_tuple() < (1, 5, 0)
  global_lock = RLock()
  mod_cache = {}  # cache_key -> mod
  op_cache = {}  # cache_key -> op

  def __init__(self, description, compiler_opts=None, search_for_numpy_blas=True):
    """
    :param OpDescription description:
    :param dict[str]|None compiler_opts: passed on to OpCodeCompiler as kwargs
    """
    self._cls_init()
    self.description = description
    self.name = description.name
    self.compiler_opts = compiler_opts or {}
    self.search_for_numpy_blas = search_for_numpy_blas

  @classmethod
  def _cls_init(cls):
    if cls.with_cuda is None:
      cls.with_cuda = TFUtil.CudaEnv.get_instance().is_available()
      if cls.with_cuda and cls.tf_blas_gemm_workaround:
        cls._load_cuda_blas_gemm()

  @classmethod
  def cuda_blas_gemm_so_filename(cls):
    from tensorflow.contrib.rnn.python.ops import lstm_ops
    lstm_ops_so = "%s/_lstm_ops.so" % os.path.dirname(lstm_ops.__file__)
    assert os.path.exists(lstm_ops_so)
    return lstm_ops_so

  @classmethod
  def _load_cuda_blas_gemm(cls):
    """
    https://github.com/tensorflow/tensorflow/issues/6602
    As a workaround for TF issue 6602, we link to some functions
    which are implemented in contrib.rnn.kernels.blas_gemm.
    See NativeOp.cpp.
    To make the symbols available in the namespace, load the library now.
    This issue is fixed with TensorFlow 1.5.
    """
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("Load tf.contrib lstm_ops...")
    lstm_ops_so = cls.cuda_blas_gemm_so_filename()
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("Load tf.contrib lstm_ops lib:", lstm_ops_so)
    # Maybe a bit hacky: Just load all symbols into the global namespace.
    from ctypes import RTLD_GLOBAL, CDLL
    CDLL(lstm_ops_so, mode=RTLD_GLOBAL)
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("tf.contrib lstm_ops lib loaded.")

  @property
  def op_name(self):
    return self.name

  @property
  def cache_key(self):
    return self.name

  @property
  def support_native_op_cpp_filename(self):
    my_dir = os.path.abspath(os.path.dirname(__file__) or os.getcwd())
    my_dir = os.path.realpath(my_dir)  # Make canonical path-name.
    support_native_op_cpp_filename = "%s/NativeOp.cpp" % my_dir
    assert os.path.exists(support_native_op_cpp_filename)
    return support_native_op_cpp_filename

  def _make_code(self):
    # In the user code, we assume that we have the following variables:
    #   int n_inputs; int n_outputs;
    #   Ndarray* inputs[n_inputs]; Ndarray** outputs[n_outputs];
    # Reference:
    #   https://www.tensorflow.org/extend/adding_an_op
    #   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/how_tos/adding_an_op/
    #   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_kernel.h
    #   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def_builder.h
    #   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/pad_op.cc
    #   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/debug_ops.h  CopyOp...
    #   http://stackoverflow.com/questions/37565367/designing-an-accumulating-tensorflow-gpu-operator
    # We also include NativeOp.cpp.
    in_info, out_info, _ = NativeOp.NativeOp._resolve_want_inplace_dummy(
      in_info=self.description.in_info, out_info=self.description.out_info)
    out_is_ref = dict()  # output vars which are inplace, out_name -> in_idx
    # want_inplace: output-index which this input should operate on
    # Unlike the Theano variant, we always do it inplace,
    # so the user has to make a copy if this is not the intention.
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:
        out_name = out_info[out_idx]["name"]
        assert out_name not in out_is_ref
        out_is_ref[out_name] = in_idx

    def map_name(v, is_out=False):
      name = v["name"].lower()
      if is_out:
        # Maybe it clashes with some input name. TF doesn't allow the same name.
        if any([v["name"].lower() == name for v in in_info]):
          name = "out_%s" % name
      return name

    def map_type(v, is_out=False):
      t = v.get("dtype", "float32")
      return t

    code_register_op_io = ""
    for v in in_info:
      code_register_op_io += ".Input(\"%s: %s\")\n" % (map_name(v), map_type(v))
    for v in out_info:
      code_register_op_io += ".Output(\"%s: %s\")\n" % (map_name(v, is_out=True), map_type(v, is_out=True))
    code_set_out_shape = ""

    def make_dim_str(c):
      if isinstance(c, tuple):
        in_idx, in_dim = c
        return "c->Dim(c->input(%i), %i)" % (in_idx, in_dim)
      elif isinstance(c, int):
        return str(c)
      else:
        raise Exception("type: %s" % type(c))

    for i, v in enumerate(in_info):
      code_set_out_shape += """
      if(c->Rank(c->input(%(idx)i)) != tensorflow::shape_inference::InferenceContext::kUnknownRank
         && c->Rank(c->input(%(idx)i)) != %(rank)i)
        return errors::InvalidArgument(
          "wrong rank for input (%(idx)i) '%(name)s'. required %(rank)i but got ",
          c->Rank(c->input(%(idx)i)));
      """ % {"idx": i, "rank": v["ndim"], "name": v["name"]}
    for i, v in enumerate(out_info):
      code_set_out_shape += "c->set_output(%i, c->MakeShape({%s}));\n" % (
        i, ", ".join([make_dim_str(c) for c in v["shape"]]))
    code_register_op_io += """
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      if(c->num_inputs() != %(num_inputs)i)
        return errors::InvalidArgument("wrong number of inputs. required %(num_inputs)i but got ", c->num_inputs());
      if(c->num_outputs() != %(num_outputs)i)
        return errors::InvalidArgument("wrong number of outputs. required %(num_outputs)i but got ", c->num_outputs());
      %(code_set_out_shape)s
      return Status::OK();
    })
    """ % {
      "num_inputs": len(in_info),
      "num_outputs": len(out_info),
      "code_set_out_shape": code_set_out_shape}
    code_forward_io = ""
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:
        code_forward_io += "context->forward_ref_input_to_ref_output(%i, %i);\n" % (in_idx, out_idx)
    code_set_io = ""
    for in_idx, v in enumerate(in_info):
      ndim = len(v["shape"])
      code_set_io += """
      OP_REQUIRES(
        context, context->input(%i).dims() == %i,
        errors::InvalidArgument("shape ndim is not %i, got shape ",
                                context->input(%i).shape().DebugString()));
      """ % (in_idx, ndim, ndim, in_idx)
      for axis, d in enumerate(v["shape"]):
        if isinstance(d, int):
          code_set_io += """
      OP_REQUIRES(
        context, context->input(%i).dim_size(%i) == %i,
        errors::InvalidArgument("shape[%i] != %i, got shape ",
                                context->input(%i).shape().DebugString()));
      """ % (in_idx, axis, d, axis, d, in_idx)
    code_set_io += """
    Ndarray* inputs[n_inputs];
    Ndarray** outputs[n_outputs];
    """
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:  # is ref
        # mutable_input if it is a ref-type, i.e. a Variable.
        #code_set_io += "Ndarray mutable_input_%i = context->mutable_input(%i, false);\n" % (in_idx, in_idx)
        #code_set_io += "inputs[%i] = &mutable_input_%i;\n" % (in_idx, in_idx)
        # Maybe we could use a TemporaryVariable or so, but not sure if the gradient will flow through tf.assign().
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/state_ops.cc
        # But a normal tensor is never mutable, thus create a copy of the input now.
        code_set_io += "Ndarray* output_%i = NULL;\n" % (out_idx,)
        cshape = "TensorShape({%s})" % ", ".join(
          ["context->input(%i).dim_size(%i)" % (in_idx, in_dim) for in_dim in range(len(v["shape"]))])
        code_set_io += "OP_REQUIRES_OK(context, context->allocate_output(%i, %s, &output_%i));\n" % (
          out_idx, cshape, out_idx)
        code_set_io += "inputs[%i] = output_%i;\n" % (in_idx, out_idx)
        # We always make a copy for now.
        # I'm not sure if inplace is an option for TF because we don't know if any other operation in the graph
        # wants to access it. Maybe we can check the reference count or so?
        # Some references for inplace operations:
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/inplace_ops.cc
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/strided_slice_op.cc
        code_set_io += "make_copy(context, inputs[%i], &context->input(%i));\n" % (in_idx, in_idx)
      else:  # no ref
        # TODO: if not on GPU but GPU requested, move to GPU first, maybe via allocate_temp?
        code_set_io += "inputs[%i] = const_cast<Ndarray*>(&context->input(%i));\n" % (in_idx, in_idx)
    for out_idx, v in enumerate(out_info):
      out_name = out_info[out_idx]["name"]
      if out_name in out_is_ref:  # is ref on input
        in_idx = out_is_ref[out_name]
        code_set_io += "outputs[%i] = &inputs[%i];\n" % (out_idx, in_idx)
      else:  # no ref
        code_set_io += "Ndarray* output_%i = NULL;\n" % (out_idx,)
        code_set_io += "outputs[%i] = &output_%i;\n" % (out_idx, out_idx)
        cshape = "TensorShape({%s})" % ", ".join(
          ["inputs[%i]->dim_size(%i)" % (in_idx, in_dim) for (in_idx, in_dim) in v["shape"]])
        code_set_io += "OP_REQUIRES_OK(context, context->allocate_output(%i, %s, &output_%i));\n" % (
          out_idx, cshape, out_idx)
        code_set_io += "Ndarray_set_zero(*outputs[%i]);\n" % out_idx
    code_user = self.description.c_fw_code % {"fail": "assert(false);"}
    code_compute = "\n".join([code_forward_io, code_set_io, code_user])
    register_gpu_kernel_opts = ".Device(DEVICE_GPU)\n"
    for v in in_info:
      if v.get("host_memory", False):
        register_gpu_kernel_opts += """.HostMemory("%s")\n""" % map_name(v)
    format_args = {
      "op_name": self.op_name,
      "code_register_op_io": code_register_op_io,
      "code_forward_io": code_forward_io,
      "code_set_io": code_set_io,
      "code_compute": code_compute,
      "user_code_kernels": self.description._reduce_c_extra_support_code(self.description.c_extra_support_code),
      "native_op_cpp_filename": self.support_native_op_cpp_filename,
      "register_gpu_kernel_opts": register_gpu_kernel_opts,
      "n_inputs": len(in_info),
      "n_outputs": len(out_info)}
    code_header = ""
    if self.with_cuda:
      code_header += """
    // For Eigen::GpuDevice.
    #define EIGEN_USE_GPU 1
    """
    code_header += """
    // For Eigen::ThreadPoolDevice.
    #define EIGEN_USE_THREADS 1

    #include "tensorflow/core/framework/op.h"
    #include "tensorflow/core/framework/shape_inference.h"
    #include "tensorflow/core/framework/op_kernel.h"
    #include "tensorflow/core/common_runtime/device.h"
    """
    if self.with_cuda:
      # http://docs.nvidia.com/cuda/cublas
      code_header += """
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <math_constants.h>
    """
      if not self.tf_blas_gemm_workaround:
        # https://github.com/tensorflow/tensorflow/issues/6602 ?
        code_header += '#include "tensorflow/core/platform/stream_executor.h"\n'
    # sgemm
    code_header += """
    typedef float real;
    typedef int integer;
    extern "C" {
      extern int sgemm_(char *transa, char *transb,
        integer *m, integer *n, integer *k,
        const real *alpha, const real *a, integer *lda,
        const real *b, integer *ldb, const real *beta,
        real *c, integer *ldc);
    }
    """
    code_header += """
    using namespace tensorflow;

    #define _ns  // so _ns::something will use the root namespace
    #define TENSORFLOW 1
    #define CUDA 0
    #include "%(native_op_cpp_filename)s"

    static const int n_inputs = %(n_inputs)i, n_outputs = %(n_outputs)i;

    REGISTER_OP("%(op_name)s")
    %(code_register_op_io)s;
    """ % format_args
    if self.description.cpu_support:
      code_cpu_op = """
    %(user_code_kernels)s

    class %(op_name)sOp : public OpKernel {
    public:
      explicit %(op_name)sOp(OpKernelConstruction* context) : OpKernel(context) {}
      void Compute(OpKernelContext* context) override {
        %(code_compute)s
      }
    };

    REGISTER_KERNEL_BUILDER(Name("%(op_name)s").Device(DEVICE_CPU), %(op_name)sOp);
    """ % format_args
    else:
      code_cpu_op = ""
    if self.with_cuda:
      code_gpu_op = """
    namespace _gpu {
      #ifdef _ns
      #undef _ns
      #endif
      namespace _ns = ::_gpu;
      #undef CUDA
      #define CUDA 1
      #undef Ndarray_memcpy
      #undef Ndarray_memset
      #undef Ndarray_sgemm
      #undef DEF_KERNEL
      #undef start_dev_kernel
      #undef assert_cmp
      #undef threadIdx
      #undef blockIdx
      #undef blockDim
      #undef gridDim
      #include "%(native_op_cpp_filename)s"

      %(user_code_kernels)s

      class %(op_name)sGpuOp : public OpKernel {
      public:
        explicit %(op_name)sGpuOp(OpKernelConstruction* context) : OpKernel(context) {}
        void Compute(OpKernelContext* context) override {
          %(code_compute)s
        }
      };

      REGISTER_KERNEL_BUILDER(
        Name("%(op_name)s")
        %(register_gpu_kernel_opts)s,
        %(op_name)sGpuOp);
    }
    """ % format_args
    else:
      code_gpu_op = ""
    return code_header + code_cpu_op + code_gpu_op

  def _make_mod(self):
    if self.cache_key in self.mod_cache:
      return self.mod_cache[self.cache_key]
    from Util import find_lib
    # Note about BLAS linkage:
    # TensorFlow (or its Eigen lib) likely has linked against some BLAS lib itself.
    # For our CPU code, we directly call some BLAS functions such as `sgemm_`.
    # On platforms where there is a flat namespace (e.g. Mac),
    # it probably is not needed to explicitly link it again for this module.
    # In other cases, it's probably needed, but it's not so clear which lib has the
    # right symbols (e.g. the `sgemm_` symbol).
    # The current solution is just to link against blas/f77blas
    # (both can potentially have the symbol) if it finds the lib.
    ld_flags = []
    if find_lib("blas"):
      ld_flags += ["-lblas"]
    if find_lib("f77blas"):
      ld_flags += ["-lf77blas"]
    # Another option to find some BLAS lib.
    if self.search_for_numpy_blas:
      import numpy
      numpy_dir = os.path.dirname(numpy.__file__)
      if os.path.exists("%s/.libs" % numpy_dir):
        ld_flags += ["-L%s/.libs" % numpy_dir]
        from glob import glob
        for f in glob("%s/.libs/*.so" % numpy_dir):
          f = os.path.basename(f)
          if f.startswith("lib"):
            f = f[3:]
          if f.endswith(".so"):
            f = f[:-3]
          ld_flags += ["-l%s" % f]
    comp = TFUtil.OpCodeCompiler(
      base_name=self.name, code_version=self.description.code_version, code=self._make_code(),
      include_deps=[self.support_native_op_cpp_filename], ld_flags=ld_flags,
      use_cuda_if_available=self.with_cuda,
      **dict(self.compiler_opts))
    mod = comp.load_tf_module()
    self.mod_cache[self.cache_key] = mod
    return mod

  def make_op(self):
    with self.global_lock:
      if self.cache_key in self.op_cache:
        return self.op_cache[self.cache_key]
      mod = self._make_mod()
      op = getattr(mod, camel_case_to_snake_case(self.op_name))
      self.op_cache[self.cache_key] = op
      if self.description.is_grad_defined:
        grad_description = self.description.grad()
        grad_op_maker = OpMaker(
          description=grad_description, compiler_opts=self.compiler_opts,
          search_for_numpy_blas=self.search_for_numpy_blas)
        grad_op = grad_op_maker.make_op()
        from tensorflow.python.framework import ops

        def grad_wrapper(fwd_op, *bwd_grads):
          """
          :param tf.Operation fwd_op: for fwd_op.inputs and fwd_op.outputs
          :param list[tf.Tensor] bwd_grads:
          :return: list of tensors of gradients for each input
          :rtype: list[tf.Tensor]
          """
          assert len(bwd_grads) == len(fwd_op.outputs)
          grad_inputs = list(fwd_op.inputs) + list(fwd_op.outputs) + list(bwd_grads)
          grad_inputs = self.description._filter_grad_inputs(grad_inputs)
          grad_outputs = TFUtil.make_var_tuple(grad_op(*grad_inputs))
          if grad_description.num_dummy_outs > 0:
            grad_outputs = grad_outputs[:-grad_description.num_dummy_outs]
          grad_outputs = self.description.make_results_of_gradient(grad_outputs)
          return grad_outputs

        grad_wrapper.__name__ = grad_description.name
        grad_wrapper.grad_op = grad_op
        ops.RegisterGradient(self.name)(grad_wrapper)
        op.grad_wrapper = grad_wrapper
        op.grad_op = grad_op
      return op
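# Hedged usage sketch for OpMaker (the description object below is assumed to
# be one of the NativeOp op descriptions defined elsewhere in this module):
#   maker = OpMaker(description=my_op_description)  # my_op_description is hypothetical
#   op = maker.make_op()  # compiles NativeOp.cpp + user code on first use, then cached
#   y = op(x)             # behaves like a regular TF op; a gradient is registered if defined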
import sys
import os

sys.path += [os.path.dirname(os.path.abspath(__file__)) + "/.."]

from nose.tools import assert_equal, assert_is_instance
import contextlib
import unittest
import numpy.testing
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()

from Config import Config
from TFNetwork import *
from TFNetworkLayer import *
from TFEngine import *
from Log import log
import TFUtil
TFUtil.debug_register_better_repr()
log.initialize(verbosity=[5])


@contextlib.contextmanager
def make_scope():
  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as session:
      yield session


network = {}
_last = "data"
def build_resnet(conv_time_dim):
  # network
  # (also defined by num_inputs & num_outputs)
def build_actor_critic_network(scope, num_action, num_scene):
    # input:
    #   num_action : number of available actions
    #   num_scene  : number of available scenes  ### maybe better to use a list of scene names?
    with tf.variable_scope(scope):
        # get the nodes of input images and output features
        with tf.name_scope('inputs'):
            global_step = tf.placeholder(name='global_step', shape=None, dtype=tf.int32)
            state_placeholder = tf.placeholder(name='state', shape=(None, 2048), dtype=tf.float32)
            target_placeholder = tf.placeholder(name='target', shape=(None, 2048), dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action', shape=(None,), dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value', shape=(None,), dtype=tf.float32)
            """
            advantage_placeholder = tf.placeholder(name='advantage', shape=(None,), dtype=tf.float32)
            """
        # compute embedded feature given the input image feature
        variable_dict = {}
        with tf.variable_scope('shared_layers') as tmp_scope:
            # fc1
            state_flattened = tf.reshape(state_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_state = TFUtil.fc_layer(
                'fc1', state_flattened, input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512, variable_dict=variable_dict)
            tmp_scope.reuse_variables()
            target_flattened = tf.reshape(target_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_target = TFUtil.fc_layer(
                'fc1', target_flattened, input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512, variable_dict=variable_dict)
        with tf.variable_scope('shared_layers'):
            # fc2
            fc2 = TFUtil.fc_layer(
                'fc2', tf.concat((fc1_state, fc1_target), axis=1),
                input_size=1024, num_neron=512, variable_dict=variable_dict)
        # outputs
        policy_logits_dict = {}
        policy_prob_dict = {}
        state_value_dict = {}
        for scene in THORConfig.supported_envs:
            with tf.variable_scope(scene, reuse=False):
                # fc3 shared for policy and value output
                fc3 = TFUtil.fc_layer('fc3', fc2, input_size=512, num_neron=512,
                                      variable_dict=variable_dict)
                # policy output
                policy_logits = TFUtil.fc_layer(
                    'policy_logits', fc3, input_size=512, num_neron=num_action,
                    activation=None, variable_dict=variable_dict)
                policy_probs = tf.nn.softmax(name='policy_probs', logits=policy_logits)
                # value output
                state_value = tf.squeeze(
                    TFUtil.fc_layer('value', fc3, input_size=512, num_neron=1,
                                    activation=None, variable_dict=variable_dict),
                    axis=1)
                # add outputs to the per-scene dicts
                policy_logits_dict[scene] = policy_logits
                policy_prob_dict[scene] = policy_probs
                state_value_dict[scene] = state_value
        local_summaries = []
        with tf.variable_scope('loss'):
            scene_losses = {}
            for scene in THORConfig.supported_envs:
                # policy loss
                log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                    name='log_prob', labels=action_placeholder, logits=policy_logits_dict[scene])
                policy_loss = -tf.reduce_sum(
                    log_prob * tf.stop_gradient(q_value_placeholder - state_value_dict[scene]))
                # entropy regularization term for A3C
                policy_entropy = -0.01 * tf.reduce_sum(
                    policy_prob_dict[scene] * tf.log(tf.clip_by_value(policy_prob_dict[scene], 1e-20, 1)))
                # value_loss
                value_loss = 0.5 * tf.reduce_sum(tf.square(q_value_placeholder - state_value_dict[scene]))
                # need to tweak weight
                scene_losses[scene] = policy_loss + value_loss - policy_entropy
                with tf.name_scope(scene):
                    local_summaries.append(tf.summary.scalar('policy_loss', policy_loss))
                    local_summaries.append(tf.summary.scalar('policy_entropy', policy_entropy))
                    local_summaries.append(tf.summary.scalar('value_loss', value_loss))
        local_summary_op = tf.summary.merge(local_summaries)
        with tf.variable_scope('train_ops', reuse=(scope != "global")):
            train_ops = {}
            # train_op
            # optional: varying learning_rate
            learning_rate = tf.train.polynomial_decay(
                learning_rate=A3CConfig.learning_rate, global_step=global_step,
                decay_steps=A3CConfig.decay_step, end_learning_rate=A3CConfig.end_learning_rate)
            # create optimizer
            #optimizer = tf.train.AdamOptimizer(learning_rate=A3CConfig.learning_rate)
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=learning_rate, decay=A3CConfig.decay_rate,
                epsilon=0.1)  #, momentum=A3CConfig.momentum
            for scene in THORConfig.supported_envs:
                # get local trainable variables and compute gradients
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
                grad_var = optimizer.compute_gradients(scene_losses[scene], var_list=local_vars)
                # optional: gradient clipping
                clipped_grad_var = []
                for grad, var in grad_var:
                    if grad is not None:
                        clipped_grad_var.append((tf.clip_by_value(grad, -40., 40.), var))
                    else:
                        clipped_grad_var.append((None, var))
                grad_var = clipped_grad_var
                # apply gradients to the global variables
                if scope != 'global':
                    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='global')
                    tmp = []
                    for i in xrange(len(grad_var)):
                        tmp.append((grad_var[i][0], global_vars[i]))
                    grad_var = tmp
                train_ops[scene] = optimizer.apply_gradients(grad_var)
        with tf.variable_scope('global', reuse=(scope != "global")):
            reward = tf.get_variable(name='global/reward', shape=(1,), dtype=tf.float32, trainable=False)
            num_step = tf.get_variable(name='global/num_steps', shape=(1,), dtype=tf.int32, trainable=False)
    return (global_step, state_placeholder, target_placeholder, action_placeholder, q_value_placeholder), \
           (policy_prob_dict, state_value_dict), \
           (train_ops, local_summary_op), \
           (reward, num_step)
def build_dqn(num_action, dueling_dqn=False):
    with tf.variable_scope('DQN'):
        # Prediction Network
        with tf.variable_scope('Prediction'):
            pn_variable_dict = {}
            # inference
            pn_states = tf.placeholder(name='input', shape=(None, 84, 84, 4), dtype=tf.float32)
            pn_conv1 = TFUtil.conv_layer('conv1', pn_states, shape=[8, 8, 4, 32], stride=4,
                                         variable_dict=pn_variable_dict)
            pn_conv2 = TFUtil.conv_layer('conv2', pn_conv1, shape=[4, 4, 32, 64], stride=2,
                                         variable_dict=pn_variable_dict)
            pn_conv3 = TFUtil.conv_layer('conv3', pn_conv2, shape=[3, 3, 64, 64], stride=1,
                                         variable_dict=pn_variable_dict)
            pn_conv3_flatten = TFUtil.flatten(pn_conv3, feature_length=(7 * 7 * 64))
            if dueling_dqn:
                pn_fc4_a = TFUtil.fc_layer('fc4_a', pn_conv3_flatten, input_size=(7 * 7 * 64),
                                           num_neron=512, variable_dict=pn_variable_dict)
                pn_value = TFUtil.fc_layer('value', pn_fc4_a, input_size=512, num_neron=1,
                                           activation=None, variable_dict=pn_variable_dict)
                pn_fc4_b = TFUtil.fc_layer('fc4_b', pn_conv3_flatten, input_size=(7 * 7 * 64),
                                           num_neron=512, variable_dict=pn_variable_dict)
                pn_advantage = TFUtil.fc_layer('advantage', pn_fc4_b, input_size=512, num_neron=num_action,
                                               activation=None, variable_dict=pn_variable_dict)
                pn_Q = (pn_advantage - tf.reshape(tf.reduce_mean(pn_advantage, axis=1), (-1, 1))) \
                       + tf.reshape(pn_value, (-1, 1))
            else:
                pn_fc4 = TFUtil.fc_layer('fc4', pn_conv3_flatten, input_size=(7 * 7 * 64),
                                         num_neron=512, variable_dict=pn_variable_dict)
                pn_Q = TFUtil.fc_layer('Q', pn_fc4, input_size=512, num_neron=num_action,
                                       activation=None, variable_dict=pn_variable_dict)
            # loss
            pn_q_target = tf.placeholder(name='q_target', shape=(None,), dtype=tf.float32)
            pn_actions = tf.placeholder(name='action', shape=(None,), dtype=tf.int32)
            pn_actions_one_hot = tf.one_hot(pn_actions, depth=num_action)
            pn_delta = tf.reduce_sum(pn_actions_one_hot * pn_Q, axis=1) - pn_q_target
            pn_loss = tf.reduce_sum(TFUtil.huber_loss(pn_delta)) / DQNConfig.batch_size
            # summary
            summary_pn_loss = tf.summary.scalar('pn_loss', pn_loss)
            summary_averaged_pn_Q = tf.summary.scalar('averaged_pn_Q', tf.reduce_mean(pn_Q))
            # optimizer
            pn_train = tf.train.RMSPropOptimizer(learning_rate=DQNConfig.lr).minimize(pn_loss)
        # Target Network
        with tf.variable_scope('Target'):
            tn_variable_dict = {}
            # inference
            tn_states = tf.placeholder(name='input', shape=(None, 84, 84, 4), dtype=tf.float32)
            tn_conv1 = TFUtil.conv_layer('conv1', tn_states, shape=[8, 8, 4, 32], stride=4,
                                         variable_dict=tn_variable_dict)
            tn_conv2 = TFUtil.conv_layer('conv2', tn_conv1, shape=[4, 4, 32, 64], stride=2,
                                         variable_dict=tn_variable_dict)
            tn_conv3 = TFUtil.conv_layer('conv3', tn_conv2, shape=[3, 3, 64, 64], stride=1,
                                         variable_dict=tn_variable_dict)
            tn_conv3_flatten = TFUtil.flatten(tn_conv3, feature_length=(7 * 7 * 64))
            if dueling_dqn:
                tn_fc4_a = TFUtil.fc_layer('fc4_a', tn_conv3_flatten, input_size=(7 * 7 * 64),
                                           num_neron=512, variable_dict=tn_variable_dict)
                tn_value = TFUtil.fc_layer('value', tn_fc4_a, input_size=512, num_neron=1,
                                           activation=None, variable_dict=tn_variable_dict)
                tn_fc4_b = TFUtil.fc_layer('fc4_b', tn_conv3_flatten, input_size=(7 * 7 * 64),
                                           num_neron=512, variable_dict=tn_variable_dict)
                tn_advantage = TFUtil.fc_layer('advantage', tn_fc4_b, input_size=512, num_neron=num_action,
                                               activation=None, variable_dict=tn_variable_dict)
                tn_Q = (tn_advantage - tf.reshape(tf.reduce_mean(tn_advantage, axis=1), (-1, 1))) \
                       + tf.reshape(tn_value, (-1, 1))
            else:
                tn_fc4 = TFUtil.fc_layer('fc4', tn_conv3_flatten, input_size=(7 * 7 * 64),
                                         num_neron=512, variable_dict=tn_variable_dict)
                tn_Q = TFUtil.fc_layer('Q', tn_fc4, input_size=512, num_neron=num_action,
                                       activation=None, variable_dict=tn_variable_dict)
        # Network Cloning
        with tf.variable_scope('Prediction_to_Target'):
            network_cloning_ops = []
            assert tn_variable_dict.keys() == pn_variable_dict.keys()
            for k in tn_variable_dict.keys():
                network_cloning_ops.append(tf.assign(tn_variable_dict[k], pn_variable_dict[k]))
        # Performance Evaluation
        with tf.variable_scope('performance_evaluation'):
            episode_reward = tf.placeholder(name='episode_reward', shape=(), dtype=tf.float32)
            summary_avg_episode_reward = tf.summary.scalar('episode_reward', episode_reward)
    return (pn_states, pn_Q, pn_loss, pn_actions, pn_q_target, pn_train), \
           (tn_states, tn_Q), \
           network_cloning_ops, \
           (summary_pn_loss, summary_averaged_pn_Q), \
           (episode_reward, summary_avg_episode_reward)
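# Worked sketch of the dueling aggregation used twice above,
# Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)), in plain numpy for clarity
# (illustrative values, not from the source):
def _example_dueling_q():
    import numpy as np
    value = np.array([[2.0]])                 # (batch, 1)
    advantage = np.array([[1.0, -1.0, 0.0]])  # (batch, num_action)
    q = value + (advantage - advantage.mean(axis=1, keepdims=True))
    return q  # [[3., 1., 2.]]; subtracting the mean keeps V and A identifiable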
def build_network(scope, num_action, dueling_dqn):
    with tf.variable_scope(scope):
        with tf.variable_scope('Prediction'):
            with tf.name_scope('inputs'):
                # resnet feature
                pn_state_placeholder = tf.placeholder(
                    name='state', shape=(None, config.num_history_frames, 2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                # target feature
                pn_target_placeholder = tf.placeholder(
                    name='target', shape=(None, config.num_history_frames, 2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                pn_q_target = tf.placeholder(name='q_target', shape=(None,), dtype=tf.float32)
                pn_actions = tf.placeholder(name='action', shape=(None,), dtype=tf.int32)
            # compute embedded feature given the input image feature
            pn_variable_dict = {}
            with tf.variable_scope('shared_layers') as scope:
                # fc1
                state_flattened = tf.reshape(
                    pn_state_placeholder, (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_state = TFUtil.fc_layer(
                    'fc1', state_flattened, input_size=config.num_history_frames * 2048,
                    num_neron=512, variable_dict=pn_variable_dict)  # (n, 512)
                scope.reuse_variables()
                target_flattened = tf.reshape(
                    pn_target_placeholder, (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_target = TFUtil.fc_layer(
                    'fc1', target_flattened, input_size=config.num_history_frames * 2048,
                    num_neron=512, variable_dict=pn_variable_dict)  # (n, 512)
            with tf.variable_scope('shared_layers'):
                # fc2
                fc2 = TFUtil.fc_layer(
                    'fc2', tf.concat((fc1_state, fc1_target), axis=1),  # (n, 1024)
                    input_size=1024, num_neron=512, variable_dict=pn_variable_dict)  # (n, 512)
            # output: copied and modified from DQNNet.py
            if dueling_dqn:
                pn_fc4_a = TFUtil.fc_layer('fc4_a', fc2, input_size=(512), num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_value = TFUtil.fc_layer('value', pn_fc4_a, input_size=512, num_neron=1,
                                           activation=None, variable_dict=pn_variable_dict)
                pn_fc4_b = TFUtil.fc_layer('fc4_b', fc2, input_size=(512), num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_advantage = TFUtil.fc_layer('advantage', pn_fc4_b, input_size=512, num_neron=num_action,
                                               activation=None, variable_dict=pn_variable_dict)
                pn_Q = (pn_advantage - tf.reshape(tf.reduce_mean(pn_advantage, axis=1), (-1, 1))) \
                       + tf.reshape(pn_value, (-1, 1))
            else:
                pn_fc4 = TFUtil.fc_layer('fc4', fc2, input_size=(512), num_neron=512,
                                         variable_dict=pn_variable_dict)
                pn_Q = TFUtil.fc_layer('Q', pn_fc4, input_size=512, num_neron=num_action,
                                       activation=None, variable_dict=pn_variable_dict)
            # loss
            pn_actions_one_hot = tf.one_hot(pn_actions, depth=num_action)
            pn_delta = tf.reduce_sum(pn_actions_one_hot * pn_Q, axis=1) - pn_q_target
            pn_importance_weight = tf.placeholder(name='importance_weight', shape=(None), dtype=tf.float32)
            pn_weighted_delta = tf.multiply(pn_delta, pn_importance_weight)
            pn_loss = tf.reduce_sum(TFUtil.huber_loss(pn_delta)) / config.batch_size
            # summary
            summary_pn_loss = tf.summary.scalar('pn_loss', pn_loss)
            summary_averaged_pn_Q = tf.summary.scalar('averaged_pn_Q', tf.reduce_mean(pn_Q))
            # optimizer
            pn_train = tf.train.RMSPropOptimizer(learning_rate=config.lr).minimize(pn_loss)
        with tf.variable_scope('Target'):
            with tf.name_scope('inputs'):
                # resnet feature
                tn_state_placeholder = tf.placeholder(
                    name='state', shape=(None, config.num_history_frames, 2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                # target feature
                tn_target_placeholder = tf.placeholder(
                    name='target', shape=(None, config.num_history_frames, 2048),  # (n, 4, 2048)
                    dtype=tf.float32)
            # compute embedded feature given the input image feature
            tn_variable_dict = {}
            with tf.variable_scope('shared_layers') as scope:
                # fc1
                state_flattened = tf.reshape(
                    tn_state_placeholder, (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_state = TFUtil.fc_layer(
                    'fc1', state_flattened, input_size=config.num_history_frames * 2048,
                    num_neron=512, variable_dict=tn_variable_dict)  # (n, 512)
                scope.reuse_variables()
                target_flattened = tf.reshape(
                    tn_target_placeholder, (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_target = TFUtil.fc_layer(
                    'fc1', target_flattened, input_size=config.num_history_frames * 2048,
                    num_neron=512, variable_dict=tn_variable_dict)  # (n, 512)
            with tf.variable_scope('shared_layers'):
                # fc2
                fc2 = TFUtil.fc_layer(
                    'fc2', tf.concat((fc1_state, fc1_target), axis=1),  # (n, 1024)
                    input_size=1024, num_neron=512, variable_dict=tn_variable_dict)  # (n, 512)
            # output: copied and modified from DQNNet.py
            if dueling_dqn:
                tn_fc4_a = TFUtil.fc_layer('fc4_a', fc2, input_size=(512), num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_value = TFUtil.fc_layer('value', tn_fc4_a, input_size=512, num_neron=1,
                                           activation=None, variable_dict=tn_variable_dict)
                tn_fc4_b = TFUtil.fc_layer('fc4_b', fc2, input_size=(512), num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_advantage = TFUtil.fc_layer('advantage', tn_fc4_b, input_size=512, num_neron=num_action,
                                               activation=None, variable_dict=tn_variable_dict)
                tn_Q = (tn_advantage - tf.reshape(tf.reduce_mean(tn_advantage, axis=1), (-1, 1))) \
                       + tf.reshape(tn_value, (-1, 1))
            else:
                tn_fc4 = TFUtil.fc_layer('fc4', fc2, input_size=(512), num_neron=512,
                                         variable_dict=tn_variable_dict)
                tn_Q = TFUtil.fc_layer('Q', tn_fc4, input_size=512, num_neron=num_action,
                                       activation=None, variable_dict=tn_variable_dict)
        # Network Cloning
        with tf.variable_scope('Prediction_to_Target'):
            network_cloning_ops = []
            assert tn_variable_dict.keys() == pn_variable_dict.keys()
            for k in tn_variable_dict.keys():
                network_cloning_ops.append(tf.assign(tn_variable_dict[k], pn_variable_dict[k]))
        # Performance Evaluation
        with tf.variable_scope('performance_evaluation'):
            episode_reward = tf.placeholder(name='episode_reward', shape=(), dtype=tf.int32)
            summary_avg_episode_reward = tf.summary.scalar('episode_reward', episode_reward)
            episode_steps = tf.placeholder(name='episode_steps', shape=(), dtype=tf.int32)
            summary_avg_episode_steps = tf.summary.scalar('episode_steps', episode_steps)
    return (pn_state_placeholder, pn_target_placeholder, pn_Q, pn_loss, pn_actions, pn_q_target,
            pn_train, pn_importance_weight, pn_delta), \
           (tn_state_placeholder, tn_target_placeholder, tn_Q), \
           network_cloning_ops, \
           (summary_pn_loss, summary_averaged_pn_Q), \
           (episode_reward, summary_avg_episode_reward, episode_steps, summary_avg_episode_steps)
def learn(self, double_dqn, dueling_dqn, model_save_frequency, model_save_path, model_load_path,
          use_gpu, gpu_id, summary_folder):
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.device(device_str):
        self._init_dqn(dueling_dqn)
    # initialize all variables
    init = tf.global_variables_initializer()
    # create auxiliary operations: summary and saver
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(summary_folder)
    # initialize
    if model_load_path is None:
        self._tf_sess.run(init)
    else:
        self.load(model_load_path)
    # start new episode
    self._request_new_episode()
    # first take some random actions to populate the replay memory before learning starts
    if model_load_path is None:
        print('Taking random actions to warm up...')
        for i in range(config.replay_start_size):
            if i % 100 == 0:
                print('{0}/{1}'.format(i, config.replay_start_size))
            done = self._perform_random_action()
            if done:
                self._request_new_episode()
    print('Training started, please open Tensorboard to monitor the training process.')
    episode_count = 0
    for i in range(config.max_iterations):
        if i % 1000 == 0:
            print('{0}/{1}'.format(i, config.max_iterations))
        # save model
        if i % model_save_frequency == 0 and i != 0:
            self.save(model_save_path)
        # update target_network
        if i % config.target_network_update_freq == 0:
            self._tf_sess.run(self._tf_clone_ops)
        # select and perform action
        state = self._get_current_state()
        state = state[np.newaxis]
        target = self._cur_target[np.newaxis]
        Q = self._evaluate_q(state, target)
        a = self._select_action(Q, i, test_mode=False)
        self._perform_action(a)
        if self._env.episode_done():
            episode_reward = self._env.get_total_episode_reward()
            summary_episode_reward = self._tf_sess.run(
                self._tf_summary_episode_reward,
                feed_dict={self._tf_episode_reward: episode_reward})
            summary_writer.add_summary(summary_episode_reward, global_step=episode_count)
            episode_steps = self._env.get_steps_count()
            summary_episode_steps = self._tf_sess.run(
                self._tf_summary_episode_steps,
                feed_dict={self._tf_episode_steps: episode_steps})
            summary_writer.add_summary(summary_episode_steps, global_step=episode_count)
            episode_count += 1
            self._request_new_episode()
        # sample mini-batch and perform training
        if config.prioritized_experience_replay:
            experiences, weights, indices = self._replay_memory.sample(training_step=i)
        else:
            experiences = self._replay_memory.sample(config.batch_size)
        states, targets, actions, states_new, rewards, dones = DQNAgent.decompose_experiences(experiences)
        # compute q_targets
        if double_dqn:
            # double DQN: use prediction network for action selection,
            # use target network for the selected action's Q value evaluation
            q_new_p = self._tf_sess.run(
                self._tf_pn_Q, feed_dict={self._tf_pn_state: states_new, self._tf_pn_target: targets})
            action = np.argmax(q_new_p, axis=1)
            q_new_t = self._tf_sess.run(
                self._tf_tn_Q, feed_dict={self._tf_tn_state: states_new, self._tf_tn_target: targets})
            q_new_max = np.array([q_new_t[j, action[j]] for j in range(config.batch_size)])
        else:
            # DQN: use target network for action selection and evaluation
            q_new = self._tf_sess.run(
                self._tf_tn_Q, feed_dict={self._tf_tn_state: states_new, self._tf_tn_target: targets})
            q_new_max = np.max(q_new, axis=1)
        q_targets = rewards + q_new_max * config.discounted_factor * (1. - dones.astype(np.int))
        if not config.prioritized_experience_replay:
            weights = np.array([1.0] * config.batch_size)
        # train
        _, loss, delta, summary_pn_loss, summary_averaged_pn_Q = self._tf_sess.run(
            [self._tf_pn_train, self._tf_pn_loss, self._tf_pn_delta,
             self._tf_summary_pn_loss, self._tf_summary_averaged_pn_Q],
            feed_dict={self._tf_pn_actions: actions,
                       self._tf_pn_state: states,
                       self._tf_pn_target: targets,
                       self._tf_pn_Q_target: q_targets,
                       self._tf_pn_importance_weight: weights})
        if config.prioritized_experience_replay:
            self._replay_memory.update_priority(indices, delta)
        summary_writer.add_summary(summary_pn_loss, global_step=i)
        summary_writer.add_summary(summary_averaged_pn_Q, global_step=i)
    # save model after training
    self.save(model_save_path)
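# Hedged numpy restatement of the double-DQN target computed in the training
# loop above: the online (prediction) network picks the argmax action, and the
# target network evaluates it. Argument names are illustrative.
def _example_double_dqn_targets(q_new_p, q_new_t, rewards, dones, gamma=0.99):
    import numpy as np
    actions = np.argmax(q_new_p, axis=1)                # select with the online net
    q_eval = q_new_t[np.arange(len(actions)), actions]  # evaluate with the target net
    return rewards + gamma * q_eval * (1.0 - dones.astype(np.float32))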
def test_pool_layer_NCHW():
  with make_scope() as session:
    import numpy as np
    net = TFNetwork(extern_data=ExternData())
    with tf.variable_scope("src_nhwc"):
      src_nhwc = InternalLayer(
        name="src_nhwc", network=net,
        out_type={"dim": 16, "shape": (None, 16, 16),
                  "batch_dim_axis": 0, "time_dim_axis": 1, "feature_dim_axis": 3, "sparse": False})
      src_nhwc.output.placeholder = tf.placeholder(shape=(None, None, 16, 16), dtype=tf.float32)
      src_nhwc.output.size_placeholder = {0: tf.placeholder(shape=(None,), dtype=tf.int32)}
    with tf.variable_scope("src_nchw"):
      src_nchw = InternalLayer(
        name="src_nchw", network=net,
        out_type={"dim": 16, "shape": (16, None, 16),
                  "batch_dim_axis": 0, "time_dim_axis": 2, "feature_dim_axis": 1, "sparse": False})
      src_nchw.output.placeholder = tf.placeholder(shape=(None, 16, None, 16), dtype=tf.float32)
      src_nchw.output.size_placeholder = {1: tf.placeholder(shape=(None,), dtype=tf.int32)}
    pool_size = (5, 5)
    strides = (1, 2)
    padding = "VALID"
    with tf.variable_scope("pool_nhwc_from_nhwc"):
      pool_nhwc_from_nhwc = PoolLayer(
        name="pool_nhwc_from_nhwc", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=False, sources=[src_nhwc],
        output=PoolLayer.get_out_data_from_opts(
          name="pool_nhwc_from_nhwc", pool_size=pool_size, padding=padding,
          use_channel_first=False, network=net, sources=[src_nhwc]))
    with tf.variable_scope("pool_nchw_from_nhwc"):
      pool_nchw_from_nhwc = PoolLayer(
        name="pool_nchw_from_nhwc", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=True, sources=[src_nhwc],
        output=PoolLayer.get_out_data_from_opts(
          name="pool_nchw_from_nhwc", pool_size=pool_size, padding=padding,
          use_channel_first=True, network=net, sources=[src_nhwc]))
    with tf.variable_scope("pool_nchw_from_nchw"):
      pool_nchw_from_nchw = PoolLayer(
        name="pool_nchw_from_nchw", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=True, sources=[src_nchw],
        output=PoolLayer.get_out_data_from_opts(
          name="pool_nchw_from_nchw", pool_size=pool_size, padding=padding,
          use_channel_first=True, network=net, sources=[src_nchw]))
    with tf.variable_scope("pool_nhwc_from_nchw"):
      pool_nhwc_from_nchw = PoolLayer(
        name="pool_nhwc_from_nchw", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=False, sources=[src_nchw],
        output=PoolLayer.get_out_data_from_opts(
          name="pool_nhwc_from_nchw", pool_size=pool_size, padding=padding,
          use_channel_first=False, network=net, sources=[src_nchw]))
    tf.global_variables_initializer().run()
    out, seq_lens = session.run(
      [pool_nhwc_from_nhwc.output.placeholder, pool_nhwc_from_nhwc.output.size_placeholder[0]],
      feed_dict={src_nhwc.output.placeholder: np.random.rand(10, 11, 16, 16),
                 src_nhwc.output.size_placeholder[0]: np.full(shape=(10,), fill_value=11)})
    print(out.shape)
    assert_equal(out.shape, (10, 7, 6, 16))
    print(seq_lens)
    time_dim_axis = 1 if TFUtil.is_gpu_available() else 0
    out, seq_lens = session.run(
      [pool_nchw_from_nhwc.output.placeholder, pool_nchw_from_nhwc.output.size_placeholder[time_dim_axis]],
      feed_dict={src_nhwc.output.placeholder: np.random.rand(10, 11, 16, 16),
                 src_nhwc.output.size_placeholder[0]: np.full(shape=(10,), fill_value=11)})
    print(out.shape)
    if time_dim_axis == 1:
      assert_equal(out.shape, (10, 16, 7, 6))
    else:
      assert_equal(out.shape, (10, 7, 6, 16))
    print(seq_lens)
    if TFUtil.is_gpu_available():
      out, seq_lens = session.run(
        [pool_nchw_from_nchw.output.placeholder, pool_nchw_from_nchw.output.size_placeholder[1]],
        feed_dict={src_nchw.output.placeholder: np.random.rand(10, 16, 11, 16),
                   src_nchw.output.size_placeholder[1]: np.full(shape=(10,), fill_value=11)})
      print(out.shape)
      assert_equal(out.shape, (10, 16, 7, 6))
      print(seq_lens)
      out, seq_lens = session.run(
        [pool_nhwc_from_nchw.output.placeholder, pool_nhwc_from_nchw.output.size_placeholder[0]],
        feed_dict={src_nchw.output.placeholder: np.random.rand(10, 16, 11, 16),
                   src_nchw.output.size_placeholder[1]: np.full(shape=(10,), fill_value=11)})
      print(out.shape)
      assert_equal(out.shape, (10, 7, 6, 16))
      print(seq_lens)
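# Note on the layout checks above (an assumption, but standard TF behavior):
# NHWC and NCHW tensors are related by a plain transpose, which is what makes
# the cross-layout pooling outputs comparable:
#   x_nchw = tf.transpose(x_nhwc, [0, 3, 1, 2])  # NHWC -> NCHW
#   x_nhwc = tf.transpose(x_nchw, [0, 2, 3, 1])  # NCHW -> NHWC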
# start test like this:  nosetests-2.7 tests/test_TFEngine.py

import logging
logging.getLogger('tensorflow').disabled = True
import tensorflow as tf
import sys
sys.path += ["."]  # Python 3 hack
from TFEngine import *
import Util
import TFUtil
TFUtil.debugRegisterBetterRepr()
from Config import Config
from nose.tools import assert_equal, assert_is_instance
import numpy
import numpy.testing
import os
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()
from Log import log
log.initialize(verbosity=[5])

session = tf.InteractiveSession()


def test_DataProvider():
  """
  :param Dataset.Dataset dataset:
  :param int seq_idx:
  :param str|None output_layer_name: e.g. "output". if not set, will read from config "forward_output_layer"
  :return: numpy array, output in time major format (time,batch,dim)
  """
def build_actor_critic_network(num_action):
    with tf.variable_scope('actor_critic_network'):
        # Inputs
        with tf.name_scope('inputs'):
            state_placeholder = tf.placeholder(name='state', shape=(None, 84, 84, 4), dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action', shape=(None,), dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value', shape=(None,), dtype=tf.float32)
            advantage_placeholder = tf.placeholder(name='advantage', shape=(None,), dtype=tf.float32)

        # Main network
        with tf.variable_scope('shared_network'):
            variable_dict = {}
            # inference
            conv1 = TFUtil.conv_layer('conv1', state_placeholder, shape=[8, 8, 4, 32], stride=4,
                                      variable_dict=variable_dict)
            conv2 = TFUtil.conv_layer('conv2', conv1, shape=[4, 4, 32, 64], stride=2,
                                      variable_dict=variable_dict)
            conv3 = TFUtil.conv_layer('conv3', conv2, shape=[3, 3, 64, 64], stride=1,
                                      variable_dict=variable_dict)
            conv3_flatten = TFUtil.flatten(conv3, feature_length=(7 * 7 * 64))

        # outputs
        with tf.variable_scope('actor_network'):
            fc4_actor = TFUtil.fc_layer('fc4_actor', conv3_flatten, input_size=(7 * 7 * 64), num_neron=512,
                                        variable_dict=variable_dict)
            actor_logits = TFUtil.fc_layer('logits', fc4_actor, input_size=512, num_neron=num_action,
                                           activation=None, variable_dict=variable_dict)
            policy_probs = tf.nn.softmax(name='policy_probs', logits=actor_logits)
        with tf.variable_scope('critic_network'):
            fc4_critic = TFUtil.fc_layer('fc4_critic', conv3_flatten, input_size=(7 * 7 * 64), num_neron=512,
                                         variable_dict=variable_dict)
            state_value = tf.squeeze(
                TFUtil.fc_layer('value', fc4_critic, input_size=512, num_neron=1, activation=None,
                                variable_dict=variable_dict),
                axis=1)

        with tf.variable_scope('loss'):
            # policy loss
            log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                name='log_prob', labels=action_placeholder, logits=actor_logits)
            policy_loss = -tf.reduce_sum(log_prob * advantage_placeholder) / ACConfig.batch_size
            policy_entropy = -tf.reduce_sum(policy_probs * tf.log(policy_probs + 1e-15)) / ACConfig.batch_size
            # value_loss
            value_loss = tf.reduce_sum(tf.square(q_value_placeholder - state_value)) / ACConfig.batch_size
            # need to tweak weight
            loss = policy_loss + 0.5 * value_loss - 0.0005 * policy_entropy

        # train_op
        optimizer = tf.train.AdamOptimizer(learning_rate=ACConfig.lr)
        """
        grad_var = optimizer.compute_gradients(loss)
        clipped_grad_var = [(tf.clip_by_value(grad, -10., 10.), var) for grad, var in grad_var]
        train_op = optimizer.apply_gradients(clipped_grad_var)
        """
        train_op = optimizer.minimize(loss)

        # sample_action
        sample_action = tf.multinomial(actor_logits, 1)

        # reward_history
        with tf.name_scope('reward_history'):
            reward_history_placeholder = tf.placeholder(name='reward_history', shape=(None,), dtype=tf.float32)
            average_reward = tf.reduce_mean(reward_history_placeholder)

        # summary
        with tf.name_scope('summary'):
            tf.summary.scalar('average_reward_over_100_episodes', average_reward)
            tf.summary.scalar('policy_loss', policy_loss)
            tf.summary.scalar('policy_entropy', policy_entropy)
            tf.summary.scalar('value_loss', value_loss)
            tf.summary.scalar('loss', loss)

    return (state_placeholder, action_placeholder, q_value_placeholder, advantage_placeholder,
            reward_history_placeholder), \
           (train_op, sample_action), \
           (actor_logits, state_value, average_reward)
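# Note: TFUtil.conv_layer / TFUtil.fc_layer / TFUtil.flatten above are
# project-local helpers, not TensorFlow API, and their definitions are not
# shown in this document. As a rough sketch of what fc_layer could look like,
# assuming only the (name, input, input_size, num_neron, activation,
# variable_dict) signature used above -- initializers and bookkeeping are our
# guesses, not the project's actual implementation:

def fc_layer(name, x, input_size, num_neron, activation=tf.nn.relu, variable_dict=None):
    with tf.variable_scope(name):
        w = tf.get_variable('w', shape=(input_size, num_neron),
                            initializer=tf.truncated_normal_initializer(stddev=0.01))
        b = tf.get_variable('b', shape=(num_neron,), initializer=tf.zeros_initializer())
        if variable_dict is not None:
            # keep handles on the created variables, e.g. for saving or syncing
            variable_dict[w.op.name] = w
            variable_dict[b.op.name] = b
        y = tf.matmul(x, w) + b
        return activation(y) if activation is not None else y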
def learn(self, model_save_frequency, model_save_path, check_point, use_gpu, gpu_id=None):
    device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
    with tf.Graph().as_default():
        with tf.device(device_str):
            # build AC network
            self._init_acn()
            # initialize all variables
            init = tf.global_variables_initializer()
            # create auxiliary operations: summary and saver
            summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(ACConfig.summary_folder, self._tf_sess.graph)

        # initialize
        if check_point is None:
            self._tf_sess.run(init)
            episode_idx = 0
        else:
            self.load(model_save_path, check_point)
            episode_idx = check_point

        reward_history = deque(maxlen=100)
        state = self._request_new_episode()
        total_rewards = 0

        # start training
        print('Training started, please open Tensorboard to monitor the training process.')
        while episode_idx < ACConfig.max_iterations:
            # sample and perform action, store history
            action = self._select_action(state, episode_idx)
            next_state, reward, done = self._perform_action(state, action)
            total_rewards += reward
            state = next_state
            # if reward != 0:  # Pong has either +1 or -1 reward exactly when game ends.
            #     print(('ep %d: game finished, reward: %f' % (episode_idx + 1, reward))
            #           + ('' if reward == -1 else ' !!!!!!!!'))
            if done:
                episode_idx += 1
                # record reward and reset
                reward_history.append(total_rewards)
                print("Reward for episode {}: {}".format(episode_idx, total_rewards))
                state = self._request_new_episode()
                total_rewards = 0
                # train once a full batch of episodes has been collected
                if episode_idx % ACConfig.batch_size == 0:
                    states = self._history_buffer._state_buffer
                    actions = self._history_buffer._action_buffer
                    q_values, advantages = self._history_buffer.compute_q_value_and_advantages()
                    _, average_reward, summary = self._tf_sess.run(
                        [self._tf_acn_train_op, self._tf_average_reward, summary_op],
                        feed_dict={self._tf_acn_state: states,
                                   self._tf_acn_action: actions,
                                   self._tf_acn_q_value: q_values,
                                   self._tf_acn_advantage: advantages,
                                   self._tf_reward_history: reward_history})
                    self._history_buffer.clean_up()
                    # record summary
                    summary_writer.add_summary(summary, global_step=episode_idx)
                    print("Episode {}".format(episode_idx))
                    print("Average reward for last 100 episodes: {}".format(average_reward))
                if episode_idx % model_save_frequency == 0 or episode_idx == ACConfig.max_iterations:
                    print("Model saved after {} episodes".format(episode_idx))
                    self.save(model_save_path, global_step=episode_idx)
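# compute_q_value_and_advantages() on the history buffer is not shown in this
# document. In actor-critic training it typically computes discounted returns
# as the Q-value targets and subtracts the critic's value estimates to get
# advantages. A hypothetical sketch, assuming the buffer stores per-step
# rewards, value estimates and episode-end flags (the names and gamma here are
# ours):

import numpy as np

def compute_q_value_and_advantages(rewards, values, dones, gamma=0.99):
    q_values = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if dones[t]:
            running = 0.0  # episodes are independent: reset the return at boundaries
        running = rewards[t] + gamma * running
        q_values[t] = running
    # advantage A(s, a) = R - V(s): how much better the taken action did
    # than the critic's baseline prediction
    advantages = q_values - np.asarray(values, dtype=np.float32)
    return q_values, advantages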
def build_actor_critic_network(scope, num_action, num_scene):
    # input:
    #   num_action: number of available actions
    #   num_scene: number of available scenes  ### maybe better to use a list of scene names?
    with tf.variable_scope(scope):
        # Inputs: nodes for the input image features and the training targets
        with tf.name_scope('inputs'):
            num_frames = tf.placeholder(name='num_frames', shape=None, dtype=tf.int32)
            state_placeholder = tf.placeholder(name='state', shape=(None, 2048), dtype=tf.float32)
            target_placeholder = tf.placeholder(name='target', shape=(None, 2048), dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action', shape=(None,), dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value', shape=(None,), dtype=tf.float32)
            advantage_placeholder = tf.placeholder(name='advantage', shape=(None,), dtype=tf.float32)
            scene_placeholder = tf.placeholder(name='current_scene', shape=(None, num_scene), dtype=tf.float32)

        # compute embedded feature given the input image feature
        variable_dict = {}
        with tf.variable_scope('shared_layers') as scope:  # note: this rebinds the function argument `scope`
            # fc1
            state_flattened = tf.reshape(state_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_state = TFUtil.fc_layer('fc1', state_flattened,
                                        input_size=A3CConfig.num_history_frames * 2048,
                                        num_neron=512, variable_dict=variable_dict)
            scope.reuse_variables()
            target_flattened = tf.reshape(target_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_target = TFUtil.fc_layer('fc1', target_flattened,
                                         input_size=A3CConfig.num_history_frames * 2048,
                                         num_neron=512, variable_dict=variable_dict)
        with tf.variable_scope('shared_layers'):
            # fc2
            fc2 = TFUtil.fc_layer('fc2', tf.concat((fc1_state, fc1_target), axis=1),
                                  input_size=1024, num_neron=512, variable_dict=variable_dict)

        # outputs
        policy_logits_list = []
        policy_prob_list = []
        state_value_list = []
        for i in xrange(num_scene):
            with tf.variable_scope(THORConfig.supported_envs[i], reuse=False):
                # fc3, shared by the policy and value outputs
                fc3 = TFUtil.fc_layer('fc_3_{0}'.format(i), fc2, input_size=512, num_neron=512,
                                      variable_dict=variable_dict)
                # policy output
                policy_logits = TFUtil.fc_layer('policy_logits', fc3, input_size=512, num_neron=num_action,
                                                activation=None, variable_dict=variable_dict)
                policy_probs = tf.nn.softmax(name='policy_probs', logits=policy_logits)
                # value output
                state_value = tf.squeeze(
                    TFUtil.fc_layer('value', fc3, input_size=512, num_neron=1, activation=None,
                                    variable_dict=variable_dict),
                    axis=1)
                # add outputs to the per-scene lists
                policy_logits_list.append(policy_logits)
                policy_prob_list.append(policy_probs)
                state_value_list.append(state_value)

        with tf.variable_scope('loss'):
            scene_loss = []
            for i in xrange(num_scene):
                # policy loss
                log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                    name='log_prob', labels=action_placeholder, logits=policy_logits_list[i])
                policy_loss = -tf.reduce_sum(tf.maximum(log_prob, tf.log(1e-6)) * advantage_placeholder)
                # entropy regularization term (A3C)
                policy_entropy = -0.01 * tf.reduce_sum(
                    policy_prob_list[i] * tf.log(policy_prob_list[i] + 1e-20))
                # value_loss
                value_loss = 0.5 * tf.reduce_sum(tf.square(q_value_placeholder - state_value_list[i]))
                # need to tweak weight
                scene_loss.append(policy_loss + value_loss - policy_entropy)
                with tf.name_scope('scene_summary_{0}'.format(THORConfig.supported_envs[i])):
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('policy_entropy', policy_entropy)
                    tf.summary.scalar('value_loss', value_loss)
            # scene_placeholder holds one-hot scene vectors, so this selects the current scene's loss
            loss = tf.reduce_sum(tf.transpose(scene_loss) * scene_placeholder)

        # train_op
        # optional: varying learning_rate
        """
        learning_rate = tf.train.exponential_decay(
            learning_rate=A3CConfig.learning_rate,
            global_step=global_step,
            decay_steps=A3CConfig.decay_step,
            decay_rate=A3CConfig.decay_rate)
        """
        # create optimizer
        # optimizer = tf.train.AdamOptimizer(learning_rate=A3CConfig.learning_rate)
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=A3CConfig.learning_rate,
            decay=A3CConfig.decay_rate,
            epsilon=0.1)  # , momentum=A3CConfig.momentum
        train_ops = []
        for i in xrange(num_scene):
            clipped_grad_var = []
            grad_var = optimizer.compute_gradients(scene_loss[i])
            for grad, var in grad_var:
                if grad is not None:
                    clipped_grad_var.append((tf.clip_by_value(grad, -10., 10.), var))
                else:
                    clipped_grad_var.append((None, var))
            grad_var = clipped_grad_var
            train_ops.append(optimizer.apply_gradients(grad_var))
        """
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        grad_var = optimizer.compute_gradients(loss, var_list=local_vars)
        # optional: gradient clipping
        clipped_grad_var = []
        for grad, var in grad_var:
            if grad is not None:
                clipped_grad_var.append((tf.clip_by_value(grad, -10., 10.), var))
            else:
                clipped_grad_var.append((None, var))
        grad_var = clipped_grad_var
        if scope != 'global':
            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='global')
            tmp = []
            for i in xrange(len(grad_var)):
                tmp.append((grad_var[i][0], global_vars[i]))
            grad_var = tmp
        train_op = optimizer.apply_gradients(grad_var)
        """

        # ops to sample an action from a multinomial distribution given the unnormalized log-probability logits
        action_sampler_ops = []
        for i in xrange(num_scene):
            action_sampler_ops.append(
                tf.multinomial(name='action_sampler_{0}'.format(THORConfig.supported_envs[i]),
                               logits=policy_logits_list[i], num_samples=1))

        # reward_history
        with tf.name_scope('reward_history'):
            reward_history_placeholder = tf.placeholder(name='reward_history', shape=(None,), dtype=tf.float32)
            average_reward = tf.reduce_mean(reward_history_placeholder)

        # step_history
        with tf.name_scope('step_number_history'):
            step_history_placeholder = tf.placeholder(name='step_history', shape=(None,), dtype=tf.float32)
            average_step = tf.reduce_mean(step_history_placeholder)

        # summary
        with tf.name_scope('summary'):
            tf.summary.scalar('average_reward_over_100_episodes', average_reward)
            tf.summary.scalar('average_steps_over_100_episodes', average_step)
            tf.summary.scalar('loss', loss)

    return (num_frames, state_placeholder, target_placeholder, action_placeholder, q_value_placeholder,
            advantage_placeholder, scene_placeholder, reward_history_placeholder, step_history_placeholder), \
           (train_ops, action_sampler_ops), \
           (policy_logits_list, state_value_list, average_reward, average_step)
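# A sketch of how the returned handles might be wired up for a single rollout
# step. The dummy inputs below are ours, and the feature extraction that would
# produce real 2048-dim state/target features happens outside this function:

import numpy as np

inputs, ops, outputs = build_actor_critic_network(
    'global', num_action=4, num_scene=len(THORConfig.supported_envs))
(num_frames, state_ph, target_ph, action_ph, q_value_ph,
 advantage_ph, scene_ph, reward_history_ph, step_history_ph) = inputs
train_ops, action_sampler_ops = ops

scene_idx = 0  # hypothetical: index of the current scene in THORConfig.supported_envs
state_features = np.zeros((A3CConfig.num_history_frames, 2048), dtype=np.float32)
target_features = np.zeros((A3CConfig.num_history_frames, 2048), dtype=np.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # sample an action for the current scene from its policy logits
    action = sess.run(action_sampler_ops[scene_idx],
                      feed_dict={state_ph: state_features,
                                 target_ph: target_features})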