Example no. 1
 def __call__(self,
              inputs,
              index,
              initial_state=None,
              recurrent_weights_initializer=None):
     """
 :param tf.Tensor inputs: shape (time,batch,n_hidden)
 :param tf.Tensor index: shape (time,batch)
 :param tf.Tensor|None initial_state: shape (batch,n_hidden)
 :param ()->tf.Tensor recurrent_weights_initializer:
 :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
 :rtype: (tf.Tensor, tf.Tensor)
 """
     W = tf.get_variable(name="W_re",
                         shape=(self.n_hidden, self.n_hidden * 4),
                         initializer=recurrent_weights_initializer)
     TFUtil.set_param_axes_split_info(
         W, [[self.n_hidden], [self.n_hidden] * 4])
     if self.rec_weight_dropout:
         from TFUtil import dropout
         W = dropout(W,
                     keep_prob=1.0 - self.rec_weight_dropout,
                     cond_on_train=True,
                     seed=TFUtil.get_random_seed())
     out, _, _, final_cell_state = self.op(*self.map_layer_inputs_to_op(
         X=inputs, W=W, i=index, initial_state=initial_state))
     from tensorflow.python.ops.nn import rnn_cell
     return out, rnn_cell.LSTMStateTuple(h=out[-1], c=final_cell_state)
Example no. 2
 def __call__(self,
              inputs,
              index,
              initial_state=None,
              recurrent_weights_initializer=None):
     """
 :param tf.Tensor inputs: shape (time,batch,n_input_dim)
 :param tf.Tensor index: shape (time,batch)
 :param tf.Tensor|None initial_state: shape (batch,n_hidden)
 :param ()->tf.Tensor recurrent_weights_initializer:
 :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
 :rtype: (tf.Tensor, tf.Tensor)
 """
     W = tf.get_variable(name="W",
                         shape=(self.n_input_dim + self.n_hidden,
                                self.n_hidden * 4),
                         initializer=recurrent_weights_initializer)
     b = tf.get_variable(name="b",
                         shape=(self.n_hidden * 4, ),
                         initializer=tf.zeros_initializer())
     TFUtil.set_param_axes_split_info(
         W, [[self.n_input_dim, self.n_hidden], [self.n_hidden] * 4])
     TFUtil.set_param_axes_split_info(b, [[self.n_hidden] * 4])
     out, _, final_state = self.op(*self.map_layer_inputs_to_op(
         X=inputs, W=W, b=b, i=index, initial_state=initial_state))
     return out, final_state
Example no. 3
def have_blocksparse_requirements():
    import TFUtil
    if not TFUtil.is_gpu_available():
        return False
    min_compute_capability = TFUtil.get_available_gpu_min_compute_capability()
    if min_compute_capability < 3.5:
        return False
    return True
Example no. 4
 def __call__(self,
              inputs,
              index,
              initial_state=None,
              recurrent_weights_initializer=None):
     """
 :param tf.Tensor inputs: shape (time,batch,n_hidden*4)
 :param tf.Tensor index: shape (time,batch)
 :param tf.Tensor|None initial_state: shape (batch,n_hidden)
 :param ()->tf.Tensor recurrent_weights_initializer:
 :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
 :rtype: (tf.Tensor, tf.Tensor)
 """
     from tensorflow.python.ops.nn import rnn_cell
     W = tf.get_variable(name="W_re",
                         shape=(self.n_hidden, self.n_hidden * 4),
                         initializer=recurrent_weights_initializer)
     TFUtil.set_param_axes_split_info(
         W, [[self.n_hidden], [self.n_hidden] * 4])
     if self.rec_weight_dropout:
         from TFUtil import dropout
         W = dropout(W,
                     keep_prob=1.0 - self.rec_weight_dropout,
                     cond_on_train=True,
                     seed=TFUtil.get_random_seed())
     inputs.set_shape(tf.TensorShape([None, None, self.n_hidden * 4]))
     W.set_shape(tf.TensorShape([self.n_hidden, self.n_hidden * 4]))
     index.set_shape(tf.TensorShape([None, None]))
     from TFUtil import to_float32
     index = to_float32(index)
     n_batch = tf.shape(inputs)[1]
     if initial_state is None:
         c0 = tf.zeros((n_batch, self.n_hidden),
                       dtype=tf.float32,
                       name="initial_c")
         y0 = tf.zeros((n_batch, self.n_hidden),
                       dtype=tf.float32,
                       name="initial_h")
     elif isinstance(initial_state, rnn_cell.LSTMStateTuple):
         c0 = initial_state.c
         y0 = initial_state.h
     else:
         c0 = initial_state
         y0 = tf.zeros((n_batch, self.n_hidden),
                       dtype=tf.float32,
                       name="initial_h")
     start = tf.constant(0, name="start")
     step = tf.constant(self.step or 1, name="step")
     out, _, _, final_cell_state = self.op(inputs, W, y0, c0, index, start,
                                           step)
      out_shape = out.get_shape().as_list()
      if out_shape[0] is None or out_shape[0] > 0:
         final_output = out[-1]
     else:
         final_output = y0
     return out, rnn_cell.LSTMStateTuple(h=final_output, c=final_cell_state)
Example no. 5
def init_blocksparse():
    import os
    import TFUtil
    assert TFUtil.is_gpu_available(), "we currently need a GPU"
    min_compute_capability = TFUtil.get_available_gpu_min_compute_capability()
    assert min_compute_capability and min_compute_capability >= 3.5, "we need at least compute capability 3.5"
    path = os.path.dirname(__file__) + "/extern/blocksparse"
    assert os.path.exists(path), "maybe submodule not checked out?"
    import sys
    if path not in sys.path:
        # At the beginning, to make sure we find it first.
        sys.path.insert(0, path)
    # test it
    from blocksparse import op_module
    op_module.get_module()
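
A minimal guard sketch combining the check from Example no. 3 with the initialization above; `maybe_init_blocksparse` is a made-up wrapper name, assuming both helpers live in the same module.

def maybe_init_blocksparse():
    # Hypothetical wrapper: only initialize blocksparse when the requirements hold,
    # so callers can fall back to a plain op otherwise.
    if not have_blocksparse_requirements():
        return False
    init_blocksparse()  # asserts GPU + compute capability >= 3.5, extends sys.path
    return True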
Example no. 6
 def __call__(self, inputs, index, initial_state=None, recurrent_weights_initializer=None):
   """
   :param tf.Tensor inputs: shape (time,batch,n_hidden*4)
   :param tf.Tensor index: shape (time,batch)
   :param tf.Tensor|None initial_state: shape (batch,n_hidden)
   :param ()->tf.Tensor recurrent_weights_initializer:
   :returns: shape (time,batch,n_hidden), shape (batch,n_hidden)
   :rtype: (tf.Tensor, tf.Tensor)
   """
   W_re = tf.get_variable(
     name="W_re", shape=(self.n_hidden, self.n_hidden * 4), initializer=recurrent_weights_initializer)
   TFUtil.set_param_axes_split_info(W_re, [[self.n_hidden], [self.n_hidden] * 4])
   out, _, final_state = self.op(
     *self.map_layer_inputs_to_op(Z=inputs, V_h=W_re, i=index, initial_state=initial_state))
   return out, final_state
Example no. 7
    def test(self, model_save_path, check_point, use_gpu, gpu_id=None):
        # build network
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.device(device_str):
            self._init_acn()

        # initialize all variables from the model
        self.load(model_save_path, check_point)

        # start new episode
        episode_idx = 0
        total_rewards = 0
        state = self._request_new_episode()

        # perform testing
        while episode_idx <= ACConfig.max_iterations:
            # sample and perform action, store history
            action = self._select_action(state, episode_idx, test_mode=True)
            next_state, reward, done = self._perform_action(state, action)
            total_rewards += reward
            state = next_state

            if done:
                episode_idx += 1
                print('total_reward received: {0}'.format(total_rewards))
                self._history_buffer.clean_up()
                state = self._request_new_episode()
                total_rewards = 0
Example no. 8
    def test(self, check_point, use_gpu, gpu_id=None):
        assert (len(envs) == 1)
        env = envs[0]
        history_buffer = A3CUtil.HistoryBuffer()

        # build network
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.device(device_str):
            #self._init_network()
            # initialize all variables from the model
            self.load(self._model_save_path, check_point)

        # start new episode
        episode_idx = 0
        total_rewards = 0
        state, target = self._request_new_episode(env)

        # perform testing
        while episode_idx <= A3CConfig.max_iterations:
            # sample and perform action, store history
            action = self._select_action(env, state, target, test_mode=True)
            next_state, reward, done = self._perform_action(
                env, state, target, action, history_buffer)
            total_rewards += reward
            state = next_state

            if done:
                episode_idx += 1
                print('total_reward received: {0}'.format(total_rewards))
                history_buffer.clean_up()
                state = self._request_new_episode(env)
                total_rewards = 0
Example no. 9
 def get_consumer_device(self):
     """
 :return: e.g. "/device:GPU:0"
 :rtype: str
 """
     # TODO this is probably incomplete
     import TFUtil
     if TFUtil.is_gpu_available():
         return "/device:GPU:0"
     return "/device:CPU:0"
Example no. 10
 def __init__(self):
   self.hyps = TFCompat.v1.placeholder(tf.string, [None])
   self.refs = TFCompat.v1.placeholder(tf.string, [None])
   self.wer, self.ref_num_words = TFUtil.string_words_calc_wer(hyps=self.hyps, refs=self.refs)
   self.total_wer_var = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)
   self.total_ref_num_words_var = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)
   self.update_total_wer = self.total_wer_var.assign_add(tf.reduce_sum(self.wer))
   self.update_ref_num_words = self.total_ref_num_words_var.assign_add(tf.reduce_sum(self.ref_num_words))
   self.updated_normalized_wer = (
     tf.cast(self.update_total_wer, tf.float32) / tf.cast(self.update_ref_num_words, tf.float32))
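
A hedged usage sketch, assuming the `__init__` above belongs to a small accumulator class (here called `WerAccumulator`, a made-up name) and a TF1-style session via TFCompat as used in the snippet.

acc = WerAccumulator()  # hypothetical class wrapping the __init__ above
with TFCompat.v1.Session() as session:
  session.run(TFCompat.v1.global_variables_initializer())  # init the total_* variables
  wer = session.run(
    acc.updated_normalized_wer,
    feed_dict={acc.hyps: ["the cat sat"], acc.refs: ["the cat sat down"]})
  print("running WER:", wer)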
Example no. 11
    def learn(self, check_point, use_gpu, gpu_id=None):
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.Graph().as_default():
            with tf.device(device_str):
                config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True)
                config.gpu_options.allow_growth = True
                with tf.Session(config=config) as sess:
                    # initialize a global agent
                    self._global_agent = A3CAgent(
                        sess=sess,
                        scope="global",
                        feature_mode=self._feature_mode)
                    self._global_agent._init_network(check_point=check_point)
                    self._global_agent.create_global_summary_ops()

                    # start training
                    # create learner threads
                    learner_threads = []
                    for i in xrange(self._num_agents):
                        local_scope = "agent_{0:03d}".format(i)
                        local_agent = A3CAgent(sess=sess,
                                               scope=local_scope,
                                               feature_mode=self._feature_mode)
                        local_agent._init_network()
                        learner_threads.append(
                            threading.Thread(target=local_agent.learn,
                                             args=(use_gpu, gpu_id)))

                    # initialize all variables
                    sess.run(tf.global_variables_initializer())
                    # load resnet variables
                    A3CAgent.tf_resnet_saver.restore(
                        sess, A3CConfig.resnet_pretrain_model)

                    # initialize or load network variables
                    if check_point:
                        self._global_agent.load(self._model_save_path,
                                                check_point)
                        A3CConfig.num_frames = check_point
                    else:
                        A3CConfig.num_frames = 0

                    learner_threads.append(
                        threading.Thread(
                            target=self._global_agent.save_model_monitor,
                            args=(A3CConfig.num_frames, self._model_save_path,
                                  self._model_save_interval)))
                    print(
                        'Training started, please open Tensorboard to monitor the training process.'
                    )
                    for t in learner_threads:
                        t.start()
                    for t in learner_threads:
                        t.join()
Example no. 12
 def _make_mod(self):
   if self.cache_key in self.mod_cache:
     return self.mod_cache[self.cache_key]
   comp = TFUtil.OpCodeCompiler(
     base_name=self.name, code_version=self.description.code_version,
     code=self._make_code(),
     include_deps=[self.support_native_op_cpp_filename],
     ld_flags=["-lblas"],
     **dict(self.compiler_opts))
   mod = comp.load_module()
   self.mod_cache[self.cache_key] = mod
   return mod
Example no. 13
    def learn(self, check_point, use_gpu, gpu_id=None):
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.Graph().as_default():
            with tf.device(device_str):
                # create resnet if no extracted feature
                if not self._feature_mode:
                    resnet_saver = tf.train.import_meta_graph(A3CConfig.resnet_meta_graph)
                    graph = tf.get_default_graph()
                    self._tf_resnet_input = graph.get_tensor_by_name("images:0") 
                    self._tf_resnet_output = graph.get_tensor_by_name("avg_pool:0")
                
                # build network
                self._init_network(scope = "global")
                self._saver = tf.train.Saver(var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='global'))

                # create initializer
                init = tf.global_variables_initializer()

                # create auxiliary operations: summary and saver
                self._tf_summary_op = tf.summary.merge_all()
                self._summary_writer = tf.summary.FileWriter(A3CConfig.summary_folder, self._tf_sess.graph)

                # initialize or load network variables
                if check_point is None:
                    # initialize all variables
                    self._tf_sess.run(init)
                    if not self._feature_mode:
                        # load pretrain resnet
                        resnet_saver.restore(self._tf_sess, A3CConfig.resnet_pretrain_model)
                    self._iter_idx = 0
                else:
                    if not self._feature_mode:
                        # load pretrain resnet
                        resnet_saver.restore(self._tf_sess, A3CConfig.resnet_pretrain_model)
                    self.load(self._model_save_path, check_point)
                    self._iter_idx = check_point

        # start training
        # create learner threads
        learner_threads = [threading.Thread(target=self.learner_thread, args=(thread_id, ))\
                        for thread_id in xrange(self._num_threads)]
        for t in learner_threads:
            t.start()
        for t in learner_threads:
            t.join()

        print('Training started, please open Tensorboard to monitor the training process.')
        # Show the agents training and write summary statistics
        """
Example no. 14
        def grad_wrapper(fwd_op, *bwd_grads):
          """
          :param tf.Operation fwd_op: for fwd_op.inputs and fwd_op.outputs
          :param list[tf.Tensor] bwd_grads:
          :return: list of tensors of gradients for each input
          :rtype: list[tf.Tensor]
          """
          assert len(bwd_grads) == len(fwd_op.outputs)

          grad_inputs = list(fwd_op.inputs) + list(fwd_op.outputs) + list(bwd_grads)
          grad_inputs = self.description._filter_grad_inputs(grad_inputs)
          grad_outputs = TFUtil.make_var_tuple(grad_op(*grad_inputs))
          if grad_description.num_dummy_outs > 0:
            grad_outputs = grad_outputs[:-grad_description.num_dummy_outs]
          grad_outputs = self.description.make_results_of_gradient(grad_outputs)
          return grad_outputs
Example no. 15
 def _make_mod(self):
     if self.cache_key in self.mod_cache:
         return self.mod_cache[self.cache_key]
     from Util import find_lib
     # Note about BLAS linkage:
     # TensorFlow (or its Eigen lib) likely has linked against some BLAS lib itself.
     # For our CPU code, we directly call some BLAS functions such as `sgemm_`.
     # On platforms where there is a flat namespace (e.g. Mac),
     # it probably is not needed to explicitly link it again for this module.
     # In other cases, it's probably needed, but it's not so clear which lib has the
     # right symbols (e.g. the `sgemm_` symbol).
     ld_flags = []
     if self.search_for_numpy_blas:
         # Find related Numpy libs.
         # Numpy usually comes with OpenBlas, and Numpy is probably loaded anyway.
         # Even do this before the other libs below, as it is likely
         # that this OpenBlas lib is correctly initialized already.
         import numpy
         numpy_dir = os.path.dirname(numpy.__file__)
         if os.path.exists("%s/.libs" % numpy_dir):
             ld_flags += ["-L%s/.libs" % numpy_dir]
             from glob import glob
             for f in glob("%s/.libs/*.so" % numpy_dir):
                 f = os.path.basename(f)
                 if f.startswith("lib"):
                     f = f[3:]
                 if f.endswith(".so"):
                     f = f[:-3]
                 ld_flags += ["-l%s" % f]
     if self.search_for_system_blas:
         # Try to just link against blas/f77blas
         # (both can potentially have the symbol) if it finds the lib.
         if find_lib("blas"):
             ld_flags += ["-lblas"]
         if find_lib("f77blas"):
             ld_flags += ["-lf77blas"]
     comp = TFUtil.OpCodeCompiler(
         base_name=self.name,
         code_version=self.description.code_version,
         code=self._make_code(),
         include_deps=[self.support_native_op_cpp_filename],
         ld_flags=ld_flags,
         use_cuda_if_available=self.with_cuda,
         **dict(self.compiler_opts))
     mod = comp.load_tf_module()
     self.mod_cache[self.cache_key] = mod
     return mod
Example no. 16
    def learn(self, check_point, use_gpu, gpu_id=None):
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.Graph().as_default():
            with tf.device(device_str):
                # build network
                self._init_network()

                # initialize all variables
                init = tf.global_variables_initializer()

                # create auxiliary operations: summary and saver
                self._tf_summary_op = tf.summary.merge_all()
                self._summary_writer = tf.summary.FileWriter(
                    A3CConfig.summary_folder, self._tf_sess.graph)

            # initialize or load network variables
            if check_point is None:
                self._tf_sess.run(init)
                self._iter_idx = 0
            else:
                self.load(self._model_save_path, check_point)
                self._iter_idx = check_point

        # start training
        # create learner threads
        learner_threads = [threading.Thread(target=self.learner_thread, args=(thread_id, ))\
                           for thread_id in xrange(self._num_threads)]
        for t in learner_threads:
            t.start()

        print(
            'Training started, please open Tensorboard to monitor the training process.'
        )
        # Show the agents training and write summary statistics
        """
        last_summary_time = 0
        while True:
            now = time.time()
            if now - last_summary_time > SUMMARY_INTERVAL:
                summary_str = session.run(summary_op)
                writer.add_summary(summary_str, float(T))
                last_summary_time = now
        """
        for t in learner_threads:
            t.join()
Example no. 17
 def test(self, dueling_dqn, model_load_path, use_gpu, gpu_id, summary_folder):
     # build network
     device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
     with tf.device(device_str):
         self._init_dqn(dueling_dqn)
     # initialize all variables from the model
     self.load(model_load_path)
     # start new episode
     self._request_new_episode()
     # perform testing
     for i in range(config.max_iterations):
         # select and perform action
         state = self._get_current_state()
         state = state[np.newaxis]
         target = self._cur_target[np.newaxis]
         Q = self._evaluate_q(state, target)
         a = self._select_action(Q, i, test_mode=True)
         self._perform_action(a)
         if self._env.episode_done():
             print('total_reward received: {0}'.format(self._env.get_total_episode_reward()))
             print('total_steps: {0}'.format(self._env.get_steps_count()))
             self._request_new_episode()
Example no. 18
class OpMaker(object):
  """
  https://www.tensorflow.org/versions/master/how_tos/adding_an_op/
  """
  with_cuda = None  # type: None|bool
  # https://github.com/tensorflow/tensorflow/issues/6602
  tf_blas_gemm_workaround = TFUtil.tf_version_tuple() < (1, 5, 0)
  global_lock = RLock()
  mod_cache = {}  # cache_key -> mod
  op_cache = {}  # cache_key -> op

  def __init__(self, description, compiler_opts=None, search_for_numpy_blas=True):
    """
    :param OpDescription description:
    :param dict[str]|None compiler_opts: passed on to OpCodeCompiler as kwargs
    """
    self._cls_init()
    self.description = description
    self.name = description.name
    self.compiler_opts = compiler_opts or {}
    self.search_for_numpy_blas = search_for_numpy_blas

  @classmethod
  def _cls_init(cls):
    if cls.with_cuda is None:
      cls.with_cuda = TFUtil.CudaEnv.get_instance().is_available()
      if cls.with_cuda and cls.tf_blas_gemm_workaround:
        cls._load_cuda_blas_gemm()

  @classmethod
  def cuda_blas_gemm_so_filename(cls):
    from tensorflow.contrib.rnn.python.ops import lstm_ops
    lstm_ops_so = "%s/_lstm_ops.so" % os.path.dirname(lstm_ops.__file__)
    assert os.path.exists(lstm_ops_so)
    return lstm_ops_so

  @classmethod
  def _load_cuda_blas_gemm(cls):
    """
    https://github.com/tensorflow/tensorflow/issues/6602
    As a workaround for TF issue 6602, we link to some functions which are implemented in contrib.rnn.kernels.blas_gemm.
    See NativeOp.cpp.
    To make the symbols available in the namespace, load the library now.
    This issue is fixed with TensorFlow 1.5.
    """
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("Load tf.contrib lstm_ops...")
    lstm_ops_so = cls.cuda_blas_gemm_so_filename()
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("Load tf.contrib lstm_ops lib:", lstm_ops_so)
    # Maybe a bit hacky: Just load all symbols into the global namespace.
    from ctypes import RTLD_GLOBAL, CDLL
    CDLL(lstm_ops_so, mode=RTLD_GLOBAL)
    if TFUtil.CudaEnv.verbose_find_cuda:
      print("tf.contrib lstm_ops lib loaded.")

  @property
  def op_name(self):
    return self.name

  @property
  def cache_key(self):
    return self.name

  @property
  def support_native_op_cpp_filename(self):
    my_dir = os.path.abspath(os.path.dirname(__file__) or os.getcwd())
    my_dir = os.path.realpath(my_dir)  # Make canonical path-name.
    support_native_op_cpp_filename = "%s/NativeOp.cpp" % my_dir
    assert os.path.exists(support_native_op_cpp_filename)
    return support_native_op_cpp_filename

  def _make_code(self):
    # In the user code, we assume that we have the following variables:
    # int n_inputs; int n_outputs;
    # Ndarray* inputs[n_inputs]; Ndarray** outputs[n_outputs];
    # Reference:
    # https://www.tensorflow.org/extend/adding_an_op
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/how_tos/adding_an_op/
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_kernel.h
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def_builder.h
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/pad_op.cc
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/debug_ops.h  CopyOp...
    # http://stackoverflow.com/questions/37565367/designing-an-accumulating-tensorflow-gpu-operator
    # We also include NativeOp.cpp.
    in_info, out_info, _ = NativeOp.NativeOp._resolve_want_inplace_dummy(
      in_info=self.description.in_info, out_info=self.description.out_info)
    out_is_ref = dict()  # output vars which are inplace, out_name -> in_idx
    # want_inplace: output-index which this input should operate on
    # Unlike the Theano variant, we always do it inplace,
    # so the user has to make a copy if this is not the intention.
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:
        out_name = out_info[out_idx]["name"]
        assert out_name not in out_is_ref
        out_is_ref[out_name] = in_idx
    def map_name(v, is_out=False):
      name = v["name"].lower()
      if is_out:
        # Maybe it clashes with some input name. TF doesn't allow the same name.
        if any([v["name"].lower() == name for v in in_info]):
          name = "out_%s" % name
      return name
    def map_type(v, is_out=False):
      t = v.get("dtype", "float32")
      return t
    code_register_op_io = ""
    for v in in_info:
      code_register_op_io += ".Input(\"%s: %s\")\n" % (map_name(v), map_type(v))
    for v in out_info:
      code_register_op_io += ".Output(\"%s: %s\")\n" % (map_name(v, is_out=True), map_type(v, is_out=True))
    code_set_out_shape = ""
    def make_dim_str(c):
      if isinstance(c, tuple):
        in_idx, in_dim = c
        return "c->Dim(c->input(%i), %i)" % (in_idx, in_dim)
      elif isinstance(c, int):
        return str(c)
      else:
        raise Exception("type: %s" % type(c))
    for i, v in enumerate(in_info):
      code_set_out_shape += """
      if(c->Rank(c->input(%(idx)i)) != tensorflow::shape_inference::InferenceContext::kUnknownRank && c->Rank(c->input(%(idx)i)) != %(rank)i)
        return errors::InvalidArgument(
          "wrong rank for input (%(idx)i) '%(name)s'. required %(rank)i but got ", c->Rank(c->input(%(idx)i)));
      """ % {"idx": i, "rank": v["ndim"], "name": v["name"]}
    for i, v in enumerate(out_info):
      code_set_out_shape += "c->set_output(%i, c->MakeShape({%s}));\n" % (
        i, ", ".join([make_dim_str(c) for c in v["shape"]]))
    code_register_op_io += """
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      if(c->num_inputs() != %(num_inputs)i)
        return errors::InvalidArgument("wrong number of inputs. required %(num_inputs)i but got ", c->num_inputs());
      if(c->num_outputs() != %(num_outputs)i)
        return errors::InvalidArgument("wrong number of outputs. required %(num_outputs)i but got ", c->num_outputs());
      %(code_set_out_shape)s
      return Status::OK();
    })
    """ % {
      "num_inputs": len(in_info),
      "num_outputs": len(out_info),
      "code_set_out_shape": code_set_out_shape
    }
    code_forward_io = ""
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:
        code_forward_io += "context->forward_ref_input_to_ref_output(%i, %i);\n" % (in_idx, out_idx)
    code_set_io = ""
    for in_idx, v in enumerate(in_info):
      ndim = len(v["shape"])
      code_set_io += """
      OP_REQUIRES(
        context, context->input(%i).dims() == %i,
        errors::InvalidArgument("shape ndim is not %i, got shape ",
                                context->input(%i).shape().DebugString()));
      """ % (in_idx, ndim, ndim, in_idx)
      for axis, d in enumerate(v["shape"]):
        if isinstance(d, int):
          code_set_io += """
          OP_REQUIRES(
            context, context->input(%i).dim_size(%i) == %i,
            errors::InvalidArgument("shape[%i] != %i, got shape ",
                                    context->input(%i).shape().DebugString()));
          """ % (in_idx, axis, d, axis, d, in_idx)
    code_set_io += """
    Ndarray* inputs[n_inputs];
    Ndarray** outputs[n_outputs];
    """
    for in_idx, v in enumerate(in_info):
      out_idx = v.get("want_inplace", -1)
      if out_idx >= 0:  # is ref
        # mutable_input if it is a ref-type, i.e. a Variable.
        #code_set_io += "Ndarray mutable_input_%i = context->mutable_input(%i, false);\n" % (in_idx, in_idx)
        #code_set_io += "inputs[%i] = &mutable_input_%i;\n" % (in_idx, in_idx)
        # Maybe we could use a TemporaryVariable or so but not sure if the gradient will flow through tf.assign().
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/state_ops.cc
        # but a normal tensor is never mutable, thus create a copy of the input now.
        code_set_io += "Ndarray* output_%i = NULL;\n" % (out_idx,)
        cshape = "TensorShape({%s})" % ", ".join(["context->input(%i).dim_size(%i)" % (in_idx, in_dim)
                                                  for in_dim in range(len(v["shape"]))])
        code_set_io += "OP_REQUIRES_OK(context, context->allocate_output(%i, %s, &output_%i));\n" % (out_idx, cshape, out_idx)
        code_set_io += "inputs[%i] = output_%i;\n" % (in_idx, out_idx)
        # We always make a copy for now.
        # I'm not sure if inplace is an option for TF because we don't know if any other operation in the graph
        # wants to access it. Maybe we can check the reference count or so?
        # Some references for inplace operations:
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/inplace_ops.cc
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/strided_slice_op.cc
        code_set_io += "make_copy(context, inputs[%i], &context->input(%i));\n" % (in_idx, in_idx)
      else:  # no ref
        # TODO: if not on GPU but GPU requested, move to GPU first, maybe via allocate_temp?
        code_set_io += "inputs[%i] = const_cast<Ndarray*>(&context->input(%i));\n" % (in_idx, in_idx)
    for out_idx, v in enumerate(out_info):
      out_name = out_info[out_idx]["name"]
      if out_name in out_is_ref:  # is ref on input
        in_idx = out_is_ref[out_name]
        code_set_io += "outputs[%i] = &inputs[%i];\n" % (out_idx, in_idx)
      else:  # no ref
        code_set_io += "Ndarray* output_%i = NULL;\n" % (out_idx,)
        code_set_io += "outputs[%i] = &output_%i;\n" % (out_idx, out_idx)
        cshape = "TensorShape({%s})" % ", ".join(["inputs[%i]->dim_size(%i)" % (in_idx, in_dim)
                                                  for (in_idx, in_dim) in v["shape"]])
        code_set_io += "OP_REQUIRES_OK(context, context->allocate_output(%i, %s, &output_%i));\n" % (out_idx, cshape, out_idx)
        code_set_io += "Ndarray_set_zero(*outputs[%i]);\n" % out_idx

    code_user = self.description.c_fw_code % {"fail": "assert(false);"}
    code_compute = "\n".join([
      code_forward_io,
      code_set_io,
      code_user])
    register_gpu_kernel_opts = ".Device(DEVICE_GPU)\n"
    for v in in_info:
      if v.get("host_memory", False):
        register_gpu_kernel_opts += """.HostMemory("%s")\n""" % map_name(v)
    format_args = {
      "op_name": self.op_name,
      "code_register_op_io": code_register_op_io,
      "code_forward_io": code_forward_io,
      "code_set_io": code_set_io,
      "code_compute": code_compute,
      "user_code_kernels": self.description._reduce_c_extra_support_code(self.description.c_extra_support_code),
      "native_op_cpp_filename": self.support_native_op_cpp_filename,
      "register_gpu_kernel_opts": register_gpu_kernel_opts,
      "n_inputs": len(in_info),
      "n_outputs": len(out_info)
    }
    code_header = ""
    if self.with_cuda:
      code_header += """
      // For Eigen::GpuDevice.
      #define EIGEN_USE_GPU 1
      """
    code_header += """
    // For Eigen::ThreadPoolDevice.
    #define EIGEN_USE_THREADS 1

    #include "tensorflow/core/framework/op.h"
    #include "tensorflow/core/framework/shape_inference.h"
    #include "tensorflow/core/framework/op_kernel.h"
    #include "tensorflow/core/common_runtime/device.h"
    """
    if self.with_cuda:
      # http://docs.nvidia.com/cuda/cublas
      code_header += """
      #include <cuda.h>
      #include <cuda_runtime.h>
      #include <cublas_v2.h>
      #include <math_constants.h>

      """

      if not self.tf_blas_gemm_workaround:
        # https://github.com/tensorflow/tensorflow/issues/6602 ?
        code_header += '#include "tensorflow/core/platform/stream_executor.h"\n'
    # sgemm
    code_header += """
    typedef float real;
    typedef int integer;
    extern "C" {
    extern int sgemm_(char *transa, char *transb,
      integer *m, integer *n, integer *k,
      const real *alpha,
      const real *a, integer *lda,
      const real *b, integer *ldb,
      const real *beta,
      real *c, integer *ldc);
    }
    """
    code_header += """
    using namespace tensorflow;

    #define _ns  // so _ns::something will use the root namespace
    #define TENSORFLOW 1
    #define CUDA 0
    #include "%(native_op_cpp_filename)s"

    static const int n_inputs = %(n_inputs)i, n_outputs = %(n_outputs)i;

    REGISTER_OP("%(op_name)s")
    %(code_register_op_io)s;
    """ % format_args
    if self.description.cpu_support:
      code_cpu_op = """
      %(user_code_kernels)s
  
      class %(op_name)sOp : public OpKernel {
      public:
        explicit %(op_name)sOp(OpKernelConstruction* context) : OpKernel(context) {}
        void Compute(OpKernelContext* context) override {
          %(code_compute)s
        }
      };
  
      REGISTER_KERNEL_BUILDER(Name("%(op_name)s").Device(DEVICE_CPU), %(op_name)sOp);
      """ % format_args
    else:
      code_cpu_op = ""
    if self.with_cuda:
      code_gpu_op = """
      namespace _gpu {
        #ifdef _ns
          #undef _ns
        #endif
        namespace _ns = ::_gpu;
        #undef CUDA
        #define CUDA 1
        #undef Ndarray_memcpy
        #undef Ndarray_memset
        #undef Ndarray_sgemm
        #undef DEF_KERNEL
        #undef start_dev_kernel
        #undef assert_cmp
        #undef threadIdx
        #undef blockIdx
        #undef blockDim
        #undef gridDim
        #include "%(native_op_cpp_filename)s"

        %(user_code_kernels)s

        class %(op_name)sGpuOp : public OpKernel {
        public:
          explicit %(op_name)sGpuOp(OpKernelConstruction* context) : OpKernel(context) {}
          void Compute(OpKernelContext* context) override {
            %(code_compute)s
          }
        };

        REGISTER_KERNEL_BUILDER(
          Name("%(op_name)s")
          %(register_gpu_kernel_opts)s,
          %(op_name)sGpuOp);
      }
      """ % format_args
    else:
      code_gpu_op = ""
    return code_header + code_cpu_op + code_gpu_op

  def _make_mod(self):
    if self.cache_key in self.mod_cache:
      return self.mod_cache[self.cache_key]
    from Util import find_lib
    # Note about BLAS linkage:
    # TensorFlow (or its Eigen lib) likely has linked against some BLAS lib itself.
    # For our CPU code, we directly call some BLAS functions such as `sgemm_`.
    # On platforms where there is a flat namespace (e.g. Mac),
    # it probably is not needed to explicitly link it again for this module.
    # In other cases, it's probably needed, but it's not so clear which lib has the
    # right symbols (e.g. the `sgemm_` symbol).
    # The current solution is just to link against blas/f77blas
    # (both can potentially have the symbol) if it finds the lib.
    ld_flags = []
    if find_lib("blas"):
      ld_flags += ["-lblas"]
    if find_lib("f77blas"):
      ld_flags += ["-lf77blas"]
    # Another option to find some BLAS lib.
    if self.search_for_numpy_blas:
      import numpy
      numpy_dir = os.path.dirname(numpy.__file__)
      if os.path.exists("%s/.libs" % numpy_dir):
        ld_flags += ["-L%s/.libs" % numpy_dir]
        from glob import glob
        for f in glob("%s/.libs/*.so" % numpy_dir):
          f = os.path.basename(f)
          if f.startswith("lib"):
            f = f[3:]
          if f.endswith(".so"):
            f = f[:-3]
          ld_flags += ["-l%s" % f]
    comp = TFUtil.OpCodeCompiler(
      base_name=self.name, code_version=self.description.code_version,
      code=self._make_code(),
      include_deps=[self.support_native_op_cpp_filename],
      ld_flags=ld_flags,
      use_cuda_if_available=self.with_cuda,
      **dict(self.compiler_opts))
    mod = comp.load_tf_module()
    self.mod_cache[self.cache_key] = mod
    return mod

  def make_op(self):
    with self.global_lock:
      if self.cache_key in self.op_cache:
        return self.op_cache[self.cache_key]
      mod = self._make_mod()
      op = getattr(mod, camel_case_to_snake_case(self.op_name))
      self.op_cache[self.cache_key] = op

      if self.description.is_grad_defined:
        grad_description = self.description.grad()
        grad_op_maker = OpMaker(description=grad_description, compiler_opts=self.compiler_opts,
                                search_for_numpy_blas=self.search_for_numpy_blas)
        grad_op = grad_op_maker.make_op()

        from tensorflow.python.framework import ops
        def grad_wrapper(fwd_op, *bwd_grads):
          """
          :param tf.Operation fwd_op: for fwd_op.inputs and fwd_op.outputs
          :param list[tf.Tensor] bwd_grads:
          :return: list of tensors of gradients for each input
          :rtype: list[tf.Tensor]
          """
          assert len(bwd_grads) == len(fwd_op.outputs)

          grad_inputs = list(fwd_op.inputs) + list(fwd_op.outputs) + list(bwd_grads)
          grad_inputs = self.description._filter_grad_inputs(grad_inputs)
          grad_outputs = TFUtil.make_var_tuple(grad_op(*grad_inputs))
          if grad_description.num_dummy_outs > 0:
            grad_outputs = grad_outputs[:-grad_description.num_dummy_outs]
          grad_outputs = self.description.make_results_of_gradient(grad_outputs)
          return grad_outputs

        grad_wrapper.__name__ = grad_description.name
        grad_wrapper.grad_op = grad_op
        ops.RegisterGradient(self.name)(grad_wrapper)
        op.grad_wrapper = grad_wrapper
        op.grad_op = grad_op

    return op
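
For context, `ops.RegisterGradient` above hooks the wrapper into TensorFlow's Python-side gradient registry; a stripped-down standalone sketch follows, with a made-up op name and gradient op.

from tensorflow.python.framework import ops

@ops.RegisterGradient("MyNativeOp")  # placeholder for the registered op name (self.name above)
def _my_native_op_grad(fwd_op, *bwd_grads):
  # TensorFlow calls this with the forward op plus one incoming gradient per forward output;
  # it must return one gradient (or None) per forward input.
  grad_inputs = list(fwd_op.inputs) + list(fwd_op.outputs) + list(bwd_grads)
  return my_native_grad_op(*grad_inputs)  # assumed compiled gradient op, as in grad_wrapper above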
Example no. 19
import sys
import os

sys.path += [os.path.dirname(os.path.abspath(__file__)) + "/.."]
from nose.tools import assert_equal, assert_is_instance
import contextlib
import unittest
import numpy.testing
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()

from Config import Config
from TFNetwork import *
from TFNetworkLayer import *
from TFEngine import *
from Log import log
import TFUtil
TFUtil.debug_register_better_repr()

log.initialize(verbosity=[5])


@contextlib.contextmanager
def make_scope():
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as session:
            yield session


network = {}
_last = "data"

Example no. 20
import sys
import os

sys.path += [os.path.dirname(os.path.abspath(__file__)) + "/.."]
from nose.tools import assert_equal, assert_is_instance
import contextlib
import unittest
import numpy.testing
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()

from Config import Config
from TFNetwork import *
from TFNetworkLayer import *
from TFEngine import *
from Log import log
import TFUtil
TFUtil.debug_register_better_repr()

log.initialize(verbosity=[5])

@contextlib.contextmanager
def make_scope():
  with tf.Graph().as_default() as graph:
    with tf.Session(graph=graph) as session:
      yield session

network = {}
_last = "data"

def build_resnet(conv_time_dim):
  # network
  # (also defined by num_inputs & num_outputs)
Example no. 21
def build_actor_critic_network(scope, num_action, num_scene):
    # input:
    #   num_action   : number of available actions
    #   num_scene    : number of available scenes ### maybe better to use a list of scene names?
    with tf.variable_scope(scope):
        # get the nodes of input images and output features
        with tf.name_scope('inputs'):
            global_step = tf.placeholder(name='global_step',
                                         shape=None,
                                         dtype=tf.int32)
            state_placeholder = tf.placeholder(name='state',
                                               shape=(None, 2048),
                                               dtype=tf.float32)
            target_placeholder = tf.placeholder(name='target',
                                                shape=(None, 2048),
                                                dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action',
                                                shape=(None, ),
                                                dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value',
                                                 shape=(None, ),
                                                 dtype=tf.float32)
            """
            advantage_placeholder = tf.placeholder(
                name  = 'advantage',
                shape = (None, ),
                dtype = tf.float32)
            """

        # compute embedded feature given the input image feature
        variable_dict = {}
        with tf.variable_scope('shared_layers') as tmp_scope:
            # fc1
            state_flattened = tf.reshape(
                state_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_state = TFUtil.fc_layer(
                'fc1',
                state_flattened,
                input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512,
                variable_dict=variable_dict)
            tmp_scope.reuse_variables()
            target_flattened = tf.reshape(
                target_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_target = TFUtil.fc_layer(
                'fc1',
                target_flattened,
                input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512,
                variable_dict=variable_dict)
        with tf.variable_scope('shared_layers'):
            # fc2
            fc2 = TFUtil.fc_layer('fc2',
                                  tf.concat((fc1_state, fc1_target), axis=1),
                                  input_size=1024,
                                  num_neron=512,
                                  variable_dict=variable_dict)

        # outputs
        policy_logits_dict = {}
        policy_prob_dict = {}
        state_value_dict = {}
        for scene in THORConfig.supported_envs:
            with tf.variable_scope(scene, reuse=False):
                # fc3 shared for policy and value output
                fc3 = TFUtil.fc_layer('fc3',
                                      fc2,
                                      input_size=512,
                                      num_neron=512,
                                      variable_dict=variable_dict)
                # policy output
                policy_logits = TFUtil.fc_layer('policy_logits',
                                                fc3,
                                                input_size=512,
                                                num_neron=num_action,
                                                activation=None,
                                                variable_dict=variable_dict)
                policy_probs = tf.nn.softmax(name='policy_probs',
                                             logits=policy_logits)
                # value output
                state_value = tf.squeeze(TFUtil.fc_layer(
                    'value',
                    fc3,
                    input_size=512,
                    num_neron=1,
                    activation=None,
                    variable_dict=variable_dict),
                                         axis=1)
                # add output to list
                policy_logits_dict[scene] = policy_logits
                policy_prob_dict[scene] = policy_probs
                state_value_dict[scene] = state_value

        local_summaries = []
        with tf.variable_scope('loss'):
            scene_losses = {}
            for scene in THORConfig.supported_envs:
                # policy loss
                log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                    name='log_prob',
                    labels=action_placeholder,
                    logits=policy_logits_dict[scene])
                policy_loss = -tf.reduce_sum(log_prob * tf.stop_gradient(
                    q_value_placeholder -
                    state_value_dict[scene]))  # regularization for A3C delay
                policy_entropy = -0.01 * tf.reduce_sum(
                    policy_prob_dict[scene] * tf.log(
                        tf.clip_by_value(policy_prob_dict[scene], 1e-20, 1)))
                # value_loss
                value_loss = 0.5 * tf.reduce_sum(
                    tf.square(q_value_placeholder - state_value_dict[scene]))
                # need to tweak weight
                scene_losses[scene] = policy_loss + value_loss - policy_entropy
                with tf.name_scope(scene):
                    local_summaries.append(
                        tf.summary.scalar('policy_loss', policy_loss))
                    local_summaries.append(
                        tf.summary.scalar('policy_entropy', policy_entropy))
                    local_summaries.append(
                        tf.summary.scalar('value_loss', value_loss))
        local_summary_op = tf.summary.merge(local_summaries)

    with tf.variable_scope('train_ops', reuse=(scope != "global")):
        train_ops = {}
        # train_op
        # optional: varying learning_rate

        learning_rate = tf.train.polynomial_decay(
            learning_rate=A3CConfig.learning_rate,
            global_step=global_step,
            decay_steps=A3CConfig.decay_step,
            end_learning_rate=A3CConfig.end_learning_rate)

        # create optimizer
        #optimizer = tf.train.AdamOptimizer(learning_rate = A3CConfig.learning_rate)
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=learning_rate,
            decay=A3CConfig.decay_rate,
            epsilon=0.1)  #, momentum = A3CConfig.momentum)

        for scene in THORConfig.supported_envs:
            # get local trainable variables and compute gradients
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=scope)
            grad_var = optimizer.compute_gradients(scene_losses[scene],
                                                   var_list=local_vars)

            # optional: gradient clipping
            clipped_grad_var = []
            for grad, var in grad_var:
                if grad is not None:
                    clipped_grad_var.append((tf.clip_by_value(grad, -40.,
                                                              40.), var))
                else:
                    clipped_grad_var.append((None, var))
            grad_var = clipped_grad_var

            # apply gradient to global variables
            if scope != 'global':
                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope='global')
                tmp = []
                for i in xrange(len(grad_var)):
                    tmp.append((grad_var[i][0], global_vars[i]))
                grad_var = tmp

            train_ops[scene] = optimizer.apply_gradients(grad_var)

        with tf.variable_scope('global', reuse=(scope != "global")):
            reward = tf.get_variable(name='global/reward',
                                     shape=(1, ),
                                     dtype=tf.float32,
                                     trainable=False)
            num_step = tf.get_variable(name='global/num_steps',
                                       shape=(1, ),
                                       dtype=tf.int32,
                                       trainable=False)

    return (global_step, state_placeholder, target_placeholder, action_placeholder, q_value_placeholder), \
           (policy_prob_dict, state_value_dict), \
           (train_ops, local_summary_op), \
           (reward, num_step)
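
The gradient-sharing step above pairs each locally computed gradient with the matching global variable, relying on identical variable ordering in the two scopes; a compact standalone sketch of that pattern, with illustrative names.

import tensorflow as tf

def apply_local_grads_to_global(optimizer, loss, local_scope):
    # compute gradients w.r.t. the local (worker) copy of the network
    local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=local_scope)
    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='global')
    grad_var = optimizer.compute_gradients(loss, var_list=local_vars)
    # optional element-wise gradient clipping, as above
    grad_var = [(tf.clip_by_value(g, -40., 40.) if g is not None else None, v)
                for g, v in grad_var]
    # apply each local gradient to the corresponding shared ("global") variable
    return optimizer.apply_gradients(
        [(g, global_var) for (g, _), global_var in zip(grad_var, global_vars)])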
Example no. 22
def build_dqn(num_action, dueling_dqn=False):
    with tf.variable_scope('DQN'):
        # Prediction Network
        with tf.variable_scope('Prediction'):
            pn_variable_dict = {}
            # inference
            pn_states = tf.placeholder(name='input',
                                       shape=(None, 84, 84, 4),
                                       dtype=tf.float32)
            pn_conv1 = TFUtil.conv_layer('conv1',
                                         pn_states,
                                         shape=[8, 8, 4, 32],
                                         stride=4,
                                         variable_dict=pn_variable_dict)
            pn_conv2 = TFUtil.conv_layer('conv2',
                                         pn_conv1,
                                         shape=[4, 4, 32, 64],
                                         stride=2,
                                         variable_dict=pn_variable_dict)
            pn_conv3 = TFUtil.conv_layer('conv3',
                                         pn_conv2,
                                         shape=[3, 3, 64, 64],
                                         stride=1,
                                         variable_dict=pn_variable_dict)
            pn_conv3_flatten = TFUtil.flatten(pn_conv3,
                                              feature_length=(7 * 7 * 64))
            if dueling_dqn:
                pn_fc4_a = TFUtil.fc_layer('fc4_a',
                                           pn_conv3_flatten,
                                           input_size=(7 * 7 * 64),
                                           num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_value = TFUtil.fc_layer('value',
                                           pn_fc4_a,
                                           input_size=512,
                                           num_neron=1,
                                           activation=None,
                                           variable_dict=pn_variable_dict)
                pn_fc4_b = TFUtil.fc_layer('fc4_b',
                                           pn_conv3_flatten,
                                           input_size=(7 * 7 * 64),
                                           num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_advantage = TFUtil.fc_layer('advantage',
                                               pn_fc4_b,
                                               input_size=512,
                                               num_neron=num_action,
                                               activation=None,
                                               variable_dict=pn_variable_dict)
                pn_Q = (pn_advantage -
                        tf.reshape(tf.reduce_mean(pn_advantage, axis=1),
                                   (-1, 1))) + tf.reshape(pn_value, (-1, 1))
            else:
                pn_fc4 = TFUtil.fc_layer('fc4',
                                         pn_conv3_flatten,
                                         input_size=(7 * 7 * 64),
                                         num_neron=512,
                                         variable_dict=pn_variable_dict)
                pn_Q = TFUtil.fc_layer('Q',
                                       pn_fc4,
                                       input_size=512,
                                       num_neron=num_action,
                                       activation=None,
                                       variable_dict=pn_variable_dict)

            # loss
            pn_q_target = tf.placeholder(name='q_target',
                                         shape=(None, ),
                                         dtype=tf.float32)
            pn_actions = tf.placeholder(name='action',
                                        shape=(None, ),
                                        dtype=tf.int32)
            pn_actions_one_hot = tf.one_hot(pn_actions, depth=num_action)
            pn_delta = tf.reduce_sum(pn_actions_one_hot * pn_Q,
                                     axis=1) - pn_q_target
            pn_loss = tf.reduce_sum(
                TFUtil.huber_loss(pn_delta)) / DQNConfig.batch_size

            # summary
            summary_pn_loss = tf.summary.scalar('pn_loss', pn_loss)
            summary_averaged_pn_Q = tf.summary.scalar('averaged_pn_Q',
                                                      tf.reduce_mean(pn_Q))

            # optimizer
            pn_train = tf.train.RMSPropOptimizer(
                learning_rate=DQNConfig.lr).minimize(pn_loss)

        # Target Network
        with tf.variable_scope('Target'):
            tn_variable_dict = {}
            # inference
            tn_states = tf.placeholder(name='input',
                                       shape=(None, 84, 84, 4),
                                       dtype=tf.float32)
            tn_conv1 = TFUtil.conv_layer('conv1',
                                         tn_states,
                                         shape=[8, 8, 4, 32],
                                         stride=4,
                                         variable_dict=tn_variable_dict)
            tn_conv2 = TFUtil.conv_layer('conv2',
                                         tn_conv1,
                                         shape=[4, 4, 32, 64],
                                         stride=2,
                                         variable_dict=tn_variable_dict)
            tn_conv3 = TFUtil.conv_layer('conv3',
                                         tn_conv2,
                                         shape=[3, 3, 64, 64],
                                         stride=1,
                                         variable_dict=tn_variable_dict)
            tn_conv3_flatten = TFUtil.flatten(tn_conv3,
                                              feature_length=(7 * 7 * 64))
            if dueling_dqn:
                tn_fc4_a = TFUtil.fc_layer('fc4_a',
                                           tn_conv3_flatten,
                                           input_size=(7 * 7 * 64),
                                           num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_value = TFUtil.fc_layer('value',
                                           tn_fc4_a,
                                           input_size=512,
                                           num_neron=1,
                                           activation=None,
                                           variable_dict=tn_variable_dict)
                tn_fc4_b = TFUtil.fc_layer('fc4_b',
                                           tn_conv3_flatten,
                                           input_size=(7 * 7 * 64),
                                           num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_advantage = TFUtil.fc_layer('advantage',
                                               tn_fc4_b,
                                               input_size=512,
                                               num_neron=num_action,
                                               activation=None,
                                               variable_dict=tn_variable_dict)
                tn_Q = (tn_advantage -
                        tf.reshape(tf.reduce_mean(tn_advantage, axis=1),
                                   (-1, 1))) + tf.reshape(tn_value, (-1, 1))
            else:
                tn_fc4 = TFUtil.fc_layer('fc4',
                                         tn_conv3_flatten,
                                         input_size=(7 * 7 * 64),
                                         num_neron=512,
                                         variable_dict=tn_variable_dict)
                tn_Q = TFUtil.fc_layer('Q',
                                       tn_fc4,
                                       input_size=512,
                                       num_neron=num_action,
                                       activation=None,
                                       variable_dict=tn_variable_dict)

        # Network Cloning
        with tf.variable_scope('Prediction_to_Target'):
            network_cloning_ops = []
            assert (tn_variable_dict.keys() == pn_variable_dict.keys())
            for k in tn_variable_dict.keys():
                network_cloning_ops.append(
                    tf.assign(tn_variable_dict[k], pn_variable_dict[k]))

        # Performance Evaluation
        with tf.variable_scope('performance_evaluation'):
            episode_reward = tf.placeholder(name='episode_reward',
                                            shape=(),
                                            dtype=tf.float32)
            summary_avg_episode_reward = tf.summary.scalar(
                'episode_reward', episode_reward)

    return (pn_states, pn_Q, pn_loss, pn_actions, pn_q_target,
            pn_train), (tn_states, tn_Q), network_cloning_ops, (
                summary_pn_loss,
                summary_averaged_pn_Q), (episode_reward,
                                         summary_avg_episode_reward)
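
The dueling heads in the graph above combine a scalar state value with per-action advantages as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); subtracting the mean advantage keeps the value/advantage decomposition identifiable. A minimal NumPy sketch of that aggregation (illustrative only, not part of the example code):

import numpy as np

def dueling_q(value, advantage):
    # value: (batch, 1), advantage: (batch, num_action)
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return value + advantage - advantage.mean(axis=1, keepdims=True)

value = np.array([[1.0], [2.0]])
advantage = np.array([[0.5, -0.5], [1.0, 3.0]])
print(dueling_q(value, advantage))  # each row averages back to the corresponding value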
Esempio n. 23
0
def build_network(scope, num_action, dueling_dqn):
    with tf.variable_scope(scope):
        with tf.variable_scope('Prediction'):
            with tf.name_scope('inputs'):
                # resnet feature
                pn_state_placeholder = tf.placeholder(
                    name='state',
                    shape=(None, config.num_history_frames,
                           2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                # target feature
                pn_target_placeholder = tf.placeholder(
                    name='target',
                    shape=(None, config.num_history_frames,
                           2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                pn_q_target = tf.placeholder(name='q_target',
                                             shape=(None, ),
                                             dtype=tf.float32)
                pn_actions = tf.placeholder(name='action',
                                            shape=(None, ),
                                            dtype=tf.int32)

            # compute embedded feature given the input image feature
            pn_variable_dict = {}
            with tf.variable_scope('shared_layers') as scope:
                # fc1
                state_flattened = tf.reshape(
                    pn_state_placeholder,
                    (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_state = TFUtil.fc_layer(
                    'fc1',
                    state_flattened,
                    input_size=config.num_history_frames * 2048,
                    num_neron=512,
                    variable_dict=pn_variable_dict)  # (n, 512)
                scope.reuse_variables()
                target_flattened = tf.reshape(
                    pn_target_placeholder,
                    (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_target = TFUtil.fc_layer(
                    'fc1',
                    target_flattened,
                    input_size=config.num_history_frames * 2048,
                    num_neron=512,
                    variable_dict=pn_variable_dict)  # (n, 512)
            with tf.variable_scope('shared_layers'):
                # fc2:
                fc2 = TFUtil.fc_layer(
                    'fc2',
                    tf.concat((fc1_state, fc1_target), axis=1),  # (n, 1024)
                    input_size=1024,
                    num_neron=512,
                    variable_dict=pn_variable_dict)  # (n, 512)

            # output: copied and modified from DQNNet.py
            if dueling_dqn:
                pn_fc4_a = TFUtil.fc_layer('fc4_a',
                                           fc2,
                                           input_size=(512),
                                           num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_value = TFUtil.fc_layer('value',
                                           pn_fc4_a,
                                           input_size=512,
                                           num_neron=1,
                                           activation=None,
                                           variable_dict=pn_variable_dict)
                pn_fc4_b = TFUtil.fc_layer('fc4_b',
                                           fc2,
                                           input_size=(512),
                                           num_neron=512,
                                           variable_dict=pn_variable_dict)
                pn_advantage = TFUtil.fc_layer('advantage',
                                               pn_fc4_b,
                                               input_size=512,
                                               num_neron=num_action,
                                               activation=None,
                                               variable_dict=pn_variable_dict)
                pn_Q = (pn_advantage -
                        tf.reshape(tf.reduce_mean(pn_advantage, axis=1),
                                   (-1, 1))) + tf.reshape(pn_value, (-1, 1))
            else:
                pn_fc4 = TFUtil.fc_layer('fc4',
                                         fc2,
                                         input_size=(512),
                                         num_neron=512,
                                         variable_dict=pn_variable_dict)
                pn_Q = TFUtil.fc_layer('Q',
                                       pn_fc4,
                                       input_size=512,
                                       num_neron=num_action,
                                       activation=None,
                                       variable_dict=pn_variable_dict)

            # loss
            pn_actions_one_hot = tf.one_hot(pn_actions, depth=num_action)
            pn_delta = tf.reduce_sum(pn_actions_one_hot * pn_Q,
                                     axis=1) - pn_q_target

            pn_importance_weight = tf.placeholder(name='importance_weight',
                                                  shape=(None, ),
                                                  dtype=tf.float32)
            pn_weighted_delta = tf.multiply(pn_delta, pn_importance_weight)

            pn_loss = tf.reduce_sum(
                TFUtil.huber_loss(pn_delta)) / config.batch_size

            # summary
            summary_pn_loss = tf.summary.scalar('pn_loss', pn_loss)
            summary_averaged_pn_Q = tf.summary.scalar('averaged_pn_Q',
                                                      tf.reduce_mean(pn_Q))

            # optimizer
            pn_train = tf.train.RMSPropOptimizer(
                learning_rate=config.lr).minimize(pn_loss)

        with tf.variable_scope('Target'):
            with tf.name_scope('inputs'):
                # resnet feature
                tn_state_placeholder = tf.placeholder(
                    name='state',
                    shape=(None, config.num_history_frames,
                           2048),  # (n, 4, 2048)
                    dtype=tf.float32)
                # target feature
                tn_target_placeholder = tf.placeholder(
                    name='target',
                    shape=(None, config.num_history_frames,
                           2048),  # (n, 4, 2048)
                    dtype=tf.float32)

            # compute embedded feature given the input image feature
            tn_variable_dict = {}
            with tf.variable_scope('shared_layers') as scope:
                # fc1
                state_flattened = tf.reshape(
                    tn_state_placeholder,
                    (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_state = TFUtil.fc_layer(
                    'fc1',
                    state_flattened,
                    input_size=config.num_history_frames * 2048,
                    num_neron=512,
                    variable_dict=tn_variable_dict)  # (n, 512)
                scope.reuse_variables()
                target_flattened = tf.reshape(
                    tn_target_placeholder,
                    (-1, config.num_history_frames * 2048))  # (n, 4 * 2048)
                fc1_target = TFUtil.fc_layer(
                    'fc1',
                    target_flattened,
                    input_size=config.num_history_frames * 2048,
                    num_neron=512,
                    variable_dict=tn_variable_dict)  # (n, 512)
            with tf.variable_scope('shared_layers'):
                # fc2:
                fc2 = TFUtil.fc_layer(
                    'fc2',
                    tf.concat((fc1_state, fc1_target), axis=1),  # (n, 1024)
                    input_size=1024,
                    num_neron=512,
                    variable_dict=tn_variable_dict)  # (n, 512)

            # output: copied and modified from DQNNet.py
            if dueling_dqn:
                tn_fc4_a = TFUtil.fc_layer('fc4_a',
                                           fc2,
                                           input_size=(512),
                                           num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_value = TFUtil.fc_layer('value',
                                           tn_fc4_a,
                                           input_size=512,
                                           num_neron=1,
                                           activation=None,
                                           variable_dict=tn_variable_dict)
                tn_fc4_b = TFUtil.fc_layer('fc4_b',
                                           fc2,
                                           input_size=(512),
                                           num_neron=512,
                                           variable_dict=tn_variable_dict)
                tn_advantage = TFUtil.fc_layer('advantage',
                                               tn_fc4_b,
                                               input_size=512,
                                               num_neron=num_action,
                                               activation=None,
                                               variable_dict=tn_variable_dict)
                tn_Q = (tn_advantage -
                        tf.reshape(tf.reduce_mean(tn_advantage, axis=1),
                                   (-1, 1))) + tf.reshape(tn_value, (-1, 1))
            else:
                tn_fc4 = TFUtil.fc_layer('fc4',
                                         fc2,
                                         input_size=(512),
                                         num_neron=512,
                                         variable_dict=tn_variable_dict)
                tn_Q = TFUtil.fc_layer('Q',
                                       tn_fc4,
                                       input_size=512,
                                       num_neron=num_action,
                                       activation=None,
                                       variable_dict=tn_variable_dict)

        # Network Cloning
        with tf.variable_scope('Prediction_to_Target'):
            network_cloning_ops = []
            assert (tn_variable_dict.keys() == pn_variable_dict.keys())
            for k in tn_variable_dict.keys():
                network_cloning_ops.append(
                    tf.assign(tn_variable_dict[k], pn_variable_dict[k]))

        # Performance Evaluation
        with tf.variable_scope('performance_evaluation'):
            episode_reward = tf.placeholder(name='episode_reward',
                                            shape=(),
                                            dtype=tf.int32)
            summary_avg_episode_reward = tf.summary.scalar(
                'episode_reward', episode_reward)
            episode_steps = tf.placeholder(name='episode_steps',
                                           shape=(),
                                           dtype=tf.int32)
            summary_avg_episode_steps = tf.summary.scalar(
                'episode_steps', episode_steps)

    return (pn_state_placeholder, pn_target_placeholder, pn_Q, pn_loss,
            pn_actions, pn_q_target, pn_train, pn_importance_weight,
            pn_delta), (tn_state_placeholder, tn_target_placeholder,
                        tn_Q), network_cloning_ops, (
                            summary_pn_loss, summary_averaged_pn_Q), (
                                episode_reward, summary_avg_episode_reward,
                                episode_steps, summary_avg_episode_steps)
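
Note that the graph above computes pn_weighted_delta from the importance weights but the loss itself sums the unweighted Huber terms. If the weights are meant to correct for prioritized sampling, one common formulation (a hypothetical drop-in variant of the pn_loss line, not the author's code) scales each per-sample Huber term by its weight:

# hypothetical variant: apply the importance-sampling weights inside the loss
pn_loss = tf.reduce_sum(
    pn_importance_weight * TFUtil.huber_loss(pn_delta)) / config.batch_size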
Esempio n. 24
0
    def learn(self,
              double_dqn,
              dueling_dqn,
              model_save_frequency,
              model_save_path,
              model_load_path,
              use_gpu,
              gpu_id,
              summary_folder):
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.device(device_str):
            self._init_dqn(dueling_dqn)
            # initialize all variables
            init = tf.global_variables_initializer()
            # create auxiliary operations: summary and saver
            summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(summary_folder)

        # initialize
        if model_load_path is None:
            self._tf_sess.run(init)
        else:
            self.load(model_load_path)

        # start new episode
        self._request_new_episode()

        # first take some random actions to populate the replay memory before learning starts
        if model_load_path is None:
            print('Taking random actions to warm up...')
            for i in range(config.replay_start_size):
                if i % 100 == 0:
                    print('{0}/{1}'.format(i, config.replay_start_size))
                done = self._perform_random_action()
                if done:
                    self._request_new_episode()

        print('Training started, please open Tensorboard to monitor the training process.')
        episode_count = 0
        for i in range(config.max_iterations):

            if i % 1000 == 0:
                print('{0}/{1}'.format(i, config.max_iterations))

            # save model
            if i % model_save_frequency == 0 and i != 0:
                self.save(model_save_path)

            # update target_network
            if i % config.target_network_update_freq == 0:
                self._tf_sess.run(self._tf_clone_ops)

            # select and perform action
            state = self._get_current_state()
            state = state[np.newaxis]
            target = self._cur_target[np.newaxis]
            Q = self._evaluate_q(state, target)
            a = self._select_action(Q, i, test_mode=False)
            self._perform_action(a)
            if self._env.episode_done():
                episode_reward = self._env.get_total_episode_reward()
                summary_episode_reward = self._tf_sess.run(
                    self._tf_summary_episode_reward,
                    feed_dict={self._tf_episode_reward: episode_reward})
                summary_writer.add_summary(summary_episode_reward, global_step=episode_count)

                episode_steps = self._env.get_steps_count()
                summary_episode_steps = self._tf_sess.run(
                    self._tf_summary_episode_steps,
                    feed_dict={self._tf_episode_steps: episode_steps})
                summary_writer.add_summary(summary_episode_steps, global_step=episode_count)
                episode_count += 1
                self._request_new_episode()


            # sample mini-batch and perform training
            if config.prioritized_experience_replay:
                experiences, weights, indices = self._replay_memory.sample(training_step=i)
            else:
                experiences = self._replay_memory.sample(config.batch_size)
            states, targets, actions, states_new, rewards, dones = DQNAgent.decompose_experiences(experiences)

            # compute q_targets
            if double_dqn:
                # double DQN: use prediction network for action selection,
                # use target network for action's Q value evaluation
                q_new_p = self._tf_sess.run(
                    self._tf_pn_Q,
                    feed_dict={self._tf_pn_state: states_new,
                               self._tf_pn_target: targets})
                action = np.argmax(q_new_p, axis=1)
                q_new_t = self._tf_sess.run(
                    self._tf_tn_Q,
                    feed_dict={self._tf_tn_state: states_new,
                               self._tf_tn_target: targets})
                q_new_max = np.array(
                    [q_new_t[j, action[j]] for j in range(config.batch_size)])
            else:
                # DQN: use target network for action selection and evaluation
                q_new = self._tf_sess.run(
                    self._tf_tn_Q,
                    feed_dict={self._tf_tn_state: states_new,
                               self._tf_tn_target: targets})
                q_new_max = np.max(q_new, axis=1)
            q_targets = rewards + q_new_max * config.discounted_factor * (
                1. - dones.astype(np.int))

            if not config.prioritized_experience_replay:
                weights = np.array([1.0] * config.batch_size)

            # train
            _, loss, delta, summary_pn_loss, summary_averaged_pn_Q = self._tf_sess.run(
                [self._tf_pn_train, self._tf_pn_loss, self._tf_pn_delta,
                 self._tf_summary_pn_loss, self._tf_summary_averaged_pn_Q],
                feed_dict={self._tf_pn_actions: actions,
                           self._tf_pn_state: states,
                           self._tf_pn_target: targets,
                           self._tf_pn_Q_target: q_targets,
                           self._tf_pn_importance_weight: weights})

            if config.prioritized_experience_replay:
                self._replay_memory.update_priority(indices, delta)

            summary_writer.add_summary(summary_pn_loss, global_step=i)
            summary_writer.add_summary(summary_averaged_pn_Q, global_step=i)
        # save model after training
        self.save(model_save_path)
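
The training loop above forms the TD target as reward + gamma * Q(s', a') with the bootstrap term zeroed on terminal transitions; for double DQN the action is selected with the prediction network and evaluated with the target network. A minimal NumPy sketch of that computation (gamma plays the role of config.discounted_factor; names are placeholders):

import numpy as np

def compute_q_targets(rewards, q_new_t, dones, gamma, q_new_p=None):
    # q_new_t: target-network Q values for the next states, shape (batch, num_action)
    # q_new_p: prediction-network Q values for double DQN, or None for vanilla DQN
    if q_new_p is not None:
        actions = np.argmax(q_new_p, axis=1)                     # select with prediction net
        q_new_max = q_new_t[np.arange(len(actions)), actions]    # evaluate with target net
    else:
        q_new_max = np.max(q_new_t, axis=1)                      # select and evaluate with target net
    # no bootstrapping past terminal states
    return rewards + gamma * q_new_max * (1.0 - dones.astype(np.float32))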
Esempio n. 25
0
def test_pool_layer_NCHW():
  with make_scope() as session:
    import numpy as np
    net = TFNetwork(extern_data=ExternData())
    with tf.variable_scope("src_nhwc"):
      src_nhwc = InternalLayer(name="src_nhwc", network=net, out_type={"dim": 16,
                                                                       "shape": (None, 16, 16),
                                                                       "batch_dim_axis": 0,
                                                                       "time_dim_axis": 1,
                                                                       "feature_dim_axis": 3,
                                                                       "sparse": False
                                                                       })
      src_nhwc.output.placeholder = tf.placeholder(shape=(None, None, 16, 16), dtype=tf.float32)
      src_nhwc.output.size_placeholder = {0: tf.placeholder(shape=(None,), dtype=tf.int32)}
    with tf.variable_scope("src_nchw"):
      src_nchw = InternalLayer(name="src_nchw", network=net, out_type={"dim": 16,
                                                                       "shape": (16, None, 16),
                                                                       "batch_dim_axis": 0,
                                                                       "time_dim_axis": 2,
                                                                       "feature_dim_axis": 1,
                                                                       "sparse": False
                                                                       })
      src_nchw.output.placeholder = tf.placeholder(shape=(None, 16, None, 16), dtype=tf.float32)
      src_nchw.output.size_placeholder = {1: tf.placeholder(shape=(None,), dtype=tf.int32)}

    pool_size = (5, 5)
    strides = (1, 2)
    padding = "VALID"

    with tf.variable_scope("pool_nhwc_from_nhwc"):
      pool_nhwc_from_nhwc = PoolLayer(
        name="pool_nhwc_from_nhwc", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=False, sources=[src_nhwc],
        output=PoolLayer.get_out_data_from_opts(name="pool_nhwc_from_nhwc",
                                                pool_size=pool_size, padding=padding,
                                                use_channel_first=False,
                                                network=net, sources=[src_nhwc]))
    with tf.variable_scope("pool_nchw_from_nhwc"):
      pool_nchw_from_nhwc = PoolLayer(
        name="pool_nchw_from_nhwc", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=True, sources=[src_nhwc],
        output=PoolLayer.get_out_data_from_opts(name="pool_nchw_from_nhwc",
                                                pool_size=pool_size, padding=padding,
                                                use_channel_first=True,
                                                network=net, sources=[src_nhwc]))
    with tf.variable_scope("pool_nchw_from_nchw"):
      pool_nchw_from_nchw = PoolLayer(
        name="pool_nchw_from_nchw", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=True, sources=[src_nchw],
        output=PoolLayer.get_out_data_from_opts(name="pool_nchw_from_nchw",
                                                pool_size=pool_size, padding=padding,
                                                use_channel_first=True,
                                                network=net, sources=[src_nchw]))
    with tf.variable_scope("pool_nhwc_from_nchw"):
      pool_nhwc_from_nchw = PoolLayer(
        name="pool_nhwc_from_nchw", network=net, mode="max", pool_size=pool_size,
        padding=padding, strides=strides, use_channel_first=False, sources=[src_nchw],
        output=PoolLayer.get_out_data_from_opts(name="pool_nhwc_from_nchw",
                                                pool_size=pool_size, padding=padding,
                                                use_channel_first=False,
                                                network=net, sources=[src_nchw]))
    tf.global_variables_initializer().run()
    out, seq_lens = session.run([pool_nhwc_from_nhwc.output.placeholder,
                                 pool_nhwc_from_nhwc.output.size_placeholder[0]],
                                feed_dict={src_nhwc.output.placeholder: np.random.rand(10, 11, 16, 16),
                                           src_nhwc.output.size_placeholder[0]: np.full(shape=(10,), fill_value=11)}
                                )
    print(out.shape)
    assert_equal(out.shape, (10, 7, 6, 16))
    print(seq_lens)
    time_dim_axis = 1 if TFUtil.is_gpu_available() else 0
    out, seq_lens = session.run([pool_nchw_from_nhwc.output.placeholder,
                                 pool_nchw_from_nhwc.output.size_placeholder[time_dim_axis]],
                                feed_dict={src_nhwc.output.placeholder: np.random.rand(10, 11, 16, 16),
                                           src_nhwc.output.size_placeholder[0]: np.full(shape=(10,), fill_value=11)
                                })
    print(out.shape)
    if time_dim_axis == 1:
      assert_equal(out.shape, (10, 16, 7, 6))
    else:
      assert_equal(out.shape, (10, 7, 6, 16))
    print(seq_lens)
    if TFUtil.is_gpu_available():
      out, seq_lens = session.run([pool_nchw_from_nchw.output.placeholder,
                                   pool_nchw_from_nchw.output.size_placeholder[1]],
                                  feed_dict={src_nchw.output.placeholder: np.random.rand(10, 16, 11, 16),
                                             src_nchw.output.size_placeholder[1]: np.full(shape=(10,), fill_value=11)
                                  })
      print(out.shape)
      assert_equal(out.shape, (10, 16, 7, 6))
      print(seq_lens)
    out, seq_lens = session.run([pool_nhwc_from_nchw.output.placeholder,
                                 pool_nhwc_from_nchw.output.size_placeholder[0]],
                                feed_dict={src_nchw.output.placeholder: np.random.rand(10, 16, 11, 16),
                                           src_nchw.output.size_placeholder[1]: np.full(shape=(10,), fill_value=11)}
                                )
    print(out.shape)
    assert_equal(out.shape, (10, 7, 6, 16))
    print(seq_lens)
Esempio n. 26
0
# start test like this:  nosetests-2.7  tests/test_TFEngine.py

import logging
logging.getLogger('tensorflow').disabled = True
import tensorflow as tf
import sys
sys.path += ["."]  # Python 3 hack
from TFEngine import *
import Util
import TFUtil
TFUtil.debugRegisterBetterRepr()
from Config import Config
from nose.tools import assert_equal, assert_is_instance
import numpy
import numpy.testing
import os
from pprint import pprint
import better_exchook
better_exchook.replace_traceback_format_tb()
from Log import log
log.initialize(verbosity=[5])

session = tf.InteractiveSession()


def test_DataProvider():
    """
  :param Dataset.Dataset dataset:
  :param int seq_idx:
  :param str|None output_layer_name: e.g. "output". if not set, will read from config "forward_output_layer"
  :return: numpy array, output in time major format (time,batch,dim)
Esempio n. 27
0
def build_actor_critic_network(num_action):
    with tf.variable_scope('actor_critic_network'):
        # Inputs
        with tf.name_scope('inputs'):
            state_placeholder = tf.placeholder(name='state',
                                               shape=(None, 84, 84, 4),
                                               dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action',
                                                shape=(None, ),
                                                dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value',
                                                 shape=(None, ),
                                                 dtype=tf.float32)
            advantage_placeholder = tf.placeholder(name='advantage',
                                                   shape=(None, ),
                                                   dtype=tf.float32)

        # Main network
        with tf.variable_scope('shared_network'):
            variable_dict = {}
            # inference
            conv1 = TFUtil.conv_layer('conv1',
                                      state_placeholder,
                                      shape=[8, 8, 4, 32],
                                      stride=4,
                                      variable_dict=variable_dict)
            conv2 = TFUtil.conv_layer('conv2',
                                      conv1,
                                      shape=[4, 4, 32, 64],
                                      stride=2,
                                      variable_dict=variable_dict)
            conv3 = TFUtil.conv_layer('conv3',
                                      conv2,
                                      shape=[3, 3, 64, 64],
                                      stride=1,
                                      variable_dict=variable_dict)
            conv3_flatten = TFUtil.flatten(conv3, feature_length=(7 * 7 * 64))

        # outputs
        with tf.variable_scope('actor_network'):
            fc4_actor = TFUtil.fc_layer('fc4_actor',
                                        conv3_flatten,
                                        input_size=(7 * 7 * 64),
                                        num_neron=512,
                                        variable_dict=variable_dict)
            actor_logits = TFUtil.fc_layer('logits',
                                           fc4_actor,
                                           input_size=512,
                                           num_neron=num_action,
                                           activation=None,
                                           variable_dict=variable_dict)
            policy_probs = tf.nn.softmax(name='policy_probs',
                                         logits=actor_logits)

        with tf.variable_scope('critic_network'):
            fc4_critic = TFUtil.fc_layer('fc4_critic',
                                         conv3_flatten,
                                         input_size=(7 * 7 * 64),
                                         num_neron=512,
                                         variable_dict=variable_dict)
            state_value = tf.squeeze(TFUtil.fc_layer(
                'value',
                fc4_critic,
                input_size=512,
                num_neron=1,
                activation=None,
                variable_dict=variable_dict),
                                     axis=1)

        with tf.variable_scope('loss'):
            # policy loss
            log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                name='log_prob',
                labels=action_placeholder,
                logits=actor_logits)
            policy_loss = -tf.reduce_sum(
                log_prob * advantage_placeholder) / ACConfig.batch_size
            policy_entropy = -tf.reduce_sum(
                policy_probs *
                tf.log(policy_probs + 1e-15)) / ACConfig.batch_size
            # value_loss
            value_loss = tf.reduce_sum(
                tf.square(q_value_placeholder -
                          state_value)) / ACConfig.batch_size
            # need to tweak weight
            loss = policy_loss + 0.5 * value_loss - 0.0005 * policy_entropy

        # train_op
        optimizer = tf.train.AdamOptimizer(learning_rate=ACConfig.lr)
        """
        grad_var = optimizer.compute_gradients(loss)
        clipped_grad_var = [(tf.clip_by_value(grad, -10., 10.), var) for grad, var in grad_var]
        train_op = optimizer.apply_gradients(clipped_grad_var)
        """
        train_op = optimizer.minimize(loss)

        # sample_action
        sample_action = tf.multinomial(actor_logits, 1)

        # reward_history
        with tf.name_scope('reward_history'):
            reward_history_placeholder = tf.placeholder(name='reward_history',
                                                        shape=(None, ),
                                                        dtype=tf.float32)
            average_reward = tf.reduce_mean(reward_history_placeholder)

        # summary
        with tf.name_scope('summary'):
            tf.summary.scalar('average_reward_over_100_episodes',
                              average_reward)
            tf.summary.scalar('policy_loss', policy_loss)
            tf.summary.scalar('policy_entropy', policy_entropy)
            tf.summary.scalar('value_loss', value_loss)
            tf.summary.scalar('loss', loss)

    return (state_placeholder, action_placeholder, q_value_placeholder, advantage_placeholder, reward_history_placeholder), \
           (train_op, sample_action), (actor_logits, state_value, average_reward)
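
The loss in the actor-critic graph above is a weighted sum of a policy-gradient term (negative log-probability of the taken action times the advantage), a value regression term, and an entropy bonus. A small NumPy sketch of those three terms with the same 0.5 / 0.0005 weights (illustrative only; the argument names are placeholders):

import numpy as np

def actor_critic_loss(log_probs, advantages, values, q_values, probs, batch_size):
    # log_probs: log pi(a_t | s_t) for the taken actions, shape (N,)
    # probs: full action distributions, shape (N, num_action)
    policy_loss = -np.sum(log_probs * advantages) / batch_size
    value_loss = np.sum((q_values - values) ** 2) / batch_size
    entropy = -np.sum(probs * np.log(probs + 1e-15)) / batch_size
    return policy_loss + 0.5 * value_loss - 0.0005 * entropy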
Esempio n. 28
0
    def learn(self,
              model_save_frequency,
              model_save_path,
              check_point,
              use_gpu,
              gpu_id=None):
        device_str = TFUtil.get_device_str(use_gpu=use_gpu, gpu_id=gpu_id)
        with tf.Graph().as_default():
            with tf.device(device_str):
                # build AC network
                self._init_acn()

                # initialize all variables
                init = tf.global_variables_initializer()

                # create auxiliary operations: summary and saver
                summary_op = tf.summary.merge_all()
                summary_writer = tf.summary.FileWriter(ACConfig.summary_folder,
                                                       self._tf_sess.graph)

            # initialize
            if check_point is None:
                self._tf_sess.run(init)
                episode_idx = 0
            else:
                self.load(model_save_path, check_point)
                episode_idx = check_point

            reward_history = deque(maxlen=100)
            state = self._request_new_episode()
            total_rewards = 0

            # start training
            print(
                'Training started, please open Tensorboard to monitor the training process.'
            )

            while episode_idx < ACConfig.max_iterations:
                # sample and perform action, store history
                action = self._select_action(state, episode_idx)
                next_state, reward, done = self._perform_action(state, action)
                total_rewards += reward
                state = next_state

                #if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
                #    print (('ep %d: game finished, reward: %f' % (episode_idx + 1, reward)) + ('' if reward == -1 else ' !!!!!!!!'))

                if done:
                    episode_idx += 1
                    # record reward and reset
                    reward_history.append(total_rewards)
                    print("Reward for episode {}: {}".format(
                        episode_idx, total_rewards))
                    state = self._request_new_episode()
                    total_rewards = 0

                    # start training once a full batch of episodes has been collected
                    if episode_idx % ACConfig.batch_size == 0:
                        states = self._history_buffer._state_buffer
                        actions = self._history_buffer._action_buffer
                        q_values, advantages = self._history_buffer.compute_q_value_and_advantages(
                        )

                        _, average_reward, summary = \
                        self._tf_sess.run([self._tf_acn_train_op, self._tf_average_reward, summary_op],
                                            feed_dict={self._tf_acn_state: states,
                                                       self._tf_acn_action: actions,
                                                       self._tf_acn_q_value: q_values,
                                                       self._tf_acn_advantage: advantages,
                                                       self._tf_reward_history: reward_history
                                                      })
                        self._history_buffer.clean_up()

                        # record summary
                        summary_writer.add_summary(summary,
                                                   global_step=episode_idx)

                        print("Episode {}".format(episode_idx))
                        print(
                            "Average reward for last 100 episodes: {}".format(
                                average_reward))

                    if (episode_idx % model_save_frequency == 0
                            or episode_idx == ACConfig.max_iterations):
                        print("Model saved after {} episodes".format(
                            episode_idx))
                        self.save(model_save_path, global_step=episode_idx)
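
compute_q_value_and_advantages is not shown in this example; a typical implementation (an assumption about its behaviour, not the author's code) discounts the collected rewards backwards in time and subtracts the critic's value estimates to obtain advantages:

import numpy as np

def discounted_returns_and_advantages(rewards, values, gamma=0.99):
    # hypothetical helper: discounted Monte-Carlo returns and baseline-subtracted advantages
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    advantages = returns - np.asarray(values, dtype=np.float32)
    return returns, advantages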
Esempio n. 29
0
def build_actor_critic_network(scope, num_action, num_scene):
    # input:
    #   num_action   : number of available actions
    #   num_scene    : number of available scenes  ### maybe better to use a list of scene names?
    # Inputs
    with tf.variable_scope(scope):
        # get the nodes of input images and output features
        with tf.name_scope('inputs'):
            num_frames = tf.placeholder(name='num_frames',
                                        shape=None,
                                        dtype=tf.int32)
            state_placeholder = tf.placeholder(name='state',
                                               shape=(None, 2048),
                                               dtype=tf.float32)
            target_placeholder = tf.placeholder(name='target',
                                                shape=(None, 2048),
                                                dtype=tf.float32)
            action_placeholder = tf.placeholder(name='taken_action',
                                                shape=(None, ),
                                                dtype=tf.int32)
            q_value_placeholder = tf.placeholder(name='q_value',
                                                 shape=(None, ),
                                                 dtype=tf.float32)
            advantage_placeholder = tf.placeholder(name='advantage',
                                                   shape=(None, ),
                                                   dtype=tf.float32)
            scene_placeholder = tf.placeholder(name='current_scene',
                                               shape=(None, num_scene),
                                               dtype=tf.float32)

        # compute embedded feature given the input image feature
        variable_dict = {}
        with tf.variable_scope('shared_layers') as scope:
            # fc1
            state_flattened = tf.reshape(
                state_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_state = TFUtil.fc_layer(
                'fc1',
                state_flattened,
                input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512,
                variable_dict=variable_dict)
            scope.reuse_variables()
            target_flattened = tf.reshape(
                target_placeholder, (-1, A3CConfig.num_history_frames * 2048))
            fc1_target = TFUtil.fc_layer(
                'fc1',
                target_flattened,
                input_size=A3CConfig.num_history_frames * 2048,
                num_neron=512,
                variable_dict=variable_dict)
        with tf.variable_scope('shared_layers'):
            # fc2
            fc2 = TFUtil.fc_layer('fc2',
                                  tf.concat((fc1_state, fc1_target), axis=1),
                                  input_size=1024,
                                  num_neron=512,
                                  variable_dict=variable_dict)

        # outputs
        policy_logits_list = []
        policy_prob_list = []
        state_value_list = []
        for i in xrange(num_scene):
            with tf.variable_scope(THORConfig.supported_envs[i], reuse=False):
                # fc3 shared for policy and value output
                fc3 = TFUtil.fc_layer('fc_3_{0}'.format(i),
                                      fc2,
                                      input_size=512,
                                      num_neron=512,
                                      variable_dict=variable_dict)
                # policy output
                policy_logits = TFUtil.fc_layer('policy_logits',
                                                fc3,
                                                input_size=512,
                                                num_neron=num_action,
                                                activation=None,
                                                variable_dict=variable_dict)
                policy_probs = tf.nn.softmax(name='policy_probs',
                                             logits=policy_logits)
                # value output
                state_value = tf.squeeze(TFUtil.fc_layer(
                    'value',
                    fc3,
                    input_size=512,
                    num_neron=1,
                    activation=None,
                    variable_dict=variable_dict),
                                         axis=1)
                # add output to list
                policy_logits_list.append(policy_logits)
                policy_prob_list.append(policy_probs)
                state_value_list.append(state_value)

        with tf.variable_scope('loss'):
            scene_loss = []
            for i in xrange(num_scene):
                # policy loss
                log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                    name='log_prob',
                    labels=action_placeholder,
                    logits=policy_logits_list[i])
                policy_loss = -tf.reduce_sum(
                    tf.maximum(log_prob, tf.log(1e-6)) *
                    advantage_placeholder)  # regularization for A3C delay
                policy_entropy = -0.01 * tf.reduce_sum(
                    policy_prob_list[i] * tf.log(policy_prob_list[i] + 1e-20))
                # value_loss
                value_loss = 0.5 * tf.reduce_sum(
                    tf.square(q_value_placeholder - state_value_list[i]))
                # need to tweak weight
                scene_loss.append(policy_loss + value_loss - policy_entropy)
                with tf.name_scope('scene_summary_{0}'.format(
                        THORConfig.supported_envs[i])):
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('policy_entropy', policy_entropy)
                    tf.summary.scalar('value_loss', value_loss)

            loss = tf.reduce_sum(
                tf.transpose(scene_loss) *
                scene_placeholder)  # scene_placeholder is one-hot vectors

            # train_op
            # optional: varying learning_rate
            """
            learning_rate = tf.train.exponential_decay(
                learning_rate = A3CConfig.learning_rate, 
                global_step   = global_step, 
                decay_steps   = A3CConfig.decay_step,
                decay_rate    = A3CConfig.decay_rate)
            """
            # create optimizer
            #optimizer = tf.train.AdamOptimizer(learning_rate = A3CConfig.learning_rate)
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=A3CConfig.learning_rate,
                decay=A3CConfig.decay_rate,
                epsilon=0.1)  #, momentum = A3CConfig.momentum)
            train_ops = []
            for i in xrange(num_scene):
                clipped_grad_var = []
                grad_var = optimizer.compute_gradients(scene_loss[i])
                for grad, var in grad_var:
                    if grad is not None:
                        clipped_grad_var.append(
                            (tf.clip_by_value(grad, -10., 10.), var))
                    else:
                        clipped_grad_var.append((None, var))
                grad_var = clipped_grad_var
                train_ops.append(optimizer.apply_gradients(grad_var))
            """
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = scope)
            grad_var = optimizer.compute_gradients(loss, var_list = local_vars)
            
            # optional: gradient clipping
            
            clipped_grad_var = []
            for grad, var in grad_var:
                if grad is not None:
                    clipped_grad_var.append((tf.clip_by_value(grad, -10., 10.), var))
                else:
                    clipped_grad_var.append((None, var))
            grad_var = clipped_grad_var
            
                
            
            if scope != 'global':
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'global')
                tmp = []
                for i in xrange(len(grad_var)):
                    tmp.append((grad_var[i][0], global_vars[i]))
                grad_var = tmp
                    
                
            train_op = optimizer.apply_gradients(grad_var)
            """

        # ops to sample_action using multinomial distribution given unnomalized log probability logits
        action_sampler_ops = []
        for i in xrange(num_scene):
            action_sampler_ops.append(
                tf.multinomial(name='action_sampler_{0}'.format(
                    THORConfig.supported_envs[i]),
                               logits=policy_logits_list[i],
                               num_samples=1))

        # reward_history
        with tf.name_scope('reward_history'):
            reward_history_placeholder = tf.placeholder(name='reward_history',
                                                        shape=(None, ),
                                                        dtype=tf.float32)
            average_reward = tf.reduce_mean(reward_history_placeholder)

        # step_history
        with tf.name_scope('step_number_history'):
            step_history_placeholder = tf.placeholder(name='step_history',
                                                      shape=(None, ),
                                                      dtype=tf.float32)
            average_step = tf.reduce_mean(step_history_placeholder)

        # summary
        with tf.name_scope('summary'):
            tf.summary.scalar('average_reward_over_100_episodes',
                              average_reward)
            tf.summary.scalar('average_steps_over_100_episodes', average_step)
            tf.summary.scalar('loss', loss)

        return (num_frames, state_placeholder, target_placeholder, action_placeholder, q_value_placeholder, advantage_placeholder, \
                scene_placeholder, reward_history_placeholder, step_history_placeholder), \
               (train_ops, action_sampler_ops), \
               (policy_logits_list, state_value_list, average_reward, average_step)
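
In the multi-scene network above, each scene owns its own head and loss, and the combined loss keeps only the active scene's term by multiplying the stacked per-scene losses with the one-hot scene placeholder. A tiny NumPy sketch of that selection (illustrative only):

import numpy as np

scene_losses = np.array([0.7, 1.2, 0.3])      # one loss per scene
scene_one_hot = np.array([[0.0, 1.0, 0.0]])   # current scene, batch of 1
loss = np.sum(scene_losses * scene_one_hot)   # only the active scene contributes
print(loss)  # 1.2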