def __init__(self, global_step):
    device = "/cpu:0"  # use CPU for display tool
    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    global_network = DRLNetwork(action_size=4,
                                device=device,
                                network_scope=network_scope,
                                scene_scopes=scene_scopes)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()

    # Read network from checkpoint file
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")

    # Read checkpoint directly, 'meta' file
    # saver.restore(sess, checkpoint.model_checkpoint_path)

    scene_stats = dict()
    self.results = dict()
    for scene_scope in scene_scopes:
        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:
            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope)
            })

            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]

            for i_episode in range(NUM_EVAL_EPISODES):
                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0

                while not terminal:
                    pi_values = global_network.run_policy(
                        sess, env.s_t, env.target, scopes)
                    pi_values = np.array(pi_values) / np.sum(pi_values)
                    action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
                    env.step(action)

                    terminal = env.terminal
                    if ep_t == 10000:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)
                if VERBOSE:
                    print("episode #{} ends after {} steps".format(i_episode, ep_t))

            print('evaluation: %s %s' % (scene_scope, task_scope))
            print('mean episode reward: %.2f' % np.mean(ep_rewards))
            print('mean episode length: %.2f' % np.mean(ep_lengths))
            print('mean episode collision: %.2f' % np.mean(ep_collisions))

            scene_stats[scene_scope].extend(ep_lengths)

    print('\nResults (average trajectory length):')
    for scene_scope in scene_stats:
        self.results[scene_scope] = np.mean(scene_stats[scene_scope])
        print('%s: %.2f steps' % (scene_scope, self.results[scene_scope]))

    with open("./Evaluation/result_%d.txt" % global_step, 'wb') as fp:
        pickle.dump(self.results, fp)
class A3C_Thread(object):

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, network_scope="network", scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index  # Number the thread
        self._set_local_network(device, network_scope, scene_scope, task_scope)  # Set local network
        self.sync = self.local_network.sync_from(global_network)  # Synchronize from the global network
        self.learning_rate_input = learning_rate_input  # Set learning rate
        self.max_global_time_step = max_global_time_step  # Set maximum global time step
        self._set_trainer_optimizer(device, global_network, grad_applier)  # Set trainer
        self._set_environment(initial_learning_rate)  # Set environment

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope, task_scope):
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set the Actor-Critic gradient accumulator and optimizer
    # (uses the accumulating trainer from Zhu et al.)
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    # Set environment and per-episode bookkeeping
    def _set_environment(self, initial_learning_rate):
        self.episode_max_q = -np.inf
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    # Choose one action according to the pi values
    def choose_action(self, pi_values):
        action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
        return action

    # Take LOCAL_T_MAX steps in one process
    # and update the accumulated gradients
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        start_local_t = self.local_t

        # Initialization
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        terminal_end = False

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)

        # Obtain shared parameters from global network
        sess.run(self.sync)

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            pi_ = np.array(pi_) / np.sum(pi_)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q = %.3f\n" %
                    (self.thread_index, global_t, self.thread_index,
                     self.scene_scope, self.task_scope,
                     self.scene_scope, self.task_scope, self.episode_reward,
                     self.scene_scope, self.task_scope, self.episode_length,
                     self.scene_scope, self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values, targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" %
                             (self.thread_index, self.scene_scope, self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate
class Train(object):

    def __init__(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

        # self.evluation_gap = 10**6
        print(MAX_TIME_STEP)
        self.device = "/gpu:0" if USE_GPU else "/cpu:0"
        self.network_scope = TASK_TYPE
        self.list_of_tasks = TASK_LIST
        self.scene_scopes = self.list_of_tasks.keys()
        self.global_t = 0
        self.stop_requested = False

        self.initial_learning_rate = self.log_uniform(LR_ALPHA_LOW,
                                                      LR_ALPHA_HIGH,
                                                      LR_ALPHA_LOG_RATE)

        self.global_network = DRLNetwork(action_size=ACTION_SIZE,
                                         device=self.device,
                                         network_scope=self.network_scope,
                                         scene_scopes=self.scene_scopes)

        self.branches = []
        for scene in self.scene_scopes:
            for task in self.list_of_tasks[scene]:
                self.branches.append((scene, task))

        self.NUM_TASKS = len(self.branches)
        assert NUM_THREADS >= self.NUM_TASKS, \
            "Not enough threads for multitasking: at least {} threads needed.".format(self.NUM_TASKS)

        self.learning_rate_input = tf.placeholder("float")
        self.grad_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=RMSP_ALPHA,
            momentum=0.0,
            epsilon=RMSP_EPSILON,
            clip_norm=GRAD_NORM_CLIP,
            device=self.device)

        # instantiate each training thread
        # each thread is training for one target in one scene
        self.training_threads = []
        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            training_thread = ADQN_Thread(i,
                                          self.global_network,
                                          self.initial_learning_rate,
                                          self.learning_rate_input,
                                          self.grad_applier,
                                          MAX_TIME_STEP,
                                          device=self.device,
                                          network_scope="thread-%d" % (i + 1),
                                          scene_scope=scene,
                                          task_scope=task)
            self.training_threads.append(training_thread)

    def log_uniform(self, lo, hi, rate):
        log_lo = np.log(lo)
        log_hi = np.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return np.exp(v)

    def train(self):
        # prepare session
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        init = tf.global_variables_initializer()
        self.sess.run(init)

        # create tensorboard summaries
        self.create_summary()
        self.summary_writer = tf.summary.FileWriter(LOG_FILE, self.sess.graph)

        # init or load checkpoint with saver
        # if you don't need to be able to resume training, use the next line instead.
        # it will result in a much smaller checkpoint file.
        self.saver = tf.train.Saver(max_to_keep=10,
                                    var_list=self.global_network.get_vars())
        # saver = tf.train.Saver(max_to_keep=10)

        self.checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if self.checkpoint and self.checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, self.checkpoint.model_checkpoint_path)
            print("checkpoint loaded: {}".format(
                self.checkpoint.model_checkpoint_path))
            tokens = self.checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: {}".format(self.global_t))
        else:
            print("Could not find old checkpoint")

        train_threads = []
        for i in range(NUM_THREADS):
            train_threads.append(
                threading.Thread(target=self.train_function, args=(i, )))

        signal.signal(signal.SIGINT, self.signal_handler)

        # start each training thread
        for t in train_threads:
            t.start()

        print('Press Ctrl+C to stop.')
        signal.pause()

        # wait for all threads to finish
        for t in train_threads:
            t.join()

        print('Now saving data. Please wait.')
        self.saver.save(self.sess,
                        CHECKPOINT_DIR + '/' + 'checkpoint',
                        global_step=self.global_t)
        self.summary_writer.close()

    def create_summary(self):
        self.summary_op = dict()
        self.summary_placeholders = dict()

        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            key = scene + "-" + task

            # summary for tensorboard
            episode_reward_input = tf.placeholder("float")
            episode_length_input = tf.placeholder("float")
            # episode_max_q_input = tf.placeholder("float")

            scalar_summaries = [
                tf.summary.scalar(key + "/Episode Reward", episode_reward_input),
                tf.summary.scalar(key + "/Episode Length", episode_length_input)
                # tf.summary.scalar(key + "/Episode Max Q", episode_max_q_input)
            ]

            self.summary_op[key] = tf.summary.merge(scalar_summaries)
            self.summary_placeholders[key] = {
                "episode_reward_input": episode_reward_input,
                "episode_length_input": episode_length_input,
                # "episode_max_q_input": episode_max_q_input,
                "learning_rate_input": self.learning_rate_input
            }

    def train_function(self, parallel_index):
        training_thread = self.training_threads[parallel_index]
        last_global_t = 0

        scene, task = self.branches[parallel_index % self.NUM_TASKS]
        key = scene + "-" + task

        while self.global_t < MAX_TIME_STEP and not self.stop_requested:
            diff_global_t = training_thread.process(
                self.sess, self.global_t, self.summary_writer,
                self.summary_op[key], self.summary_placeholders[key])
            self.global_t += diff_global_t

            # periodically save checkpoints to disk
            if parallel_index == 0 and self.global_t - last_global_t > 1000000:
                print('Save checkpoint at timestamp %d' % self.global_t)
                self.saver.save(self.sess,
                                CHECKPOINT_DIR + '/' + 'checkpoint',
                                global_step=self.global_t)
                last_global_t = self.global_t

    def signal_handler(self, signal, frame):
        print('You pressed Ctrl+C!')
        self.stop_requested = True
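# A minimal, hypothetical entry-point sketch (not part of the original listing).
# It assumes this module defines the Train class above and that constants such
# as CHECKPOINT_DIR and MAX_TIME_STEP come from the project's constants module.
if __name__ == '__main__':
    trainer = Train()   # builds the global network and one thread per (scene, target) branch
    trainer.train()     # runs until Ctrl+C, then saves a final checkpoint and closes the summary writer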
scene_scopes = list_of_tasks.keys()

global_t = 0
stop_requested = False

if not os.path.exists(CHECKPOINT_DIR):
    os.mkdir(CHECKPOINT_DIR)

# Initialize learning rate (log-uniform between LR_ALPHA_LOW and LR_ALPHA_HIGH)
log_lo = np.log(LR_ALPHA_LOW)
log_hi = np.log(LR_ALPHA_HIGH)
v = log_lo * (1 - LR_ALPHA_LOG_RATE) + log_hi * LR_ALPHA_LOG_RATE
initial_learning_rate = np.exp(v)

# Create global network
global_network = DRLNetwork(action_size=ACTION_SIZE,
                            device=device,
                            network_scope=network_scope,
                            scene_scopes=scene_scopes)

# Initialize scene-task specific branches
branches = []
for scene in scene_scopes:
    for task in list_of_tasks[scene]:
        branches.append((scene, task))

NUM_TASKS = len(branches)
assert NUM_THREADS >= NUM_TASKS, \
    "Not enough threads for multitasking: at least {} threads needed.".format(NUM_TASKS)

# Create gradient applier (arguments after learning_rate completed from the
# identical call in Train.__init__, as the original line was truncated here)
learning_rate_input = tf.placeholder("float")
grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)
class ADQN_Thread(object):

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, network_scope="network", scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index  # Number the thread
        self._set_local_network(device, network_scope, scene_scope, task_scope)  # Set local network
        self.sync = self.local_network.sync_from(global_network)  # Synchronize from the global network
        self.learning_rate_input = learning_rate_input  # Set learning rate
        self.max_global_time_step = max_global_time_step  # Set maximum global time step
        self._set_trainer_optimizer(device, global_network, grad_applier)  # Set trainer
        self._set_environment(initial_learning_rate)  # Set environment

        self.memory_size = MEMORY_SIZE  # memory size for replay buffer
        # initialize zero memory; each row stores [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, 2048 * 4 * 2 + 2))
        self.replace_target_iter = DQN_REPLACE_TARGET_ITER
        self.batch_size = DQN_BATCH_SIZE
        self.gamma = REWARD_DECAY

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope, task_scope):
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set the gradient accumulator and optimizer
    # (uses the accumulating trainer from Zhu et al.)
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        accum_grad_names = [self._local_var_name(x)
                            for x in self.trainer.get_accum_grad_list()]
        global_net_vars = [x for x in global_network.get_vars()
                           if self._get_accum_grad_name(x) in accum_grad_names]
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    # Set environment and per-episode bookkeeping
    def _set_environment(self, initial_learning_rate):
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    # epsilon-greedy action selection
    def choose_action(self, actions_value):
        if np.random.uniform() < EPSILON:
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, ACTION_SIZE)
        return action

    # Take LOCAL_T_MAX steps in one process
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        start_local_t = self.local_t

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)

        # Obtain shared parameters from global network
        sess.run(self.sync)

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            old_s_t = self.env.s_t
            actions_value = self.local_network.run_DQN(sess, self.env.s_t,
                                                       self.env.target,
                                                       self.scopes)
            action = self.choose_action(actions_value)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Q = {0} action = {1}\n".format(actions_value, action))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1

            """
            print("Local t: {0:d}".format(self.local_t))
            print("Reward: {0:f}".format(reward))
            print("Episode reward: {0:f}".format(self.episode_reward))
            print("Episode length: {0:d}".format(self.episode_length))
            """

            self.local_t += 1

            # store transition to replay buffer
            self.store_transition(old_s_t, action, reward, self.env.s_t)

            if terminal:
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s \n" %
                    (self.thread_index, global_t, self.thread_index,
                     self.scene_scope, self.task_scope,
                     self.scene_scope, self.task_scope, self.episode_reward,
                     self.scene_scope, self.task_scope, self.episode_length,
                     self.scene_scope, self.task_scope))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.env.reset()
                break

        # update target network
        if self.local_t % self.replace_target_iter == 0:
            sess.run(self.local_network.replace_target_op)
            # print('\ntarget_params_replaced\n')

        # sample a batch of transitions from the replay buffer
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        batch_memory_s_ = np.reshape(batch_memory[:, -2048 * 4:], (-1, 2048, 4))
        batch_memory_s = np.reshape(batch_memory[:, :2048 * 4], (-1, 2048, 4))
        batch_memory_t = np.reshape(
            np.tile(self.env.target, [self.batch_size, 1]), (-1, 2048, 4))

        q_next, q_eval = sess.run(
            [self.local_network.q_next, self.local_network.q_eval],
            feed_dict={
                self.local_network.s_: batch_memory_s_,  # fixed params
                self.local_network.s: batch_memory_s,    # newest params
                self.local_network.t: batch_memory_t
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, 2048 * 4].astype(int)
        reward = batch_memory[:, 2048 * 4 + 1]

        key_eval = self.network_scope + '/' + self.scene_scope + '/eval'
        if terminal:
            q_target[key_eval][batch_index, eval_act_index] = reward
        else:
            key_target = self.network_scope + '/' + self.scene_scope + '/target'
            q_target[key_eval][batch_index, eval_act_index] = \
                reward + self.gamma * np.max(q_next[key_target], axis=1)

        # train eval network
        for idx in batch_index:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: [batch_memory_s[idx]],
                         self.local_network.t: [batch_memory_t[idx]],
                         self.local_network.q_target: [q_target[key_eval][idx]]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # update global network
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" %
                             (self.thread_index, self.scene_scope, self.local_t))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((np.reshape(s, -1), [a, r], np.reshape(s_, -1)))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
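# Illustrative sketch (not part of the original listing) of the replay-buffer
# row layout that process() and store_transition() assume: each row packs
# [flattened s (2048*4) | a | r | flattened s_ (2048*4)], hence the
# 2048*4*2 + 2 columns allocated in __init__. The names below are hypothetical.
import numpy as np

FEAT = 2048 * 4                      # flattened state size (2048 features x 4 frames)
row = np.zeros(FEAT * 2 + 2)

s = np.random.rand(2048, 4)          # current state
s_next = np.random.rand(2048, 4)     # next state
a, r = 3, -0.01                      # action index and reward

row[:FEAT] = s.reshape(-1)           # columns [0, FEAT)     -> s
row[FEAT] = a                        # column  FEAT          -> action
row[FEAT + 1] = r                    # column  FEAT + 1      -> reward
row[-FEAT:] = s_next.reshape(-1)     # last FEAT columns     -> s_

# unpacking mirrors the slicing used when sampling a batch in process()
s_rec = row[:FEAT].reshape(2048, 4)
a_rec = int(row[FEAT])
r_rec = row[FEAT + 1]
s_next_rec = row[-FEAT:].reshape(2048, 4)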