Beispiel #1
0
    def _fill_experience(self, sess):
        """Take one environment step and append it to the experience buffer.

        The local network's policy selects the action.  The resulting
        transition is stored as an ``ExperienceFrame``.  The environment is
        reset when the step was terminal, and reset once more (with a log
        message) when the buffer reports that it is full.
        """
        env = self.environment
        state_before = env.last_state
        action_before = env.last_action
        reward_before = env.last_reward

        # The network is fed the previous action and reward, combined by
        # ExperienceFrame.concat_action_and_reward.
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            action_before, self.action_size, reward_before)
        policy, _ = self.local_network.run_base_policy_and_value(
            sess, env.last_state, action_reward_input)
        chosen = self.choose_action(policy)

        _, step_reward, is_terminal, pixel_delta = env.process(chosen)

        self.experience.add_frame(
            ExperienceFrame(state_before, step_reward, chosen, is_terminal,
                            pixel_delta, action_before, reward_before))

        if is_terminal:
            env.reset()
        if self.experience.is_full():
            env.reset()
            print("Replay buffer filled")
Beispiel #2
0
    def _fill_experience(self, sess):
        """Take one environment step and append it to the experience buffer.

        The local network is run (with ``replan=False``) on the last state,
        the previous action/reward input and the environment map.  The chosen
        action is executed together with the network's short-term goal and
        shift weights, and the transition is stored as an ``ExperienceFrame``.
        On a terminal step a new random level seed is drawn and the
        environment is reset with it.
        """
        env = self.environment
        state_before = env.last_state
        action_before = env.last_action
        reward_before = env.last_reward
        intrinsic_before = env.last_intrinsic_reward
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            action_before, self.action_size, reward_before)
        map_input = env.map

        outputs = self.local_network.run_policy_and_value(
            sess, state_before, action_reward_input, map_input, replan=False)
        (localization_before, policy, _, goal, shifts,
         location_dist) = outputs
        chosen = self.choose_action(policy)

        _, step_reward, step_intrinsic, is_terminal = env.process(
            chosen, goal, shifts)

        self.experience.add_frame(
            ExperienceFrame(state_before, map_input, localization_before,
                            location_dist, step_reward, step_intrinsic,
                            chosen, is_terminal, action_before,
                            reward_before, intrinsic_before))

        if is_terminal:
            self.level_seed = np.random.randint(LEVEL_SET_SIZE)
            env.reset(self.maze_size, self.level_seed)
        if self.experience.is_full():
            print(
                "Replay buffer filled--------------------------------------------------------------------------------------"
            )
            sys.stdout.flush()
Beispiel #3
0
    def _fill_experience(self, sess):
        """Take one environment step and append it to the experience buffer.

        The stored state is a copy of the previous state dict without any
        entry whose key contains ``'objectType'``.  The environment is reset
        when the step was terminal, and reset once more (with a log message)
        when the buffer reports that it is full.
        """
        env = self.environment
        state_before = env.last_state
        action_before = env.last_action
        reward_before = env.last_reward
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            action_before, self.action_size, reward_before, state_before)

        policy, _, _ = self.local_network.run_base_policy_and_value(
            sess, env.last_state, action_reward_input)
        chosen = self.choose_action(policy)

        _, step_reward, is_terminal, pixel_delta = env.process(chosen, flag=0)

        # Keep everything except the 'objectType' entries in the stored frame.
        stored_state = {}
        for name, value in state_before.items():
            if 'objectType' in name:
                continue
            stored_state[name] = value

        self.experience.add_frame(
            ExperienceFrame(stored_state, step_reward, chosen, is_terminal,
                            pixel_delta, action_before, reward_before))

        if is_terminal:
            env.reset()
        if self.experience.is_full():
            env.reset()
            print("Replay buffer filled")
Beispiel #4
0
    def _add_batch_to_exp(self, batch):
        """Append every step of ``batch`` to the experience buffer.

        For each index ``k`` an ``ExperienceFrame`` is built from
        ``batch.si[k]``, ``batch.a[k]``, the last element of ``batch.a_r[k]``
        (used as the reward), ``batch.features[k]`` and ``batch.pc[k]``,
        together with the previously seen action and reward.  Only the final
        step of the batch can be terminal, and only when ``batch.terminal``
        is truthy.

        Args:
            batch: rollout object exposing ``si``, ``a``, ``a_r``,
                ``features``, ``pc`` (index-aligned sequences) and
                ``terminal``.

        Returns:
            The accumulated episode reward when the batch ended an episode
            (the accumulator is then reset to 0), otherwise ``None``.  An
            empty batch is a no-op and returns ``None``.
        """
        num_steps = len(batch.si)
        if num_steps == 0:
            # Nothing to record.  Without this guard ``batch.si[0]`` raises
            # IndexError, or ``terminal`` is read below without being bound.
            return None

        # if we just started, copy the first state as last state
        if self.last_state is None:
            self.last_state = batch.si[0]

        terminal = False
        last_index = num_steps - 1
        for k, state in enumerate(batch.si):
            action = batch.a[k]
            reward = batch.a_r[k][-1]

            self.episode_reward += reward
            features = batch.features[k]
            pixel_change = batch.pc[k]

            # Only the last step of a terminal batch closes the episode.
            terminal = bool(k == last_index and batch.terminal)

            frame = ExperienceFrame(state, reward, action, terminal, features,
                                    pixel_change, self.last_action,
                                    self.last_reward)
            self.experience.add_frame(frame)
            self.last_state = state
            self.last_action = action
            self.last_reward = reward

        if not terminal:
            return None

        total_ep_reward = self.episode_reward
        self.episode_reward = 0
        return total_ep_reward
Beispiel #5
0
    def process(self, sess):
        """Advance the environment by one step and redraw all display panels.

        A fresh ``HEIGHT`` x ``WIDTH`` RGB canvas is allocated in
        ``self.img``.  The global network is evaluated on the last state with
        the previous action and the previous reward clipped to [-1, 1].  The
        chosen action is executed, the maze map is updated from the returned
        ``vtrans``/``vrot`` values, and the panels are drawn.

        The "PC Q" panel is only drawn when ``USE_PIXEL_CHANGE`` is enabled,
        because ``pc_q`` is only produced by the pixel-change network call.

        Args:
            sess: session object passed through to the network calls.
        """
        self.img = np.zeros(shape=(HEIGHT, WIDTH, 3), dtype=np.uint8)
        last_action = self.env.last_action
        last_reward = np.clip(self.env.last_reward, -1, 1)
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        if not USE_PIXEL_CHANGE:
            pi_values, v_value = self.global_network.run_base_policy_and_value(
                sess, self.env.last_state, last_action_reward)
        else:
            pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(
                sess, self.env.last_state, last_action_reward)
        self.value_history.add_value(v_value)
        action = self.choose_action(pi_values)
        state, reward, terminal, pc, vtrans, vrot = self.env.process(action)
        self.state_history.add_state(state)
        self.ep_reward += reward
        self.mazemap.update(vtrans, vrot)
        if reward > 9:
            # When the agent reaches the maze goal the reward is 10 and the
            # map has to be reset.
            self.mazemap.reset()
        if terminal:
            # The lab environment treats 3600 frames as one episode by
            # default; it does not signal terminal on reaching the goal.
            self.env.reset()
            self.ep_reward = 0
            self.mazemap.reset()

        self.show_ob(state, 3, 3, "Observation")
        self.show_pc(pc, 100, 3, 3.0, "Pixel Change")
        if USE_PIXEL_CHANGE:
            # pc_q is unbound when the pixel-change head was not evaluated;
            # drawing it unconditionally raised NameError.
            self.show_pc(pc_q[:, :, action], 200, 3, 0.4, "PC Q")
        self.show_map(300, 3, "Maze Map")
        self.show_pi(pi_values)
        self.show_reward()
        self.show_rp()
        self.show_value()
Beispiel #6
0
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    """Play ``self.environment.num_episodes`` episodes, then close the environment.

    In every step the local network's policy picks an action from the last
    state and the previous action/reward input.  When an episode terminates
    its score is printed, the score accumulator is cleared, and both the
    environment and the network state are reset.

    The wrapped environment (``self.environment.env``) is closed in a
    ``finally`` clause so it is released even if a step raises.

    Args:
      sess: session object passed through to the network call.
      global_t, summary_writer, summary_op, score_input: accepted for
        interface compatibility; not used by this method.
    """
    try:
      self.environment.reset()
      for ep in range(self.environment.num_episodes):
        print("starting episode number {}!".format(ep))

        terminal = False
        while not terminal:
          # Prepare last action reward
          last_action = self.environment.last_action
          last_reward = self.environment.last_reward
          last_action_reward = ExperienceFrame.concat_action_and_reward(
              last_action, self.action_size, last_reward)

          pi_, _ = self.local_network.run_base_policy_and_value(
              sess, self.environment.last_state, last_action_reward)
          action = self.choose_action(pi_)

          # Process game; only the reward and the terminal flag are needed.
          _, reward, terminal, _ = self.environment.process(action)
          self.episode_reward += reward

          if terminal:
            # The loop condition ends the episode after this clean-up.
            print("score={}".format(self.episode_reward))
            self.episode_reward = 0
            self.environment.reset()
            self.local_network.reset_state()
    finally:
      self.environment.env.close()
Beispiel #7
0
 def record(self, obs, reward, terminal, pc, action):
     """Store one transition in the experience pool and print fill progress.

     The frame is built from the environment's last state, action and reward
     together with the given ``reward``, ``action``, ``terminal`` and ``pc``.
     ``obs`` is accepted but not used.
     """
     env = self.env
     pool = self.ExpPool
     pool.add_frame(
         ExperienceFrame(env.last_state, reward, action, terminal, pc,
                         env.last_action, env.last_reward))
     if pool.is_full():
         print('Experience pool is filled!')
     stored = len(pool._frames)
     # The carriage return keeps the progress counter on a single line.
     print('Filled %d/%d.' % (stored, MAX_EXP), end='\r')
     sys.stdout.flush()
Beispiel #8
0
    def process(self, sess):
        """Step the environment once with the global network and refresh the display.

        The network receives the last state plus the previous action and the
        previous reward clipped to [-1, 1].  After the step the maze map is
        updated, and reset when the reward exceeds 9 or the episode ends.
        The pixel-change and reward-prediction panels are drawn only when
        their module-level switches are enabled.
        """
        env = self.environment
        net = self.global_network

        prev_action = env.last_action
        prev_reward = np.clip(env.last_reward, -1, 1)
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            prev_action, self.action_size, prev_reward)

        if USE_PIXEL_CHANGE:
            pi_values, v_value, pc_q = net.run_base_policy_value_pc_q(
                sess, env.last_state, action_reward_input)
        else:
            pi_values, v_value = net.run_base_policy_and_value(
                sess, env.last_state, action_reward_input)
        self.value_history.add_value(v_value)

        action = self.choose_action(pi_values)
        step_result = env.process(action)
        state, reward, terminal, pixel_change, vtrans, vrot = step_result
        self.episode_reward += reward
        self.mazemap.update(vtrans, vrot)
        if reward > 9:
            self.mazemap.reset()

        if terminal:
            env.reset()
            self.episode_reward = 0
            self.mazemap.reset()

        # Only the first three channels of the state are shown as the image.
        self.show_image(state[:, :, :3])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()
        self.show_map()

        if USE_PIXEL_CHANGE:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

        history = self.state_history
        if USE_REWARD_PREDICTION and history.is_full:
            rp_c = net.run_rp_c(sess, history.states)
            self.show_reward_prediction(rp_c, reward)

        history.add_state(state)
Beispiel #9
0
    def process(self, sess):
        """Step the environment once using the global network's policy.

        The network receives the last state plus an input built from the
        previous action, the previous reward clipped to [-1, 1] and the last
        state.  The step reward is added to ``self.episode_reward``.  On a
        terminal step the environment is reset and the accumulator cleared.
        """
        env = self.environment
        net = self.global_network

        prev_action = env.last_action
        prev_reward = np.clip(env.last_reward, -1, 1)
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            prev_action, self.action_size, prev_reward, env.last_state)

        # Only the policy output is used here; the other outputs are dropped.
        if flags.use_pixel_change:
            pi_values, _, _ = net.run_base_policy_value_pc_q(
                sess, env.last_state, action_reward_input)
        else:
            pi_values, _ = net.run_base_policy_and_value(
                sess, env.last_state, action_reward_input)

        chosen = self.choose_action(pi_values)
        _, reward, terminal, _ = env.process(chosen)
        self.episode_reward += reward

        if terminal:
            env.reset()
            self.episode_reward = 0
Beispiel #10
0
    def process(self, sess):
        """Step the environment once with the global network and refresh the display.

        The network receives the last state plus an input built from the
        previous action, the previous reward clipped to [-1, 1] and the last
        state.  The image, policy, value and reward panels are always drawn.
        The pixel-change and reward-prediction panels depend on the
        corresponding ``flags`` switches.
        """
        env = self.environment
        net = self.global_network

        prev_action = env.last_action
        prev_reward = np.clip(env.last_reward, -1, 1)
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            prev_action, self.action_size, prev_reward, env.last_state)

        if flags.use_pixel_change:
            pi_values, v_value, pc_q = net.run_base_policy_value_pc_q(
                sess, env.last_state, action_reward_input)
        else:
            pi_values, v_value = net.run_base_policy_and_value(
                sess, env.last_state, action_reward_input)
        self.value_history.add_value(v_value)

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = env.process(action)
        self.episode_reward += reward

        if terminal:
            env.reset()
            self.episode_reward = 0

        self.show_image(state['image'])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()

        if flags.use_pixel_change:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")

        history = self.state_history
        if flags.use_reward_prediction and history.is_full:
            rp_c = net.run_rp_c(sess, history.states)
            self.show_reward_prediction(rp_c, reward)

        history.add_state(state)
Beispiel #11
0
  def process(self, sess):
    """Advance the evaluation by one environment step and update statistics.

    The action comes either from a fixed uniform three-way policy (when the
    module-level ``random_policy`` is truthy) or from the global network.
    In the network branch this method additionally:

    * accumulates per-class segmentation statistics in
      ``self.segnet_class_dict`` when the network returned predictions and
      the last state carries an ``'objectType'`` mask;
    * when ``flags.segnet == -1``, collects images/masks/actions into a
      batch and periodically runs the decoder loss on that batch.

    On a terminal step the room type (for ``'room_goal'`` tasks) and the
    success flag are recorded, the environment is reset, and a new entry is
    started in the ``self.episode_reward`` list.
    """
    last_action = self.environment.last_action
    last_reward = self.environment.last_reward
    last_action_reward = ExperienceFrame.concat_action_and_reward(last_action, self.action_size,
                                                                  last_reward, self.environment.last_state)
    if random_policy:
      # Uniform probabilities over three actions; no network call is made.
      pi_values = [1/3.0, 1/3.0, 1/3.0]
      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      # episode_reward is a list; the last entry belongs to the running episode.
      self.episode_reward[-1] += reward
    else:
      mode = "segnet" if flags.segnet >= 2 else ""
      # Stays None in the pixel-change branch, which returns no predictions.
      segnet_preds = None
      if not flags.use_pixel_change:
        pi_values, v_value, segnet_preds = self.global_network.run_base_policy_and_value(sess,
                                                                           self.environment.last_state,
                                                                           last_action_reward, mode=mode)
      else:
        pi_values, v_value, pc_q = self.global_network.run_base_policy_value_pc_q(sess,
                                                                                  self.environment.last_state,
                                                                                  last_action_reward)

      if segnet_preds is not None:
          # Compare predictions with the ground-truth mask, if the state has one.
          mask = self.environment.last_state.get('objectType', None)
          if mask is not None:
              new_classes = np.unique(mask)
              if segnet_preds.shape != mask.shape:
                  print("Predictions have shape {}, but groundtruth mask has shape {}".format(segnet_preds.shape, mask.shape))
              else:
                  # Element-wise agreement between prediction and mask.
                  similar = segnet_preds == mask
                  for id_class in new_classes:
                      id_list = self.segnet_class_dict.get(id_class, None)
                      if id_list is None:
                          id_list = []
                      # Append [matching element count, total element count] for this class.
                      id_list += [[np.sum(similar[mask == id_class]), np.sum(mask == id_class)]]
                      self.segnet_class_dict[id_class] = id_list

      self.batch_cur_num += 1
      if flags.segnet == -1: # original author's note: "just not necessary"
        # Evaluate once at least batch_size steps have passed since the last
        # evaluation; otherwise keep collecting samples.
        if self.batch_cur_num != 0 and self.batch_cur_num - self.batch_prev_num >= self.batch_size:

          #print(np.unique(self.batch_sobjT))
          feed_dict = {self.global_network.base_input: self.batch_si,
                       self.global_network.base_segm_mask: self.batch_sobjT,
                       self.global_network.is_training: not True}

          segm_loss, preds, confusion_mtx = sess.run([self.global_network.decoder_loss,
                                                    self.global_network.preds, self.global_network.update_evaluation_vars],
                                                   feed_dict=feed_dict)
          # total_loss is a constant 0 placeholder; only segm_loss is measured.
          total_loss = 0
          self.total_loss += [total_loss]
          self.segm_loss += [segm_loss] # TODO: decide what to do with it / where to store it

          # Start a new, empty batch.
          self.batch_prev_num = self.batch_cur_num
          self.batch_si = []
          self.batch_sobjT = []
          self.batch_a = []
        else:
          # NOTE(review): the state observed on an evaluation step is not
          # added to any batch - confirm this is intended.
          self.batch_si += [self.environment.last_state["image"]]
          self.batch_sobjT += [self.environment.last_state["objectType"]]
          self.batch_a += [self.environment.ACTION_LIST[self.environment.last_action]]

      action = self.choose_action(pi_values)
      state, reward, terminal, pixel_change = self.environment.process(action)
      self.episode_reward[-1] += reward

    if terminal:
      ep_info = self.environment._episode_info
      if ep_info['task'] == 'room_goal':
          one_hot_room = ep_info['goal']['roomTypeEncoded']
          room_type = ep_info['goal']['roomType']
          # Index of the first non-zero entry of the one-hot room encoding.
          ind = np.where(one_hot_room)[0][0]
          self.roomType_dict[ind] = room_type
          self.episode_roomtype += [ind]
      self.success_rate += [int(self.environment._last_full_state["success"])]
      self.environment.reset()
      # Open a fresh reward accumulator for the next episode.
      self.episode_reward += [0]
Beispiel #12
0
    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        """Roll out up to ``self.local_t_max`` steps and build the base A3C batch.

        Every step is stored in the experience buffer as well.  When an
        episode terminates inside the rollout, the score is recorded, the
        environment and the network state are reset, and the rollout stops
        early.  Returns are accumulated backwards with ``self.gamma``.  They
        start from the network's value estimate of the newest state, or from
        0 when the rollout ended on a terminal step.

        Returns:
            Tuple ``(batch_si, last_action_rewards, batch_a, batch_adv,
            batch_R, start_lstm_state)``, with all lists in the order the
            steps were taken.
        """
        env = self.environment
        net = self.local_network

        # [Base A3C]
        states, last_action_rewards = [], []
        actions, rewards, values = [], [], []
        episode_finished = False

        start_lstm_state = net.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = env.last_action
            last_reward = env.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            observed = env.last_state
            pi_, value_ = net.run_base_policy_and_value(
                sess, observed, last_action_reward)

            action = self.choose_action(pi_)

            states.append(observed)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            should_log = (self.thread_index == 0
                          and self.local_t % LOG_INTERVAL == 0)
            if should_log:
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Process game
            new_state, reward, terminal, pixel_change = env.process(action)
            frame = ExperienceFrame(observed, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if not terminal:
                continue

            episode_finished = True
            print("score={}".format(self.episode_reward))
            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, global_t)
            self.episode_reward = 0
            env.reset()
            net.reset_state()
            break

        # Bootstrap from the value of the newest state unless the episode ended.
        running_return = 0.0
        if not episode_finished:
            running_return = net.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        batch_si, batch_a, batch_adv, batch_R = [], [], [], []

        # Walk the rollout backwards so each return builds on its successor.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            running_return = ri + self.gamma * running_return
            advantage = running_return - Vi
            one_hot = np.zeros([self.action_size])
            one_hot[ai] = 1.0

            batch_si.append(si)
            batch_a.append(one_hot)
            batch_adv.append(advantage)
            batch_R.append(running_return)

        # Restore chronological order.
        for collected in (batch_si, batch_a, batch_adv, batch_R):
            collected.reverse()

        return (batch_si, last_action_rewards, batch_a, batch_adv, batch_R,
                start_lstm_state)
Beispiel #13
0
  def process(self, sess):
    """Run one display step: query the network, act, and redraw every panel.

    The global network's ``run_display_values`` supplies the policy, the
    value and a set of intermediate quantities, which are drawn below.  The
    chosen action is executed together with the network's short-term goal
    and shift weights.  On a terminal step the environment is reset to
    ``DISPLAY_LEVEL[0]`` with a random level seed, the network state is
    cleared, and ``self.replan`` is set so that the next call starts a
    fresh plan and clears the per-episode counters.

    ``self.state`` holds the state from before this step while drawing and
    is replaced by the new state at the very end.
    """

    last_action_reward = ExperienceFrame.concat_action_and_reward(self.environment.last_action,
                                                                  self.action_size,
                                                                  self.environment.last_reward)
    map_input = self.environment.map

    pi_values, v_value, location,angle,value_map,reward_map,short_term_goal,angle_neurons, local_map_prediction, \
    local_map, actual_local_map, vlm_target,vlm_prediction,location_estimate,shift_weights = self.global_network.run_display_values(sess,
                                                                         self.environment.last_state,
                                                                         last_action_reward,
                                                                         map_input,
                                                                         self.replan)
    if self.replan:
      # A new episode started on the previous call: clear per-episode bookkeeping.
      self.path = []
      self.step_count = 0
      self.episode_reward = 0
      self.episode_intrinsic_reward = 0
    self.replan = False
    self.value_history.add_value(v_value)
    action = self.choose_action(pi_values)
    state, reward, intrinsic_reward, terminal = self.environment.process(action, short_term_goal, shift_weights)
    # NOTE(review): replan was already cleared above; this second assignment looks redundant.
    self.replan = False
    if terminal:
        print('Steps needed: ', self.step_count)
        sys.stdout.flush()
        self.environment.reset(DISPLAY_LEVEL[0],np.random.randint(LEVEL_SET_SIZE))
        self.global_network.reset_state()
        # Request a re-plan (and counter reset) on the next call.
        self.replan = True
    # The terminal step's rewards still count towards the counters shown below.
    self.episode_reward += reward
    self.episode_intrinsic_reward += intrinsic_reward
    self.step_count += 1


    # Draw from self.state, i.e. the state stored by the previous call.
    self.show_image(self.state)
    self.show_angle(angle)
    self.show_pixels(np.reshape(angle_neurons,[1,30]),370, 176, 4, 1, "Discretized Angle")

    self.show_pixels(np.reshape(shift_weights,[3,3]),400, 250, 20, 1, "Egomotion Estimation")

    self.show_pixels(vlm_target,550, 8, 5, 1, "Visible Local Map Target",True)
    self.show_pixels(vlm_prediction,550, 176, 5, 1, "Visible Local Map Estimation",True)

    self.show_pixels(actual_local_map,725, 8, 5, 1, "Local Map Target",True)
    self.show_pixels(local_map_prediction,725, 176, 5, 1, "Local Map Estimation",True)

    self.show_pixels(local_map,900, 8, 5, 1, "Map Feedback Local Map",True)

    self.draw_text("Estimated Position: " + str(np.around(location_estimate)), 900, 220)
    self.draw_text("Actual Position:      " + str(np.asarray(self.state['position'][2], 'float')), 900, 240)
    self.draw_text("STEPS: {}".format(int(self.step_count)), 900, 260)
    self.draw_text("REWARD: {}".format(float(self.episode_reward)), 900, 280)
    self.draw_text("INTRINSIC REWARD: {}".format(float(self.episode_intrinsic_reward)), 900, 300)


    # The map input is reshaped to a 126x126 single-channel image for display.
    disp_map = np.reshape(map_input, [126, 126,1])
    self.show_map(disp_map,8,400,3,1,"Map",location,self.state['position'][1])

    self.show_map(self.scale_image(reward_map, 2), 400, 400, 3, 1, "Reward Map, R = 0, G = +, B = -")

    # Arrange the five short-term-goal entries as a 3x3 cross (corners are 0):
    # index 2 on top, 3 left, 4 centre, 1 right, 0 bottom.
    stg = np.asarray([[0, short_term_goal[2], 0],
                      [short_term_goal[3], short_term_goal[4], short_term_goal[1]],
                      [0, short_term_goal[0], 0]])
    self.show_pixels(stg, 840, 400, 20, 1, "Short Term")
    self.draw_center_text("Target Direction", 870, 490)

    # Reward prediction is computed from the previous state, the new state and the map.
    rp_c = self.global_network.run_map_rp_c(sess, self.state, state, map_input)
    self.show_reward_prediction(rp_c, reward, 820, 600, "Reward Prediction")

    self.show_policy(pi_values,action)
    self.show_value()

    # Hand the new state over to the next call, then throttle the display loop.
    self.state = state
    time.sleep(DISPLAY_SLOW_DOWN)
Beispiel #14
0
    def process(self, sess):
        """Step the environment once with the global network and refresh the display.

        The image, policy, value and reward panels are always drawn.  With
        pixel change enabled the "PC" and "PC Q" panels are shown.
        Otherwise, if the network returned predictions, the predicted and
        ground-truth segmentation masks are shown.
        """
        # NOTE(review): both initializers are created and run on every call.
        # This presumably re-initialises the network variables each frame -
        # confirm this is intended.
        init_ops = [
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
        ]
        sess.run(init_ops)

        env = self.environment
        net = self.global_network

        prev_action = env.last_action
        prev_reward = env.last_reward
        action_reward_input = ExperienceFrame.concat_action_and_reward(
            prev_action, self.action_size, prev_reward, env.last_state)

        preds = None
        if flags.segnet >= 2:
            mode = "segnet"
        else:
            mode = ""
        # Predictions are not wanted here, so the mode is always cleared.
        mode = ""

        if flags.use_pixel_change:
            pi_values, v_value, pc_q = net.run_base_policy_value_pc_q(
                sess, env.last_state, action_reward_input)
        else:
            pi_values, v_value, preds = net.run_base_policy_and_value(
                sess, env.last_state, action_reward_input, mode=mode)

        self.value_history.add_value(v_value)

        action = self.choose_action(pi_values)
        state, reward, terminal, pixel_change = env.process(action)
        self.episode_reward += reward

        if terminal:
            env.reset()
            self.episode_reward = 0

        self.show_image(state['image'])
        self.show_policy(pi_values)
        self.show_value()
        self.show_reward()

        if flags.use_pixel_change:
            self.show_pixel_change(pixel_change, 100, 0, 3.0, "PC")
            self.show_pixel_change(pc_q[:, :, action], 200, 0, 0.4, "PC Q")
        elif preds is not None:
            predicted_rgb = self.label_to_rgb(preds)
            self.show_pixel_change(predicted_rgb, 100, 0, 3.0, "Preds")
            mask_rgb = self.label_to_rgb(state['objectType'])
            self.show_pixel_change(mask_rgb, 200, 0, 0.4, "Segm Mask")

        history = self.state_history
        if flags.use_reward_prediction and history.is_full:
            rp_c = net.run_rp_c(sess, history.states)
            self.show_reward_prediction(rp_c, reward)

        history.add_state(state)
Beispiel #15
0
    def _process_base(self, sess, global_t, summary_writer, summary_op_dict,
                      summary_dict):
        """Roll out up to ``self.n_step_TD`` steps and build the base A3C batch.

        Every step is stored in the experience buffer with the
        ``'objectType'`` entries removed from the state.  When an episode
        terminates inside the rollout, the score and the success rate are
        written into ``summary_dict['values']``, the environment and the
        network state are reset, and the rollout stops early.  Returns are
        accumulated backwards with ``self.gamma``.  They start from the
        network's value estimate of the newest state, or from 0 when the
        rollout ended on a terminal step.

        Returns:
            Tuple ``(batch_si, batch_sobjT, last_action_rewards, batch_a,
            batch_adv, batch_R, start_lstm_state)``, with all lists in the
            order the steps were taken.  ``batch_sobjT`` is only filled when
            ``self.segnet_param_dict["segnet_mode"] >= 2``.
        """
        env = self.environment
        net = self.local_network

        # [Base A3C]
        states, last_action_rewards = [], []
        actions, rewards, values = [], [], []
        episode_finished = False

        start_lstm_state = net.base_lstm_state_out if self.use_lstm else None

        if self.segnet_mode >= 2:
            mode = "segnet"
        else:
            mode = ""

        # Passed to environment.process; switches to 1 once a logging step
        # occurred and stays 1 for the rest of this rollout.
        flag = 0
        for _ in range(self.n_step_TD):
            # Prepare last action reward
            last_action = env.last_action
            last_reward = env.last_reward
            observed = env.last_state
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward, observed)

            pi_, value_, _ = net.run_base_policy_and_value(
                sess, observed, last_action_reward, mode)
            action = self.choose_action(pi_)

            states.append(observed)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            should_log = (self.thread_index == 0
                          and self.local_t % LOG_INTERVAL == 0)
            if should_log:
                print("Trainer {}>>> Local step {}:".format(
                    self.thread_index, self.local_t))
                print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
                print("Trainer {}>>> V={}".format(self.thread_index, value_))
                flag = 1

            # Process game
            new_state, reward, terminal, pixel_change = env.process(
                action, flag=flag)

            # The stored frame omits the 'objectType' entries of the state.
            stored_state = {}
            for name, value in observed.items():
                if 'objectType' in name:
                    continue
                stored_state[name] = value
            frame = ExperienceFrame(stored_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if not terminal:
                continue

            episode_finished = True
            print("Trainer {}>>> score={}".format(
                self.thread_index, self.episode_reward))

            summary_values = summary_dict['values']
            summary_values.update({'score_input': self.episode_reward})

            succeeded = 1 if env._last_full_state["success"] else 0
            self.success_rates.append(succeeded)
            # Report a success rate only once sr_size results are available.
            if len(self.success_rates) == self.sr_size:
                success_rate = np.mean(self.success_rates)
            else:
                success_rate = 0
            summary_values.update({'sr_input': success_rate})

            self.episode_reward = 0
            env.reset()
            net.reset_state()
            break

        # Bootstrap from the value of the newest state unless the episode ended.
        running_return = 0.0
        if not episode_finished:
            running_return = net.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        batch_si, batch_a, batch_adv, batch_R = [], [], [], []
        batch_sobjT = []

        # Walk the rollout backwards so each return builds on its successor.
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(values))
        for ai, ri, si, Vi in backwards:
            running_return = ri + self.gamma * running_return
            advantage = running_return - Vi
            one_hot = np.zeros([self.action_size])
            one_hot[ai] = 1.0

            batch_si.append(si['image'])
            batch_a.append(one_hot)
            batch_adv.append(advantage)
            batch_R.append(running_return)
            if self.segnet_param_dict["segnet_mode"] >= 2:
                batch_sobjT.append(si['objectType'])

        # Restore chronological order.
        for collected in (batch_si, batch_a, batch_adv, batch_R, batch_sobjT):
            collected.reverse()

        # Original author's note: "Mathematical Error A3C: only last values
        # should be used for base/ or aggregate with last made".

        return (batch_si, batch_sobjT, last_action_rewards, batch_a,
                batch_adv, batch_R, start_lstm_state)
Beispiel #16
0
    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input, average_entropy):
        """Collect one base-A3C rollout and turn it into a training batch.

        Steps the environment with the local network's policy for at most
        LOCAL_T_MAX steps (stopping early when an episode terminates), stores
        every transition in ``self.experience``, keeps ``self.mazemap`` updated
        from the velocities returned by the environment, and finally computes
        discounted returns and advantages by walking the rollout backwards.

        Args:
            sess: Passed through to the local network's ``run_*`` calls and
                to ``_record_score``.
            global_t: Forwarded to ``_record_score``.
            summary_writer: Forwarded to ``_record_score``.
            summary_op: Forwarded to ``_record_score``.
            score_input: Forwarded to ``_record_score``.
            average_entropy: Forwarded to ``_record_score``.

        Returns:
            Tuple ``(batch_si, batch_mi, last_action_rewards, batch_a,
            batch_adv, batch_R, start_lstm_state)``; all per-step lists are in
            chronological order.
        """
        # Per-step rollout storage (chronological order).
        states, map_states, last_action_rewards = [], [], []
        actions, rewards, values = [], [], []

        entropy_sum = 0.0
        step_count = 0
        terminal_end = False

        # LSTM state at the start of the rollout, returned to the caller.
        start_lstm_state = self.local_network.base_lstm_state_out

        for _ in range(LOCAL_T_MAX):
            # Encode the previous action and reward as an extra network input.
            prior_action = self.environment.last_action
            prior_reward = self.environment.last_reward
            action_reward_vec = ExperienceFrame.concat_action_and_reward(
                prior_action, self.action_size, prior_reward)

            # Snapshot the map and the observation before acting.
            map_before = self.mazemap.get_map(84, 84)
            observation = self.environment.last_state
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, observation, map_before, action_reward_vec)
            action = self.choose_action(pi_)

            states.append(observation)
            map_states.append(map_before)
            last_action_rewards.append(action_reward_vec)
            actions.append(action)
            values.append(value_)

            if self.thread_index == 0 and self.local_t % LOG_INTERVAL == 0:
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Advance the environment by one step.
            (new_state, reward, terminal, pixel_change, vtrans,
             vrot) = self.environment.process(action)

            self.mazemap.update(vtrans, vrot)
            # NOTE(review): a reward above 9 resets the map; presumably this
            # marks reaching the goal -- confirm against the environment.
            if reward > 9:
                self.mazemap.reset()

            transition = ExperienceFrame(observation, map_before, reward,
                                         action, terminal, pixel_change,
                                         prior_action, prior_reward)
            self.experience.add_frame(transition)

            self.episode_reward += reward
            # NOTE(review): this accumulates sum(p * log p), i.e. the negative
            # of the Shannon entropy -- confirm the sign expected downstream.
            entropy_sum += np.sum(pi_ * np.log(pi_))
            step_count += 1
            rewards.append(reward)
            self.local_t += 1

            if not terminal:
                continue

            # Episode finished: report the score, reset everything, stop.
            terminal_end = True
            print("score={}".format(self.episode_reward))
            self._record_score(sess, summary_writer, summary_op, score_input,
                               self.episode_reward, average_entropy,
                               entropy_sum / step_count, global_t)
            self.episode_reward = 0
            self.environment.reset()
            self.mazemap.reset()
            self.local_network.reset_state()
            break

        # Bootstrap from the value estimate unless the episode ended.
        if terminal_end:
            ret = 0.0
        else:
            bootstrap_map = self.mazemap.get_map(84, 84)
            bootstrap_lar = transition.get_last_action_reward(self.action_size)
            ret = self.local_network.run_base_value(sess, new_state,
                                                    bootstrap_map,
                                                    bootstrap_lar)

        # Walk the rollout from the last step to the first, accumulating the
        # discounted return; the lists are therefore built newest-first.
        rev_si, rev_mi, rev_a, rev_adv, rev_R = [], [], [], [], []
        backwards = zip(reversed(actions), reversed(rewards),
                        reversed(states), reversed(map_states),
                        reversed(values))
        for act, rew, obs, maze, val in backwards:
            ret = rew + GAMMA * ret
            one_hot = np.zeros([self.action_size])
            one_hot[act] = 1.0

            rev_si.append(obs)
            rev_mi.append(maze)
            rev_a.append(one_hot)
            rev_adv.append(ret - val)
            rev_R.append(ret)

        # Flip back to chronological order for the caller.
        batch_si = rev_si[::-1]
        batch_mi = rev_mi[::-1]
        batch_a = rev_a[::-1]
        batch_adv = rev_adv[::-1]
        batch_R = rev_R[::-1]

        return (batch_si, batch_mi, last_action_rewards, batch_a, batch_adv,
                batch_R, start_lstm_state)
Beispiel #17
0
 def _add_frame(self, experience, reward):
   """Append a placeholder frame carrying only `reward` to `experience`.

   Every other ExperienceFrame argument is passed as 0, except the fourth,
   which is passed as False.
   """
   experience.add_frame(ExperienceFrame(0, reward, 0, False, 0, 0, 0))
Beispiel #18
0
    def _process_base(self, sess, global_t, map_input):
        """Collect one base-A3C rollout and turn it into a training batch.

        Steps the environment with the local network's policy for at most
        LOCAL_T_MAX steps (stopping early when an episode terminates) and
        stores every transition in ``self.experience``. The per-step reward
        used for training is ``reward + intrinsic_reward``.

        When an episode terminates, the episode length is pushed into
        ``self.steps_buffer`` (capped at 50 entries). Once the buffer holds 50
        entries and their mean is below ``100 + (maze_size - 7) * 20``,
        ``self.maze_size`` grows by 2; if it then exceeds 13 the environment
        is stopped and ``self.running`` is cleared.

        Args:
            sess: Passed through to the local network's ``run_*`` calls.
            global_t: Only used in the maze-size switch log message.
            map_input: Map passed to the network and stored in every frame.

        Returns:
            Tuple ``(batch_si, batch_last_action_rewards, actions, batch_adv,
            batch_R, start_localization_state)``; all per-step lists are in
            chronological order.
        """
        # Per-step rollout storage (chronological order).
        states, actions, batch_last_action_rewards = [], [], []
        rewards, values = [], []

        terminal_end = False
        # Replanning is requested on the first step only, and only when the
        # next-location loss was switched off by the previous call.
        replan = (self.apply_next_location_loss == 0.0)

        start_localization_state = self.local_network.localization_state_out

        for _ in range(LOCAL_T_MAX):
            self.local_t += 1

            # Snapshot the environment state before acting.
            observation = self.environment.last_state
            prior_action = self.environment.last_action
            prior_reward = self.environment.last_reward
            prior_intrinsic = self.environment.last_intrinsic_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                prior_action, self.action_size, prior_reward)

            (loc_state, pi_, value_, short_term_goal, shift_weights,
             location_distribution) = self.local_network.run_policy_and_value(
                 sess, observation, last_action_reward, map_input, replan)
            replan = False

            action = self.choose_action(pi_)

            states.append(observation)
            actions.append(
                ExperienceFrame.get_action_neurons(action, self.action_size))
            batch_last_action_rewards.append(last_action_reward)
            values.append(value_)

            if self.thread_index == 0 and self.local_t % LOG_INTERVAL == 0:
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Advance the environment by one step.
            new_state, reward, intrinsic_reward, terminal = \
                self.environment.process(action, short_term_goal,
                                         shift_weights)

            frame = ExperienceFrame(observation, map_input, loc_state,
                                    location_distribution, reward,
                                    intrinsic_reward, action, terminal,
                                    prior_action, prior_reward,
                                    prior_intrinsic)
            self.experience.add_frame(frame)

            total_reward = reward + intrinsic_reward
            self.episode_reward += total_reward
            rewards.append(total_reward)

            if not terminal:
                continue

            # ---- Episode finished ----
            terminal_end = True
            if reward > 0:
                self.correct_exits += 1

            steps_needed = self.local_t - self.last_terminal_local_t
            self.last_terminal_local_t = self.local_t
            self.steps_buffer.append(steps_needed)
            # Keep only the most recent 50 episode lengths.
            if len(self.steps_buffer) > 50:
                self.steps_buffer.popleft()
            print("Steps needed: ", steps_needed)
            print("score={}".format(self.episode_reward))
            self.episode_reward = 0

            # Curriculum: move to larger mazes once the agent is fast enough.
            mean_steps = np.mean(self.steps_buffer)
            step_limit = 100 + (self.maze_size - 7) * 20
            if mean_steps < step_limit and len(self.steps_buffer) == 50:
                self.maze_size += 2
                if self.maze_size > 13:
                    # Largest maze size exceeded: shut down. This exit skips
                    # the environment and network resets below.
                    print(">>>>>>>>>>> REACHED END <<<<<<<<<<<")
                    self.environment.stop()
                    sys.stdout.flush()
                    self.running = False
                    break
                print(">>>>>> SWITCHING TO MAZES OF SIZE ", self.maze_size,
                      "x", self.maze_size, " AT GLOBAL T ", global_t,
                      " <<<<<<<<<<<<<<<")
                sys.stdout.flush()
                # Restart the moving statistics for the new maze size.
                self.correct_exits = 0
                self.steps_buffer = deque()

            self.level_seed = np.random.randint(LEVEL_SET_SIZE)
            self.environment.reset(self.maze_size, self.level_seed)
            self.local_network.reset_state()
            break

        # Action/reward encoding of the final step, used for bootstrapping.
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            action, self.action_size, reward)

        # Bootstrap from the value estimate unless the episode ended, and
        # record whether the next call should apply the next-location loss.
        if terminal_end:
            ret = 0.0
            self.apply_next_location_loss = 0.0
        else:
            ret = self.local_network.run_value(sess, new_state,
                                               last_action_reward, frame.map)
            self.apply_next_location_loss = 1.0

        # Walk the rollout from the last step to the first, accumulating the
        # discounted return; the lists are therefore built newest-first.
        rev_si, rev_adv, rev_R = [], [], []
        for rew, obs, val in zip(reversed(rewards), reversed(states),
                                 reversed(values)):
            ret = rew + GAMMA * ret
            rev_si.append(obs)
            rev_adv.append(ret - val)
            rev_R.append(ret)

        # Flip back to chronological order for the caller.
        batch_si = rev_si[::-1]
        batch_adv = rev_adv[::-1]
        batch_R = rev_R[::-1]

        return (batch_si, batch_last_action_rewards, actions, batch_adv,
                batch_R, start_localization_state)