Example 1
0
    def _fill_experience(self, sess):
        """Run one environment step with the current policy and store the
        resulting transition in the replay buffer.

        Called repeatedly during the warm-up phase until ``self.experience``
        reports full; the policy is only evaluated here, not trained.

        Args:
          sess: TensorFlow session used to evaluate the local network.
        """
        # Snapshot the previous transition; the last action/reward pair is
        # fed to the network as an auxiliary input.
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        # Sample an action from the current policy (value output unused here).
        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        # Store the transition, including the pixel-change signal used by
        # the auxiliary tasks.
        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            # Buffer is filled: reset so training starts on a fresh episode.
            self.environment.reset()
            print("Replay buffer filled")
Example 2
0
    def _fill_experience(self, sess):
        """Run one warm-up environment step and store it in the replay buffer.

        Variant with a map input and a localization network: the policy run
        also yields a short-term goal, shift weights and a location
        distribution, all of which are recorded in the experience frame.

        Args:
          sess: TensorFlow session used to evaluate the local network.
        """
        # Previous transition info fed back to the network as auxiliary input.
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_intrinsic_reward = self.environment.last_intrinsic_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        input_map = self.environment.map
        # replan=False: keep following the current plan while filling the buffer.
        prev_localization_state, pi_, _, short_term_goal, shift_weights, location_distribution = self.local_network.run_policy_and_value(
            sess, prev_state, last_action_reward, input_map, replan=False)
        action = self.choose_action(pi_)

        new_state, reward, intrinsic_reward, terminal = self.environment.process(
            action, short_term_goal, shift_weights)

        frame = ExperienceFrame(prev_state, input_map, prev_localization_state,
                                location_distribution, reward,
                                intrinsic_reward, action, terminal,
                                last_action, last_reward,
                                last_intrinsic_reward)
        self.experience.add_frame(frame)

        if terminal:
            # Start a new randomly-seeded level of the current maze size.
            self.level_seed = np.random.randint(LEVEL_SET_SIZE)
            self.environment.reset(self.maze_size, self.level_seed)
        if self.experience.is_full():
            print(
                "Replay buffer filled--------------------------------------------------------------------------------------"
            )
            sys.stdout.flush()
Example 3
0
    def _fill_experience(self, sess):
        """Run one warm-up environment step and store it in the replay buffer.

        Variant whose states are dicts; segmentation entries (keys containing
        'objectType') are stripped before the frame is stored.

        Args:
          sess: TensorFlow session used to evaluate the local network.
        """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        # This variant's concat_action_and_reward also takes the previous state.
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward, prev_state)

        # Sample an action from the current policy (value and the third
        # output are unused here).
        pi_, _, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action, flag=0)

        # Drop 'objectType' (segmentation) entries from the stored state,
        # presumably to keep the replay buffer small — TODO confirm.
        frame = ExperienceFrame(
            {
                key: val
                for key, val in prev_state.items() if 'objectType' not in key
            }, reward, action, terminal, pixel_change, last_action,
            last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            # Buffer is filled: reset so training starts on a fresh episode.
            self.environment.reset()
            print("Replay buffer filled")
Example 4
0
    def _add_batch_to_exp(self, batch):
        """Append every transition in *batch* to the experience replay buffer.

        Tracks the running episode reward across calls and, when the batch
        ends an episode, returns the finished episode's total reward.

        Args:
          batch: rollout with parallel per-step fields ``si`` (states), ``a``
            (actions), ``a_r`` (action/reward vectors, reward last),
            ``features`` and ``pc`` (pixel change), plus a single
            ``terminal`` flag for the final step.

        Returns:
          Total reward of the finished episode, or ``None`` if the episode
          continues (or the batch was empty).
        """
        # If we just started, copy the first state as the last state.
        # The emptiness guard prevents an IndexError on an empty batch.
        if self.last_state is None and len(batch.si) > 0:
            self.last_state = batch.si[0]

        # Initialize before the loop so an empty batch does not leave
        # `terminal` unbound in the check below.
        terminal = False
        last_index = len(batch.si) - 1
        for k, state in enumerate(batch.si):
            action = batch.a[k]
            # Reward is the last entry of the concatenated action/reward vector.
            reward = batch.a_r[k][-1]

            self.episode_reward += reward
            features = batch.features[k]
            pixel_change = batch.pc[k]
            # Only the final transition of a terminal batch is marked terminal.
            terminal = (k == last_index and batch.terminal)
            frame = ExperienceFrame(state, reward, action, terminal, features,
                                    pixel_change, self.last_action,
                                    self.last_reward)
            self.experience.add_frame(frame)
            self.last_state = state
            self.last_action = action
            self.last_reward = reward

        if terminal:
            total_ep_reward = self.episode_reward
            self.episode_reward = 0
            return total_ep_reward
        return None
Example 5
0
 def record(self, obs, reward, terminal, pc, action):
     """Store one transition in the experience pool and report fill progress.

     Args:
       obs: current observation (unused here; the stored state is
         ``self.env.last_state``).
       reward: reward received for *action*.
       terminal: whether the episode ended at this step.
       pc: pixel-change signal for the auxiliary task.
       action: action taken this step.
     """
     last_state = self.env.last_state
     last_action = self.env.last_action
     last_reward = self.env.last_reward
     frame = ExperienceFrame(last_state, reward, action, terminal, pc,
                             last_action, last_reward)
     self.ExpPool.add_frame(frame)
     if self.ExpPool.is_full():
         print('Experience pool is filled!')
     # Progress line is rewritten in place via the carriage return.
     print('Filled %d/%d.' % (len(self.ExpPool._frames), MAX_EXP), end='\r')
     sys.stdout.flush()
Example 6
0
    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        """Roll out up to ``local_t_max`` steps of the base A3C agent and
        build a training batch.

        Stores every transition in the experience buffer and, when an episode
        terminates, records the score summary and resets the environment and
        the network's recurrent state.

        Args:
          sess: TensorFlow session used for all network evaluations.
          global_t: global training step (for the score summary).
          summary_writer: TensorBoard writer passed to ``_record_score``.
          summary_op: summary op passed to ``_record_score``.
          score_input: summary placeholder passed to ``_record_score``.

        Returns:
          ``(batch_si, last_action_rewards, batch_a, batch_adv, batch_R,
          start_lstm_state)`` — states, auxiliary last-action/reward inputs,
          one-hot actions, advantages and discounted returns in forward time
          order, plus the LSTM state at the start of the rollout.
        """
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # LSTM state before the rollout; the optimizer replays from here.
        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            #Modify Last State - with attention
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Periodic debug print from the first worker thread only.
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)  #Modify New State - with attention
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # Bootstrap the return with the value of the final state, unless the
        # episode ended inside the rollout (then R starts at 0).
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        # Reverse so returns/advantages can be accumulated backwards in time.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            # Discounted return and advantage for this step.
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        # Restore forward (time) order for the optimizer.
        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
Example 7
0
    def _process_base(self, sess, global_t, summary_writer, summary_op_dict,
                      summary_dict):  #, losses_input):
        """Roll out up to ``n_step_TD`` steps of the base A3C agent and build
        a training batch (segmentation-aware variant with dict states).

        Stores every transition in the experience buffer; on episode end,
        updates the score and success-rate entries of ``summary_dict`` and
        resets the environment and the network's recurrent state.

        Args:
          sess: TensorFlow session used for all network evaluations.
          global_t: global training step (unused directly in the body).
          summary_writer: TensorBoard writer (unused directly in the body).
          summary_op_dict: summary ops (unused directly in the body).
          summary_dict: mutable dict whose 'values' entry receives
            'score_input' and 'sr_input' on episode end.

        Returns:
          ``(batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv,
          batch_R, start_lstm_state)`` in forward time order; ``batch_sobjT``
          holds segmentation targets and is empty unless segnet mode >= 2.
        """
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # LSTM state before the rollout (only when the network is recurrent).
        start_lstm_state = None
        if self.use_lstm:
            start_lstm_state = self.local_network.base_lstm_state_out

        # Mode string switches the network run into segmentation mode.
        mode = "segnet" if self.segnet_mode >= 2 else ""
        # t_max times loop
        flag = 0
        for _ in range(self.n_step_TD):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward,
                self.environment.last_state)

            pi_, value_, losses = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward, mode)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Periodic debug print from the first worker thread; `flag` is
            # also forwarded to environment.process below.
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("Trainer {}>>> Local step {}:".format(
                    self.thread_index, self.local_t))
                print("Trainer {}>>> pi={}".format(self.thread_index, pi_))
                print("Trainer {}>>> V={}".format(self.thread_index, value_))
                flag = 1

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action, flag=flag)
            # Strip 'objectType' (segmentation) entries before storing.
            frame = ExperienceFrame(
                {
                    key: val
                    for key, val in prev_state.items()
                    if 'objectType' not in key
                }, reward, action, terminal, pixel_change, last_action,
                last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            # Use to know about Experience collection
            #print(self.experience.get_debug_string())

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            if terminal:
                terminal_end = True
                print("Trainer {}>>> score={}".format(
                    self.thread_index, self.episode_reward))  #, flush=True)

                summary_dict['values'].update(
                    {'score_input': self.episode_reward})

                success = 1 if self.environment._last_full_state[
                    "success"] else 0
                #print("Type:", type(self.environment._last_full_state["success"]), len(self.success_rates), success)
                self.success_rates.append(success)
                # Success rate is only reported once the window is full.
                summary_dict['values'].update({
                    'sr_input':
                    np.mean(self.success_rates)
                    if len(self.success_rates) == self.sr_size else 0
                })

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                if flag:
                    flag = 0
                break

        # Bootstrap the return with the value of the final state, unless the
        # episode ended inside the rollout (then R starts at 0).
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        # Reverse so returns/advantages can be accumulated backwards in time.
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []
        batch_sobjT = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            # Discounted return and advantage for this step.
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si['image'])
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)
            if self.segnet_param_dict["segnet_mode"] >= 2:
                batch_sobjT.append(si['objectType'])

        # Restore forward (time) order for the optimizer.
        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()
        batch_sobjT.reverse()

        #print(np.unique(batch_sobjT))

        ## HERE Mathematical Error A3C: only last values should be used for base/ or aggregate with last made

        return batch_si, batch_sobjT, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state
Example 8
0
    def _process_base(self, sess, global_t, map_input):
        """Roll out up to ``LOCAL_T_MAX`` steps of the base agent and build a
        training batch (localization + curriculum variant).

        Stores every transition in the experience buffer; on episode end,
        tracks exit success and steps needed, grows the maze size when the
        moving average of steps is low enough, and reseeds the level.

        Args:
          sess: TensorFlow session used for all network evaluations.
          global_t: global training step, used in the curriculum log message.
          map_input: map tensor fed to the localization network each step.

        Returns:
          ``(batch_si, batch_last_action_rewards, actions, batch_adv,
          batch_R, start_localization_state)`` in forward time order;
          ``actions`` are action-neuron vectors, ``batch_adv`` advantages,
          ``batch_R`` discounted returns.
        """
        # [Base A3C]
        states = []
        actions = []
        batch_last_action_rewards = []
        rewards = []
        values = []

        terminal_end = False
        # Replan only when the previous rollout did not carry a location loss.
        replan = (self.apply_next_location_loss == 0.0)

        # Localization state before the rollout; the optimizer replays from here.
        start_localization_state = self.local_network.localization_state_out

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            self.local_t += 1

            # Previous state
            prev_state = self.environment.last_state
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_intrinsic_reward = self.environment.last_intrinsic_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            prev_localization_state, pi_, value_, short_term_goal, shift_weights, location_distribution = self.local_network.run_policy_and_value(
                sess, prev_state, last_action_reward, map_input, replan)
            # Only the first step of the rollout may replan.
            replan = False

            action = self.choose_action(pi_)

            states.append(prev_state)
            actions.append(
                ExperienceFrame.get_action_neurons(action, self.action_size))
            batch_last_action_rewards.append(last_action_reward)
            values.append(value_)

            # Periodic debug print from the first worker thread only.
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # Process game
            new_state, reward, intrinsic_reward, terminal = self.environment.process(
                action, short_term_goal, shift_weights)

            frame = ExperienceFrame(prev_state, map_input,
                                    prev_localization_state,
                                    location_distribution, reward,
                                    intrinsic_reward, action, terminal,
                                    last_action, last_reward,
                                    last_intrinsic_reward)

            # Store to experience
            self.experience.add_frame(frame)

            # Extrinsic and intrinsic rewards are summed for training.
            self.episode_reward += reward + intrinsic_reward

            rewards.append(reward + intrinsic_reward)

            if terminal:
                terminal_end = True
                # Positive terminal reward is treated as a correct exit.
                if reward > 0: self.correct_exits += 1
                steps_needed = self.local_t - self.last_terminal_local_t
                self.last_terminal_local_t = self.local_t
                # Moving window of the last 50 episode lengths.
                self.steps_buffer.append(steps_needed)
                if len(self.steps_buffer) > 50:
                    self.steps_buffer.popleft()
                print("Steps needed: ", steps_needed)
                print("score={}".format(self.episode_reward))
                self.episode_reward = 0

                # Curriculum: once 50 episodes average under the size-dependent
                # step threshold, move to a bigger maze (stop beyond 13x13).
                if (np.mean(self.steps_buffer) < 100 +
                    (self.maze_size - 7) * 20
                        and len(self.steps_buffer) == 50):
                    self.maze_size += 2
                    if self.maze_size > 13:
                        print(">>>>>>>>>>> REACHED END <<<<<<<<<<<")
                        self.environment.stop()
                        sys.stdout.flush()
                        self.running = False
                        break
                    print(">>>>>> SWITCHING TO MAZES OF SIZE ", self.maze_size,
                          "x", self.maze_size, " AT GLOBAL T ", global_t,
                          " <<<<<<<<<<<<<<<")
                    sys.stdout.flush()
                    #reset moving average
                    self.correct_exits = 0
                    self.steps_buffer = deque()

                self.level_seed = np.random.randint(LEVEL_SET_SIZE)
                self.environment.reset(self.maze_size, self.level_seed)
                self.local_network.reset_state()
                break

        # NOTE(review): `action`/`reward` (and `new_state` below) would be
        # unbound if the loop body never ran — assumes LOCAL_T_MAX >= 1.
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            action, self.action_size, reward)
        # Bootstrap the return with the value of the final state, unless the
        # episode ended inside the rollout (then R starts at 0).
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, new_state,
                                             last_action_reward, frame.map)
            self.apply_next_location_loss = 1.0
        else:
            self.apply_next_location_loss = 0.0

        # Reverse so returns/advantages can be accumulated backwards in time.
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_adv = []
        batch_R = []

        for (ri, si, Vi) in zip(rewards, states, values):
            # Discounted return and advantage for this step.
            R = ri + GAMMA * R
            adv = R - Vi

            batch_si.append(si)
            batch_adv.append(adv)
            batch_R.append(R)

        # Restore forward (time) order for the optimizer.
        batch_si.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, batch_last_action_rewards, actions, batch_adv, batch_R, start_localization_state
Example 9
0
 def _add_frame(self, experience, reward):
     """Add a placeholder frame that carries only *reward* to *experience*.

     All other ExperienceFrame fields (state, action, terminal flag and the
     last-action/last-reward pair) are filled with zero/False dummies.
     """
     dummy_frame = ExperienceFrame(0, reward, 0, False, 0, 0, 0)
     experience.add_frame(dummy_frame)
Example 10
0
    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input, average_entropy):
        """Roll out up to ``LOCAL_T_MAX`` steps of the base A3C agent and
        build a training batch (maze-map variant).

        Each step also feeds the current maze-map image to the network, and
        the map is updated from the environment's translation/rotation
        velocities. On episode end, the score and average policy entropy are
        recorded and environment, map and network state are reset.

        Args:
          sess: TensorFlow session used for all network evaluations.
          global_t: global training step (for the score summary).
          summary_writer: TensorBoard writer passed to ``_record_score``.
          summary_op: summary op passed to ``_record_score``.
          score_input: summary placeholder passed to ``_record_score``.
          average_entropy: summary placeholder passed to ``_record_score``.

        Returns:
          ``(batch_si, batch_mi, last_action_rewards, batch_a, batch_adv,
          batch_R, start_lstm_state)`` — states, map images, auxiliary
          inputs, one-hot actions, advantages and discounted returns in
          forward time order, plus the initial LSTM state.
        """
        # [Base A3C]
        states = []
        map_states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []
        episode_entropy = 0.0
        episode_steps = 0
        terminal_end = False

        # LSTM state before the rollout; the optimizer replays from here.
        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(LOCAL_T_MAX):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            # 84x84 rendering of the internally-tracked maze map.
            prev_map_state = self.mazemap.get_map(84, 84)
            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, prev_map_state,
                last_action_reward)
            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            map_states.append(prev_map_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Periodic debug print from the first worker thread only.
            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change, vtrans, vrot = self.environment.process(
                action)

            # Dead-reckon the map from translation/rotation velocities;
            # a reward above 9 presumably signals goal reached / teleport,
            # invalidating the map — TODO confirm against the environment.
            self.mazemap.update(vtrans, vrot)
            if reward > 9:
                self.mazemap.reset()

            frame = ExperienceFrame(prev_state, prev_map_state, reward, action,
                                    terminal, pixel_change, last_action,
                                    last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward
            # NOTE(review): this accumulates sum(pi*log(pi)), i.e. NEGATIVE
            # entropy — confirm the intended sign for the summary.
            episode_entropy += np.sum(pi_ * np.log(pi_))
            episode_steps += 1

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward,
                                   average_entropy,
                                   episode_entropy / episode_steps, global_t)

                self.episode_reward = 0
                episode_entropy = 0.0
                episode_steps = 0
                self.environment.reset()
                self.mazemap.reset()
                self.local_network.reset_state()
                break

        # Bootstrap the return with the value of the final state, unless the
        # episode ended inside the rollout (then R starts at 0).
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, self.mazemap.get_map(84, 84),
                frame.get_last_action_reward(self.action_size))

        # Reverse so returns/advantages can be accumulated backwards in time.
        actions.reverse()
        states.reverse()
        map_states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_mi = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, mi, Vi) in zip(actions, rewards, states, map_states,
                                        values):
            # Discounted return and advantage for this step.
            R = ri + GAMMA * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_mi.append(mi)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        # Restore forward (time) order for the optimizer.
        batch_si.reverse()
        batch_mi.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, batch_mi, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state