Example 1

from datetime import datetime
from multiprocessing import Process, Queue, Value
import time

import numpy as np

# Project-local modules; the paths follow the GA3C layout and may differ
# per repository.
from Config import Config
from Environment import Environment
from Experience import Experience


class ProcessAgent(Process):
    def __init__(self,
                 id,
                 prediction_q,
                 training_q,
                 episode_log_q,
                 display=False):
        super(ProcessAgent, self).__init__()

        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        self.env = Environment(display=display)

        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN,
                        Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]
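
    # A worked illustration of the backward recursion above (numbers are
    # illustrative only): with clipped rewards [0.0, 0.0, 1.0], discount 0.99
    # and terminal_reward = 1.0 (the bootstrapped value V(s) when the rollout
    # is cut off before the episode ends), the loop visits t = 1, 0:
    #     t = 1: reward_sum = 0.99 * 1.0  + 0.0 = 0.99
    #     t = 0: reward_sum = 0.99 * 0.99 + 0.0 = 0.9801
    # so the returned batch (the last experience is dropped) carries the
    # n-step returns [0.9801, 0.99]. On a terminal state terminal_reward is 0,
    # so nothing is bootstrapped past the end of the episode.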

    @staticmethod
    def convert_data(experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = np.array([exp.action for exp in experiences])
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        # print('agent%d put one prediction'%self.id)
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        # this variant passes the raw network output through as the action
        return prediction

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []

        time_count = 0
        reward_sum = 0.0

        while not done:

            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done = self.env.step(action)
            reward_sum += reward
            exp = Experience(self.env.previous_state, action, prediction,
                             reward, done)
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)

                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        print('start agent')
        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                if self.id == 0:
                    print('sum of reward is %f' % total_reward)
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put(
                (datetime.now(), total_reward, total_length))
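
The two queues above only work together with a predictor process on the other
end: predict() posts (self.id, state) on prediction_q and then blocks on the
agent's own wait_q until a (policy, value) pair comes back. The sketch below
shows a minimal counterpart under that assumption; the batching policy and the
predict_p_and_v method name are guesses at the NetworkVP interface, not part
of the example above.

def predictor_loop(model, agents, prediction_q, batch_size=32):
    # Batch incoming requests, run the shared network once, and route each
    # result back to the requesting agent via its private wait_q.
    while True:
        ids, states = [], []
        agent_id, state = prediction_q.get()      # block for the first request
        ids.append(agent_id)
        states.append(state)
        while len(ids) < batch_size and not prediction_q.empty():
            agent_id, state = prediction_q.get()  # drain whatever else is queued
            ids.append(agent_id)
            states.append(state)

        p, v = model.predict_p_and_v(np.array(states))
        for i, agent_id in enumerate(ids):
            agents[agent_id].wait_q.put((p[i], v[i]))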
Example 2
            11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            5, 5, 5
        ]
    }

    models = {
        name: NetworkVP('cpu:0', name, len(actions))
        for name in model_names
    }
    for model in models.values():
        model.load()

    while not done:
        if env.current_state is None:
            env.step(0)  # NO-OP while we wait for the frame buffer to fill.
        else:
            if command_steps > 0:
                command_steps -= 1
                if command.isdigit():
                    action = int(command)
                else:
                    model = models[command]
                    p = model.predict_p(
                        np.expand_dims(env.current_state, axis=0))[0]
                    # action = np.argmax(p)
                    action = np.random.choice(actions, p=p)
                _, done, _ = env.step(action)
            else:
                if command is None:
                    print(
Example 3
class ProcessAgent(Process):
    def __init__(self,
                 id,
                 prediction_q,
                 training_q,
                 episode_log_q,
                 reward_modifier_q=None):
        super(ProcessAgent, self).__init__()

        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q
        self.reward_modifier_q = reward_modifier_q

        self.env = Environment()
        self.num_actions = self.env.get_num_actions()
        self.onehots = np.eye(self.num_actions)
        self.actions = np.arange(self.num_actions)

        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN,
                        Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]

    def convert_data(self, experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = self.onehots[np.array([exp.action for exp in experiences],
                                   dtype=int)].astype(np.float32)
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        if Config.PLAY_MODE:
            action = np.argmax(prediction)
        else:
            action = np.random.choice(self.actions, p=prediction)
        return action

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []

        path = {
            "obs": [],
            "original_rewards": [],
            "actions": [],
            "human_obs": [],
        }

        time_count = 0

        while not done:
            # very first few frames
            if self.env.current_state is None:
                self.env.step(0)  # 0 == NOOP
                continue

            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done, info = self.env.step(action)
            exp = Experience(self.env.previous_state, action, prediction,
                             reward, done, info["human_obs"])
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                ################################
                #  START REWARD MODIFICATIONS  #
                ################################
                if self.reward_modifier_q:
                    # Translate the experiences into the "path" that RL-Teacher expects
                    if len(path["obs"]) > 0:
                        # Cut off the first item in the list because it's from an old episode
                        new_experiences = experiences[1:]
                    else:
                        new_experiences = experiences

                    path["obs"] += [e.state for e in new_experiences]
                    path["original_rewards"] += [
                        e.reward for e in new_experiences
                    ]
                    path["actions"] += [e.action for e in new_experiences]
                    path["human_obs"] += [e.human_obs for e in new_experiences]

                    #  TODO SPEED UP!! THIS IS SLOWING THINGS DOWN!
                    self.reward_modifier_q.put((self.id, done, path))
                    path["rewards"] = self.wait_q.get()

                    # Translate new rewards back into the experiences
                    for i in range(len(experiences)):
                        # Work backwards because the path is longer than the experience list, but their ends are synced
                        experiences[-(1 + i)].reward = path["rewards"][-(1 + i)]
                ################################
                #   END REWARD MODIFICATIONS   #
                ################################

                reward_sum = sum([x.reward for x in experiences])
                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put(
                (datetime.now(), total_reward, total_length))
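
The reward_modifier_q protocol above is one-way in the snippet: the agent posts
(self.id, done, path) and then blocks on its wait_q for a list of replacement
rewards. A hypothetical consumer of that queue, in the RL-Teacher style this
example appears to target, might look like the sketch below; reward_model and
its predict method are assumptions, not part of the example.

def reward_modifier_loop(reward_model, agents, reward_modifier_q):
    while True:
        agent_id, done, path = reward_modifier_q.get()
        # one predicted reward per step, aligned with path["obs"] so the agent
        # can copy the tail back onto its experience list
        rewards = reward_model.predict(path["obs"], path["actions"])
        agents[agent_id].wait_q.put(rewards)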