from datetime import datetime
from multiprocessing import Process, Queue, Value
import time

import numpy as np

# Project-local modules (names inferred from usage in this file).
from Config import Config
from Environment import Environment
from Experience import Experience


class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, display=False):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        self.env = Environment(display=display)
        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        # Discounted returns, computed backwards from the bootstrap value.
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        # The last experience only supplies the bootstrap value; drop it.
        return experiences[:-1]

    @staticmethod
    def convert_data(experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = np.array([exp.action for exp in experiences])
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        # print('agent%d put one prediction' % self.id)
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        return prediction

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []

        time_count = 0
        reward_sum = 0.0

        while not done:
            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done = self.env.step(action)
            reward_sum += reward
            exp = Experience(self.env.previous_state, action, prediction, reward, done)
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        print('start agent')
        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                if self.id == 0:
                    print('sum of reward is %f' % total_reward)
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
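# ---------------------------------------------------------------------------
# Illustration only, not part of the original file: a minimal sketch of the
# other end of the prediction_q / wait_q handshake, assuming a GA3C-style
# predictor thread. The batching policy and the `model.predict_p_and_v` call
# are assumptions here; treat the exact names as placeholders.
def predictor_loop(model, agents, prediction_q, batch_size=32):
    while True:
        # Block for the first request, then greedily batch whatever is queued.
        ids, states = [], []
        agent_id, state = prediction_q.get()
        ids.append(agent_id)
        states.append(state)
        while len(ids) < batch_size and not prediction_q.empty():
            agent_id, state = prediction_q.get()
            ids.append(agent_id)
            states.append(state)

        # One forward pass for the whole batch of requests.
        p_batch, v_batch = model.predict_p_and_v(np.array(states))

        # Each agent blocks on its own wait_q until its answer arrives.
        for i, agent_id in enumerate(ids):
            agents[agent_id].wait_q.put((p_batch[i], v_batch[i]))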
        11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5
    ]
}

models = {
    name: NetworkVP('cpu:0', name, len(actions))
    for name in model_names
}
for model in models.values():
    model.load()

while not done:
    if env.current_state is None:
        env.step(0)  # NO-OP while we wait for the frame buffer to fill.
    else:
        if command_steps > 0:
            command_steps -= 1
            if command.isdigit():
                action = int(command)
            else:
                model = models[command]
                p = model.predict_p(
                    np.expand_dims(env.current_state, axis=0))[0]
                # action = np.argmax(p)
                action = np.random.choice(actions, p=p)
            _, done, _ = env.step(action)
        else:
            if command is None:
                print(
# Variant of ProcessAgent extended with a reward_modifier_q, so that rewards
# can be re-scored by an external reward model via the RL-Teacher "path" flow.
class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, reward_modifier_q=None):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q
        self.reward_modifier_q = reward_modifier_q

        self.env = Environment()
        self.num_actions = self.env.get_num_actions()
        self.onehots = np.eye(self.num_actions)
        self.actions = np.arange(self.num_actions)

        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]

    def convert_data(self, experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = self.onehots[np.array([exp.action for exp in experiences],
                                   dtype=int)].astype(np.float32)
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        if Config.PLAY_MODE:
            action = np.argmax(prediction)
        else:
            action = np.random.choice(self.actions, p=prediction)
        return action

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []
        path = {
            "obs": [],
            "original_rewards": [],
            "actions": [],
            "human_obs": [],
        }

        time_count = 0
        while not done:
            # very first few frames
            if self.env.current_state is None:
                self.env.step(0)  # 0 == NOOP
                continue

            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done, info = self.env.step(action)
            exp = Experience(self.env.previous_state, action, prediction,
                             reward, done, info["human_obs"])
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                ################################
                # START REWARD MODIFICATIONS  #
                ################################
                if self.reward_modifier_q:
                    # Translate the experiences into the "path" that RL-Teacher expects
                    if len(path["obs"]) > 0:
                        # Cut off the first item in the list because it's from an old episode
                        new_experiences = experiences[1:]
                    else:
                        new_experiences = experiences
                    path["obs"] += [e.state for e in new_experiences]
                    path["original_rewards"] += [e.reward for e in new_experiences]
                    path["actions"] += [e.action for e in new_experiences]
                    path["human_obs"] += [e.human_obs for e in new_experiences]

                    # TODO SPEED UP!! THIS IS SLOWING THINGS DOWN!
                    self.reward_modifier_q.put((self.id, done, path))
                    path["rewards"] = self.wait_q.get()

                    # Translate new rewards back into the experiences
                    for i in range(len(experiences)):
                        # Work backwards because the path is longer than the
                        # experience list, but their ends are synced
                        experiences[-(1 + i)].reward = path["rewards"][-(1 + i)]
                ################################
                # END REWARD MODIFICATIONS    #
                ################################

                reward_sum = sum([x.reward for x in experiences])
                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
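# ---------------------------------------------------------------------------
# Illustration only, not part of the original file: one plausible shape, under
# assumptions, for the process that services reward_modifier_q. It receives
# (agent_id, done, path), re-scores the path with a learned reward model, and
# replies on that agent's wait_q (the queue the agent blocks on after the put
# above). `predictor.predict_reward(path)` and `predictor.path_callback(path)`
# stand in for the reward model's API and are assumptions here.
def reward_modifier_loop(predictor, agents, reward_modifier_q):
    while True:
        agent_id, done, path = reward_modifier_q.get()

        # Re-score the whole path with the learned reward function.
        path["rewards"] = predictor.predict_reward(path)

        # Hand the rewards back to the agent that submitted the path.
        agents[agent_id].wait_q.put(path["rewards"])

        if done and hasattr(predictor, "path_callback"):
            # Completed episodes can also be logged, e.g. to request
            # human comparisons for further reward-model training.
            predictor.path_callback(path)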