Example #1
    def replay_steps(self, states, actions, rewards, last_state, last_terminal: bool):
        # bootstrap the return R from the critic's value of the last state, where no action
        # was taken; start from 0 on terminal episodes (the reward check is environment-specific)
        R = 0 if last_terminal and rewards[-1] < 9 else self.Critic.predict(t(last_state))
        # reset gradients for optimizers
        self.Actor.optimizer.zero_grad()
        self.Critic.optimizer.zero_grad()
        critic_loss, actor_loss = 0, 0
        # go backwards through states, actions and rewards taken in this episode
        for i in reversed(range(len(rewards))):
            self.accum_rewards += rewards[i]
            R = rewards[i] + self.discount_rate * R
            advantage = (R - self.Critic.predict(t(states[i])))
            # get Beta distribution parameters with which the action was drawn
            alpha, beta = self.Actor.predict(t(states[i]))

            # enable argument validation so invalid (alpha, beta) values raise immediately
            torch.distributions.Beta.set_default_validate_args(True)
            dist = torch.distributions.Beta(alpha, beta)

            # accumulate critic loss
            critic_loss = critic_loss + advantage.pow(2).mean()
            # accumulate actor loss - we want to maximize expected return, so we negate the
            # objective; minimizing this negated loss then raises the log-probability of
            # actions with positive advantage
            actor_loss = actor_loss - dist.log_prob(self.Actor.action_to_beta(t(actions[i]))) * advantage.detach()

        # compute gradients wrt. weights
        actor_loss.backward()
        critic_loss.backward()
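The method above depends on two helpers that are not shown: t(), which appears to convert NumPy states/actions to tensors, and Actor.action_to_beta(), which maps an environment action onto the (0, 1) support of the Beta distribution. A minimal sketch of what they might look like, assuming a bounded continuous action space (the bounds below are placeholders, not taken from the example):

import numpy as np
import torch

def t(x):
    # assumed helper: turn a NumPy observation/action into a float32 tensor
    return torch.from_numpy(np.asarray(x, dtype=np.float32))

def action_to_beta(action, low=-1.0, high=1.0):
    # hypothetical mapping: rescale an action from [low, high] onto the Beta support (0, 1)
    return (action - low) / (high - low)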
Example #2
async def _(event):
    # Telethon-style event handler: reply with the handler latency and the bot's uptime
    start = datetime.now()
    chat_id = event.chat_id
    end = datetime.now()
    ms = (end - start).microseconds / 1000  # elapsed handler time in milliseconds
    uptime = t((time.time() - StartTime))
    await AnimeBot.send_message(chat_id, f"🏓Ping speed: {ms}\n😵Uptime: {uptime}")
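Here t() plays a different role than in the other examples: it formats the elapsed seconds since StartTime into a readable uptime string. A hypothetical formatter of that kind (the bot's actual helper is not shown):

def t(seconds: float) -> str:
    # hypothetical helper: format elapsed seconds as "Hh:MMm:SSs"
    seconds = int(seconds)
    hours, rem = divmod(seconds, 3600)
    minutes, secs = divmod(rem, 60)
    return f"{hours}h:{minutes:02d}m:{secs:02d}s"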
Example #3
    def render(self):
        # play a few episodes with the greedy policy and render them to the screen
        for e in range(10):
            state = self.env.reset()
            done = False
            score = 0
            while not done:
                self.env.render()
                action = self.Actor.get_best_action(t(state))
                state, reward, done, _ = self.env.step(action)
                score += reward
                if done:
                    a3c_logger.info("episode: {}, score: {}".format(e, score))
                    break
        self.env.close()
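get_best_action() is also not shown. Given the Beta-parameterised policy in Example #1, one plausible reading is to act on the distribution's mean instead of sampling; a sketch under that assumption (the action bounds are placeholders):

import torch

def get_best_action(self, state, low=-1.0, high=1.0):
    # deterministic action: take the Beta mean alpha/(alpha+beta) rather than a sample
    with torch.no_grad():
        alpha, beta = self.predict(state)
        mean = alpha / (alpha + beta)
    # rescale from the (0, 1) Beta support back to the environment's action range
    return (low + (high - low) * mean).numpy()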
Example #4
    def run(self):
        if self.globalA3C is None:
            raise Exception("Global model is not set! Please call set_global_model(global_model) to set the parent model.")

        state = self.env.reset()  # reset env and get initial state
        episode = 0
        while episode < self.max_episodes:
            # reset stuff
            is_terminal = False
            states, actions, rewards = [], [], []
            step_start = self.step

            while not is_terminal and self.step - step_start < self.step_max:
                states.append(state)  # register current state
                action = self.Actor.draw_action(t(state))  # draw action
                next_state, reward, is_terminal, info = self.env.step(action)  # perform action
                actions.append(action)  # register action
                rewards.append(reward)  # register reward
                state = next_state
                self.step += 1

            # replay experience backwards and compute gradients
            self.replay_steps(states, actions, rewards, state, is_terminal)
            self.lock.acquire()
            self.update_global_models()
            self.sync_models()
            self.globalA3C.episode += 1
            episode = self.globalA3C.episode
            self.lock.release()

            if episode % self.measure_step == 0 and self.eval_repeats != 0:
                self.lock.acquire()
                mean, _ = self.evaluate(self.eval_repeats)
                self.globalA3C.performance.append([episode, mean])
                self.lock.release()
                if self.log_info:
                    a3c_logger.info(f"\nEpisode: {episode}\nMean accumulated rewards: {mean}")

            if is_terminal:
                self.update_local_results()
                state = self.env.reset()  # reset env and get initial state
                self.local_episode += 1

        self.env.close()

    def evaluate(self, eval_repeats=20):
        self.Actor.model.eval()
        self.Critic.model.eval()
        scores = []
        for ep in range(eval_repeats):
            state = self.env.reset()
            done = False
            performance = 0
            while not done:
                with torch.no_grad():
                    action = self.Actor.get_best_action(t(state))
                state, reward, done, _ = self.env.step(action)
                performance += reward

            scores.append([ep + 1, performance])

        scores = np.array(scores)
        self.Actor.model.train()
        self.Critic.model.train()
        return scores[:, 1].mean(), scores
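Example #4 presupposes wiring that is not shown: a global model holder with episode and performance attributes, a lock shared between workers, and a set_global_model() call (named in the exception raised by run()). A minimal launch sketch under those assumptions; Worker and GlobalA3C are illustrative names, not taken from the examples:

import threading

lock = threading.Lock()
global_a3c = GlobalA3C()                          # assumed holder of .episode and .performance
workers = [Worker(lock=lock) for _ in range(4)]   # each worker runs the run() loop above

threads = []
for w in workers:
    w.set_global_model(global_a3c)                # name taken from the error message in run()
    th = threading.Thread(target=w.run)
    th.start()
    threads.append(th)

for th in threads:
    th.join()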