Example 1
import numpy as np

class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    # Random action used during the practice phase.
    def act_pre(self):
        return np.random.randint(self.n_action)

    # Epsilon-greedy action selection.
    def act(self, s):
        if np.random.random() > self.epsilon:
            return self.greedy_act(s)
        return np.random.randint(self.n_action)

    def greedy_act(self, s):
        return self.net.action(s)

    # Practice: trains on the buffer without recording new experiences.
    def practice(self):
        self.lr = 1e-3  # learning rate for the practice (pre-training) phase
        self.net.pre_train(self.buffer, self.lr)

    # Records experiences and calls the training functions.
    def record(self, s, a, r, d, it, pre):
        # The variable `pre` differentiates practice from RL training.
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3
                    self.net.pre_train(self.buffer, self.lr)

        else:
            self.buffer.append(s, a, r, d)
            # Adjust the exploration rate epsilon on a piecewise-linear schedule.
            if it <= 5e5:
                self.epsilon = linear_interp(0, 5e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(5e5, 10e6, it, 0.01, 0.1),
                                   0.01)

            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4  #Learning rate for RL training
                    self.net.train(self.buffer, self.lr)

                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
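
This example (and Example 2 below) relies on a linear_interp helper that is not shown. A minimal sketch consistent with the call sites above, assuming the fourth argument is the value reached at the end of the interval and the fifth is the starting value, with clamping outside the interval:

def linear_interp(x0, x1, x, y_end, y_start):
    # Assumed semantics: return y_start at x == x0, y_end at x == x1,
    # linearly interpolated and clamped in between.
    frac = (x - x0) / float(x1 - x0)
    frac = min(max(frac, 0.0), 1.0)
    return y_start + frac * (y_end - y_start)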
Example 2
import numpy as np

class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    def act_pre(self):
        return np.random.randint(self.n_action)

    def act(self, s):
        if np.random.random() > self.epsilon:
            return self.greedy_act(s)
        return np.random.randint(self.n_action)

    def greedy_act(self, s):
        return self.net.action(s)

    def record(self, s, a, r, d, it, pre):

        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3  # learning rate for the practice (pre-training) phase
                    self.net.pre_train(self.buffer, self.lr)

        else:
            self.buffer.append(s, a, r, d)
            if it <= 6e5:
                self.epsilon = linear_interp(0, 6e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(6e5, 10e6, it, 0.01, 0.1),
                                   0.01)

            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4
                    self.net.train(self.buffer, self.lr)

                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
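
Both examples construct ReplayBuffer(max_size, frame_stack) and only call append(s, a, r, d) on it; the class itself is not shown. A rough sketch of one possible implementation, assuming individual frames are stored and stacked into states only at sampling time (episode boundaries are ignored for brevity):

import numpy as np
from collections import deque

class ReplayBuffer:
    # Hypothetical buffer matching the interface used above; not the original code.

    def __init__(self, max_size, frame_stack):
        self.frames = deque(maxlen=max_size)
        self.actions = deque(maxlen=max_size)
        self.rewards = deque(maxlen=max_size)
        self.dones = deque(maxlen=max_size)
        self.frame_stack = frame_stack

    def append(self, s, a, r, d):
        self.frames.append(s)
        self.actions.append(a)
        self.rewards.append(r)
        self.dones.append(d)

    def _stacked(self, idx):
        # Stack the frame_stack frames ending at index idx along a new first axis.
        return np.stack([self.frames[i]
                         for i in range(idx - self.frame_stack + 1, idx + 1)])

    def sample(self, batch_size):
        # Sample indices that leave room for a full stack and a next state.
        idxs = np.random.randint(self.frame_stack, len(self.frames) - 1,
                                 size=batch_size)
        s = np.array([self._stacked(i) for i in idxs])
        s_next = np.array([self._stacked(i + 1) for i in idxs])
        a = np.array([self.actions[i] for i in idxs])
        r = np.array([self.rewards[i] for i in idxs])
        d = np.array([self.dones[i] for i in idxs])
        return s, a, r, d, s_next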
Example 3
            # Build the replay tuple for this transition.
            n_state = getState(game)
            n_actions = []
            acts = []
            for el in processActions(game, state):
                n_actions.append(el[2])
                acts.append(el[0])
            if action == 'E':
                game.playerUpdate()

            if action in ('S', 'C'):
                r = game.board.vertices[spec[0]][spec[1]].score
            else:
                r = rewards[action]
            rBuffer.append((state, r + win * winnings, n_actions, acts))
            rBuffer.pop()

            if count > wait:
                states = []
                targets = []
                for s_0, r, s_1, a in rBuffer.sample(batch_size):
                    states.append(s_0)
                    # Bellman target: reward plus the discounted max target-network value.
                    q_t = np.max(Qtarget.evaluate(s_1))
                    targets.append(r + gamma * q_t)

                loss += Qprincipal.train(states, targets).history['loss'][0]

                softUpdate(Qprincipal, Qtarget, alpha)
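
softUpdate(Qprincipal, Qtarget, alpha) is referenced but not defined in this fragment. Assuming the two network objects expose Keras-style get_weights()/set_weights(), a soft (Polyak) target update could look like this:

def softUpdate(principal, target, alpha):
    # theta_target <- alpha * theta_principal + (1 - alpha) * theta_target
    new_weights = [alpha * p + (1.0 - alpha) * t
                   for p, t in zip(principal.get_weights(),
                                   target.get_weights())]
    target.set_weights(new_weights)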
Example 4
    def train(self):
        """Train."""
        logs_path = self.args.logs_path
        video_path = self.args.video_path
        restore = self.args.restore
        train = self.args.train

        # Initialize the PLE environment headlessly (no display).
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"

        # Reward shaping for the environment.
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }

        # Create FlappyBird game env
        env = PLE(FlappyBird(),
                  display_screen=False,
                  reward_values=reward_values)

        # Gets the actions FlappyBird supports
        action_set = env.getActionSet()

        replay_buffer = ReplayBuffer(self.hparams.replay_buffer_size)
        agent = Agent(action_set, self.hparams)

        # restore model
        if restore:
            agent.restore(restore)

        reward_logs = []
        loss_logs = []

        for episode in range(1, self.hparams.total_episode + 1):
            # reset env
            env.reset_game()
            env.act(0)
            obs = convert(env.getScreenGrayscale())
            # Initialize the frame stack with four copies of the first observation.
            state = np.stack([[obs for _ in range(4)]], axis=0)
            t_alive = 0
            total_reward = 0

            if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                agent.stop_epsilon()
                frames = [env.getScreenRGB()]

            while not env.game_over():
                action = agent.take_action(state)
                reward = env.act(action_set[action])

                if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                    frames.append(env.getScreenRGB())
                obs = convert(env.getScreenGrayscale())
                obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])

                # Slide the frame stack: drop the oldest frame, append the newest.
                state_new = np.append(state[:, 1:, ...], obs, axis=1)
                action_onehot = np.zeros(len(action_set))
                action_onehot[action] = 1

                t_alive += 1
                total_reward += reward
                replay_buffer.append(
                    (state, action_onehot, reward, state_new, env.game_over()))
                state = state_new

            # save video
            if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                os.makedirs(video_path, exist_ok=True)
                clip = make_video(frames, fps=60).rotate(-90)
                clip.write_videofile(os.path.join(
                    video_path, 'env_{}.mp4'.format(episode)),
                                     fps=60)
                agent.restore_epsilon()
                print('Episode: {} t: {} Reward: {:.3f}'.format(
                    episode, t_alive, total_reward))
                # Notebook-only hack: display the most recently written video inline.
                mp4list = glob.glob('./video_XXX/*.mp4')
                if len(mp4list) > 0:
                    # Pick the file with the latest modification time.
                    latest = mp4list[0]
                    latest_timestamp = os.path.getmtime(mp4list[0])
                    for mp4 in mp4list:
                        ts = os.path.getmtime(mp4)
                        if ts > latest_timestamp:
                            latest_timestamp = ts
                            latest = mp4
                    video = io.open(latest, 'rb').read()
                    encoded = base64.b64encode(video)
                    ipythondisplay.display(
                        HTML(data='''<video alt="test" autoplay
                                    loop controls style="height: 400px;">
                                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                                 </video>'''.format(encoded.decode('ascii'))))
                else:
                    print("Could not find video")

            if episode > self.hparams.initial_observe_episode and train:
                # save model
                if episode % self.hparams.save_logs_frequency == 0:
                    agent.save(episode, logs_path)
                    np.save(os.path.join(logs_path, 'loss.npy'),
                            np.array(loss_logs))
                    np.save(os.path.join(logs_path, 'reward.npy'),
                            np.array(reward_logs))

                # update target network
                if episode % self.hparams.update_target_frequency == 0:
                    agent.update_target_network()

                # sample batch from replay buffer
                batch_state, batch_action, batch_reward, batch_state_new, batch_over = replay_buffer.sample(
                    self.hparams.batch_size)

                # update policy network
                loss = agent.update_Q_network(batch_state, batch_action,
                                              batch_reward, batch_state_new,
                                              batch_over)

                loss_logs.append([episode, loss])
                reward_logs.append([episode, total_reward])

                # print reward and loss
                if episode % self.hparams.show_loss_frequency == 0:
                    print(
                        'Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'.format(
                            episode, t_alive, total_reward, loss))

                agent.update_epsilon()
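
The convert preprocessing applied to env.getScreenGrayscale() is not shown. A plausible sketch, assuming the usual DQN-style preprocessing of downsampling the frame and scaling pixels to [0, 1]; the 84x84 target size and the use of OpenCV are assumptions:

import numpy as np
import cv2  # assumption: OpenCV is available for resizing

def convert(gray_frame, size=(84, 84)):
    # Downsample the grayscale PLE frame and normalize pixel values to [0, 1].
    frame = cv2.resize(gray_frame, size, interpolation=cv2.INTER_AREA)
    return frame.astype(np.float32) / 255.0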