Example #1
    def play(self):
        env = Env()
        if self.render:
            env.init_render()
        scores = 0
        steps = 0
        episodes = 10
        for e in range(episodes):
            step = 0
            done = False
            observe, _, _, _ = env.reset()
            state = preprocess(observe).reshape((1, RESIZE, RESIZE))
            state = np.float32(state / 255.)
            # tile the first frame seq_size times to build the initial history
            history = np.copy(state)
            for _ in range(self.seq_size - 1):
                history = np.append(history, state, axis=0)
            history = np.reshape([history], (1, self.seq_size, RESIZE, RESIZE))
            while not done:
                # snap1 = history[0][0]
                # snap2 = history[0][1]
                # Image.fromarray(snap1 * 255.).show()
                # Image.fromarray(snap2 * 255.).show()
                step += 1
                if self.render:
                    env.render()
                action, policy = self.get_action(history)
                pmax_action = np.argmax(policy)
                print(ACTION[action], policy, ACTION[pmax_action])
                while True:
                    key = input('Press y or action: ')
                    if key in ['0', '1', '2', '3']:
                        action = int(key)
                        break
                    elif key == 'y':
                        break
                # if np.random.uniform() > 0.5:
                #     action = pmax_action
                next_observe, reward, done, info = env.step(action + 1)
                next_state = preprocess(next_observe)
                next_state = np.reshape([next_state], (1, 1, RESIZE, RESIZE))
                next_state = np.float32(next_state / 255.)
                next_history = np.append(
                    next_state, history[:, :self.seq_size - 1, :, :], axis=1)

                history = next_history

            steps += step
            scores += env.game.score
            step = 0

        print('AvgScore: %s AvgStep: %s' %
              (scores / episodes, steps / episodes))
        return scores / episodes, steps / episodes
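Note: the play/run loops in these examples all keep a rolling stack of the last seq_size preprocessed frames. Below is a minimal, self-contained sketch of that sliding-window update using the channels-first layout (1, seq_size, RESIZE, RESIZE) from Example #1; the RESIZE and SEQ_SIZE values here are assumptions, not taken from the project. Example #1 prepends the newest frame, while Examples #2-#4 append it at the end; the sketch follows the append convention.

import numpy as np

RESIZE = 84    # assumed frame size
SEQ_SIZE = 4   # assumed history length (self.seq_size in the examples)

def init_history(frame):
    # tile the first preprocessed frame SEQ_SIZE times -> (1, SEQ_SIZE, H, W)
    frame = np.float32(frame / 255.).reshape((1, 1, RESIZE, RESIZE))
    return np.repeat(frame, SEQ_SIZE, axis=1)

def push_frame(history, frame):
    # drop the oldest frame and append the newest along the sequence axis
    frame = np.float32(frame / 255.).reshape((1, 1, RESIZE, RESIZE))
    return np.append(history[:, 1:], frame, axis=1)

# usage with dummy uint8 frames standing in for environment observations
history = init_history(np.zeros((RESIZE, RESIZE), dtype=np.uint8))
history = push_frame(history, np.ones((RESIZE, RESIZE), dtype=np.uint8))
assert history.shape == (1, SEQ_SIZE, RESIZE, RESIZE)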
Example #2
    def play(self, episodes=10, delay=0, improve='policy', debug=False, SNAPSHOT=False):
        env = Env()
        scores = 0
        steps = 0
        print('Value\tRandom\tGreedy\tPolicy')
        for e in range(episodes):
            step = 0
            done = False
            observe, _, _, _ = env.reset()
            state = preprocess(observe).reshape((1, RESIZE, RESIZE, 1))
            state = np.float32(state / 255.)
            history = np.stack([state] * self.seq_size, axis=1)
            while not done:
                time.sleep(delay)
                step += 1
                if self.render:
                    env.render()
                    if SNAPSHOT:
                        # stack the seq_size frames vertically for a quick visual check
                        snapshot = np.array([]).reshape([0, RESIZE])
                        for snap in history[0]:
                            snapshot = np.append(snapshot, snap[..., 0], axis=0)
                        Image.fromarray(np.uint8(snapshot * 255.)).show()
                action, policy = self.get_action(history)
                if improve == 'greedy':
                    real_action = int(np.argmax(policy))
                elif improve == 'e-greedy':
                    # epsilon-greedy: act greedily with prob. 0.9, otherwise keep the sampled action
                    if np.random.uniform() > 0.1:
                        real_action = int(np.argmax(policy))
                    else:
                        real_action = action
                else:
                    real_action = action
                value = self.critic.predict(history)
                print(value, '\t', ACTION[action], '\t', ACTION[int(np.argmax(policy))], '\t', policy)
                if debug:
                    while True:
                        a = input('Press y or action(w(stay), a(left), d(right)): ')
                        if a == 'y':
                            break
                        elif a == 'w':
                            real_action = 0
                            break
                        elif a == 'a':
                            real_action = 1
                            break
                        elif a == 'd':
                            real_action = 2
                            break
                next_observe, reward, done, info = env.step(real_action)
                next_state = preprocess(next_observe).reshape((1, RESIZE, RESIZE, 1))
                next_state = np.float32(next_state / 255.)
                next_history = np.append(history[0][1:], next_state, axis=0)
                next_history = np.float32([next_history])
                history = next_history

            steps += step
            scores += env.game.score

            print('Score: %s Step: %s' % (env.game.score, step))
        return scores/episodes, steps/episodes
Example #3
    def run(self):
        global episode
        env = Env()

        while True:
            # print(self.tid, 'Still Training!')
            step = 0
            self.avg_p_max = 0
            self.reward_sum = 0
            self.actor_loss = 0
            self.critic_loss = 0
            done = False
            observe, _, _, _ = env.reset()

            state = preprocess(observe).reshape((1, RESIZE, RESIZE, 1))
            state = np.float32(state / 255.)
            history = np.stack([state] * self.seq_size, axis=1)

            while not done:
                step += 1
                self.t += 1
                if self.render:
                    env.render()
                action, policy = self.get_action(history)
                real_action = action
                next_observe, reward, done, info = env.step(real_action)
                if REWARD_CLIP == 'clip':
                    reward = np.clip(reward, -1.0, 1.0)
                next_state = preprocess(next_observe).reshape(
                    (1, RESIZE, RESIZE, 1))
                next_state = np.float32(next_state / 255.)
                next_history = np.append(history[0][1:], next_state, axis=0)
                next_history = np.float32([next_history])

                self.avg_p_max += np.amax(policy)
                self.reward_sum += reward
                self.append_sample(history, action, reward)

                history = next_history

                if self.t >= self.t_max or done:
                    self.t = 0
                    actor_loss, critic_loss = self.upload_sample(
                        next_history, done)
                    self.actor_loss += abs(actor_loss[0])
                    self.critic_loss += abs(critic_loss[0])

            episode += 1
            avg_p_max = self.avg_p_max / float(step)
            train_num = step // self.t_max + 1
            avg_actor_loss = self.actor_loss / train_num
            avg_critic_loss = self.critic_loss / train_num
            stats = [
                episode, step, self.reward_sum, env.game.score, avg_p_max,
                avg_actor_loss, avg_critic_loss, info
            ]
            self.stats.append(stats)
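append_sample and upload_sample are referenced above but not shown in the listing. A common implementation buffers (state, action, reward) tuples and, every t_max steps, converts the buffered rewards into discounted n-step returns bootstrapped from the critic's value of the last state. The sketch below shows only that return computation, with hypothetical names and an assumed gamma, not the project's actual code.

import numpy as np

def discounted_returns(rewards, done, bootstrap_value, gamma=0.99):
    # rewards:         rewards collected since the last update (oldest first)
    # done:            True if the episode terminated on the last step
    # bootstrap_value: critic estimate V(next_state); ignored when done
    running = 0.0 if done else bootstrap_value
    returns = np.zeros(len(rewards), dtype=np.float32)
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns

# usage: three buffered rewards, episode still running, critic predicts 0.5
print(discounted_returns([1.0, 0.0, 1.0], done=False, bootstrap_value=0.5))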
Example #4
    def run(self):
        global episode
        env = Env()
        while True:
            # print(self.tid, 'Still Training!')
            step = 0
            avg_p_max = 0
            reward_sum = 0
            actor_loss = 0
            critic_loss = 0
            done = False
            observe, _, _, _ = env.reset()

            state = preprocess(observe)
            history = np.stack([state] * self.seq_size, axis=1)

            while not done:
                step += 1
                self.t += 1
                if self.render:
                    env.render()
                action, policy = self.get_action(history)
                real_action = action
                next_observe, reward, done, info = env.step(real_action)
                if self.reward_clip:
                    reward = np.clip(reward, -1.0, 1.0)
                next_state = preprocess(next_observe).reshape([1] + self.state_size)
                next_history = np.append(history[:, 1:], next_state, axis=1)

                avg_p_max += np.amax(policy)
                reward_sum += reward
                self.append_sample(history, action, reward)

                history = next_history

                if self.t >= self.t_max or done:
                    self.t = 0
                    # accumulate losses so they can be averaged per episode
                    a_loss, c_loss = self.train_model(next_history, done)
                    actor_loss += a_loss
                    critic_loss += c_loss
                    self.update_local_model()

            episode += 1
            avg_p_max = avg_p_max / float(step)
            train_num = step // self.t_max + 1
            avg_actor_loss = actor_loss / train_num
            avg_critic_loss = critic_loss / train_num
            stats = [
                episode, step,
                reward_sum, env.game.score,
                avg_p_max, avg_actor_loss, avg_critic_loss,
                info
            ]
            with self.lock:
                self.stats.append(stats)
Example #5
    def run(self):
        global episode
        env = Env()
        while True:
            # print(self.tid, 'Still Training!')
            step = 0
            avg_p_max = 0
            reward_sum = 0
            actor_loss = 0
            critic_loss = 0
            done = False
            observe, _, _, _ = env.reset()

            state = preprocess(observe).reshape(self.state_shape)
            while not done:
                step += 1
                self.t += 1
                if self.render:
                    env.render()
                action, policy = self.get_action(state)
                real_action = action
                next_observe, reward, done, info = env.step(real_action)
                if self.reward_clip:
                    reward = np.clip(reward, -1.0, 1.0)
                next_state = preprocess(next_observe).reshape(self.state_shape)

                avg_p_max += np.amax(policy)
                reward_sum += reward
                self.append_sample(state, action, reward)

                state = next_state

                if self.t >= self.t_max or done:
                    self.t = 0
                    # if timeout, get returns with next pred value
                    mask = False if done and info == 'timeout' else done
                    # accumulate losses so they can be averaged per episode
                    a_loss, c_loss = self.train_model(next_state, mask)
                    actor_loss += a_loss
                    critic_loss += c_loss
                    self.update_local_model()

            episode += 1
            avg_p_max = avg_p_max / float(step)
            train_num = step // self.t_max + 1
            avg_actor_loss = actor_loss / train_num
            avg_critic_loss = critic_loss / train_num
            stats = [
                episode, step,
                reward_sum, env.game.score,
                avg_p_max, avg_actor_loss, avg_critic_loss,
                info
            ]
            with self.lock:
                self.stats.append(stats)
Example #6
        return action_index, policy


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--delay', type=float, default=0.)
    parser.add_argument('--episode', type=int, default=1)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--seqsize', type=int, default=2)
    args = parser.parse_args()
    keymap = {'w': 0, 'a': 1, 'd': 2}

    env = Env()
    agent = Agent(state_size=env.state_size,
                  action_size=env.action_size,
                  seq_size=args.seqsize,
                  load_model=args.load_model,
                  verbose=args.verbose,
                  render=args.render)
    np.set_printoptions(precision=4, suppress=True)

    for e in range(args.episode):
        step = 0
        reward_sum = 0
        done = False
        observe, _, _, _ = env.reset()

        state = observe.reshape(agent.state_shape) / 20.
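The snippet above is cut off before keymap is used. Based on the debug prompts in Examples #2 and #7, a plausible (purely hypothetical) continuation would read a key each step and override the agent's action when it matches the map:

        while not done:
            step += 1
            if args.render:
                env.render()
            action, policy = agent.get_action(state)
            if args.debug:
                key = input('Press y to keep the action, or w/a/d to override: ')
                if key in keymap:
                    action = keymap[key]  # manual override
            observe, reward, done, info = env.step(action)
            state = observe.reshape(agent.state_shape) / 20.
            reward_sum += reward
        print('Episode %d | Reward: %.2f | Step: %d' % (e, reward_sum, step))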
Example #7
    def train(self):
        global episode
        highscore = 0

        if os.path.exists('ppo_highscore.csv'):
            with open('ppo_highscore.csv', 'r') as f:
                read = csv.reader(f)
                highscore = float(next(reversed(list(read)))[1])
        print('Highscore: %.3f' % highscore)

        env = Env()

        self.states = np.zeros([0] + self.state_size)
        self.actions = np.zeros([0] + [self.action_size])
        self.rewards, self.old_pi, self.advantages, self.returns = [], [], [], []

        actor_loss = 0
        critic_loss = 0
        t = 0
        self.t = 0
        while True:
            done = False
            step = 0
            reward_sum = 0
            pmax = 0
            timeout = 0
            observe, _, _, _ = env.reset()
            state = observe.reshape(self.state_shape) / 20.
            while not done and timeout < self.timeout:
                if self.render:
                    env.render()
                action, policy = self.get_action(state)
                if self.debug:
                    print(state, policy)
                    while True:
                        a = input('Press 0 1 2: ')
                        if a in ['0', '1', '2']:
                            action = int(a)
                            break
                    
                next_observe, reward, done, info = env.step(action)
                next_state = next_observe.reshape(self.state_shape) / 20.
                
                self.append_sample(state, action, reward, policy[action])

                timeout = 0 if info == 'goal' else timeout + 1
                step += 1
                t += 1
                pmax += np.amax(policy)
                reward_sum += reward

                state = next_state

            if not done:
                info = 'timeout'
            self.get_gae(next_state, done)
            self.t = t
            episode += 1

            avg_pmax = pmax / float(step)
            stats = [
                episode, step,
                reward_sum, env.game.score,
                avg_pmax, actor_loss, critic_loss,
                info
            ]
            self.stats.append(stats)

            if t >= self.horizon:
                actor_loss, critic_loss = self.train_model()
                t = 0
                self.t = 0
                # actor_loss += a_loss
                # critic_loss += c_loss
                if len(self.stats) >= self.save_rate:
                    with open('ppo_output.csv', 'a', encoding='utf-8', newline='') as f:
                        wr = csv.writer(f)
                        for row in self.stats:
                            wr.writerow(row)
                    self.save_model('./save_model/ppo')
                    mean = np.mean(np.float32(np.split(self.stats, [-1], axis=1)[0]), axis=0)
                    if mean[3] > highscore:
                        highscore = mean[3]
                        with open('ppo_highscore.csv', 'a', encoding='utf-8', newline='') as f:
                            wr = csv.writer(f)
                            wr.writerow([episode, highscore, dt.now().strftime('%Y-%m-%d %H:%M:%S')])
                        self.save_model('./save_model/ppo_high')
                    print('%s: %s Episodes Trained! Reward:%.3f Score:%.3f Step:%.3f Pmax:%.3f' 
                            % (dt.now().strftime('%Y-%m-%d %H:%M:%S'), 
                            len(self.stats), mean[2], mean[3], mean[1], mean[4]))
                    self.stats.clear()
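get_gae is called in the PPO loop above but not shown. Generalized advantage estimation combines one-step TD residuals using the decay factor lambd seen in Example #8's argument parser. The sketch below follows the standard definition with a hypothetical helper, not the project's implementation; the values would come from the critic.

import numpy as np

def gae(rewards, values, next_value, done, gamma=0.99, lambd=0.95):
    # rewards:    r_0 ... r_{T-1} for one rollout
    # values:     critic values V(s_0) ... V(s_{T-1})
    # next_value: V(s_T), used to bootstrap when the rollout did not terminate
    values = np.append(values, 0.0 if done else next_value)
    advantages = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lambd * running
        advantages[t] = running
    returns = advantages + values[:-1]  # targets for the critic
    return advantages, returns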
Example #8
    parser.add_argument('--gamma',       type=float, default=0.99,  help='Discount factor')
    parser.add_argument('--lambd',       type=float, default=0.95,  help='TD(lambda): larger lambda puts more weight on future rewards')
    parser.add_argument('--batch_size',  type=int,   default=16,    help='Mini-batch size')
    parser.add_argument('--horizon',     type=int,   default=256,   help='Time horizon')
    parser.add_argument('--seqsize',     type=int,   default=1,     help='Length of sequence')
    parser.add_argument('--epoch',       type=int,   default=3,     help='Update epochs')
    parser.add_argument('--clip',        type=float, default=0.2,   help='Clip ratio')
    parser.add_argument('--timeout',     type=int,   default=400,   help='Episode times out after this many steps')
    parser.add_argument('--reward_clip', action='store_true',       help='Clip rewards to [-1, 1]')
    parser.add_argument('--render',      action='store_true',       help='Render the first agent')
    parser.add_argument('--load_model',  action='store_true',       help='Load model from ./save_model/')
    parser.add_argument('--verbose',     action='store_true',       help='Print a summary of the global network model')
    parser.add_argument('--debug',       action='store_true')
    args = parser.parse_args()

    env = Env()
    global_agent = PPOAgent(
        state_size=env.state_size,
        action_size=env.action_size,
        seq_size=args.seqsize,
        gamma=args.gamma,
        lambd=args.lambd,
        entropy=args.entropy,
        horizon=args.horizon,
        actor_lr=args.lr,
        critic_lr=args.lr,
        batch_size=args.batch_size,
        epoch=args.epoch,
        clip=args.clip,
        thread_num=args.threads,
        load_model=args.load_model,