Example #1
class MonsterKongEnv(object):

    def __init__(self):
        self.game = MonsterKong()
        self.p = PLE(self.game, fps=30, display_screen=True)

        # self.actions = self.p.getActionSet()
        # self._action_space = list(range(self.actions[0]))
        # self._action_space.append(self.actions[-1])
        self.action_space = self.p.getActionSet()

    def reset(self):
        self.p.init()
        self.p.act(None)
        # return self.p.getScreenRGB()
        return self.p.getScreenGrayscale()

    def step(self, action):
        reward = self.p.act(self.action_space[action])
        # reward = self.p.act(119)
        # print(self.action_space[action], reward)
        # return self.p.getScreenRGB(), reward, self.p.game_over()
        return self.p.getScreenGrayscale(), reward, self.p.game_over()

    @property
    def action_space(self):
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space
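A minimal random-rollout driver for the wrapper above (not part of the original example); it assumes the example's own module-level imports are in place, e.g. from ple import PLE and from ple.games.monsterkong import MonsterKong.

import numpy as np

env = MonsterKongEnv()
obs = env.reset()                                       # grayscale screen array
for _ in range(200):
    action = np.random.randint(len(env.action_space))   # random action index
    obs, reward, done = env.step(action)
    if done:
        obs = env.reset()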
Example #2
class PLEEnv(gym.Env):
    def __init__(self, env_config):
        game = Catcher(width=screen_wh, height=screen_wh)

        fps = 30  # fps we want to run at
        frame_skip = 2
        num_steps = 2
        force_fps = False  # False for slower speed
        display_screen = True
        # make a PLE instance.
        self.env = PLE(game,
                       fps=fps,
                       frame_skip=frame_skip,
                       num_steps=num_steps,
                       force_fps=force_fps,
                       display_screen=display_screen)
        self.env.init()
        self.action_dict = {0: None, 1: 97, 2: 100}
        #PLE env starts with black screen
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=(screen_wh, screen_wh,
                                                   1 * self.k))
        self.frames = deque([], maxlen=self.k)

    def reset(self):
        self.env.reset_game()
        # PLE env starts with black screen, NOOP step to get initial screen
        self.env.act(self.env.NOOP)
        ob = np.reshape(self.env.getScreenGrayscale(),
                        (screen_wh, screen_wh, 1))
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        #traditional gym env step
        #_obs, _rew, done, _info = env.step(env.action_space.sample())
        action_value = self.action_dict[action]
        _rew = self.env.act(action_value)
        #_obs = self.env.getScreenGrayscale()
        _obs = np.reshape(self.env.getScreenGrayscale(),
                          (screen_wh, screen_wh, 1))
        self.frames.append(_obs)
        _done = self.env.game_over()
        _info = {}

        return self._get_ob(), _rew, _done, _info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)
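A short usage sketch for the wrapper above; screen_wh is assumed to be a module-level constant (for example 64), and the class's own imports (gym, PLE, Catcher, spaces, Discrete, deque, numpy) are assumed to be present.

env = PLEEnv(env_config={})
obs = env.reset()                        # (screen_wh, screen_wh, 4): four stacked grayscale frames
done = False
while not done:
    action = env.action_space.sample()   # 0 = no-op, 1 = 'a' (97), 2 = 'd' (100)
    obs, reward, done, info = env.step(action)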
Example #3
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())

    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6  #1e-6
    )  # probability of exploring is decreasing during training

    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("模型加载成功")
    eval_reward = evaluate(agent, penv)
Example #4
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    #penv.init()
    np.random.seed(0)

    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())


    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore  0.1
        e_greed_decrement=1e-6   #1e-6
    )  # probability of exploring is decreasing during training




    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("模型加载成功")

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, penv, rpm)

    max_episode = 1000

    # start train
    episode = 0
    while episode < max_episode:

        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1

        eval_reward = evaluate(agent, penv)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))
        # save a checkpoint after this evaluation round
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)

    # training finished, save the final model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
Example #5
def evaluate_step(agent, seed, sess):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False, rng=np.random.RandomState(seed))
    env.reset_game()
    env.act(0) # dummy input
    # grayscale input screen for this episode  
    input_screens = [agent.preprocess(env.getScreenGrayscale())]
    t = 0
    while not env.game_over():
        # feed four previous screen, select an action
        action = agent.select_action(input_screens, sess)
        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])  # reward = +1 when pass a pipe, -5 when die       
        # observe the result
        screen_plum = env.getScreenGrayscale()  # get next screen
        # append grayscale screen for this episode
        input_screens.append(agent.preprocess(screen_plum))
        t += 1
        if t >= 1000:  # cap the episode length so the loop cannot run forever
            break
    return t
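One way to use evaluate_step is to average the survival time over a handful of seeds; agent and sess are assumed to come from the surrounding training code, so this is only a sketch.

frames = [evaluate_step(agent, seed, sess) for seed in range(5)]
print("mean frames survived: {:.1f}".format(sum(frames) / len(frames)))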
Example #6
class FlappyBirdEnv(gym.Env):
    def __init__(self):
        self.resize_factor = 0.125
        self.width = 288
        self.height = 512
        self.ple = PLE(game=FlappyBird(), fps=30, frame_skip=8)
        self.action_set = self.ple.getActionSet()
        self.action_space = spaces.Discrete(len(self.action_set))
        self.observation_space = spaces.Box(
            low=0.0,
            high=255.0,
            shape=(
                int(self.width * self.resize_factor),
                int(self.height * self.resize_factor),
                1,
            ),
            dtype=np.uint8,  # grayscale pixels are 8-bit
        )
        self._steps = 0

    def reset(self):
        self._steps = 0
        self.ple.display_screen = False
        self.ple.reset_game()
        return self._get_state()

    def step(self, action):
        self._steps += 1
        reward = self.ple.act(self.action_set[action])
        next_state = self._get_state()
        terminal = self.ple.game_over()
        return next_state, reward, terminal, {}

    def render(self, mode="human"):
        self.ple.display_screen = True

    def _get_state(self):
        return np.expand_dims(imresize(self.ple.getScreenGrayscale(),
                                       self.resize_factor),
                              axis=-1)
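A minimal random-rollout sketch for the gym wrapper above, assuming the example's imports (gym, spaces, numpy, PLE, FlappyBird, and imresize from the old scipy.misc) are available.

env = FlappyBirdEnv()
state = env.reset()                      # shape (36, 64, 1) with resize_factor = 0.125
total_reward, done = 0.0, False
while not done:
    action = env.action_space.sample()   # index into getActionSet(), typically [119, None]
    state, reward, done, _ = env.step(action)
    total_reward += reward
print("episode reward:", total_reward)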
Example #7
    def train(self):
        """Train."""
        logs_path = self.args.logs_path
        video_path = self.args.video_path
        restore = self.args.restore
        train = self.args.train

        # Initial PLE environment
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"

        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }

        # Create FlappyBird game env
        env = PLE(FlappyBird(),
                  display_screen=False,
                  reward_values=reward_values)

        # Gets the actions FlappyBird supports
        action_set = env.getActionSet()

        replay_buffer = ReplayBuffer(self.hparams.replay_buffer_size)
        agent = Agent(action_set, self.hparams)

        # restore model
        if restore:
            agent.restore(restore)

        reward_logs = []
        loss_logs = []

        for episode in range(1, self.hparams.total_episode + 1):
            # reset env
            env.reset_game()
            env.act(0)
            obs = convert(env.getScreenGrayscale())
            state = np.stack([[obs for _ in range(4)]], axis=0)
            t_alive = 0
            total_reward = 0

            if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                agent.stop_epsilon()
                frames = [env.getScreenRGB()]

            while not env.game_over():
                action = agent.take_action(state)
                reward = env.act(action_set[action])

                if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                    frames.append(env.getScreenRGB())
                obs = convert(env.getScreenGrayscale())
                obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])

                state_new = np.append(state[:, 1:, ...], obs, axis=1)
                action_onehot = np.zeros(len(action_set))
                action_onehot[action] = 1

                t_alive += 1
                total_reward += reward
                replay_buffer.append(
                    (state, action_onehot, reward, state_new, env.game_over()))
                state = state_new

            # save video
            if episode % self.hparams.save_video_frequency == 0 and episode > self.hparams.initial_observe_episode:
                os.makedirs(video_path, exist_ok=True)
                clip = make_video(frames, fps=60).rotate(-90)
                clip.write_videofile(os.path.join(
                    video_path, 'env_{}.mp4'.format(episode)),
                                     fps=60)
                agent.restore_epsilon()
                print('Episode: {} t: {} Reward: {:.3f}'.format(
                    episode, t_alive, total_reward))
                # notebook-only inline playback of the latest saved video
                mp4list = glob.glob('./video_XXX/*.mp4')
                if len(mp4list) > 0:
                    latest = mp4list[0]
                    latest_timestamp = os.path.getmtime(mp4list[0])
                    for mp4 in mp4list:
                        ts = os.path.getmtime(mp4)
                        if (ts > latest_timestamp):
                            latest_timestamp = ts
                            latest = mp4
                    video = io.open(latest, 'r+b').read()
                    encoded = base64.b64encode(video)
                    ipythondisplay.display(
                        HTML(data='''<video alt="test" autoplay 
                                    loop controls style="height: 400px;">
                                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                                 </video>'''.format(encoded.decode('ascii'))))
                # end of notebook-only playback
                else:
                    print("Could not find video")

            if episode > self.hparams.initial_observe_episode and train:
                # save model
                if episode % self.hparams.save_logs_frequency == 0:
                    agent.save(episode, logs_path)
                    np.save(os.path.join(logs_path, 'loss.npy'),
                            np.array(loss_logs))
                    np.save(os.path.join(logs_path, 'reward.npy'),
                            np.array(reward_logs))

                # update target network
                if episode % self.hparams.update_target_frequency == 0:
                    agent.update_target_network()

                # sample batch from replay buffer
                batch_state, batch_action, batch_reward, batch_state_new, batch_over = replay_buffer.sample(
                    self.hparams.batch_size)

                # update policy network
                loss = agent.update_Q_network(batch_state, batch_action,
                                              batch_reward, batch_state_new,
                                              batch_over)

                loss_logs.extend([[episode, loss]])
                reward_logs.extend([[episode, total_reward]])

                # print reward and loss
                if episode % self.hparams.show_loss_frequency == 0:
                    print(
                        'Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'.format(
                            episode, t_alive, total_reward, loss))

                agent.update_epsilon()
Example #8
class DdqnBirdSyr():

    def __init__(self, playback_mode, mod=None):
        self._playback_mode = playback_mode

        env = FlappyBird(pipe_gap=200)
        self._ple = PLE(env, fps=30, display_screen=DISPLAY)
        self._ple.init()

        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()

        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)

        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)

        self._steps = 0

        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def start(self):
        for ep in range(MAX_EP):
            sum_reward = 0
            last_state = []
            for _ in range(STATE_FRAMES):
                last_state.append(self._ple.getScreenGrayscale())
            last_state = np.dstack(last_state)

            last_max_qvalue = 0

            for step in range(EP_STEPS):
                time.sleep(0.01)
                if not step % STATE_FRAMES:
                    q_value = self._agent.predict([last_state])[0]
                    last_max_qvalue = np.max(q_value)

                    act_1_hot = self._explorer.get_action(q_value)
                    act_index = np.argmax(act_1_hot)
                else:
                    # do nothing
                    act_index = 1
                    act_1_hot = np.zeros(DIM_ACTION)
                    act_1_hot[act_index] = 1

                reward = self._ple.act(self._ple.getActionSet()[act_index])
                if reward == 0:
                    reward = 0.1
                elif reward == -5:
                    reward = -1

                state = np.reshape(self._ple.getScreenGrayscale(), (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
                state = np.append(state, last_state[:, :, :3], axis=2)

                done = False
                if self._ple.game_over():
                    done = True

                self._replay_buffer.add(last_state, act_1_hot, reward, state, done)

                loss = None
                if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS:
                    loss = self._train()

                last_state = state
                sum_reward += reward
                self._steps += 1

                if done or step == EP_STEPS - 1:
                    print('| Step: %i' % self._steps,
                          '| Episode: %i' % ep,
                          '| Epoch: %i' % step,
                          '| qvalue: %.5f' % last_max_qvalue,
                          '| Sum_Reward: %i' % sum_reward)
                    if loss is not None:
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum_reward,
                            'maxq': last_max_qvalue})
                    self._ple.reset_game()
                    break


    def _train(self):
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self._replay_buffer.sample_batch(MINI_BATCH)

        q_value = self._agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self._agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]

        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)

        opt, loss = self._agent.train(batch_state, batch_action, batch_y)
        self._agent.update_target_paras()

        if not self._steps % CKP_STEP:
            self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')

        return loss
Example #9
def main(args):
    logs_path = args.logs_path
    video_path = args.video_path
    restore = args.restore
    train = args.train

    # Initial PLE environment
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"
    # Design reward
    reward_values = {
        "positive": 1, 
        "tick": 0.1, 
        "loss": -1,  
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False, reward_values=reward_values)
    action_set = env.getActionSet()
    
    reply_buffer = Reply_Buffer(Config.reply_buffer_size)
    agent = Agent(action_set)

    reward_logs = []
    loss_logs = []

    # restore model
    if restore:
        agent.restore(restore)

    for episode in range(1, Config.total_episode+1):
        # reset env
        env.reset_game()
        env.act(0)
        obs = convert(env.getScreenGrayscale())
        state = np.stack([[obs for _ in range(4)]], axis=0)
        t_alive = 0
        total_reward = 0
        
        if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: 
            agent.stop_epsilon()
            frames = [env.getScreenRGB()] 
    
        while not env.game_over():
            action = agent.take_action(state)
            reward = env.act(action_set[action])
            if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode: 
                frames.append(env.getScreenRGB()) 
            obs = convert(env.getScreenGrayscale())
            obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
            state_new = np.append(state[:, 1:,...], obs, axis=1)
            action_onehot = np.zeros(len(action_set))
            action_onehot[action] = 1
            t_alive += 1
            total_reward += reward
            reply_buffer.append((state, action_onehot, reward, state_new, env.game_over()))
            state = state_new
        
        # save video
        # if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode:
        #     os.makedirs(video_path, exist_ok=True)
        #     clip = make_video(frames, fps=60).rotate(-90)
        #     clip.write_videofile(os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60)
        #     agent.restore_epsilon()
        #     print('Episode: {} t: {} Reward: {:.3f}' .format(episode, t_alive, total_reward))
  
        if episode > Config.initial_observe_episode and train:
            # save model
            if episode % Config.save_logs_frequency == 0:
                agent.save(episode, logs_path)
                np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs))
                np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs))
        
            # update target network
            if episode % Config.update_target_frequency == 0:
                agent.update_target_network()
            
            # sample batch from reply buffer 
            batch_state, batch_action, batch_reward, batch_state_new, batch_over = reply_buffer.sample(Config.batch_size)
            
            # update policy network
            loss = agent.update_Q_network(batch_state, batch_action, batch_reward, batch_state_new, batch_over)
        
            loss_logs.extend([[episode, loss]]) 
            reward_logs.extend([[episode, total_reward]]) 
        
            # print reward and loss
            if episode % Config.show_loss_frequency == 0: 
                print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}' .format(episode, t_alive, total_reward, loss))
        
            agent.update_epsilon()
Example #10
    env.init()

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = True

        # training loop
        while num_episodes < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while not env.game_over():  # and steps < num_steps_train:
                state = env.getGameState()
                screen = env.getScreenGrayscale()
                screen = preprocess(state, screen)
                # screen = np.reshape(screen,(200,200,1))
                # print((screen[0]))
                #screen = screen[:,np.newaxis]
                reward, action = agent.act(screen, epsilon=epsilon)
                memory.add([screen, action, reward, env.game_over()])
                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = np.max([epsilon_min, epsilon - epsilon_rate])

                episode_reward += reward
                steps += 1
Example #11
class Environment():
    def __init__(self, device, display=True):
        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        self.env = PLE(FlappyBird(),
                       display_screen=display,
                       reward_values=reward_values)
        self.device = device
        self.action_set = self.env.getActionSet()

        self.frames = []

    def reset(self):
        self.env.reset_game()

    def start(self):
        self.env.act(0)
        obs = convert(self.env.getScreenGrayscale())
        self.state = np.stack([[obs for _ in range(4)]], axis=0)
        self.t_alive = 0
        self.total_reward = 0

        return self.state

    def game_over(self):
        return self.env.game_over()

    def getScore(self):
        return self.env.score()

    def step(self, action):

        reward = self.env.act(self.action_set[action])

        # make next state
        obs = convert(self.env.getScreenGrayscale())
        obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
        next_state = np.append(self.state[:, 1:, ...], obs, axis=1)

        self.t_alive += 1
        self.total_reward += reward
        self.state = next_state

        return self.state, reward, self.env.game_over()

    def get_screen(self):
        return self.env.getScreenRGB()

    def record(self):
        self.frames.append(self.env.getScreenRGB())

    def saveVideo(self, episode, video_path):
        os.makedirs(video_path, exist_ok=True)
        clip = make_video(self.frames, fps=60).rotate(-90)
        clip.write_videofile(os.path.join(video_path,
                                          'env_{}.mp4'.format(episode)),
                             fps=60)
        print('Episode: {} t: {} Reward: {:.3f}'.format(
            episode, self.t_alive, self.total_reward))
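A sketch of how the Environment class above might be driven with a random policy; convert and make_video come from the original project and are assumed to be importable, so only the environment calls are exercised here.

import random

env = Environment(device="cpu", display=False)
env.reset()
state = env.start()                      # stack of four converted grayscale frames
while not env.game_over():
    action = random.randrange(len(env.action_set))
    state, reward, done = env.step(action)
    env.record()                         # keep RGB frames for saveVideo()
print("score:", env.getScore())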
Example #12
def DeepQLearning(mode, fname = '', epsilon = 1, discount = 0.99):
	game = FlappyBird()

	rewards = {
		'positive' : 10,
		'tick' : 0,
		'loss' : -10	
	}

	env = PLE(game, fps = 30, display_screen = False, reward_values = rewards)
	env.init()

	if mode == 'new':
		model = build_model(env)
	elif mode == 'retrain':
		model = load_model(fname)

	#parameters
	actions = env.getActionSet()
	print(actions)
	nA = len(env.getActionSet())

	final_epsilon = 0.1
	epsilon_decay = nth_root(NUMBER_EPISODES, final_epsilon/epsilon)

	print("=========== Start Training ===========\n")

	avg_score = []
	score = 0
	for i in range(1, NUMBER_EPISODES):
		epsilon = epsilon*epsilon_decay
		action_reward = []
		if (i % 10000 == 0):
			avg = mean(avg_score)
			model.save("/content/drive/My Drive/"+'episode_{}_AvgScore_{}.h5'.format(i, avg))
			avg_score.clear()
			print("\nEpisode_{}_AvgScore_{}.hdf5 Saved !".format(i, avg))

		for t in itertools.count():
			#approx next action
			state = img_as_float(resize(env.getScreenGrayscale(), (80,80)))
			state = state.reshape((80, 80, 1))
			action_index = epsilon_greedy_policy(model, state, nA, epsilon)
			action = actions[action_index]
			reward = env.act(action)
			next_state = img_as_float(resize(env.getScreenGrayscale(), (80,80)))
			next_state = next_state.reshape((80, 80, 1))
			score += reward
			done = env.game_over()

			#action_reward.append((action, reward))

			if len(MEMORY_BUFFER) == MEMORY_BUFFER_SIZE:
				MEMORY_BUFFER.pop(0)
			MEMORY_BUFFER.append((state, action_index, reward, next_state, done))

			experience_replay(env, model, discount)

			if env.game_over():
				break
		
		env.reset_game()
		avg_score.append(score)
		
		#print(action_reward)
		
		print("\nEpisode {}/{} ---- Score : {}".format(i,NUMBER_EPISODES, score))
		score = 0

	with open("MEMORY_BUFFER.txt", "wb") as fp:
		pickle.dump("/content/drive/My Drive/"+MEMORY_BUFFER, fp)

	return model
Example #13
game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False)

agent = Agent(allowed_actions=p.getActionSet(), channels=1, learning_rate=0.0085)
try:
    agent.model.load_state_dict(load('memento_movement.pt'))
except (FileNotFoundError, EOFError):
    print("Error loading the saved model state")


p.init()

nb_frames = 10000000
rewards = []
episode = []
old_observation = preprocessing(p.getScreenGrayscale())
movement_captioning = 0
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    preprocessed_observation = preprocessing(p.getScreenGrayscale())
    # Forward
    action, log_action = agent.pickAction(preprocessed_observation-old_observation)
    if movement_captioning < 2:
        old_observation = preprocessed_observation
        movement_captioning += 1
    elif movement_captioning < 15:
        movement_captioning += 1
    else:
        movement_captioning = 0
    reward_action = p.act(action)
Example #14
def DeepQLearning(epsilon=1, discount=0.99):
    game = FlappyBird()

    rewards = {'positive': 1, 'tick': 0, 'loss': 0}

    env = PLE(game, fps=30, display_screen=True, reward_values=rewards)
    env.init()

    model_selection = build_model(env)
    model_evaluation = build_model(env)

    MEMORY_BUFFERS = [MEMORY_BUFFER_EVALUATION, MEMORY_BUFFER_SELECTION]
    models = [model_evaluation, model_selection]

    #parameters
    actions = env.getActionSet()
    print(actions)
    nA = len(env.getActionSet())
    final_epsilon = 0.001
    epsilon_decay = nth_root(NUMBER_EPISODES, final_epsilon / epsilon)

    print("=========== Start Training ===========\n")

    for i in range(1, NUMBER_EPISODES):
        epsilon = epsilon * epsilon_decay
        score = 0
        avg_score = []
        #cdaction_reward = []
        if (i % 1000 == 0):
            avg = np.mean(np.asarray(avg_score))
            model.save_weights('episode_{}_AvgScore_{}.hdf5'.format(i, avg))
            avg_score.clear()
            print("\nEpisode_{}_AvgScore_{}.hdf5 Saved !".format(i, avg))

        for t in itertools.count():
            #appro next action
            state = img_as_float(resize(env.getScreenGrayscale(), (64, 64)))
            state = state.reshape((64, 64, 1))
            action_index = epsilon_greedy_policy(model, state, nA, epsilon)
            action = actions[action_index]
            reward = env.act(action)
            next_state = img_as_float(
                resize(env.getScreenGrayscale(), (64, 64)))
            next_state = next_state.reshape((64, 64, 1))
            score += reward
            done = env.game_over()

            avg_score.append(score)

            #action_reward.append((action, reward))
            if not env.game_over():
                reward += discount * np.max(approximation(model, next_state))

            if len(MEMORY_BUFFER) == MEMORY_BUFFER_SIZE:
                MEMORY_BUFFER.pop(0)

            model_choice = np.random.choice(np.array([0, 1]))
            MEMORY_BUFFERS[model_choice].append(
                (state, action_index, reward, next_state, done))

            if env.game_over():
                break
        env.reset_game()
        experience_replay(env, models[model_choice], discount)
        #print(action_reward)
        if i % 100 == 0:
            print("\nEpisode {}/{} ---- Score : {}".format(
                i, NUMBER_EPISODES, score))

    return model
Example #15
                        env.act(119)
                        new_action = 119
                    else:
                        env.act(None)
                        new_action = None
        else:
            action = model.predict([prev_obs.reshape(80, 80, 1)])[0]
            new_action = np.argmax(action)
            if new_action == 0:
                new_action = 119
                env.act(119)
            else:
                new_action = None
                env.act(None)

        choices.append(new_action)
        new_observation = cv2.resize(env.getScreenGrayscale(), (80, 80))
        prev_obs = new_observation
        game_memory.append([new_observation, new_action])
        score = env.score()
        if env.game_over():
            break
    env.reset_game()
    scores.append(score)
    if score >= score_requirement:
        accepted_scores.append(score)

print('Average Score:', sum(scores) / len(scores))
print('Success rate:', len(accepted_scores) / len(scores))
print(score_requirement)
Example #16
from ple import PLE
from ple.games.pong import Pong
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    """ 预处理 210x160x3 uint8 frame into 6400 (80x80) 1维 float vector """
    # image = env.getScreenRGB()
    # image = image[35:195]  # 裁剪
    # image = image[::2, ::2, 0]  # 下采样,缩放2倍
    # image[image == 144] = 0  # 擦除背景 (background type 1)
    # image[image == 109] = 0  # 擦除背景 (background type 2)
    # image[image != 0] = 1  # 转为灰度图,除了黑色外其他都是白色
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=128, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    # build the agent with the PARL framework
    print(p.getActionSet())
    #obs = p.getScreenRGB()
    obs = p.getScreenGrayscale()
    print(obs)
    print(obs.shape)

    act_dim = len(p.getActionSet())
    game_state = p.getGameState()
    print(game_state)
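As a small extension of the script above, get_obs can be called after every action to produce the flattened float observation; the loop below is only a sketch and reuses the p instance created in __main__.

p.init()
obs = get_obs(p)                      # 1-D float vector of length 128 * 128
print(obs.shape)                      # (16384,)
for _ in range(100):
    action = np.random.choice(p.getActionSet())
    reward = p.act(action)
    obs = get_obs(p)
    if p.game_over():
        p.reset_game()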
Example #17
class MyEnv(Environment):
    VALIDATION_MODE = 0
    memSize = 4
    # original size is 288x512 so dividing
    dividing_factor = 8
    width = 288 // dividing_factor
    height = 512 // dividing_factor

    def __init__(self,
                 rng,
                 game=None,
                 frame_skip=4,
                 ple_options={
                     "display_screen": True,
                     "force_fps": True,
                     "fps": 30
                 }):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frame_skip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((w, h), dtype=np.uint8)
        self._reduced_screen = np.empty((self.width, self.height),
                                        dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        print("Dead at score {}".format(self._ple.game.getScore()))
        self._ple.reset_game()
        # for _ in range(self._random_state.randint(15)):
        # self._ple.act(self._ple.NOOP)
        # self._screen = self._ple.getScreenGrayscale()
        # cv2.resize(self._screen, (48, 48),
        # self._reduced_screen,
        # interpolation=cv2.INTER_NEAREST)

        return [self.memSize * [self.width * [self.height * [0]]]]

    def act(self, action):
        action = self._actions[action]

        reward = 0
        for _ in range(self._frame_skip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        self._reduced_screen = cv2.resize(self._screen,
                                          (self.height, self.width),
                                          interpolation=cv2.INTER_NEAREST)
        cv2.imshow("debug", self._reduced_screen.T)
        cv2.waitKey(1)
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        mean = (self._mode_score / self._mode_episode_count
                if self._mode_episode_count else "N/A")
        print("== Mean score per episode is {} over {} episodes ==".format(
            mean, self._mode_episode_count))

    def inputDimensions(self):
        return [(self.memSize, self.width, self.height)]

    def observationType(self, subject):
        return np.float32

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reduced_screen) / 256.]

    def inTerminalState(self):
        return self._ple.game_over()
Example #18

podium = 0
podium_index = 0
background = 0
bird_pixels = None
ok = False
city_pixels = None
city_index = None
ok2 = False
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
        ok = True

    img = p.getScreenGrayscale().transpose()

    if ok:
        ok2 = True
        ok = False

    if i == 1 or ok2:
        ok2 = False
        bincount_vector = np.bincount(img[0])
        background = np.argmax(bincount_vector)
        podium, podium_index = compute_podium(img)
        #bird_pixels = get_bird_pixels(img, background)
        city_pixels, city_index = get_city_pixels(img, background,
                                                  podium_index)
        transform_image(img, background, podium, podium_index + 1, city_index,
                        city_pixels)
Example #19
class Bot():
    """
            This is our Test agent. It's gonna pick some actions after training!
    """
    def __init__(self, lr):

        self.lr = lr
        self.game = Pixelcopter(width=480, height=480)
        self.p = PLE(self.game, fps=60, display_screen=True)
        self.actions = self.p.getActionSet()

    #def pickAction(self, reward, obs):
    #   return random.choice(self.actions)

    def frame_step(self, act_inp):
        terminal = False
        reward = self.p.act(act_inp)
        if self.p.game_over():
            self.p.reset_game()
            terminal = True
            reward = -1
        else:
            reward = 1

        self.score = self.p.score()
        img = self.p.getScreenGrayscale()
        img = transform.resize(img, (80, 80))
        img = exposure.rescale_intensity(img, out_range=(0, 255))
        img = img / 255.0

        return img, reward, terminal

    def build_model(self):
        print("Building the model..")
        model = Sequential()
        model.add(
            Convolution2D(32,
                          8,
                          8,
                          subsample=(4, 4),
                          border_mode='same',
                          input_shape=(img_rows, img_cols,
                                       img_channels)))  #80*80*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(2))

        adam = Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=adam)
        self.model = model
        print("Finished building the model..")

    def trainNetwork(self, mode):
        D = deque()

        x_t, r_0, terminal = self.frame_step(self.actions[1])

        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        #print (s_t.shape)

        #need to reshape for keras
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1],
                          s_t.shape[2])  #1*80*80*4

        if mode == 'Run':
            OBSERVE = 999999999  #We keep observe, never train
            epsilon = FINAL_EPSILON
            print("Now we load weight")
            self.model.load_weights("model.h5")
            adam = Adam(lr=self.lr)
            self.model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")
        else:  #We go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON

        t = 0
        while (True):
            loss = 0
            Q_sa = 0
            action_index = 0
            r_t = 0
            #choose an action epsilon greedy
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(num_actions)
                    chosen_act = self.actions[action_index]
                else:
                    q = self.model.predict(
                        s_t)  #input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index = max_Q
                    chosen_act = self.actions[action_index]

            #We reduced the epsilon gradually
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            #run the selected action and observed next state and reward
            x_t1, r_t, terminal = self.frame_step(chosen_act)

            x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  #1x80x80x1
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            # store the transition in D
            D.append((s_t, action_index, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

            #only train if done observing
            if t > OBSERVE:
                #sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                #Now we do the experience replay
                state_t, action_t, reward_t, state_t1, terminal = zip(
                    *minibatch)
                state_t = np.concatenate(state_t)
                state_t1 = np.concatenate(state_t1)
                targets = self.model.predict(state_t)
                Q_sa = self.model.predict(state_t1)
                targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(
                    Q_sa, axis=1) * np.invert(terminal)

                loss += self.model.train_on_batch(state_t, targets)

            s_t = s_t1
            t = t + 1

            # save progress every 1000 iterations
            if t % 1000 == 0:
                print("Now we save model")
                self.model.save_weights("model.h5", overwrite=True)
                with open("model.json", "w") as outfile:
                    json.dump(self.model.to_json(), outfile)

            # print info
            state = ""
            if t <= OBSERVE:
                state = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"

            print("TIMESTEP", t, "/ STATE", state, \
                "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
                "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

        print("Episode finished!")
        print("************************")

    def playGame(self, mode):
        self.build_model()
        self.trainNetwork(mode)

    def main(self):
        modes = ["Train", "Run"]
        mode = modes[int(input("Do you want to Train (0) or Run (1): "))]
        self.playGame(mode)
Example #20
class Catcher3:
    """
    Environment Specifications:
    Short summary: Player controls paddle and gains points for catching apples that fall from the sky; loses points and
                   lives otherwise.
    Number of Actions = 3 (move left, do nothing, move right)
    Observation Dimension = 4 (paddle x-position, paddle velocity, apple x-position, apple y-position)
    Observation Dtype = np.float64
    Reward =    1, if paddle touches apple
               -1, if apple touches floor (and -1 life)
               -5, if out of lives
                0, on any other transition
    Summary Name: steps_per_episode, reward_per_step
    """
    def __init__(self, config, summary=None):
        assert isinstance(config, Config)
        """ Parameters:
        Name:                       Type            Default:        Description(omitted when self-explanatory):
        max_episode_length          int             500000          The max number of steps executed in an episode
                                                                    before forcing a time out
        norm_state                  bool            True            Normalize the state to [-1,1]
        display                     bool            False           Whether to display the screen of the game
        init_lives                  int             3               Number of lives at the start of the game
        store_summary               bool            False           Whether to store the summary of the environment
        number_of_steps             int             500000          Total number of environment steps
        """
        check_attribute(config, 'current_step', 0)
        self.config = config

        # environment parameters
        self.max_episode_length = check_attribute(config,
                                                  'max_episode_length',
                                                  default_value=500000)
        self.norm_state = check_attribute(config,
                                          'norm_state',
                                          default_value=True)
        self.display = False
        self.init_lives = 3
        # self.display = check_attribute(config, 'display', default_value=False)
        # self.init_lives = check_attribute(config, 'init_lives', default_value=3)

        # summary parameters
        self.store_summary = check_attribute(config,
                                             'store_summary',
                                             default_value=False)
        self.summary = summary
        self.number_of_steps = check_attribute(config, 'number_of_steps',
                                               500000)

        if self.store_summary:
            assert isinstance(self.summary, dict)
            self.reward_per_step = np.zeros(self.number_of_steps,
                                            dtype=np.float64)
            check_dict_else_default(self.summary, "steps_per_episode", [])
            check_dict_else_default(self.summary, "reward_per_step",
                                    self.reward_per_step)

        # setting up original catcher environment with the specified parameters
        self.catcherOb = Catcher(init_lives=self.init_lives)
        if not self.display:
            # do not open a pygame window
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"
        if self.norm_state:
            self.pOb = PLE(self.catcherOb,
                           fps=30,
                           state_preprocessor=get_ob_normalize,
                           display_screen=self.display)
        else:
            self.pOb = PLE(self.catcherOb,
                           fps=30,
                           state_preprocessor=get_ob,
                           display_screen=self.display)
        self.pOb.init()

        # environment internal state
        self.actions = [
            97, None, 100
        ]  # self.pOb.getActionSet() (left = 97, do nothing = None, right = 100)
        self.num_action = 3
        self.num_state = 4
        self.episode_step_count = 0
        self.pOb.reset_game()
        self.current_state = self.pOb.getGameState()

    def _get_image(self):
        """return a np array with shape = [64, 64, 3]"""
        return self.pOb.getScreenGrayscale()

    def setseed(self, value):
        self.pOb.rng.seed(value)
        return 0

    def reset(self):
        if self.store_summary:
            self.summary["steps_per_episode"].append(self.episode_step_count)
        self.pOb.reset_game()
        self.episode_step_count = 0
        self.current_state = self.pOb.getGameState()
        return self.current_state

    def step(self, a):
        self.config.current_step += 1
        self.episode_step_count += 1
        reward = self.pOb.act(self.actions[a])
        if self.store_summary:
            self.reward_per_step[self.config.current_step - 1] += reward

        terminate = self.pOb.game_over()
        self.current_state = self.pOb.getGameState()
        timeout = bool(self.episode_step_count >= self.max_episode_length
                       or self.config.current_step >= self.number_of_steps)
        return self.current_state, reward, terminate, timeout

    def get_current_state(self):
        return self.current_state

    def close(self):
        return
Example #21
OBSERVATIONS = 300
#reward_discount = 0.99
time_per_episode = 1000
game = Pixelcopter(img_size,img_size)
env = PLE(game)
action_size = 2
score_mean = np.zeros(EPISODES//10)
score_std = np.zeros(EPISODES//10)
score_last10 = []
training_count = 0
plt.figure()
max_score = 0

for e in range(EPISODES):
    env.init()
    state = process(env.getScreenGrayscale())
    for time in range(time_per_episode):
        # Set actions
        if time < 3:
            action = act_dict_decode[0]
        else:
            action_input = np.concatenate((state,
                                           memory[-1][0],
                                           memory[-2][0],
                                           memory[-3][0]), axis=3)
            action = act(action_input)

        reward = env.act(action) # get reward from action
        next_state = process(env.getScreenGrayscale()) #@ next state
        done = env.game_over() # check game over and reassign reward
        if reward >= 0: reward = 1
Example #22
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, 
            ple_options={"display_screen": True, "force_fps":True, "fps":30}):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
       
        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

                
    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1: # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
        
        return [4 * [48 * [48 * [0]]]]
        
        
    def act(self, action):
        action = self._actions[action]
        
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
            
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen, interpolation=cv2.INTER_NEAREST)
  
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count))


    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
Example #23
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    #setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
            force_fps=force_fps, display_screen=display_screen)

    game.init()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print "NOOOO"
        game.reset_game()
    
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"
    '''
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[random.randrange(ACTIONS)] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            r_t = game.act(np.argmax(a_t))
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print "NOOO2"
                game.reset_game()

            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''
Example #24
class SnakeQNetwork:
    def __init__(self,
                 food_reward=10,
                 dead_reward=-10,
                 alive_reward=2,
                 discount_factor=0.95,
                 batch_size=10,
                 train_epochs=100,
                 history_size=1000,
                 history_sample_size=50):
        self.food_reward = food_reward
        self.dead_reward = dead_reward
        self.alive_reward = alive_reward
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        self.train_epochs = train_epochs
        self.history_size = history_size
        self.history_sample_size = history_sample_size
        self.q_learning_history = QLearningHistory(history_size)
        self.exploration_factor = 0.2
        self.next_move_prediction = None
        self.is_neural_network_initialized = False
        pygame.init()
        self.game = Snake(width=64, height=64)
        self.env = PLE(self.game, display_screen=True)
        self.env.init()
        self.LOG = gym.logger

    def run(self,
            maximum_number_of_iterations=10000,
            learning_rate=0.5,
            training=False):

        for iteration in range(0, maximum_number_of_iterations):

            if not self.is_neural_network_initialized:
                self.__initialize_neural_network()
                self.is_neural_network_initialized = True

            observation = self.env.getScreenGrayscale()
            observation_width = self.env.getScreenDims()[0]
            observation_height = self.env.getScreenDims()[1]
            self.game.init()

            # exit the while loop only if it's GAME OVER
            while True:

                q_values = self.next_move_prediction.predict(
                    x=observation.reshape(
                        1, observation_width * observation_height),
                    batch_size=1)
                best_snake_action = np.argmax(q_values)
                reward = self.__take_snake_action(best_snake_action)
                previous_observation = copy.deepcopy(observation)
                observation = self.env.getScreenGrayscale()
                is_game_over = self.env.game_over()

                self.LOG.info(
                    "Current action reward: {r}. Is game over: {d}".format(
                        r=reward, d=is_game_over))

                if training:
                    reward = self.__get_custom_reward(reward)

                    self.q_learning_history.record_event(
                        state=previous_observation,
                        action=best_snake_action,
                        reward=reward,
                        new_state=observation)

                    last_event = self.q_learning_history.get_last_event()
                    self.LOG.info(
                        "Added event #{n} to history. Action: {a}; Reward: {r}"
                        .format(a=last_event[1],
                                r=reward,
                                n=self.q_learning_history.size))

                    if self.q_learning_history.is_full():
                        history_batch = random.sample(
                            self.q_learning_history.get_events(),
                            self.history_sample_size)
                        self.LOG.info(
                            "Sampling {n} events from history.".format(
                                n=self.history_sample_size))

                        training_batch_data = []
                        training_batch_labels = []

                        for history_event in history_batch:
                            old_state, action, reward, new_state = history_event

                            q_values_before_action = self.next_move_prediction.predict(
                                x=old_state.reshape(
                                    1, observation_width * observation_height),
                                batch_size=1)

                            q_values_after_action = self.next_move_prediction.predict(
                                x=new_state.reshape(
                                    1, observation_width * observation_height),
                                batch_size=1)

                            best_q_value_after_action = np.max(
                                q_values_after_action)

                            # Build the training target: zero for the three
                            # actions not taken, and the (scaled) one-step
                            # Q-learning update for the chosen action.
                            output_update = learning_rate * (
                                reward + (self.discount_factor *
                                          best_q_value_after_action))

                            training_q_values = np.zeros((1, 4))
                            training_q_values[0][action] = output_update

                            training_batch_data.append(
                                old_state.reshape(
                                    observation_width * observation_height, ))
                            training_batch_labels.append(
                                training_q_values.reshape(4, ))

                        training_batch_data = np.array(training_batch_data)
                        training_batch_labels = np.array(training_batch_labels)

                        self.next_move_prediction.fit(
                            x=training_batch_data,
                            y=training_batch_labels,
                            epochs=self.train_epochs,
                            batch_size=self.batch_size)

                if is_game_over:
                    break

            if self.exploration_factor > 0.1:
                self.exploration_factor -= (1.0 / maximum_number_of_iterations)
                self.LOG.info(
                    "Exploration factor updated! New value: {v}".format(
                        v=self.exploration_factor))

    def __initialize_neural_network(self):
        screen_width, screen_height = self.env.getScreenDims()
        input_layer_size = screen_width * screen_height
        hidden_layer_size = 100
        output_layer_size = 4

        input_layer = Dense(kernel_initializer='lecun_uniform',
                            units=hidden_layer_size,
                            input_shape=(input_layer_size, ),
                            activation='sigmoid')

        hidden_layer = Dense(kernel_initializer='lecun_uniform',
                             units=output_layer_size,
                             activation='linear')

        self.next_move_prediction = Sequential()
        self.next_move_prediction.add(input_layer)
        self.next_move_prediction.add(hidden_layer)

        self.next_move_prediction.compile(optimizer='rmsprop',
                                          loss='mean_squared_error')

    def __take_snake_action(self, snake_action):

        random_number = np.random.random_sample()

        if not self.q_learning_history.is_full():
            snake_action = random.choice(self.env.getActionSet())
            self.LOG.info("Snake chose to do a random move - add to qHistory!")
            return self.env.act(snake_action)

        elif random_number < self.exploration_factor:
            snake_action = random.choice(self.env.getActionSet())
            self.LOG.info(
                "Random number is smaller than exploration factor, {r} < {ef}! Snake chose random move!"
                .format(r=random_number, ef=self.exploration_factor))
            return self.env.act(snake_action)

        elif snake_action == 0:
            self.LOG.info("Snake chose to go down")
            return self.env.act(115)  # pygame K_s

        elif snake_action == 1:
            self.LOG.info("Snake chose to go left")
            return self.env.act(97)  # pygame K_a

        elif snake_action == 2:
            self.LOG.info("Snake chose to go up")
            return self.env.act(119)  # pygame K_w

        elif snake_action == 3:
            self.LOG.info("Snake chose to go right")
            return self.env.act(100)  # pygame K_d

    def __get_custom_reward(self, reward):
        if reward >= 1:
            self.LOG.info(
                "Has eaten food! Reward is {r}".format(r=self.food_reward))
            return self.food_reward
        elif reward >= 0:
            self.LOG.info(
                "Stayed alive! Reward is {r}".format(r=self.alive_reward))
            return self.alive_reward
        else:
            self.LOG.info("Crashed! Reward is {r}".format(r=self.dead_reward))
            return self.dead_reward
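
A minimal usage sketch for the class above, assuming the dependencies it already relies on (pygame, PLE, the Snake game, Keras, and the QLearningHistory helper) are importable:

agent = SnakeQNetwork(food_reward=10, dead_reward=-10, alive_reward=2)
agent.run(maximum_number_of_iterations=1000, learning_rate=0.5, training=True)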
Ejemplo n.º 25
0
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self,
                 rng,
                 game=None,
                 frame_skip=4,
                 ple_options={
                     "display_screen": True,
                     "force_fps": True,
                     "fps": 30
                 }):

        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0

        self._frame_skip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reduced_screen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # mode is not VALIDATION_MODE, so switch back to training mode (-1)
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48),
                   self._reduced_screen,
                   interpolation=cv2.INTER_NEAREST)

        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]

        reward = 0
        for _ in range(self._frame_skip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48),
                   self._reduced_screen,
                   interpolation=cv2.INTER_NEAREST)

        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if not self.inTerminalState():
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count,
            self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.float32

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reduced_screen) / 256.]

    def inTerminalState(self):
        return self._ple.game_over()
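
A minimal sketch of wiring MyEnv to a PLE game (Pong is used here purely as an illustration; any PLE game object works):

from ple.games.pong import Pong
import numpy as np

rng = np.random.RandomState(123)
env = MyEnv(rng, game=Pong(width=128, height=96), frame_skip=4,
            ple_options={"display_screen": False, "force_fps": True, "fps": 30})
env.reset(MyEnv.VALIDATION_MODE)
obs = env.observe()       # a list with one 48x48 frame scaled to [0, 1)
reward_sign = env.act(0)  # index into the PLE action set; returns sign(reward)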
Ejemplo n.º 26
0
class FlappyBirdEnv:
    def __init__(self):
        self.fps = 30
        self.game = flappyBird()
        self.env = PLE(self.game, fps=self.fps, display_screen=False)  # environment interface to game
        self.env.reset_game()

    def reset(self, is_show = False):
        self.env = PLE(self.game, fps=self.fps, display_screen=is_show)  # environment interface to game
        self.env.reset_game()
        state = self.get_state()

        return state
    
    def act(self, action):
        # return state_prime, reward, done, info
        reward = self.env.act(self.env.getActionSet()[action])
        # Optional reward shaping (disabled in this example): a per-frame survival
        # bonus, and/or a distance-based bonus computed from self.game.getGameState(),
        # e.g. reward += 0.01 * (-next_dis_to_mid - 0.5 * next_next_dis_to_mid),
        # where next_dis_to_mid is |gap_centre_y - player_y| for the next pipe.

        state_prime = self.get_state()
        is_done = self.is_over()
        info = ""
        return state_prime, reward, is_done, info

    def get_num_actions(self):
        return len(self.env.getActionSet())

    def get_action_set(self):
        return self.env.getActionSet()

    def get_screen_rgb(self):
        return self.env.getScreenRGB()

    def get_screen_gray(self):
        return self.env.getScreenGrayscale()

    def get_num_state_features(self):
        return len(self.game.getGameState())

    def get_state(self):
        # dict
        #     * player y position.
        #     * players velocity.
        #     * next pipe distance to player
        #     * next pipe top y position
        #     * next pipe bottom y position
        #     * next next pipe distance to player
        #     * next next pipe top y position
        #     * next next pipe bottom y position

        # state = {
        #     "player_y": self.player.pos_y,
        #     "player_vel": self.player.vel,
            
        #     "next_pipe_dist_to_player": next_pipe.x - self.player.pos_x,
        #     "next_pipe_top_y": next_pipe.gap_start,
        #     "next_pipe_bottom_y": next_pipe.gap_start+self.pipe_gap, 
            
        #     "next_next_pipe_dist_to_player": next_next_pipe.x - self.player.pos_x,
        #     "next_next_pipe_top_y": next_next_pipe.gap_start,
        #     "next_next_pipe_bottom_y": next_next_pipe.gap_start+self.pipe_gap 
        # }
        state = self.game.getGameState()
        state['next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_next_pipe_bottom_y'] -= state['player_y']
        return list(state.values())

    def is_over(self):
        return self.env.game_over()
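
A minimal random-agent rollout with the wrapper above, assuming the module's own imports (PLE and the Flappy Bird game) are available:

import random

env = FlappyBirdEnv()
state = env.reset(is_show=False)
done = False
while not done:
    action = random.randrange(env.get_num_actions())
    state, reward, done, info = env.act(action)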
Ejemplo n.º 27
0
class FlappyBirdEnv:
    """ 
    This is the Reinforcement Learning Environment that wraps the PLE Flappy Bird Game. The RL agent 
    interacts with the environment by providing which action it wants to take in the current state. 
    The environment in turn provides the reward and the next state to agent after executing the provided
    action.
    """
    def __init__(self, display=False):
        """
        Initializes a new environment for FlappyBird game.
        """
        game = FlappyBird()
        self._game = PLE(game, fps=30, display_screen=display)
        
        # _display_game flag controls whether or not to render the state that is being provided by the 
        # environment.
        self._display_game = display
        
        if self._display_game:
            self._display = self.show_img() # display sets up a cv2 window where the current state is displayed.
            self._display.__next__() # iterate over the display generator.
        
        self.NUM_ACTIONS = len(self._game.getActionSet())  # number of actions the agent can take in the environment.

        self._ACTION_MAP = {}
        for i, action in enumerate(self._game.getActionSet()):
            self._ACTION_MAP[i] = action
        
        # Number of consecutive images the environment provides as state. At any time, the
        # environment provides a stack of the last 4 images (including the current one) as the state.
        self._IMAGE_STACK_SIZE = 4

        # Dimension of the (greyscale) image provided as state.
        self._PROCESSED_IMAGE_SIZE = 84

        # Determines the number of times the provided action is executed before returning the next
        # state.
        self._SKIP_FRAMES = 4 

        # Used by the RL agent to set up its CNN model.
        self.STATE_SPACE = (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE, self._IMAGE_STACK_SIZE)
        self._init_states()

    def _init_states(self):
        """
        Initializes/Resets the states for the environment.
        """
        self._image_stack = None # holds the current state, i.e., stack of 4 images.
        self._score = 0
    
    def step(self, action):
        """
        Provides the next state and rewards after executing the provided action.

        Args
        ------
        `action` (int): Action to be taken from the current state.
        """
        reward = 0
        for i in range(self._SKIP_FRAMES):
            reward += self._game.act(self._ACTION_MAP[action])
        

        done = self._game.game_over()
        self._score += reward
        
        clipped_reward = self._clip_reward(reward)

        self.grab_screen()
        if self._display_game:
            self._display.send(self._image_stack) # display image on the screen
        
        return (self._image_stack.copy(), clipped_reward, done, self._score) 
    
    def _clip_reward(self, reward):
        """
        Clips the provided reward between [-1, 1]

        Args
        ----
        `reward` (float): The reward that is to be clipped.

        Returns
        -------
        A float representing the clipped reward.
        """
        if reward > 1.0:
            reward = 1.0
        elif reward < -1.0:
            reward = -1.0
        
        return reward
    
    def reset(self):
        """
        Resets the game and provides the starting state.

        Returns
        -------
        A numpy array of `_IMAGE_STACK_SIZE` stacked greyscale frames representing the current state
        of the environment.
        """
        self._game.reset_game()
        
        self._init_states()
        
        self.grab_screen()
        if self._display_game:
            self._display.send(self._image_stack)

        return self._image_stack.copy()

    def show_img(self):
        '''
        Show current state (`_IMAGE_STACK_SIZE` greyscale images) in an opencv window.

        Returns
        -------
        A generator to which images can be sent for display.
        '''
        return utils.show_image('Model Input (4 images)')
    
    def grab_screen(self):
        """
        Grabs 1 to `_IMAGE_STACK_SIZE` images (depending on whether it is called right after a reset)
        and adds them to the image stack in chronological order, i.e., the most recent image is last.
        """
        if self._image_stack is None:
            self._image_stack = np.zeros(self.STATE_SPACE, dtype=np.uint8)
            for i in range(self._IMAGE_STACK_SIZE):
                self._game.act(None)
                self._image_stack[:, :, i] = self.get_processed_image()
        else:
            self._image_stack[:, :, :self._IMAGE_STACK_SIZE-1] = self._image_stack[:, :, 1:]
            self._image_stack[:, :, self._IMAGE_STACK_SIZE-1] = self.get_processed_image()
    
    def get_processed_image(self):
        """
        Fetches the current gameplay screenshot and processes it.

        Returns
        -------
        A processed greyscale image (as numpy array) representing the current gameplay state.
        """
        screen = self._game.getScreenGrayscale()
        image = self.process_image(screen)
        return image

    def process_image(self, image):
        """
        Processes the input image by performing following steps:
        i. Cropping and transposing the image to obtain the Region Of Interest (ROI)
        ii. Resizing the ROI to (`_PROCESSED_IMAGE_SIZE`, `_PROCESSED_IMAGE_SIZE`) dimension.

        Args
        ----
        `image` (numpy array): The image which is to be processed.

        Returns
        -------
        A processed greyscale image (as numpy array).
        """
        # Step 1.
        image = image[:, :410]
        image = np.transpose(image, (1, 0))
        
        # Step 2.
        image = cv2.resize(image, (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE), interpolation=cv2.INTER_AREA)
        
        return image
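
A minimal episode loop for the wrapper above (display off, random actions), assuming the module's own imports plus numpy:

import numpy as np

env = FlappyBirdEnv(display=False)
state = env.reset()                          # uint8 array of shape (84, 84, 4)
done, score = False, 0
while not done:
    action = np.random.randint(env.NUM_ACTIONS)
    state, clipped_reward, done, score = env.step(action)
print("episode score:", score)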
Ejemplo n.º 28
0
# use a lower fps so we can see what's happening a little more easily
game = WaterWorld(width=100, height=100, num_creeps=15)

# `rewards` is a reward_values dict defined elsewhere in the original script
# (see the sketch after this example).
p = PLE(game, fps=30, force_fps=False, display_screen=True,
        reward_values=rewards)

p.init()
actions = p.getActionSet()[:-1]
agent = Agent(len(actions))

epochs = 10000000
game_duration = 1000
for epoch in range(epochs):
    p.reset_game()

    for it in range(game_duration):
        if p.game_over():
            print("Finished with score: " + str(p.score()))
            p.reset_game()

        current_state = np.array(p.getScreenGrayscale()).reshape((10000, ))

        action = agent.act(np.array([current_state]))
        # action = actions[np.random.randint(0, len(actions))]
        reward = p.act(actions[action])

        print "Current score: " + str(p.score())
    print "Finished with score:" + str(p.score())

Ejemplo n.º 29
0
#coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np
def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float32).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=96,MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()  # PLE must be initialised before querying the screen or game state
    # build the agent with the PARL framework
    print(p.getActionSet())

    act_dim = len(p.getActionSet())
    obs = get_obs(p)
    game_state = p.getGameState()
    print(game_state)
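
Continuing the snippet above, a short rollout built on get_obs could look like the following sketch (the random action choice simply stands in for an agent's policy):

import random

actions = p.getActionSet()
obs = get_obs(p)                 # flattened, normalized grayscale frame
while not p.game_over():
    reward = p.act(random.choice(actions))
    obs = get_obs(p)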
Ejemplo n.º 30
0
class Bot():
    """
            This is our Test agent. It's gonna pick some actions after training!
    """
    def __init__(self, lr):

        self.lr = lr
        self.game = Pixelcopter(width=480, height=480)
        self.p = PLE(self.game, fps=60, display_screen=True)
        self.actions = self.p.getActionSet()

    #def pickAction(self, reward, obs):
    #   return random.choice(self.actions)

    def frame_step(self, act_inp):
        terminal = False
        reward = self.p.act(act_inp)
        if self.p.game_over():
            self.p.reset_game()
            terminal = True
            reward = -1
        else:
            reward = 1

        self.score = self.p.getScore()
        img = self.p.getScreenGrayscale()
        img = transform.resize(img, (80, 80))
        img = np.ravel(exposure.rescale_intensity(img, out_range=(0, 255)))

        return img, reward, terminal

    def build_model(self):
        print("Building the model..")
        model = Sequential()
        model.add(
            Convolution2D(32,
                          8,
                          8,
                          subsample=(4, 4),
                          border_mode='same',
                          input_shape=(img_rows, img_cols,
                                       img_channels)))  #80*80*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(2))

        adam = Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=adam)
        self.model = model
        print("Finished building the model..")

    def trainNetwork(self, mode):
        D = deque()

        x_t, r_0, terminal = self.frame_step(self.actions[0])
        x_t = x_t / 255.0

        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        #print (s_t.shape)

        #need to reshape for keras
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1],
                          s_t.shape[2])  #1*80*80*4

        if mode == 'Run':
            OBSERVE = 999999999  #We keep observe, never train
            epsilon = FINAL_EPSILON
            print("Now we load weight")
            self.model.load_weights("model.h5")
            adam = Adam(lr=self.lr)
            self.model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")
        else:  #We go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON