Example #1
class Environment(object):
    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
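        # the bare "gym_ple" reference below only touches the module name; the module-level import of gym_ple is what registers the PLE environments with gym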
        gym_ple

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(screen, (self.height, self.width))  # resize to 84 x 84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # transpose to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.

        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
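A minimal usage sketch for the wrapper above, assuming numpy is imported as np (as the class itself requires) and that a PLE game id such as 'FlappyBird-v0' (hypothetical here) has been registered via gym_ple:

env = Environment('FlappyBird-v0', record=False)  # hypothetical PLE game id
observation = env.init()
for _ in range(200):
    action = np.random.randint(env.action_space)  # random index into the discrete action space
    observation, reward, done, info = env.step(action)
    screen = env.get_screen()  # preprocessed 84x84 grayscale frame scaled to [0, 1]
    if done:
        break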
Example #2
def main():
    parser = argparse.ArgumentParser(description='Load the Saved Model')
    parser.add_argument('-checkpoint',
                        '--checkpoint',
                        help='Give me a checkpoint for the model',
                        required=True)

    args = vars(parser.parse_args())
    checkpoint_path = 'saved_networks/DUEL_DQN-SpaceInvaders-v0_evaluation/SpaceInvaders-v0-' + args[
        'checkpoint']

    env = gym.make(ENV_NAME)

    # if TRAIN:  # Train mode
    #     for _ in range(NUM_EPISODES):
    #         terminal = False
    #         observation = env.reset()
    #         for _ in range(random.randint(1, NO_OP_STEPS)):
    #             last_observation = observation
    #             observation, _, _, _ = env.step(0)  # Do nothing
    #         state = agent.get_initial_state(observation, last_observation)
    #         while not terminal:
    #             last_observation = observation
    #             action = agent.get_action(state)
    #             observation, reward, terminal, _ = env.step(action)
    #             # env.render()
    #             processed_observation = preprocess(observation, last_observation)
    #             state = agent.run(state, action, reward, terminal, processed_observation)

    agent = Agent(num_actions=env.action_space.n,
                  checkpoint_path=checkpoint_path)
    env = Monitor(env, './SpaceInvaders-1', force=True)
    total_reward = 0.0

    with open('log_DuelingDQN.txt', 'a+') as open_file:
        for _ in range(NUM_EPISODES_AT_TEST):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action_at_test(state)
                observation, reward, terminal, _ = env.step(action)
                #env.render()
                processed_observation = preprocess(observation,
                                                   last_observation)
                state = np.append(state[1:, :, :],
                                  processed_observation,
                                  axis=0)

                ## Collect all the things you want
                total_reward += reward

        avg_reward = total_reward / float(NUM_EPISODES_AT_TEST)
        open_file.write(args['checkpoint'] + '\t' + 'average_reward=' +
                        str(avg_reward))
        open_file.write('\n')
Example #3
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
Example #4
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(temp)(env)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
Example #5
def main():
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
            elif (episode >= finishedTraining):
                break

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
Example #6
def main():
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("MoreExactReward12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg; Learned Reward Map \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + ";")
            results_file.write("%s \n" % (np.array_str(
                agent.actor_network.net[-1].eval())).replace("\n", " "))

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write(
        "Final Learned Reward Map; %s \n" %
        (np.array_str(agent.actor_network.net[-1].eval())).replace("\n", " "))

    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        state = env.reset()
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
Example #7
class GymEnvironment(VideoCapableEnvironment):
    """
    Wraps an Open AI Gym environment
    """

    def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
        assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
        assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
        if isinstance(repeat_action, int):
            assert repeat_action >= 1, "repeat_action should be >= 1"
        elif isinstance(repeat_action, tuple):
            assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
            assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

        super(GymEnvironment, self).__init__()

        self._state_builder = state_builder
        self._env = gym.make(env_name)
        self._env.env.frameskip = repeat_action
        self._no_op = max(0, no_op)
        self._done = True

        if monitoring_path is not None:
            self._env = Monitor(self._env, monitoring_path, video_callable=need_record)

    @property
    def available_actions(self):
        return self._env.action_space.n

    @property
    def state(self):
        return None if self._state is None else self._state_builder(self._state)

    @property
    def lives(self):
        return self._env.env.ale.lives()

    @property
    def frame(self):
        return Image.fromarray(self._state)

    def do(self, action):
        self._state, self._reward, self._done, _ = self._env.step(action)
        self._score += self._reward
        return self.state, self._reward, self._done

    def reset(self):
        super(GymEnvironment, self).reset()

        self._state = self._env.reset()

        # Random number of initial no-op to introduce stochasticity
        if self._no_op > 0:
            for _ in six.moves.range(np.random.randint(1, self._no_op)):
                self._state, _, _, _ = self._env.step(0)

        return self.state
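A hedged usage sketch for the wrapper above. It assumes the VideoCapableEnvironment base class, the StateBuilder/ALEStateBuilder helpers, and need_record come from the surrounding project, that numpy is imported as np, and that the named Atari ROM is installed:

env = GymEnvironment('BreakoutNoFrameskip-v4', repeat_action=4, no_op=30)
state = env.reset()
done = False
episode_reward = 0.0
while not done:
    action = np.random.randint(env.available_actions)  # random valid action index
    state, reward, done = env.do(action)
    episode_reward += reward
print('episode reward:', episode_reward)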
Example #8
    def log_policy_rollout(self, policy, env_name, pytorch_policy=False):
        env = Monitor(gym.make(env_name), './video', force=True)
        # env = gym.make(env_name)
        # env.monitor.start('./video', force=True)

        done = False
        episode_reward = 0
        episode_length = 0
        observation = env.reset()

        while not done:
            if pytorch_policy:
                observation = torch.tensor(observation, dtype=torch.float32)
                action = policy.act(observation)[0].data.cpu().numpy()
            else:
                action = policy.act(observation)[0]
            observation, reward, done, info = env.step(action)

            episode_reward += reward
            episode_length += 1

        print('Total reward:', episode_reward)
        print('Total length:', episode_length)

        env.close()
        # env.monitor.close()
        show_video()
Example #9
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
Example #10
def cart_pole_1():

    env = gym.make('CartPole-v0')
    # print('[cart_pole_1]', env.action_space)                    # Discrete(2)
    # print('[cart_pole_1]', env.observation_space)               # Box(4,)
    # # The action is a non-negative integer (0 or 1). Box describes an n-dimensional box, so the observation is a 4-dimensional array. We can inspect the box's upper and lower bounds.
    # print('[cart_pole_1]', env.observation_space.high)          # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    # print('[cart_pole_1]', env.observation_space.low)           # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]

    env = Monitor(env=env,
                  directory='./tmp/cartpole-experiment-0202',
                  video_callable=False,
                  write_upon_reset=True)

    observation = env.reset()  # reset the environment state and return the observation

    for t in range(100):
        env.render()  # render one frame of the environment
        print('[cart_pole_1] observation old:', observation)
        action = env.action_space.sample()
        # action = t % 2
        print('[cart_pole_1] action', action)
        observation, reward, done, info = env.step(
            action)  # advance one time step; returns observation, reward, done, info
        print('[cart_pole_1] observation new:', observation,
              '[reward, done, info]:', reward, done, info)

        if done:
            print("[observation] Done after {} time steps".format(t + 1))
            break

    env.close()
Example #12
def enjoy(policy,
          env,
          save_path=None,
          save_video=False,
          obs_fn=None,
          nepochs=100):
    """
        Enjoy and flush your results using the Monitor class.
    """
    if save_video:
        assert save_path is not None, 'A path to save videos must be provided!'
    policy.cuda()
    policy.eval()
    if save_video:
        env = Monitor(env, directory=save_path)

    for e in range(nepochs):  # honor the nepochs argument rather than a hard-coded 100
        done = False
        obs = env.reset()
        episode_rwd = 0
        while not done:
            env.render()
            if obs_fn is not None:
                obs = obs_fn(obs)
            obs = Variable(torch.from_numpy(obs[np.newaxis])).float().cuda()
            value, action, logprob, mean = policy(obs)
            action = action.data[0].cpu().numpy()
            obs, reward, done, _ = env.step(action)
            episode_rwd += reward
        print('Episode reward is', episode_rwd)
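A sketch of how enjoy() might be called. It assumes policy is a CUDA-capable torch module that returns the (value, action, logprob, mean) tuple the loop above unpacks; the environment id is only an example:

env = gym.make('Pendulum-v0')  # any continuous-control env with a render() method
enjoy(policy, env, save_path='./videos', save_video=True, obs_fn=None, nepochs=5)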
Example #13
def play(N=1000):

    # Change this to 'AssaultNoFrameskip-v4' to play the second game
    env = wrap_atari_deepmind('BreakoutNoFrameskip-v4', False)
    env = Monitor(env, directory + "/", force=True)
    agent.copy(DQN_online[4], sess_o)
    tot_reward = []
    episode = 1
    i = 0
    while i < N:

        r = 0
        s = env.reset()
        terminal = False
        episode_reward = 0
        while not terminal:

            env.render()
            a = agent.get_action(agent, env, np.array(s))
            s_next, r, terminal, dizi = env.step(a)
            episode_reward += r
            i = i + 1
            s = s_next
        tot_reward.append(episode_reward)
        print("Episode reward: ", episode_reward)
        episode = episode + 1
    env.close()
Example #14
def log_policy_rollout(params, actor, env_name, video_name):
    cur_time = time.strftime("[%Y-%m-%d_%H:%M:%S]", time.localtime())
    save_path_name = os.path.join(
        params.save_path, 'video',
        '{}->{}{}.mp4'.format(params.prefix, video_name, cur_time))
    env = gen_env(env_name)
    env = Monitor(env, save_path_name, force=True)
    done = False
    episode_reward = 0.
    episode_length = 0.
    action_list = []
    observation = env.reset()
    print('\n    > Sampling trajectory...')
    while not done:
        action = actor.gen_action(
            torch.tensor(observation, dtype=torch.float32).cuda())[0]
        action_list.append(action)
        action = action.cpu()
        if type(env.action_space.sample()) is type(int(0)):
            action = int(action)
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    # print("Action Series: {}".format(action_list))
    print('    > Total reward:', episode_reward)
    print('    > Total length:', episode_length)
    print('------------------------------------')
    env.close()
    print('Finished Sampling, saved video in {}.\n'.format(save_path_name))
Example #15
def run_video_agent(model, eps=500):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    env = gym.make('BipedalWalker-v3')
    env = Monitor(env,
                  './vid',
                  video_callable=lambda episode_id: True,
                  force=True)
    obs = env.reset()
    last_obs = obs

    fitness = 0.0

    for _ in range(eps):
        env.render()

        obs = torch.from_numpy(obs).float().to(device)
        action = (model(obs).detach()).cpu().numpy()
        new_obs, reward, done, info = env.step(action)
        fitness += reward
        obs = new_obs

        if done:
            break

    env.close()
    print("Best score ", fitness)
Example #16
def play(agent_dir, num_episodes, max_episode_steps, save_videos):
    agent = get_agent(gin.query_parameter("train.agent"))(make_env_fn(
        gin.query_parameter("train.env_id"),
        episode_time_limit=max_episode_steps))
    agent.pretrain_setup(gin.query_parameter("train.total_timesteps"))

    ckpt_path = tf.train.latest_checkpoint(
        os.path.join(agent_dir, "best-weights"))
    checkpoint = tf.train.Checkpoint(agent)
    checkpoint.restore(
        ckpt_path).assert_existing_objects_matched().expect_partial()

    env = agent.make_env()

    if save_videos:
        env = Monitor(
            env,
            os.path.join(agent_dir, "monitor"),
            video_callable=lambda _: True,
            force=True,
        )

    try:
        episodes = 0
        obs = env.reset()
        while episodes < num_episodes:
            action = agent.act(np.expand_dims(obs, 0),
                               deterministic=True).numpy()
            obs, _, done, _ = env.step(action[0])
            env.render()
            if done:
                obs = env.reset()
                episodes += 1
    except KeyboardInterrupt:
        env.close()
Example #17
def test_one(agent, dir_record, itr):
    agent.env.seed(itr)

    env_record = Monitor(agent.env, directory=dir_record)

    ob = env_record.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory
        agent.replay_memory.insert(fs1, action, np.clip(reward, -1, 1), fs2, done)
        if done:
            break
    #end

    total_reward = env_record.get_episode_rewards()[0]
    
    env_record.close()

    return total_reward
#end
Example #18
def test(model, args, verbose=True):
    # Initialize environment and model
    env = Monitor(gym.make(args.env), './recordings', force=True)
    model.eval()

    # Initialize variables
    done, ep_reward = False, []
    s = env.reset()
    hx, cx = init_hidden(1, args.size_hidden)

    # Generate rollout
    while not done:  # and step < env.spec.timestep_limit:

        # Render if enabled
        if args.render: env.render()

        # Take a step in environment
        logit, _, _, _ = model.forward(s, hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.multinomial(1).data
        s, r, done, _ = env.step(action.squeeze().numpy())
        ep_reward.append(r)

        if done: break

    # Close environment and show performance
    env.close()
    if verbose is True:
        print('Test agent achieved a reward of', np.sum(ep_reward))
Example #19
    def __init__(self,
                 render=None,
                 max_episode_steps=2000,
                 deterministic=True):
        monitor = None
        action_repeat = True
        episodic_life = True
        env = retro.make("SuperMarioBros-Nes")
        if monitor is not None:
            env = Monitor(env, monitor)
        if render is not None:
            env = AutoRenderer(env, auto_render_period=render)
        if action_repeat:
            env = FrameStack(env, 8)
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        if episodic_life:
            env = EpisodicLifeEnv(env, [0] * 9)

        env.reset()
        _, _, _, first_info = env.step(
            [0] * 9)  # TODO the order of the info dict is random
        self.first_info = first_info
        env.reset()
        self.env = env
        raw_env = env.unwrapped
        self.index_right = raw_env.buttons.index("RIGHT")
        self.index_a = raw_env.buttons.index("A")
        self.index_b = raw_env.buttons.index("B")
        self.obs_shape = len(first_info.values())
        self.agent = Agent(2, self.obs_shape, deterministic, embed=True)
        self.weight_shape = self.agent.weight_shape()
        self.n_weights = self.weight_shape[0] * self.agent.weight_shape()[1]
        with open("scaler.pickle", "rb") as pickle_out_file:
            self.scaler = pickle.load(pickle_out_file)
Example #20
def main():
    """
    You can test your game when you finish setting up your environment.
    Input range from 0 to 5:
        0 : South (Down)
        1 : North (Up)
        2 : East (Right)
        3 : West (Left)
        4 : Pick up
        5 : Drop off
    """

    GAME = "Assignment1-Taxi-v2"
    env = gym.make(GAME)
    n_state = env.observation_space.n
    n_action = env.action_space.n
    env = Monitor(env, "taxi_simple", force=True)

    s = env.reset()
    steps = 100
    for step in range(steps):
        env.render()
        action = int(input("Please type in the next action:"))
        s, r, done, info = env.step(action)
        print(s)
        print(r)
        print(done)
        print(info)

    # close environment and monitor
    env.close()
Example #21
class Simulation():
    def __init__(self, environment="CartPole-v0", save_every=5):
        env = gym.make(environment)
        self.env = Monitor(
            env,
            './video',
            video_callable=lambda episode_no: episode_no % save_every == 0,
            force=True)
        if environment == "Pong-v0":
            self.env = wrap_deepmind(env, frame_stack=True, scale=True)
        self.environment = environment
        #self.env.seed(0)
    def reset(self):
        observation = self.env.reset()
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation

    def step(self, action):
        observation, reward, is_done, info = self.env.step(action)
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation, reward, is_done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
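A minimal sketch driving the Simulation wrapper above with random actions (CartPole-v0, so the plain torch path is used):

sim = Simulation(environment="CartPole-v0", save_every=5)
obs = sim.reset()  # torch tensor of shape (1, 4) for CartPole
done = False
while not done:
    action = sim.env.action_space.sample()
    obs, reward, done, info = sim.step(action)
sim.close()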
Example #22
def evaluate(agent, env, n_episodes=5, render=False, record=False):
    total_rewards = []

    if record:
        env = Monitor(env, './videos/', force=True)

    for episode in range(n_episodes):

        obs = env.reset()
        obs = obs_reshape(obs)
        total_reward = 0.0
        episode_length = 0

        done = False
        while not done:
            action = agent.act(obs.reshape(1, *obs.shape))
            next_obs, reward, done, _ = env.step(action[0])
            next_obs = obs_reshape(next_obs)
            obs = next_obs
            
            total_reward += reward
            episode_length += 1

            if render:
                env.render()                
                
        total_rewards.append(total_reward)
        
#         print(f">> episode = {episode + 1} / {n_episodes}, total_reward = {total_reward:10.4f}, episode_length = {episode_length}")
        
    if render:
        env.close()

    return np.mean(total_rewards)
Example #23
    def run(self):
        """
        Run the agent to see it work
        """
        from gym.wrappers import Monitor
        env = Monitor(self.env, './video', force=True)
        state = env.reset()
        reward_sum = 0
        episode_number = 0
        while episode_number < 2:
            # forward the policy network and sample an action from the returned probability
            aprob, h = policy_forward(state)

            action = 0 if np.random.uniform(
            ) < aprob else 1  # randomly take 1 of two actions. we are sampling from a bernoulli distribution here

            # step the environment and get new measurements
            state, reward, done, info = env.step(action)
            reward_sum += reward
            env.render()

            if done:  # an episode finished
                episode_number += 1
                print("Episode finished with total reward", reward_sum)
                reward_sum = 0
                state = env.reset()  # reset env
Example #24
def simulate(env,
             agent,
             deterministic=True,
             num_episodes=3,
             render=True,
             wait_after_render=1e-3,
             render_kwargs=None,
             record_video=False):
    render_kwargs = render_kwargs or dict()

    assert env.max_episode_steps > 0
    if record_video:
        env = Monitor(env, directory='./data')

    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        done = False
        episode_return = 0
        t = 0
        while not done:
            if render:
                env.render(**render_kwargs)
                time.sleep(wait_after_render)

            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
        episode_info.append((t, episode_return))

    return episode_info
Example #25
def test(id):
    q = train(id)
    env = gym.make(id)
    env = env.unwrapped
    bounds, state_size = getBounds_Statesize(id)
    if RECORD:
        env = Monitor(env, './cartpole-experiment-0201', force=True)
        observation = env.reset()
        state = observation2state(observation, state_size, bounds)
        for j in range(20000):
            #env.render()
            action = np.argmax(q[state])
            observation, reward, done, info = env.step(action)
            new_state = observation2state(observation, state_size, bounds)
            state = new_state
            if done:
                print(j)
                break
        env.close()
    episode = 200
    if id == 'CartPole-v0':
        train_size = 20000
    else:
        train_size = 2000
    result = []
    for i in range(episode):
        observation = env.reset()
        state = observation2state(observation, state_size, bounds)
        for j in range(train_size):
            #env.render()
            action = np.argmax(q[state])
            observation, reward, done, info = env.step(action)
            new_state = observation2state(observation, state_size, bounds)
            state = new_state
            if done or j == train_size - 1:
                result.append(j + 1)
                break
    result = np.array(result)
    if id != 'CartPole-v0':
        result = -result
    plt.plot(result)
    plt.xlabel("number")
    plt.ylabel("reward")
    plt.show()
    print("mean", np.mean(result))
    print("var", np.std(result))
    print("len", len(result))
Example #26
def TestDQNAgent(sess,
                 env,
                 q_value_estimator,
                 state_preprocessor,
                 num_episodes,
                 experiment_dir,
                 record_steps=1):

    EpisodeStats = namedtuple('Stats', ['episode_lengths', 'episode_rewards'])
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))

    ckpt_dir = os.path.join(experiment_dir, 'checkpoints')
    record_path = os.path.join(experiment_dir, 'record/tests/')

    if not os.path.exists(record_path):
        os.makedirs(record_path)

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    if latest_checkpoint:
        print('\nLoading model checkpoint {}...'.format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())
    epsilon = 0.1
    policy = make_epsilon_greedy_policy(q_value_estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory=record_path, video_callable=lambda count: count % record_steps == 0, resume=True)
    for i_episode in range(num_episodes):
        state = env.reset()
        state = state_preprocessor.process(sess, state)
        state = np.stack([state] * 4, axis=2)

        for t in itertools.count():
            env.render()

            print("\rStep {} ({}) | Episode {}/{}".format(t, total_t, i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_preprocessor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            state = next_state
            total_t += 1

        episode_stats = EpisodeStats(episode_lengths=stats.episode_lengths[:i_episode + 1],
                                     episode_rewards=stats.episode_rewards[:i_episode + 1])
        yield total_t, episode_stats

    return stats
Example #27
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
Example #28
class OpenAIGym(Environment):

    def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0):
        """
        Initialize OpenAI Gym.

        Args:
            gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs
            monitor: Output directory. Setting this to None disables monitoring.
            monitor_safe: Setting this to True prevents existing log files from being overwritten. Default False.
            monitor_video: Save a video every monitor_video steps. Setting this to 0 disables recording of videos.
        """

        self.gym_id = gym_id
        self.gym = gym.make(gym_id)  # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv

        if monitor:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self.gym = Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable)

    def __str__(self):
        return 'OpenAIGym({})'.format(self.gym_id)

    def close(self):
        self.gym = None

    def reset(self):
        return self.gym.reset()

    def execute(self, action):
        if isinstance(self.gym.action_space, gym.spaces.Box):
            action = [action]  # some gym environments expect a list (f.i. Pendulum-v0)
        state, reward, terminal, _ = self.gym.step(action)
        return state, reward, terminal

    @property
    def states(self):
        if isinstance(self.gym.observation_space, Discrete):
            return dict(shape=(), type='float')
        else:
            return dict(shape=tuple(self.gym.observation_space.shape), type='float')

    @property
    def actions(self):
        if isinstance(self.gym.action_space, Discrete):
            return dict(continuous=False, num_actions=self.gym.action_space.n)
        elif len(self.gym.action_space.shape) == 1:
            return dict(continuous=True)
        elif len(self.gym.action_space.shape) > 1:
            return {'action' + str(n): dict(continuous=True) for n in range(len(self.gym.action_space.shape))}
        else:
            raise TensorForceError()

    def monitor(self, path):
        self.gym = Monitor(self.gym, path)
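A hedged usage sketch for the wrapper above, assuming Discrete refers to gym.spaces.Discrete in the surrounding module; the monitor directory is hypothetical:

env = OpenAIGym('CartPole-v0', monitor='./monitor-out', monitor_video=0)
print(env.states, env.actions)  # shape/type dicts consumed by the agent
state = env.reset()
terminal = False
while not terminal:
    action = env.gym.action_space.sample()  # random discrete action
    state, reward, terminal = env.execute(action)
env.gym.close()  # close the Monitor explicitly; the wrapper's close() only drops the reference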
Example #29
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)

        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
Example #31
def train_one(agent, dir_record, seed=None):
    if not seed is None:
        agent.env.seed(seed)

    env_record = Monitor(agent.env, directory=dir_record)

    ob = env_record.reset()
    agent.frame_sequence.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory
        agent.replay_memory.insert(fs1, action, reward, fs2, done)
        ## Perform learning
        if len(agent.replay_memory.memory) >= REPLAY_START_SIZE:
            agent.learn()
        ## If done == True, then this game is finished
        if done:
            break
    #end

    ## Save the model
    agent.save_model(os.path.join(dir_record, 'model.ckpt'))

    total_reward = env_record.get_episode_rewards()[0]

    env_record.close()

    ## Save cost graph per iteration
    costs_episode = zip(*agent.costs)
    fig = plt.figure()
    plt.plot(*costs_episode)
    plt.title('Costs during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    fig.savefig(os.path.join(dir_record, 'costs.png'))
    plt.close(fig)

    ## Save error graph per iteration
    errors_episode = zip(*agent.errors)
    fig = plt.figure()
    plt.plot(*errors_episode)
    plt.title('Errors during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Error')
    fig.savefig(os.path.join(dir_record, 'errors.png'))
    plt.close(fig)

    return total_reward


#end
Example #32
    def run(self, agent, render=False):
        agent.fitness = 0
        self.env.seed(self.seed)
        env = self.env
        if render:
            env = Monitor(env, './videos/' + str(time()) + '/')
        observation = env.reset()

        action_frequency = [0] * self.num_actions

        action_count = 0
        done = False
        while not done:
            # if render:
            #    env.render()

            pos = min(action_count // self.num_rep, len(agent.commands) - 1)
            action = agent.commands[pos]
            action_count += 1

            observation, reward, done, info = env.step(action)
            agent.fitness += reward

            action_frequency[action] += 1

        final_observation = list(observation)

        # For experiment 2D MAP-Elites polyhashBC
        if self.mode == ME_POLYHASH_BC:
            # calculate polynomial hash
            b1 = 3
            b2 = 7

            runningHash1 = 0
            runningHash2 = 0
            for cmd in agent.commands:
                runningHash1 = (runningHash1 * b1 + cmd) % len(agent.commands)
                runningHash2 = (runningHash2 * b2 + cmd) % len(agent.commands)
            agent.features = (runningHash1, runningHash2)
        # For experiment fitnessBC
        elif self.mode == ME_FITNESS_BC:
            agent.features = (agent.fitness, agent.fitness)
        # For experiment entropyBC
        elif self.mode == ME_ENTROPY_BC:
            # calculate RLE approximation
            numNewChars = 0
            prevChar = -2
            for cmd in agent.commands:
                if cmd != prevChar:
                    numNewChars = numNewChars + 1
                    prevChar = cmd
            agent.features = (numNewChars, numNewChars)
        # For experiment endpointBC and others
        else:
            agent.features = tuple(final_observation[:1])

        agent.action_count = action_count
Example #33
class Gym_atari_env(object):
    def __init__(self,env_name,max_path_length=None,video=False,video_dir=None,image_per_state=4):
        ## arguments
        ## env_name: name of the gym env
        ## max_path_length: max path length before a hard reset, if None, just use env default
        print("INFO: creating new env of %s" % env_name)
        ## create a new env
        self.env = gym.make(env_name)
        # remove the timelimitwrapper
        #self.env = env.env
        self.image_per_state=image_per_state
        ## define max path, either from user input or env default
        self.max_path_length = max_path_length or self.env.spec.max_episode_steps
        
        ## define parameters directly
        self.discrete =True
        ##all image will be processed to the dimension below
        self.ob_dim = [84,84,self.image_per_state]
        self.ac_dim = 4 if env_name == "Pong-v0" or env_name == "Breakout-v0" else self.env.action_space.n
        
        ## if video, use monitor
        if video:
            self.env = Monitor(self.env, directory=video_dir, video_callable=lambda x: True, resume=True)
        ## init process image
        #self.process_image
    
    #@define_scope
    def process_image(self,raw_image):
    
        ## grayscale
        out = np.mean(raw_image, axis=2).astype(np.uint8)
        ## crop, down sample and normalise
        out = out[34:194,:][::2, ::2] / 255
        ## dim expand
        out = np.hstack([np.zeros((80,2)),out,np.zeros((80,2))])
        out = np.vstack([np.zeros((2,84)),out,np.zeros((2,84))])

        return out

    def step(self,action,obs,sess=None,training=True):
        ## arguments
        ## action: action to take        
        ## return ob, rew, done, info
 
        raw_image,rew,done,info = self.env.step(action)
        next_obs = self.process_image(raw_image)
        next_obs = np.append(obs[:,:,1:], np.expand_dims(next_obs, 2), axis=2)
        
        return (next_obs,rew,done,info)
    
    def reset(self,sess=None):
        raw_image = self.env.reset()
        obs = self.process_image(raw_image)
        return np.stack([obs] * self.image_per_state, axis=2)
    
    def statistic(self):
        return self.discrete, self.max_path_length, self.ob_dim, self.ac_dim
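A minimal sketch of stepping the Atari wrapper above, assuming numpy is imported as np (as the class requires) and that 'Pong-v0' is installed; sess is left at its default because step() does not use it:

env = Gym_atari_env('Pong-v0', video=False)
discrete, max_len, ob_dim, ac_dim = env.statistic()
obs = env.reset()  # stacked (84, 84, 4) observation
for _ in range(max_len):
    action = np.random.randint(ac_dim)  # random action index
    obs, reward, done, info = env.step(action, obs)
    if done:
        break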
Example #34
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(temp, video_callable=False)(env)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']
                   ) == 1, "Found {} episodes written; expecting 1".format(
                       len(results['episode_lengths']))
Example #35
class GymEnvironment(Environment):
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False, next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action,
                                                                          (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(
                action=action, state=state, reward=reward, done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
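A sketch of how the spec-returning interface above might be consumed. The Environment base class and the EnvSpec record are assumed to come from the surrounding project, and numpy is assumed imported as np:

env = GymEnvironment('CartPole-v0', directory=None)
spec = env.reset()  # EnvSpec with next_state filled in
state = spec.next_state
while not spec.done:
    action = np.random.randint(env.num_actions)  # random discrete action
    spec = env.step(action, state)
    state = spec.next_state
env.close()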
Example #36
class FlappyBirdDNN(gym.Wrapper):
    ''' Game environment for Function Approximation with Feed Forward Neural Networks. '''
    def __init__(self, env):
        '''
        Initializes the environment.
        
        Args:
            env (PLEEnv): A Pygame environment.
        '''
        super().__init__(env)

    def save_output(self, outdir=None):
        '''
        Saves videos of the game.
        
        Args:
            outdir (str): Output directory.
        '''
        if outdir:
            self.env = Monitor(self.env, directory=outdir, force=True)

    def step(self, action):
        '''
        Lets the agent take an action and observe the next state and reward.
        
        Args:
            action (int): 0 or 1.
        
        Returns:
            tuple: state, reward, terminal.
        '''
        _, reward, terminal, _ = self.env.step(action)
        state = self.getGameState()
        if not terminal: reward += 0.5
        else: reward = -1000
        if reward >= 1: reward = 5
        return state, reward, terminal, {}

    def getGameState(self):
        '''
        Returns the current game state.
        
        Returns:
            list: A list representing the game state.
        '''
        gameState = self.env.game_state.getGameState()
        hor_dist_to_next_pipe = gameState['next_pipe_dist_to_player']
        ver_dist_to_next_pipe = gameState['next_pipe_bottom_y'] - gameState[
            'player_y']
        state = []
        state.append(gameState['player_vel'])
        state.append(hor_dist_to_next_pipe)
        state.append(ver_dist_to_next_pipe)
        return state
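A hedged sketch of using the wrapper above. It assumes a gym_ple-style 'FlappyBird-v0' environment that exposes the game_state attribute the class reads:

env = FlappyBirdDNN(gym.make('FlappyBird-v0'))
env.save_output(outdir='./flappy-videos')  # optional: record videos through Monitor
env.reset()
done = False
while not done:
    action = env.action_space.sample()  # 0 or 1, as the step() docstring above expects
    state, reward, done, _ = env.step(action)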
Example #37
class GymEnvironment(Environment):
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False, next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(
                action=action, state=state, reward=reward, done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
Example #38
def test_no_monitor_reset_unless_done():
    def assert_reset_raises(env):
        errored = False
        try:
            env.reset()
        except error.Error:
            errored = True
        assert errored, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Make sure we can reset as we please without monitor
        env = gym.make('CartPole-v0')
        env.reset()
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        env.reset()

        # can reset once as soon as we start
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # can reset multiple times in a row
        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # should allow resets after the episode is done
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
Example #39
def cart_pole_with_qlearning():
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning([[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus minus sign
                                                                  last_time_steps.std())
            break

    env.close()
Example #40
    def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True,
                 t_max=100000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise ValueError("Cannot record video without gym monitor. If you still want video, set use_monitor to True")

        if record_video:
            env = Monitor(env, save_path, force=True)
        elif use_monitor:
            env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                      dtype=get_layer_dtype(mem))
                             for mem in self.agent.agent_states]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards
Example #41
class PolicyMonitor(object):
  """
  Helps evaluating a policy by running an episode in an environment,
  saving a video, and plotting summaries to Tensorboard.

  Args:
    env: environment to run in
    policy_net: A policy estimator
    summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
  """
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

  def _policy_net_predict(self, state, sess):
    feed_dict = { self.policy_net.states: [state] }
    preds = sess.run(self.policy_net.predictions, feed_dict)
    return preds["probs"][0]

  def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
      # Copy params to local model
      global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

      # Run an episode
      done = False
      state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
      total_reward = 0.0
      episode_length = 0
      while not done:
        action_probs = self._policy_net_predict(state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
        total_reward += reward
        episode_length += 1
        state = next_state

      # Add summaries
      episode_summary = tf.Summary()
      episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
      episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
      self.summary_writer.add_summary(episode_summary, global_step)
      self.summary_writer.flush()

      if self.saver is not None:
        self.saver.save(sess, self.checkpoint_path)

      tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

      return total_reward, episode_length

  def continuous_eval(self, eval_every, sess, coord):
    """
    Continuously evaluates the policy every [eval_every] seconds.
    """
    try:
      while not coord.should_stop():
        self.eval_once(sess)
        # Sleep until next evaluation cycle
        time.sleep(eval_every)
    except tf.errors.CancelledError:
      return
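# --- Hedged usage sketch (not part of the original snippet) ----------------
# One way PolicyMonitor might be wired into an A3C training script. The env id,
# log directory, PolicyEstimator signature, and the 300-second evaluation
# interval are assumptions for illustration only.
import threading

import gym
import tensorflow as tf

eval_env = gym.make("Breakout-v0")                        # assumed Atari env id
summary_writer = tf.summary.FileWriter("/tmp/a3c/eval")   # assumed log directory

tf.train.get_or_create_global_step()                      # eval_once() reads the global step
with tf.variable_scope("global"):
    # Assumes PolicyEstimator takes the number of actions as `num_outputs`.
    global_policy_net = PolicyEstimator(num_outputs=eval_env.action_space.n)

monitor = PolicyMonitor(eval_env, global_policy_net, summary_writer, saver=tf.train.Saver())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    # Evaluate (and record a video) roughly every 300 seconds in the background.
    eval_thread = threading.Thread(target=monitor.continuous_eval, args=(300, sess, coord))
    eval_thread.start()
    coord.join([eval_thread])  # keep the session open while evaluation runs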
Example No. 42
0
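# --- Hedged sketch (not part of the original snippet) ----------------------
# `process` and `get_action` are defined elsewhere in the original notebook.
# The versions below are illustrative guesses only: a grayscale/resize/normalize
# preprocessor that returns a (1, 1, 84, 84) float tensor, and a greedy action
# choice from `policy_net` over the stacked (1, 4, 84, 84) state.
import cv2
import numpy as np
import torch

def process(frame):
    # RGB frame -> grayscale, resize to 84x84, scale to [0, 1], add batch and channel dims.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84)).astype(np.float32) / 255.0
    return torch.from_numpy(resized).unsqueeze(0).unsqueeze(0)

def get_action():
    # Greedy action w.r.t. the current stacked state; assumes the globals
    # `state` and `policy_net` used in the loop below are in scope.
    with torch.no_grad():
        return policy_net(state).argmax(dim=1).item()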
env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % 50 == 0, resume=True)
num_steps = 0  # total environment steps across all evaluation episodes
for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])
    for i_episode in range(200):
        state = env.reset()
        state = process(state)
        # Stack the same preprocessed frame 4 times to build the initial state (N, 4, H, W)
        state = torch.cat(tuple([state] * 4), dim=1)
        episode_reward = 0

        for t in count():
            action = get_action()  # action selection helper defined elsewhere in the notebook
            next_state, reward, done, _ = env.step(action)
            num_steps += 1
            episode_reward += reward
            next_state = process(next_state)
            # Drop the oldest frame and append the newest one to the frame stack
            next_state = torch.cat((state[:, 1:, :, :], next_state), dim=1)
            state = next_state
            if done:
                break
        print("reward is {}".format(episode_reward))
        print(num_steps)

# In[ ]:


experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))
i = 0
Example No. 43
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Yields:
        Tuples of (total_t, EpisodeStats), where EpisodeStats contains two numpy
        arrays with the episode_lengths and episode_rewards recorded so far.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))   

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    env.close()
    return stats
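# --- Hedged usage sketch (not part of the original snippet) ----------------
# deep_q_learning is a generator: it yields (total_t, EpisodeStats) after every
# episode. The env id and the Estimator / StateProcessor constructors below are
# assumptions about the surrounding script (which also defines VALID_ACTIONS,
# make_epsilon_greedy_policy, copy_model_parameters, and plotting), shown only
# to illustrate how the loop might be driven.
import os

import gym
import tensorflow as tf

env = gym.make("Breakout-v0")
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

tf.reset_default_graph()
tf.train.get_or_create_global_step()   # read inside deep_q_learning via get_global_step()

q_estimator = Estimator(scope="q", summaries_dir=experiment_dir)   # assumed signature
target_estimator = Estimator(scope="target_q")                     # assumed signature
state_processor = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                    env,
                                    q_estimator=q_estimator,
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    num_episodes=10000,
                                    experiment_dir=experiment_dir,
                                    replay_memory_size=500000,
                                    replay_memory_init_size=50000,
                                    epsilon_decay_steps=500000):
        print("\nEpisode reward: {}".format(stats.episode_rewards[-1]))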