Example #1
def test_monitor_filename():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp)
        env.close()

        manifests = glob.glob(os.path.join(temp, '*.manifest.*'))
        assert len(manifests) == 1
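For context on what these tests exercise: the usual Monitor life-cycle is wrap, interact, close, then read the results back from the output directory. A minimal stand-alone sketch of that pattern (the /tmp/monitor-demo path is arbitrary, and the exact import location of load_results differs between gym releases, so treat that import as an assumption):

import gym
from gym.wrappers import Monitor
from gym.wrappers.monitor import load_results  # module path varies across gym versions

env = Monitor(gym.make('CartPole-v0'), '/tmp/monitor-demo',
              video_callable=False,  # skip video recording
              force=True)            # clear any previous results in the directory
env.reset()
done = False
while not done:
    _, _, done, _ = env.step(env.action_space.sample())
env.close()

results = load_results('/tmp/monitor-demo')
print(results['episode_lengths'], results['episode_rewards'])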
Example #2
def test_video_callable_false_does_not_record():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 0
Example #3
def test_video_callable_records_videos():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(results['videos'])
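If the library's default recording schedule is not what you want, video_callable accepts any predicate over the episode index. The every-N-episodes variant below mirrors the monitor_video argument in Example #5 and the record_every logic in Example #16; the cadence of 10 is purely illustrative:

import gym
from gym.wrappers import Monitor

RECORD_EVERY = 10  # illustrative cadence, not taken from the examples

env = Monitor(gym.make('CartPole-v0'), '/tmp/cartpole-videos',
              video_callable=lambda episode_id: episode_id % RECORD_EVERY == 0,
              force=True)  # note: actual video encoding requires ffmpeg to be installed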
Example #4
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
Example #5
class GymEnvironment(Environment):
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory, video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False, next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(
                action=action, state=state, reward=reward, done=done, next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
Example #6
def cart_pole_with_qlearning():
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning([[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus minus sign
                                                                  last_time_steps.std())
            break

    env.close()
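The four *_bins arrays above are built with pd.cut but never handed to the learner; the Binning helper presumably reconstructs the same discretization from the raw ranges. A self-contained sketch of that pd.cut pattern, with np.digitize (my addition, not part of the example) showing how the interior edges discretize a reading:

import numpy as np
import pandas as pd

# pd.cut with retbins=True returns the 11 edges of 10 equal-width bins over the range;
# [1:-1] keeps the 9 interior edges, which is the form np.digitize expects.
cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
print(cart_position_bins)                     # 9 interior edges between -2.4 and 2.4
print(np.digitize(0.1, cart_position_bins))   # bin index for a cart position of 0.1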
Example #7
def test_write_upon_reset_false():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=False)
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert not files, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
Example #8
    def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True,
                 t_max=100000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise ValueError("Cannot record video without the gym Monitor. "
                             "If you still want video, set use_monitor to True")

        if record_video:
            env = Monitor(env, save_path, force=True)
        elif use_monitor:
            env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                      dtype=get_layer_dtype(mem))
                             for mem in self.agent.agent_states]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards
Example #9
def test_write_upon_reset_true():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')

        # TODO: Fix Cartpole to not configure itself automatically
        # assert not env._configured
        env = Monitor(env, directory=temp, video_callable=False, write_upon_reset=True)
        env.configure()
        env.reset()

        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0, "Files: {}".format(files)

        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
Example #10
def test_steps_limit_restart():
    with helpers.tempdir() as temp:
        env = gym.make('test.StepsLimitCartpole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # Episode has started
        _, _, done, info = env.step(env.action_space.sample())
        assert done == False

        # Limit reached, now we get a done signal and the env resets itself
        _, _, done, info = env.step(env.action_space.sample())
        assert done == True
        assert env.episode_id == 1

        env.close()
Example #11
def test_only_complete_episodes_written():
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.step(env.action_space.sample())

        env.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
Example #12
def test_env_reuse():
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        env = Monitor(env, temp)

        env.reset()

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        _, _, done, _ = env.step(None)
        assert not done
        _, _, done, _ = env.step(None)
        assert done

        env.close()
Example #13
def test_no_monitor_reset_unless_done():
    def assert_reset_raises(env):
        errored = False
        try:
            env.reset()
        except error.Error:
            errored = True
        assert errored, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Make sure we can reset as we please without monitor
        env = gym.make('CartPole-v0')
        env.reset()
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        env.reset()

        # can reset once as soon as we start
        env = Monitor(env, temp, video_callable=False)
        env.reset()

        # can reset multiple times in a row
        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # should allow resets after the episode is done
        d = False
        while not d:
            _, _, d, _ = env.step(env.action_space.sample())

        env.reset()
        env.reset()

        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
Example #14
class GymEnvironment(Environment):
    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env,
                                directory,
                                video_callable=video_callable,
                                force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None,
                           state=None,
                           reward=0,
                           done=False,
                           next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(
                    action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space,
                      Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(action=action,
                           state=state,
                           reward=reward,
                           done=done,
                           next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
Example #15
def run(seed, episodes, batch_size, gamma, inverting_gradients,
        initial_memory_threshold, replay_memory_size, epsilon_steps, tau_actor,
        tau_actor_param, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, title, epsilon_final, clip_grad, beta,
        scale_actions, split, indexed, zero_index_gradients,
        action_input_layer, evaluation_episodes, multipass, weighted, average,
        random_weighted, update_ratio, save_freq, save_dir, layers):

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)

    env = make_env(scale_actions)
    dir = os.path.join(save_dir, title)
    env = Monitor(env,
                  directory=os.path.join(dir, str(seed)),
                  video_callable=False,
                  write_upon_reset=False,
                  force=True)
    # env.seed(seed)  # doesn't work on HFO
    np.random.seed(seed)

    from agents.pdqn_nstep import PDQNNStepAgent
    from agents.pdqn_split_nstep import PDQNNStepSplitAgent
    from agents.pdqn_multipass_nstep import MultiPassPDQNNStepAgent
    assert not (split and multipass)
    agent_class = PDQNNStepAgent
    if split:
        agent_class = PDQNNStepSplitAgent
    elif multipass:
        agent_class = MultiPassPDQNNStepAgent
    assert action_input_layer >= 0
    if action_input_layer > 0:
        assert split
    agent = agent_class(
        env.observation_space,
        env.action_space,
        actor_kwargs={
            "hidden_layers": layers,
            'action_input_layer': action_input_layer,
            'activation': "leaky_relu",
            'output_layer_init_std': 0.01
        },
        actor_param_kwargs={
            "hidden_layers": layers,
            'activation': "leaky_relu",
            'output_layer_init_std': 0.01
        },
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,  # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,  # 0.99
        tau_actor=tau_actor,
        tau_actor_param=tau_actor_param,
        clip_grad=clip_grad,
        beta=beta,
        indexed=indexed,
        weighted=weighted,
        average=average,
        random_weighted=random_weighted,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        zero_index_gradients=zero_index_gradients,
        seed=seed)
    print(agent)
    network_trainable_parameters = sum(p.numel()
                                       for p in agent.actor.parameters()
                                       if p.requires_grad)
    network_trainable_parameters += sum(
        p.numel() for p in agent.actor_param.parameters() if p.requires_grad)
    print("Total Trainable Network Parameters: %d" %
          network_trainable_parameters)
    max_steps = 15000
    total_reward = 0.
    returns = []
    timesteps = []
    goals = []
    start_time_train = time.time()

    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        info = {'status': "NOT_SET"}
        state = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)
        episode_reward = 0.
        agent.start_episode()
        transitions = []
        for j in range(max_steps):
            next_state, reward, terminal, info = env.step(action)
            next_state = np.array(next_state, dtype=np.float32, copy=False)
            # status = info['status']
            # if status != 'IN_GAME':
            #     print(status)

            next_act, next_act_param, next_all_action_parameters = agent.act(
                next_state)
            next_action = pad_action(next_act, next_act_param)
            transitions.append([
                state,
                np.concatenate(([act], all_action_parameters.data)).ravel(),
                reward, next_state,
                np.concatenate(
                    ([next_act], next_all_action_parameters.data)).ravel(),
                terminal
            ])

            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward
            #env.render()

            if terminal:
                break
        agent.end_episode()

        # calculate n-step returns
        n_step_returns = compute_n_step_returns(transitions, gamma)
        for t, nsr in zip(transitions, n_step_returns):
            t.append(nsr)
            agent.replay_memory.append(state=t[0],
                                       action=t[1],
                                       reward=t[2],
                                       next_state=t[3],
                                       next_action=t[4],
                                       terminal=t[5],
                                       time_steps=None,
                                       n_step_return=nsr)

        n_updates = int(update_ratio * j)
        for _ in range(n_updates):
            agent._optimize_td_loss()

        returns.append(episode_reward)
        timesteps.append(j)
        goals.append(info['status'] == 'GOAL')

        total_reward += episode_reward
        if i % 100 == 0:
            print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                np.array(returns[-100:]).mean()))
    end_time_train = time.time()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))),
            np.column_stack((returns, timesteps, goals)))

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        agent.actor.eval()
        agent.actor_param.eval()
        start_time_eval = time.time()
        evaluation_results = evaluate(
            env, agent, evaluation_episodes)  # returns, timesteps, goals
        end_time_eval = time.time()
        print("Ave. evaluation return =",
              sum(evaluation_results[:, 0]) / evaluation_results.shape[0])
        print("Ave. timesteps =",
              sum(evaluation_results[:, 1]) / evaluation_results.shape[0])
        goal_timesteps = evaluation_results[:, 1][evaluation_results[:,
                                                                     2] == 1]
        if len(goal_timesteps) > 0:
            print("Ave. timesteps per goal =",
                  sum(goal_timesteps) / evaluation_results.shape[0])
        else:
            print("Ave. timesteps per goal = n/a (no goals reached)")
        print("Ave. goal prob. =",
              sum(evaluation_results[:, 2]) / evaluation_results.shape[0])
        np.save(os.path.join(dir, title + "{}e".format(str(seed))),
                evaluation_results)
        print("Evaluation time: %.2f seconds" %
              (end_time_eval - start_time_eval))
    print("Training time: %.2f seconds" % (end_time_train - start_time_train))

    print(agent)
    env.close()
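Besides the on-disk results read back by load_results, the Monitor keeps per-episode statistics in memory: get_episode_rewards() is what run() uses above, and get_episode_lengths() is assumed to be its companion accessor in the same API. A small stand-alone sketch:

import gym
from gym.wrappers import Monitor

env = Monitor(gym.make('CartPole-v0'), '/tmp/monitor-stats',
              video_callable=False, force=True)
for _ in range(3):
    env.reset()
    done = False
    while not done:
        _, _, done, _ = env.step(env.action_space.sample())
env.close()

print(env.get_episode_rewards())   # one total reward per completed episode
print(env.get_episode_lengths())   # matching episode lengths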
Example #16
def train(env, estimator, target_network, num_episodes=1000,
                    replay_memory_size=500000,
                    frame_history_len=4,
                    save_every=10,
                    update_every=1000,
                    discount=0.99, epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=50000,
                    batch_size=32, record_every=50):
    """
    deep q learning algorithm
    :param env: openAI gym environment
    :param estimator: estimator model for predicting values
    :param target_network:
    :param num_episodes: number of episodes to run
    :param replay_memory_size: size of replay memory
    :param update_every: copy params from estimator into target estimator after this many steps
    :param discount: discount factor
    :param epsilon_start: starting epsilon value
    :param epsilon_end: ending epsilon value
    :param batch_size: minibatch size sampled from replay memory
    :param record_every: record a video every N episodes
    :return:
    """

    # Load previous state here
    replay_memory = ReplayBuffer(replay_memory_size, frame_history_len)

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    loss_func = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(estimator.parameters())

    policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory="./monitor",
                  resume=True,
                  video_callable=lambda count: count % record_every == 0)

    total_t = 0
    pbar = tqdm(range(num_episodes))
    pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d" % (0, 0.0, 0, 0, 0))

    for ep in pbar:

        state = env.reset()  # 210 x 160 x 4
        state = process_state(state)  # 94 x 94 x 3
        episode_loss = 0
        episode_reward = 0
        episode_t = 0

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            last_idx = replay_memory.store_frame(state)

            recent_observations = replay_memory.encode_recent_observation()

            action_dist = policy(recent_observations, epsilon)
            action_dist = action_dist.squeeze(0).numpy()
            action = np.random.choice(np.arange(len(action_dist)), p=action_dist)

            next_state, reward, done, _ = env.step(action)
            reward = max(-1.0, min(reward, 1.0))

            episode_reward += reward

            replay_memory.store_effect(last_idx, action, reward, done)
            next_state = process_state(next_state)

            state = next_state

            if replay_memory.can_sample(batch_size):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_memory.sample(batch_size)
                # scale pixel observations to [0, 1]; action indices must stay integer for gather
                obs_batch = torch.from_numpy(obs_batch).float().to(device) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(device)
                rew_batch = torch.from_numpy(rew_batch).to(device)
                next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

                state_values = estimator(obs_batch)  # b x VALID_ACTIONS
                state_action_values = torch.gather(state_values, 1, act_batch.unsqueeze(1))  # b x 1

                next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0]
                next_state_values = not_done_mask * next_state_values_max

                expected_q_value = (next_state_values * discount) + rew_batch

                # bellman_error = expected_q_value - state_action_values.squeeze(1)
                #
                # clipped_bellman_error = bellman_error.clamp(-1, 1)
                #
                # d_error = clipped_bellman_error * -1.0

                loss = loss_func(state_action_values, expected_q_value.unsqueeze(1))
                episode_loss += loss.item()  # take the scalar value so the graph is not retained

                # state_action_values.backward(d_error.data.unsqueeze(1))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

            total_t += 1
            episode_t = t

        pbar.set_description("ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d" % (ep, episode_loss, episode_reward, episode_t, total_t, replay_memory.num_in_buffer))
        if total_t % update_every == 0:
            copy_model_params(estimator, target_network)

        # save checkpoint
        if ep % save_every == 0:
            torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt')

    env.close()
Example #17
class PongAgent:
    def __init__(self, mode=None):
        self.env = wrap_dqn(gym.make('PongDeterministic-v4'))
        if mode == 'test':
            self.env = Monitor(self.env,
                               './video',
                               force=True,
                               video_callable=lambda episode_id: True)
        self.num_actions = self.env.action_space.n

        self.dqn = DQN(self.num_actions)
        self.target_dqn = DQN(self.num_actions)

        if use_gpu:
            self.dqn.cuda()
            self.target_dqn.cuda()

        self.buffer = ReplayMemory(1000)

        self.gamma = 0.99

        self.mse_loss = nn.MSELoss()
        self.optim = optim.RMSprop(self.dqn.parameters(), lr=0.01)

        self.out_dir = './model'
        self.writer = SummaryWriter()

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def to_var(self, x):
        x_var = Variable(x)
        if use_gpu:
            x_var = x_var.cuda()
        return x_var

    def predict_q_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))

        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, predicts, targets, actions):
        targets = self.to_var(
            torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(
            torch.unsqueeze(torch.from_numpy(actions).long(), -1))

        affected_values = torch.gather(predicts, 1, actions)
        loss = self.mse_loss(affected_values, targets)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start,
                    epsilon_final):
        return max(epsilon_final,
                   epsilon_start - total_steps / max_epsilon_steps)

    def sync_target_network(self):
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]

    def calculate_q_targets(self, next_states, rewards, dones):
        dones_mask = (dones == 1)

        predicted_q_target_values = self.predict_q_target_values(next_states)

        next_max_q_values = np.max(
            predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[
            dones_mask] = 0  # no next max Q values if the game is over
        q_targets = rewards + self.gamma * next_max_q_values

        return q_targets

    def save_final_model(self):
        filename = '{}/final_model.pth'.format(self.out_dir)
        torch.save(self.dqn.state_dict(), filename)

    def save_model_during_training(self, episode):
        filename = '{}/current_model_{}.pth'.format(self.out_dir, episode)
        torch.save(self.dqn.state_dict(), filename)

    def load_model(self, filename):
        self.dqn.load_state_dict(torch.load(filename))
        self.sync_target_network()

    def play(self, episodes):
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.select_action(
                    state, 0)  # force to choose an action from the network
                state, reward, done, _ = self.env.step(action)
                # self.env.render()

    def close_env(self):
        self.env.close()

    def train(self, replay_buffer_fill_len, batch_size, episodes,
              max_epsilon_steps, epsilon_start, epsilon_final,
              sync_target_net_freq):
        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        # populate replay memory
        print('Populating replay buffer... ')
        print('\n')
        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            action = self.select_action(state,
                                        1)  # force to choose a random action
            next_state, reward, done, _ = self.env.step(action)

            self.buffer.add(state, action, reward, done, next_state)

            state = next_state
            if done:
                self.env.reset()

        print('replay buffer populated with {} transitions, start training...'.
              format(self.buffer.count()))
        print('\n')

        # main loop - iterate over episodes
        for i in range(1, episodes + 1):
            # reset the environment
            done = False
            state = self.env.reset()

            # reset episode reward and length
            episode_reward = 0
            episode_length = 0

            # play until it is possible
            while not done:
                # synchronize target network with estimation network at the required frequency
                if (total_steps % sync_target_net_freq) == 0:
                    self.sync_target_network()

                # calculate epsilon and select greedy action
                epsilon = self.get_epsilon(total_steps, max_epsilon_steps,
                                           epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)

                # execute action in the environment
                next_state, reward, done, _ = self.env.step(action)

                # store transition in replay memory
                self.buffer.add(state, action, reward, done, next_state)

                # sample random minibatch of transitions
                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(
                    batch_size)

                # predict Q value using the estimation network
                predicted_values = self.predict_q_values(s_batch)

                # estimate Q value using the target network
                q_targets = self.calculate_q_targets(next_s_batch, r_batch,
                                                     d_batch)

                # update weights in the estimation network
                self.update(predicted_values, q_targets, a_batch)

                # set the state for the next action selection and update counters and reward
                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += reward
                self.writer.add_scalar('data/reward', reward, total_steps)
                self.writer.add_scalar('data/epsilon', epsilon, total_steps)

            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward
            self.writer.add_scalar('data/episode_reward', episode_reward, i)
            self.writer.add_scalar('data/running_episode_reward',
                                   running_episode_reward, i)

            if (i % 30) == 0:
                print('global step: {}'.format(total_steps))
                print('episode: {}'.format(i))
                print('running reward: {}'.format(
                    round(running_episode_reward, 2)))
                print('current epsilon: {}'.format(round(epsilon, 2)))
                print('episode_length: {}'.format(episode_length))
                print('episode reward: {}'.format(episode_reward))
                curr_time = time.time()
                print('current time: ' +
                      time.asctime(time.localtime(curr_time)))
                print('running for: ' +
                      str(datetime.timedelta(seconds=curr_time - start_time)))
                print('saving model after {} episodes...'.format(i))
                print('\n')
                self.save_model_during_training(i)

        finish_time = time.time()
        print('Finish training at: ' +
              time.asctime(time.localtime(finish_time)))
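The exploration schedule in get_epsilon above is a plain linear decay clipped at epsilon_final. The same rule as a stand-alone function, with illustrative defaults that are not taken from the class:

def linear_epsilon(total_steps, max_epsilon_steps=100000,
                   epsilon_start=1.0, epsilon_final=0.02):
    """Linearly anneal epsilon from epsilon_start to epsilon_final over max_epsilon_steps."""
    return max(epsilon_final, epsilon_start - total_steps / max_epsilon_steps)

print(linear_epsilon(0), linear_epsilon(50000), linear_epsilon(200000))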
Example #18
class Environment(object):
    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # reference the gym_ple import; importing it registers the PLE environments with gym

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(
            screen, (self.height, self.width))  # resize to 84 x 84
        preprocessed = np.dot(preprocessed[..., :3],
                              [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # reorder to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.

        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
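The preprocess method above resizes the frame, converts it to grayscale with the ITU-R BT.601 luma weights, and rescales to [0, 1]. The same steps as a stand-alone snippet on a dummy frame (the 210x160 shape is just an Atari-like placeholder):

import numpy as np
import cv2

frame = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)  # dummy RGB frame
small = cv2.resize(frame, (84, 84))                    # resize to 84 x 84
gray = np.dot(small[..., :3], [0.299, 0.587, 0.114])   # RGB -> luma (grayscale)
gray = gray.astype('float32') / 255.0                  # scale to [0, 1]
print(gray.shape)  # (84, 84)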
Example #19
    def train(self, function, discount_factor, actor_learning_rate,
              learning_env, testing_env, total_observations, test_interval,
              total_number_of_testing_episodes,
              gym_training_logs_directory_path,
              gym_testing_logs_directory_path, actor_weights_saving_interval):
        """Train the agent

        function -- An instance of the class implementing the
                    actor critic model
        discount_factor -- Quantifies how much the agent cares about future
                           rewards while learning. Often referred to as gamma in
                           the literature.
        actor_learning_rate -- Learning rate of the actor
        learning_env -- A Gym environment (wrapped or vanilla) used for learning
        testing_env -- A Gym environment (wrapped or vanilla) used for testing.
        total_observations -- Train till this observation number
        test_interval -- Test after this many observations
        total_number_of_testing_episodes -- Number of episodes to test the agent
                                            in every testing round
        gym_training_logs_directory_path -- Directory to save automatic Gym logs
                                            related to training. We save the
                                            rewards for every learning episode.
        gym_testing_logs_directory_path -- Directory to save automatic Gym logs
                                           related to testing. We save a video
                                           for the first test episode.
        actor_weights_saving_interval -- Save the actor weights
                                         (i.e. write to file)
                                         after this many episodes.
        """

        # This keeps track of the number of observations made so far
        observation_number = 0

        # Keep count of the episode number
        episode_number = 1

        # The learning env should always be wrapped by the Monitor provided
        # by Gym. This lets us automatically save the rewards for every episode.

        learning_env = Monitor(
            learning_env,
            gym_training_logs_directory_path,
            # Don't want video recording during training, only during testing
            video_callable=False,
            # Write after every reset so that we don't lose data for
            # prematurely interrupted training runs
            write_upon_reset=True,
        )

        while observation_number < total_observations:

            # initialize environment
            observation = learning_env.reset()

            total_rewards_obtained_in_this_episode = 0

            action = function.get_action(observation)

            # Execute an episode
            while True:

                # take the action determined by the Softmax policy
                next_observation, reward, done, info = learning_env.step(
                    action)

                # Determine the next action. This is required for the
                # model update.

                next_action = function.get_action(next_observation)

                # Update the model
                function.update_model(
                    discount_factor,
                    actor_learning_rate,
                    observation,
                    action,
                    reward,
                    done,
                    next_observation,
                    next_action,
                )

                observation = next_observation
                action = next_action

                observation_number += 1
                # Test the current performance after every test_interval
                if observation_number % test_interval == 0:
                    # The testing env is also wrapped by a Monitor so that we
                    # can take automatic videos during testing. We will take a
                    # video for the very first testing episode.

                    video_callable = lambda count: count == 0

                    # Since the environment is closed after every testing round,
                    # the videos from different testing rounds would end up having
                    # the same name! To differentiate them, we pass
                    # a unique uid parameter.

                    monitored_testing_env = Monitor(
                        testing_env,
                        gym_testing_logs_directory_path,
                        video_callable=video_callable,
                        resume=True,
                        uid=observation_number // test_interval)

                    # Run the test
                    average_reward = self.test(
                        monitored_testing_env,
                        total_number_of_episodes=
                        total_number_of_testing_episodes,
                        function=function,
                        render=False)
                    print(
                        "[{0}] Episode number : {1}, Observation number : {2} "
                        "Average reward (100 eps) : {3}".format(
                            datetime.datetime.now(), episode_number,
                            observation_number, average_reward))

                total_rewards_obtained_in_this_episode += reward

                if done:

                    episode_number += 1

                    # Save table to file at regular intervals
                    if episode_number % actor_weights_saving_interval == 0:
                        function.save()

                    break

            print("[{0}] Episode number : {1}, Obervation number: {2}, "
                  "Reward in this episode : {3}".format(
                      datetime.datetime.now(),
                      episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode,
                  ))

        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
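Because every testing round wraps the same directory again, the code relies on resume=True to keep earlier results and on a distinct uid to keep the video files from colliding. A compact sketch of that re-wrapping pattern (round_index and ./eval-logs are placeholders):

import gym
from gym.wrappers import Monitor

round_index = 3  # placeholder for observation_number // test_interval
eval_env = Monitor(gym.make('CartPole-v0'), './eval-logs',
                   video_callable=lambda episode_id: episode_id == 0,  # video for the first test episode only
                   resume=True,
                   uid=round_index)
eval_env.reset()
eval_env.close()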
Example #20
def main_test(id):
    config(id)
    env = gym.make(id)
    env = env.unwrapped
    dqn = MyDQN(env)
    if id == 'CartPole-v0':
        T = 20000
    else:
        T = 2000

    count = 0
    train_result = []
    train_loss = []
    for i in range(2000):
        observation = env.reset()
        for j in range(T):
            action = dqn.action(observation, i)
            new_observation, reward, done, info = env.step(action)
            if id == 'CartPole-v0':
                r1 = (env.x_threshold -
                      abs(new_observation[0])) / env.x_threshold - 0.8
                r2 = (env.theta_threshold_radians - abs(
                    new_observation[2])) / env.theta_threshold_radians - 0.5
                reward = r1 + r2
                '''if j<2000:
                    reward=-200'''

            elif done:
                reward = 100
            dqn.perceive(observation, action, reward, new_observation, done)
            observation = new_observation
            if done == False and j != T - 1:
                continue
            train_result.append(j)

            if id == 'CartPole-v0':
                if done or j == T - 1:
                    if j > 5000:
                        count += 1
                    else:
                        count = 0
                    print(i, j)
                    break
            elif id == 'MountainCar-v0':
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
            else:
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
        train_loss.append(dqn.get_loss() / train_result[-1])
        if id == 'CartPole-v0' and count >= 5:
            break
        if id != 'CartPole-v0' and count >= 200:
            break
    print(train_loss)
    print(train_result)
    plt.plot(train_loss)
    plt.xlabel("round")
    plt.ylabel("loss")
    plt.show()
    if id != 'CartPole-v0':
        train_result = -np.array(train_result)
    plt.plot(train_result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()

    if RECORD:
        env = Monitor(env, './cartpole-experiment-0201', force=True)
        observation = env.reset()
        for j in range(T):
            #env.render()
            action = dqn.best_action(observation)
            observation, reward, done, info = env.step(action)
        env.close()

    result = []
    for i in range(200):
        observation = env.reset()
        for j in range(T):
            #env.render()
            action = dqn.best_action(observation)
            observation, reward, done, info = env.step(action)
            if done or j == T - 1:
                print("test", j + 1)
                result.append(j + 1)
                break
    result = np.array(result)
    if id != 'CartPole-v0':
        result = -result
    plt.plot(result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()
    print("mean", np.mean(result))
    print("var", np.std(result))
    print("len", len(result))
Example #21
class Environment(object):
    def __init__(self, game, record=False, width=64, height=64, seed=0,additional=12,activateAdditional=True,videoFolder="0/"):
        self.activateAdditional=activateAdditional
        self.game = gym.make(game)
        self.game.seed(seed)
        print("record",record)
        if record:
            print("record")
            self.game = Monitor(self.game, f'./videos/{videoFolder}', force=True)

        self.width = width
        self.height = height
        self.additional=additional
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # reference the gym_ple import; importing it registers the PLE environments with gym

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen,_ = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    """
        Checks whether the snake can move in the desired direction; if not, an alternative move is chosen.
    """
    def desiredmove(self,head, nextstep):
        if head==0:
            if nextstep != 3:
                return nextstep
            else:
                return 1
        elif head==1:
            if nextstep != 2:
                return nextstep
            else:
                return 0
        elif head==2:
            if nextstep != 1:
                return nextstep
            else:
                return 3
        elif head==3:
            if nextstep != 0:
                return nextstep
            else:
                return 2
        else:
            #time.sleep(5)

            return -1

    #returns position of food for snake
    def getfood(self,observation):
        for i in range(observation.shape[0]):
            #i is north->south
            for j in range(observation.shape[1]):
                #j is west->east
                if observation[i,j]==100:
                    #if not food_found:
                        #food_found=True
                        #print("position of food is to the south from {} to {} and to the east from {} to {}".format(i,i+5,j,j+5))
                        foodlocation=(i,i+5,j,j+5)
                        return foodlocation
        print("Error, couldn't find food!!!")
        print(np.array(observation))
        return (-4,-4,-4,-4)

    """
        gets the position of the snake's head and the direction it's facing
    """
    def getsnake(self,observation):
        direction_counter=0
        headlocation=(-20,-20)#-1
        direction=-1
        dirx=observation.shape[0]-1
        diry=observation.shape[1]-1
        for i in range(0,dirx+1):
            #i is north->south
            for j in range(0,diry+1):
                #j is west->east

                #0 means direction snake is looking at
                if observation[i,j]==0:
                        direction_counter+=1
                        #only takes the second of the three direction pixel. If only one is visible that one is taken.
                        if observation[max(0,i-2),j]==255:
                            headlocation=(max(0,i-2),j)
                            direction=3
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[min(i+2,dirx),j]==255:
                            headlocation=(min(i+2,dirx),j)
                            direction=0
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[i,max(0,j-2)]==255:
                            headlocation=(i,max(0,j-2))
                            direction=2
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[i,min(j+2,diry)]==255:
                            headlocation=(i,min(j+2,diry))
                            direction=1
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        #else:
                        #There was a problem, that sometimes the direction is displayed 3 pixel ahead of the snake instead of 2 pixel.
                        elif observation[max(0,i-3),j]==255:
                            headlocation=(max(0,i-3),j)
                            direction=3
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[min(i+3,dirx),j]==255:
                            headlocation=(min(i+3,dirx),j)
                            direction=0
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[i,max(0,j-3)]==255:
                            headlocation=(i,max(0,j-3))
                            direction=2
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        elif observation[i,min(j+3,diry)]==255:
                            headlocation=(i,min(j+3,diry))
                            direction=1
                            if direction_counter > 1 :
                                return (headlocation,direction)
                        else:
                            pass
                            #print("no direction could be found!!!!")
        return headlocation,direction

    """
        returns the positions of food, snake head, distance and guesses for the next action
    """
    def printpositions(self,observation):
        #init the output values for case they are not found in image
        food_found=False
        direction_counter=0
        foodlocation=(-4,-4,-4,-4)
        headlocation=(-3,-3)
        direction=-1
        foodlocation=self.getfood(observation)

        headlocation,direction=self.getsnake(observation)

        #get direction:
        newdirection=-1
        #generates biggest distance to food
        distsouth1=foodlocation[0]-headlocation[0]
        distsouth2=foodlocation[1]-headlocation[0]
        disteast1=foodlocation[2]-headlocation[1]
        disteast2=foodlocation[3]-headlocation[1]
        distances=np.array([distsouth1,distsouth2,disteast1,disteast2])

        bigindex=np.argmax(np.absolute(distances))

        #goes into direction of biggest distance
        pos=distances[bigindex]
        if bigindex <=1:
            if pos>=0:
                newdirection=self.desiredmove(direction,3)
            else:
                newdirection=self.desiredmove(direction,0)
        else:
            if pos>=0:
                newdirection=self.desiredmove(direction,2)
            else:
                newdirection=self.desiredmove(direction,1)

        #didn't manage to add all inputs in an array in some nice way
        additional_states=np.append(np.append(np.append(foodlocation,headlocation),[direction,newdirection]),distances)

        #if no food is found, something is wrong -> set everything to -2
        if (foodlocation[0]<0):
            additional_states.fill(-2)
            print("didn't found anything")

        return additional_states

    """
        used to put background at 0 and have equally distant values
    """
    def revaluescreen(self,element):
        if element<12:
            return 85
        if element<50:
            return 0
        if element<150:
            return 170
        else:
            return 255

    """
        shapes the image in a way, we want (1 channel) and calculates additional features if required
    """
    def preprocess(self, screen):
        #print(np.shape(screen))
        preprocessed = screen[:, :,1]
        #print(preprocessed)
        #print(preprocessed)
        #print([[self.revaluescreen(e) for e in row] for row in screen[:, :,1]])
        #for i in preprocessed
        #print(np.shape(preprocessed))
        if self.activateAdditional:
            additional_states=self.printpositions(preprocessed)#np.zeros(12)#
        else:
            additional_states=np.zeros(12)

        preprocessed=np.array([[self.revaluescreen(e) for e in row] for row in screen[:, :,1]])

        #print (preprocessed)
        preprocessed: np.array = preprocessed.astype('float32') / 255.

        return preprocessed,additional_states

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen,additional_states = self.preprocess(screen)
        #print("additional_states",additional_states)
        return screen,additional_states

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
Example #22
class Q:

    def __init__(self, max_ep, folder, constraints, goal):  # constraints: the boundaries for s in Q^g(s, a)
        self.max_episode = max_ep
        self.log = Logger(folder)
        self.env = gym.make('PendulumGoal-v0')
        self.constraints = constraints
        self.start = None
        self.goal = goal

        self.q = np.zeros((ROWS, COLS, DPTS))
        self.alpha_rec = np.ones((ROWS, COLS, DPTS))
        self.lr = 1.0
        self.gamma = 0.8

        self.epsilon = 0.5

    def __env_init_fn(self):
        self.env.reset()
        th = random.uniform(self.constraints[0][0], self.constraints[0][1])
        thdot = random.uniform(self.constraints[1][0], self.constraints[1][1])
        self.env.setup([np.array([th, thdot]), self.goal])
        return np.array([math.cos(th), math.sin(th), thdot])

    def __indexer(self, state, action=None):  # state=(cos(th), sin(th), thdot)
        c, s, dot = state[0], state[1], state[2]
        theta = math.acos(math.fabs(c))
        if c >= 0 and s >= 0:
            theta = theta
        elif c < 0 and s > 0:
            theta = PI - theta
        elif c < 0 and s < 0:
            theta = PI + theta
        else:
            theta = 2.0 * PI - theta

        row = int(round(math.degrees(theta)) / 360 * (ROWS-1))

        col = int(round((dot + 8.0) / 16.0 * (COLS - 1)))

        if action is None:
            return (row, col)
        else:
            dph = 1 if action == 1.0 else 0  #int(round((action + 1.0)))#/ 4.0 * (DPTS -1)))
            return (row, col, dph)

    def __argmin(self, state):
        idx = self.__indexer(state)
        return np.argmin(self.q[idx[0], idx[1], :])

    def __min(self, state):
        idx = self.__indexer(state)
        return np.min(self.q[idx[0], idx[1], :])

    def __update(self, state, action, cost, next_state, done, next_action=None):
        idx = self.__indexer(state, action)
        self.lr = 1.0 / self.alpha_rec[idx]
        idx_next = self.__indexer(next_state, next_action)
        self.q[idx] = ((1 - self.lr) * self.q[idx] + self.lr * (cost + (0.0 if done else self.gamma * self.q[idx_next])))
        self.alpha_rec[idx] += 1

    def __select_act(self, state, explorefree=False):
        epsilon = (0.1 if explorefree else self.epsilon)
        best_act = self.__argmin(state)
        dice = random.randint(1, 1000)
        if dice > epsilon * 1000:
            return best_act * 2.0 - 1.0
        else:
            idcs = [idx for idx in range(0, DPTS)]
            idcs.remove(best_act)
            return random.choice(idcs) * 2.0 - 1.0

    def __decrease_eps(self):
        self.epsilon = max(self.epsilon - 0.001, 0.1)

    def __decrease_lr(self):
        self.lr = max(self.lr - 0.0002, 0.01)

    def run(self):

        self.log.log(Mode.STDOUT, 'Learning started.')

        total_cost = 0
        episode = 0
        cntr = 0
        done = True
        state = None
        min_cost = 0
        action = [0.0]
        avg_scs = 0

        while episode < self.max_episode:

            if done or cntr % 250 == 0:
                episode += 1
                state = self.__env_init_fn()
                action = self.__select_act(state)
                self.log.log(Mode.TRAIN_RET_F, [cntr, episode, total_cost])
                total_cost = 0
                nzeros = np.count_nonzero(self.q)
                self.__decrease_eps()
                self.__decrease_lr()

                if episode % 10 == 0:
                    rtn, scs = self.evaluate()
                    avg_scs += scs
                    self.log.log(Mode.STD_LOG, str(episode) + ': ' + str(rtn) + ' cost: ' + str(min_cost) + ' scs: ' + str(scs) + ' nonzeros: ' + str(nzeros))
                    self.log.log(Mode.RET_F, [cntr, episode, rtn])

            next_state, cost, done, inf = self.env.step([action])
            next_action = self.__select_act(next_state)
            min_cost = inf['min_cost']
            self.__update(state, action, cost, next_state, done, next_action)
            action = next_action

            cntr += 1
            total_cost += cost
            state = next_state

        self.log.log(Mode.NUMPY, self.q)
        self.log.log(Mode.STDOUT, 'Learning finished. matrix was saved.')

        return avg_scs * 10 / self.max_episode

    def evaluate(self, video=False, show=False):

        total_cost = 0
        cntr = 0
        episode = 0
        done = True
        success = -1
        state = None

        orig_env = self.env
        if video:
            self.env = Monitor(orig_env, self.log.video_folder())

        while episode < 20:

            if done:
                success += 1

            if done or cntr > 2000:
                if video:
                    print(str(episode) + ' ' + str(total_cost))
                episode += 1
                state = self.__env_init_fn()
                cntr = 0

            action = self.__select_act(state, explorefree=True)
            state, cost, done, _ = self.env.step([action])
            if show:
                self.env.render()
            total_cost += cost
            cntr += 1

        self.env = orig_env

        return total_cost / 20.0, success

    def load_from_file(self):
        self.q = self.log.deserialize_numpy()

    def __del__(self):
        self.env.close()
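
# A hedged note (separate from the class above): the quadrant bookkeeping in
# __indexer can equivalently be written with math.atan2, which returns the
# angle directly; taking it modulo 2*pi yields the same value in [0, 2*pi)
# that the branches reconstruct (up to boundary cases). Sketch, with ROWS and
# COLS standing in for the module-level grid constants:
#
#     theta = math.atan2(s, c) % (2.0 * math.pi)          # angle in [0, 2*pi)
#     row = int(round(math.degrees(theta)) / 360 * (ROWS - 1))
#     col = int(round((dot + 8.0) / 16.0 * (COLS - 1)))   # thdot assumed in [-8, 8]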
    def train(
        self,
        function,
        discount_factor,
        start_epsilon,
        end_epsilon,
        observation_number_when_epsilon_annealing_ends,
        replay_memory_size,
        learning_env,
        testing_env,
        total_observations,
        observation_number_when_training_starts,
        test_interval,
        total_number_of_testing_episodes,
        gym_training_logs_directory_path,
        gym_testing_logs_directory_path,
    ):
        """Train the agent

        function -- An instance of the class implementing the
                    function approximation model
        discount_factor -- Quantifies how much the agent cares about future
                           rewards while learning. Often referred to as gamma in
                           the literature.
        start_epsilon -- Probability of random actions at start of training
        end_epsilon -- Probability of random actions at end of training
        observation_number_when_epsilon_annealing_ends -- Epsilon annealing
                                                          ends when
                                                          observation_number
                                                          reaches this value
        replay_memory_size -- Replay memory contains at most this many
                              experiences at any given point in training.
                              When replay memory grows bigger than this, some
                              of the earlier experiences are thrown away.
        learning_env -- A Gym environment (wrapped or vanilla) used for learning
        testing_env -- A Gym environment (wrapped or vanilla) used for testing.
        total_observations -- Train till this observation number
        observation_number_when_training_starts -- Training starts when
                                                   observation_number
                                                   reaches this value
        test_interval -- Test after this many observations
        total_number_of_testing_episodes -- Number of episodes to test the agent
                                            in every testing round
        gym_training_logs_directory_path -- Directory to save automatic Gym
                                            logs related to training. We save
                                            the rewards for every learning
                                            episode.
        gym_testing_logs_directory_path -- Directory to save automatic Gym
                                           logs related to testing. We save a
                                           video for the first test episode.
        """

        # This keeps track of the number of observations made so far
        observation_number = 0

        # Keep count of the episode number
        episode_number = 1

        # The learning env should always be wrapped by the Monitor provided
        # by Gym. This lets us automatically save the rewards for every episode.

        learning_env = Monitor(
            learning_env,
            gym_training_logs_directory_path,
            # Don't want video recording during training, only during testing
            video_callable=False,
            # Write after every reset so that we don't lose data for
            # prematurely interrupted training runs
            write_upon_reset=True,
        )

        # To ensure that the replay memory never exceeds replay_memory_size,
        # we use a deque with maxlen: once it is full, appending a new
        # experience discards the oldest one (first in, first out)
        replay_memory = deque([], maxlen=replay_memory_size)

        while observation_number < total_observations:

            # initialize environment
            observation = learning_env.reset()

            total_rewards_obtained_in_this_episode = 0

            # Execute an episode
            while True:

                # Determine the action according to the epsilon greedy policy

                epsilon = self.get_epsilon(
                    start_epsilon,
                    end_epsilon,
                    observation_number,
                    observation_number_when_epsilon_annealing_ends,
                )

                action = function.get_action(observation, epsilon)

                # take the action determined by the epsilon-greedy policy
                next_observation, reward, done, info = learning_env.step(
                    action)

                # Store experience in replay memory
                transition = {
                    "observation": observation,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "next_observation": next_observation
                }

                replay_memory.append(transition)

                # Update the model
                if observation_number > observation_number_when_training_starts:
                    function.update_model(discount_factor, replay_memory)

                observation = next_observation

                observation_number += 1
                # Test the current performance after every test_interval
                if observation_number % test_interval == 0:
                    # The testing env is also wrapped by a Monitor so that we
                    # can take automatic videos during testing. We will take a
                    # video for the very first testing episode.

                    video_callable = lambda count: count == 0

                    # Since the environment is closed after every testing round,
                    # the videos from different testing rounds would end up with
                    # the same name! To differentiate the videos, we pass
                    # a unique uid parameter.

                    monitored_testing_env = Monitor(
                        testing_env,
                        gym_testing_logs_directory_path,
                        video_callable=video_callable,
                        resume=True,
                        uid=observation_number / test_interval)

                    # Run the test
                    average_reward = self.test(
                        monitored_testing_env,
                        total_number_of_episodes=
                        total_number_of_testing_episodes,
                        function=function,
                        epsilon=0.05,
                        render=False)
                    print(
                        "[{0}] Episode number : {1}, Observation number : {2} "
                        "Average reward (100 eps) : {3}".format(
                            datetime.datetime.now(), episode_number,
                            observation_number, average_reward))

                total_rewards_obtained_in_this_episode += reward

                if done:
                    episode_number += 1
                    break

            print("[{0}] Episode number : {1}, Obervation number: {2}, "
                  "Reward in this episode : {3}, Epsilon : {4}".format(
                      datetime.datetime.now(), episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode, epsilon))

        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
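
# get_epsilon is not shown in this snippet; a minimal sketch of the linear
# annealing schedule described in the docstring might look like the helper
# below (in the class above it is a method; written standalone here purely
# for illustration, with the same argument order as the call site).
def get_epsilon(start_epsilon, end_epsilon, observation_number,
                observation_number_when_epsilon_annealing_ends):
    """Linearly anneal epsilon from start to end, then hold it constant."""
    if observation_number >= observation_number_when_epsilon_annealing_ends:
        return end_epsilon
    fraction = observation_number / observation_number_when_epsilon_annealing_ends
    return start_epsilon + fraction * (end_epsilon - start_epsilon)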
Ejemplo n.º 24
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
	Q-Learning algorithm for off-policy TD control using Function Approximation.
	Finds the optimal greedy policy while following an epsilon-greedy policy.

	Args:
	    sess: Tensorflow Session object
	    env: OpenAI environment
	    q_estimator: Estimator object used for the q values
	    target_estimator: Estimator object used for the targets
	    state_processor: A StateProcessor object
	    num_episodes: Number of episodes to run for
	    experiment_dir: Directory to save Tensorflow summaries in
	    replay_memory_size: Size of the replay memory
	    replay_memory_init_size: Number of random experiences to sample when initializing
	      the replay memory.
	    update_target_estimator_every: Copy parameters from the Q estimator to the 
	      target estimator every N steps
	    discount_factor: Gamma discount factor
	    epsilon_start: Chance to sample a random action when taking an action.
	      Epsilon is decayed over time and this is the start value
	    epsilon_end: The final minimum value of epsilon after decaying is done
	    epsilon_decay_steps: Number of steps to decay epsilon over
	    batch_size: Size of batches to sample from the replay memory
	    record_video_every: Record a video every N episodes

	Returns:
	    An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
	"""

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    ############################################################
    # YOUR CODE 1 : Populate replay memory!
    # Hints : use function "populate_replay_buffer"
    # about 1 line code
    replay_memory = populate_replay_buffer(sess, env, state_processor,
                                           replay_memory_init_size,
                                           VALID_ACTIONS, Transition, policy)

    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):
        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_process(sess, state_processor, state)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            ###########################################################
            # YOUR CODE 2: Target network update
            # Hints : use function  "copy_model_parameters"
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}  Memory Len {} ".
                  format(t, total_t, i_episode + 1, num_episodes, loss,
                         len(replay_memory)),
                  end="")
            sys.stdout.flush()

            ##############################################
            # YOUR CODE 3: Take a step in the environment
            # Hints 1 :  be careful to use function 'state_process' to deal the RPG state
            # Hints 2 :  you can see function "populate_replay_buffer()"
            #				for detail about how to TAKE A STEP
            # about 2 or 3 line codes
            action = np.random.choice(len(VALID_ACTIONS),
                                      p=policy(sess, state, epsilon))
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            #############################
            # YOUR CODE 4: Save transition to replay memory
            #  Hints : you can see function 'populate_replay_buffer' for detail
            # about 1 or 2 line codes
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #########################################################
            # YOUR CODE 5: Sample a minibatch from the replay memory,
            # hints: can use function "random.sample( replay_memory, batch_size )" to get minibatch
            # about 1-2 lines codes
            minibatch = random.sample(replay_memory, batch_size)
            state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(
                np.array, zip(*minibatch))

            ###########################################################
            # YOUR CODE 6: use minibatch sample to calculate q values and targets
            # Hints 1 : use function 'q_estimator.predict' to get q values
            # Hints 2 : use function 'target_estimator.predict' to get targets values
            #				remember 'targets = reward + gamma * max q( s, a' )'
            # about 2 line codes

            q = target_estimator.predict(sess, next_state_batch)
            not_done = np.invert(done_batch).astype(np.float32)
            targets = reward_batch + not_done * discount_factor * np.max(
                q, axis=1)

            ################################################
            # YOUR CODE 7: Perform gradient descent update
            # hints : use function 'q_estimator.update'
            # about 1 line code
            loss = q_estimator.update(sess, state_batch,
                                      np.array(action_batch), targets)
            if done:
                break
            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
    return stats
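
# Because deep_q_learning is a generator (it yields running statistics after
# every episode), it is consumed in a loop. A hedged usage sketch, with the
# session/environment setup omitted and the directory name purely
# illustrative:
#
# for total_t, episode_stats in deep_q_learning(
#         sess, env, q_estimator, target_estimator, state_processor,
#         num_episodes=10000, experiment_dir="./experiments/atari",
#         record_video_every=50):
#     print("Last episode reward:", episode_stats.episode_rewards[-1])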
Ejemplo n.º 25
0
class Runner:
    """Define runner for reinforcement learning update on Grid World."""
    def __init__(
        self,
        policy: str,
        env_config: Dict[str, Any],
        agent_config: Dict[str, Any],
        n_episode: int = 10,
        max_length: int = 100,
        save_video: bool = True,
        save_dir: str = "./result",
    ) -> None:
        """Initialize."""
        # learning settings
        self.env = CustomLavaEnv(**env_config)
        self.policy = policy
        self.agent = self.get_agent(policy, self.env, agent_config)

        self.env_config = env_config
        self.agent_config = agent_config

        self.n_episode = n_episode
        self.max_length = max_length
        self.episode_lengths: List[int] = []
        self.episode_rewards: List[float] = []

        # log settings
        self.save_video = save_video
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        self.save_dir = os.path.join(save_dir, policy)
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)

        log_file = os.path.join(self.save_dir, "log.txt")
        # Delete old log if it exists.
        if os.path.exists(log_file):
            os.remove(log_file)

        logging.basicConfig(
            filename=log_file,
            format="%(asctime)s [%(levelname)s] %(message)s",
            datefmt="%Y/%m/%d %I:%M:%S",
            level=logging.INFO,
        )
        logging.getLogger().addHandler(logging.StreamHandler())

        logging.info("POLICY: %s", self.policy)
        logging.info("ENV CONFIG: %s", self.env_config)
        logging.info("AGENT CONFIG: %s", self.agent_config)
        logging.info("N_EPISODES: %s", self.n_episode)
        logging.info("MAX_LENGTH: %s", self.max_length)
        logging.info("")

    def get_agent(self, policy: str, env: gym.Env,
                  agent_config: Dict[str, Any]) -> AbstractAgent:
        """Get agent with policy."""
        if policy == "random":
            agent = RandomPolicy(env)
        elif policy == "pi":
            agent = PolicyIteration(env, agent_config)
        elif policy == "vi":
            agent = ValueIteration(env, agent_config)
        elif policy == "mc":
            agent = MCAgent(env, agent_config)
        elif policy == "sarsa":
            agent = SARSAAgent(env, agent_config)
        elif policy == "qlearning":
            agent = QLearningAgent(env, agent_config)
        else:
            raise NotImplementedError
        return agent

    def run(self) -> None:
        """Start Agent-Environment Interaction and update policy."""
        for episode in range(self.n_episode):
            # Re-wrap the env every episode so that a video is saved for each episode.
            if self.save_video:
                save_dir = os.path.join(self.save_dir,
                                        "{}_{}".format(self.policy, episode))
                self.env = Monitor(self.env, save_dir, force=True)

            if self.policy in ["pi", "vi"]:
                self.run_dynamic_programming()
            elif self.policy in ["mc"]:
                self.run_monte_carlo()
            elif self.policy in ["sarsa", "qlearning"]:
                self.run_temporal_difference()
            elif self.policy in ["random"]:
                self.run_random_agent()
            else:
                raise NotImplementedError

            logging.info(
                "Episode: %d | Episode Length: %d | Episode reward: %d",
                episode,
                self.episode_lengths[-1],
                self.episode_rewards[-1],
            )
            self.agent.print_results()

            self.env.close()

    def run_random_agent(self) -> None:
        """Run single episode for random agent."""
        done = False
        episode_reward = 0

        obs = self.env.reset()

        for _step in range(self.max_length):
            cur_state = obs["pos"]
            action = self.agent.get_action(cur_state)
            obs, reward, done, _ = self.env.step(action)

            episode_reward += reward

            if done:
                break

        self.episode_lengths.append(_step + 1)
        self.episode_rewards.append(episode_reward)

    def run_dynamic_programming(self) -> None:
        """Run single episode and update DP methods."""
        done = False
        episode_reward = 0

        obs = self.env.reset()

        for _step in range(self.max_length):
            cur_state = obs["pos"]
            action = self.agent.get_action(cur_state)
            obs, reward, done, _ = self.env.step(action)

            episode_reward += reward

            if done:
                break

        update_info = dict(agent_pos=obs["pos"],
                           reward_grid=obs["reward_grid"])

        self.agent.update_policy(update_info)

        self.episode_lengths.append(_step + 1)
        self.episode_rewards.append(episode_reward)

    def run_monte_carlo(self) -> None:
        """Run single episode and update MC methods."""
        done = False
        episode_reward = 0

        obs = self.env.reset()
        transactions = []

        for _step in range(self.max_length):
            cur_state = obs["pos"]
            action = self.agent.get_action(cur_state)
            obs, reward, done, _ = self.env.step(action)
            next_state = obs["pos"]

            transactions.append((cur_state, action, next_state, reward))

            episode_reward += reward

            if done:
                break

        update_info = dict(transactions=transactions)
        self.agent.update_policy(update_info)

        self.episode_lengths.append(_step + 1)
        self.episode_rewards.append(episode_reward)

    def run_temporal_difference(self) -> None:
        """Run single episode and update TD methods."""
        done = False
        episode_reward = 0

        obs = self.env.reset()

        for _step in range(self.max_length):
            cur_state = obs["pos"]
            action = self.agent.get_action(cur_state)
            obs, reward, done, _ = self.env.step(action)
            next_state = obs["pos"]

            update_info = dict(
                state=cur_state,
                action=action,
                reward=reward,
                next_state=next_state,
            )

            self.agent.update_policy(update_info)

            episode_reward += reward

            if done:
                break

        self.episode_lengths.append(_step + 1)
        self.episode_rewards.append(episode_reward)
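
# A hedged usage sketch for the Runner above; the concrete keys expected in
# env_config and agent_config depend on CustomLavaEnv and the chosen agent,
# so the dictionaries below are placeholders only.
#
# runner = Runner(
#     policy="qlearning",
#     env_config={"grid_height": 5, "grid_width": 5},       # hypothetical keys
#     agent_config={"learning_rate": 0.1, "epsilon": 0.1},  # hypothetical keys
#     n_episode=10,
#     max_length=100,
#     save_video=True,
#     save_dir="./result",
# )
# runner.run()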
Ejemplo n.º 26
0
def runExperiment(experiment):
    import numpy as np
    from collections import deque
    import gym
    from gym.wrappers import Monitor
    from agents.dqnagent import DQNAgent

    #environment parameters
    gym_id = experiment["gym_id"]
    sliding_window_solved_score = experiment["sliding_window_solved_score"]
    sliding_window_score_length = experiment["sliding_window_score_length"]
    env_seed = experiment["env_seed"]
    max_episode = experiment["max_episode"]

    env = gym.make(gym_id)
    env = Monitor(env,
                  "{}".format(experiment['folder']),
                  video_callable=False,
                  force=True,
                  resume=False,
                  write_upon_reset=False,
                  uid=None,
                  mode=None)

    env.seed(env_seed)
    scores = deque()
    sw_scores = deque(maxlen=sliding_window_score_length)

    #agent parameters
    agent_seed = experiment["agent_seed"]
    activation = experiment["activation"]
    min_episode_before_acting = experiment["min_episode_before_acting"]
    epsilon = experiment["epsilon"]
    nb_hidden_layer = experiment["nb_hidden_layer"]
    layer_width = experiment["layer_width"]
    memory_length = experiment["memory_length"]
    batch_size = experiment["batch_size"]
    agent = DQNAgent(env.observation_space, env.action_space, agent_seed,
                     min_episode_before_acting, activation, epsilon,
                     layer_width, nb_hidden_layer, memory_length)

    current_episode = 0
    while (len(sw_scores) == 0
           or np.mean(sw_scores) < sliding_window_solved_score) and (
               max_episode is None or current_episode < max_episode):
        state = env.reset()

        current_episode += 1
        reward = 0
        done = False
        episode_score = 0

        while not done:
            action = agent.act(state)

            next_state, reward, done, _ = env.step(action)

            agent.remember(state, action, reward, next_state, done)

            state = next_state

            episode_score += reward

            # if np.mean(sw_scores) > 180:
            #     env.render()

            if done:
                scores.append(episode_score)
                sw_scores.append(episode_score)

                print(
                    'Episode: {}\t Epsilon: {}\t Score: {}\t Mean Score:{}\t Sliding Score:{}\t'
                    .format(current_episode, agent.epsilon, episode_score,
                            np.mean(scores), np.mean(sw_scores)))
                agent.train(batch_size=batch_size)
    env.close()
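
# Every key read by runExperiment appears in the function body above, so a
# minimal experiment dict looks like the following (the values are
# illustrative placeholders, not settings from the original code):
#
# experiment = {
#     "gym_id": "CartPole-v0",
#     "folder": "./results/cartpole",
#     "sliding_window_solved_score": 195,
#     "sliding_window_score_length": 100,
#     "env_seed": 0,
#     "max_episode": 1000,
#     "agent_seed": 0,
#     "activation": "relu",
#     "min_episode_before_acting": 10,
#     "epsilon": 1.0,
#     "nb_hidden_layer": 2,
#     "layer_width": 24,
#     "memory_length": 100000,
#     "batch_size": 64,
# }
# runExperiment(experiment)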
Ejemplo n.º 27
0
    def train(self):
        """
        The training loop. This runs a single episode.

        TODO: Implement the following as desired:
            1. Storing transitions to the ReplayMemory
            2. Updating the network at some frequency
            3. Backing up the current parameters to a reference, target network
        """
        # Initially perform some random walks and make a replay memory
        env = Monitor(self.env, self.monitor_dir, force=True)
        for episode in range(1000):
            done = False
            obs = env.reset()
            while not done:
                action = random.randint(0, env.action_space.n - 1)
                encoded_action = np.zeros(env.action_space.n)
                encoded_action[action] = 1
                next_obs, reward, done, info = env.step(action)
                self.replay_memory.append(
                    (obs, encoded_action, reward, next_obs, done))
                obs = next_obs
                if len(self.replay_memory) > self.min_replay_size:
                    self.replay_memory.popleft()

        sum_of_reward = 0
        for episode in range(self.max_episode + 1):
            obs = env.reset()
            if self.change_eps:
                if self.eps_start > self.eps_mid:
                    self.eps_start -= (
                        initial_eps - mid_eps
                    ) / self.eps_decay  # Linear decay of exploration
                elif self.eps_start > self.eps_end:
                    self.eps_start -= (mid_eps -
                                       final_eps) / self.eps_decay_later
            else:
                self.eps_start = initial_eps
            done = False
            reward_per_episode = 0
            while not done:
                action = self.select_action(obs)
                next_obs, reward, done, info = env.step(action)
                self.train_network(obs, action, reward, next_obs, done)
                obs = next_obs
                reward_per_episode += reward
            sum_of_reward += reward_per_episode
            if episode % 100 == 0:
                avg_reward = sum_of_reward / 100
                self.saver.save(self.sess, 'models/dqn-model')
                print("Avg reward: %s" % avg_reward)
                if avg_reward > 210:
                    test_reward = 0
                    for i in range(self.sanity_epochs):
                        obs = env.reset()
                        done = False
                        while not done:
                            action = self.select_action(obs,
                                                        evaluation_mode=True)
                            next_obs, reward, done, info = env.step(action)
                            test_reward += reward
                    avg_test_reward = test_reward / self.sanity_epochs
                    print("Episode: ", episode, "Average test reward: ",
                          avg_test_reward)
                    if avg_test_reward >= 200:
                        env.close()
                        break
                sum_of_reward = 0
Ejemplo n.º 28
0
monitor_dir = '/tmp/cartpole_exp1'

monitor = Monitor(env, monitor_dir, force=True)

sess.run(tf.global_variables_initializer())
b_obs, b_acts, b_rews = [], [], []

# for _ in range(eparams['ep_per_batch']):

obs, acts, rews = policy_rollout(env)

print('Episode steps: {}'.format(len(obs)))

b_obs.extend(obs)
b_acts.extend(acts)

advantages_rew = process_rewards(rews)
b_rews.extend(advantages_rew)

# Sanity check on the batch shapes (these expressions have no effect when run
# as a script; they are useful in an interactive session).
np.array(b_obs).shape
np.array(b_acts).shape
np.array(b_rews).shape

b_rews = (b_rews - np.mean(b_rews)) / (np.std(b_rews) + 1e-10)

train_step(b_obs, b_acts, b_rews)

monitor.close()

sess.close()
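
# process_rewards is not shown in this snippet. A common choice for the
# advantage signal in a policy-gradient rollout is the discounted
# reward-to-go; the function below is only an assumption about what
# process_rewards might compute, not the original implementation.
import numpy as np

def process_rewards(rews, gamma=0.99):
    """Discounted reward-to-go for each timestep (assumed implementation)."""
    out = np.zeros(len(rews), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = rews[t] + gamma * running
        out[t] = running
    return out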
Ejemplo n.º 29
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    #print "q_learning starts"
    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        #print "action_probs is", action_probs
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        #print "action is", action
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        #print "next state is", next_state.shape
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)  # (84,84) to (84,84,1)
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  video_callable=lambda count: count % record_video_every == 0,
                  resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # TODO: Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            #print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
            #        t, total_t, i_episode + 1, num_episodes, loss))
            sys.stdout.flush()

            # Take a step in the environment
            # The policy we're following
            policy = make_epsilon_greedy_policy(target_estimator,
                                                len(VALID_ACTIONS))

            action_probs = policy(
                sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.amax(q_values_next, axis=1)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                print("\rEpisode {}/{}, done, loss: {}".format(
                    i_episode + 1, num_episodes, loss))
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
Ejemplo n.º 30
0
def main(argv=()):
    del argv  # Unused.

    # Build an environment
    
    # Create and record episode - remove Monitor statement if recording not desired
    env = Monitor(gym.make('one-random-evader-v0'), './tmp/pursuit_evasion_infer_pursuer_vs_random_evader', force=True)

    #Reset state
    state = env.reset()
    
    #Initialize Agent Parameters
    #Get observed state space
    observed_state_space = env.get_observed_state_space()
    #Set initial state distribution
    initial_state_dist = []
    initial_state = env.get_initial_state()
    for state in observed_state_space:
        if state == initial_state:
            initial_state_dist.append(1)
        else:
            initial_state_dist.append(0)
    #Get action space
    action_space = range(0, env.action_space.n)
    #Set action prior to uniform dist
    action_prior = []
    for action in action_space:
        action_prior.append(1/len(action_space))
    #Get reward function
    reward_function = env.get_reward_function()
    #Get transition function 
    transition_function = env.get_transition_function()
    #Set max trajectory length
    max_trajectory_length = 11 #needs to be greater than shortest distance to evader for any meaningful inference

    #Create Agent
    agent = infer.DiceInferenceEngine(observed_state_space, action_space, initial_state_dist, action_prior, reward_function, transition_function, max_trajectory_length)
    print("\nAgent created.\n")
    #Set current observed state to initial state
    uncolored_obs = initial_state
    #Initialize actions list
    actions = []
    print("\nInfering action " + str(0) + "\n")
    actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item())

    #Game Loop
    for t in range(0, 11):

        #Render
        env.render()
         
        #Delay to make video easier to watch
        #sleep(5)

        #Take action and get observations, rewards, termination from environment 
        observation, reward, done, info = env.step(actions[t]) 

        #If termination signal received, break out of loop
        if done:
            break

        #Pick next action based on agent's reasoning
        uncolored_obs = env.uncolor_board(observation)
        print("\nInfering action " + str(t + 1) + "\n")
        actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item())


 

    env.close()
Ejemplo n.º 31
0
class Environment(object):
    def __init__(self,
                 game='FlappyBird-v0',
                 record=False,
                 width=84,
                 height=84,
                 seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        #         preprocessed = screen[:400, 40:]
        preprocessed = screen
        preprocessed = transform.resize(preprocessed,
                                        (self.height, self.width))
        preprocessed = color.rgb2gray(preprocessed)
        preprocessed = preprocessed.astype('float32') / 255.

        return preprocessed

    def init(self):
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    def close(self):
        self.game.close()

    @property
    def action_space(self):
        return self.game.action_space.n

    @property
    def observation_space(self):
        return self.game.observation_space
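
# Hedged usage sketch for the Environment wrapper above (assumes the
# FlappyBird-v0 environment, e.g. from gym-ple, is registered):
#
# env = Environment(game='FlappyBird-v0', record=True, width=84, height=84)
# env.play_sample(mode='rgb_array')   # random rollout, preprocessing each frame
# env.close()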
Ejemplo n.º 32
0
def record_sessions(env_id, agent, n_actions):
    env = Monitor(gym.make(env_id), directory='videos', force=True)
    for _ in range(100):
        generate_agent_session(env, agent, n_actions)

    env.close()
Ejemplo n.º 33
0
def train():

    logger.configure()
    set_global_seeds(args.seed)

    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env,
                  datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = models.wrap_atari_dqn(env)

    nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None
    reload_path = args.reload_path if args.reload_path else None
    if args.record:
        env = Monitor(env, directory=directory)

    with tf.device(args.device):
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[args.num_units] * args.num_layers,
            dueling=bool(args.dueling),
            init_mean=args.init_mean,
            init_sd=args.init_sd,
        )

        act, records = simple.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.lr_decay_factor,
            lr_growth_factor=args.lr_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            train_freq=4,
            print_freq=1000,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            target_network_update_freq=args.target_update_freq,
            gamma=0.99,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            epoch_steps=args.nb_epoch_steps,
            alg=args.alg,
            noise=args.noise,
            gpu_memory=args.gpu_memory,
            varTH=args.varth,
            act_policy=args.act_policy,
            save_dir=directory,
            nb_test_steps=nb_test_steps,
            scope=args.scope,
            test_eps=args.test_eps,
            checkpoint_path=reload_path,
            init_t=args.init_t,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)
    env.close()
Ejemplo n.º 34
0
def main(argv=None):
    try:
        options, args = getopt.getopt(sys.argv[1:], "s:x:b:u:mh", [
                                      "step=", "max_eps=", "buffer_size=", "hidden_unit=","monitor", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        print(usage.__doc__)
        sys.exit(1)

    GAME_NAME = 'CartPole-v1'
    AGENT_NAME = 'DQN-lr_1_e-3'
    MONITOR = False
    print_step = 10
    max_eps = 500
    buffer_size=1000000
    hidden_unit = 16
    lr=1e-3

    print(options)
    for o, v in options:
        if o in ("-h", "--help"):
            print(usage.__doc__)
            sys.exit()
        elif o in ("-m", "--monitor"):
            MONITOR = True
        elif o in ("-s", "--step"):
            print_step = int(v)
        elif o in ("-x", "--max_eps"):
            max_eps = int(v)
        elif o in ("-b", "--buffer_size"):
            buffer_size = int(v)
        elif o in ("-u", "--hidden_unit"):
            hidden_unit = int(v)
        else:
            print(usage.__doc__)
            sys.exit()

    print('process game: %s\tusing agent: %s' % (GAME_NAME, AGENT_NAME))

    # -------------------------------- loop for training -----------------------------
    # preparing env
    output_dir = '%s/%s' % (GAME_NAME, AGENT_NAME)
    cmd = 'mkdir -p %s && mkdir -p %s/%s' % (GAME_NAME, GAME_NAME, AGENT_NAME)
    os.system(cmd)

    env = gym.make(GAME_NAME)
    if MONITOR:
        env = Monitor(env, directory=output_dir, force=True, video_callable=lambda ep: ep % 10 == 0, write_upon_reset=True, mode='training')
    
    env.seed(0)

    state_num = len(env.reset())
    print(state_num)
    action_sample = env.action_space.sample()
    action_num = env.action_space.n if isinstance(action_sample, int) else len(action_sample)
    print('state_num: %d\taction_num: %d' % (state_num, action_num))
    
    device = torch.device('cpu')
    agent = DQNAgent(state_num, action_num, buffer_size=buffer_size, batch_size=128, device=device, hidden_unit=hidden_unit, lr=lr)

    scores_window = deque(maxlen=print_step)  # last 10 scores
    avg_scores = []

    for i_episode in range(max_eps):
        score = 0
        state = env.reset()
        
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        scores_window.append(score)

        print('\rEpisode {}\tAverage Score: {:.2f} '.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % print_step == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            # save model
            agent.save_model_params(output_dir, i_episode)

        avg_scores.append(np.mean(scores_window))
        sys.stdout.flush()

    env.close()
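
# Example invocation matching the getopt options above (the script name is a
# placeholder):
#
#   python dqn_cartpole.py --monitor --step 10 --max_eps 500 \
#       --buffer_size 1000000 --hidden_unit 16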
Ejemplo n.º 35
0
def main():
    # Play settings
    parser = argparse.ArgumentParser(description='A3C:Play')
    parser.add_argument(
        '--name',
        type=str,
        required=True,
        help=
        "Experiment name. All outputs will be stored in checkpoints/[name]/")
    parser.add_argument('--model_name',
                        default='best_model',
                        help='Model to play with (default: best_model)')

    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='Random seed (default: 1)')

    parser.add_argument('--n_eps',
                        type=int,
                        default=100,
                        help='# of episodes (default: 100)')
    parser.add_argument('--gpu_id',
                        type=int,
                        default=0,
                        help='GPU id (default: 0)')

    parser.add_argument('--no_render',
                        action='store_true',
                        help='Do not render to screen (default: False)')
    parser.add_argument('--random',
                        action='store_true',
                        help='Act randomly (default: False)')
    parser.add_argument('--duration',
                        type=float,
                        default=5,
                        help='How long does the play last (default: 5 [min])')
    args = parser.parse_args()

    args.save_path = os.path.join('checkpoints', args.name)
    args.model_path = os.path.join(args.save_path, 'snapshots',
                                   '{}.pth'.format(args.model_name))
    args.gif_path = os.path.join(
        args.save_path, 'gifs',
        '{}_{}'.format(args.model_name,
                       datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    with open(os.path.join(args.save_path, 'config')) as f:
        vargs = json.loads(''.join(f.readlines()))
    vargs.update(vars(args))
    args.__dict__ = vargs

    print('------------ Options -------------')
    for k, v in sorted(vars(args).items()):
        print('{}: {}'.format(k, v))
    print('-------------- End ----------------')

    if not os.path.isdir(args.gif_path):
        os.makedirs(args.gif_path)

    setproctitle('{}:play'.format(args.name))

    torch.manual_seed(args.seed)
    env = create_env(args.game_type, args.env_name, 'play', 1)
    env = Monitor(env, args.gif_path, force=True)
    env._max_episode_seconds = args.duration * 60
    env.seed(args.seed)

    model = ActorCriticLSTM(env.observation_space.shape[0], env.action_space.n)
    model.load_state_dict(torch.load(args.model_path))
    if args.gpu_id >= 0:
        with torch.cuda.device(args.gpu_id):
            model.cuda()

    model.eval()

    best_reward = None
    for eps in range(args.n_eps):
        model.reset()
        reward, _ = play_game(env,
                              model,
                              render=not args.no_render,
                              rand=args.random,
                              gpu_id=args.gpu_id)
        best_reward = reward if best_reward is None else max(
            best_reward, reward)
        print('EPS: {}/{}, Reward: {}'.format(eps + 1, args.n_eps, reward))

    env.close()

    if args.n_eps > 10:
        gym.upload(args.gif_path, api_key='sk_aQXs9Po5RUyv0ZDQnkZ2A')
    os.rename(args.gif_path, args.gif_path + '_' + str(best_reward))
Ejemplo n.º 36
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=20000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.train.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    ############################################################
    # YOUR CODE 1: Populate the replay memory!
    # Hint: use the function "populate_replay_buffer"
    # about 1 line of code
    replay_memory = populate_replay_buffer(sess, env, state_processor, replay_memory_init_size, VALID_ACTIONS,
                                           Transition, policy)

    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):
        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_process(sess, state_processor, state)
        loss = None

        # One step in the environment
        for t in itertools.count():
            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            ###########################################################
            # YOUR CODE 2: Target network update
            # Hint: use the function "copy_model_parameters"
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            ##############################################
            # YOUR CODE 3: Take a step in the environment
            # Hint 1: be careful to use the 'state_process' helper to handle the raw RGB state
            # Hint 2: see "populate_replay_buffer()" for details on how to take a step
            # about 2 or 3 lines of code
            action = np.random.choice(len(VALID_ACTIONS), p=policy(sess, state, epsilon))
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            #############################
            # YOUR CODE 4: Save transition to replay memory
            # Hint: see the function 'populate_replay_buffer' for details
            # about 1 or 2 lines of code
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            #########################################################
            # YOUR CODE 5: Sample a minibatch from the replay memory
            # Hint: use "random.sample(replay_memory, batch_size)" to get the minibatch
            # about 1 or 2 lines of code
            #minibatch = np.array(rd.sample(replay_memory, batch_size))
            samples = rd.sample(replay_memory, batch_size)
            s, a, r, s_next, done_ = map(np.array, zip(*samples))

            ###########################################################
            # YOUR CODE 6: use the minibatch to calculate Q values and targets
            # Hint 1: use 'q_estimator.predict' to get the Q values
            # Hint 2: use 'target_estimator.predict' to get the target values;
            #         remember: target = reward + gamma * max_a' Q_target(s', a')
            # about 2 lines of code

            q_eval_next = q_estimator.predict(sess, s_next)
            best_actions = np.argmax(q_eval_next, axis=1)
            q_target_next = target_estimator.predict(sess, s_next)
            q_targets = r + np.invert(done_).astype(np.float32) * discount_factor * q_target_next[np.arange(batch_size), best_actions]

            ################################################
            # YOUR CODE 7: Perform a gradient descent update
            # Hint: use 'q_estimator.update'
            # about 1 line of code
            loss = q_estimator.update(sess, np.array(s), a, q_targets)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length",
                                  tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.close()
    return stats
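The target computation in YOUR CODE 6 above is the Double-DQN form: the online q_estimator picks the greedy action for s' and the target_estimator evaluates it. A small NumPy-only sketch of just that step, with the two predict() calls replaced by made-up arrays (everything below is illustrative and not part of the assignment code):

import numpy as np

def double_dqn_targets(rewards, dones, q_online_next, q_target_next, gamma=0.99):
    """Double-DQN targets: r + gamma * Q_target(s', argmax_a Q_online(s', a)) for non-terminal s'."""
    best_actions = np.argmax(q_online_next, axis=1)  # greedy choice from the online network
    q_selected = q_target_next[np.arange(len(rewards)), best_actions]  # evaluated by the target network
    return rewards + (1.0 - dones.astype(np.float32)) * gamma * q_selected

# Tiny usage example with made-up batch values
rewards = np.array([1.0, 0.0])
dones = np.array([False, True])
q_online_next = np.array([[0.2, 0.8], [0.5, 0.1]])
q_target_next = np.array([[0.3, 0.6], [0.4, 0.2]])
print(double_dqn_targets(rewards, dones, q_online_next, q_target_next))
# -> [1.594, 0.0]  (1 + 0.99 * 0.6 for the first transition, reward only for the terminal one)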
Ejemplo n.º 37
0
def run_trial(args):
    # tries to get agent type
    agent_t = args.agent
    results_dir = ''
    if agent_t == AgentType.Testing:

        # tries to load config from provided results dir path
        results_dir = args.results if args.results is not None else \
            get_agent_output_dir(DEFAULT_CONFIG, AgentType.Learning)
        config_file = join(results_dir, 'config.json')
        if not exists(results_dir) or not exists(config_file):
            raise ValueError('Could not load configuration from: {}.'.format(config_file))
        config = EnvironmentConfiguration.load_json(config_file)

        # if testing, we want to force a seed different than training (diff. test environments)
        config.seed += 1

    else:
        # tries to load env config from provided file path
        config_file = args.config
        config = DEFAULT_CONFIG if config_file is None or not exists(config_file) \
            else EnvironmentConfiguration.load_json(config_file)

    # creates env helper
    helper = create_helper(config)

    # checks for provided output dir
    output_dir = args.output if args.output is not None else get_agent_output_dir(config, agent_t, args.trial)
    if not exists(output_dir):
        makedirs(output_dir)

    # saves / copies configs to file
    config.save_json(join(output_dir, 'config.json'))
    helper.save_state_features(join(output_dir, 'state_features.csv'))

    # register environment in Gym according to env config
    env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial)
    helper.register_gym_environment(env_id, False, FPS, SHOW_SCORE_BAR)

    # create environment and monitor
    env = gym.make(env_id)
    # TODO: the number of episodes is hard-coded here; make it configurable
    config.num_episodes = 100
    video_callable = video_schedule(config, args.record)
    env = Monitor(env, directory=output_dir, force=True, video_callable=video_callable)

    # adds reference to monitor to allow for gym environments to update video frames
    if video_callable(0):
        env.env.monitor = env

    # initialize seeds (one for the environment, another for the agent)
    env.seed(config.seed + args.trial)
    agent_rng = np.random.RandomState(config.seed + args.trial)

    # creates the agent
    agent, exploration_strategy = create_agent(helper, agent_t, agent_rng)

    # if testing, loads tables from file (some will be filled by the agent during the interaction)
    if agent_t == AgentType.Testing:
        agent.load(results_dir)

    # runs episodes
    behavior_tracker = BehaviorTracker(config.num_episodes)
    recorded_episodes = []
    for e in range(config.num_episodes):

        # checks whether to activate video monitoring
        env.env.monitor = env if video_callable(e) else None

        # reset environment
        old_obs = env.reset()
        old_s = helper.get_state_from_observation(old_obs, 0, False)

        if args.verbose:
            helper.update_stats_episode(e)
        exploration_strategy.update(e)

        t = 0
        done = False
        while not done:
            # select action
            a = agent.act(old_s)

            # observe transition
            obs, r, done, _ = env.step(a)
            s = helper.get_state_from_observation(obs, r, done)
            r = helper.get_reward(old_s, a, r, s, done)

            # update agent and stats
            agent.update(old_s, a, r, s)
            behavior_tracker.add_sample(old_s, a)
            helper.update_stats(e, t, old_obs, obs, old_s, a, r, s)

            old_s = s
            old_obs = obs
            t += 1

        # adds to recorded episodes list
        if video_callable(e):
            recorded_episodes.append(e)

        # signals new episode to tracker
        behavior_tracker.new_episode()

    # writes results to files
    agent.save(output_dir)
    behavior_tracker.save(output_dir)
    write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv'))
    helper.save_stats(join(output_dir, 'results'), CLEAR_RESULTS)
    print('\nResults of trial {} written to:\n\t\'{}\''.format(args.trial, output_dir))

    env.close()
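video_schedule is defined elsewhere in this project and is not shown here; all the Monitor requires is that video_callable map an episode index to a bool (or be False to disable recording, as in the earlier examples). A plausible sketch, assuming a hypothetical video_frequency field on the config:

def video_schedule(config, record):
    """Illustrative sketch only: the project's real video_schedule may differ."""
    if not record:
        # A callable that always returns False disables video recording entirely.
        return lambda episode_id: False
    freq = getattr(config, 'video_frequency', 50)  # 'video_frequency' is an assumed attribute
    # Record the first episode and then every freq-th episode.
    return lambda episode_id: episode_id == 0 or episode_id % freq == 0

# Usage mirrors the call above:
# video_callable = video_schedule(config, args.record)
# env = Monitor(env, directory=output_dir, force=True, video_callable=video_callable)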
Ejemplo n.º 38
0
            for i, act_name in enumerate(action_names):
                print('{:10.5f}'.format(agent.q[s][i]), end='\t')
            print()

            # checks if save feature was pressed
            if save_features:
                helper.save_features_image(obs_vec,
                                           join(output_dir, 'features.png'))
                save_features = False

            # checks if save environment was pressed
            if save_environment:
                save_image(env, join(output_dir, 'environment.png'))
                save_environment = False

            window_still_open = env.render() is not None
            old_s = s
            old_obs = obs
            t += 1

        # signals new episode to tracker
        behavior_tracker.new_episode()
        e += 1

    # writes results to files
    behavior_tracker.save(output_dir)
    helper.save_stats(join(output_dir, 'results'))

    print('\nResults written to:\n\t\'{}\''.format(output_dir))
    env.close()
Ejemplo n.º 39
0
    def train(self, table, discount_factor, start_epsilon, end_epsilon,
              learning_env, testing_env, total_observations, test_interval,
              total_number_of_testing_episodes,
              gym_training_logs_directory_path,
              gym_testing_logs_directory_path, table_saving_interval):
        """Train the GLIE Monte Carlo agent

        table -- Table storing Q values and visit numbers for state action pairs
        discount_factor -- Quantifies how much the agent cares about future
                           rewards while learning. Often referred to as gamma in
                           the literature.
        start_epsilon -- Probability of random actions at start of training
        end_epsilon -- Probability of random actions at end of training
        learning_env -- A Gym environment (wrapped or vanilla) used for learning
        testing_env -- A Gym environment (wrapped or vanilla) used for testing.
        total_observations -- Train till this observation number
        test_interval -- Test after this many observations
        total_number_of_testing_episodes -- Number of episodes to test the agent
                                            in every testing round
        gym_training_logs_directory_path -- Directory to save automatic Gym logs
                                            related to training. We save the
                                            rewards for every learning episode.
        gym_testing_logs_directory_path -- Directory to save automatic Gym logs
                                           related to testing. We save a video
                                           for the first test episode.
        table_saving_interval -- Save the table (i.e. write the table to file) after
                                 this many observations.
        """

        # This keeps track of the number of observations made so far
        observation_number = 0

        # Keep count of the episode number
        episode_number = 1

        # The learning env should always be wrapped by the Monitor provided
        # by Gym. This lets us automatically save the rewards for every episode.

        learning_env = Monitor(
            learning_env,
            gym_training_logs_directory_path,
            # Don't want video recording during training, only during testing
            video_callable=False,
            # Write after every reset so that we don't lose data for
            # prematurely interrupted training runs
            write_upon_reset=True,
        )

        while observation_number < total_observations:

            # Need a list to hold the relevant information for this episode.
            # This will be used in updating Q values.
            # structure : [
            #   {
            #       "observation" : observation,
            #       "action" : action,
            #       "reward" : immediate reward
            #       },...
            #    ]
            episode_history = []

            # initialize environment
            observation = learning_env.reset()

            total_rewards_obtained_in_this_episode = 0

            # Execute an episode
            while True:

                epsilon = self.get_epsilon(start_epsilon, end_epsilon,
                                           observation_number,
                                           total_observations)

                # use the epsilon-greedy policy to choose an action
                action = self.get_action(learning_env, table, observation,
                                         epsilon)

                # take the action determined by the epsilon-greedy policy
                next_observation, reward, done, info = learning_env.step(
                    action)

                # add the current state and resulting reward to history
                episode_history.append({
                    "observation": observation,
                    "action": action,
                    "reward": reward
                })

                observation = next_observation

                observation_number += 1
                # Test the current performance after every test_interval
                if observation_number % test_interval == 0:
                    # The testing env is also wrapped by a Monitor so that we
                    # can take automatic videos during testing. We will take a
                    # video for the very first testing episode.

                    video_callable = lambda count: count == 0

                    # Since the environment is closed after every testing round,
                    # the videos for different testing rounds would end up having
                    # the same name! To differentiate the videos, we pass
                    # a unique uid parameter.

                    monitored_testing_env = Monitor(
                        testing_env,
                        gym_testing_logs_directory_path,
                        video_callable=video_callable,
                        resume=True,
                        uid=observation_number / test_interval)

                    # Run the test
                    average_reward = self.test(
                        monitored_testing_env,
                        total_number_of_episodes=
                        total_number_of_testing_episodes,
                        table=table,
                        epsilon=0,
                        render=False)
                    print(
                        "[{0}] Episode number : {1}, Observation number : {2} "
                        "Average reward (100 eps) : {3}".format(
                            datetime.datetime.now(), episode_number,
                            observation_number, average_reward))

                total_rewards_obtained_in_this_episode += reward

                if done:
                    # episode has ended, update Q and N values
                    self.update_table(table, discount_factor, episode_history)
                    episode_number += 1
                    # save the table at regular intervals
                    if episode_number % table_saving_interval == 0:
                        table.save()
                    break

            print("[{0}] Episode number : {1}, Obervation number: {2}, "
                  "Reward in this episode : {3}, Epsilon : {4}".format(
                      datetime.datetime.now(), episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode, epsilon))

        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
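get_epsilon is referenced above but not shown; the docstring only fixes its endpoints (start_epsilon at the first observation, end_epsilon by total_observations). A minimal sketch, assuming a linear anneal between those two values:

def get_epsilon(start_epsilon, end_epsilon, observation_number, total_observations):
    """Linear epsilon anneal; the project's actual schedule may differ (this is a sketch)."""
    fraction = min(observation_number / total_observations, 1.0)
    return start_epsilon + fraction * (end_epsilon - start_epsilon)

# e.g. get_epsilon(1.0, 0.05, 50_000, 100_000) -> 0.525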
Ejemplo n.º 40
0
    def evaluate(self,
                 n_games=1,
                 save_path="./records",
                 use_monitor=True,
                 record_video=True,
                 verbose=True,
                 t_max=100000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise ValueError(
                "Cannot record video without the gym Monitor. If you still want video, set use_monitor to True"
            )

        if record_video:
            env = Monitor(env, save_path, force=True)
        elif use_monitor:
            env = Monitor(env,
                          save_path,
                          video_callable=lambda i: False,
                          force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [
                np.zeros((1, ) + tuple(mem.output_shape[1:]),
                         dtype=get_layer_dtype(mem))
                for mem in self.agent.agent_states
            ]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(
                    self.preprocess_observation(observation)[None, ...],
                    *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print(
                            "Episode finished after {} timesteps with reward={}"
                            .format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards
Ejemplo n.º 41
0
    def train(self, actor, critic, discount_factor, lambda_value, learning_env,
              testing_env, horizon, minibatch_size, epochs, total_observations,
              test_interval, total_number_of_testing_episodes,
              gym_training_logs_directory_path,
              gym_testing_logs_directory_path):
        """Train the PPO agent

        actor -- The actor instance
        critic -- The critic instance
        discount_factor -- Quantifies how much the agent cares about future
                           rewards while learning. Often referred to as gamma in
                           the literature.
        lambda_value -- The lambda in TD(lambda)
        learning_env -- A Gym environment (wrapped or vanilla) used for learning
        testing_env -- A Gym environment (wrapped or vanilla) used for testing.
                       This may be different from learning_env. For example, we
                       might be scaling the rewards in the learning environment.
                       But we want to benchmark performance in a testing
                       environment where the rewards are not scaled.
        horizon -- Number of experiences to collect before performing a training
                   step. Must be an integer multiple of minibatch_size.
        minibatch_size -- Minibatch size for training the actor and critic.
        epochs -- Number of epochs of training on one set of experiences
        total_observations -- Train till this observation number
        test_interval -- Test after this many observations
        total_number_of_testing_episodes -- Number of episodes to test the agent
                                            in every testing round
        gym_training_logs_directory_path -- Directory to save automatic Gym logs
                                            related to training. We save the
                                            rewards for every learning episode.
        gym_testing_logs_directory_path -- Directory to save automatic Gym logs
                                           related to testing. We save a video
                                           for the first test episode.
        """

        # We will fill training_samples with the agent's experience till it
        # reaches a size equal to horizon. Then we will train the actor
        # and critic on this data. After training is done, we will empty the
        # list and repeat the process for the next sequence of experiences.
        training_samples = []

        # To make computing advantages and value function targets easier, we
        # put the experiences first in a different list
        # training_samples_this_episode. When the episode ends or horizon is
        # reached (whichever happens earlier), we compute advantages and
        # value function targets using this list. Then the list is emptied
        # and the data transferred to the other list training_samples.
        training_samples_this_episode = []

        # This keeps track of the number of observations made so far
        observation_number = 0

        # Keep count of the episode number
        episode_number = 1

        # The learning env should always be wrapped by the Monitor provided
        # by Gym. This lets us automatically save the rewards for every episode.

        learning_env = Monitor(
            learning_env,
            gym_training_logs_directory_path,
            # Don't want video recording during training, only during testing
            video_callable=False,
            # Write after every reset so that we don't lose data for
            # prematurely interrupted training runs
            write_upon_reset=True,
        )

        while observation_number < total_observations:

            # Start of an episode
            observation = learning_env.reset()

            # Predicted value for this observation
            value = critic.get_value(np.array([observation]))[0][0]

            total_rewards_obtained_in_this_episode = 0

            while True:

                policy = actor.get_policies(np.array([observation]))[0]
                action = actor.get_actions(np.array([policy]))[0]

                # The actor may not keep the actions within the bounds accepted
                # by the environment. Therefore, we clip the action manually to
                # make it conform to the bounds.
                clipped_action = np.clip(action, learning_env.action_space.low,
                                         learning_env.action_space.high)

                next_observation, reward, done, info = (
                    learning_env.step(clipped_action))

                # Predicted value of the next observation, necessary for
                # calculating TD error
                next_value = (critic.get_value(np.array(
                    [next_observation]))[0][0] if not done else 0)

                experience = {
                    "observation": observation,
                    "next_observation": next_observation,
                    "means": policy["means"],
                    "vars": policy["vars"],
                    "action": action,
                    "clipped_action": clipped_action,
                    "reward": reward,
                    "terminal": done,
                    "value": value,
                    "next_value": next_value,
                }

                training_samples_this_episode.append(experience)

                observation = next_observation
                value = next_value

                observation_number += 1
                # Test the current performance after every test_interval
                if observation_number % test_interval == 0:
                    # The testing env is also wrapped by a Monitor so that we
                    # can take automatic videos during testing. We will take a
                    # video for the very first testing episode.

                    video_callable = lambda count: count == 0

                    # Since the environment is closed after every testing round,
                    # the videos for different testing rounds would end up having
                    # the same name! To differentiate the videos, we pass
                    # a unique uid parameter.

                    monitored_testing_env = Monitor(
                        testing_env,
                        gym_testing_logs_directory_path,
                        video_callable=video_callable,
                        resume=True,
                        uid=observation_number / test_interval)

                    # Run the test
                    average_reward = self.test(
                        monitored_testing_env,
                        total_number_of_episodes=
                        total_number_of_testing_episodes,
                        actor=actor,
                        render=False)
                    print(
                        "[{0}] Episode number : {1}, Observation number : {2} "
                        "Average reward (100 eps) : {3}".format(
                            datetime.datetime.now(), episode_number,
                            observation_number, average_reward))

                total_rewards_obtained_in_this_episode += reward

                ## Training starts here

                # If previous episodes ended quickly before we could reach the
                # horizon, these experiences have already been transferred to
                # training_samples. So, to get the total number of experiences
                # gathered since the last training step, we have to sum up the
                # experiences gathered in this episode and the experiences
                # from previous episodes which have been transferred to
                # training_samples.
                number_of_experiences_since_last_training_step = (
                    len(training_samples_this_episode) + len(training_samples))

                # If horizon is reached or the episode ended, we compute
                # advantages and value targets using
                # training_samples_this_episode. The experiences are then
                # transferred to the list training_samples. Finally
                # training_samples_this_episode is emptied to accommodate
                # further experiences.
                if (number_of_experiences_since_last_training_step == horizon
                        or done):
                    training_samples_this_episode_with_targets = (
                        self.compute_advantages_and_value_targets(
                            training_samples_this_episode, discount_factor,
                            lambda_value))
                    training_samples += (
                        training_samples_this_episode_with_targets)
                    training_samples_this_episode = []

                # If horizon is reached, we train the actor and critic on the
                # stored experiences. Then we forget about those experiences
                # by emptying training_samples.
                if number_of_experiences_since_last_training_step == horizon:
                    self.perform_training_step(actor, critic, training_samples,
                                               minibatch_size, epochs)
                    training_samples = []

                    # After a round of training, the actor and critic weights
                    # have changed. So we use the updated model to compute the
                    # value function instead of using the value function
                    # predicted by the older models.
                    value = critic.get_value(np.array([next_observation]))[0][0]

                # Start over when the episode ends
                if done:
                    episode_number += 1
                    break

            print("[{0}] Episode number : {1}, Obervation number: {2}, "
                  "Reward in this episode : {3}".format(
                      datetime.datetime.now(), episode_number - 1,
                      observation_number,
                      total_rewards_obtained_in_this_episode))
        learning_env.close()

        # There's a bug in the Gym Monitor. The Monitor's close method does not
        # close the wrapped environment. This makes the script exit with an
        # error if the environment is being rendered at some point. To make
        # this error go away, we have to close the unwrapped testing
        # environment. The learning environment is not being rendered, so we
        # don't need to bother about that.
        testing_env.env.close()
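compute_advantages_and_value_targets is called above but not shown. Since each experience dict already carries 'reward', 'value', 'next_value' and 'terminal', a common choice is generalized advantage estimation, GAE(lambda); the sketch below assumes that estimator and writes 'advantage' and 'value_target' keys (both key names are assumptions):

def compute_advantages_and_value_targets(samples, discount_factor, lambda_value):
    """Sketch of GAE(lambda) over one episode/segment of experience dicts."""
    gae = 0.0
    # Walk the segment backwards so each advantage can reuse the next one.
    for sample in reversed(samples):
        non_terminal = 0.0 if sample["terminal"] else 1.0
        # TD error: r + gamma * V(s') - V(s)
        delta = (sample["reward"]
                 + discount_factor * sample["next_value"] * non_terminal
                 - sample["value"])
        gae = delta + discount_factor * lambda_value * non_terminal * gae
        sample["advantage"] = gae
        sample["value_target"] = gae + sample["value"]
    return samples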
Ejemplo n.º 42
0
def run_trial(args):
    # tries to get agent type
    agent_t = args.agent
    if agent_t == AgentType.Testing:
        # tries to load a pre-trained agent configuration file
        config, results_dir = load_agent_config(args.results, args.trial)
    else:
        # tries to load env config from provided file path
        config_file = args.config_file_path
        config = args.default_frogger_config if config_file is None or not exists(config_file) \
            else EnvironmentConfiguration.load_json(config_file)
    # creates env helper
    helper = create_helper(config)
    # checks for provided output dir
    output_dir = args.output if args.output is not None else \
        get_agent_output_dir(config, agent_t, args.trial)
    if not exists(output_dir):
        makedirs(output_dir)
    # saves / copies configs to file
    config.save_json(join(output_dir, 'config.json'))
    helper.save_state_features(join(output_dir, 'state_features.csv'))
    # register environment in Gym according to env config
    env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial)
    helper.register_gym_environment(env_id, False, args.fps,
                                    args.show_score_bar)
    # create environment and monitor
    env = gym.make(env_id)
    config.num_episodes = args.num_episodes
    video_callable = video_schedule(config, args.record)
    env = Monitor(env,
                  directory=output_dir,
                  force=True,
                  video_callable=video_callable)
    # adds reference to monitor to allow for gym environments to update video frames
    if video_callable(0):
        env.env.monitor = env
    # initialize seeds (one for the environment, another for the agent)
    env.seed(config.seed + args.trial)
    agent_rng = np.random.RandomState(config.seed + args.trial)
    # creates the agent
    agent, exploration_strategy = create_agent(helper, agent_t, agent_rng)
    # if testing, loads tables from file (some will be filled by the agent during the interaction)
    if agent_t == AgentType.Testing:
        agent.load(results_dir)
    # runs episodes
    behavior_tracker = BehaviorTracker(config.num_episodes)
    recorded_episodes = []
    for e in range(config.num_episodes):
        # checks whether to activate video monitoring
        env.env.monitor = env if video_callable(e) else None
        # reset environment
        old_obs = env.reset()
        old_s = helper.get_state_from_observation(old_obs, 0, False)
        if args.verbose:
            print(f'Episode: {e}')
            # helper.update_stats_episode(e)
        exploration_strategy.update(e)  # update for learning agent
        t = 0
        done = False
        while not done:
            # select action
            a = agent.act(old_s)
            # observe transition
            obs, r, done, _ = env.step(a)
            s = helper.get_state_from_observation(obs, r, done)
            r = helper.get_reward(old_s, a, r, s, done)
            # update agent and stats
            agent.update(old_s, a, r, s)
            behavior_tracker.add_sample(old_s, a)
            helper.update_stats(e, t, old_obs, obs, old_s, a, r, s)
            old_s = s
            old_obs = obs
            t += 1
        # adds to recorded episodes list
        if video_callable(e):
            recorded_episodes.append(e)
        # signals new episode to tracker
        behavior_tracker.new_episode()
    # writes results to files
    agent.save(output_dir)
    behavior_tracker.save(output_dir)
    write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv'))
    helper.save_stats(join(output_dir, 'results'), args.clear_results)
    print('\nResults of trial {} written to:\n\t\'{}\''.format(
        args.trial, output_dir))
    env.close()
Ejemplo n.º 43
0
class BaseAgent:
    def __init__(self, config):
        self.config = config
        self.env = config['env']
        make_seed(config['seed'])
        self.env.seed(config['seed'])
        self.use_cuda = config['use_cuda']
        self.gamma = config['gamma']
        self.verbose = config['verbose']
        self.max_episode_length = config['max_episode_length']
        self.use_mean_baseline = config.get('use_mean_baseline', False)

        self.model = config['model']

        # the optimizer used by PyTorch (Stochastic Gradient, Adagrad, Adam, etc.)
        self.optimizer = torch.optim.Adam(self.model.net.parameters(),
                                          lr=config['learning_rate'])
        self.monitor_env = Monitor(self.env,
                                   "./gym-results",
                                   force=True,
                                   video_callable=lambda episode: True)

    @abc.abstractmethod
    def _compute_returns(self, rewards):
        """Returns the cumulative discounted rewards at each time step

        Parameters
        ----------
        rewards : array
            The array of rewards of one episode

        Returns
        -------
        array
            The cumulative discounted rewards at each time step

        Example
        -------
        for rewards=[1, 2, 3] this method outputs [1 + 2 * gamma + 3 * gamma**2, 2 + 3 * gamma, 3]
        """

        raise NotImplementedError

    def sample_trajectories(self, n_trajectories):
        trajectories = []
        for _ in range(n_trajectories):
            states = [
                torch.from_numpy(self.env.reset()).type(self.model.dtype)
            ]
            actions = []
            rewards = []
            log_probs = []

            done = False
            count = 0

            # stop after self.max_episode_length steps,
            # otherwise episodes run for too long when
            # the agent is skilled enough
            while not done and count < self.max_episode_length:
                action = int(self.model.select_action(states[-1]))

                prob = self.model.forward(states[-1])
                # clip prob away from 0 and 1 to avoid numerical issues when taking the log
                prob = torch.clamp(prob,
                                   np.finfo(np.float32).eps,
                                   1 - np.finfo(np.float32).eps)
                log_prob = torch.log(prob)

                state, reward, done, _ = self.env.step(action)
                states.append(torch.from_numpy(state).type(self.model.dtype))
                actions.append(action)
                rewards.append(reward)
                log_probs.append(log_prob)

                count += 1

            trajectories.append({
                'states': states,
                'actions': actions,
                'rewards': rewards,
                'log_probs': log_probs,
            })

        return trajectories

    @abc.abstractmethod
    def optimize_model(self, n_trajectories):
        """Perform a gradient update using n_trajectories

        Parameters
        ----------
        n_trajectories : int
            The number of trajectories used to approximate the expectation card(D) in the formula above

        Returns
        -------
        array
            The cumulative discounted rewards of each trajectory
        """
        raise NotImplementedError

    def train(self, n_trajectories, n_update):
        """Training method

        Parameters
        ----------
        n_trajectories : int
            The number of trajectories used to approximate the expected gradient
        n_update : int
            The number of gradient updates

        """

        rewards = []
        for episode in range(n_update):
            rewards.append(self.optimize_model(n_trajectories))
            if (episode + 1) % self.verbose == 0:
                rewards_np = np.array(rewards)
                # Print the reward stats averaged across the last self.verbose updates
                mean = rewards_np[-self.verbose:].mean()
                std = rewards_np[-self.verbose:].std()
                print(
                    f'Episode {episode + 1}/{n_update}: rewards {round(mean, 2)} +/- {round(std, 2)}'
                )

        # Plotting
        r = pd.DataFrame((itertools.chain(*(itertools.product([i], rewards[i])
                                            for i in range(len(rewards))))),
                         columns=['Epoch', 'Reward'])
        sns.lineplot(x="Epoch", y="Reward", data=r, ci='sd')

    def evaluate(self, render=False):
        """Evaluate the agent on a single trajectory
        """

        observation = self.monitor_env.reset()
        observation = torch.tensor(observation, dtype=torch.float)
        reward_episode = 0
        done = False

        while not done:
            action = self.model.select_action(observation)
            observation, reward, done, info = self.monitor_env.step(
                int(action))
            observation = torch.tensor(observation, dtype=torch.float)
            reward_episode += reward

        self.monitor_env.close()
        if render:
            self.env.render()
        print(f'Reward: {reward_episode}')
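_compute_returns itself is left to the subclasses; a standalone sketch consistent with the docstring example above (rewards=[1, 2, 3] gives [1 + 2*gamma + 3*gamma**2, 2 + 3*gamma, 3]), with self.gamma passed in as an argument:

import numpy as np

def compute_returns(rewards, gamma):
    """Standalone sketch of BaseAgent._compute_returns (self.gamma becomes the gamma argument)."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    # Backward pass: G_t = r_t + gamma * G_{t+1}
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(compute_returns([1, 2, 3], gamma=0.9))
# -> [5.23 4.7  3.  ]  i.e. [1 + 2*0.9 + 3*0.9**2, 2 + 3*0.9, 3]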