Example #1
import os

import cv2
import gym
import numpy as np

# Assumed imports: ProcessFrame84 and FrameStack are preprocessing wrappers
# from the surrounding project (e.g. a local wrappers module); adjust the
# path to match your layout.
from wrappers import ProcessFrame84, FrameStack


class Evaluator(object):
    def __init__(self, env_name, num_episodes, exp_name, policy):
        self.exp_name = exp_name
        self.env = gym.make(env_name)
        self.env = ProcessFrame84(self.env, crop=False)
        self.env = FrameStack(self.env, 4)
        self.num_episodes = num_episodes
        self.ep_len = 4500
        self.policy = policy
        if not os.path.exists('images'):
            os.mkdir('images')
        self.image_folder = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), 'images')

    def format_obs(self, obs_name, obs):
        nums = ",".join(map(str, obs))
        dict_format = "{" + nums + "}"
        final_str = "observation \"{}\" - {}\n".format(obs_name, dict_format)
        return final_str
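
    # Example output of format_obs (hypothetical values):
    #   format_obs("VEL.TRANS", [0.0, 1.5, -2.0])
    #   -> 'observation "VEL.TRANS" - {0.0,1.5,-2.0}\n'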

    def eval_model(self, ep_num):
        for i in range(self.num_episodes):
            trajectory_file = self.exp_name + "_ep" + str(
                ep_num) + "_itr" + str(i) + "_trajectory.txt"
            if not os.path.exists("trajectories"):
                os.makedirs("trajectories")
            trajectory_path = os.path.join("trajectories", trajectory_file)
            ep_images = []
            ob = self.env.reset()
            ob = np.array(ob)
            eprews = []
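            # keep rendered frames only for the first rollout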
            if i == 0:
                ep_images.append(self.env.unwrapped._last_observation)
            for step in range(self.ep_len):
                action, vpred, nlp = self.policy.get_ac_value_nlp_eval(ob)
                ob, rew, done, info = self.env.step(action[0])
                if i == 0:
                    ep_images.append(self.env.unwrapped._last_observation)
                if rew is None:
                    eprews.append(0)
                else:
                    eprews.append(rew)
                if step > 0:
                    # query agent pose and velocity from the underlying env
                    pos_trans, pos_rot, vel_trans, vel_rot = \
                        self.env.unwrapped.get_pos_and_vel()

                    with open(trajectory_path, 'a') as f:
                        f.write(self.format_obs("DEBUG.POS.TRANS", pos_trans))
                        f.write(self.format_obs("DEBUG.POS.ROT", pos_rot))
                        f.write(self.format_obs("VEL.TRANS", vel_trans))
                        f.write(self.format_obs("VEL.ROT", vel_rot))

            for j in range(len(ep_images)):
                image_file = os.path.join(
                    self.image_folder,
                    self.exp_name + "_{}_{}_{}_".format(ep_num, i, j) + ".png")
                cv2.imwrite(image_file, ep_images[j])
            print("Episode {} cumulative reward: {}".format(i, sum(eprews)))
Example #2
import os
import random

import cv2
import gym
import numpy as np
from tensorflow.keras import layers, models

# Assumed imports: WarpFrame matches the wrapper in
# baselines.common.atari_wrappers; ProcessFrame84 and FrameStack are again
# taken to come from a local wrappers module.
from baselines.common.atari_wrappers import WarpFrame
from wrappers import ProcessFrame84, FrameStack


class Evaluator(object):
    def __init__(self, env_name, num_episodes, exp_name, policy):
        self.exp_name = exp_name
        self.env = gym.make(env_name)
        self.env = ProcessFrame84(self.env, crop=False)
        self.env = FrameStack(self.env, 4)
        self.num_episodes = num_episodes
        self.policy = policy
        if not os.path.exists('images'):
            os.mkdir('images')
        self.image_folder = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), 'images')
        print('Image folder', self.image_folder)

    def eval_model(self, ep_num):
        for i in range(self.num_episodes):
            ep_images = []
            ob = self.env.reset()
            ob = np.array(ob)
            eprews = []
            if i == 0:
                ep_images.append(self.env.unwrapped._last_observation)
            for step in range(900):
                action, vpred, nlp = self.policy.get_ac_value_nlp_eval(ob)
                ob, rew, done, info = self.env.step(action[0])
                if i == 0:
                    ep_images.append(self.env.unwrapped._last_observation)
                if rew is None:
                    eprews.append(0)
                else:
                    eprews.append(rew)
                if done:
                    print("Episode finished after {} timesteps".format(step +
                                                                       1))
                    print("Episode Reward is {}".format(sum(eprews)))
                    break
            for j in range(len(ep_images)):
                image_file = os.path.join(
                    self.image_folder,
                    self.exp_name + "_{}_{}_{}_".format(ep_num, i, j) + ".png")
                cv2.imwrite(image_file, ep_images[j])
            print("Episode {} cumulative reward: {}".format(i, sum(eprews)))
class DeepQNetwork:
    def __init__(self):
        self.env = gym.make('SpaceInvaders-v0')
        self.replay_buffer = []
        self.replay_buffer_size_thresh = 100000
        self.env = WarpFrame(self.env)
        self.env = FrameStack(self.env, 4)
        self.episodes = 100
        self.max_actions_per_episode = 100
        self.epsilon = 1
        self.min_epsilon = 0.01
        self.eps_decay = 0.00025
        self.decay_step = 0
        self.learning_rate = 0.8  # unused below; RMSprop's default applies
        self.discount_factor = 0.99
        self.rewards = []
        self.test_eps = 50
        self.test_rewards = []
        self.model = None
        self.batch_size = 64
        self.model_path = 'models/DQN.hdf5'

    def create_model(self):
        inputs = layers.Input(shape=(84, 84, 4))

        # three conv blocks (conv -> batch norm -> ReLU) extract features
        conv1 = layers.Conv2D(32, 8, 2)(inputs)
        batch_norm1 = layers.BatchNormalization()(conv1)
        relu1 = layers.Activation('relu')(batch_norm1)

        conv2 = layers.Conv2D(64, 4, 2)(relu1)
        batch_norm2 = layers.BatchNormalization()(conv2)
        relu2 = layers.Activation('relu')(batch_norm2)

        conv3 = layers.Conv2D(128, 4, 2)(relu2)
        batch_norm3 = layers.BatchNormalization()(conv3)
        relu3 = layers.Activation('relu')(batch_norm3)

        x = layers.Flatten()(relu3)
        fc1 = layers.Dense(512, activation='relu')(x)
        fc2 = layers.Dense(self.env.action_space.n)(fc1)  # one Q-value per action

        model = models.Model(inputs=inputs, outputs=fc2)
        model.compile(optimizer='rmsprop', loss='mse')
        model.summary()
        self.model = model

    def save_to_memory(self, experience):
        # experience = (observation, action, reward, done, new_observation)
        if len(self.replay_buffer) > self.replay_buffer_size_thresh:
            del self.replay_buffer[0]
        self.replay_buffer.append(experience)

    def sample_from_memory(self):
        return random.sample(self.replay_buffer,
                             min(len(self.replay_buffer), self.batch_size))

    def fill_empty_memory(self):
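        # warm-start the replay buffer before training; epsilon is still 1.0
        # at this point, so the logged actions are effectively random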
        observation = self.env.reset()
        for _ in range(10000):
            new_observation, action, reward, done = self.take_action(
                observation)
            self.save_to_memory(
                (observation, action, reward, done, new_observation))
            if done:
                new_observation = self.env.reset()
            observation = new_observation

    def take_action(self, observation):
        # take a random action (explore)
        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()
        # take best action
        else:
            action = np.argmax(
                self.model.predict(np.expand_dims(observation, axis=0)))

        # take action
        new_observation, reward, done, info = self.env.step(action)
        new_observation = np.asarray(list(new_observation))
        return new_observation, action, reward, done

    def optimize_model(self):
        # sample minibatch from memory
        minibatch = self.sample_from_memory()
        x_batch = []
        q_targets = []

        # for each experience in minibatch, set q-target
        for state, act, rew, done, next_state in minibatch:
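            # Bellman target: Q(s, a) <- r + gamma * max_a' Q(s', a');
            # terminal transitions use the immediate reward alone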
            x_batch.append(state)
            if done:
                next_state_q_value = 0
            else:
                next_state_q_value = np.max(
                    self.model.predict(
                        np.expand_dims(np.asarray(list(next_state)), axis=0)))

            curr_q_vals = self.model.predict(
                np.expand_dims(np.asarray(list(state)), axis=0))
            curr_q_vals[0][
                act] = rew + self.discount_factor * next_state_q_value
            q_targets.append(curr_q_vals[0])

        # train agent on minibatch
        self.model.fit(np.asarray(x_batch),
                       np.asarray(q_targets),
                       batch_size=len(minibatch),
                       verbose=0)

    def train(self):
        # initialize deep-q agent
        self.create_model()

        # fill empty memory before starting training
        self.fill_empty_memory()

        for i in range(self.episodes):
            print("Episode: ", i)
            observation = np.asarray(list(self.env.reset()))
            total_reward_per_episode = 0

            for _ in range(self.max_actions_per_episode):
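                # anneal epsilon exponentially from 1.0 toward min_epsilon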
                self.epsilon = self.min_epsilon + (
                    1 - self.min_epsilon) * np.exp(
                        -self.eps_decay * self.decay_step)
                self.decay_step += 1

                # take step
                new_observation, action, reward, done = self.take_action(
                    observation)

                # save to experience buffer
                self.save_to_memory(
                    (observation, action, reward, done, new_observation))

                # fit model
                self.optimize_model()

                # track reward per episode
                total_reward_per_episode += reward

                # update state
                observation = new_observation

                if done:
                    break

            self.rewards.append(total_reward_per_episode)
            print("Reward: ", total_reward_per_episode)
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
        self.model.save(self.model_path)
        self.env.close()

        print("Average reward: ", sum(self.rewards) / self.episodes)

    def test(self):
        # test agent
        self.model = models.load_model(self.model_path)
        for i in range(self.test_eps):
            observation = np.asarray(list(self.env.reset()))
            total_reward_per_episode = 0
            for _ in range(self.max_actions_per_episode):
                self.env.render()
                action = np.argmax(
                    self.model.predict(np.expand_dims(observation, axis=0)))
                new_observation, reward, done, info = self.env.step(action)
                total_reward_per_episode += reward
                observation = new_observation
                if done:
                    break
            self.test_rewards.append(total_reward_per_episode)

        print("Average reward for test agent: ",
              sum(self.test_rewards) / self.test_eps)
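
A hedged end-to-end driver for DeepQNetwork (it assumes the imports above and
a working Gym Atari 'SpaceInvaders-v0' installation):

if __name__ == '__main__':
    agent = DeepQNetwork()
    agent.train()  # warm-starts the buffer, then trains for 100 episodes
    agent.test()   # reloads the saved model and evaluates for 50 episodes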