Example #1
# Assumed imports (paths follow the OpenAI baselines and absl conventions implied
# by the calls below; save_model and deepq_callback are project-specific helpers
# defined elsewhere in the original source):
from operator import attrgetter

import sc2g.env  # project-specific package of SC2 gym environments (assumed path)
from absl import flags
from baselines import deepq
from baselines.common.atari_wrappers import FrameStack

FLAGS = flags.FLAGS


def train():
    # Fetch the requested environment set in flags.
    env_class = attrgetter(FLAGS.env)(sc2g.env)

    env = env_class.make_env(
        map_name=FLAGS.map_name,
        feature_screen_size=FLAGS.screen_size,
        feature_minimap_size=FLAGS.minimap_size,
        visualize=FLAGS.visualize,
        save_replay_episodes=FLAGS.save_replay_episodes,
        replay_dir=FLAGS.replay_dir,
    )

    # Stack the last n frames so the agent can perceive motion; baselines'
    # FrameStack returns lazy, memory-efficient frame arrays.
    if FLAGS.num_stack_frames > 0:
        print("Stack frames enabled: n=%d" % FLAGS.num_stack_frames)
        env = FrameStack(env, FLAGS.num_stack_frames)

    model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                    hiddens=[256],
                                    dueling=True)

    act = deepq.learn(
        env,
        q_func=model,
        lr=FLAGS.learning_rate,  # Learning rate for adam optimizer
        max_timesteps=FLAGS.max_timesteps,  # Max timesteps
        buffer_size=FLAGS.buffer_size,  # Size of replay buffer
        exploration_fraction=FLAGS.exploration_fraction,  # Fraction of max_timesteps over which exploration rate is annealed
        exploration_final_eps=FLAGS.exploration_final_eps,  # Final value of random action probability
        train_freq=FLAGS.train_freq,  # How often the model is updated, in steps
        print_freq=FLAGS.print_freq,  # How often training progress is printed, in episodes
        checkpoint_freq=FLAGS.checkpoint_freq,  # How often to save the model, in steps
        learning_starts=FLAGS.learning_starts,  # How many steps before learning starts
        gamma=FLAGS.gamma,  # Discount factor
        target_network_update_freq=FLAGS.target_network_update_freq,  # How often the target network is updated
        prioritized_replay=FLAGS.prioritized_replay,
        callback=deepq_callback,
    )

    print("Saving model...")
    save_model(act)

    print("Saving replay...")
    env.unwrapped.sc2_env.save_replay(FLAGS.map_name)

    print("Closing environment...")
    env.close()
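
The train() function above reads every hyperparameter from FLAGS, but the flag
definitions themselves are not part of the snippet. The sketch below shows how
they could be declared and wired up, assuming absl.flags is the flag library in
use and that this code sits in the same file as train(); the flag names mirror
the FLAGS.<name> reads above, while all defaults and help strings are
illustrative only.

from absl import app, flags

flags.DEFINE_string("env", None, "Dotted path of the environment class inside sc2g.env.")
flags.DEFINE_string("map_name", "MoveToBeacon", "SC2 mini-game map to load.")
flags.DEFINE_integer("screen_size", 84, "Feature screen resolution.")
flags.DEFINE_integer("minimap_size", 64, "Feature minimap resolution.")
flags.DEFINE_bool("visualize", False, "Render the pygame visualisation.")
flags.DEFINE_integer("save_replay_episodes", 0, "Save a replay every N episodes (0 disables).")
flags.DEFINE_string("replay_dir", "replays", "Directory for saved replays.")
flags.DEFINE_integer("num_stack_frames", 4, "Number of frames to stack (0 disables stacking).")
flags.DEFINE_float("learning_rate", 5e-4, "Adam learning rate.")
flags.DEFINE_integer("max_timesteps", 2000000, "Total number of environment steps.")
flags.DEFINE_integer("buffer_size", 100000, "Replay buffer size.")
flags.DEFINE_float("exploration_fraction", 0.1, "Fraction of max_timesteps over which epsilon is annealed.")
flags.DEFINE_float("exploration_final_eps", 0.01, "Final value of the random-action probability.")
flags.DEFINE_integer("train_freq", 4, "Update the model every N steps.")
flags.DEFINE_integer("print_freq", 10, "Print training progress every N episodes.")
flags.DEFINE_integer("checkpoint_freq", 10000, "Save the model every N steps.")
flags.DEFINE_integer("learning_starts", 10000, "Steps of pure exploration before learning starts.")
flags.DEFINE_float("gamma", 0.99, "Discount factor.")
flags.DEFINE_integer("target_network_update_freq", 500, "Target network sync interval, in steps.")
flags.DEFINE_bool("prioritized_replay", False, "Use prioritized experience replay.")


def main(unused_argv):
    train()


if __name__ == "__main__":
    app.run(main)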

# Assumed imports for the DeepQNetwork example (the original may use standalone
# Keras rather than tf.keras):
import random

import gym
import numpy as np
from baselines.common.atari_wrappers import FrameStack, WarpFrame
from tensorflow.keras import layers, models


class DeepQNetwork:
    def __init__(self):
        self.env = gym.make('SpaceInvaders-v0')
        self.replay_buffer = []
        self.replay_buffer_size_thresh = 100000
        self.env = WarpFrame(self.env)
        self.env = FrameStack(self.env, 4)
        self.episodes = 100
        self.max_actions_per_episode = 100
        self.epsilon = 1
        self.min_epsilon = 0.01
        self.eps_decay = 0.00025
        self.decay_step = 0
        self.learning_rate = 0.8
        self.discount_factor = 0.99
        self.rewards = []
        self.test_eps = 50
        self.test_rewards = []
        self.model = None
        self.batch_size = 64
        self.model_path = 'models/DQN.hdf5'

    def create_model(self):
        # Convolutional Q-network: three Conv2D -> BatchNorm -> ReLU blocks on the
        # 84x84x4 stacked-frame input, then a dense head that outputs one Q-value
        # per action.
        inputs = layers.Input(shape=(84, 84, 4))

        conv1 = layers.Conv2D(32, 8, 2)(inputs)
        batch_norm1 = layers.BatchNormalization()(conv1)
        relu1 = layers.Activation('relu')(batch_norm1)

        conv2 = layers.Conv2D(64, 4, 2)(relu1)
        batch_norm2 = layers.BatchNormalization()(conv2)
        relu2 = layers.Activation('relu')(batch_norm2)

        conv3 = layers.Conv2D(128, 4, 2)(relu2)
        batch_norm3 = layers.BatchNormalization()(conv3)
        relu3 = layers.Activation('relu')(batch_norm3)

        x = layers.Flatten()(relu3)
        fc1 = layers.Dense(512, activation='relu')(x)  # ReLU hidden layer (a purely linear layer would collapse into the output)
        fc2 = layers.Dense(self.env.action_space.n)(fc1)  # one linear Q-value per action

        model = models.Model(inputs=inputs, outputs=fc2)
        model.compile(optimizer='rmsprop', loss='mse')
        model.summary()
        self.model = model

    def save_to_memory(self, experience):
        # experience = (observation, action, reward, done, new_observation)
        if len(self.replay_buffer) > self.replay_buffer_size_thresh:
            del self.replay_buffer[0]
        self.replay_buffer.append(experience)

    def sample_from_memory(self):
        return random.sample(self.replay_buffer,
                             min(len(self.replay_buffer), self.batch_size))

    def fill_empty_memory(self):
        # Pre-fill the replay buffer with transitions from the (initially fully
        # random) epsilon-greedy policy before training starts.
        observation = self.env.reset()
        for _ in range(10000):
            new_observation, action, reward, done = self.take_action(
                observation)
            self.save_to_memory(
                (observation, action, reward, done, new_observation))
            if done:
                new_observation = self.env.reset()
            observation = new_observation

    def take_action(self, observation):
        # take a random action with probability epsilon (exploration)
        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()
        # take best action
        else:
            action = np.argmax(
                self.model.predict(np.expand_dims(observation, axis=0)))

        # take action
        new_observation, reward, done, info = self.env.step(action)
        new_observation = np.asarray(list(new_observation))
        return new_observation, action, reward, done

    def optimize_model(self):
        # sample minibatch from memory
        minibatch = self.sample_from_memory()
        x_batch = []
        q_targets = []

        # for each experience in minibatch, set q-target
        for state, act, rew, done, next_state in minibatch:
            x_batch.append(state)
            if done:
                next_state_q_value = rew
            else:
                next_state_q_value = np.max(
                    self.model.predict(
                        np.expand_dims(np.asarray(list(next_state)), axis=0)))

            curr_q_vals = self.model.predict(
                np.expand_dims(np.asarray(list(state)), axis=0))
            curr_q_vals[0][
                act] = rew + self.discount_factor * next_state_q_value
            q_targets.append(curr_q_vals[0])

        # train agent on minibatch
        self.model.fit(np.asarray(x_batch),
                       np.asarray(q_targets),
                       batch_size=len(minibatch),
                       verbose=0)

    def train(self):
        # initialize deep-q agent
        self.create_model()

        # fill empty memory before starting training
        self.fill_empty_memory()

        for i in range(self.episodes):
            print("Episode: ", i)
            observation = np.asarray(list(self.env.reset()))
            total_reward_per_episode = 0

            for a in range(self.max_actions_per_episode):
                # Anneal epsilon exponentially from 1 toward min_epsilon.
                self.epsilon = self.min_epsilon + (
                    1 - self.min_epsilon) * np.exp(
                        -self.eps_decay * self.decay_step)
                self.decay_step += 1

                # take step
                new_observation, action, reward, done = self.take_action(
                    observation)

                # save to experience buffer
                self.save_to_memory(
                    (observation, action, reward, done, new_observation))

                # fit model
                self.optimize_model()

                # track reward per episode
                total_reward_per_episode += reward

                # update state
                observation = new_observation

                if done:
                    break

            self.rewards.append(total_reward_per_episode)
            print("Reward: ", total_reward_per_episode)
        self.model.save(self.model_path)
        self.env.close()

        print("Average reward: ", sum(self.rewards) / self.episodes)

    def test(self):
        # test agent
        self.model = models.load_model(self.model_path)
        for i in range(self.test_eps):
            observation = np.asarray(list(self.env.reset()))
            total_reward_per_episode = 0
            for _ in range(self.max_actions_per_episode):
                self.env.render()
                action = np.argmax(
                    self.model.predict(np.expand_dims(observation, axis=0)))
                new_observation, reward, done, info = self.env.step(action)
                total_reward_per_episode += reward
                observation = new_observation
                if done:
                    break
            self.test_rewards.append(total_reward_per_episode)

        print("Average reward for test agent: ",
              sum(self.test_rewards) / self.test_eps)
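
To run the second example end to end, a small driver like the one below is
enough, assuming the DeepQNetwork class above lives in (or is imported into)
the same file; the models/ directory is created first because the HDF5 save in
train() writes into it.

import os

if __name__ == "__main__":
    os.makedirs("models", exist_ok=True)  # DQN.hdf5 is written to models/
    agent = DeepQNetwork()
    agent.train()  # builds the network, pre-fills the replay buffer, then trains
    agent.test()   # reloads the saved model and evaluates it greedily with rendering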