Example #1
    def run(self):
        """
        Apply procedures of training for a QN

        Args:
            exp_schedule: exploration strategy for epsilon
            lr_schedule: schedule for learning rate
        """
        # initialize
        self.initialize()

        config = self.config
        for index, env in enumerate(self.envs):
            if index < self.config.start_index:
                continue
            exp_schedule = LinearExploration(env, config.eps_begin,
                                             config.eps_end, config.eps_nsteps)

            lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                         config.lr_nsteps)

            (self.q, self.target_q, self.update_target_op,
             self.loss, self.train_op, self.grad_norm) = self.ops[index]
            self.sess.run(self.update_target_op)

            # point the model at the environment currently being trained on
            self.env = env
            self.index = index

            # record one game at the beginning
            if self.config.record:
                self.record()
            self.train(exp_schedule, lr_schedule)
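
# The LinearExploration and LinearSchedule classes used throughout these
# examples are not defined here. Below is a minimal sketch of schedules with
# the constructor signature seen above; the update() and get_action() methods
# are assumptions based on how such schedules are usually driven by a DQN
# training loop, not the original implementation.
import numpy as np


class LinearSchedule(object):
    """Linearly anneal a value from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # Interpolate linearly in t, then hold eps_end once t exceeds nsteps.
        if t >= self.nsteps:
            self.epsilon = self.eps_end
        else:
            frac = float(t) / self.nsteps
            self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearSchedule):
    """Epsilon-greedy action selection on top of a linear epsilon schedule."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # With probability epsilon take a random action, otherwise the greedy one.
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action
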
def _train(opponents, train_from_scratch=False, render=False):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin,
                                     config.eps_end, config.eps_nsteps)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, True, train_from_scratch=train_from_scratch)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents, dqn_agent)

    t = 1
    while t < config.nsteps_train:
        state = env.reset()

        done = False
        while not done:
            t += 1
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)

            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the training agent dies.
                dqn_agent.episode_end(-1)
                done = True

    env.close()
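
# _init_agents is referenced above but not shown. A plausible sketch, assuming
# the opponents are ready-made pommerman agents and that the env exposes
# set_agents() (as pommerman.make itself uses); the DQN agent is appended last
# and its index in the per-agent reward vector is returned. The schedule
# arguments are kept only to match the call sites above.
def _init_agents(env, exp_schedule, lr_schedule, opponents, dqn_agent):
    agent_list = list(opponents) + [dqn_agent]
    env.set_agents(agent_list)
    return len(agent_list) - 1
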
def _test(opponents, match_num=20, render=True):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy
    exp_schedule = LinearExploration(env, 0, 0, 1)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, False)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents, dqn_agent)

    count = 0
    win = 0
    for _ in range(match_num):
        state = env.reset()

        done = False

        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            if reward[dqn_agent_index] == 1:
                # count wins for the DQN agent
                win += 1
                print('win at episode %d' % count)

            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the testing agent dies.
                done = True
        count += 1
    print('win rate: %.2f' % (float(win) / count))

    env.close()
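
Neither snippet shows how _train and _test are invoked. A hypothetical driver, assuming three of pommerman's built-in SimpleAgents as opponents, could look like this:

from pommerman import agents

if __name__ == '__main__':
    # The opponent line-up is an assumption; any agents accepted by
    # _init_agents would work here.
    opponents = [agents.SimpleAgent() for _ in range(3)]
    _train(opponents, train_from_scratch=True, render=False)
    _test(opponents, match_num=20, render=False)
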
Example #4
        ##############################################################
        #################### YOUR CODE HERE - 8-12 lines #############

        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        scope_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
        grads_and_vars = optimizer.compute_gradients(self.loss, scope_variables)
        if self.config.grad_clip:
            # clip each gradient tensor to clip_val before applying the update
            train_grads_and_vars = [(tf.clip_by_norm(grad, self.config.clip_val), var)
                                    for grad, var in grads_and_vars]
        else:
            train_grads_and_vars = grads_and_vars
        self.train_op = optimizer.apply_gradients(train_grads_and_vars)
        self.grad_norm = tf.global_norm([grad for grad, _ in grads_and_vars])
        
        ##############################################################
        ######################## END YOUR CODE #######################
    


if __name__ == '__main__':
    env = EnvTest((5, 5, 1))

    # exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin,
                                     config.eps_end, config.eps_nsteps)

    # learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # train model
    model = Linear(env, config)
    model.run(exp_schedule, lr_schedule)
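
For reference, tf.clip_by_norm rescales each gradient tensor individually so that its L2 norm is at most clip_val, while grad_norm above reports the norm of the unclipped gradients. A small standalone check in the same TF1 style:

import tensorflow as tf

g = tf.constant([3.0, 4.0])          # ||g|| = 5.0
clipped = tf.clip_by_norm(g, 3.0)    # rescaled to [1.8, 2.4], norm 3.0
norm = tf.global_norm([g])           # 5.0, the pre-clip norm

with tf.Session() as sess:
    print(sess.run([clipped, norm]))
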
Example #5

def print_config(config):
    print('Current config:\n')
    for var, val in sorted(vars(config).items()):
        print(var + ' = ' + str(val))


if __name__ == '__main__':
    args = parse_args()
    my_config = modify_config(args)
    print_config(my_config)
    with tf.device('/gpu:' + str(args.gpu)):
        # make env
        env = gym.make(my_config.env_name)
        env = wrap_dqn(env)

        # exploration strategy
        exp_schedule = LinearExploration(env, my_config.eps_begin,
                                         my_config.eps_end,
                                         my_config.eps_nsteps)

        # learning rate schedule
        lr_schedule = LinearSchedule(my_config.lr_begin, my_config.lr_end,
                                     my_config.lr_nsteps)

        # train model
        model = NatureQN(env, my_config)
        model.run(exp_schedule, lr_schedule)
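
parse_args and modify_config are not shown in Example #5. A hypothetical sketch, assuming a module-level config object like the one used in the other examples and only the flags implied by the snippet (the defaults here are illustrative):

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env_name', type=str, default='PongNoFrameskip-v4')
    return parser.parse_args()

def modify_config(args):
    # Overwrite matching fields on the shared config object with CLI values.
    for key, val in vars(args).items():
        setattr(config, key, val)
    return config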