Code example #1
    def record(self, exp_schedule=None):
        self.logger.info("Recording training episode")
        # evaluate one episode greedily (no exploration schedule)
        env = gym.make(self.config.env_name)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        env = MaxAndSkipEnv(env, skip=self.config.skip_frame)
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=self.config.overwrite_render)
        self.evaluate(env, 1)

        # evaluate one episode using the exploration schedule
        env = gym.make(self.config.env_name)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        env = MaxAndSkipEnv(env, skip=self.config.skip_frame)
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=self.config.overwrite_render)
        self.evaluate(env, 1, exp_schedule=exp_schedule)
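The Monitor → MaxAndSkipEnv → PreproWrapper chain above is built twice with identical arguments. The following is a sketch of how the duplication could be factored out; make_recording_env is a hypothetical helper name, not part of the original code.

    def make_recording_env(self):
        # hypothetical helper: same wrapper chain as record() above
        env = gym.make(self.config.env_name)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        env = MaxAndSkipEnv(env, skip=self.config.skip_frame)
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=self.config.overwrite_render)
        return env

    def record(self, exp_schedule=None):
        self.logger.info("Recording training episode")
        # one greedy episode, then one driven by the exploration schedule
        self.evaluate(self.make_recording_env(), 1)
        self.evaluate(self.make_recording_env(), 1, exp_schedule=exp_schedule)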
Code example #2
def main():
    import config
    # make env
    g_config = config.config()
    g_config.env_name = "Pong2p-v0"
    env = gym.make(g_config.env_name)
    env = MaxAndSkipEnv(env, skip=g_config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=g_config.overwrite_render)

    # exploration strategy
    # you may want to modify this schedule
    exp_schedule = LinearExploration(env, g_config.eps_begin, g_config.eps_end,
                                     g_config.eps_nsteps)

    # you may want to modify this schedule
    # learning rate schedule
    lr_schedule = LinearSchedule(g_config.lr_begin, g_config.lr_end,
                                 g_config.lr_nsteps)

    # train model
    # model_0 = dqns.AdvantageQN(env, config.config(), name="Adv_A")
    # model_1 = dqns.AdvantageQN(env, config.config(), name="Adv_B")
    model_0 = dqns.NatureQN(env, config.config(), name="Nature_A")
    model_1 = dqns.NatureQN(env, config.config(), name="Nature_B")
    trainer = SelfPlayTrainer(model_0, model_1, env, g_config)
    trainer.run_parallel_models(exp_schedule, lr_schedule, True, True)
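For context, LinearExploration and LinearSchedule are the usual CS234-style linear annealing helpers. The sketch below shows their typical behavior under that assumption; the project's actual classes may differ in detail.

import numpy as np

class LinearScheduleSketch(object):
    """Linearly anneal a value from eps_begin to eps_end over nsteps (sketch)."""
    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps

    def update(self, t):
        # interpolate, then hold the end value once t exceeds nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

class LinearExplorationSketch(LinearScheduleSketch):
    """Epsilon-greedy action selection on top of the linear schedule (sketch)."""
    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        super(LinearExplorationSketch, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # with probability epsilon take a random action, otherwise the greedy one
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action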
Code example #3
def main():
    import config
    g_config = config.config()

    # make env
    env = gym.make("Pong-v0")
    env = MaxAndSkipEnv(env, skip=g_config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=g_config.overwrite_render)

    # exploration strategy
    # you may want to modify this schedule
    exp_schedule = LinearExploration(env, g_config.eps_begin, g_config.eps_end,
                                     g_config.eps_nsteps)

    # you may want to modify this schedule
    # learning rate schedule
    lr_schedule = LinearSchedule(g_config.lr_begin, g_config.lr_end,
                                 g_config.lr_nsteps)

    # train model
    # model = AdvantageQN(env, config.config(), name="SingleADV")
    model = NatureQN(env, config.config(), name="SingleNatureQN")
    model.run(exp_schedule, lr_schedule)
Code example #4
    def record(self):
        """
        Re-create an env and record a video of one episode
        """
        env = gym.make(self.config.env_name)
        env = gym.wrappers.Monitor(env, self.config.record_path,
                                   video_callable=lambda x: True, resume=True)
        env = MaxAndSkipEnv(env, skip=self.config.skip_frame)
        env = PreproWrapper(env, prepro=greyscale, shape=(80, 80, 1),
                            overwrite_render=self.config.overwrite_render)
        self.evaluate(env, 1)
Code example #5
def single_train():
    import config
    # make env
    g_config = config.config()
    g_config.env_name = "Pong2p-v0"
    env = gym.make(g_config.env_name)
    env = MaxAndSkipEnv(env, skip=g_config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=g_config.overwrite_render)

    # exploration strategy
    # you may want to modify this schedule
    exp_schedule = LinearExploration(env, g_config.eps_begin, g_config.eps_end,
                                     g_config.eps_nsteps)

    # you may want to modify this schedule
    # learning rate schedule
    lr_schedule = LinearSchedule(g_config.lr_begin, g_config.lr_end,
                                 g_config.lr_nsteps)

    # train model
    model_0 = dqns.AdvantageQN(env, config.config(), name="Adv_Single")

    model_1 = dqns.AdvantageQN(env, config.config(), name="Adv_FixedOpp")
    exp_schedule1 = LinearExploration(env, 0.00001, 0.00000001,
                                      g_config.eps_nsteps)
    """
    model_1 = dqns.AdvantageQN(env, config.config(), name="Random")
    exp_schedule1 = LinearExploration(env, 1,
                                     1, g_config.eps_nsteps)
    """

    model_0.initialize()
    model_1.load("trained_models/03_1521/Adv_A/model.weights/model-250244")

    trainer = FixedTargetTrainer(model_0, model_1, env, g_config)

    trainer.record(exp_schedule)  # record one at beginning
    trainer.train(exp_schedule, lr_schedule, exp_schedule1)
    trainer.record(exp_schedule)  # record one at end
Code example #6
            out = Value + (Advantage -
                           tf.reduce_mean(Advantage, axis=1, keep_dims=True))

        ##############################################################
        ######################## END YOUR CODE #######################
        return out


"""
Use a different architecture for the Atari game. Please report the final result.
Feel free to change the configuration. If so, please report your hyperparameters.
"""
if __name__ == '__main__':
    # make env
    env = gym.make(config.env_name)
    env = MaxAndSkipEnv(env, skip=config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=config.overwrite_render)

    # exploration strategy
    # you may want to modify this schedule
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)

    # you may want to modify this schedule
    # learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)
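The out = Value + (Advantage - tf.reduce_mean(Advantage, ...)) line above is the standard dueling-DQN aggregation. Below is a minimal TF1 sketch of how the two streams might be produced from a shared feature vector; the hidden sizes are illustrative, not the project's actual configuration.

import tensorflow as tf

def dueling_head(features, num_actions):
    # state-value stream: V(s), shape (batch, 1)
    value_hidden = tf.layers.dense(features, 512, activation=tf.nn.relu)
    Value = tf.layers.dense(value_hidden, 1)
    # advantage stream: A(s, a), shape (batch, num_actions)
    adv_hidden = tf.layers.dense(features, 512, activation=tf.nn.relu)
    Advantage = tf.layers.dense(adv_hidden, num_actions)
    # subtract the mean advantage so V and A are identifiable:
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return Value + (Advantage -
                    tf.reduce_mean(Advantage, axis=1, keep_dims=True))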
Code example #7
            initial_action = action
        # perform action in env
        new_state, reward, done, info = env.step(action)
        state = new_state
        # store the transition in the replay memory
        experts_replay_buffer.store_effect(idx, action, reward, done)
        # a non-zero reward means the point ended (Pong rewards are +/-1)
        if abs(reward) == 1:
            break
        counter += 1
    print("PLAY POINT ENDED")
    return (config.gamma**counter) * reward, initial_action


env = gym.make(config.env_name)
env = MaxAndSkipEnv(env, skip=config.skip_frame)
env = PreproWrapper(env, prepro=greyscale, shape=(80, 80, 1),
                    overwrite_render=config.overwrite_render)

rewards = []

experts_meta_lis = [
    './core/checkpoints/q_learning/skip_connection/q5_train_atari_nature/deepdqn_weights/.meta',
    './core/checkpoints/q_learning/skip_connection/q5_train_atari_nature/resnet_weights/.meta',
    './core/checkpoints/policy_gradients/policy_network.ckpt.meta',
]
experts_chkpt_lis = [
    './core/checkpoints/q_learning/skip_connection/q5_train_atari_nature/deepdqn_weights/',
    './core/checkpoints/q_learning/skip_connection/q5_train_atari_nature/resnet_weights/',
    './core/checkpoints/policy_gradients/policy_network.ckpt',
]
experts = []

#temp_sess = None
for meta_path, chkpt_path in zip(experts_meta_lis, experts_chkpt_lis):
    print([n.name for n in tf.get_default_graph().as_graph_def().node])
    if "deepdqn" in meta_path:
Code example #8
        'The state subspace to restrict this teacher to. If \'none\', then no restrictions.'
    )

    args = parser.parse_args()

    # get config
    teacher_config_class = eval('config.{0}_config_teacher'.format(
        args.env_name.replace('-', '_')))
    output_path = "results/{0}/teacher_{0}/".format(args.exp_name)
    teacher_config = teacher_config_class(args.env_name, args.exp_name,
                                          output_path, args.nsteps_train)

    # make env
    env = gym.make(teacher_config.env_name)
    if hasattr(teacher_config, 'skip_frame'):
        env = MaxAndSkipEnv(env, skip=teacher_config.skip_frame)
    if hasattr(teacher_config, 'preprocess_state'
               ) and teacher_config.preprocess_state is not None:
        env = PreproWrapper(env,
                            prepro=eval(teacher_config.preprocess_state),
                            shape=(80, 80, 1),
                            overwrite_render=teacher_config.overwrite_render)

    # set config variables
    if args.q_network_sizes == 'large':
        q_network_sizes = (32, 64, 64, 512)
    elif args.q_network_sizes == 'small':
        q_network_sizes = (16, 16, 16, 128)
    else:
        print('"{0}" is not a valid teacher Q network size.'.format(s))
        sys.exit()
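The config class is looked up with eval(...) above; a getattr on the config module gives the same result without evaluating arbitrary strings. A small alternative sketch:

    # hypothetical alternative to the eval() lookup above
    config_name = '{0}_config_teacher'.format(args.env_name.replace('-', '_'))
    teacher_config_class = getattr(config, config_name)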
Code example #9
def run_games():
    g_config = config.config()
    g_config.env_name = "Pong2p-v0"
    env = gym.make(g_config.env_name)
    env = MaxAndSkipEnv(env, skip=g_config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=g_config.overwrite_render)
    exp_schedule = LinearExploration(env, g_config.eps_begin, g_config.eps_end,
                                     g_config.eps_nsteps)
    lr_schedule = LinearSchedule(g_config.lr_begin, g_config.lr_end,
                                 g_config.lr_nsteps)

    evaluator = Evaluator(env, g_config)
    csv_file = open(evaluator.config.output_path + "results.csv",
                    mode='w',
                    newline="")
    csv_res = csv.writer(csv_file)
    csv_res.writerow([
        *[m + "_0" for m in model_info_names],
        *[m + "_1" for m in model_info_names], "win_0", "win_1"
    ])

    def enumerate_models(model_dir,
                         model_nums,
                         name,
                         m_class=dqns.AdvantageQN):
        models = []
        for m in model_nums:
            cur_model = m_class(env, g_config, name=name)
            cur_model.num = m
            cur_model.elo = 0
            cur_model.model_dir = model_dir
            cur_model.load(model_dir + "-" + str(m))
            models.append(cur_model)
        return tuple(models)

    pairs = []

    def compatable_with(model_set_a, model_sets_b):
        msb = itertools.chain.from_iterable(model_sets_b)
        pairs.extend(itertools.product(model_set_a, msb))

    # Now to specify the models that are available

    # scoring 1 game takes
    # 1.5 min * (15/ 100) ~= 15 sec
    # one hour =  240 games -> 480 game results
    rounds = 0
    models = []

    def first_run():
        # ok, first goal is to get scores (25 games each) for

        # 1 single play , 2 self-play @ 250k
        # 1 single play , 2 self-play @ 1M
        # 1 single play , 2 self-play @ 2.5M
        # 2 single play , 4 self-play ends
        # total 15 models

        model_dir = "trained_models/{}/model.weights/model"

        single_play = enumerate_models(model_dir.format("02_2204/SingleADV"),
                                       [4011594, 4764484], "Single")
        self_play0A = enumerate_models(model_dir.format("02_2205/Adv_A"),
                                       [4006694, 4757864], "Adv0A")
        self_play0B = enumerate_models(model_dir.format("02_2205/Adv_B"),
                                       [4006694, 4757864], "Adv0B")
        self_play1A = enumerate_models(model_dir.format("02_2209/Adv_A"),
                                       [4005221, 4756947], "Adv1A")
        self_play1B = enumerate_models(model_dir.format("02_2209/Adv_B"),
                                       [4005221, 4756947], "Adv1B")
        self_play0 = self_play0A + self_play0B
        self_play1 = self_play1A + self_play1B

        compatable_with(single_play, [self_play0, self_play1])
        compatable_with(self_play0, [single_play, self_play1])
        compatable_with(self_play1, [single_play, self_play0])

        nonlocal models
        nonlocal rounds
        models = single_play + self_play0 + self_play1
        rounds = 5

    def second_run():
        model_dir = "trained_models/{}/model.weights/model"

        single_play = enumerate_models(model_dir.format("03_1501/SingleADV"),
                                       [250272, 1001582, 2506140, 4763791],
                                       "Single")
        single_play += enumerate_models(model_dir.format("02_2204/SingleADV"),
                                        [4764484], "Single")

        single_play1 = enumerate_models(
            model_dir.format("03_2349/SingleNatureQN"),
            [250068, 1002011, 2505567, 4764637], "SingleDQN", dqns.NatureQN)
        single_play2 = enumerate_models(
            model_dir.format("04_0232/SingleNatureQN"),
            [250360, 1001844, 2508136, 4766454], "SingleDQN", dqns.NatureQN)

        self_play0A = enumerate_models(model_dir.format("02_2205/Adv_A"),
                                       [4757864], "Adv0A")
        self_play0B = enumerate_models(model_dir.format("02_2205/Adv_B"),
                                       [4757864], "Adv0B")
        self_play0 = self_play0A + self_play0B

        self_play1A = enumerate_models(model_dir.format("02_2209/Adv_A"),
                                       [4756947], "Adv1A")
        self_play1B = enumerate_models(model_dir.format("02_2209/Adv_B"),
                                       [4756947], "Adv1B")
        self_play1 = self_play1A + self_play1B

        self_play2A = enumerate_models(model_dir.format("03_1520/Adv_A"),
                                       [250020, 1001335, 2505766], "Adv2A")
        self_play2B = enumerate_models(model_dir.format("03_1520/Adv_B"),
                                       [250020, 1001335, 2505766], "Adv2B")
        self_play2 = self_play2A + self_play2B

        self_play3A = enumerate_models(model_dir.format("03_1521/Adv_A"),
                                       [250244, 1002194, 2505204], "Adv3A")
        self_play3B = enumerate_models(model_dir.format("03_1521/Adv_B"),
                                       [250244, 1002194, 2505204], "Adv3B")
        self_play3 = self_play3A + self_play3B

        self_play0A = enumerate_models(model_dir.format("03_2357/Nature_A"),
                                       [250101, 1001653, 2503025, 4758399],
                                       "Nature4A", dqns.NatureQN)
        self_play0B = enumerate_models(model_dir.format("03_2357/Nature_B"),
                                       [250101, 1001653, 2503025, 4758399],
                                       "Nature4B", dqns.NatureQN)
        self_play4 = self_play4A + self_play4B

        self_play5 = enumerate_models(model_dir.format("04_1006/Adv_Single"),
                                      [250047, 1000867, 2501982, 4753393],
                                      "ADV_vs_Random")
        self_play6 = enumerate_models(model_dir.format("04_1009/Adv_Single"),
                                      [250016, 1000902, 2501784, 4753522],
                                      "ADV_vs_250k")

        all_sets = {
            single_play, single_play1, single_play2, self_play0, self_play1,
            self_play2, self_play3
        }

        compatable_with(single_play,
                        all_sets - {single_play, self_play0, self_play1})
        compatable_with(self_play0,
                        all_sets - {single_play, self_play0, self_play1})
        compatable_with(self_play1,
                        all_sets - {single_play, self_play0, self_play1})
        compatable_with(single_play1, all_sets - {single_play1})
        compatable_with(single_play2, all_sets - {single_play2})
        compatable_with(self_play2, all_sets - {self_play2})
        compatable_with(self_play3, all_sets - {self_play3})

        nonlocal models
        nonlocal rounds
        models = list(itertools.chain.from_iterable(all_sets))
        rounds = 1

    def third_run():
        model_dir = "trained_models/{}/model.weights/model"

        single_play = enumerate_models(model_dir.format("03_1501/SingleADV"),
                                       [250272, 1001582, 2506140, 4763791],
                                       "Single")
        single_play += enumerate_models(model_dir.format("02_2204/SingleADV"),
                                        [4764484], "Single")

        single_play1 = enumerate_models(
            model_dir.format("03_2349/SingleNatureQN"),
            [250068, 1002011, 2505567, 4764637], "SingleDQN0", dqns.NatureQN)
        single_play2 = enumerate_models(
            model_dir.format("04_0232/SingleNatureQN"),
            [250360, 1001844, 2508136, 4766454], "SingleDQN1", dqns.NatureQN)

        self_play0A = enumerate_models(model_dir.format("02_2205/Adv_A"),
                                       [4757864], "Adv0A")
        self_play0B = enumerate_models(model_dir.format("02_2205/Adv_B"),
                                       [4757864], "Adv0B")
        self_play0 = self_play0A + self_play0B

        self_play1A = enumerate_models(model_dir.format("02_2209/Adv_A"),
                                       [4756947], "Adv1A")
        self_play1B = enumerate_models(model_dir.format("02_2209/Adv_B"),
                                       [4756947], "Adv1B")
        self_play1 = self_play1A + self_play1B

        self_play2A = enumerate_models(model_dir.format("03_1520/Adv_A"),
                                       [250020, 1001335, 2505766], "Adv2A")
        self_play2B = enumerate_models(model_dir.format("03_1520/Adv_B"),
                                       [250020, 1001335, 2505766], "Adv2B")
        self_play2 = self_play2A + self_play2B

        self_play3A = enumerate_models(model_dir.format("03_1521/Adv_A"),
                                       [250244, 1002194, 2505204], "Adv3A")
        self_play3B = enumerate_models(model_dir.format("03_1521/Adv_B"),
                                       [250244, 1002194, 2505204], "Adv3B")
        self_play3 = self_play3A + self_play3B

        self_play4A = enumerate_models(model_dir.format("03_2357/Nature_A"),
                                       [250101, 1001653, 2503205, 4758399],
                                       "Nature4A", dqns.NatureQN)
        self_play4B = enumerate_models(model_dir.format("03_2357/Nature_B"),
                                       [250101, 1001653, 2503205, 4758399],
                                       "Nature4B", dqns.NatureQN)
        self_play4 = self_play4A + self_play4B

        self_play5 = enumerate_models(model_dir.format("04_1006/Adv_Single"),
                                      [250047, 1000867, 2501982, 4753393],
                                      "ADV_vs_Random")
        self_play6 = enumerate_models(model_dir.format("04_1109/Adv_Single"),
                                      [250016, 1000902, 2501784, 4753522],
                                      "ADV_vs_250k")

        all_sets = {
            single_play, single_play1, single_play2, self_play0, self_play1,
            self_play2, self_play3, self_play4, self_play5, self_play6
        }
        new_sets = {self_play4, self_play5, self_play6}

        for s in all_sets - new_sets:
            compatable_with(s, new_sets)
        for s in new_sets:
            compatable_with(s, all_sets - {s})

        nonlocal models
        nonlocal rounds
        models = list(itertools.chain.from_iterable(all_sets))
        rounds = 1

    # Select which evaluation run to perform
    third_run()

    # now to actually score the games
    results = []

    for i in range(rounds):
        for m0, m1 in pairs:
            score_0, score_1 = evaluator.evaluate(m0, m1)
            update_elo(m0, m1, 30, score_0, score_1)
            info = [*model_info(m0), *model_info(m1), score_0, score_1]
            print(info)
            print(m0.elo, m1.elo)
            csv_res.writerow(info)
            results.append([m0, m1, score_0, score_1])
    csv_file.close()
    return results, evaluator.config.output_path
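update_elo is not shown in this excerpt. Below is a minimal sketch of a standard Elo update matching the call site update_elo(m0, m1, 30, score_0, score_1), under the assumption that score_0/score_1 are the points each model won in the evaluated games; the project's actual function may differ.

def update_elo(m0, m1, k, score_0, score_1):
    # standard zero-sum Elo update with K-factor k (sketch)
    total = float(score_0 + score_1)
    if total == 0:
        return
    actual_0 = score_0 / total
    expected_0 = 1.0 / (1.0 + 10 ** ((m1.elo - m0.elo) / 400.0))
    delta = k * (actual_0 - expected_0)
    m0.elo += delta
    m1.elo -= delta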
Code example #10
    # set config variables from command-line arguments
    student_config.student_loss = args.student_loss
    student_config.process_teacher_q = args.process_teacher_q
    student_config.choose_teacher_q = args.choose_teacher_q
    student_config.softmax_teacher_q_tau = args.softmax_teacher_q_tau
    student_config.mse_prob_loss_weight = args.mse_prob_loss_weight
    student_config.nll_loss_weight = args.nll_loss_weight
    student_config.nsteps_train = args.nsteps_train
    student_config.lr_nsteps = args.nsteps_train / 2
    student_config.exp_policy = args.exp_policy

    # make env
    env = gym.make(student_config.env_name)
    if hasattr(student_config, 'skip_frame'):
        env = MaxAndSkipEnv(env, skip=student_config.skip_frame)
    if hasattr(student_config, 'preprocess_state'
               ) and student_config.preprocess_state is not None:
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=student_config.overwrite_render)

    # exploration strategy
    if student_config.exp_policy == 'egreedy':
        exp_schedule = LinearExploration(env, student_config.eps_begin,
                                         student_config.eps_end,
                                         student_config.eps_nsteps)
    else:
        exp_schedule = LinearGreedyExploration(env, student_config.eps_begin,
                                               student_config.eps_end,
                                               student_config.eps_nsteps)