Code example #1
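This test helper caps the experiment end point at 500 agent training samples, builds a DQN agent, and attaches a LinearScheduler to the algorithm's LEARNING_RATE parameter before running the experiment. The snippets on this page are shown without their imports; the block below is a sketch of the likely import set, based on Baconian's documented examples, so treat the exact module paths as assumptions that may vary across versions.

# Probable imports for the snippets on this page (assumed from Baconian's
# documented examples; exact module paths may differ between versions).
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
from baconian.algo.dqn import DQN
from baconian.algo.value_func.mlp_q_value import MLPQValueFunction
from baconian.algo.misc import EpsilonGreedy
from baconian.core.agent import Agent
from baconian.core.experiment import Experiment
from baconian.core.flow.train_test_flow import TrainTestFlow, create_train_test_flow
from baconian.core.status import get_global_status_collect
from baconian.config.global_config import GlobalConfig
from baconian.common.schedules import (LinearScheduler, PiecewiseScheduler,
                                       PeriodicalEventSchedule)
# ACROBOT_BENCHMARK_CONFIG_DICT (code example #2) comes from Baconian's
# benchmark package; its exact path is version-dependent.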
            # Bind `algo` and `locals` from the enclosing test at definition time;
            # note that `locals` shadows the Python builtin inside this closure.
            def func(algo=algo, locals=locals):
                GlobalConfig().set(
                    'DEFAULT_EXPERIMENT_END_POINT',
                    dict(TOTAL_AGENT_TRAIN_SAMPLE_COUNT=500,
                         TOTAL_AGENT_TEST_SAMPLE_COUNT=None,
                         TOTAL_AGENT_UPDATE_COUNT=None))
                if not algo:
                    algo, locals = self.create_dqn()
                env_spec = locals['env_spec']
                env = locals['env']
                agent = self.create_agent(env=locals['env'],
                                          algo=algo,
                                          name='agent',
                                          eps=self.create_eps(env_spec)[0],
                                          env_spec=env_spec)[0]

                exp = self.create_exp(name='model_free', env=env, agent=agent)
                algo.parameters.set_scheduler(
                    param_key='LEARNING_RATE',
                    to_tf_ph_flag=True,
                    scheduler=LinearScheduler(
                        t_fn=exp.TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT,
                        schedule_timesteps=GlobalConfig(
                        ).DEFAULT_EXPERIMENT_END_POINT[
                            'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                        final_p=0.0001,
                        initial_p=0.01))
                exp.run()
                self.assertEqual(exp.TOTAL_AGENT_TEST_SAMPLE_COUNT(),
                                 exp.TOTAL_ENV_STEP_TEST_SAMPLE_COUNT())
                self.assertEqual(exp.TOTAL_AGENT_TRAIN_SAMPLE_COUNT(),
                                 exp.TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT())
                # The third positional argument of assertEqual is its failure
                # message, so the 500-sample end point needs its own assertion.
                self.assertEqual(exp.TOTAL_AGENT_TRAIN_SAMPLE_COUNT(), 500)
Code example #2
File: acrobot.py  Project: yitongx/Baconian-public
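This benchmark task function assembles the full Baconian pipeline for Acrobot-v1 from a shared ACROBOT_BENCHMARK_CONFIG_DICT: environment, MLP Q-function, DQN, an epsilon-greedy agent, and an explicitly constructed TrainTestFlow whose func_dict wires the agent's test, train, and sample callbacks.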
def acrobot_task_fn():
    exp_config = ACROBOT_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Acrobot-v1')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **exp_config['DQN'])
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         **exp_config['EpsilonGreedy']['LinearScheduler']),
                                                     **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                         config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
                         func_dict={
                             'test': {'func': agent.test,
                                      'args': list(),
                                      'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
                                      },
                             'train': {'func': agent.train,
                                       'args': list(),
                                       'kwargs': dict(),
                                       },
                             'sample': {'func': agent.sample,
                                        'args': list(),
                                        'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                                       env=agent.env,
                                                       in_which_status='TRAIN',
                                                       store_flag=True),
                                        },
                         })

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
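Note that the explicit TrainTestFlow construction above appears to be the long form of the create_train_test_flow helper used in code examples #4 and #5 below; both wire the same three callbacks (sample, train, test), so the choice is mostly about how much of the flow configuration comes from a config dict.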
Code example #3
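This test checks the decay of EpsilonGreedy's random-action probability: with initial_p=1.0, final_p=0.0, and schedule_timesteps=10, the scheduled probability after x steps should be 1.0 - x / 10, which is exactly what the assertion inside the loop verifies.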
    def test_eps_with_scheduler(self):
        # Assumes a module-level step counter `x = 0` defined next to this test;
        # the scheduler reads it through `func` below.
        dqn, locals = self.create_dqn()
        env = locals['env']

        def func():
            global x
            return x

        dqn.init()
        eps = EpsilonGreedy(action_space=dqn.env_spec.action_space,
                            prob_scheduler=LinearScheduler(initial_p=1.0, t_fn=func, schedule_timesteps=10,
                                                           final_p=0.0),
                            init_random_prob=1.0)
        st = env.reset()
        global x
        for i in range(10):
            ac = eps.predict(obs=st, sess=self.sess, batch_flag=False, algo=dqn)
            st_new, re, done, _ = env.step(action=ac)
            # random_prob should decay linearly from initial_p=1.0 towards
            # final_p=0.0 over schedule_timesteps=10 as x advances.
            self.assertAlmostEqual(eps.parameters('random_prob_func')(),
                                   1.0 - (1.0 - 0.0) / 10 * x)
            x += 1
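The assertion above pins down the interpolation rule, so a minimal stand-alone sketch of what a LinearScheduler computes (a hypothetical stand-in, not Baconian's actual class) looks like this:

# Hypothetical sketch of the linear decay the test above asserts.
def linear_value(t, schedule_timesteps, initial_p, final_p):
    fraction = min(float(t) / schedule_timesteps, 1.0)  # clamp once past the end
    return initial_p + fraction * (final_p - initial_p)

# Matches the test's expected value 1.0 - (1.0 - 0.0) / 10 * x at x = 3.
assert abs(linear_value(3, 10, 1.0, 0.0) - 0.7) < 1e-12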
Code example #4
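This task function combines three schedulers: a PiecewiseScheduler that steps the exploration probability down through the endpoints (10, 0.3), (100, 0.1), (200, 0.0); a PeriodicalEventSchedule that saves the algorithm every 20 training samples after the first 10; and a LinearScheduler, attached at the bottom, that anneals LEARNING_RATE from 0.01 to 0.0001 over the experiment's configured end point.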
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=PiecewiseScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                                                         outside_value=0.0
                                                     ),
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True))
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name + 'experiment_debug'
    )

    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=LinearScheduler(
                                     t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
                                     schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                                         'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                                     final_p=0.0001,
                                     initial_p=0.01))
    experiment.run()
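For reference, piecewise schedulers of this shape linearly interpolate between consecutive endpoints and fall back to outside_value elsewhere, as in OpenAI Baselines' PiecewiseSchedule, which Baconian's version resembles; treat the exact semantics as an assumption. A hypothetical sketch:

# Hypothetical sketch of piecewise-linear interpolation over (t, value) endpoints.
def piecewise_value(t, endpoints, outside_value):
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value  # t falls outside the endpoint range

# With the endpoints above: 0.3 at t=10, 0.2 at t=55, 0.0 from t=200 onwards.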
Code example #5
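This demo wires a deeper Q-network (64-64-256 hidden units) into DQN with a 50,000-transition replay buffer, starts training after a 10,000-sample warm-up with one training step per sample, and tests every 1,000 samples; exploration decays linearly from 1.0 to 0.02 over the first 10,000 training samples.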
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
    env_spec = env.env_spec
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "2",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "3",
                                      "N_UNITS": 256,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=32,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)

    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=LinearScheduler(
                                       t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                       schedule_timesteps=int(0.1 * 100000),
                                       initial_p=1.0,
                                       final_p=0.02),
                                   init_random_prob=0.1)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)

    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1, env=agent.env, store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)),
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )

    experiment.run()
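None of these task functions run by themselves; in Baconian's documented examples they are handed to the experiment runner, roughly as below. The runner name and keyword are taken from Baconian's docs and should be treated as assumptions.

# Assumed launch pattern from Baconian's examples; names may vary by version.
from baconian.core.experiment_runner import single_exp_runner

single_exp_runner(task_fn, del_if_log_path_existed=True)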