def mountaincar_task_fn():
    exp_config = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=mlp_q,
              **exp_config['DQN'])
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         **exp_config['EpsilonGreedy']['LinearScheduler']),
                                                     **exp_config['EpsilonGreedy']['config_or_config_dict']))
    flow = TrainTestFlow(train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                         config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
                         func_dict={
                             'test': {'func': agent.test,
                                      'args': list(),
                                      'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
                                      },
                             'train': {'func': agent.train,
                                       'args': list(),
                                       'kwargs': dict(),
                                       },
                             'sample': {'func': agent.sample,
                                        'args': list(),
                                        'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                                       env=agent.env,
                                                       in_which_status='TRAIN',
                                                       store_flag=True),
                                        },
                         })

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
 def create_dqn(self, env_id='Acrobot-v1', name='dqn'):
     mlp_q, local = self.create_mlp_q_func(env_id,
                                           name='{}_mlp_q'.format(name))
     env_spec = local['env_spec']
     env = local['env']
     dqn = DQN(env_spec=env_spec,
               config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                          GAMMA=0.99,
                                          BATCH_SIZE=10,
                                          LEARNING_RATE=0.001,
                                          TRAIN_ITERATION=1,
                                          DECAY=0.5),
               name=name,
               value_func=mlp_q)
     return dqn, locals()
Exemple #3
0
    def test_l1_l2_norm(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        name = 'dqn'

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name + '_mlp',
                                  name=name + '_mlp',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "L1_NORM": 1000.0,
                                      "L2_NORM": 1000.0,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        dqn = DQN(env_spec=env_spec,
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.01,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  name=name,
                  value_func=mlp_q)
        dqn2, _ = self.create_dqn(name='dqn_2')
        a = TransitionData(env_spec)
        st = env.reset()
        dqn.init()
        dqn2.init()
        for i in range(100):
            ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
            st = st_new
            dqn.append_to_memory(a)
        for i in range(20):
            print(
                'dqn1 loss: ',
                dqn.train(batch_data=a,
                          train_iter=10,
                          sess=None,
                          update_target=True))
            print(
                'dqn2 loss: ',
                dqn2.train(batch_data=a,
                           train_iter=10,
                           sess=None,
                           update_target=True))
        var_list = self.sess.run(dqn.q_value_func.parameters('tf_var_list'))
        print(var_list)
        var_list2 = self.sess.run(dqn2.q_value_func.parameters('tf_var_list'))
        print(var_list2)
        for var, var2 in zip(var_list, var_list2):
            diff = np.abs(var2) - np.abs(var)
            self.assertTrue(np.greater(np.mean(diff), 0.0).all())
Exemple #4
0
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=PiecewiseScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                                                         outside_value=0.0
                                                     ),
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True))
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name + 'experiment_debug'
    )

    dqn.parameters.set_scheduler(param_key='LEARNING_RATE',
                                 scheduler=LinearScheduler(
                                     t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
                                     schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                                         'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
                                     final_p=0.0001,
                                     initial_p=0.01))
    experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    env_spec = env.env_spec
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "2",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "3",
                                      "N_UNITS": 256,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=32,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)

    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         schedule_timesteps=int(0.1 * 100000),
                                                         initial_p=1.0,
                                                         final_p=0.02),
                                                         init_random_prob=0.1)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)


    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1, env=agent.env, store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)),
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )

    experiment.run()
Exemple #6
0
    def test_integration_with_dqn(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name='mlp_q',
                                  name_scope='mlp_q',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        dqn = DQN(env_spec=env_spec,
                  name='dqn_test',
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.001,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  value_func=mlp_q)
        dqn.init()
        st = env.reset()
        from baconian.common.sampler.sample_data import TransitionData
        a = TransitionData(env_spec)
        res = []
        for i in range(100):
            ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
            dqn.append_to_memory(a)
        res.append(
            dqn.train(batch_data=a,
                      train_iter=10,
                      sess=None,
                      update_target=True)['average_loss'])
        res.append(
            dqn.train(batch_data=None,
                      train_iter=10,
                      sess=None,
                      update_target=True)['average_loss'])
        self.assertTrue(dqn in dqn.recorder._obj_log)
        self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
        self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
        self.assertTrue(
            np.equal(np.array(res), [
                x['log_val']
                for x in dqn.recorder._obj_log[dqn]['average_loss']
            ]).all())

        self.assertTrue(len(Logger()._registered_recorders) > 0)
        self.assertTrue(dqn.recorder in Logger()._registered_recorders)

        Logger().flush_recorder()
Exemple #7
0
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)

    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True))
    )

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
Exemple #8
0
    def test_integration_with_dqn(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name='mlp_q',
                                  name_scope='mlp_q',
                                  mlp_config=[
                                      {
                                          "ACT": "RELU",
                                          "B_INIT_VALUE": 0.0,
                                          "NAME": "1",
                                          "N_UNITS": 16,
                                          "TYPE": "DENSE",
                                          "W_NORMAL_STDDEV": 0.03
                                      },
                                      {
                                          "ACT": "LINEAR",
                                          "B_INIT_VALUE": 0.0,
                                          "NAME": "OUPTUT",
                                          "N_UNITS": 1,
                                          "TYPE": "DENSE",
                                          "W_NORMAL_STDDEV": 0.03
                                      }
                                  ])
        dqn = DQN(env_spec=env_spec,
                  name='dqn_test',
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.001,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  value_func=mlp_q)
        agent = Agent(env=env, env_spec=env_spec,
                      algo=dqn,
                      name='agent')
        agent.init()
        # dqn.init()
        st = env.reset()
        from baconian.common.sampler.sample_data import TransitionData
        a = TransitionData(env_spec)
        res = []
        agent.sample(env=env,
                     sample_count=100,
                     in_which_status='TRAIN',
                     store_flag=True,
                     sample_type='transition')
        agent.sample(env=env,
                     sample_count=100,
                     in_which_status='TRAIN',
                     store_flag=True,
                     sample_type='transition')
        res.append(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
        res.append(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
        self.assertTrue(dqn in dqn.recorder._obj_log)
        self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
        self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
        self.assertTrue(
            np.equal(np.array(res), [x['value'] for x in dqn.recorder._obj_log[dqn]['average_loss']]).all())

        self.assertTrue(len(Logger()._registered_recorders) > 0)
        self.assertTrue(dqn.recorder in Logger()._registered_recorders)
        res = dqn.recorder.get_log(attr_name='average_loss', filter_by_status=dict())
        self.assertEqual(len(res), 2)
        res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TRAIN'})
        self.assertEqual(len(res), 2)
        res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TEST'})
        self.assertEqual(len(res), 0)
        Logger().flush_recorder()