Example No. 1
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "TANH",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "2",
                                      "N_UNITS": 64,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "3",
                                      "N_UNITS": 256,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  },
                                  {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=32,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     prob_scheduler=LinearScheduler(
                                                         t_fn=lambda: get_global_status_collect()(
                                                             'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                                         schedule_timesteps=int(0.1 * 100000),
                                                         initial_p=1.0,
                                                         final_p=0.02),
                                                     init_random_prob=0.1),
                  noise_adder=None)

    flow = create_train_test_flow(
        test_every_sample_count=100,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=1000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1,
                                                     env=agent.env,
                                                     store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=3)),
    )
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
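
Example 1 trains a DQN agent on Acrobot-v1 and anneals its epsilon-greedy exploration probability with a LinearScheduler driven by the global training-sample counter. The standalone sketch below (plain Python, not the Baconian implementation; all names are illustrative) shows the decay rule such a scheduler computes and how it plugs into action selection.

# Illustrative sketch only: linear decay of the exploration probability from
# initial_p to final_p over schedule_timesteps, as the LinearScheduler above
# is configured to do, followed by a plain epsilon-greedy action pick.
import random


def linear_epsilon(t, schedule_timesteps=10000, initial_p=1.0, final_p=0.02):
    """Linearly anneal epsilon with the elapsed training-sample count t."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)


def epsilon_greedy_action(q_values, t):
    """Pick a random action with probability epsilon(t), otherwise the greedy one."""
    if random.random() < linear_epsilon(t):
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])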
Example No. 2
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()
                      ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=EpsilonGreedy(
                      action_space=env_spec.action_space,
                      prob_scheduler=PiecewiseScheduler(
                          t_fn=lambda: get_global_status_collect()
                          ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                          endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                          outside_value=0.0),
                      init_random_prob=0.5))
    flow = create_train_test_flow(test_every_sample_count=10,
                                  train_every_sample_count=10,
                                  start_test_after_sample_count=5,
                                  start_train_after_sample_count=5,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (),
                                                      dict(sample_count=10)),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict(sample_count=100,
                                                             env=agent.env,
                                                             store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + 'experiment_debug')

    dqn.parameters.set_scheduler(
        param_key='LEARNING_RATE',
        scheduler=LinearScheduler(
            t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
            schedule_timesteps=GlobalConfig(
            ).DEFAULT_EXPERIMENT_END_POINT['TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
            final_p=0.0001,
            initial_p=0.01))
    experiment.run()
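
Example 2 additionally registers an algorithm-saving scheduler, attaches a LinearScheduler to the DQN LEARNING_RATE parameter, and drives exploration with a PiecewiseScheduler over (timestep, value) endpoints. The sketch below shows the interpolation rule a piecewise schedule of this shape typically applies; it is a plain-Python illustration, not the library's class, and in this sketch values before the first endpoint and after the last fall back to outside_value.

# Illustrative sketch only: linear interpolation between (t, value) endpoints.
def piecewise_value(t, endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                    outside_value=0.0):
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value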
Example No. 3
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "Q_NET_L1_NORM_SCALE": 0.01,
                    "Q_NET_L2_NORM_SCALE": 0.01,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ddpg,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=100,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
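
Example 3 switches to DDPG on Pendulum-v0 and builds the flow from TrainTestFlow directly rather than through the create_train_test_flow helper. DDPG-style algorithms keep target networks that are softly updated after each training step; the sketch below shows that standard polyak update. Whether Baconian's DECAY setting maps exactly to tau here is an assumption.

# Illustrative sketch only: soft ("polyak") target-network update,
# target <- (1 - tau) * target + tau * source, applied element-wise.
def soft_update(target_params, source_params, tau=0.01):
    return [(1.0 - tau) * t + tau * s
            for t, s in zip(target_params, source_params)]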
Example No. 4
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())

    policy = iLQRPolicy(env_spec=env_spec,
                        **exp_config['ILQR'],
                        dynamics=dyna_env,
                        cost_fn=RewardFuncCostWrapper(
                            reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))

    algo = iLQRAlogWrapper(policy=policy,
                           env_spec=env_spec,
                           dynamics_env=dyna_env)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'train_algo': None,
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=100, env=env)
            },
            'sample_from_real_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True)
            },
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
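
Example 4 wraps a learned dynamics model as an environment (DynamicsEnvWrapper) and hands it an external reward function and terminal function so that an iLQR policy can plan against it. The minimal class below sketches what such a wrapper amounts to; every name is a hypothetical stand-in, not the Baconian API.

# Illustrative sketch only: an "environment" backed by a learned dynamics model,
# with reward and termination supplied from outside, as in the example above.
class LearnedDynamicsEnv:
    def __init__(self, predict_fn, reward_fn, terminal_fn, init_state):
        self.predict_fn = predict_fn      # learned model: (state, action) -> next state
        self.reward_fn = reward_fn        # external reward function
        self.terminal_fn = terminal_fn    # e.g. fixed episode length on step count
        self.state = init_state
        self.step_count = 0

    def step(self, action):
        next_state = self.predict_fn(self.state, action)
        reward = self.reward_fn(self.state, action, next_state)
        self.step_count += 1
        done = self.terminal_fn(self.step_count)
        self.state = next_state
        return next_state, reward, done, {}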
Example No. 5
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[
                              {
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "L1_NORM": 0.01,
                                  "L2_NORM": 0.01,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              },
                              {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }
                          ])

    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=[
                                             {
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "L1_NORM": 0.01,
                                                 "L2_NORM": 0.01,
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             },
                                             {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUPTUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }
                                         ],
                                         reuse=False)

    ppo = PPO(
        env_spec=env_spec,
        config_or_config_dict={
            "gamma": 0.995,
            "lam": 0.98,
            "policy_train_iter": 10,
            "value_func_train_iter": 10,
            "clipping_range": None,
            "beta": 1.0,
            "eta": 50,
            "value_func_memory_size": 10,
            "log_var_init": -1.0,
            "kl_target": 0.003,
            "policy_lr": 0.01,
            "value_func_lr": 0.01,
            "value_func_train_batch_size": 10,
            "lr_multiplier": 1.0
        },
        value_func=mlp_v,
        stochastic_policy=policy,
        name=name + 'ppo'
    )
    agent = Agent(env=env, env_spec=env_spec,
                  algo=ppo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + 'agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     sample_type='trajectory',
                                                     store_flag=True))
    )

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
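
Example 5 runs PPO with clipping_range=None, which together with the beta, eta, kl_target and lr_multiplier settings points at a KL-penalty variant with an adaptive penalty coefficient. The sketch below shows the common shape of that adaptation heuristic; the exact constants and bounds Baconian uses are an assumption.

# Illustrative sketch only: adapt the KL penalty coefficient (and a learning-rate
# multiplier) after each policy update, based on the measured KL divergence.
def adapt_kl_penalty(kl, beta, lr_multiplier, kl_target=0.003):
    if kl > 2.0 * kl_target:
        beta = min(35.0, 1.5 * beta)          # penalise the policy update harder
        lr_multiplier = max(0.1, lr_multiplier / 1.5)
    elif kl < 0.5 * kl_target:
        beta = max(1.0 / 35.0, beta / 1.5)    # relax the penalty
        lr_multiplier = min(10.0, lr_multiplier * 1.5)
    return beta, lr_multiplier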
Example No. 6
    def test_integration_with_dqn(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name='mlp_q',
                                  name_scope='mlp_q',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        dqn = DQN(env_spec=env_spec,
                  name='dqn_test',
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.001,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  value_func=mlp_q)
        dqn.init()
        st = env.reset()
        from baconian.common.sampler.sample_data import TransitionData
        a = TransitionData(env_spec)
        res = []
        for i in range(100):
            ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False)
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
            dqn.append_to_memory(a)
        res.append(
            dqn.train(batch_data=a,
                      train_iter=10,
                      sess=None,
                      update_target=True)['average_loss'])
        res.append(
            dqn.train(batch_data=None,
                      train_iter=10,
                      sess=None,
                      update_target=True)['average_loss'])
        self.assertTrue(dqn in dqn.recorder._obj_log)
        self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
        self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
        self.assertTrue(
            np.equal(np.array(res), [
                x['log_val']
                for x in dqn.recorder._obj_log[dqn]['average_loss']
            ]).all())

        self.assertTrue(len(Logger()._registered_recorders) > 0)
        self.assertTrue(dqn.recorder in Logger()._registered_recorders)

        Logger().flush_recorder()
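
The test above drives dqn.train() on a hand-built TransitionData batch. The numpy sketch below shows the regression target a DQN training step of this kind constructs from such a batch; it is illustrative, not the library's internal code.

# Illustrative sketch only: y = r + gamma * (1 - done) * max_a' Q_target(s', a').
import numpy as np


def dqn_targets(rewards, dones, q_target_next, gamma=0.99):
    """rewards, dones: shape (B,); q_target_next: shape (B, n_actions)."""
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * q_target_next.max(axis=1)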
Example No. 7
    def test_mlp_norm_dist_policy(self):
        env = make('Pendulum-v0')
        env.reset()
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                             name='mlp_policy',
                                             name_scope='mlp_policy',
                                             mlp_config=[{
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }, {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUPTUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }],
                                             output_high=None,
                                             output_low=None,
                                             output_norm=None,
                                             input_norm=None,
                                             reuse=False)
        policy.init()
        dist_info = policy.get_dist_info()
        self.assertTrue(
            np.equal(dist_info[0]['shape'],
                     policy.mean_output.shape.as_list()).all())
        self.assertTrue(
            np.equal(dist_info[1]['shape'],
                     policy.logvar_output.shape.as_list()).all())
        for _ in range(10):
            ac = policy.forward(obs=env.observation_space.sample())
            self.assertTrue(env.action_space.contains(ac[0]))
        p2 = policy.make_copy(name='test',
                              name_scope='mlp_policy_2',
                              reuse=False)
        p2.init()
        self.assertGreater(len(policy.parameters('tf_var_list')), 0)
        self.assertGreater(len(p2.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p2.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertNotEqual(id(var1), id(var2))

        p3 = policy.make_copy(name='mlp_policy_ttt',
                              name_scope='mlp_policy',
                              reuse=True)
        p3.init()
        self.assertGreater(len(p3.parameters('tf_var_list')), 0)
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              p3.parameters('tf_var_list')):
            self.assertEqual(var1.shape, var2.shape)
            self.assertEqual(id(var1), id(var2))

        # policy.copy_from(p2)
        res_not_true = []
        for var1, var2, var3 in zip(policy.parameters('tf_var_list'),
                                    p2.parameters('tf_var_list'),
                                    p3.parameters('tf_var_list')):
            re1, re2, re3 = self.sess.run([var1, var2, var3])
            res_not_true.append(np.isclose(re1, re2).all())
            res_not_true.append(np.isclose(re3, re2).all())
            self.assertTrue(np.isclose(re1, re3).all())
        self.assertFalse(np.array(res_not_true).all())

        policy.copy_from(p2)

        for var1, var2, var3 in zip(policy.parameters('tf_var_list'),
                                    p2.parameters('tf_var_list'),
                                    p3.parameters('tf_var_list')):
            re1, re2, re3 = self.sess.run([var1, var2, var3])
            self.assertTrue(np.isclose(re1, re3).all())
            self.assertTrue(np.isclose(re2, re3).all())
            self.assertTrue(np.isclose(re1, re2).all())
Example No. 8
    def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name + 'mlp_q',
                                  name=name + 'mlp_q',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4)
        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name_scope=name + 'mlp_policy',
                                        name=name + 'mlp_policy',
                                        mlp_config=[{
                                            "ACT": "RELU",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "1",
                                            "N_UNITS": 16,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }, {
                                            "ACT": "LINEAR",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "OUPTUT",
                                            "N_UNITS":
                                            env_spec.flat_action_dim,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }],
                                        reuse=False)
        self.assertTrue(len(policy.parameters('tf_var_list')) == 4)

        ddpg = DDPG(env_spec=env_spec,
                    config_or_config_dict={
                        "REPLAY_BUFFER_SIZE": 10000,
                        "GAMMA": 0.999,
                        "CRITIC_LEARNING_RATE": 0.001,
                        "ACTOR_LEARNING_RATE": 0.001,
                        "DECAY": 0.5,
                        "BATCH_SIZE": 50,
                        "TRAIN_ITERATION": 1,
                        "critic_clip_norm": 0.1,
                        "actor_clip_norm": 0.1,
                    },
                    value_func=mlp_q,
                    policy=policy,
                    name=name,
                    replay_buffer=None)
        return ddpg, locals()
Example No. 9
    def create_env(self, env_id):
        return make(env_id)
Example No. 10
    def test_transition_data(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        a = TransitionData(env_spec)
        st = env.reset()
        for i in range(100):
            ac = env_spec.action_space.sample()
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
        self.assertEqual(a.reward_set.shape[0], 100)
        self.assertEqual(a.done_set.shape[0], 100)
        self.assertEqual(a.action_set.shape[0], 100)
        self.assertEqual(a.state_set.shape[0], 100)
        self.assertEqual(a.new_state_set.shape[0], 100)

        self.assertEqual(a('reward_set').shape[0], 100)
        self.assertEqual(a('done_set').shape[0], 100)

        self.assertEqual(a('state_set').shape[0], 100)
        self.assertEqual(a('new_state_set').shape[0], 100)
        self.assertEqual(a('action_set').shape[0], 100)
        iterator = a.return_generator()
        count = 0
        for st, new_st, ac, reward, terminal in iterator:
            count += 1
            self.assertTrue(env_spec.action_space.contains(ac))
            self.assertTrue(env_spec.obs_space.contains(st))
            self.assertTrue(env_spec.obs_space.contains(new_st))
            self.assertTrue(np.isscalar(reward))
            self.assertTrue(isinstance(terminal, bool))
        self.assertEqual(count, 100)

        a = TransitionData(
            obs_shape=list(np.array(env_spec.obs_space.sample()).shape),
            action_shape=list(np.array(env_spec.action_space.sample()).shape))
        st = env.reset()
        for i in range(100):
            ac = env_spec.action_space.sample()
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
        self.assertEqual(a.reward_set.shape[0], 100)
        self.assertEqual(a.done_set.shape[0], 100)

        self.assertEqual(a.action_set.shape[0], 100)
        self.assertEqual(a.state_set.shape[0], 100)
        self.assertEqual(a.new_state_set.shape[0], 100)

        self.assertEqual(a('reward_set').shape[0], 100)
        self.assertEqual(a('done_set').shape[0], 100)

        self.assertEqual(a('state_set').shape[0], 100)
        self.assertEqual(a('new_state_set').shape[0], 100)
        self.assertEqual(a('action_set').shape[0], 100)

        self.assertTrue(
            np.equal(a.get_mean_of('state_set'),
                     a.apply_op('state_set', np.mean)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('state_set'),
                     a.apply_op('state_set', np.sum)).all())

        self.assertTrue(
            np.equal(a.get_sum_of('reward_set'),
                     a.apply_op('reward_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('reward_set'),
                     a.apply_op('reward_set', np.sum)).all())

        self.assertTrue(
            np.equal(a.get_sum_of('action_set'),
                     a.apply_op('action_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('action_set'),
                     a.apply_op('action_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.apply_op('state_set', np.max, axis=-1),
                     np.max(a('state_set'), axis=-1)).all())

        tmp_action = a('action_set').copy()
        a.apply_transformation(set_name='action_set',
                               func=lambda x: x * 2,
                               direct_apply=False)
        self.assertTrue(np.equal(tmp_action, a('action_set')).all())
        a.apply_transformation(set_name='action_set',
                               func=lambda x: x * 2,
                               direct_apply=True)
        self.assertTrue(np.equal(tmp_action * 2.0, a('action_set')).all())
        try:
            a.apply_transformation(set_name='action_set',
                                   func=lambda _: np.array([1, 2, 3]),
                                   direct_apply=True)
        except TransformationResultedToDifferentShapeError as e:
            pass
        else:
            raise TypeError

        a.apply_transformation(set_name='action_set',
                               func=lambda x: x // 2,
                               direct_apply=True)
        self.assertTrue(np.equal(tmp_action, a('action_set')).all())

        index = np.arange(len(a._internal_data_dict['state_set'][0])).tolist()
        b = a.get_copy()
        a.shuffle(index=list(index))
        for i in range(len(index)):
            for key in a._internal_data_dict.keys():
                self.assertTrue(
                    np.equal(np.array(a._internal_data_dict[key][0][i]),
                             np.array(b._internal_data_dict[key][0][i])).all())

        iterator = a.return_generator()
        count = 0
        for st, new_st, ac, reward, terminal in iterator:
            count += 1
            self.assertTrue(env_spec.action_space.contains(ac))
            self.assertTrue(env_spec.obs_space.contains(st))
            self.assertTrue(env_spec.obs_space.contains(new_st))
            self.assertTrue(np.isscalar(reward))
            self.assertTrue(isinstance(terminal, bool))
        self.assertEqual(count, 100)
        count = 0
        iter = a.return_generator(batch_size=10)
        for st, new_st, ac, reward, terminal in iter:
            self.assertEqual(len(st), 10)
            self.assertEqual(len(new_st), 10)
            self.assertEqual(len(ac), 10)
            self.assertEqual(len(reward), 10)
            self.assertEqual(len(terminal), 10)
            count += 1
        self.assertEqual(count, 10)
        count = 0
        iter = a.return_generator(batch_size=10, infinite_run=True)
        for st, new_st, ac, reward, terminal in iter:
            self.assertEqual(len(st), 10)
            self.assertEqual(len(new_st), 10)
            self.assertEqual(len(ac), 10)
            self.assertEqual(len(reward), 10)
            self.assertEqual(len(terminal), 10)
            count += 1
            if count > 20:
                break
        self.assertGreater(count, 20)

        a.append_new_set(name='test',
                         data_set=np.ones_like(
                             a._internal_data_dict['state_set'][0]),
                         shape=a._internal_data_dict['state_set'][1])
        iter = a.return_generator(batch_size=10,
                                  assigned_keys=('state_set', 'new_state_set',
                                                 'action_set', 'reward_set',
                                                 'done_set', 'test'))
        count = 0
        for st, new_st, ac, reward, terminal, test in iter:
            self.assertEqual(len(test), 10)
            count += 1
        self.assertEqual(count, 10)

        a.reset()
        self.assertEqual(a.reward_set.shape[0], 0)
        self.assertEqual(a.done_set.shape[0], 0)

        self.assertEqual(a.action_set.shape[0], 0)
        self.assertEqual(a.state_set.shape[0], 0)
        self.assertEqual(a.new_state_set.shape[0], 0)

        self.assertEqual(a('reward_set').shape[0], 0)
        self.assertEqual(a('done_set').shape[0], 0)

        self.assertEqual(a('state_set').shape[0], 0)
        self.assertEqual(a('new_state_set').shape[0], 0)
        self.assertEqual(a('action_set').shape[0], 0)
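
Much of the test above exercises the batched generator of TransitionData, which must yield exactly n / batch_size minibatches per pass (or loop forever with infinite_run=True). The sketch below reproduces that behaviour over plain parallel arrays; it is not the Baconian class.

# Illustrative sketch only: yield fixed-size minibatches from parallel arrays,
# optionally looping forever.
def batched_generator(arrays, batch_size, infinite_run=False):
    n = len(arrays[0])
    while True:
        for start in range(0, n - batch_size + 1, batch_size):
            yield tuple(a[start:start + batch_size] for a in arrays)
        if not infinite_run:
            break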
Example No. 11
    def test_init(self):
        sess = self.sess
        env = make('Pendulum-v0')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        action_dim = env_spec.flat_action_dim
        state_dim = env_spec.flat_obs_dim
        # bs_shape = tf.placeholder(dtype=tf.int8, shape=[])
        bs_shape = 4
        action_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_dim])
        state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim])

        mean_old = tf.layers.dense(inputs=state_ph,
                                   name='layer1',
                                   units=action_dim)

        mean2 = tf.layers.dense(inputs=state_ph,
                                name='layer2',
                                units=action_dim)

        # mean1 = tf.get_variable(name='mean1', shape=[bs_shape, action_dim], dtype=tf.float32)
        var1 = tf.get_variable(name='var1', shape=[action_dim], dtype=tf.float32,
                               initializer=tf.initializers.random_uniform(0.0, 1.0))

        # mean2 = tf.get_variable(name='mean2', shape=[bs_shape, action_dim], dtype=tf.float32)
        var2 = tf.get_variable(name='var2', shape=[action_dim], dtype=tf.float32,
                               initializer=tf.initializers.random_uniform(0.0, 1.0))

        # var1 = tf.get_variable('logvars', (10, action_dim), tf.float32,
        #                        tf.constant_initializer(0.0))
        # var1 = tf.expand_dims(tf.reduce_sum(var1, axis=0), axis=0)
        # var1 = tf.tile(var1, [bs_shape, 1])
        #
        # var2 = tf.get_variable('logvars2', (10, action_dim), tf.float32,
        #                        tf.constant_initializer(0.0))
        # var2 = tf.expand_dims(tf.reduce_sum(var2, axis=0), 0)
        # var2 = tf.tile(var2, [bs_shape, 1])

        dist_old = tfp.distributions.MultivariateNormalDiag(mean_old, tf.sqrt(var1), validate_args=True)
        dis2 = tfp.distributions.MultivariateNormalDiag(mean2, tf.sqrt(var2), validate_args=True)

        dist_norm1 = tfp.distributions.Normal(mean_old, var1)
        dist_norm2 = tfp.distributions.Normal(mean2, var2)

        print(dist_old, dis2)
        # dis1 = tfp.distributions.Independent(dis1, reinterpreted_batch_ndims=1)
        # dis2 = tfp.distributions.Independent(dis2, reinterpreted_batch_ndims=1)

        # op = tf.train.AdamOptimizer(learning_rate=0.1).minimize(tfp.distributions.kl_divergence(dis1, dis2),
        #                                                         var_list=[mean1, var1])

        ac = [env_spec.action_space.sample() for _ in range(bs_shape)]
        ac = make_batch(np.array(ac), original_shape=env_spec.action_shape)

        state = [env_spec.obs_space.sample() for _ in range(bs_shape)]
        state = make_batch(np.array(state), original_shape=env_spec.obs_shape)

        feed_dict = {
            state_ph: state,
            action_ph: ac
        }
        sess.run(tf.global_variables_initializer())

        kl, entropy, logp, log_p_old = kl_entropy_logprob_from_pat_cody(old_mean=mean_old,
                                                                        old_var=var1,
                                                                        mean=mean2,
                                                                        var=var2,
                                                                        feed_dict=feed_dict,
                                                                        sess=sess,
                                                                        action_ph=action_ph,
                                                                        action_dim=action_dim)

        kl_tfp = sess.run(tf.reduce_mean(tfp.distributions.kl_divergence(dist_old, dis2)), feed_dict=feed_dict)
        entropy_tfp = sess.run(tf.reduce_mean(dis2.entropy()), feed_dict=feed_dict)

        log_prob_tfp = sess.run(dis2.log_prob(value=ac), feed_dict=feed_dict)
        log_p_old_tfp = sess.run(dist_old.log_prob(value=ac), feed_dict=feed_dict)

        test_log_prob_tfp = dis2.log_prob(ac) + tf.cast(0.5 * np.log(2. * np.pi * action_dim), dtype=tf.float32)
        test_log_prob_tfp_old = dist_old.log_prob(ac) + tf.cast(0.5 * np.log(2. * np.pi * action_dim), dtype=tf.float32)

        print("ac shape {}".format(ac.shape))
        print("a sample from dis1 shape {}".format(sess.run(dist_old.sample(), feed_dict=feed_dict).shape))
        print("shape of dis under feeddict {}".format(
            sess.run([dist_old.batch_shape_tensor(), dist_old.event_shape_tensor()],
                     feed_dict=feed_dict)))

        # print(sess.run(dis2.log_prob(value=ac)).shape)
        # print(sess.run(dis1.log_prob(value=ac)).shape)
        for i in range(bs_shape):
            feed_dict_i = {
                state_ph: make_batch(state[i], env_spec.obs_shape),
                action_ph: make_batch(ac[i], env_spec.action_shape)
            }
            print("i dis2 log prob: {}".format(sess.run(dis2.log_prob(value=ac[i]), feed_dict=feed_dict_i)))
            print("i dis1 log prob: {}".format(sess.run(dist_old.log_prob(value=ac[i]), feed_dict=feed_dict_i)))

        print(kl, kl_tfp)
        print(entropy, entropy_tfp)

        print(logp, log_prob_tfp)
        print(log_p_old, log_p_old_tfp)
        print('new log p {}'.format(sess.run(test_log_prob_tfp, feed_dict=feed_dict)))
        print('new log p old {}'.format(sess.run(test_log_prob_tfp_old, feed_dict=feed_dict)))

        print('new log p norm {}'.format(sess.run(tf.reduce_sum(dist_norm1.log_prob(ac), axis=1), feed_dict=feed_dict)))
        print('new log p old norm {}'.format(
            sess.run(tf.reduce_sum(dist_norm2.log_prob(ac), axis=1), feed_dict=feed_dict)))

        self.assertTrue(np.isclose(logp, log_prob_tfp).all())
        self.assertTrue(np.isclose(log_p_old, log_p_old_tfp).all())
        self.assertTrue(np.isclose(kl, kl_tfp).all())
        self.assertTrue(np.isclose(entropy, entropy_tfp).all())

        kl, entropy, logp, log_p_old = kl_entropy_logprob_from_mvn(old_mean=mean_old,
                                                                   old_var=var1,
                                                                   mean=mean2,
                                                                   var=var2,
                                                                   feed_dict=feed_dict,
                                                                   sess=sess,
                                                                   action_ph=action_ph,
                                                                   action_dim=action_dim)
        print(kl, entropy, logp, log_p_old)
        self.assertTrue(np.isclose(logp, log_prob_tfp).all())
        self.assertTrue(np.isclose(log_p_old, log_p_old_tfp).all())
        self.assertTrue(np.isclose(entropy, entropy_tfp).all())
        self.assertTrue(np.isclose(kl, kl_tfp).all())
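
The test compares hand-derived log-probability, entropy and KL values against tensorflow_probability for diagonal Gaussians. For reference, the numpy sketch below implements the closed forms being checked.

# Illustrative sketch only: closed forms for diagonal Gaussians with mean mu and
# element-wise variance var.
#   log p(x)   = -0.5 * sum(log(2*pi*var) + (x - mu)^2 / var)
#   entropy    =  0.5 * sum(log(2*pi*e*var))
#   KL(p || q) =  0.5 * sum(log(var_q/var_p) + (var_p + (mu_p - mu_q)^2)/var_q - 1)
import numpy as np


def diag_gauss_logprob(x, mu, var):
    return -0.5 * np.sum(np.log(2.0 * np.pi * var) + (x - mu) ** 2 / var, axis=-1)


def diag_gauss_entropy(var):
    return 0.5 * np.sum(np.log(2.0 * np.pi * np.e * var), axis=-1)


def diag_gauss_kl(mu_p, var_p, mu_q, var_q):
    return 0.5 * np.sum(np.log(var_q / var_p)
                        + (var_p + (mu_p - mu_q) ** 2) / var_q - 1.0, axis=-1)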
Example No. 12
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel']
    )
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=exp_config['MPC'],
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy')
    )
    algo.set_terminal_reward_function_for_dynamics_env(reward_func=REWARD_FUNC_DICT['Pendulum-v0'](),
                                                       terminal_func=FixedEpisodeLengthTerminalFunc(
                                                           max_step_length=env.unwrapped._max_episode_steps,
                                                           step_count_fn=algo.dynamics_env.total_step_count_fn), )
    agent = Agent(env=env, env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {'func': agent.train,
                               'args': list(),
                               'kwargs': dict()},
            'train_algo': None,
            'test_algo': {'func': agent.test,
                          'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics,
                              'args': list(),
                              'kwargs': dict(sample_count=100, env=env)},
            'sample_from_real_env': {'func': agent.sample,
                                     'args': list(),
                                     'kwargs': dict(sample_count=10,
                                                    env=agent.env,
                                                    in_which_status='TRAIN',
                                                    store_flag=True)},
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        }
    )

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
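
Example 12 runs ModelPredictiveControl with a UniformRandomPolicy as the action proposer, i.e. random-shooting MPC over the learned dynamics model. The sketch below shows the planning step such a controller performs; dynamics_fn, reward_fn and action_sampler are hypothetical stand-ins for the learned model, the Pendulum reward function, and the proposal policy configured above.

# Illustrative sketch only: score random candidate action sequences under the
# learned dynamics and execute the first action of the best one.
import numpy as np


def random_shooting_mpc(state, dynamics_fn, reward_fn, action_sampler,
                        horizon=10, n_candidates=100):
    best_return, best_first_action = -np.inf, None
    for _ in range(n_candidates):
        s, total = state, 0.0
        actions = [action_sampler() for _ in range(horizon)]
        for a in actions:
            next_s = dynamics_fn(s, a)
            total += reward_fn(s, a, next_s)
            s = next_s
        if total > best_return:
            best_return, best_first_action = total, actions[0]
    return best_first_action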
Example No. 13
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         Q_NET_L1_NORM_SCALE=0.001,
                                         Q_NET_L2_NORM_SCALE=0.001,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)

    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=dqn,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=100,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 14
def task_fn():
    # create the gym environment by make function
    env = make('Pendulum-v0')
    # give your experiment a name which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network to approximate q function of DDPG
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # construct the neural network to approximate policy for DDPG
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)
    # construct the DDPG algorithms
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural network based global dynamics model to approximate the state transition of environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(
            dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(
            dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    # finally, construct the Dyna algorithms with a model free algorithm DDGP, and a NN model.
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN-based dynamics model a proper environment that can serve as a sampling source for DDPG,
    # a reward function and a terminal function need to be set.

    # For demonstration purposes only, we use a random reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct the agent, with an additional exploration strategy if needed.
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # construct the training flow, called the Dyna flow. It defines how the training proceeds and what the terminal condition is.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()
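These task functions are not usually invoked directly; Baconian's example scripts hand them to an experiment runner after setting a global end point so that experiment.run() knows when to stop. A minimal launch sketch, assuming the quickstart-style helpers GlobalConfig and single_exp_runner are available at these import paths in your Baconian version, with a hypothetical budget of 2000 training samples:

from baconian.config.global_config import GlobalConfig
from baconian.core.experiment_runner import single_exp_runner

# stop the experiment once the agent has consumed 2000 real training samples
# (hypothetical budget; the status key matches the counter used throughout these examples)
GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                   dict(TOTAL_AGENT_TRAIN_SAMPLE_COUNT=2000))
single_exp_runner(task_fn, del_if_log_path_existed=True)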
Example No. 15
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                learning_rate=0.01,
                                                mlp_config=[{
                                                    "ACT": "TANH",
                                                    "B_INIT_VALUE": 0.0,
                                                    "NAME": "1",
                                                    "L1_NORM": 0.0,
                                                    "L2_NORM": 0.0,
                                                    "N_UNITS": 128,
                                                    "TYPE": "DENSE",
                                                    "W_NORMAL_STDDEV": 0.03
                                                }, {
                                                    "ACT": "LINEAR",
                                                    "B_INIT_VALUE": 0.0,
                                                    "NAME": "OUTPUT",
                                                    "L1_NORM": 0.0,
                                                    "L2_NORM": 0.0,
                                                    "N_UNITS": env_spec.flat_obs_dim,
                                                    "TYPE": "DENSE",
                                                    "W_NORMAL_STDDEV": 0.03
                                                }])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = create_train_test_flow(test_every_sample_count=10,
                                  train_every_sample_count=10,
                                  start_test_after_sample_count=5,
                                  start_train_after_sample_count=5,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (),
                                                      dict(sample_count=10)),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict(sample_count=100,
                                                             env=agent.env,
                                                             store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Example No. 16
def pendulum_task_fn():
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=50,
                               sample_type='transition',
                               env=agent.algo.dynamics_env,
                               in_which_status='TRAIN',
                               store_flag=False)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
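Note that exp_config in this benchmark task is read from the enclosing module rather than defined inside pendulum_task_fn. Also, instead of a random reward, this example plugs an analytic Pendulum reward into the learned dynamics environment via REWARD_FUNC_DICT['Pendulum-v0']. For reference, Gym's Pendulum-v0 reward can be sketched from the observation [cos(theta), sin(theta), theta_dot] and the torque action as follows; this is a generic sketch, not Baconian's REWARD_FUNC_DICT entry:

import numpy as np

def pendulum_reward(obs, action):
    """Negative quadratic cost on angle, angular velocity and torque (Pendulum-v0)."""
    theta = np.arctan2(obs[1], obs[0])  # recover the normalized angle from cos/sin
    theta_dot = obs[2]
    torque = float(np.clip(np.asarray(action).ravel()[0], -2.0, 2.0))
    return -(theta ** 2 + 0.1 * theta_dot ** 2 + 0.001 * torque ** 2)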
Example No. 17
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
import numpy as np

env = make("HalfCheetah-v2")
env_spec = EnvSpec(obs_space=env.observation_space,
                   action_space=env.action_space)

OBS_DIM = env_spec.flat_obs_dim
HID1_SIZE = 400
HID2_SIZE = 300

POLICY_HID_MULTI = 10
ACT_DIM = env_spec.flat_action_dim
POLICY_HID1_SIZE = 400
POLICY_HID2_SIZE = 300

CHEETAH_BENCHMARK_CONFIG_DICT = {
    'env_id': "HalfCheetah-v2",
    'MLP_V': {
        'mlp_config': [{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": HID1_SIZE,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": np.sqrt(1 / OBS_DIM)
        }, {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
Example No. 18
def mountiancar_task_fn():
    exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT

    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCarContinuous-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    n_actions = env.action_space.shape[0]
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OrnsteinUhlenbeckActionNoise(
                          mu=np.zeros(n_actions),
                          sigma=0.5 * np.ones(n_actions)),
                      noise_weight_scheduler=ConstantScheduler(value=1),
                      action_weight_scheduler=ConstantScheduler(value=1.0)),
                  reset_noise_every_terminal_state=True,
                  name=name + '_agent')

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
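The benchmark above perturbs DDPG's actions with Ornstein-Uhlenbeck noise (OrnsteinUhlenbeckActionNoise with mu=0 and sigma=0.5). A generic sketch of the OU process such a noise source draws from; illustrative only, not Baconian's noise class:

import numpy as np

class SimpleOUNoise:
    """Mean-reverting Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        # restart the process at its mean, e.g. at every terminal state
        self.x = np.copy(self.mu)

    def __call__(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x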
Example No. 19
    def test_func(self):
        env = make('Pendulum-v0')
        env.reset()
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                             name='mlp_policy',
                                             name_scope='mlp_policy',
                                             mlp_config=[{
                                                 "ACT": "RELU",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "1",
                                                 "N_UNITS": 16,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }, {
                                                 "ACT": "LINEAR",
                                                 "B_INIT_VALUE": 0.0,
                                                 "NAME": "OUTPUT",
                                                 "N_UNITS": env_spec.flat_action_dim,
                                                 "TYPE": "DENSE",
                                                 "W_NORMAL_STDDEV": 0.03
                                             }],
                                             output_high=None,
                                             output_low=None,
                                             output_norm=None,
                                             input_norm=None,
                                             reuse=False)
        policy.init()
        print(
            policy.compute_dist_info(name='entropy',
                                     feed_dict={
                                         policy.state_input:
                                         make_batch(
                                             env_spec.obs_space.sample(),
                                             original_shape=env_spec.obs_shape)
                                     }))
        print(
            policy.compute_dist_info(name='prob',
                                     value=env_spec.action_space.sample(),
                                     feed_dict={
                                         policy.state_input:
                                         make_batch(
                                             env_spec.obs_space.sample(),
                                             original_shape=env_spec.obs_shape)
                                     }))
        new_policy = policy.make_copy(
            reuse=False,
            name='new_p',
            name_scope='mlp_policy_2',
        )
        new_policy.init()
        for var1, var2 in zip(policy.parameters('tf_var_list'),
                              new_policy.parameters('tf_var_list')):
            print(var1.name)
            print(var2.name)
            self.assertNotEqual(var1.name, var2.name)
            self.assertNotEqual(id(var1), id(var2))
        obs1 = make_batch(
            env_spec.obs_space.sample(),
            original_shape=env_spec.obs_shape,
        )
        obs2 = make_batch(env_spec.obs_space.sample(),
                          original_shape=env_spec.obs_shape)
        kl1 = policy.compute_dist_info(name='kl',
                                       other=new_policy,
                                       feed_dict={
                                           policy.state_input: obs1,
                                           new_policy.state_input: obs2
                                       })
        kl2 = self.sess.run(policy.kl(other=new_policy),
                            feed_dict={
                                policy.state_input: obs1,
                                new_policy.state_input: obs2
                            })
        self.assertTrue(np.isclose(kl1, kl2).all())
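The assertion above checks that the policy's analytic 'kl' dist-info matches a direct evaluation of policy.kl in the session. For reference, the closed-form KL divergence between two diagonal Gaussian policies can be sketched as follows; this is the generic formula, not Baconian's implementation:

import numpy as np

def diag_gaussian_kl(mean_p, logstd_p, mean_q, logstd_q):
    """KL(p || q) for diagonal Gaussians, summed over action dimensions."""
    var_p = np.exp(2.0 * logstd_p)
    var_q = np.exp(2.0 * logstd_q)
    return np.sum(logstd_q - logstd_p
                  + (var_p + (mean_p - mean_q) ** 2) / (2.0 * var_q)
                  - 0.5)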
Example No. 20
def task_fn():
    name = 'mpc_ModifiedHalfCheetah'
    env = make('ModifiedHalfCheetah')
    env_spec = env.env_spec

    mlp_dyna = MBMPC_MLPDynamics(env_spec=env.env_spec,
                                 name_scope=name + '_mlp_dyna',
                                 name=name + '_mlp_dyna',
                                 learning_rate=1e-3,
                                 mlp_config=[{
                                     "ACT": "TANH",
                                     "B_INIT_VALUE": 0.0,
                                     "NAME": "1",
                                     "L1_NORM": 0.0,
                                     "L2_NORM": 0.0,
                                     "N_UNITS": 128,
                                     "TYPE": "DENSE",
                                     "W_NORMAL_STDDEV": 0.03
                                 }, {
                                     "ACT": "TANH",
                                     "B_INIT_VALUE": 0.0,
                                     "NAME": "2",
                                     "L1_NORM": 0.0,
                                     "L2_NORM": 0.0,
                                     "N_UNITS": 64,
                                     "TYPE": "DENSE",
                                     "W_NORMAL_STDDEV": 0.03
                                 }, {
                                     "ACT": "LINEAR",
                                     "B_INIT_VALUE": 0.0,
                                     "NAME": "OUPTUT",
                                     "L1_NORM": 0.0,
                                     "L2_NORM": 0.0,
                                     "N_UNITS": env_spec.flat_obs_dim,
                                     "TYPE": "DENSE",
                                     "W_NORMAL_STDDEV": 0.03
                                 }])

    # buffer
    rl_size = 500  # default 1000
    random_size = 500  # default 1000

    ### algo
    horizon = 20
    dyna_epoch = 60

    ### agent
    max_step = 500  # default 1000 # TODO: 9.22 should max_step == rl_size == random_size?
    batch_size = 128
    rand_rl_ratio = 0.1
    random_trajectory = 1  # TODO: 9.22 Are there situations where the trajectory num must != 1?
    on_policy_trajectory = 1
    on_policy_iter = 10
    num_simulated_paths = 50  # default 1000

    algo = ModelBasedModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=horizon,
                                   SAMPLED_PATH_NUM=num_simulated_paths,
                                   dynamics_model_train_iter=dyna_epoch),
        name=name + '_algo',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uniform_random'))

    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=MBMPC_HalfCheetah_CostFunc(name='cost_fn'),
        terminal_func=MBMPC_HalfCheetah_TerminalFunc(name='terminal_fn'))
    agent = MB_MPC_Agent(name=name + '_agent',
                         env=env,
                         env_spec=env_spec,
                         algo=algo,
                         exploration_strategy=None,
                         algo_saving_scheduler=None)
    flow = create_train_test_flow(env=env,
                                  env_spec=env_spec,
                                  rl_size=rl_size,
                                  max_step=max_step,
                                  batch_size=batch_size,
                                  random_size=random_size,
                                  rand_rl_ratio=rand_rl_ratio,
                                  train_iter=dyna_epoch,
                                  on_policy_iter=on_policy_iter,
                                  random_trajectory=random_trajectory,
                                  on_policy_trajectory=on_policy_trajectory,
                                  num_simulated_paths=num_simulated_paths,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict()),
                                  train_every_sample_count=None,
                                  test_every_sample_count=None,
                                  start_train_after_sample_count=None,
                                  start_test_after_sample_count=None,
                                  flow_type='MBMPC_TrainFlow')

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
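ModelBasedModelPredictiveControl with SAMPLED_HORIZON and SAMPLED_PATH_NUM corresponds to random-shooting MPC: sample candidate action sequences, roll each through the learned dynamics model, score it with the cost function, and execute the first action of the cheapest sequence. A generic sketch of that loop; illustrative only, not Baconian's implementation:

import numpy as np

def random_shooting_mpc(state, dynamics_fn, cost_fn, action_sampler,
                        horizon=20, num_paths=50):
    """Return the first action of the lowest-cost sampled action sequence."""
    best_cost, best_first_action = np.inf, None
    for _ in range(num_paths):
        s, total_cost, first_action = state, 0.0, None
        for t in range(horizon):
            a = action_sampler()              # e.g. a uniform random policy
            if t == 0:
                first_action = a
            s_next = dynamics_fn(s, a)        # rollout through the learned dynamics model
            total_cost += cost_fn(s, a, s_next)
            s = s_next
        if total_cost < best_cost:
            best_cost, best_first_action = total_cost, first_action
    return best_first_action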
Example No. 21
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         Q_NET_L1_NORM_SCALE=0.001,
                                         Q_NET_L2_NORM_SCALE=0.001,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)

    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=dqn,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
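The EpsilonGreedy strategy above mixes random and greedy actions according to a (possibly scheduled) random probability. A generic sketch of the selection rule over a vector of Q-values; illustrative only, not Baconian's EpsilonGreedy class:

import numpy as np

def epsilon_greedy_action(q_values, epsilon, rng=None):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if rng is None:
        rng = np.random.default_rng()
    if rng.random() < epsilon:
        return int(rng.integers(len(q_values)))
    return int(np.argmax(q_values))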
Example No. 22
    def test_standard_scaler(self):
        for env in (make('Pendulum-v0'), make('Acrobot-v1'),
                    make('RoboschoolAnt-v1')):
            for sample_space in (env.observation_space, env.action_space):
                sample_fn = sample_space.sample
                dims = sample_space.flat_dim
                try:
                    # test batch standard scaler
                    standard_scaler = BatchStandardScaler(dims=dims)
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())
                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())
                    data = standard_scaler.inverse_process(data)
                    self.assertTrue(
                        np.isclose(data, np.array(data_list)).all())

                    # test running standard scaler
                    standard_scaler = RunningStandardScaler(dims=dims)
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    standard_scaler.update_scaler(np.array(data_list))
                    self.assertEqual(standard_scaler._data_count, 100)
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())

                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())
                    # test update function
                    new_data_list = []
                    for i in range(100):
                        new_data_list.append(sample_fn())
                    standard_scaler.update_scaler(np.array(new_data_list))
                    self.assertEqual(standard_scaler._data_count, 200)

                    data_list += new_data_list
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())

                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())

                    # test running scaler with given data
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    standard_scaler = RunningStandardScaler(
                        dims=dims, init_data=np.array(data_list))

                    self.assertEqual(standard_scaler._data_count, 100)
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())
                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())
                    # test update of running scaler with given data
                    new_data_list = []
                    for i in range(100):
                        new_data_list.append(sample_fn())
                    standard_scaler.update_scaler(np.array(new_data_list))
                    self.assertEqual(standard_scaler._data_count, 200)

                    data_list += new_data_list
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())

                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())

                    # test running scaler with given initial mean, var.
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    standard_scaler = RunningStandardScaler(
                        dims=dims,
                        init_mean=np.mean(data_list, axis=0),
                        init_var=np.var(data_list, axis=0),
                        init_mean_var_data_count=100)

                    self.assertEqual(standard_scaler._data_count, 100)
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())
                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())

                    new_data_list = []
                    for i in range(100):
                        new_data_list.append(sample_fn())
                    standard_scaler.update_scaler(np.array(new_data_list))
                    self.assertEqual(standard_scaler._data_count, 200)

                    data_list += new_data_list
                    data = standard_scaler.process(np.array(data_list))
                    self.assertTrue(
                        np.isclose(np.mean(data, axis=0), 0.0).all())

                    # TODO a theoretical bound should be given
                    self.assertTrue(
                        np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all())
                except ShapeNotCompatibleError as e:
                    from baconian.common.spaces import Box
                    if isinstance(sample_space, Box):
                        raise ValueError
                    else:
                        pass
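The RunningStandardScaler behaviour exercised above (absorbing batches incrementally while processed data stays near zero mean and unit variance) reduces to a parallel mean/variance update. A minimal generic sketch of that update; not Baconian's implementation:

import numpy as np

class SimpleRunningScaler:
    """Running standardizer using the parallel (Chan et al.) mean/variance update."""

    def __init__(self, dims):
        self.mean, self.var, self.count = np.zeros(dims), np.ones(dims), 0

    def update(self, batch):
        batch = np.asarray(batch, dtype=np.float64)
        b_mean, b_var, b_n = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        total = self.count + b_n
        delta = b_mean - self.mean
        self.mean = self.mean + delta * b_n / total
        self.var = (self.var * self.count + b_var * b_n
                    + delta ** 2 * self.count * b_n / total) / total
        self.count = total

    def process(self, data):
        return (data - self.mean) / np.sqrt(self.var + 1e-8)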
Example No. 23
def inverted_pendulum_task_fn():
    exp_config = INVERTED_PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('InvertedPendulum-v2')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          **exp_config['MLP_V'])
    policy = NormalDistributionMLPPolicy(
        env_spec=env_spec,
        name_scope=name + 'mlp_policy',
        name=name + 'mlp_policy',
        **exp_config['POLICY'],
        output_low=env_spec.action_space.low,
        output_high=env_spec.action_space.high,
        reuse=False)

    ppo = PPO(env_spec=env_spec,
              **exp_config['PPO'],
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + '_ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                               sample_trajectory_flag=True),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               sample_type='trajectory',
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
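This benchmark trains PPO, whose policy update maximizes a clipped surrogate objective over probability ratios between the new and old policy. A generic NumPy sketch of that objective; the clip range of 0.2 is a typical default and not taken from exp_config:

import numpy as np

def ppo_clip_objective(ratio, advantage, clip_range=0.2):
    """Mean clipped surrogate objective: min(r * A, clip(r, 1-eps, 1+eps) * A)."""
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantage
    return np.mean(np.minimum(unclipped, clipped))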
Example No. 24
    def test_min_max(self):
        for env in (make('Pendulum-v0'), make('Acrobot-v1'),
                    make('RoboschoolAnt-v1')):
            for sample_space in (env.observation_space, env.action_space):
                sample_fn = sample_space.sample
                dims = sample_space.flat_dim
                try:
                    print("test {} with sample {} dims {}".format(
                        env, sample_fn, dims))
                    # test batch scaler
                    min_max = BatchMinMaxScaler(dims=dims)
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    data = min_max.process(np.array(data_list))
                    self.assertTrue(
                        np.greater_equal(np.ones(dims), data).all())
                    self.assertTrue(np.less_equal(np.zeros(dims), data).all())
                    # test batch scaler with given range
                    min_max = BatchMinMaxScaler(
                        dims=dims,
                        desired_range=(np.ones(dims) * -1.0,
                                       np.ones(dims) * 5.0))
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())
                    data = min_max.process(np.array(data_list))
                    self.assertTrue(
                        np.greater_equal(np.ones(dims) * 5.0, data).all())
                    self.assertTrue(
                        np.less_equal(np.ones(dims) * -1.0, data).all())
                    self.assertEqual(np.max(data), 5.0)
                    self.assertEqual(np.min(data), -1.0)
                    data = min_max.inverse_process(data)
                    self.assertTrue(
                        np.isclose(data, np.array(data_list)).all())

                    # test batch scaler with given range and given initial data
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())

                    min_max = RunningMinMaxScaler(
                        dims=dims,
                        desired_range=(np.ones(dims) * -1.0,
                                       np.ones(dims) * 5.0),
                        init_data=np.array(data_list))

                    data = min_max.process(np.array(data_list))
                    self.assertTrue(
                        np.greater_equal(np.ones(dims) * 5.0, data).all())
                    self.assertTrue(
                        np.less_equal(np.ones(dims) * -1.0, data).all())
                    self.assertEqual(np.max(data), 5.0)
                    self.assertEqual(np.min(data), -1.0)

                    # test batch scaler with given range and given initial min and max
                    data_list = []
                    for i in range(100):
                        data_list.append(sample_fn())

                    min_max = RunningMinMaxScaler(
                        dims=dims,
                        desired_range=(np.ones(dims) * -1.0,
                                       np.ones(dims) * 5.0),
                        init_min=np.min(np.array(data_list), axis=0),
                        init_max=np.max(np.array(data_list), axis=0))

                    data = min_max.process(np.array(data_list))
                    self.assertTrue(
                        np.greater_equal(np.ones(dims) * 5.0, data).all())
                    self.assertTrue(
                        np.less_equal(np.ones(dims) * -1.0, data).all())
                    self.assertEqual(np.max(data), 5.0)
                    self.assertEqual(np.min(data), -1.0)

                    # test update function by a larger range of data
                    pre_min = np.min(np.array(data_list), axis=0)
                    pre_max = np.max(np.array(data_list), axis=0)
                    data_list = np.array(data_list) * 2.0
                    min_max.update_scaler(data_list)
                    self.assertTrue(
                        np.equal(pre_min * 2.0, min_max._min).all())
                    self.assertTrue(
                        np.equal(pre_max * 2.0, min_max._max).all())
                except ShapeNotCompatibleError as e:
                    from baconian.common.spaces import Box
                    if isinstance(sample_space, Box):
                        raise ValueError
                    else:
                        pass
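The min-max scalers tested above map data into a desired range using either observed or supplied per-dimension minima and maxima; the underlying transform is a simple element-wise affine rescale. A generic sketch:

import numpy as np

def min_max_scale(data, data_min, data_max, low, high, eps=1e-12):
    """Map data from [data_min, data_max] to [low, high] element-wise."""
    scale = (high - low) / np.maximum(data_max - data_min, eps)
    return low + (data - data_min) * scale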
Example No. 25
    def test_transition_data(self):
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        a = TransitionData(env_spec)
        st = env.reset()
        for i in range(100):
            ac = env_spec.action_space.sample()
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
        self.assertEqual(a.reward_set.shape[0], 100)
        self.assertEqual(a.done_set.shape[0], 100)
        self.assertEqual(a.action_set.shape[0], 100)
        self.assertEqual(a.state_set.shape[0], 100)
        self.assertEqual(a.new_state_set.shape[0], 100)

        self.assertEqual(a('reward_set').shape[0], 100)
        self.assertEqual(a('done_set').shape[0], 100)

        self.assertEqual(a('state_set').shape[0], 100)
        self.assertEqual(a('new_state_set').shape[0], 100)
        self.assertEqual(a('action_set').shape[0], 100)

        a = TransitionData(
            obs_shape=list(np.array(env_spec.obs_space.sample()).shape),
            action_shape=list(np.array(env_spec.action_space.sample()).shape))
        st = env.reset()
        for i in range(100):
            ac = env_spec.action_space.sample()
            st_new, re, done, _ = env.step(action=ac)
            a.append(state=st,
                     new_state=st_new,
                     action=ac,
                     done=done,
                     reward=re)
        self.assertEqual(a.reward_set.shape[0], 100)
        self.assertEqual(a.done_set.shape[0], 100)

        self.assertEqual(a.action_set.shape[0], 100)
        self.assertEqual(a.state_set.shape[0], 100)
        self.assertEqual(a.new_state_set.shape[0], 100)

        self.assertEqual(a('reward_set').shape[0], 100)
        self.assertEqual(a('done_set').shape[0], 100)

        self.assertEqual(a('state_set').shape[0], 100)
        self.assertEqual(a('new_state_set').shape[0], 100)
        self.assertEqual(a('action_set').shape[0], 100)

        self.assertTrue(
            np.equal(a.get_mean_of('state_set'),
                     a.apply_op('state_set', np.mean)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('state_set'),
                     a.apply_op('state_set', np.sum)).all())

        self.assertTrue(
            np.equal(a.get_sum_of('reward_set'),
                     a.apply_op('reward_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('reward_set'),
                     a.apply_op('reward_set', np.sum)).all())

        self.assertTrue(
            np.equal(a.get_sum_of('action_set'),
                     a.apply_op('action_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.get_sum_of('action_set'),
                     a.apply_op('action_set', np.sum)).all())
        self.assertTrue(
            np.equal(a.apply_op('state_set', np.max, axis=-1),
                     np.max(a('state_set'), axis=-1)).all())

        tmp_action = a('action_set').copy()
        a.apply_transformation(set_name='action_set',
                               func=lambda x: x * 2,
                               direct_apply=False)
        self.assertTrue(np.equal(tmp_action, a('action_set')).all())
        a.apply_transformation(set_name='action_set',
                               func=lambda x: x * 2,
                               direct_apply=True)
        self.assertTrue(np.equal(tmp_action * 2.0, a('action_set')).all())
        try:
            a.apply_transformation(set_name='action_set',
                                   func=lambda _: np.array([1, 2, 3]),
                                   direct_apply=True)
        except TransformationResultedToDifferentShapeError as e:
            pass
        else:
            raise TypeError

        a.apply_transformation(set_name='action_set',
                               func=lambda x: x // 2,
                               direct_apply=True)
        self.assertTrue(np.equal(tmp_action, a('action_set')).all())

        index = np.arange(len(a._internal_data_dict['state_set'][0])).tolist()
        b = a.get_copy()
        a.shuffle(index=list(index))
        for i in range(len(index)):
            for key in a._internal_data_dict.keys():
                self.assertTrue(
                    np.equal(np.array(a._internal_data_dict[key][0][i]),
                             np.array(b._internal_data_dict[key][0][i])).all())
        a.append_new_set(name='test',
                         data_set=np.ones_like(
                             a._internal_data_dict['state_set'][0]),
                         shape=a._internal_data_dict['state_set'][1])
        a.reset()
        self.assertEqual(a.reward_set.shape[0], 0)
        self.assertEqual(a.done_set.shape[0], 0)

        self.assertEqual(a.action_set.shape[0], 0)
        self.assertEqual(a.state_set.shape[0], 0)
        self.assertEqual(a.new_state_set.shape[0], 0)

        self.assertEqual(a('reward_set').shape[0], 0)
        self.assertEqual(a('done_set').shape[0], 0)

        self.assertEqual(a('state_set').shape[0], 0)
        self.assertEqual(a('new_state_set').shape[0], 0)
        self.assertEqual(a('action_set').shape[0], 0)
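The test above exercises TransitionData's set-based storage and its call-style access such as a('reward_set'). A toy sketch of that interface; illustrative only, Baconian's TransitionData additionally tracks shapes and supports the ops and transformations shown above:

import numpy as np

class MiniTransitionBuffer:
    """Toy stand-in illustrating the TransitionData interface exercised above."""

    def __init__(self):
        self._sets = dict(state_set=[], action_set=[], new_state_set=[],
                          reward_set=[], done_set=[])

    def append(self, state, action, new_state, reward, done):
        for name, value in (('state_set', state), ('action_set', action),
                            ('new_state_set', new_state), ('reward_set', reward),
                            ('done_set', done)):
            self._sets[name].append(value)

    def __call__(self, set_name):
        # mirror TransitionData's call-style access, e.g. buffer('reward_set')
        return np.array(self._sets[set_name])

    def reset(self):
        for data in self._sets.values():
            data.clear()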
Example No. 26
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "Q_NET_L1_NORM_SCALE": 0.01,
                    "Q_NET_L2_NORM_SCALE": 0.01,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # For demonstration purposes only, we use a random reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
            "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
            "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "WARM_UP_DYNAMICS_SAMPLES": 1
        },
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.algo.dynamics_env,
                               in_which_status='TRAIN',
                               store_flag=True)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
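Conceptually, the DynaFlow configured above interleaves four steps, scheduled by real-sample counts rather than by a fixed loop: collect real transitions, fit the dynamics model, roll out synthetic transitions from the learned model, and train DDPG on both data sources. A simplified sketch of that cycle, reusing the same agent calls as the func_dict; illustrative only:

def dyna_cycle_sketch(agent, real_env, model_env, n_iterations=100):
    """One fixed-schedule version of the Dyna cycle (the flow above is count-driven)."""
    for _ in range(n_iterations):
        # 1. collect real transitions into the replay buffer
        agent.sample(env=real_env, sample_count=10, store_flag=True)
        # 2. fit the NN dynamics model on the real data
        agent.train(state='state_dynamics_training')
        # 3. generate synthetic transitions from the learned dynamics environment
        agent.sample(env=model_env, sample_count=10, store_flag=True)
        # 4. train the model-free DDPG algorithm
        agent.train(state='state_agent_training')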
Example No. 27
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(
        env_spec=env_spec,
        policy=policy,
        value_func=mlp_q,
        name=name + '_ddpg',
        **exp_config['DDPG']
    )
    agent = Agent(env=env, env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(noise=NormalActionNoise(),
                                                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                                                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    flow = TrainTestFlow(train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                         config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
                         func_dict={
                             'test': {'func': agent.test,
                                      'args': list(),
                                      'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                                                     sample_trajectory_flag=True),
                                      },
                             'train': {'func': agent.train,
                                       'args': list(),
                                       'kwargs': dict(),
                                       },
                             'sample': {'func': agent.sample,
                                        'args': list(),
                                        'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                                       env=agent.env,
                                                       in_which_status='TRAIN',
                                                       store_flag=True),
                                        },
                         })

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
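
The AgentActionNoiseWrapper used above blends the deterministic policy output with exploration noise according to two schedulers. The snippet below is a minimal, self-contained sketch of that weighted mix, assuming the wrapper computes action_weight * action + noise_weight * noise and clips the result to the action bounds (the actual baconian class may differ in details; noisy_action is a hypothetical helper, not a library function):

import numpy as np

def noisy_action(action, noise_sample, action_weight, noise_weight, low, high):
    # Weighted mix of the policy action and the sampled noise, clipped to the bounds.
    mixed = action_weight * np.asarray(action) + noise_weight * np.asarray(noise_sample)
    return np.clip(mixed, low, high)

# Example with Pendulum-v0 bounds and the constant weights from the benchmark task
# (1.0 for the action, 0.3 for the noise).
a = noisy_action(action=np.array([1.5]),
                 noise_sample=np.random.normal(0.0, 1.0, size=1),
                 action_weight=1.0, noise_weight=0.3,
                 low=-2.0, high=2.0)
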
Example No. 28
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            mlp_config=[{
                "ACT": "RELU",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }, {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUPTUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }])
        mlp_dyna_list.append(mlp_dyna)
    dyna_ensemble_model = ModelEnsemble(n_models=10,
                                        model=mlp_dyna_list,
                                        prediction_type='random',
                                        env_spec=env_spec)
    algo = ModelEnsembleAlgo(env_spec=env_spec,
                             model_free_algo=ddpg,
                             dynamics_model=dyna_ensemble_model,
                             config_or_config_dict=dict(
                                 dynamics_model_train_iter=10,
                                 model_free_algo_train_iter=10,
                                 validation_trajectory_count=2,
                             ))
    # For this example, we use the Pendulum reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=200,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    # We can easily reuse the Dyna training flow to implement the model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        # Set these counts large enough that the agent only uses data from the dynamics env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
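
create_dyna_flow wires the callables above to the real-sample counts passed as keyword arguments. As a rough, plain-Python illustration of the resulting schedule (not the baconian implementation; schematic_dyna_loop is purely hypothetical):

def schematic_dyna_loop(total_real_samples=1000, every=100, warm_up=100):
    # Mirrors the *_every_real_sample_count arguments above: keep sampling the
    # real env, and every `every` real samples (after warm-up) train the dynamics
    # model, roll out the dynamics env, train the model-free algo on that
    # synthesized data, and run the tests.
    events = []
    real_samples = 0
    while real_samples < total_real_samples:
        real_samples += 10                      # sample_from_real_env_func (batches of 10)
        events.append(('sample_real_env', real_samples))
        if real_samples >= warm_up and real_samples % every == 0:
            events.append(('train_dynamics', real_samples))
            events.append(('sample_dynamics_env', real_samples))
            events.append(('train_algo_on_synthesized_data', real_samples))
            events.append(('test_algo_and_dynamics', real_samples))
    return events
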
Example No. 29
def task_fn():
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    model_path = '/home/yitongx/Documents/baconian-project/experiments/log'
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False, use_mbmf=True,
                           model_path=model_path)
    mlp_config = [{
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "1",
        "N_UNITS": 32,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "2",
        "N_UNITS": 16,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "3",
        "N_UNITS": 8,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "TANH",
        "B_INIT_VALUE": 0.0,
        "NAME": "OUPTUT",
        "N_UNITS": 1,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.999  # default 0.995
    batch_size = 128
    actor_lr = 0.001  # default 0.001
    critic_lr = 0.001  # default 0.001
    buffer_size = 100000
    total_steps = 500000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000  # default 10000
    train_every_step = 1
    train_iter_per_call = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10

    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)

    step_counter = SinglentonStepCounter(-1)
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(
            train_every_step=train_every_step,
            train_after_step=train_after_step,
            noise_decay=noise_decay,
            step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')

    flow = create_train_test_flow(env=env,
                                  cyber=cyber,
                                  agent=agent,
                                  num_test=num_test,
                                  total_steps=total_steps,
                                  max_step_per_episode=max_step_per_episode,
                                  train_after_step=train_after_step,
                                  test_after_step=test_after_step,
                                  train_every_step=train_every_step,
                                  test_every_step=test_every_step,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict()),
                                  flow_type='DDPG_TrainTestFlow')

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
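
The DDPGNoiseScheduler above appears to be a custom scheduler that shrinks the exploration noise as training proceeds. One plausible reading, given noise_decay = 0.999 and the train_after_step / train_every_step settings, is a multiplicative decay applied once per training call; the helper below is purely hypothetical and only illustrates that assumption:

def assumed_noise_weight(step, train_after_step=10000, train_every_step=1, noise_decay=0.999):
    # Keep the full noise weight until training starts, then decay it
    # multiplicatively once per training call (an assumption, not the
    # verified behaviour of DDPGNoiseScheduler).
    if step <= train_after_step:
        return 1.0
    train_calls = (step - train_after_step) // train_every_step
    return noise_decay ** train_calls
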
Example No. 30
"""
This gives a simple example on how to use Gaussian Process (GP) to approximate the Gym environment Pendulum-v0
We use gpflow package to build the Gaussian Process.
"""
from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
import numpy as np
from baconian.common.sampler.sample_data import TransitionData
from baconian.algo.rl.policy.random_policy import UniformRandomPolicy
from baconian.algo.dynamics.gaussian_process_dynamiocs_model import GaussianProcessDyanmicsModel
from baconian.algo.dynamics.dynamics_model import DynamicsEnvWrapper
from baconian.algo.dynamics.terminal_func.terminal_func import RandomTerminalFunc
from baconian.algo.dynamics.reward_func.reward_func import RandomRewardFunc

env = make('Pendulum-v0')
name = 'demo_exp'
env_spec = EnvSpec(obs_space=env.observation_space,
                   action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)
# Do some initial random sampling to collect data for training the GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()
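
The imports of DynamicsEnvWrapper, RandomTerminalFunc and RandomRewardFunc above are not used in the snippet as shown, which suggests the example continues by wrapping the trained GP model into a dynamics environment. The sketch below follows other baconian dynamics examples; the constructor argument and the method names (set_terminal_reward_func, reset_state, step with allow_clip) are assumptions to verify against your installed version:

# Wrap the trained GP model so it can be stepped like a Gym environment.
dyna_env = DynamicsEnvWrapper(dynamics=gp)
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

st = env.reset()
for i in range(10):
    ac = policy.forward(st)
    gp.reset_state(state=st)  # sync the model to the real state before predicting
    new_st_model, _, _, _ = dyna_env.step(action=ac, allow_clip=True)
    new_st_real, _, done, _ = env.step(ac)
    # Compare the one-step prediction against the real transition.
    print(np.linalg.norm(np.asarray(new_st_model) - np.asarray(new_st_real)))
    st = new_st_real
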