예제 #1
0
 def create_agent(self, algo, env, env_spec, eps=None, name='agent'):
     """Build an Agent wired with OU action noise and a periodic saving schedule.

     The noise wrapper cross-fades between noise and policy action over the
     first 100 train samples: the action weight ramps 0.0 -> 1.0 while the
     noise weight ramps 1.0 -> 0.0, both driven by the global
     TOTAL_AGENT_TRAIN_SAMPLE_COUNT counter.

     :param algo: algorithm instance the agent will run
     :param env: environment the agent interacts with
     :param env_spec: environment space specification
     :param eps: optional exploration strategy passed through unchanged
     :param name: name assigned to the created agent
     :return: tuple of (agent, locals()) -- presumably locals() exposes every
              constructed object to the caller (fixture pattern); verify usage.
     """
     agent = Agent(env=env,
                   env_spec=env_spec,
                   algo=algo,
                   noise_adder=AgentActionNoiseWrapper(
                       noise=OUNoise(),
                       # policy action fades in: 0.0 -> 1.0 over 100 samples
                       action_weight_scheduler=LinearScheduler(
                           t_fn=lambda: get_global_status_collect()
                           ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                           schedule_timesteps=100,
                           final_p=1.0,
                           initial_p=0.0),
                       # noise fades out: 1.0 -> 0.0 over the same window
                       noise_weight_scheduler=LinearScheduler(
                           t_fn=lambda: get_global_status_collect()
                           ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                           schedule_timesteps=100,
                           final_p=0.0,
                           initial_p=1.0)),
                   name=name,
                   # NOTE: saving is keyed on the env-step counter, not the
                   # agent train-sample counter used by the schedulers above.
                   algo_saving_scheduler=PeriodicalEventSchedule(
                       t_fn=lambda: get_global_status_collect()
                       ('TOTAL_ENV_STEP_TRAIN_SAMPLE_COUNT'),
                       trigger_every_step=20,
                       after_t=10),
                   exploration_strategy=eps)
     return agent, locals()
예제 #2
0
def mountaincar_task_fn():
    """Run the DQN benchmark on MountainCar-v0 using the canned benchmark config."""
    cfg = MOUNTAINCAR_BENCHMARK_CONFIG_DICT
    # Set the global stop condition before any component is constructed.
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       cfg['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCar-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    q_func = MLPQValueFunction(env_spec=env_spec,
                               name_scope=name + '_mlp_q',
                               name=name + '_mlp_q',
                               **cfg['MLPQValueFunction'])
    dqn = DQN(env_spec=env_spec,
              name=name + '_dqn',
              value_func=q_func,
              **cfg['DQN'])

    # Epsilon is annealed against the global train-sample counter.
    prob_scheduler = LinearScheduler(
        t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        **cfg['EpsilonGreedy']['LinearScheduler'])
    eps = EpsilonGreedy(action_space=env_spec.action_space,
                        prob_scheduler=prob_scheduler,
                        **cfg['EpsilonGreedy']['config_or_config_dict'])
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=eps)

    # Flow callbacks: what to run at each test/train/sample step.
    func_dict = {
        'test': dict(func=agent.test,
                     args=list(),
                     kwargs=dict(sample_count=cfg['TrainTestFlow']['TEST_SAMPLES_COUNT'])),
        'train': dict(func=agent.train,
                      args=list(),
                      kwargs=dict()),
        'sample': dict(func=agent.sample,
                       args=list(),
                       kwargs=dict(sample_count=cfg['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)),
    }
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=cfg['TrainTestFlow']['config_or_config_dict'],
        func_dict=func_dict)

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #3
0
    def validate(self, agent: Agent, *args, **kwargs):
        """Validate every dynamics model in the ensemble against real rollouts.

        For each member model, roll the agent out in the model's synthetic
        environment and compare the mean reward against the previous
        validation round; ``validation_result`` is the fraction of models
        that improved.

        :param agent: agent used to sample validation trajectories
        :return: float in [0, 1], the fraction of improved models
        """
        import copy

        # BUG FIX: the original did `old_result = self.result`, which merely
        # aliases the container that is mutated in-place below — every
        # `self.result[a] > old_result[a]` comparison was trivially False and
        # validation_result was always 0. Snapshot by deep copy instead.
        old_result = copy.deepcopy(self.result)
        self.validation_result = 0
        for a in range(len(self._dynamics_model)):
            individual_model = self._dynamics_model.model[a]
            env = individual_model.return_as_env()
            batch_data = agent.sample(
                env=env,
                sample_count=self.parameters('validation_trajectory_count'),
                sample_type='trajectory',
                store_flag=False)

            self.result[a] = batch_data.get_mean_of('reward')
            if self.result[a] > old_result[a]:
                self.validation_result += 1

        # Normalize the improvement count to a fraction of the ensemble size.
        self.validation_result = self.validation_result / len(
            self._dynamics_model)

        return self.validation_result
예제 #4
0
def task_fn():
    """Assemble and run a PPO demo experiment on Pendulum-v0."""
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Value network: one regularized RELU hidden layer, scalar linear output.
    v_net_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 16,
            "L1_NORM": 0.01,
            "L2_NORM": 0.01,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
    ]
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=v_net_layers)

    # Stochastic policy: same hidden layer, one linear output per action dim.
    policy_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.01,
            "L2_NORM": 0.01,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": env_spec.flat_action_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
    ]
    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=policy_layers,
                                         reuse=False)

    ppo_config = {
        "gamma": 0.995,
        "lam": 0.98,
        "policy_train_iter": 10,
        "value_func_train_iter": 10,
        "clipping_range": None,
        "beta": 1.0,
        "eta": 50,
        "log_var_init": -1.0,
        "kl_target": 0.003,
        "policy_lr": 0.01,
        "value_func_lr": 0.01,
        "value_func_train_batch_size": 10,
        "lr_multiplier": 1.0
    }
    ppo = PPO(env_spec=env_spec,
              config_or_config_dict=ppo_config,
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')

    # Checkpoint the algorithm every 20 train samples (after the first 10).
    saving_scheduler = PeriodicalEventSchedule(
        t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        trigger_every_step=20,
        after_t=10)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  algo_saving_scheduler=saving_scheduler,
                  name=name + 'agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #5
0
def task_fn():
    """Train DQN on Acrobot-v1 with a linearly annealed epsilon-greedy policy."""
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
    env_spec = env.env_spec

    # Q-network: TANH(64) -> TANH(64) -> RELU(256) -> linear scalar output.
    q_layers = [
        dict(ACT="TANH", B_INIT_VALUE=0.0, NAME="1", N_UNITS=64,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="TANH", B_INIT_VALUE=0.0, NAME="2", N_UNITS=64,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="RELU", B_INIT_VALUE=0.0, NAME="3", N_UNITS=256,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="LINEAR", B_INIT_VALUE=0.0, NAME="OUPTUT", N_UNITS=1,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=q_layers)
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=32,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)

    # Anneal epsilon 1.0 -> 0.02 over the first 10% of 100k train samples.
    anneal = LinearScheduler(
        t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        schedule_timesteps=int(0.1 * 100000),
        initial_p=1.0,
        final_p=0.02)
    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=anneal,
                                   init_random_prob=0.1)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)

    # Sample one step at a time; start training after a 10k-sample warm-up.
    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1, env=agent.env, store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)),
    )
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #6
0
def pendulum_task_fn():
    """Run the MPC benchmark on Pendulum-v0 with a learned dynamics model."""
    cfg = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       cfg['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    dynamics = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **cfg['DynamicsModel'])
    algo = ModelPredictiveControl(
        dynamics_model=dynamics,
        env_spec=env_spec,
        config_or_config_dict=cfg['MPC'],
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    # Attach the known Pendulum reward plus a fixed-length terminal function
    # so the learned dynamics model can be rolled out as a synthetic env.
    terminal = FixedEpisodeLengthTerminalFunc(
        max_step_length=env.unwrapped._max_episode_steps,
        step_count_fn=algo.dynamics_env.total_step_count_fn)
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=REWARD_FUNC_DICT['Pendulum-v0'](),
        terminal_func=terminal)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=cfg['DynaFlow'],
        func_dict={
            'train_dynamics': dict(func=agent.train,
                                   args=list(),
                                   kwargs=dict()),
            'train_algo': None,
            'test_algo': dict(func=agent.test,
                              args=list(),
                              kwargs=dict(sample_count=1, sample_trajectory_flag=True)),
            'test_dynamics': dict(func=agent.algo.test_dynamics,
                                  args=list(),
                                  kwargs=dict(sample_count=100, env=env)),
            'sample_from_real_env': dict(func=agent.sample,
                                         args=list(),
                                         kwargs=dict(sample_count=10,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)),
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None,
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #7
0
def task_fn():
    """Assemble and run a DDPG demo experiment on Pendulum-v0."""
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Critic: one RELU hidden layer (16 units), scalar linear output.
    q_layers = [
        dict(ACT="RELU", B_INIT_VALUE=0.0, NAME="1", N_UNITS=16,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="LINEAR", B_INIT_VALUE=0.0, NAME="OUPTUT", N_UNITS=1,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=q_layers)

    # Actor: mirrors the critic's hidden layer; one output per action dim.
    policy_layers = [
        dict(ACT="RELU", B_INIT_VALUE=0.0, NAME="1", N_UNITS=16,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="LINEAR", B_INIT_VALUE=0.0, NAME="OUPTUT",
             N_UNITS=env_spec.flat_action_dim,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
    ]
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=policy_layers,
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    # Checkpoint the algorithm every 20 train samples (after the first 10).
    saving_scheduler = PeriodicalEventSchedule(
        t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        trigger_every_step=20,
        after_t=10)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  algo_saving_scheduler=saving_scheduler,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #8
0
def task_fn():
    """Assemble and run an MPC demo on Pendulum-v0 with a learned dynamics model."""
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Dynamics net: RELU(16) hidden layer, linear output per observation dim.
    dyna_layers = [
        dict(ACT="RELU", B_INIT_VALUE=0.0, NAME="1",
             L1_NORM=0.0, L2_NORM=0.0, N_UNITS=16,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
        dict(ACT="LINEAR", B_INIT_VALUE=0.0, NAME="OUPTUT",
             L1_NORM=0.0, L2_NORM=0.0, N_UNITS=env_spec.flat_obs_dim,
             TYPE="DENSE", W_NORMAL_STDDEV=0.03),
    ]
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=dyna_layers)
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    # Demo-only stand-ins for the true reward/terminal functions.
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': dict(func=agent.test,
                         args=list(),
                         kwargs=dict(sample_count=10)),
            'train': dict(func=agent.train,
                          args=list(),
                          kwargs=dict()),
            'sample': dict(func=agent.sample,
                           args=list(),
                           kwargs=dict(sample_count=100,
                                       env=agent.env,
                                       in_which_status='TRAIN',
                                       store_flag=True)),
        })
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
예제 #9
0
def task_fn():
    """Build and run a model-ensemble experiment on Pendulum-v0.

    A DDPG agent (MLP Q-function + deterministic MLP policy) is trained
    largely on synthetic data from an ensemble of 10 MLP dynamics models,
    reusing the dyna training flow as the model-ensemble training loop.
    """
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Critic: one RELU hidden layer (16 units), scalar linear output.
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # Actor: mirrors the critic's hidden layer; one output per action dim.
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # Build 10 independent dynamics models; each normalizes its state/action
    # inputs and predicted state delta with running standard scalers.
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            mlp_config=[{
                "ACT": "RELU",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }, {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUPTUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }])
        mlp_dyna_list.append(mlp_dyna)
    # prediction_type='random': presumably each prediction is served by a
    # randomly chosen member model — confirm against ModelEnsemble docs.
    dyna_ensemble_model = ModelEnsemble(n_models=10,
                                        model=mlp_dyna_list,
                                        prediction_type='random',
                                        env_spec=env_spec)
    algo = ModelEnsembleAlgo(env_spec=env_spec,
                             model_free_algo=ddpg,
                             dynamics_model=dyna_ensemble_model,
                             config_or_config_dict=dict(
                                 dynamics_model_train_iter=10,
                                 model_free_algo_train_iter=10,
                                 validation_trajectory_count=2,
                             ))
    # For examples only: fixed-episode-length terminal function paired with
    # the known Pendulum reward function.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        # Checkpoint every 200 train samples, after the first 10.
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=200,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    # We can easily reuse the dyna training flow to implement the
    # model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        # Set these large enough so the agent mostly uses data from the
        # dynamics env rather than the real env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
예제 #10
0
def task_fn():
    """Assemble and run a Dyna-style experiment on Pendulum-v0.

    DDPG (MLP critic + deterministic MLP policy) is the model-free
    learner; an MLP global dynamics model acts as the learned
    environment. The Dyna algorithm combines the two, and a hand-built
    DynaFlow schedules training, testing and sampling steps.
    """
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Critic: one 16-unit ReLU hidden layer, scalar linear output.
    # NOTE(review): layer name "OUPTUT" is a typo kept as-is — it is a
    # runtime label string, and changing it would alter variable scopes.
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # Deterministic actor: same topology, output width = flattened action dim.
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "Q_NET_L1_NORM_SCALE": 0.01,
                    "Q_NET_L2_NORM_SCALE": 0.01,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    # Learned dynamics: predicts the next observation, clamped to the
    # observation-space bounds.
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # For examples only, we use random reward function and terminal function with fixed episode length.
    # NOTE(review): assumes the *unwrapped* env exposes _max_episode_steps —
    # standard gym keeps that attribute on the TimeLimit wrapper; confirm
    # what this library's make() returns.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # Save the algorithm every 20 agent-training samples, starting after 10.
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # DynaFlow wiring: note that 'train_dynamics' reuses agent.train — the
    # state kwarg routes the call to dynamics-model training.
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
            "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
            "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "WARM_UP_DYNAMICS_SAMPLES": 1
        },
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.algo.dynamics_env,
                     in_which_status='TRAIN',
                     store_flag=True)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
# Example #11
def mountiancar_task_fn():
    """Benchmark DDPG with Ornstein-Uhlenbeck action noise on
    MountainCarContinuous-v0, driven by the shared benchmark config dict.

    NOTE(review): the function name misspells "mountaincar"; kept as-is
    because external callers may reference it by this exact name.
    """
    exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT

    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCarContinuous-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    # Actor output is bounded to the action-space limits.
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    n_actions = env.action_space.shape[0]
    # OU noise (sigma 0.5 per action dim) is mixed in at constant full
    # weight; the noise process is reset at every terminal state.
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OrnsteinUhlenbeckActionNoise(
                          mu=np.zeros(n_actions),
                          sigma=0.5 * np.ones(n_actions)),
                      noise_weight_scheduler=ConstantScheduler(value=1),
                      action_weight_scheduler=ConstantScheduler(value=1.0)),
                  reset_noise_every_terminal_state=True,
                  name=name + '_agent')

    # Standard sample -> train -> test loop; counts come from exp_config.
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func':
                agent.test,
                'args':
                list(),
                'kwargs':
                dict(sample_count=exp_config['TrainTestFlow']
                     ['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=exp_config['TrainTestFlow']
                     ['TRAIN_SAMPLES_COUNT'],
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #12
def task_fn():
    """Run a DQN agent on Acrobot-v1 with a standard train/test flow.

    Builds an MLP Q-function, wraps it in DQN with epsilon-greedy
    exploration, and drives everything via a TrainTestFlow inside an
    Experiment.
    """
    environment = make('Acrobot-v1')
    name = 'demo_exp'
    spec = EnvSpec(obs_space=environment.observation_space,
                   action_space=environment.action_space)

    # Q-network: one 16-unit ReLU hidden layer, scalar linear output.
    hidden_layer = {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "1",
        "N_UNITS": 16,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }
    output_layer = {
        "ACT": "LINEAR",
        "B_INIT_VALUE": 0.0,
        "NAME": "OUPTUT",
        "N_UNITS": 1,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }
    q_func = MLPQValueFunction(env_spec=spec,
                               name_scope=name + '_mlp_q',
                               name=name + '_mlp_q',
                               mlp_config=[hidden_layer, output_layer])

    dqn_settings = dict(REPLAY_BUFFER_SIZE=1000,
                        GAMMA=0.99,
                        BATCH_SIZE=10,
                        Q_NET_L1_NORM_SCALE=0.001,
                        Q_NET_L2_NORM_SCALE=0.001,
                        LEARNING_RATE=0.01,
                        TRAIN_ITERATION=1,
                        DECAY=0.5)
    dqn_algo = DQN(env_spec=spec,
                   config_or_config_dict=dqn_settings,
                   name=name + '_dqn',
                   value_func=q_func)

    # Epsilon-greedy exploration starting at 50% random actions.
    explorer = EpsilonGreedy(action_space=spec.action_space,
                             init_random_prob=0.5)
    rl_agent = Agent(env=environment,
                     env_spec=spec,
                     algo=dqn_algo,
                     name=name + '_agent',
                     exploration_strategy=explorer)

    # Train/test cadence driven by the global training-sample counter.
    flow_settings = {
        "TEST_EVERY_SAMPLE_COUNT": 10,
        "TRAIN_EVERY_SAMPLE_COUNT": 10,
        "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
        "START_TEST_AFTER_SAMPLE_COUNT": 5,
    }
    step_funcs = {
        'test': {
            'func': rl_agent.test,
            'args': list(),
            'kwargs': dict(sample_count=10),
        },
        'train': {
            'func': rl_agent.train,
            'args': list(),
            'kwargs': dict(),
        },
        'sample': {
            'func': rl_agent.sample,
            'args': list(),
            'kwargs': dict(sample_count=100,
                           env=rl_agent.env,
                           in_which_status='TRAIN',
                           store_flag=True),
        },
    }
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=flow_settings,
        func_dict=step_funcs)

    experiment = Experiment(tuner=None,
                            env=environment,
                            agent=rl_agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #13
def task_fn():
    """Tutorial version of the Pendulum-v0 Dyna experiment.

    Builds DDPG (MLP critic + deterministic MLP policy), a running-scaled
    MLP dynamics model, combines them with Dyna, and runs the whole thing
    through create_dyna_flow.
    """
    # create the gym environment by make function
    env = make('Pendulum-v0')
    # give your experiment a name which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network to approximate q function of DDPG
    # NOTE(review): layer name "OUPTUT" is a typo kept as-is — it is a
    # runtime label string (changing it would alter variable scopes).
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # construct the neural network to approximate policy for DDPG
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)
    # construct the DDPG algorithms
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural network based global dynamics model to approximate the state transition of environment
    # Inputs and the predicted state delta are normalized with running
    # standard scalers.
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(
            dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(
            dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    # finally, construct the Dyna algorithms with a model free algorithm DDGP, and a NN model.
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN based dynamics model a proper environment so be a sampling source for DDPG, reward function and
    # terminal function need to be set.

    # For examples only, we use random reward function and terminal function with fixed episode length.
    # NOTE(review): assumes the *unwrapped* env exposes _max_episode_steps —
    # standard gym keeps that on the TimeLimit wrapper; confirm with this make().
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct agent with additional exploration strategy if needed.
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # construct the training flow, called Dyna flow. It defines how the training proceed, and the terminal condition
    # 'train_dynamics_func' reuses agent.train: the state kwarg routes the
    # call to dynamics-model training.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()
# Example #14
def pendulum_task_fn():
    """Benchmark iLQR planning over a learned dynamics model on Pendulum-v0.

    The MLP dynamics model is wrapped as an environment; iLQR plans
    against it using the known Pendulum reward as a cost, so no
    model-free algorithm training is needed (those flow slots are None).
    """
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    # Expose the learned model as an environment with the true Pendulum
    # reward and a fixed-length episode terminal condition.
    # NOTE(review): assumes the *unwrapped* env exposes _max_episode_steps —
    # standard gym keeps that on the TimeLimit wrapper; confirm with this make().
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())

    # iLQR minimizes cost, so the reward function is wrapped as a cost.
    policy = iLQRPolicy(env_spec=env_spec,
                        **exp_config['ILQR'],
                        dynamics=dyna_env,
                        cost_fn=RewardFuncCostWrapper(
                            reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))

    algo = iLQRAlogWrapper(policy=policy,
                           env_spec=env_spec,
                           dynamics_env=dyna_env)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    # Only dynamics training / testing is active; policy-training slots
    # are disabled because iLQR plans at action time.
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'train_algo': None,
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=100, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #15
    def func():
        """Benchmark PPO on the environment named by the enclosing scope.

        Closure variables from the outer function: env_id, total_episode,
        episode_per_sample (none are defined here).
        """
        env = make(env_id)
        exp_config = make_config(obs_dim=env.env_spec.flat_obs_dim,
                                 action_dim=env.env_spec.flat_action_dim,
                                 policy_hid1_multi=10,
                                 value_hid3_size=5,
                                 value_hid1_multi=10,
                                 total_episode=total_episode,
                                 episode_length=1000,
                                 episode_per_sample=episode_per_sample)

        GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                           exp_config['DEFAULT_EXPERIMENT_END_POINT'])
        env.reset()
        # Append the step index to observations so the policy is
        # time-aware (matches use_time_index_flag=True below).
        env = StepObservationWrapper(
            env, step_limit=env.unwrapped._max_episode_steps)
        name = 'benchmark'
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_v = MLPVValueFunc(env_spec=env_spec,
                              name_scope=name + 'mlp_v',
                              name=name + 'mlp_v',
                              **exp_config['MLP_V'])
        policy = NormalDistributionMLPPolicy(
            env_spec=env_spec,
            name_scope=name + 'mlp_policy',
            name=name + 'mlp_policy',
            **exp_config['POLICY'],
            output_low=env_spec.action_space.low,
            output_high=env_spec.action_space.high,
            reuse=False)

        ppo = PPO(env_spec=env_spec,
                  **exp_config['PPO'],
                  value_func=mlp_v,
                  stochastic_policy=policy,
                  name=name + '_ppo',
                  use_time_index_flag=True)
        agent = Agent(env=env,
                      env_spec=env_spec,
                      algo=ppo,
                      exploration_strategy=None,
                      noise_adder=None,
                      name=name + '_agent')

        # NOTE(review): the status key 'TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'
        # differs from the 'TOTAL_AGENT_TRAIN_SAMPLE_COUNT' used in the
        # other examples — confirm this is intentional.
        flow = TrainTestFlow(
            train_sample_count_func=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'),
            config_or_config_dict=exp_config['TrainTestFlow']
            ['config_or_config_dict'],
            func_dict={
                'test': {
                    'func':
                    agent.test,
                    'args':
                    list(),
                    'kwargs':
                    dict(sample_count=exp_config['TrainTestFlow']
                         ['TEST_SAMPLES_COUNT']),
                },
                'train': {
                    'func': agent.train,
                    'args': list(),
                    'kwargs': dict(),
                },
                'sample': {
                    'func':
                    agent.sample,
                    'args':
                    list(),
                    'kwargs':
                    dict(sample_count=exp_config['TrainTestFlow']
                         ['TRAIN_SAMPLES_COUNT'],
                         env=agent.env,
                         sample_type='trajectory',
                         in_which_status='TRAIN',
                         store_flag=True),
                },
            })

        experiment = Experiment(tuner=None,
                                env=env,
                                agent=agent,
                                flow=flow,
                                name=name)
        experiment.run()
# Example #16
def pendulum_task_fn():
    """Benchmark DDPG + Dyna on Pendulum-v0 with Gaussian action noise.

    NOTE(review): `exp_config` is read below but never assigned in this
    function — it must exist as a module-level global (cf. the sibling
    examples that assign PENDULUM_BENCHMARK_CONFIG_DICT), otherwise this
    raises NameError at runtime. Confirm against the full file.
    """
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # Use the true Pendulum reward for the learned-dynamics env; episodes
    # end after a fixed number of steps.
    # NOTE(review): assumes the *unwrapped* env exposes _max_episode_steps —
    # standard gym keeps that on the TimeLimit wrapper; confirm with this make().
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    # Gaussian action noise mixed in at weight 0.3.
    # NOTE(review): `ConstantSchedule` here vs `ConstantScheduler` in the
    # MountainCar example — confirm which class name the imports provide.
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    # 'train_dynamics' reuses agent.train; the state kwarg routes it to
    # dynamics-model training. Dynamics-env samples are not stored.
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=50,
                     sample_type='transition',
                     env=agent.algo.dynamics_env,
                     in_which_status='TRAIN',
                     store_flag=False)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #17
def pendulum_task_fn():
    """Benchmark PPO on the Pendulum-v0 task.

    Builds an MLP state-value function and a normal-distribution MLP policy
    from ``PENDULUM_BENCHMARK_CONFIG_DICT``, wires them into a PPO agent,
    and drives training/testing through a ``TrainTestFlow`` until the
    configured experiment end point is reached.
    """
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    # The experiment-wide stop condition comes from the benchmark config.
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    value_func = MLPVValueFunc(env_spec=env_spec,
                               name_scope=name + 'mlp_v',
                               name=name + 'mlp_v',
                               **exp_config['MLP_V'])
    # Policy output is squashed into the environment's action bounds.
    stochastic_policy = NormalDistributionMLPPolicy(
        env_spec=env_spec,
        name_scope=name + 'mlp_policy',
        name=name + 'mlp_policy',
        output_low=env_spec.action_space.low,
        output_high=env_spec.action_space.high,
        reuse=False,
        **exp_config['POLICY'])

    ppo = PPO(env_spec=env_spec,
              value_func=value_func,
              stochastic_policy=stochastic_policy,
              name=name + 'ppo',
              **exp_config['PPO'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow_config = exp_config['TrainTestFlow']

    def _train_sample_count():
        # Total samples the agent has consumed for training so far.
        return get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')

    flow = TrainTestFlow(
        train_sample_count_func=_train_sample_count,
        config_or_config_dict=flow_config['config_or_config_dict'],
        func_dict={
            'test': dict(
                func=agent.test,
                args=[],
                kwargs=dict(
                    sample_count=flow_config['TEST_SAMPLES_COUNT'],
                    sample_trajectory_flag=True)),
            'train': dict(func=agent.train, args=[], kwargs=dict()),
            'sample': dict(
                func=agent.sample,
                args=[],
                kwargs=dict(
                    sample_count=flow_config['TRAIN_SAMPLES_COUNT'],
                    env=agent.env,
                    sample_type='trajectory',
                    in_which_status='TRAIN',
                    store_flag=True)),
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #18 (예제 #18) — scraped-snippet separator; trailing "0" was a vote-count artifact.
def task_fn():
    """Demo: model-predictive control (MPC) on Pendulum-v0.

    A two-layer MLP learns the environment dynamics; MPC plans over that
    learned model using a uniform-random action-proposal policy, and the
    agent explores with epsilon-greedy.
    """
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec

    # Hidden layer: 128 tanh units.
    hidden_layer = {
        "ACT": "TANH",
        "B_INIT_VALUE": 0.0,
        "NAME": "1",
        "L1_NORM": 0.0,
        "L2_NORM": 0.0,
        "N_UNITS": 128,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }
    # Output layer: linear, one unit per flattened observation dimension.
    output_layer = {
        "ACT": "LINEAR",
        "B_INIT_VALUE": 0.0,
        "NAME": "OUPTUT",  # (sic) misspelling kept — it is a runtime scope name
        "L1_NORM": 0.0,
        "L2_NORM": 0.0,
        "N_UNITS": env_spec.flat_obs_dim,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        mlp_config=[hidden_layer, output_layer])

    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    # The dynamics-backed environment needs stand-in reward/terminal functions.
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'))

    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #19 (예제 #19) — scraped-snippet separator; trailing "0" was a vote-count artifact.
def task_fn():
    """Example: DQN on Acrobot-v1 with scheduled hyper-parameters.

    Demonstrates three scheduling hooks: a periodic algorithm-saving
    schedule on the agent, a piecewise schedule for the epsilon-greedy
    exploration probability, and a linear decay of the DQN learning rate.
    """
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    q_net_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",  # (sic) misspelling kept — runtime scope name
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=q_net_layers)
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)

    def _train_sample_count():
        # Total samples the agent has consumed for training so far.
        return get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')

    # Epsilon decays in pieces: 0.3 after 10 samples, 0.1 after 100, 0 after 200.
    eps = EpsilonGreedy(
        action_space=env_spec.action_space,
        prob_scheduler=PiecewiseScheduler(
            t_fn=_train_sample_count,
            endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
            outside_value=0.0),
        init_random_prob=0.5)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=_train_sample_count,
                      trigger_every_step=20,
                      after_t=10),
                  exploration_strategy=eps)

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   store_flag=True)))
    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name + 'experiment_debug')

    # Linearly anneal the learning rate from 0.01 to 0.0001 over the run.
    # NOTE(review): t_fn is passed the attribute itself, not a lambda —
    # presumably a bound counter method on Experiment; confirm against the API.
    dqn.parameters.set_scheduler(
        param_key='LEARNING_RATE',
        scheduler=LinearScheduler(
            t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
            schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT[
                'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
            final_p=0.0001,
            initial_p=0.01))
    experiment.run()
# Example #20 (예제 #20) — scraped-snippet separator; trailing "0" was a vote-count artifact.
def task_fn():
    """Demo: plain DQN on Acrobot-v1 with epsilon-greedy exploration.

    Builds a small two-layer Q-network (with L1/L2 regularization on the
    Q-net weights), trains via the standard train/test flow, and stores
    sampled transitions in the replay buffer.
    """
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    q_net_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",  # (sic) misspelling kept — runtime scope name
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        },
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=q_net_layers)
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         Q_NET_L1_NORM_SCALE=0.001,
                                         Q_NET_L2_NORM_SCALE=0.001,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)

    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=dqn,
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #21 (예제 #21) — scraped-snippet separator; trailing "0" was a vote-count artifact.
def pendulum_task_fn():
    """Benchmark DDPG on the Pendulum-v0 task.

    Builds an MLP Q-function and a deterministic MLP policy from
    ``PENDULUM_BENCHMARK_CONFIG_DICT``, adds constant-weight Gaussian
    action noise for exploration, and runs the standard train/test flow.
    """
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    # The experiment-wide stop condition comes from the benchmark config.
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    # Deterministic actor bounded by the environment's action range.
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    reuse=False,
                                    **exp_config['DeterministicMLPPolicy'])

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    # Exploration: action = 1.0 * policy output + 0.3 * N(0, 1) noise.
    noise_adder = AgentActionNoiseWrapper(
        noise=NormalActionNoise(),
        noise_weight_scheduler=ConstantSchedule(value=0.3),
        action_weight_scheduler=ConstantSchedule(value=1.0))
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=noise_adder,
                  name=name + '_agent')

    flow_config = exp_config['TrainTestFlow']

    def _train_sample_count():
        # Total samples the agent has consumed for training so far.
        return get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')

    flow = TrainTestFlow(
        train_sample_count_func=_train_sample_count,
        config_or_config_dict=flow_config['config_or_config_dict'],
        func_dict={
            'test': dict(
                func=agent.test,
                args=[],
                kwargs=dict(
                    sample_count=flow_config['TEST_SAMPLES_COUNT'],
                    sample_trajectory_flag=True)),
            'train': dict(func=agent.train, args=[], kwargs=dict()),
            'sample': dict(
                func=agent.sample,
                args=[],
                kwargs=dict(
                    sample_count=flow_config['TRAIN_SAMPLES_COUNT'],
                    env=agent.env,
                    in_which_status='TRAIN',
                    store_flag=True)),
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
# Example #22 (예제 #22) — scraped-snippet separator; trailing "0" was a vote-count artifact.
    def test_integration_with_dqn(self):
        """Integration test: agent sampling feeds the DQN replay buffer, and
        DQN training results are captured by the recorder/logger pipeline.

        Covers: training from an explicit (empty) batch and from the replay
        buffer, loss logging per algorithm object, and recorder queries
        filtered by TRAIN/TEST status.
        """
        env = make('Acrobot-v1')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        # Minimal 2-layer Q-network: 16 ReLU units -> 1 linear output.
        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name='mlp_q',
                                  name_scope='mlp_q',
                                  mlp_config=[
                                      {
                                          "ACT": "RELU",
                                          "B_INIT_VALUE": 0.0,
                                          "NAME": "1",
                                          "N_UNITS": 16,
                                          "TYPE": "DENSE",
                                          "W_NORMAL_STDDEV": 0.03
                                      },
                                      {
                                          "ACT": "LINEAR",
                                          "B_INIT_VALUE": 0.0,
                                          "NAME": "OUPTUT",
                                          "N_UNITS": 1,
                                          "TYPE": "DENSE",
                                          "W_NORMAL_STDDEV": 0.03
                                      }
                                  ])
        dqn = DQN(env_spec=env_spec,
                  name='dqn_test',
                  config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                             GAMMA=0.99,
                                             BATCH_SIZE=10,
                                             LEARNING_RATE=0.001,
                                             TRAIN_ITERATION=1,
                                             DECAY=0.5),
                  value_func=mlp_q)
        agent = Agent(env=env, env_spec=env_spec,
                      algo=dqn,
                      name='agent')
        # agent.init() initializes the wrapped algo too (see commented line).
        agent.init()
        # dqn.init()
        st = env.reset()
        from baconian.common.sampler.sample_data import TransitionData
        # `a` stays empty — used to exercise the explicit-batch code path.
        a = TransitionData(env_spec)
        res = []
        # Two sampling rounds in TRAIN status; store_flag=True pushes the
        # transitions into DQN's replay buffer (and records two sum_reward
        # entries under TRAIN, checked below).
        agent.sample(env=env,
                     sample_count=100,
                     in_which_status='TRAIN',
                     store_flag=True,
                     sample_type='transition')
        agent.sample(env=env,
                     sample_count=100,
                     in_which_status='TRAIN',
                     store_flag=True,
                     sample_type='transition')
        # Train once from the explicit batch, once from the replay buffer
        # (batch_data=None); both must report an average_loss.
        res.append(dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss'])
        res.append(dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss'])
        # The recorder keys its log by the algorithm object itself.
        self.assertTrue(dqn in dqn.recorder._obj_log)
        self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn])
        self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2)
        # Logged values must equal the losses returned by train(), in order.
        self.assertTrue(
            np.equal(np.array(res), [x['value'] for x in dqn.recorder._obj_log[dqn]['average_loss']]).all())

        # The recorder must have auto-registered with the global Logger.
        self.assertTrue(len(Logger()._registered_recorders) > 0)
        self.assertTrue(dqn.recorder in Logger()._registered_recorders)
        res = dqn.recorder.get_log(attr_name='average_loss', filter_by_status=dict())
        self.assertEqual(len(res), 2)
        # Two TRAIN sampling rounds -> two TRAIN sum_reward entries, zero TEST.
        res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TRAIN'})
        self.assertEqual(len(res), 2)
        res = agent.recorder.get_log(attr_name='sum_reward', filter_by_status={'status': 'TEST'})
        self.assertEqual(len(res), 0)
        Logger().flush_recorder()