Code example #1
 def create_mlp_deterministic_policy(self, env_spec, name='mlp_policy'):
     policy = DeterministicMLPPolicy(env_spec=env_spec,
                                     name=name,
                                     name_scope=name,
                                     mlp_config=[{
                                         "ACT": "RELU",
                                         "B_INIT_VALUE": 0.0,
                                         "NAME": "1",
                                         "N_UNITS": 16,
                                         "TYPE": "DENSE",
                                         "W_NORMAL_STDDEV": 0.03
                                     }, {
                                         "ACT": "LINEAR",
                                         "B_INIT_VALUE": 0.0,
                                         "NAME": "OUPTUT",
                                         "N_UNITS":
                                         env_spec.flat_action_dim,
                                         "TYPE": "DENSE",
                                         "W_NORMAL_STDDEV": 0.03
                                     }],
                                     output_high=None,
                                     output_low=None,
                                     output_norm=None,
                                     input_norm=None,
                                     reuse=False)
     return policy, locals()
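A minimal usage sketch for this helper, following the construction-and-assert pattern of code example #2. The test-method name and the 'Pendulum-v0' environment are assumptions for illustration, and the surrounding test class is assumed to provide the usual TensorFlow setup.

    def test_mlp_policy_param_count(self):  # hypothetical test method name
        env = make('Pendulum-v0')
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
        policy, _ = self.create_mlp_deterministic_policy(env_spec=env_spec,
                                                         name='mlp_policy')
        # Two DENSE layers -> 4 tf variables (a weight and a bias per layer),
        # matching the assertion used in code example #2.
        self.assertTrue(len(policy.parameters('tf_var_list')) == 4)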
Code example #2
    def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'):
        env = make(env_id)
        env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)

        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name_scope=name + 'mlp_q',
                                  name=name + 'mlp_q',
                                  mlp_config=[{
                                      "ACT": "RELU",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "1",
                                      "N_UNITS": 16,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }, {
                                      "ACT": "LINEAR",
                                      "B_INIT_VALUE": 0.0,
                                      "NAME": "OUPTUT",
                                      "N_UNITS": 1,
                                      "TYPE": "DENSE",
                                      "W_NORMAL_STDDEV": 0.03
                                  }])
        self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4)
        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name_scope=name + 'mlp_policy',
                                        name=name + 'mlp_policy',
                                        mlp_config=[{
                                            "ACT": "RELU",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "1",
                                            "N_UNITS": 16,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }, {
                                            "ACT": "LINEAR",
                                            "B_INIT_VALUE": 0.0,
                                            "NAME": "OUPTUT",
                                            "N_UNITS":
                                            env_spec.flat_action_dim,
                                            "TYPE": "DENSE",
                                            "W_NORMAL_STDDEV": 0.03
                                        }],
                                        reuse=False)
        self.assertTrue(len(policy.parameters('tf_var_list')) == 4)

        ddpg = DDPG(env_spec=env_spec,
                    config_or_config_dict={
                        "REPLAY_BUFFER_SIZE": 10000,
                        "GAMMA": 0.999,
                        "CRITIC_LEARNING_RATE": 0.001,
                        "ACTOR_LEARNING_RATE": 0.001,
                        "DECAY": 0.5,
                        "BATCH_SIZE": 50,
                        "TRAIN_ITERATION": 1,
                        "critic_clip_norm": 0.1,
                        "actor_clip_norm": 0.1,
                    },
                    value_func=mlp_q,
                    policy=policy,
                    name=name,
                    replay_buffer=None)
        return ddpg, locals()
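The returned algorithm can also be exercised directly, without building an Experiment, by filling its replay buffer with random transitions; a sketch following the buffer pattern of code example #10. The test-method name, the transition count and the number of training calls are arbitrary.

    def test_ddpg_train_on_random_transitions(self):  # hypothetical test method
        ddpg, ctx = self.create_ddpg()
        env, env_spec = ctx['env'], ctx['env_spec']
        ddpg.init()
        buffer = TransitionData(env_spec=env_spec,
                                obs_shape=env_spec.obs_shape,
                                action_shape=env_spec.action_shape)
        obs = env.reset()
        for _ in range(200):  # arbitrary number of random warm-up transitions
            action = env.action_space.sample()
            obs_, reward, done, info = env.step(action)
            buffer.append(obs, action, obs_, done, reward)
            obs = env.reset() if done else obs_
        ddpg.append_to_memory(buffer)
        for _ in range(10):  # a few training calls, printing the returned info
            print(ddpg.train())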
Code example #3
def task_fn():
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    model_path = '/home/yitongx/Documents/baconian-project/experiments/log'
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False, use_mbmf=True, \
                           model_path=model_path)
    mlp_config = [{
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "1",
        "N_UNITS": 32,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "2",
        "N_UNITS": 16,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "RELU",
        "B_INIT_VALUE": 0.0,
        "NAME": "3",
        "N_UNITS": 8,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }, {
        "ACT": "TANH",
        "B_INIT_VALUE": 0.0,
        "NAME": "OUPTUT",
        "N_UNITS": 1,
        "TYPE": "DENSE",
        "W_NORMAL_STDDEV": 0.03
    }]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.999  # default 0.995
    batch_size = 128
    actor_lr = 0.001  # default 0.001
    critic_lr = 0.001  # default 0.001
    buffer_size = 100000
    total_steps = 500000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000  # default 10000
    train_every_step = 1
    train_iter_per_call = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10

    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)

    step_counter = SinglentonStepCounter(-1)
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(
            train_every_step=train_every_step,
            train_after_step=train_after_step,
            noise_decay=noise_decay,
            step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')

    flow = create_train_test_flow(env=env,
                                  cyber=cyber,
                                  agent=agent,
                                  num_test=num_test,
                                  total_steps=total_steps,
                                  max_step_per_episode=max_step_per_episode,
                                  train_after_step=train_after_step,
                                  test_after_step=test_after_step,
                                  train_every_step=train_every_step,
                                  test_every_step=test_every_step,
                                  train_func_and_args=(agent.train, (),
                                                       dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (),
                                                        dict()),
                                  flow_type='DDPG_TrainTestFlow')

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Code example #4
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=ddpg,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (),
                              dict(sample_count=100,
                                   env=agent.env,
                                   in_which_status='TRAIN',
                                   store_flag=True)))

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
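Task functions like the one above are normally handed to Baconian's experiment runner rather than called directly; a hedged sketch based on the upstream Baconian examples. The import paths, the DEFAULT_LOG_PATH key and the del_if_log_path_existed flag should be verified against the installed version.

# Hedged sketch: run task_fn through Baconian's single-experiment runner.
from baconian.config.global_config import GlobalConfig         # import path assumed
from baconian.core.experiment_runner import single_exp_runner  # import path assumed

GlobalConfig().set('DEFAULT_LOG_PATH', './log/demo_exp')   # key name assumed
single_exp_runner(task_fn, del_if_log_path_existed=True)   # flag name assumed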
Code example #5
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(
                dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(
                dims=env_spec.flat_obs_dim),
            mlp_config=[{
                "ACT": "RELU",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }, {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUPTUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }])
        mlp_dyna_list.append(mlp_dyna)
    dyna_ensemble_model = ModelEnsemble(n_models=10,
                                        model=mlp_dyna_list,
                                        prediction_type='random',
                                        env_spec=env_spec)
    algo = ModelEnsembleAlgo(env_spec=env_spec,
                             model_free_algo=ddpg,
                             dynamics_model=dyna_ensemble_model,
                             config_or_config_dict=dict(
                                 dynamics_model_train_iter=10,
                                 model_free_algo_train_iter=10,
                                 validation_trajectory_count=2,
                             ))
    # For this example we use the Pendulum reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=200,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))

    # We can easily reuse the Dyna training flow to implement the model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        # Set these large enough so that the agent only uses data from the dynamics env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
Code example #6
File: hopper.py  Project: yitongx/Baconian-public
def hopper_task_fn():
    exp_config = HOPPER_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Hopper-v2')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLP_V'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['POLICY'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OUNoise(theta=0.15, sigma=0.3),
                      noise_weight_scheduler=ConstantScheduler(1),
                      action_weight_scheduler=ConstantScheduler(1),
                  ),
                  name=name + '_agent')

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Code example #7
def mountiancar_task_fn():
    exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT

    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('MountainCarContinuous-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    n_actions = env.action_space.shape[0]
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=OrnsteinUhlenbeckActionNoise(
                          mu=np.zeros(n_actions),
                          sigma=0.5 * np.ones(n_actions)),
                      noise_weight_scheduler=ConstantScheduler(value=1),
                      action_weight_scheduler=ConstantScheduler(value=1.0)),
                  reset_noise_every_terminal_state=True,
                  name=name + '_agent')

    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']
        ['config_or_config_dict'],
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TEST_SAMPLES_COUNT']),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=exp_config['TrainTestFlow']
                               ['TRAIN_SAMPLES_COUNT'],
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
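For reference, a hedged sketch of the shape MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT would need for the unpacking above to work. Only the top-level keys and the TEST_SAMPLES_COUNT / TRAIN_SAMPLES_COUNT entries are taken from this example; the hyperparameter values are borrowed from the DDPG examples above, and the key names inside the TrainTestFlow config_or_config_dict are assumptions.

# Hedged sketch of a benchmark config dict; values are illustrative only.
_MLP_LAYERS = [{"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 64,
                "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
               {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT",
                "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}]

MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT = {
    # End-point key name assumed from the GlobalConfig usage in these examples.
    'DEFAULT_EXPERIMENT_END_POINT': dict(TOTAL_AGENT_TRAIN_SAMPLE_COUNT=100000),
    'MLPQValueFunction': dict(mlp_config=_MLP_LAYERS),
    # MountainCarContinuous-v0 has a 1-dimensional action, so N_UNITS=1 also
    # fits the policy output layer here.
    'DeterministicMLPPolicy': dict(mlp_config=_MLP_LAYERS),
    'DDPG': dict(replay_buffer=None,
                 config_or_config_dict={
                     "REPLAY_BUFFER_SIZE": 100000,
                     "GAMMA": 0.99,
                     "CRITIC_LEARNING_RATE": 0.001,
                     "ACTOR_LEARNING_RATE": 0.001,
                     "DECAY": 0.995,
                     "BATCH_SIZE": 128,
                     "TRAIN_ITERATION": 1,
                     "critic_clip_norm": 0.1,
                     "actor_clip_norm": 0.1,
                 }),
    'TrainTestFlow': dict(
        TEST_SAMPLES_COUNT=1000,
        TRAIN_SAMPLES_COUNT=1,
        # The key names below are assumptions about TrainTestFlow's config.
        config_or_config_dict=dict(TEST_EVERY_SAMPLE_COUNT=1000,
                                   TRAIN_EVERY_SAMPLE_COUNT=1,
                                   START_TRAIN_AFTER_SAMPLE_COUNT=1000,
                                   START_TEST_AFTER_SAMPLE_COUNT=1000)),
}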
Code example #8
def pendulum_task_fn():
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=10,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=50,
                               sample_type='transition',
                               env=agent.algo.dynamics_env,
                               in_which_status='TRAIN',
                               store_flag=False)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Code example #9
def task_fn():
    # create the gym environment with the make function
    env = make('Pendulum-v0')
    # give your experiment a name which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network that approximates the Q function of DDPG
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # construct the neural network that approximates the policy of DDPG
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)
    # construct the DDPG algorithm
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural-network-based global dynamics model to approximate the state transitions of the environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(
            dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(
            dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    # finally, construct the Dyna algorithm with the model-free algorithm DDPG and a NN dynamics model.
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN-based dynamics model a proper environment that can serve as a sampling source for DDPG, a
    # reward function and a terminal function need to be set.

    # For example purposes only, we use a random reward function and a terminal function with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct the agent, with an additional exploration strategy if needed.
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    # construct the training flow, called the Dyna flow; it defines how the training proceeds and the terminal condition
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(
            agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (),
                             dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (),
                            dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (),
                                   dict(sample_count=10,
                                        env=agent.env,
                                        store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (),
                                       dict(sample_count=10,
                                            env=agent.algo.dynamics_env,
                                            store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()
Code example #10
    def test_DDPG_1(self):
        self.LogSetup()
        env = make('Pendulum-v0')
        name = 'mb_test'
        env_spec = env.env_spec
        cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False, use_mbmf=True, \
                               model_path='/home/yitongx/Documents/baconian-project/experiments/log')
        actor_policy_mlp_config = [{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 32,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "2",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "3",
            "N_UNITS": 8,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "TANH",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }]
        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name=name + '_mlp_q',
                                  name_scope=name + '_mlp_q',
                                  output_high=env.action_space.high,
                                  mlp_config=actor_policy_mlp_config)
        mlp_policy = DeterministicMLPPolicy(
            env_spec=env_spec,
            name=name + '_mlp_policy',
            name_scope=name + '_mlp_policy',
            output_high=env.observation_space.high,
            mlp_config=actor_policy_mlp_config,
            reuse=False)
        polyak = 0.995
        gamma = 0.99
        batch_size = 128
        actor_lr = 0.001
        critic_lr = 0.001
        train_iter_per_call = 1
        buffer_size = 100000
        total_steps = 100000  # default 1000000
        max_step_per_episode = 500  # reset env when counter > max_step_per_episode
        train_after_step = 10000
        test_after_step = 10000
        train_every_step = 1
        test_every_step = 1000
        num_test = 10

        algo = DDPG(env_spec=env_spec,
                    config_or_config_dict={
                        "REPLAY_BUFFER_SIZE": buffer_size,
                        "GAMMA": gamma,
                        "CRITIC_LEARNING_RATE": critic_lr,
                        "ACTOR_LEARNING_RATE": actor_lr,
                        "DECAY": polyak,
                        "BATCH_SIZE": batch_size,
                        "TRAIN_ITERATION": train_iter_per_call,
                        "critic_clip_norm": 0.1,
                        "actor_clip_norm": 0.1,
                    },
                    value_func=mlp_q,
                    policy=mlp_policy,
                    name=name + '_ddpg',
                    replay_buffer=None)

        algo.init()
        buffer = TransitionData(env_spec=env_spec,
                                obs_shape=env_spec.obs_shape,
                                action_shape=env_spec.action_shape)
        for i in range(100):  # num_trajectory
            obs = env.reset()
            for j in range(1000):
                action = env.action_space.sample()
                obs_, reward, done, info = env.step(action)
                buffer.append(obs, action, obs_, done, reward)
                if done:
                    break
                else:
                    obs = obs_
        algo.append_to_memory(buffer)

        for i in range(10):
            res = algo.train()
            print(res)

        obs = env.reset()
        act = env.action_space.sample()
        obs_, reward, done, info = cyber.step(obs, act)
        print('obs_', obs_)

        obs = env.observation_space.sample()
        obs_batch = np.array([])
        for i in range(5):
            obs_batch = np.concatenate(
                (obs_batch, env.observation_space.sample()), axis=0)
            # print(obs_batch.shape)

        act_batch = algo.predict(obs_batch)
        print(act_batch.shape)
        print(act_batch)

        print('====> Test')
        act = algo.predict(obs)
        print(act.shape)
        obs = env.reset()
        print(obs.shape)
        obs_, reward, done, info = cyber.step(obs, act)
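Note that the np.concatenate call in the test above joins flattened samples into a 1-D array (shape (15,) for five 3-dimensional Pendulum observations). If a 2-D batch of shape (batch, obs_dim) is wanted instead, a small sketch follows; whether algo.predict expects a flat array or a 2-D batch depends on the Baconian version in use.

# Hedged sketch: build an explicit 2-D observation batch instead of a flat
# concatenation; reuses `env` from the test above.
import numpy as np  # numpy is already used elsewhere in these examples

obs_batch_2d = np.stack([env.observation_space.sample() for _ in range(5)],
                        axis=0)
print(obs_batch_2d.shape)  # (5, 3) for Pendulum-v0 observations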
Code example #11
    def test_DDPG_2(self):
        self.LogSetup()
        env = make('Pendulum-v0')
        name = 'mb_test'
        env_spec = env.env_spec
        cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False, use_mbmf=True, \
                               model_path='/home/yitongx/Documents/baconian-project/experiments/log')
        actor_policy_mlp_config = [{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 32,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "2",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "3",
            "N_UNITS": 8,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "TANH",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }]
        mlp_q = MLPQValueFunction(env_spec=env_spec,
                                  name=name + '_mlp_q',
                                  name_scope=name + '_mlp_q',
                                  output_high=env.action_space.high,
                                  mlp_config=actor_policy_mlp_config)
        mlp_policy = DeterministicMLPPolicy(
            env_spec=env_spec,
            name=name + '_mlp_policy',
            name_scope=name + '_mlp_policy',
            output_high=env.observation_space.high,
            mlp_config=actor_policy_mlp_config,
            reuse=False)
        polyak = 0.995
        gamma = 0.99
        noise_scale = 0.5
        noise_decay = 0.995
        batch_size = 128
        actor_lr = 0.001
        critic_lr = 0.001
        train_iter_per_call = 1
        buffer_size = 100000

        total_steps = 100000  # default 1000000
        max_step_per_episode = 500  # reset env when counter > max_step_per_episode
        train_after_step = 10000  # default 10000
        train_every_step = 1
        test_after_step = 10000
        test_every_step = 1000
        num_test = 10

        algo = DDPG(env_spec=env_spec,
                    config_or_config_dict={
                        "REPLAY_BUFFER_SIZE": buffer_size,
                        "GAMMA": gamma,
                        "CRITIC_LEARNING_RATE": critic_lr,
                        "ACTOR_LEARNING_RATE": actor_lr,
                        "DECAY": polyak,
                        "BATCH_SIZE": batch_size,
                        "TRAIN_ITERATION": train_iter_per_call,
                        "critic_clip_norm": 0.1,
                        "actor_clip_norm": 0.1,
                    },
                    value_func=mlp_q,
                    policy=mlp_policy,
                    name=name + '_ddpg',
                    replay_buffer=None)

        step_counter = SinglentonStepCounter(-1)
        noise_adder = AgentActionNoiseWrapper(
            noise=UniformNoise(scale=noise_scale),
            action_weight_scheduler=ConstantScheduler(1.),
            noise_weight_scheduler=DDPGNoiseScheduler(
                train_every_step=train_every_step,
                train_after_step=train_after_step,
                noise_decay=noise_decay,
                step_counter=step_counter))
        agent = DDPG_Agent(env=env,
                           algo=algo,
                           env_spec=env_spec,
                           noise_adder=noise_adder,
                           name=name + '_agent')

        agent.init()
        test_reward = []
        data_sample = []
        obs, ep_ret, ep_len = env.reset(), 0, 0
        for step in range(total_steps):
            step_counter.increase(1)
            act = agent.predict(obs=obs)
            obs_, reward, done, _ = cyber.step(obs, act)
            _buffer = TransitionData(env_spec=env_spec,
                                     obs_shape=env_spec.obs_shape,
                                     action_shape=env_spec.action_shape)
            _buffer.append(obs, act, obs_, done, reward)
            agent.algo.append_to_memory(_buffer)

            ep_ret += reward
            ep_len += 1

            if done or ep_len > max_step_per_episode:
                obs, ep_ret, ep_len = env.reset(), 0, 0
            else:
                obs = obs_
            if step > train_after_step and step % train_every_step == 0:
                agent.train()
            if step > test_after_step and step % test_every_step == 0:
                data_sample, test_reward = agent.test(
                    env=env,
                    cyber=cyber,
                    data_sample=data_sample,
                    test_reward=test_reward,
                    num_test=num_test,
                    max_step_per_episode=max_step_per_episode)