Example #1
def get_ddpg(vec_env, device):
    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )
    model = model_factory.instantiate(action_space=vec_env.action_space)
    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space
            ),
            normalize_returns=True,
            discount_factor=0.99
        ),
    )
    return model, reinforcer
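A minimal sketch of how the arguments to get_ddpg might be prepared, mirroring the setup in Example #2 below; the environment name, seed, and parallel_envs value are illustrative assumptions, not part of the original snippet.

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed = 1002
set_seed(seed)  # seed python std lib, numpy and pytorch

vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(
    parallel_envs=1, seed=seed)

model, reinforcer = get_ddpg(vec_env, device)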
Example #2
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(
        parallel_envs=1, seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17,
                                   hidden_layers=[64, 64],
                                   activation='tanh'),
        value_backbone=MLPFactory(input_length=23,
                                  hidden_layers=[64, 64],
                                  activation='tanh'),
    )

    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space),
            normalize_returns=True,
            discount_factor=0.99),
    )
Example #3
def get_ppo(vec_env, device):
    model = StochasticPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
    ).instantiate(action_space=vec_env.action_space)
    cliprange = LinearSchedule(
        initial_value=0.1,
        final_value=0.0
    )
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            batch_size=256,
            experience_replay=4,
            number_of_steps=128
        ),
        model=model,
        algo=PpoPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5,
            discount_factor=0.99,
            gae_lambda=0.95,
            cliprange=cliprange
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
        )
    )
    return model, reinforcer
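A sketch of how the model and reinforcer returned by get_ppo could be driven through a hand-rolled training loop, following the pattern of Examples #5 and #6 below; vec_env and device are assumed to be prepared as in the sketch after Example #1, and the optimizer settings, epoch count, and batches_per_epoch are illustrative assumptions.

model, reinforcer = get_ppo(vec_env, device)

# Optimizer over the reinforcer-owned model parameters
optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

# Metrics and callbacks collected over the whole training run
training_info = TrainingInfo(
    metrics=[EpisodeRewardMetric('episode_rewards')],
    callbacks=[FrameTracker(100_000)],
)

training_info.initialize()
reinforcer.initialize_training(training_info)
training_info.on_train_begin()

num_epochs = 10
for i in range(1, num_epochs + 1):
    epoch_info = EpochInfo(training_info=training_info,
                           global_epoch_idx=i,
                           batches_per_epoch=100,
                           optimizer=optimizer)
    reinforcer.train_epoch(epoch_info, interactive=False)

training_info.on_train_end()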
Example #4
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env = MujocoEnv('HalfCheetah-v2').instantiate(seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )

    model = model_factory.instantiate(action_space=env.action_space)

    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=100,
            batch_training_rounds=50,
            batch_size=64,
            discount_factor=0.99
        ),
        environment=env,
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
        ),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=64,
            buffer_capacity=1_000_000,
            buffer_initial_size=2_000,
            noise_std_dev=0.2,
            normalize_observations=True,
            normalize_returns=True,
            discount_factor=0.99
        )
    )
Example #5
def test_trpo_bipedal_walker():
    """
    1 iteration of TRPO on bipedal walker
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('BipedalWalker-v2'),
                                 normalize=True).instantiate(parallel_envs=8,
                                                             seed=seed)

    # Use a factory helper to create the model.
    # The model is owned by the reinforcer, so it should be accessed through
    # the reinforcer.model property rather than through a local variable.
    model_factory = PolicyGradientModelSeparateFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[32, 32]),
        value_backbone=MLPFactory(input_length=24, hidden_layers=[32]))

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(discount_factor=0.99),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        algo=TrpoPolicyGradient(
            max_kl=0.01,
            cg_iters=10,
            line_search_iters=10,
            improvement_acceptance_ratio=0.1,
            cg_damping=0.1,
            vf_iters=5,
            entropy_coef=0.0,
            max_grad_norm=0.5,
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
        ))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1.0e-3, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Average reward per episode
        ],
        callbacks=[FrameTracker(100_000)],  # Track the number of processed frames
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for this one-iteration test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
Example #6
def test_ddpg_bipedal_walker():
    """
    1 iteration of DDPG on the bipedal walker environment
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # DDPG uses a single (non-vectorized) environment
    env = MujocoEnv('BipedalWalker-v2').instantiate(seed=seed)

    # Use a factory helper to create the model.
    # The model is owned by the reinforcer, so it should be accessed through
    # the reinforcer.model property rather than through a local variable.
    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=24,
                                   hidden_layers=[64, 64],
                                   normalization='layer'),
        value_backbone=MLPFactory(input_length=28,
                                  hidden_layers=[64, 64],
                                  normalization='layer'))

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99),
        environment=env,
        algo=DeepDeterministicPolicyGradient(model_factory=model_factory,
                                             tau=0.01,
                                             max_grad_norm=0.5),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=DequeReplayRollerOuNoise(environment=env,
                                            device=device,
                                            batch_size=32,
                                            buffer_capacity=100,
                                            buffer_initial_size=100,
                                            noise_std_dev=0.2,
                                            normalize_observations=True,
                                            discount_factor=0.99))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Average reward per episode
        ],
        callbacks=[FrameTracker(100_000)],  # Track the number of processed frames
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for this one-iteration test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()