Example No. 1
def main():
    """ Paperboy entry point - parse the arguments and run a command """
    parser = argparse.ArgumentParser(
        description='Paperboy deep learning launcher')

    parser.add_argument('config',
                        metavar='FILENAME',
                        help='Configuration file for the run')
    parser.add_argument('command', metavar='COMMAND', help='A command to run')
    parser.add_argument('varargs',
                        nargs='*',
                        metavar='VARARGS',
                        help='Extra options to the command')
    parser.add_argument('-r',
                        '--run_number',
                        type=int,
                        default=0,
                        help="A run number")
    parser.add_argument('-d',
                        '--device',
                        default='cuda',
                        help="A device to run the model on")
    parser.add_argument('-s',
                        '--seed',
                        type=int,
                        default=None,
                        help="Random seed for the project")
    parser.add_argument('-p',
                        '--param',
                        type=str,
                        metavar='NAME=VALUE',
                        action='append',
                        default=[],
                        help="Configuration parameters")
    parser.add_argument('--reset',
                        action='store_true',
                        default=False,
                        help="Overwrite existing model storage")

    args = parser.parse_args()

    model_config = ModelConfig.from_file(
        args.config,
        args.run_number,
        reset=args.reset,
        device=args.device,
        seed=args.seed,
        params={
            k: v
            for (k, v) in (Parser.parse_equality(eq) for eq in args.param)
        })

    # Set seed already in the launcher
    set_seed(model_config.seed)

    model_config.banner(args.command)
    model_config.run_command(args.command, args.varargs)
    model_config.quit_banner()
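
The -p NAME=VALUE options above are folded into a params dict via Parser.parse_equality. As a rough illustration only (an assumption, not vel's actual implementation), such a helper just needs to split on the first '=' and try to read the value as a Python literal:

import ast

def parse_equality(expression):
    """Hypothetical stand-in for Parser.parse_equality (assumption):
    split 'NAME=VALUE' on the first '=' and parse VALUE as a Python
    literal, falling back to the raw string."""
    name, _, value = expression.partition('=')
    try:
        return name, ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return name, value

# Build the params dict the same way main() does
params = dict(parse_equality(eq) for eq in ['model.lr=0.001', 'tag=baseline'])
assert params == {'model.lr': 0.001, 'tag': 'baseline'}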
Example No. 2
def pivoting_rl(args):
    device = torch.device('cuda:'+str(args.gpu) if torch.cuda.is_available() else 'cpu')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(
        MujocoEnv('HalfCheetah-v2')
    ).instantiate(parallel_envs=1, seed=seed)

    if args.algo == 'ddpg':
        model, reinforcer = get_ddpg(vec_env, device)
    elif args.algo == 'ppo':
        model, reinforcer = get_ppo(vec_env, device)
    else:
        raise ValueError('Unknown algo: {}'.format(args.algo))

    # Optimizer helper - weird regularization settings copied from OpenAI code
    adam_optimizer = AdamFactory(
        lr=[1.0e-4, 1.0e-3, 1.0e-3],
        weight_decay=[0.0, 0.0, 0.001],
        eps=1.0e-4,
        layer_groups=True
    ).instantiate(model)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[StdoutStreaming()]  # Print live metrics every epoch to standard output
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # 1000 batches per epoch of 2-step rollouts: ~1M frames in 500 epochs
    num_epochs = int(1.0e6 / 2 / 1000)

    # Normal handrolled training loop
    for i in range(1, num_epochs+1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1000,
            optimizer=adam_optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
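
get_ddpg and get_ppo are not shown in this snippet; they are assumed to return a (model, reinforcer) pair. A sketch of get_ddpg that mirrors the construction in Example No. 3 below (hypothetical, vel imports omitted):

def get_ddpg(vec_env, device):
    """Hypothetical helper (assumption): wires up the same DDPG building
    blocks as Example No. 3 and returns the (model, reinforcer) pair."""
    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )
    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(rollout_steps=2, training_steps=64),
        model=model,
        algo=DeepDeterministicPolicyGradient(model_factory=model_factory, discount_factor=0.99, tau=0.01),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space),
            normalize_returns=True,
            discount_factor=0.99),
    )

    return model, reinforcer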
Example No. 3
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(
        parallel_envs=1, seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17,
                                   hidden_layers=[64, 64],
                                   activation='tanh'),
        value_backbone=MLPFactory(input_length=23,
                                  hidden_layers=[64, 64],
                                  activation='tanh'),
    )

    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space),
            normalize_returns=True,
            discount_factor=0.99),
    )
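
half_cheetah_ddpg above stops after constructing the reinforcer (Example No. 4 does the same); nothing is trained yet. A sketch of how the body would continue, reusing the handrolled loop from Example No. 2 (the optimizer settings and epoch budget here are assumptions):

    optimizer = optim.Adam(model.parameters(), lr=1.0e-3, eps=1.0e-4)  # assumed settings

    training_info = TrainingInfo(
        metrics=[EpisodeRewardMetric('episode_rewards')],
        callbacks=[StdoutStreaming()]
    )

    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    num_epochs = 500  # assumed budget; Example No. 2 uses int(1.0e6 / 2 / 1000)

    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1000,
            optimizer=optimizer
        )
        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()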
Example No. 4
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env = MujocoEnv('HalfCheetah-v2').instantiate(seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )

    model = model_factory.instantiate(action_space=env.action_space)

    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=100,
            batch_training_rounds=50,
            batch_size=64,
            discount_factor=0.99
        ),
        environment=env,
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
        ),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=64,
            buffer_capacity=1_000_000,
            buffer_initial_size=2_000,
            noise_std_dev=0.2,
            normalize_observations=True,
            normalize_returns=True,
            discount_factor=0.99
        )
    )
Example No. 5
def eval_model():
    """load a checkpoint data and evaluate its performance
    :return: None
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env_function = lambda: ColoredEgoCostmapRandomAisleTurnEnv()
    vec_env = DummyVecEnv([env_function])
    vec_env.reset()

    model = PolicyGradientModelFactory(backbone=NatureCnnTwoTowerFactory(
        input_width=133, input_height=133, input_channels=1)).instantiate(
            action_space=vec_env.action_space)
    model_checkpoint = torch.load('tmp_checkout.data', map_location='cpu')
    model.load_state_dict(model_checkpoint)

    evaluate_model(model, vec_env, device, takes=10)
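
evaluate_model is used here without being defined. A minimal sketch of what it is assumed to do (the model.step convention of returning a dict with an 'actions' tensor is an assumption about vel, not confirmed by this snippet):

import numpy as np
import torch

def evaluate_model_sketch(model, vec_env, device, takes=10):
    """Hypothetical sketch: run the policy for `takes` episodes on a
    single-env DummyVecEnv and return the mean episode reward."""
    model.eval()
    episode_rewards = []
    for _ in range(takes):
        observations = vec_env.reset()
        done, total_reward = False, 0.0
        while not done:
            obs_tensor = torch.from_numpy(np.asarray(observations)).float().to(device)
            with torch.no_grad():
                actions = model.step(obs_tensor)['actions'].cpu().numpy()
            observations, rewards, dones, _ = vec_env.step(actions)
            total_reward += float(rewards[0])
            done = bool(dones[0])
        episode_rewards.append(total_reward)
    return float(np.mean(episode_rewards))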
Example No. 6
def train_model():
    """a sample training script, that creates a PPO instance and train it with bc-gym environment
    :return: None
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)
    env_function = lambda: ColoredEgoCostmapRandomAisleTurnEnv()
    vec_env = DummyVecEnv([env_function])

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model = PolicyGradientModelFactory(backbone=NatureCnnTwoTowerFactory(
        input_width=133, input_height=133, input_channels=1)).instantiate(
            action_space=vec_env.action_space)

    # Schedule for the PPO clip range (decays linearly to zero)
    cliprange = LinearSchedule(initial_value=0.01, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(discount_factor=0.99,
                                                     batch_size=256,
                                                     experience_replay=4),
        model=model,
        algo=PpoPolicyGradient(entropy_coefficient=0.01,
                               value_coefficient=0.5,
                               max_grad_norm=0.01,
                               cliprange=cliprange),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            gae_lambda=0.95,
            number_of_steps=128,
            discount_factor=0.99,
        ))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1e-6, eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[
            StdoutStreaming(),   # Print live metrics every epoch to standard output
            FrameTracker(1.1e8)  # Frame tracker tracks learning progress
        ])

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 1 environment times 128 steps
    num_epochs = int(1.1e8 / (128 * 1) / 10)

    # Normal handrolled training loop
    eval_results = []
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=10,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info)

        eval_result = evaluate_model(model, vec_env, device, takes=1)
        eval_results.append(eval_result)

        if i % 100 == 0:
            torch.save(model.state_dict(), 'tmp_checkout.data')
            with open('tmp_eval_results.pkl', 'wb') as f:
                pickle.dump(eval_results, f, 0)

    training_info.on_train_end()
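
The cliprange used above is a schedule rather than a constant. Its assumed semantics (an assumption based on the argument names and on FrameTracker supplying training progress):

def linear_schedule_value(initial_value, final_value, progress):
    """Hypothetical illustration of LinearSchedule: interpolate linearly
    as training progress moves from 0.0 (start) to 1.0 (frame budget)."""
    return initial_value + (final_value - initial_value) * progress

# The PPO clip range in train_model() would decay from 0.01 toward 0.0:
assert linear_schedule_value(0.01, 0.0, 0.5) == 0.005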
Example No. 7
def breakout_a2c():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(ClassicAtariEnv('BreakoutNoFrameskip-v4'),
                                   frame_history=4).instantiate(
                                       parallel_envs=16, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model = PolicyGradientModelFactory(backbone=NatureCnnFactory(
        input_width=84, input_height=84, input_channels=4)).instantiate(
            action_space=vec_env.action_space)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            batch_size=256,
        ),
        model=model,
        algo=A2CPolicyGradient(entropy_coefficient=0.01,
                               value_coefficient=0.5,
                               max_grad_norm=0.5),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=5,
            discount_factor=0.99,
        ))

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(),
                              lr=7.0e-4,
                              eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[StdoutStreaming()]  # Print live metrics every epoch to standard output
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 100 batches per epoch to average metrics nicely
    num_epochs = int(1.1e7 / (5 * 16) / 100)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=100,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
Example No. 8
def main():
    """ Paperboy entry point - parse the arguments and run a command """
    parser = argparse.ArgumentParser(
        description='Paperboy deep learning launcher')

    parser.add_argument('config',
                        metavar='FILENAME',
                        help='Configuration file for the run')
    parser.add_argument('command', metavar='COMMAND', help='A command to run')
    parser.add_argument('varargs',
                        nargs='*',
                        metavar='VARARGS',
                        help='Extra options to the command')
    parser.add_argument('-r',
                        '--run_number',
                        type=int,
                        default=0,
                        help="A run number")
    parser.add_argument('-d',
                        '--device',
                        default='cuda',
                        help="A device to run the model on")
    parser.add_argument('-s',
                        '--seed',
                        type=int,
                        default=None,
                        help="Random seed for the project")
    parser.add_argument('-p',
                        '--param',
                        type=str,
                        metavar='NAME=VALUE',
                        action='append',
                        default=[],
                        help="Configuration parameters")
    parser.add_argument('--continue',
                        action='store_true',
                        default=False,
                        help="Continue previously started learning process")
    parser.add_argument('--profile',
                        type=str,
                        default=None,
                        help="Profiler output")

    args = parser.parse_args()

    model_config = ModelConfig.from_file(
        args.config,
        args.run_number,
        continue_training=getattr(args, 'continue'),  # 'continue' is a Python keyword
        device=args.device,
        seed=args.seed,
        params={
            k: v
            for (k, v) in (Parser.parse_equality(eq) for eq in args.param)
        })

    if model_config.project_dir not in sys.path:
        sys.path.append(model_config.project_dir)

    multiprocessing_setting = model_config.provide_with_default(
        'multiprocessing', default=None)

    if multiprocessing_setting:
        # This needs to be called before any PyTorch modules are imported
        multiprocessing.set_start_method(multiprocessing_setting)

    # Set seed already in the launcher
    from vel.util.random import set_seed
    set_seed(model_config.seed)

    model_config.banner(args.command)

    if args.profile:
        print("[PROFILER] Running Vel in profiling mode, output filename={}".
              format(args.profile))
        import cProfile
        import pstats
        profiler = cProfile.Profile()
        profiler.enable()
        model_config.run_command(args.command, args.varargs)
        profiler.disable()

        profiler.dump_stats(args.profile)
        profiler.print_stats(sort='tottime')

        print(
            "======================================================================"
        )
        pstats.Stats(profiler).strip_dirs().sort_stats('tottime').print_stats(
            30)
        print(
            "======================================================================"
        )
        pstats.Stats(profiler).strip_dirs().sort_stats('cumtime').print_stats(
            30)
    else:
        model_config.run_command(args.command, args.varargs)

    model_config.quit_banner()
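
The stats written by profiler.dump_stats(args.profile) can be inspected later without re-running the command; pstats loads the dump directly ('vel_profile.out' is a hypothetical filename):

import pstats

stats = pstats.Stats('vel_profile.out')
stats.strip_dirs().sort_stats('cumtime').print_stats(30)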
Example No. 9
def test_acer_breakout():
    """
    One iteration of ACER on the Breakout environment
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(ClassicAtariEnv('BreakoutNoFrameskip-v4'),
                                   frame_history=4).instantiate(
                                       parallel_envs=16, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model_factory = QPolicyGradientModelFactory(backbone=NatureCnnFactory(
        input_width=84, input_height=84, input_channels=4))

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedMixedPolicyIterationReinforcer(
        device=device,
        settings=BufferedMixedPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            experience_replay=2,
            stochastic_experience_replay=False),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        env=vec_env,
        algo=AcerPolicyGradient(
            model_factory=model_factory,
            entropy_coefficient=0.01,
            q_coefficient=0.5,
            rho_cap=10.0,
            retrace_rho_cap=1.0,
            trust_region=True,
            trust_region_delta=1.0,
            max_grad_norm=10.0,
        ),
        env_roller=ReplayQEnvRoller(environment=vec_env,
                                    device=device,
                                    number_of_steps=12,
                                    discount_factor=0.99,
                                    buffer_capacity=100,
                                    buffer_initial_size=100,
                                    frame_stack_compensation=4),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(),
                              lr=7.0e-4,
                              eps=1e-3,
                              alpha=0.99)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[]  # No callbacks - keep the test output quiet
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Single epoch with a single batch - just a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
Example No. 10
def test_trpo_bipedal_walker():
    """
    One iteration of TRPO on the BipedalWalker environment
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('BipedalWalker-v2'),
                                 normalize=True).instantiate(parallel_envs=8,
                                                             seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model_factory = PolicyGradientModelSeparateFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[32, 32]),
        value_backbone=MLPFactory(input_length=24, hidden_layers=[32]))

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(discount_factor=0.99),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        algo=TrpoPolicyGradient(
            max_kl=0.01,
            cg_iters=10,
            line_search_iters=10,
            improvement_acceptance_ratio=0.1,
            cg_damping=0.1,
            vf_iters=5,
            entropy_coef=0.0,
            max_grad_norm=0.5,
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
        ))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1.0e-3, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track progress toward the 100k frame budget
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Single epoch with a single batch - just a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
Example No. 11
def test_ddpg_bipedal_walker():
    """
    One iteration of DDPG on the BipedalWalker environment
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only single environment for DDPG
    env = MujocoEnv('BipedalWalker-v2').instantiate(seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=24,
                                   hidden_layers=[64, 64],
                                   normalization='layer'),
        value_backbone=MLPFactory(input_length=28,
                                  hidden_layers=[64, 64],
                                  normalization='layer'))

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99),
        environment=env,
        algo=DeepDeterministicPolicyGradient(model_factory=model_factory,
                                             tau=0.01,
                                             max_grad_norm=0.5),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=DequeReplayRollerOuNoise(environment=env,
                                            device=device,
                                            batch_size=32,
                                            buffer_capacity=100,
                                            buffer_initial_size=100,
                                            noise_std_dev=0.2,
                                            normalize_observations=True,
                                            discount_factor=0.99))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track progress toward the 100k frame budget
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Single epoch with a single batch - just a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
Example No. 12
def test_prioritized_dqn_breakout():
    """
    One iteration of prioritized-replay DQN on Breakout
    """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only single environment for DQN
    env = ClassicAtariEnv('BreakoutNoFrameskip-v4').instantiate(seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model_factory = QModelFactory(backbone=NatureCnnFactory(
        input_width=84, input_height=84, input_channels=4))

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99),
        environment=env,
        algo=DeepQLearning(model_factory=model_factory,
                           double_dqn=False,
                           target_update_frequency=10_000,
                           max_grad_norm=0.5),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=PrioritizedReplayRollerEpsGreedy(
            environment=env,
            device=device,
            epsilon_schedule=LinearAndConstantSchedule(
                initial_value=1.0, final_value=0.1, end_of_interpolation=0.1),
            batch_size=8,
            buffer_capacity=100,
            priority_epsilon=1.0e-6,
            buffer_initial_size=100,
            frame_stack=4,
            priority_exponent=0.6,
            priority_weight=LinearSchedule(initial_value=0.4, final_value=1.0),
        ),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(),
                              lr=2.5e-4,
                              alpha=0.95,
                              momentum=0.95,
                              eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track progress toward the 100k frame budget
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Single epoch with a single batch - just a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=1,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
Example No. 13
def qbert_ppo():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 8 environments evaluated in parallel in subprocesses with all the usual DeepMind wrappers
    # These are just helper functions for that
    vec_env = SubprocVecEnvWrapper(ClassicAtariEnv('QbertNoFrameskip-v4'),
                                   frame_history=4).instantiate(
                                       parallel_envs=8, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, access it through the
    # reinforcer.model property rather than through this variable.
    model = StochasticPolicyModelFactory(
        input_block=ImageToTensorFactory(),
        backbone=NatureCnnFactory(
            input_width=84, input_height=84,
            input_channels=4)).instantiate(action_space=vec_env.action_space)

    # Schedule for the PPO clip range (decays linearly to zero)
    cliprange = LinearSchedule(initial_value=0.1, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(batch_size=256,
                                                     experience_replay=4,
                                                     number_of_steps=128),
        model=model,
        algo=PpoPolicyGradient(entropy_coefficient=0.01,
                               value_coefficient=0.5,
                               max_grad_norm=0.5,
                               discount_factor=0.99,
                               gae_lambda=0.95,
                               cliprange=cliprange),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
        ))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(),
                           lr=2.5e-4,
                           eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[
            StdoutStreaming(),   # Print live metrics every epoch to standard output
            FrameTracker(1.1e7)  # Frame tracker tracks learning progress
        ])

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 8 environments times 128 steps
    num_epochs = int(1.1e7 / (128 * 8) / 10)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=10,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()