def breakout_a2c_evaluate(checkpoint_file_path, takes=10):
    """Load a saved Breakout policy and print statistics over several takes.

    Rebuilds the policy-gradient model (Nature CNN backbone, 84x84 input with
    4 channels to match the 4-frame ``FrameStack``), restores its weights from
    ``checkpoint_file_path``, calls ``record_take`` ``takes`` times and prints
    ``DataFrame.describe()`` of the collected lengths and rewards.

    :param checkpoint_file_path: path of a state dict readable by ``torch.load``
    :param takes: how many times ``record_take`` is called
    :return: None
    """
    # Prefer the first GPU, but keep the function usable on CPU-only hosts.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Deserialize onto the CPU: the checkpoint may have been written from a
    # device that does not exist on this machine, and the freshly built model
    # lives on the CPU until it is moved below.
    model_checkpoint = torch.load(checkpoint_file_path, map_location='cpu')

    env = FrameStack(
        ClassicAtariEnv('BreakoutNoFrameskip-v4').instantiate(preset='raw'), k=4
    )

    model = PolicyGradientModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=env.action_space)

    model.load_state_dict(model_checkpoint)
    model = model.to(device)

    # Switch to inference mode before recording any take.
    model.eval()

    results = [record_take(model, env, device) for _ in range(takes)]

    summary = pd.DataFrame({
        'lengths': [result['l'] for result in results],
        'rewards': [result['r'] for result in results],
    })
    print(summary.describe())
Example #2
0
def eval_model():
    """Restore the weights stored in ``tmp_checkout.data`` and evaluate them.

    Wraps a single ``ColoredEgoCostmapRandomAisleTurnEnv`` in a ``DummyVecEnv``,
    rebuilds the two-tower Nature CNN policy-gradient model (133x133 input,
    1 channel), loads the checkpoint onto the CPU and passes everything to
    ``evaluate_model`` with ``takes=10``.
    :return: None
    """
    # Seed python std lib, numpy and pytorch before anything is constructed
    set_seed(1001)

    def make_env():
        return ColoredEgoCostmapRandomAisleTurnEnv()

    vec_env = DummyVecEnv([make_env])
    vec_env.reset()

    backbone = NatureCnnTwoTowerFactory(
        input_width=133, input_height=133, input_channels=1)
    model_factory = PolicyGradientModelFactory(backbone=backbone)
    model = model_factory.instantiate(action_space=vec_env.action_space)

    # The checkpoint holds a state dict; map it to the CPU while loading
    state_dict = torch.load('tmp_checkout.data', map_location='cpu')
    model.load_state_dict(state_dict)

    cpu = torch.device('cpu')
    evaluate_model(model, vec_env, cpu, takes=10)
Example #3
0
def train_model():
    """Train a PPO agent on a bc-gym environment, evaluating after every epoch.

    Wraps a single ``ColoredEgoCostmapRandomAisleTurnEnv`` in a ``DummyVecEnv``,
    builds a two-tower Nature CNN policy-gradient model and trains it with
    ``PpoPolicyGradient`` through an ``OnPolicyIterationReinforcer``. After each
    epoch ``evaluate_model`` is called with ``takes=1`` and its result is
    appended to a history list. Every 100 epochs, and once more after the
    final epoch, the model weights are written to ``tmp_checkout.data`` and the
    history is pickled to ``tmp_eval_results.pkl``.
    :return: None
    """
    device = torch.device('cpu')
    seed = 1001

    # Sizes that several objects below must agree on. Keeping them in one
    # place stops the frame budget, rollout size and epoch count from
    # drifting apart.
    total_frames = 1.1e8
    number_of_steps = 128
    batches_per_epoch = 10
    checkpoint_interval = 100

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)
    env_functions = [lambda: ColoredEgoCostmapRandomAisleTurnEnv()]
    parallel_envs = len(env_functions)
    vec_env = DummyVecEnv(env_functions)

    # Again, use a helper to create a model
    # But because model is owned by the reinforcer, model should not be accessed using this variable
    # but from reinforcer.model property
    model = PolicyGradientModelFactory(backbone=NatureCnnTwoTowerFactory(
        input_width=133, input_height=133, input_channels=1)).instantiate(
            action_space=vec_env.action_space)

    # Schedule for the `cliprange` argument of PpoPolicyGradient
    # (separate from the gradient-norm limit `max_grad_norm`).
    cliprange = LinearSchedule(initial_value=0.01, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(discount_factor=0.99,
                                                     batch_size=256,
                                                     experience_replay=4),
        model=model,
        algo=PpoPolicyGradient(entropy_coefficient=0.01,
                               value_coefficient=0.5,
                               max_grad_norm=0.01,
                               cliprange=cliprange),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            gae_lambda=0.95,
            number_of_steps=number_of_steps,
            discount_factor=0.99,
        ))

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1e-6, eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            # Calculate average reward from episode
            EpisodeRewardMetric('episode_rewards'),
        ],
        callbacks=[
            # Print live metrics every epoch to standard output
            StdoutStreaming(),
            # We need frame tracker to track the progress of learning
            FrameTracker(total_frames),
        ])

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 1 environment times 128 steps
    num_epochs = int(
        total_frames / (number_of_steps * parallel_envs) / batches_per_epoch)

    # Normal handrolled training loop
    eval_results = []
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(training_info=training_info,
                               global_epoch_idx=i,
                               batches_per_epoch=batches_per_epoch,
                               optimizer=optimizer)

        reinforcer.train_epoch(epoch_info)

        # Use the reinforcer-owned model, as the note above requires.
        # NOTE(review): evaluation reuses the training vec_env - confirm that
        # evaluate_model does not disturb the state StepEnvRoller relies on.
        eval_result = evaluate_model(reinforcer.model, vec_env, device, takes=1)
        eval_results.append(eval_result)

        # num_epochs is not a multiple of the checkpoint interval, so also
        # save after the last epoch; otherwise the tail of training is lost.
        if i % checkpoint_interval == 0 or i == num_epochs:
            torch.save(reinforcer.model.state_dict(), 'tmp_checkout.data')
            with open('tmp_eval_results.pkl', 'wb') as f:
                pickle.dump(eval_results, f, 0)

    training_info.on_train_end()
Example #4
0
def breakout_a2c():
    """Train an A2C agent on ``BreakoutNoFrameskip-v4`` using ``cuda:0``.

    Rolls out 16 parallel environments for 5 steps at a time, optimises the
    Nature CNN policy-gradient model with RMSprop and reports the
    ``episode_rewards`` metric through ``StdoutStreaming``.
    :return: None
    """
    seed = 1001
    parallel_envs = 16
    rollout_steps = 5
    batches_per_epoch = 100
    device = torch.device('cuda:0')

    # Seed python std lib, numpy and pytorch
    set_seed(seed)

    # Helper that builds the vectorised Atari environment: 16 copies with a
    # frame history of 4
    env_wrapper = SubprocVecEnvWrapper(
        ClassicAtariEnv('BreakoutNoFrameskip-v4'), frame_history=4)
    vec_env = env_wrapper.instantiate(parallel_envs=parallel_envs, seed=seed)

    # The reinforcer takes ownership of the model, so after this point it is
    # reached through reinforcer.model rather than through `model`
    backbone = NatureCnnFactory(
        input_width=84, input_height=84, input_channels=4)
    model = PolicyGradientModelFactory(backbone=backbone).instantiate(
        action_space=vec_env.action_space)

    # Pieces of the reinforcer, the object managing the learning process
    settings = OnPolicyIterationReinforcerSettings(
        discount_factor=0.99,
        batch_size=256,
    )
    algo = A2CPolicyGradient(
        entropy_coefficient=0.01,
        value_coefficient=0.5,
        max_grad_norm=0.5,
    )
    env_roller = StepEnvRoller(
        environment=vec_env,
        device=device,
        number_of_steps=rollout_steps,
        discount_factor=0.99,
    )
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=settings,
        model=model,
        algo=algo,
        env_roller=env_roller,
    )

    # Model optimizer
    optimizer = optim.RMSprop(
        reinforcer.model.parameters(), lr=7.0e-4, eps=1e-3)

    # Store for training information: one reward metric, streamed to stdout
    reward_metric = EpisodeRewardMetric('episode_rewards')
    stdout_callback = StdoutStreaming()
    training_info = TrainingInfo(
        metrics=[reward_metric],
        callbacks=[stdout_callback],
    )

    # Training initialization bookkeeping
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # 100 batches per epoch to average metrics nicely; one batch is a
    # rollout of 5 steps in each of the 16 environments
    num_epochs = int(1.1e7 / (rollout_steps * parallel_envs) / batches_per_epoch)

    # Hand-rolled training loop, epochs numbered from 1
    for epoch_idx in range(1, num_epochs + 1):
        reinforcer.train_epoch(
            EpochInfo(training_info=training_info,
                      global_epoch_idx=epoch_idx,
                      batches_per_epoch=batches_per_epoch,
                      optimizer=optimizer))

    training_info.on_train_end()
Example #5
0
def qbert_ppo():
    """Train a PPO agent on ``QbertNoFrameskip-v4`` using ``cuda:0``.

    Rolls out 8 parallel environments for 128 steps at a time, optimises the
    Nature CNN policy-gradient model with Adam and reports the
    ``episode_rewards`` metric through ``StdoutStreaming``. A ``FrameTracker``
    is registered with the same 1.1e7 frame budget used to size the run.
    :return: None
    """
    seed = 1001
    total_frames = 1.1e7
    parallel_envs = 8
    rollout_steps = 128
    batches_per_epoch = 10
    device = torch.device('cuda:0')

    # Seed python std lib, numpy and pytorch
    set_seed(seed)

    # Helper that builds the vectorised Atari environment: 8 copies with a
    # frame history of 4
    env_wrapper = SubprocVecEnvWrapper(
        ClassicAtariEnv('QbertNoFrameskip-v4'), frame_history=4)
    vec_env = env_wrapper.instantiate(parallel_envs=parallel_envs, seed=seed)

    # The reinforcer takes ownership of the model, so after this point it is
    # reached through reinforcer.model rather than through `model`
    backbone = NatureCnnFactory(
        input_width=84, input_height=84, input_channels=4)
    model = PolicyGradientModelFactory(backbone=backbone).instantiate(
        action_space=vec_env.action_space)

    # Schedule for the `cliprange` argument of the PPO algorithm
    # (separate from the gradient-norm limit `max_grad_norm`)
    cliprange = LinearSchedule(initial_value=0.1, final_value=0.0)

    # Pieces of the reinforcer, the object managing the learning process
    settings = OnPolicyIterationReinforcerSettings(
        discount_factor=0.99,
        batch_size=256,
        experience_replay=4,
    )
    algo = PpoPolicyGradient(
        entropy_coefficient=0.01,
        value_coefficient=0.5,
        max_grad_norm=0.5,
        cliprange=cliprange,
    )
    env_roller = StepEnvRoller(
        environment=vec_env,
        device=device,
        gae_lambda=0.95,
        number_of_steps=rollout_steps,
        discount_factor=0.99,
    )
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=settings,
        model=model,
        algo=algo,
        env_roller=env_roller,
    )

    # Model optimizer
    optimizer = optim.Adam(
        reinforcer.model.parameters(), lr=2.5e-4, eps=1.0e-5)

    # Store for training information: the reward metric, stdout streaming
    # and the frame tracker that follows the progress of learning
    reward_metric = EpisodeRewardMetric('episode_rewards')
    stdout_callback = StdoutStreaming()
    frame_tracker = FrameTracker(total_frames)
    training_info = TrainingInfo(
        metrics=[reward_metric],
        callbacks=[stdout_callback, frame_tracker],
    )

    # Training initialization bookkeeping
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # 10 batches per epoch to average metrics nicely; one batch is a
    # rollout of 128 steps in each of the 8 environments
    num_epochs = int(total_frames / (rollout_steps * parallel_envs) / batches_per_epoch)

    # Hand-rolled training loop, epochs numbered from 1
    for epoch_idx in range(1, num_epochs + 1):
        reinforcer.train_epoch(
            EpochInfo(training_info=training_info,
                      global_epoch_idx=epoch_idx,
                      batches_per_epoch=batches_per_epoch,
                      optimizer=optimizer))

    training_info.on_train_end()