Code example #1
def main(_):
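    # Build the chosen learner (DDPG or PPO) with its actor/critic networks and memory for each mini-game map, then train.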
    for map_name in env_names:
        if rl_algo == 'ddpg':
            from agent.ddpg import DDPGAgent
            from networks.acnetwork_q_seperated import ActorNet, CriticNet
            from utils.memory import SequentialMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = SequentialMemory(limit=arglist.DDPG.memory_limit)
            learner = DDPGAgent(actor, critic, memory)

        elif rl_algo == 'ppo':
            from agent.ppo import PPOAgent
            from networks.acnetwork_v_seperated import ActorNet, CriticNet
            from utils.memory import EpisodeMemory

            actor = ActorNet()
            critic = CriticNet()
            memory = EpisodeMemory(limit=arglist.PPO.memory_limit,
                                   action_shape=arglist.action_shape,
                                   observation_shape=arglist.observation_shape)
            learner = PPOAgent(actor, critic, memory)

        else:
            raise NotImplementedError()

        preprocess = Preprocess()
        game = MiniGame(map_name, learner, preprocess, nb_episodes=10000)
        game.run_ddpg()
    return 0
Code example #2
    def __init__(self, agent_id: int, enforce_env_name: str = None):
        """Chiefinvestigator can assign investigator to inspect the model and produce high-level analysis.

        Args:
            agent_id: ID of agent that will be analyzed.
            env_name: Name of the gym environment that the agent was trained in. Default is set to CartPole-v1
        """

        self.agent = PPOAgent.from_agent_state(agent_id, from_iteration='best')
        super().__init__(self.agent.policy, self.agent.distribution,
                         self.agent.preprocessor)
        self.env = self.agent.env
        if enforce_env_name is not None:
            print(
                f"Enforcing environment {enforce_env_name} over the agent's original environment. If you want to "
                f"use the same environment as the original agent anyway, there is no need to specify it in the "
                f"constructor!")
            self.env = gym.make(enforce_env_name)
        self.agent.preprocessor = CombiWrapper([
            StateNormalizationWrapper(self.agent.state_dim),
            RewardNormalizationWrapper()
        ])  # dirty fix, TODO remove soon
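        # The recurrent policy layer's weights give the hidden state size and are used to infer the RNN type.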
        self.weights = self.get_layer_weights('policy_recurrent_layer')
        self.n_hidden = self.weights[1].shape[0]
        self._get_rnn_type()
        self.sub_model_from = build_sub_model_from(self.network,
                                                   "beta_action_head")
Code example #3
def evaluate():
    """Evaluate an agent."""
    if request.method == "POST":
        try:
            agent = PPOAgent.from_agent_state(request.json['id'])
            evaluation_stats, _ = agent.evaluate(10, save=True)

            return {"results": evaluation_stats._asdict()}

        except Exception as e:
            return {"success": e.__repr__()}

    return {"success": "success"}
Code example #4
File: train_ppo.py Project: flamz3d/q-trader
if len(sys.argv) < 4:
    print("Usage: python train.py [stock] [window] [episodes]")
    exit()

stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(
    sys.argv[3])
model_file = None

if len(sys.argv) == 5:
    model_file = sys.argv[4]

if __name__ == '__main__':
    actor_model_file = None
    critic_model_file = None
    if model_file is not None:
        model_file_path = os.path.dirname(os.path.abspath(model_file))
        base_filename = os.path.basename(os.path.abspath(model_file))
        base_filename = os.path.splitext(base_filename)[0]
        index = base_filename.rfind("_")
        episode_to_load = base_filename[index + 1:]
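        # Resume from the matching checkpoints named model_critic_<stock>_<episode>.h5 and model_actor_<stock>_<episode>.h5.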
        critic_model_file = os.path.join(
            model_file_path,
            "model_critic_" + stock_name + "_" + episode_to_load + ".h5")
        actor_model_file = os.path.join(
            model_file_path,
            "model_actor_" + stock_name + "_" + episode_to_load + ".h5")

    agent = PPOAgent(StockPredict(window_size, stock_name, episode_count),
                     actor_model_file, critic_model_file)
    agent.run(episode_count)
Code example #5
def test():
    parser = argparse.ArgumentParser(
        description='Test an agent in the ViZDoom environment.')
    parser.add_argument('agent_path', help='path to the agent checkpoint')
    parser.add_argument('--show_game',
                        dest='show_game',
                        default=False,
                        action='store_true',
                        help='whether to show the game while agent is playing')
    parser.add_argument('--record',
                        dest='record',
                        default=False,
                        action='store_true',
                        help='whether to record the agent playing')
    parser.add_argument('--output_path',
                        dest='output_path',
                        help='output path for the replay')
    parser.add_argument('--cuda',
                        dest='cuda',
                        default=False,
                        action='store_true',
                        help='whether to use cuda')
    parser.add_argument('--n_games',
                        dest='n_games',
                        default=1,
                        type=int,
                        help='number of games to play')

    args = parser.parse_args()
    env_params, progress_params, agent_params = CheckpointMonitor.load(
        args.agent_path)
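    # Rebuild the game, action set, and network architecture from the parameters stored with the checkpoint.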

    game = initialize_vizdoom(env_params["map_name"], args.show_game)
    actions = env_params["env"]["actions"]

    in_channels = env_params["env"]["state_dim"][0] * env_params["env"][
        "frames_per_state"]
    if env_params["agent"]["nn"] == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif env_params["agent"]["nn"] == 'capsnet':
        feature_net = CapsNet(in_channels)

    if env_params["agent"]["alg"] == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        agent = PPOAgent(policy, None, None, cuda=args.cuda)
    elif env_params["agent"]["alg"] == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        agent = A2CAgent(policy, None, cuda=args.cuda)
    elif env_params["agent"]["alg"] == 'dqn':
        q_net = QNetwork(feature_net, len(actions))
        agent = DQNAgent(q_net, q_net, None, None, None, cuda=args.cuda)

    agent.load(agent_params)

    checkpoint_monitor = CheckpointMonitor(env_params, agent)
    generator = TrajectoryGenerator(
        game,
        0,
        0,
        agent,
        param_schedules=progress_params.get("schedules", None),
        monitors=[checkpoint_monitor, env_params["progress_monitor"]],
        **env_params["env"])

    mean, std, max, min, frames = generator.test(args.n_games, args.record)

    print("Score: %1.f +/- %1.f, max: %1.f, min: %1.f" % (mean, std, max, min))

    if args.record:
        save_recording(frames, args.output_path)
Code example #6
import json
import os
from typing import List

import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from scipy.signal import savgol_filter

from agent.ppo import PPOAgent
from utilities.const import QUALITATIVE_COLOR_PALETTE, PATH_TO_EXPERIMENTS

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

AGENT_ID = 1587117437
os.chdir("../../")

with open(f"{PATH_TO_EXPERIMENTS}/{AGENT_ID}/progress.json", "r") as f:
    data = json.load(f)

with open(f"{PATH_TO_EXPERIMENTS}/{AGENT_ID}/meta.json", "r") as f:
    meta = json.load(f)

agent = PPOAgent.from_agent_state(AGENT_ID, "b", path_modifier="")

mean_rewards = data["rewards"]["mean"]
mean_rewards_smooth = savgol_filter(mean_rewards, 51, 3)
std_rewards = data["rewards"]["stdev"]

axs: List[Axes]
fig: Figure = plt.figure(figsize=(12, 4))
grid = plt.GridSpec(1, 3)

progression_ax = fig.add_subplot(grid[:2])
progression_ax.set_xlim(0, len(mean_rewards))
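# Mark the environment's reward threshold, if one is defined, as a dashed horizontal reference line.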

reward_threshold = meta["environment"]["reward_threshold"]
if reward_threshold is not None and reward_threshold != "None":
    progression_ax.axhline(reward_threshold, ls="--", color="grey")
progression_ax.plot(mean_rewards)  # call truncated in the source listing
Code example #7
#!/usr/bin/env python
"""Example script on loading and inspecting an agent."""
import os

from agent.ppo import PPOAgent
from analysis.investigation import Investigator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

os.chdir("../")

agent = PPOAgent.from_agent_state(1580042580)
inv = Investigator.from_agent(agent)

# render agent at different steps
inv.render_episode(agent.env)
Code example #8
# The listing starts mid-way through the parser setup; the imports and the positional "id"
# argument below are reconstructed from how they are used further down.
import argparse
import os
import statistics
import time

from agent.ppo import PPOAgent
from utilities.const import BASE_SAVE_PATH  # assumed location, consistent with the other examples

parser = argparse.ArgumentParser(description="Evaluate a saved agent.")
parser.add_argument("id",
                    type=int,
                    nargs="?",
                    help="id of the agent, defaults to newest",
                    default=None)
parser.add_argument("-n",
                    type=int,
                    help="number of evaluation episodes",
                    default=10)
args = parser.parse_args()

if args.id is None:
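    # No id given: default to the newest saved agent (highest numeric directory name).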
    ids = map(int, os.listdir(BASE_SAVE_PATH))
    args.id = max(ids)

start = time.time()
agent = PPOAgent.from_agent_state(args.id, "b")
print(f"Agent {args.id} successfully loaded.")

stats, _ = agent.evaluate(args.n)

average_reward = round(statistics.mean(stats.episode_rewards), 2)
average_length = round(statistics.mean(stats.episode_lengths), 2)
std_reward = round(statistics.stdev(stats.episode_rewards), 2)
std_length = round(statistics.stdev(stats.episode_lengths), 2)

print(
    f"Evaluated agent on {args.n} x {agent.env_name} and achieved an average reward of {average_reward} [std: {std_reward}; "
    f"between ({min(stats.episode_rewards)}, {max(stats.episode_rewards)})].\n"
    f"An episode on average took {average_length} steps [std: {std_length}; "
    f"between ({min(stats.episode_lengths)}, {max(stats.episode_lengths)})].\n"
    f"This took me {round(time.time() - start, 2)}s.")
Code example #9
#!/usr/bin/env python
"""Example script on loading agent and rendering episodes."""
import os

from agent.ppo import PPOAgent
from analysis.investigation import Investigator

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

os.chdir("../")

AGENT_ID = 1580042580

latest_agent = PPOAgent.from_agent_state(AGENT_ID)
persistent_env = latest_agent.env

# iterate over every save of the agent during training to see evolution of behaviour
for iteration in PPOAgent.get_saved_iterations(AGENT_ID):
    # load agent and wrap an investigator around it
    agent = PPOAgent.from_agent_state(AGENT_ID, from_iteration=iteration)
    inv = Investigator.from_agent(agent)

    # render a randomly initialized episode
    inv.render_episode(persistent_env)

    # just print a line to make output more readable
    print()
Code example #10
def run_experiment(environment, settings: dict, verbose=True, init_ray=True, use_monitor=False) -> PPOAgent:
    """Run an experiment with the given settings ."""

    # sanity checks and warnings for given parameters
    if settings["preload"] is not None and settings["load_from"] is not None:
        raise InconsistentArgumentError("You gave both a loading from a pretrained component and from another "
                                        "agent state. This cannot be resolved.")

    # setup environment and extract and report information
    env = gym.make(environment)
    state_dim, number_of_actions = env_extract_dims(env)
    env_action_space_type = "continuous" if isinstance(env.action_space, Box) else "discrete"
    env_observation_space_type = "continuous" if isinstance(env.observation_space, Box) else "discrete"
    env_name = env.unwrapped.spec.id

    if env.spec.max_episode_steps is not None and env.spec.max_episode_steps > settings["horizon"] \
            and not settings["eval"]:
        logging.warning("Careful! Your horizon is shorter than the environment's maximum episode length; "
                        "this will most likely skew stats heavily.")

    # choose and make policy distribution
    if settings["distribution"] is None:
        settings["distribution"] = "categorical" if env_action_space_type == "discrete" else "gaussian"

    distribution = get_distribution_by_short_name(settings["distribution"])(env)

    # setting appropriate model building function
    if "ShadowHand" in environment or settings["architecture"] == "shadow":
        if settings["model"] == "ffn":
            print("Cannot use ffn with shadow architecture. Defaulting to GRU.")
            settings["model"] = "gru"

        if env.visual_input:
            build_models = get_model_builder(model="shadow", model_type=settings["model"], shared=settings["shared"])
        else:
            build_models = build_blind_shadow_brain_v1
    else:
        build_models = get_model_builder(model=settings["architecture"], model_type=settings["model"],
                                         shared=settings["shared"])

    # make preprocessor
    preprocessor = CombiWrapper(
        [StateNormalizationWrapper(state_dim) if not settings["no_state_norming"] else SkipWrapper(),
         RewardNormalizationWrapper() if not settings["no_reward_norming"] else SkipWrapper()])

    # announce experiment
    bc, ec, wn = COLORS["HEADER"], COLORS["ENDC"], COLORS["WARNING"]
    if verbose:
        print(f"-----------------------------------------\n"
              f"{wn}Learning the Task{ec}: {bc}{env_name}{ec}\n"
              f"{bc}{state_dim}{ec}-dimensional states ({bc}{env_observation_space_type}{ec}) "
              f"and {bc}{number_of_actions}{ec} actions ({bc}{env_action_space_type}{ec}).\n"
              f"Config: {settings['config']}\n"
              f"Model: {build_models.__name__}\n"
              f"Distribution: {settings['distribution']}\n"
              f"-----------------------------------------\n")

        print(f"{wn}HyperParameters{ec}: {settings}\n")

    if settings["cpu"]:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if settings["load_from"] is not None:
        if verbose:
            print(f"{wn}Loading{ec} from state {settings['load_from']}")
        agent = PPOAgent.from_agent_state(settings["load_from"])
    else:
        # set up the agent and a reporting module
        agent = PPOAgent(build_models, env, horizon=settings["horizon"], workers=settings["workers"],
                         learning_rate=settings["lr_pi"], lr_schedule=settings["lr_schedule"],
                         discount=settings["discount"],
                         clip=settings["clip"], c_entropy=settings["c_entropy"], c_value=settings["c_value"],
                         lam=settings["lam"],
                         gradient_clipping=settings["grad_norm"], clip_values=settings["clip_values"],
                         tbptt_length=settings["tbptt"], distribution=distribution, preprocessor=preprocessor,
                         pretrained_components=None if settings["preload"] is None else [settings["preload"]],
                         debug=settings["debug"])

        print(f"{wn}Created agent{ec} with ID {bc}{agent.agent_id}{ec}")

    if tf.test.is_gpu_available():
        agent.set_gpu(not settings["cpu"])
    else:
        agent.set_gpu(False)

    monitor = None
    if use_monitor:
        monitor = Monitor(agent, env, frequency=settings["monitor_frequency"], gif_every=settings["gif_every"],
                          iterations=settings["iterations"], config_name=settings["config"])

    redis_auth = None if settings["redis_ip"] is None else [settings["redis_ip"], settings["redis_pw"]]
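    # Run the training loop ("drill") for the configured number of iterations, then persist the final agent state.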
    agent.drill(n=settings["iterations"], epochs=settings["epochs"], batch_size=settings["batch_size"], monitor=monitor,
                export=settings["export_file"], save_every=settings["save_every"], separate_eval=settings["eval"],
                stop_early=settings["stop_early"], parallel=not settings["sequential"], ray_is_initialized=not init_ray,
                radical_evaluation=settings["radical_evaluation"], redis_auth=redis_auth)

    agent.save_agent_state()
    env.close()

    return agent
Code example #11
from agent.ppo import PPOAgent
from test_environment import TestEnvironment
import sys
import os
import argparse

parser = argparse.ArgumentParser(description='test with ppo.')
parser.add_argument('--stock_name',
                    dest='stock_name',
                    required=True,
                    help='name of stock')
parser.add_argument('--window',
                    dest='window',
                    default=10,
                    help='number of candles in window')
parser.add_argument('--model_file',
                    dest='model_file',
                    required=True,
                    help='checkpoint file relative to models directory')
args = parser.parse_args()
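# Build a PPO agent on the test environment and evaluate the given checkpoint.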

agent = PPOAgent(TestEnvironment(args.window, args.stock_name, 1000),
                 args.model_file)
agent.test()
Code example #12
File: train.py Project: pcode93/rl_doom
def train():
    parser = argparse.ArgumentParser(
        description='Train an agent in the ViZDoom environment.')
    parser.add_argument('map_name', help='path to the map config')
    parser.add_argument('--output_path',
                        dest='output_path',
                        help='output path for agent checkpoints')
    parser.add_argument(
        '--save_interval',
        dest='save_interval',
        default=10,
        type=int,
        help='interval, measured in epochs, between each agent checkpoint')
    parser.add_argument('--cuda',
                        dest='cuda',
                        default=False,
                        action='store_true',
                        help='whether to use cuda')
    parser.add_argument('--log_interval',
                        dest='log_interval',
                        default=10,
                        type=int,
                        help='interval between each progress update log')
    parser.add_argument(
        '--score_buffer_size',
        dest='score_buffer_size',
        default=50,
        type=int,
        help='the number of most recent scores kept for computing statistics')

    parser.add_argument('--n_epochs',
                        dest='n_epochs',
                        default=1000,
                        type=int,
                        help='number of epochs')
    parser.add_argument('--epoch_len',
                        dest='epoch_len',
                        default=1024,
                        type=int,
                        help='the length of an epoch')
    parser.add_argument('--lr',
                        dest='lr',
                        default=2.5e-4,
                        type=float,
                        help='learning rate')
    parser.add_argument('--lr_decay',
                        dest='decay_lr',
                        default=False,
                        action='store_true',
                        help='whether to decay the learning rate each epoch')
    parser.add_argument('--gamma',
                        dest='gamma',
                        default=0.99,
                        type=float,
                        help='discount factor')
    parser.add_argument('--batch_size',
                        dest='batch_size',
                        default=32,
                        type=int,
                        help='batch size')
    parser.add_argument('--alg',
                        dest='alg',
                        default='ppo',
                        choices=['ppo', 'dqn', 'a2c'],
                        help='the algorithm the agent will use')
    parser.add_argument(
        '--nn',
        dest='nn',
        default='deepmind_cnn',
        choices=['deepmind_cnn', 'capsnet'],
        help='neural network that the agent will use as its feature network')

    parser.add_argument('--frame_skip',
                        dest='frame_skip',
                        default=4,
                        type=int,
                        help='number of frames to skip each action')
    parser.add_argument('--frames_per_state',
                        dest='frames_per_state',
                        default=4,
                        type=int,
                        help='number of frames to stack every state')
    parser.add_argument('--state_w',
                        dest='state_w',
                        default=108,
                        type=int,
                        help='target state width to resize each frame to')
    parser.add_argument('--state_h',
                        dest='state_h',
                        default=60,
                        type=int,
                        help='target state height to resize each frame to')
    parser.add_argument('--state_rgb',
                        dest='rgb',
                        default=False,
                        action='store_true',
                        help='whether to use rgb or gray frames')
    parser.add_argument(
        '--shape_rewards',
        dest='shape_rewards',
        default=False,
        action='store_true',
        help=
        'whether to use a reward shaping function specified for the selected map'
    )
    parser.add_argument(
        '--use_default_actions_for_map',
        dest='use_default_actions',
        default=False,
        action='store_true',
        help=
        'whether to use a default set of actions specified for the selected map'
    )

    parser.add_argument('--ppo_lambda',
                        dest='lam',
                        default=0.95,
                        type=float,
                        help='lambda value for GAE')
    parser.add_argument('--ppo_eps',
                        dest='eps',
                        default=0.1,
                        type=float,
                        help='clipping parameter for PPO')
    parser.add_argument(
        '--ppo_decay_params',
        dest='ppo_decay',
        default=False,
        action='store_true',
        help=
        'whether to decay PPO learning rate and epsilon each epoch linearly')
    parser.add_argument('--ppo_ent_coeff',
                        dest='ent_coeff',
                        default=0.01,
                        type=float,
                        help='entropy coefficient for PPO')
    parser.add_argument('--ppo_value_coeff',
                        dest='value_coeff',
                        default=1.0,
                        type=float,
                        help='value coefficient for PPO')
    parser.add_argument('--ppo_opt_epochs',
                        dest='opt_epochs',
                        default=4,
                        type=int,
                        help='number of optimization epochs for PPO')

    parser.add_argument('--dqn_use_ddqn',
                        dest='ddqn',
                        default=False,
                        action='store_true',
                        help='whether to use ddqn instead of dqn')
    parser.add_argument('--dqn_dueling',
                        dest='dueling',
                        default=False,
                        action='store_true',
                        help='whether to use a dueling architecture in dqn')
    parser.add_argument('--dqn_min_eps',
                        dest='min_eps',
                        default=0.01,
                        type=float,
                        help='minimum value of epsilon for dqn')
    parser.add_argument('--dqn_mem_size',
                        dest='memory_size',
                        default=100000,
                        type=int,
                        help='replay memory size for dqn')
    parser.add_argument('--dqn_init_size',
                        dest='init_size',
                        default=10000,
                        type=int,
                        help='number of timesteps before dqn starts learning')
    parser.add_argument('--dqn_q_update_interval',
                        dest='q_update_interval',
                        default=1,
                        type=int,
                        help='the interval between updates of the q function')
    parser.add_argument(
        '--dqn_target_update_interval',
        dest='target_update_interval',
        default=1000,
        type=int,
        help='the interval between updates of the target q function')

    args = parser.parse_args()

    game = initialize_vizdoom(args.map_name)

    if args.use_default_actions:
        actions = default_actions_for_map(game, args.map_name)
    else:
        actions = all_actions(game)

    reward_fn = default_reward_shaping(
        args.map_name) if args.shape_rewards else None

    in_channels = args.frames_per_state * (3 if args.rgb else 1)

    if args.nn == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif args.nn == 'capsnet':
        feature_net = CapsNet(in_channels)

    if args.alg == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)

        eps_sched = LinearSchedule("eps",
                                   args.eps,
                                   1,
                                   args.n_epochs,
                                   end_val=1.0 if not args.ppo_decay else 0.0)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.ppo_decay else 0.0))
        schedules = [lr_sched, eps_sched]

        agent = PPOAgent(policy,
                         optimizer,
                         eps_sched,
                         cuda=args.cuda,
                         n_timesteps=args.epoch_len,
                         batch_size=args.batch_size,
                         opt_epochs=args.opt_epochs,
                         gamma=args.gamma,
                         lam=args.lam,
                         entropy_coeff=args.ent_coeff,
                         value_coeff=args.value_coeff)
    elif args.alg == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)

        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched]

        agent = A2CAgent(policy, optimizer, args.cuda, args.gamma,
                         args.epoch_len)
    elif args.alg == 'dqn':
        q = QNetwork(feature_net, len(actions))
        tq = QNetwork(feature_net, len(actions))
        optimizer = torch.optim.Adam(q.parameters(), lr=args.lr)

        memory = ReplayMemory(args.memory_size)
        eps_sched = LinearSchedule("eps",
                                   1,
                                   1,
                                   args.n_epochs,
                                   end_val=args.min_eps)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr",
                           args.lr,
                           1,
                           args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched, eps_sched]

        agent = DQNAgent(q,
                         tq,
                         optimizer,
                         memory,
                         eps_sched,
                         cuda=args.cuda,
                         init_steps=args.init_size,
                         q_update_interval=args.q_update_interval,
                         target_update_interval=args.target_update_interval,
                         ddqn=args.ddqn,
                         gamma=args.gamma,
                         batch_size=args.batch_size)

    progress_monitor = ProgressMonitor(args.score_buffer_size,
                                       monitor_interval=args.log_interval)

    env_params = {
        "env": {
            "frame_skip": args.frame_skip,
            "frames_per_state": args.frames_per_state,
            "state_dim": (3 if args.rgb else 1, args.state_h, args.state_w),
            "actions": actions
        },
        "agent": {
            "alg": args.alg,
            "nn": args.nn
        },
        "save_path": args.output_path,
        "save_interval": args.save_interval,
        "progress_monitor": progress_monitor,
        "map_name": args.map_name
    }
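    # This metadata is what the test script (code example #5) later restores via CheckpointMonitor.load().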

    if args.output_path:
        checkpoint_monitor = CheckpointMonitor(env_params, agent)
        monitors = [checkpoint_monitor, progress_monitor]
    else:
        monitors = [progress_monitor]

    generator = TrajectoryGenerator(game,
                                    args.n_epochs,
                                    args.epoch_len,
                                    agent,
                                    shape_reward_fn=reward_fn,
                                    monitors=monitors,
                                    param_schedules=schedules,
                                    **env_params["env"])

    generator.run()