def load(self, config):
        if type(config) == type(""):
            self.logger.info("reading config file at {}".format(config))
            with open(config, 'r') as infile:
                import json
                self.dict = json.load(infile)
        elif type(config) == type({}):
            self.dict = config
        else:
            raise TypeError(
                "Wrong type for configuration, must be a path or a dict")

        self.id = self.dict["general"]["id"]
        self.workspace = Path(self.dict["general"]["workspace"])
        self.logger.info(self.dict["general"])

        # use the full module path to avoid shadowing the `config` argument
        import logging.config
        logging.config.dictConfig(self.dict["general"]["dictConfig"])

        self.seed = self.dict["general"]["seed"]
        if self.seed is not None:
            from ncarrara.utils.math_utils import set_seed
            set_seed(self.seed)

        import numpy as np
        np.set_printoptions(precision=2, suppress=True)

        self.writer = None
        self.is_tensorboardX = self.dict["general"]["is_tensorboardX"]

        return self
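# A minimal sketch of the configuration dict that load() above expects; the key
# names are taken from the reads in load(), the values are placeholder
# assumptions. load() also accepts a path to a JSON file with the same content.
EXAMPLE_CONFIG = {
    "general": {
        "id": "my_experiment",              # experiment identifier
        "workspace": "out/my_experiment",   # wrapped into a pathlib.Path
        "dictConfig": {"version": 1},       # forwarded to logging.config.dictConfig
        "seed": 0,                          # or None to skip seeding
        "is_tensorboardX": False,           # whether a tensorboardX writer is used
    }
}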
def execute_policy_from_config(generate_envs,
                               policy_config,
                               seed=None,
                               gamma_r=1.0,
                               gamma_c=1.0,
                               n_trajectories=10,
                               beta=1.,
                               epsilon_schedule=None,
                               save_path=None,
                               logging_config=None):
    """
        Generate an environment and a policy from configurations, and collect trajectories.
    :param generate_envs: environment config
    :param policy_config: policy config
    :param seed: to seed the environment before execution
    :param gamma_r: see execute_policy()
    :param gamma_c: see execute_policy()
    :param n_trajectories: see execute_policy()
    :param beta: see execute_policy()
    :param epsilon_schedule: see execute_policy()
    :param save_path: see execute_policy()
    :param logging_config: the logging config of the process
    :return: the collected trajectories
    """
    if logging_config:
        import logging.config
        logging.config.dictConfig(logging_config)

    envs, params = envs_factory.generate_envs(**generate_envs)
    env = envs[0]
    set_seed(seed, env)

    policy_config["env"] = env
    pi = policy_factory(policy_config)
    return execute_policy(env, pi, gamma_r, gamma_c, n_trajectories, beta, epsilon_schedule, save_path)
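# A sketch of a direct call (argument values are placeholder assumptions; the
# exact schemas of generate_envs and policy_config depend on
# envs_factory.generate_envs and policy_factory, so the call is left commented):
#
#     trajectories, extra = execute_policy_from_config(
#         generate_envs={...},                              # env factory config
#         policy_config={"__class__": repr(RandomPolicy)},  # as in the configs below
#         seed=0,
#         gamma_r=1.0, gamma_c=1.0,
#         n_trajectories=10,
#         beta=1.0,
#         epsilon_schedule=None,
#         save_path=None,
#         logging_config={"version": 1})                    # standard dictConfig payload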
def main(source_envs, feature_dqn_info, net_params, dqn_params,
         N, seed, device, workspace, decay, start_decay, traj_max_size, gamma, writer=None):
    envs, params = generate_envs(**source_envs)

    for ienv, env in enumerate(envs):
        logger.info("generating samples for env {}".format(ienv))
        set_seed(seed=seed, env=env)
        feature_dqn = build_feature_dqn(feature_dqn_info)
        _, _, memory, dqn = run_dqn(
            env,
            id="generate_sources_env_{}".format(ienv),
            workspace=workspace / "dqn_workspace",
            seed=seed,
            feature_dqn=feature_dqn,
            device=device,
            net_params=net_params,
            dqn_params=dqn_params,
            N=N,
            decay=decay,
            start_decay=start_decay,
            traj_max_size=traj_max_size,
            gamma=gamma,
            writer=writer)

        memory.save(workspace / "samples" / "{}.json".format(ienv), as_json=False)
        dqn.save(workspace / "dqn" / "{}.pt".format(ienv))
    with open(workspace / 'params.json', 'w') as file:
        dump = json.dumps(params, indent=4)
        print(dump)
        file.write(dump)
    return env.action_space.n
def main(load_memory, generate_envs, feature_str, gamma, ftq_params, ftq_net_params, device, normalize_reward,
         workspace, seed,
         lambda_=0., **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)

    feature = feature_factory(feature_str)

    ftq = PytorchFittedQ(
        device=device,
        policy_network=NetFTQ(n_in=len(feature(e.reset(), e)), n_out=e.action_space.n, **ftq_net_params),
        action_str=None if not hasattr(e, "action_str") else e.action_str,
        test_policy=None,
        gamma=gamma,
        **ftq_params
    )

    rm = Memory()
    rm.load_memory(**load_memory)

    transitions_ftq, _ = urpy.datas_to_transitions(rm.memory, e, feature, lambda_, normalize_reward)
    logger.info("[learning ftq with full batch] #samples={} ".format(len(transitions_ftq)))
    ftq.reset(True)
    ftq.workspace = workspace
    makedirs(ftq.workspace)
    ftq.fit(transitions_ftq)
    ftq.save_policy()
def main(load_memory, generate_envs, feature_str, gamma, gamma_c, bftq_params,
         bftq_net_params, workspace, seed, device, normalize_reward, general,
         **args):
    logger = logging.getLogger(__name__)

    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    e.reset()
    set_seed(seed, e)
    feature = feature_factory(feature_str)

    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(e),
        policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                               n_actions=e.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        split_batches=general["gpu"]["split_batches"],
        cpu_processes=general["cpu"]["processes"],
        env=e,
        **bftq_params)

    makedirs(workspace)
    rm = Memory()
    rm.load_memory(**load_memory)

    _, transitions_bftq = urpy.datas_to_transitions(rm.memory, e, feature, 0,
                                                    normalize_reward)
    logger.info("[learning bftq with full batch] #samples={} ".format(
        len(transitions_bftq)))

    bftq.reset(True)
    _ = bftq.fit(transitions_bftq)

    bftq.save_policy()
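# A sketch of the "general" configuration subsections read by the mains in this
# listing (keys collected from the code above and below; values are placeholder
# assumptions):
EXAMPLE_GENERAL = {
    "cpu": {"processes": 4, "processes_when_linked_with_gpu": 2},
    "gpu": {"split_batches": 1},        # how GPU batches are split (placeholder value)
    "hull_options": {},                 # forwarded to the BFTQ hull computation
    "dictConfig": {"version": 1},       # logging config passed to worker processes
}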
def main(generate_envs, feature_str, betas_for_exploration, gamma, gamma_c,
         bftq_params, bftq_net_params, N_trajs, workspace, seed, device,
         normalize_reward, trajs_by_ftq_batch, epsilon_decay, general, **args):
    # Prepare BFTQ
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_bftq():
        bftq = PytorchBudgetedFittedQ(
            device=device,
            workspace=workspace / "batch=0",
            actions_str=get_actions_str(e),
            policy_network=NetBFTQ(size_state=len(feature(e.reset(), e)),
                                   n_actions=e.action_space.n,
                                   **bftq_net_params),
            gamma=gamma,
            gamma_c=gamma_c,
            cpu_processes=general["cpu"]["processes"],
            env=e,
            split_batches=general["gpu"]["split_batches"],
            hull_options=general["hull_options"],
            **bftq_params)
        return bftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay,
                                      N=N_trajs,
                                      savepath=workspace)
    betas_for_exploration = np.array(eval(betas_for_exploration))
    memory_by_batch = [get_current_memory()]
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {
            "__class__": repr(RandomBudgetedPolicy)
        },
        "pi_random": {
            "__class__": repr(RandomBudgetedPolicy)
        },
        "epsilon": decays[0],
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(
            general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [
            np.arange(*times) for times in zip(
                np.insert(workers_start[:-1], 0, 0), workers_start)
        ]
        if betas_for_exploration.size:
            workers_betas = [
                betas_for_exploration.take(indexes, mode='wrap')
                for indexes in workers_traj_indexes
            ]
        else:
            workers_betas = [
                np.random.random(indexes.size)
                for indexes in workers_traj_indexes
            ]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [
            decays[i_traj + indexes] for indexes in workers_traj_indexes
        ]
        workers_params = list(
            zip_with_singletons(generate_envs, pi_epsilon_greedy_config,
                                workers_seeds, gamma, gamma_c,
                                workers_n_trajectories, workers_betas,
                                workers_epsilons, None, general["dictConfig"]))

        # Collect trajectories
        logger.info(
            "Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = []
            for params in workers_params:
                results.append(execute_policy_from_config(*params))
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config,
                                       workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)

        transitions_ftq, transition_bftq = datas_to_transitions(
            rm.memory, e, feature, 0, normalize_reward)

        # Fit model
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        logger.info(
            "[BATCH={}][learning bftq pi greedy] #samples={} #traj={}".format(
                batch, len(transition_bftq), i_traj))
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        bftq = build_fresh_bftq()
        bftq.reset(True)
        bftq.workspace = workspace / "batch={}".format(batch)
        makedirs(bftq.workspace)
        if isinstance(e, EnvGridWorld):
            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)

            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((bftq.workspace / "bftq_on_2dworld_sources").as_posix())
        q = bftq.fit(transition_bftq)

        # Save policy
        network_path = bftq.save_policy()
        os.system("cp {}/policy.pt {}/policy.pt".format(
            bftq.workspace, workspace))

        # Save memory
        save_memory(bftq, memory_by_batch, by_batch=False)

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchBudgetedFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "betas_for_discretisation": bftq.betas_for_discretisation,
            "device": bftq.device,
            "hull_options": general["hull_options"],
            "clamp_Qc": bftq_params["clamp_Qc"]
        }

        if isinstance(e, EnvGridWorld):

            def pi(state, beta):
                import torch
                from ncarrara.budgeted_rl.bftq.pytorch_budgeted_fittedq import convex_hull, \
                    optimal_pia_pib
                with torch.no_grad():
                    hull = convex_hull(s=torch.tensor([state],
                                                      device=device,
                                                      dtype=torch.float32),
                                       Q=q,
                                       action_mask=np.zeros(e.action_space.n),
                                       id="run_" + str(state),
                                       disp=False,
                                       betas=bftq.betas_for_discretisation,
                                       device=device,
                                       hull_options=general["hull_options"],
                                       clamp_Qc=bftq_params["clamp_Qc"])
                    opt, _ = optimal_pia_pib(beta=beta,
                                             hull=hull,
                                             statistic={})
                return opt

            def qr(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[a]

            def qc(state, a, beta):
                import torch
                s = torch.tensor([[state]], device=device)
                b = torch.tensor([[[beta]]], device=device)
                sb = torch.cat((s, b), dim=2)
                return q(sb).squeeze()[e.action_space.n + a]

            w = World(e, bftq.betas_for_discretisation)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_policy_bftq(pi, qr, qc, bftq.betas_for_discretisation)
            w.save((bftq.workspace / "bftq_on_2dworld").as_posix())

    save_memory(bftq, memory_by_batch, by_batch=True)
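# A standalone sketch (toy numbers, hypothetical helper name) of the index
# bookkeeping used in the main loop above: per-worker trajectory counts from
# near_split are turned into contiguous trajectory index ranges via cumsum/arange.
import numpy as np

def _worker_traj_indexes(workers_n_trajectories):
    workers_start = np.cumsum(workers_n_trajectories)
    return [np.arange(*bounds)
            for bounds in zip(np.insert(workers_start[:-1], 0, 0), workers_start)]

# _worker_traj_indexes([3, 3, 2]) -> [array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]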
def main(betas_test, policy_path, generate_envs, feature_str, device,
         workspace, gamma, gamma_c, bftq_params, seed, N_trajs, path_results,
         general, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path

    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation":
        eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"]
    }
    mock_env = envs_factory.generate_envs(**generate_envs)[0][0]
    makedirs(workspace / "trajs")

    makedirs(path_results)
    set_seed(seed)
    try:
        for beta in eval(betas_test):
            # Prepare workers
            cpu_processes = min(
                general["cpu"]["processes_when_linked_with_gpu"]
                or os.cpu_count(), N_trajs)
            workers_n_trajectories = near_split(N_trajs, cpu_processes)
            workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
            workers_params = list(
                zip_with_singletons(
                    generate_envs, pi_config, workers_seeds, gamma, gamma_c,
                    workers_n_trajectories, beta, None,
                    "{}/beta={}.results".format(path_results,
                                                beta), general["dictConfig"]))
            logger.info("Collecting trajectories with {} workers...".format(
                cpu_processes))
            with Pool(cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config,
                                       workers_params)
                rez = np.concatenate([result for _, result in results], axis=0)

                trajs = []
                for t, _ in results:
                    trajs += t
            print("BFTQ({:.2f}) : {}".format(beta, format_results(rez)))

            if isinstance(mock_env, EnvGridWorld):
                from ncarrara.utils_rl.environments.gridworld.world import World
                w = World(mock_env)
                w.draw_frame()
                w.draw_lattice()
                w.draw_cases()
                w.draw_test_trajectories(trajs)
                pp = (workspace / "trajs" / "trajs_beta").as_posix()
                w.save(pp + "={:.2f}".format(beta))
        if isinstance(mock_env, EnvGridWorld):
            os.system("convert -delay 10 -loop 0 " + workspace.as_posix() +
                      "/trajs/" + "*.png " + workspace.as_posix() + "/out.gif")

    except FileNotFoundError as e:
        logger.warning("Could not load policy: {}".format(e))
def main(generate_envs,
         feature_str,
         gamma,
         gamma_c,
         ftq_params,
         ftq_net_params,
         device,
         epsilon_decay,
         N_trajs,
         trajs_by_ftq_batch,
         normalize_reward,
         workspace,
         seed,
         save_memory,
         general,
         lambda_=0,
         **args):
    envs, params = envs_factory.generate_envs(**generate_envs)
    e = envs[0]
    set_seed(seed, e)
    rm = Memory()
    feature = feature_factory(feature_str)

    def build_fresh_ftq():
        ftq = PytorchFittedQ(
            device=device,
            policy_network=NetFTQ(n_in=len(feature(e.reset(), e)),
                                  n_out=e.action_space.n,
                                  **ftq_net_params),
            action_str=None if not hasattr(e, "action_str") else e.action_str,
            test_policy=None,
            gamma=gamma,
            **ftq_params)
        return ftq

    # Prepare learning
    i_traj = 0
    decays = math_utils.epsilon_decay(**epsilon_decay,
                                      N=N_trajs,
                                      savepath=workspace)
    batch_sizes = near_split(N_trajs, size_bins=trajs_by_ftq_batch)
    pi_epsilon_greedy_config = {
        "__class__": repr(EpsilonGreedyPolicy),
        "pi_greedy": {
            "__class__": repr(RandomPolicy)
        },
        "pi_random": {
            "__class__": repr(RandomPolicy)
        },
        "epsilon": decays[0]
    }

    # Main loop
    trajs = []
    for batch, batch_size in enumerate(batch_sizes):
        # Prepare workers
        cpu_processes = min(
            general["cpu"]["processes_when_linked_with_gpu"] or os.cpu_count(),
            batch_size)
        workers_n_trajectories = near_split(batch_size, cpu_processes)
        workers_start = np.cumsum(workers_n_trajectories)
        workers_traj_indexes = [
            np.arange(*times) for times in zip(
                np.insert(workers_start[:-1], 0, 0), workers_start)
        ]
        workers_seeds = np.random.randint(0, 10000, cpu_processes).tolist()
        workers_epsilons = [
            decays[i_traj + indexes] for indexes in workers_traj_indexes
        ]
        workers_params = list(
            zip_with_singletons(generate_envs, pi_epsilon_greedy_config,
                                workers_seeds, gamma, gamma_c,
                                workers_n_trajectories, None, workers_epsilons,
                                None, general["dictConfig"]))

        # Collect trajectories
        logger.info(
            "Collecting trajectories with {} workers...".format(cpu_processes))
        if cpu_processes == 1:
            results = [execute_policy_from_config(*workers_params[0])]
        else:
            with Pool(processes=cpu_processes) as pool:
                results = pool.starmap(execute_policy_from_config,
                                       workers_params)
        i_traj += sum([len(trajectories) for trajectories, _ in results])

        # Fill memory
        for trajectories, _ in results:
            for trajectory in trajectories:
                for sample in trajectory:
                    rm.push(*sample)
        transitions_ftq, _ = datas_to_transitions(rm.memory, e, feature,
                                                  lambda_, normalize_reward)

        # Fit model
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        logger.info(
            "[BATCH={}][learning ftq pi greedy] #samples={} #traj={}".format(
                batch, len(transitions_ftq), i_traj))
        logger.info(
            "[BATCH={}]---------------------------------------".format(batch))
        ftq = build_fresh_ftq()
        ftq.reset(True)
        ftq.workspace = workspace / "batch={}".format(batch)
        makedirs(ftq.workspace)

        if isinstance(e, EnvGridWorld):

            for trajectories, _ in results:
                for traj in trajectories:
                    trajs.append(traj)

            w = World(e)
            w.draw_frame()
            w.draw_lattice()
            w.draw_cases()
            w.draw_source_trajectories(trajs)
            w.save((ftq.workspace / "bftq_on_2dworld_sources").as_posix())

        ftq.fit(transitions_ftq)

        # Save policy
        network_path = ftq.save_policy()
        os.system("cp {}/policy.pt {}/final_policy.pt".format(
            ftq.workspace, workspace))

        # Update greedy policy
        pi_epsilon_greedy_config["pi_greedy"] = {
            "__class__": repr(PytorchFittedPolicy),
            "feature_str": feature_str,
            "network_path": network_path,
            "device": ftq.device
        }
    if save_memory is not None:
        rm.save(workspace / save_memory["path"], save_memory["as_json"])
def run_dqn(env, id, workspace, device, net_params, dqn_params, decay, N, seed, feature_dqn, start_decay, gamma=None,
            transfer_module=None, evaluate_greedy_policy=True, traj_max_size=None, writer=None):
    size_state = len(feature_dqn(env.reset()))

    net = NetDQN(n_in=size_state, n_out=env.action_space.n, **net_params)
    dqn = TDQN(
        id=id,
        policy_network=net,
        device=device,
        transfer_module=transfer_module,
        workspace=workspace,
        writer=writer,
        feature=feature_dqn,
        info={"env":env},
        **dqn_params)
    dqn.reset()
    set_seed(seed=seed, env=env)
    rrr = []
    rrr_greedy = []
    epsilons = epsilon_decay(start=start_decay, decay=decay, N=N, savepath=workspace)
    nb_samples = 0
    memory = Memory()
    for n in range(N):
        # print("-------------------------- "+str(n)+ "----------------------")
        s = env.reset()
        done = False
        rr = 0
        it = 0
        while not done:

            if random.random() < epsilons[n]:
                if hasattr(env, "action_space_executable"):
                    a = np.random.choice(env.action_space_executable())
                else:
                    a = env.action_space.sample()
            else:
                if hasattr(env, "action_space_executable"):
                    executable_actions = env.action_space_executable()
                    action_mask = np.ones(env.action_space.n)
                    for a_exec in executable_actions:
                        action_mask[a_exec] = 0.
                    a = dqn.pi(s, action_mask)
                else:
                    a = dqn.pi(s, np.zeros(env.action_space.n))

            s_, r_, done, info = env.step(a)
            done = done or (traj_max_size is not None and it >= traj_max_size - 1)
            rr += r_ * (gamma ** it)
            t_dqn = (s, a, r_, s_, done, info)
            memory.push(s, a, r_, s_, done, info)
            dqn.update(*t_dqn)
            s = s_
            nb_samples += 1
            it += 1
        # if writer is not None:
        #     writer.add_scalar('{} return/episode', rr, n)
        rrr.append(rr)

        if evaluate_greedy_policy:
            s = env.reset()
            done = False
            rr_greedy = 0
            it = 0
            while not done:
                if hasattr(env, "action_space_executable"):
                    executable_actions = env.action_space_executable()
                    action_mask = np.ones(env.action_space.n)
                    for a_exec in executable_actions:
                        action_mask[a_exec] = 0.
                    a = dqn.pi(s, action_mask)
                else:
                    a = dqn.pi(s, np.zeros(env.action_space.n))
                    # print(env.action_space_str[a])

                s_, r_, done, info = env.step(a)
                done = done or (traj_max_size is not None and it >= traj_max_size - 1)
                rr_greedy += r_ * (gamma ** it)
                s = s_
                it += 1
            rrr_greedy.append(rr_greedy)
            if writer is not None:
                writer.add_scalar('{}_return_greedy/episode'.format(id), rr_greedy, n)
            # print("eps={} greedy={}".format(rr,rr_greedy))
    import matplotlib.pyplot as plt
    for param_stat in ["weights_over_time", "biais_over_time",
                       "ae_errors_over_time", "p_over_time",
                       "best_fit_over_time", "error_bootstrap_source", "error_bootstrap_partial"]:
        if hasattr(dqn, param_stat):
            var = getattr(dqn, param_stat)
            plt.plot(range(len(var)), var)
            plt.title(param_stat)
            plt.savefig(workspace / param_stat)
            plt.close()

    return rrr, rrr_greedy, memory, dqn
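# A minimal sketch (hypothetical helper, not part of the original module) of the
# epsilon-greedy selection with optional action masking used inside run_dqn:
# with probability epsilon a random (executable) action is drawn, otherwise the
# DQN picks greedily under a mask where 1. marks non-executable actions.
import random
import numpy as np

def select_action_epsilon_greedy(dqn, env, state, epsilon):
    if random.random() < epsilon:
        if hasattr(env, "action_space_executable"):
            return np.random.choice(env.action_space_executable())
        return env.action_space.sample()
    if hasattr(env, "action_space_executable"):
        action_mask = np.ones(env.action_space.n)
        for allowed in env.action_space_executable():
            action_mask[allowed] = 0.
        return dqn.pi(state, action_mask)
    return dqn.pi(state, np.zeros(env.action_space.n))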
def main(policy_path, generate_envs, feature_str, device, workspace, bftq_params, seed, general,
         betas_test, N_trajs, gamma, gamma_c, bftq_net_params, **args):
    if not os.path.isabs(policy_path):
        policy_path = workspace / policy_path

    env = envs_factory.generate_envs(**generate_envs)[0][0]
    feature = feature_factory(feature_str)

    bftq = PytorchBudgetedFittedQ(
        device=device,
        workspace=workspace,
        actions_str=get_actions_str(env),
        policy_network=NetBFTQ(size_state=len(feature(env.reset(), env)), n_actions=env.action_space.n,
                               **bftq_net_params),
        gamma=gamma,
        gamma_c=gamma_c,
        cpu_processes=general["cpu"]["processes"],
        env=env,
        hull_options=general["hull_options"],
        **bftq_params)
    bftq.reset(True)

    pi_config = {
        "__class__": repr(PytorchBudgetedFittedPolicy),
        "feature_str": feature_str,
        "network_path": policy_path,
        "betas_for_discretisation": eval(bftq_params["betas_for_discretisation"]),
        "device": device,
        "hull_options": general["hull_options"],
        "clamp_Qc": bftq_params["clamp_Qc"],
        "env": env
    }
    pi = policy_factory(pi_config)

    # Iterate over betas
    for beta in eval(betas_test):
        logger.info("Rendering with beta={}".format(beta))
        set_seed(seed, env)
        for traj in range(N_trajs):
            done = False
            pi.reset()
            info_env = {}
            info_pi = {"beta": beta}
            t = 0

            # Make a workspace for trajectories
            traj_workspace = workspace / "trajs" / "beta={}".format(beta) / "traj={}".format(traj)
            makedirs(traj_workspace)
            bftq.workspace = traj_workspace
            monitor = MonitorV2(env, traj_workspace, add_subdirectory=False)
            obs = monitor.reset()

            # Run trajectory
            while not done:
                action_mask = get_action_mask(env)
                info_pi = merge_two_dicts(info_pi, info_env)
                bftq.draw_Qr_and_Qc(obs, pi.network, "render_t={}".format(t), show=False)
                a, _, info_pi = pi.execute(obs, action_mask, info_pi)
                render(env, workspace, t, a)
                obs, _, done, info_env = monitor.step(a)
                t += 1
            monitor.close()