Example #1
def _thunk():
    # Env factory closure: env_id, seed, and rank come from the enclosing scope.
    env = make_env.make_env(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)
    return env
Example #2
def _thunk():
    env = make_env.make_env(env_id, max_episode_len=max_episode_len)
    env.discrete_action_input = True
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    gym.logger.setLevel(logging.WARN)
    return env
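Examples 1 and 2 return a factory (thunk) rather than a finished environment so that each worker process can build its own instance. A minimal sketch of how such thunks are typically consumed, assuming a baselines-style SubprocVecEnv is available; the import path and helper name below are illustrative assumptions, not code from this repo:

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_parallel_envs(env_id, num_envs, seed):
    def make_thunk(rank):
        def _thunk():
            env = gym.make(env_id)   # the examples above use make_env.make_env instead
            env.seed(seed + rank)    # distinct seed per worker, as in Examples 1 and 2
            return env
        return _thunk
    # Each thunk is sent to its own worker process, which calls it to build the env.
    return SubprocVecEnv([make_thunk(rank) for rank in range(num_envs)])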
Example #3
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.95, lam=0.92, log_interval=1, nprocs=32, nsteps=20,
          nstack=1, ent_coef=0.00, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=100, lrschedule='linear', identical=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma, lam=lam)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True) for q_runner in model.q_runner]
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [explained_variance(values[k], rewards[k]) for k in range(model.num_agents)]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)

            for k in range(model.num_agents):
                # logger.record_tabular('reward %d' % k, np.mean(rewards[k]))
                logger.record_tabular("explained_variance %d" % k, float(ev[k]))
                logger.record_tabular("policy_entropy %d" % k, float(policy_entropy[k]))
                logger.record_tabular("policy_loss %d" % k, float(policy_loss[k]))
                logger.record_tabular("value_loss %d" % k, float(value_loss[k]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
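The loop above logs explained_variance for each agent's value head. For reference, the usual baselines-style definition of this metric is 1 - Var[returns - values] / Var[returns]; a minimal sketch, assuming 1-D arrays per agent:

import numpy as np

def explained_variance(ypred, y):
    # 1 = perfect value predictions, 0 = no better than a constant, < 0 = worse.
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary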
Example #4
def train(logdir,
          env_id,
          lr,
          num_timesteps,
          seed,
          timesteps_per_batch,
          cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench

    from gym.envs.registration import register
    import multiagent
    import make_env

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        pi = mlp_policy.MlpPolicy(name=name,
                                  ob_space=ob_space,
                                  ac_space=ac_space,
                                  hid_size=64,
                                  num_hid_layers=2,
                                  id=id)
        return pi

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=lr,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        cont=cont)
    env.close()
    return None
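A function like train above is usually driven from a small command-line entry point. A hypothetical launcher sketch; the flag names and defaults are illustrative and not taken from the repo:

import argparse

if __name__ == '__main__':
    # Assumes train() from Example 4 is defined or imported in this module.
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', default='/tmp/ppo_logs')
    parser.add_argument('--env', default='simple_spread')
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--timesteps-per-batch', type=int, default=2048)
    args = parser.parse_args()
    train(args.logdir, args.env, args.lr, args.num_timesteps, args.seed,
          args.timesteps_per_batch)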
Example #5
def learn(policy,
          env,
          expert,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          lam=0.95,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          nstack=1,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.05,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=100,
          lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=1024,  # hard-coded; the nsteps argument above is not used here
                               nstack=nstack,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    for _ in range(10000):
        e_obs, e_actions, _, _ = expert.get_next_batch(1024)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        print(lld_loss)
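The cloning loop above expects an expert dataset whose get_next_batch returns per-agent lists of observation and one-hot action arrays, plus trailing values it ignores. A toy stand-in with that interface, for illustration only (names and shapes are assumptions, not the repo's dataset class):

import numpy as np

class ExpertBuffer:
    def __init__(self, obs, actions):
        # obs[k]: (N, ob_dim_k) array; actions[k]: (N, n_actions_k) one-hot array
        self.obs, self.actions = obs, actions
        self.n = len(obs[0])

    def get_next_batch(self, batch_size):
        idx = np.random.randint(0, self.n, size=batch_size)
        e_obs = [o[idx] for o in self.obs]
        e_actions = [a[idx] for a in self.actions]
        # Examples 5 and 10 unpack two extra trailing values that they do not use.
        return e_obs, e_actions, None, None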
Example #6
def learn(*,
          policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=20,
          expert=None,
          clone_iters=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    try:
        ob_space = env.observation_space
        ac_space = env.action_space
        num_agents = len(ob_space)
    except:  # env exposes a single Tuple space rather than per-agent lists of spaces
        ob_space = env.observation_space.spaces
        ac_space = env.action_space.spaces
        num_agents = len(ob_space)
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_spaces=ob_space,
                               ac_spaces=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    if expert:
        if clone_iters:
            for i in range(clone_iters):
                e_obs, e_actions, _, _ = expert.get_next_batch(nbatch //
                                                               nminibatches)
                lld = model.clone(lr(1.0), e_obs, e_actions)
                if i % 100 == 0:
                    print([np.mean(l) for l in lld])

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = \
            runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        # if states is None: # nonrecurrent version
        # advs = [returns[k] - values[k] for k in range(num_agents)]
        # advs = [(advs[k] - advs[k].mean()) / (advs[k].std() + 1e-8) for k in range(num_agents)]
        inds = np.arange(nbatch)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            for start in range(0, nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = ([a[mbinds] for a in arr]
                          for arr in (obs, returns, masks, actions, values,
                                      neglogpacs))
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        # else: # recurrent version
        #     assert nenvs % nminibatches == 0
        #     envsperbatch = nenvs // nminibatches
        #     envinds = np.arange(nenvs)
        #     flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        #     envsperbatch = nbatch_train // nsteps
        #     for _ in range(noptepochs):
        #         np.random.shuffle(envinds)
        #         for start in range(0, nenvs, envsperbatch):
        #             end = start + envsperbatch
        #             mbenvinds = envinds[start:end]
        #             mbflatinds = flatinds[mbenvinds].ravel()
        #             slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
        #             mbstates = states[mbenvinds]
        #             mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = [
                explained_variance(values[k], returns[k])
                for k in range(num_agents)
            ]
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            for k in range(num_agents):
                logger.logkv("explained_variance_{}".format(k), float(ev[k]))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                for k in range(num_agents):
                    logger.logkv(lossname + '{}'.format(k), lossval[k])
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            env.save(savepath + '.ob_rms')
    env.close()
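Example 6 relies on two small helpers that the snippet does not show. Their usual baselines-style definitions are reproduced below for reference; a sketch assuming numpy is imported as np:

import numpy as np

def constfn(val):
    # Wrap a constant so it can be called like a schedule: lr(frac) -> val.
    def f(_):
        return val
    return f

def safemean(xs):
    # Mean that tolerates an empty episode-info buffer early in training.
    return np.nan if len(xs) == 0 else np.mean(xs)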
Example #7
def _make_env():
    env = gym.make(env_id)
    env = MAWrapper(env)
    env = bench.Monitor(env, logger.get_dir())
    return env
Example #8
def _make_env():
    env = make_env(env_id)  # gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    return env
Example #9
def learn(policy,
          expert,
          env,
          env_id,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          lam=0.95,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          nstack=1,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=100,
          lrschedule='linear',
          dis_lr=0.001,
          disc_type='decentralized',
          bc_iters=500,
          identical=None,
          l2=0.1,
          d_iters=1,
          rew_scale=0.1):
    tf.reset_default_graph()
    set_global_seeds(seed)
    buffer = None

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_agents = len(ob_space)
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               nstack=nstack,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule,
                               identical=identical)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if disc_type == 'decentralized' or disc_type == 'decentralized-all':
        discriminator = [
            Discriminator(
                model.sess,
                ob_space,
                ac_space,
                state_only=True,
                discount=gamma,
                nstack=nstack,
                index=k,
                disc_type=disc_type,
                scope="Discriminator_%d" % k,  # gp_coef=gp_coef,
                total_steps=total_timesteps // (nprocs * nsteps),
                lr_rate=dis_lr,
                l2_loss_ratio=l2) for k in range(num_agents)
        ]
    else:
        assert False

    # add reward regularization
    if env_id == 'simple_tag':
        reward_reg_loss = tf.reduce_mean(
            tf.square(discriminator[0].reward + discriminator[3].reward) +
            tf.square(discriminator[1].reward + discriminator[3].reward) +
            tf.square(discriminator[2].reward +
                      discriminator[3].reward)) + rew_scale * tf.reduce_mean(
                          tf.maximum(0.0, 1 - discriminator[0].reward) +
                          tf.maximum(0.0, 1 - discriminator[1].reward) +
                          tf.maximum(0.0, 1 - discriminator[2].reward) +
                          tf.maximum(0.0, discriminator[3].reward + 1))
        reward_reg_lr = tf.placeholder(tf.float32, ())
        reward_reg_optim = tf.train.AdamOptimizer(learning_rate=reward_reg_lr)
        reward_reg_train_op = reward_reg_optim.minimize(reward_reg_loss)

    tf.global_variables_initializer().run(session=model.sess)
    runner = Runner(env,
                    model,
                    discriminator,
                    nsteps=nsteps,
                    nstack=nstack,
                    gamma=gamma,
                    lam=lam,
                    disc_type=disc_type,
                    nobs_flag=True)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True) for q_runner in model.q_runner]
    for _ in range(bc_iters):
        e_obs, e_actions, e_nobs, _, _ = expert.get_next_batch(nenvs * nsteps)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        lld_loss = model.clone(e_obs, e_a)
        # print(lld_loss)

    update_policy_until = 10

    for update in range(1, total_timesteps // nbatch + 1):
        obs, obs_next, states, rewards, report_rewards, masks, actions, values, all_obs, all_nobs,\
        mh_actions, mh_all_actions, mh_rewards, mh_true_rewards, mh_true_returns = runner.run()

        total_loss = np.zeros((num_agents, d_iters))

        idx = 0
        idxs = np.arange(len(all_obs))
        random.shuffle(idxs)
        all_obs = all_obs[idxs]
        mh_actions = [mh_actions[k][idxs] for k in range(num_agents)]
        mh_obs = [obs[k][idxs] for k in range(num_agents)]
        mh_obs_next = [obs_next[k][idxs] for k in range(num_agents)]
        mh_values = [values[k][idxs] for k in range(num_agents)]

        if buffer:
            buffer.update(mh_obs, mh_actions, mh_obs_next, all_obs, mh_values)
        else:
            buffer = Dset(mh_obs,
                          mh_actions,
                          mh_obs_next,
                          all_obs,
                          mh_values,
                          randomize=True,
                          num_agents=num_agents,
                          nobs_flag=True)

        d_minibatch = nenvs * nsteps

        for d_iter in range(d_iters):
            e_obs, e_actions, e_nobs, e_all_obs, _ = expert.get_next_batch(
                d_minibatch)
            g_obs, g_actions, g_nobs, g_all_obs, _ = buffer.get_next_batch(
                batch_size=d_minibatch)

            e_a = [
                np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))
            ]
            g_a = [
                np.argmax(g_actions[k], axis=1) for k in range(len(g_actions))
            ]

            g_log_prob = model.get_log_action_prob(g_obs, g_a)
            e_log_prob = model.get_log_action_prob(e_obs, e_a)
            if disc_type == 'decentralized':
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs[k], g_actions[k], g_nobs[k],
                        g_log_prob[k].reshape([-1, 1]), e_obs[k], e_actions[k],
                        e_nobs[k], e_log_prob[k].reshape([-1, 1]))
            elif disc_type == 'decentralized-all':
                g_obs_all = np.concatenate(g_obs, axis=1)
                g_actions_all = np.concatenate(g_actions, axis=1)
                g_nobs_all = np.concatenate(g_nobs, axis=1)
                e_obs_all = np.concatenate(e_obs, axis=1)
                e_actions_all = np.concatenate(e_actions, axis=1)
                e_nobs_all = np.concatenate(e_nobs, axis=1)
                for k in range(num_agents):
                    total_loss[k, d_iter] = discriminator[k].train(
                        g_obs_all, g_actions_all, g_nobs_all,
                        g_log_prob[k].reshape([-1, 1]),
                        e_obs_all, e_actions_all,
                        e_nobs_all, e_log_prob[k].reshape([-1, 1]))
            else:
                assert False

            if env_id == 'simple_tag':
                if disc_type == 'decentralized':
                    feed_dict = {
                        discriminator[k].obs:
                        np.concatenate([g_obs[k], e_obs[k]], axis=0)
                        for k in range(num_agents)
                    }
                elif disc_type == 'decentralized-all':
                    feed_dict = {
                        discriminator[k].obs:
                        np.concatenate([g_obs_all, e_obs_all], axis=0)
                        for k in range(num_agents)
                    }
                else:
                    assert False
                feed_dict[reward_reg_lr] = discriminator[0].lr.value()
                model.sess.run(reward_reg_train_op, feed_dict=feed_dict)

            idx += 1

        if update > update_policy_until:  # 10
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = [
                explained_variance(values[k], rewards[k])
                for k in range(model.num_agents)
            ]
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)

            for k in range(model.num_agents):
                logger.record_tabular("explained_variance %d" % k,
                                      float(ev[k]))
                if update > update_policy_until:
                    logger.record_tabular("policy_entropy %d" % k,
                                          float(policy_entropy[k]))
                    logger.record_tabular("policy_loss %d" % k,
                                          float(policy_loss[k]))
                    logger.record_tabular("value_loss %d" % k,
                                          float(value_loss[k]))
                    try:
                        logger.record_tabular(
                            'pearson %d' % k,
                            float(
                                pearsonr(report_rewards[k].flatten(),
                                         mh_true_returns[k].flatten())[0]))
                        logger.record_tabular(
                            'spearman %d' % k,
                            float(
                                spearmanr(report_rewards[k].flatten(),
                                          mh_true_returns[k].flatten())[0]))
                        logger.record_tabular('reward %d' % k,
                                              float(np.mean(rewards[k])))
                    except Exception:
                        # pearsonr/spearmanr can fail (e.g. on constant inputs); skip the stat.
                        pass

            total_loss_m = np.mean(total_loss, axis=1)
            for k in range(num_agents):
                logger.record_tabular("total_loss %d" % k, total_loss_m[k])
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'm_%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            if disc_type == 'decentralized' or disc_type == 'decentralized-all':
                for k in range(num_agents):
                    savepath = osp.join(logger.get_dir(),
                                        'd_%d_%.5i' % (k, update))
                    discriminator[k].save(savepath)
            else:
                assert False
    coord.request_stop()
    # coord.join(enqueue_threads)
    env.close()
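The pearson/spearman logging in Example 9 assumes SciPy's correlation functions are imported at module level; it measures how well the rewards recovered by the discriminators track the true environment returns. A self-contained sketch of that check:

import numpy as np
from scipy.stats import pearsonr, spearmanr

def reward_correlations(learned_rewards, true_returns):
    # Both inputs are flattened per-agent arrays, as in the logging code above.
    x = np.asarray(learned_rewards).flatten()
    y = np.asarray(true_returns).flatten()
    return pearsonr(x, y)[0], spearmanr(x, y)[0]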
Example #10
def learn(policy,
          env,
          expert,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          lam=0.95,
          log_interval=1,
          nprocs=4,
          nsteps=20,
          nstack=1,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.05,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=1000,
          lrschedule='linear',
          batch_size=1024):
    tf.reset_default_graph()
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=batch_size,
                               nstack=nstack,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    tstart = time.time()
    coord = tf.train.Coordinator()
    # enqueue_threads = [q_runner.create_threads(model.sess, coord=coord, start=True) for q_runner in model.q_runner]
    print("-------------------------------")
    print(total_timesteps // batch_size + 1)
    print("-------------------------------")

    for update in range(total_timesteps // batch_size + 1):
        e_obs, e_actions, _, _ = expert.get_next_batch(batch_size)
        e_a = [np.argmax(e_actions[k], axis=1) for k in range(len(e_actions))]
        nseconds = time.time() - tstart
        fps = int((update * batch_size) / nseconds)

        lld_loss = model.clone(e_obs, e_a)[0]
        # print(lld_loss)
        if update % log_interval == 0 or update == 1:
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * batch_size)
            logger.record_tabular("fps", fps)
            for k in range(model.num_agents):
                logger.record_tabular("lld_loss %d" % k, float(lld_loss[k]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
Example #11
def _make_env(rank):
    env = gym.make('RoboSumo-Ant-vs-Ant-v0')
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    return env