Example 1
import numpy as np
from procgen import ProcgenEnv

env_name = "coinrun"  # assumed value; any Procgen game name works here


def collect_observations():
    # Reset 2 Procgen envs, then take 128 uniformly random actions, recording RGB frames.
    rng = np.random.RandomState(0)
    venv = ProcgenEnv(num_envs=2, env_name=env_name, rand_seed=23)
    obs = venv.reset()
    obses = [obs["rgb"]]
    for _ in range(128):
        obs, _rew, _done, _info = venv.step(
            rng.randint(
                low=0,
                high=venv.action_space.n,
                size=(venv.num_envs,),
                dtype=np.int32,
            )
        )
        obses.append(obs["rgb"])
    return np.array(obses)
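
# Usage sketch (not part of the original snippet): Procgen frames are 64x64 RGB,
# so with 2 envs and 1 reset plus 128 steps the stacked result should be (129, 2, 64, 64, 3).
obses = collect_observations()
print(obses.shape)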
Example 2
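# Fragment of a PPO training loop (in the style of CleanRL's ppo.py): anneal the
# learning rate, collect num_steps transitions from the vectorized envs while logging
# episodic returns, then bootstrap the value estimate for GAE.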
            optimizer.param_groups[0]["lr"] = lrnow

        for step in range(0, args.num_steps):
            global_step += 1 * args.num_envs
            obs[step] = next_obs
            dones[step] = next_done

            # ALGO LOGIC: action logic
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            next_obs, reward, done, info = envs.step(action.cpu().numpy())
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

            for item in info:
                if "episode" in item.keys():
                    print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                    break

        # bootstrap value if not done
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                advantages = torch.zeros_like(rewards).to(device)
Example 3
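# Minimal ProcgenEnv usage: create 2 "coinrun" envs drawn from 12 levels starting at
# level 34, reset them, and take one (identical) random action in every sub-env.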
import numpy as np
from procgen import ProcgenEnv

# env = gym.make('procgen:procgen-coinrun-v0')
# obs = env.reset()
#
# while True:
#     obs, rew, done, info = env.step(env.action_space.sample())
#     env.render()
#     if done:
#         break

env = ProcgenEnv(num_envs=2, env_name="coinrun", num_levels=12, start_level=34)
obs = env.reset()

print(obs['rgb'].shape)

# Procgen's vectorized env expects one integer action per sub-environment.
action = np.full(2, env.action_space.sample(), dtype=np.int32)

obs, rew, done, info = env.step(action)

print(obs)
print(rew)
print(done)
print(info)

Example 4
def rollout(*,
            network,
            env,
            total_timesteps,
            eval_env=None,
            seed=None,
            nsteps=2048,
            ent_coef=0.0,
            lr=3e-4,
            vf_coef=0.5,
            max_grad_norm=0.5,
            gamma=0.99,
            lam=0.95,
            log_interval=10,
            nminibatches=4,
            noptepochs=4,
            cliprange=0.2,
            save_interval=0,
            load_path=None,
            model_fn=None,
            update_fn=None,
            init_fn=None,
            mpi_rank_weight=1,
            comm=None,
            num_steps,
            num_envs,
            env_name,
            num_levels,
            start_level,
            distribution_mode,
            **network_kwargs):
    '''
    Set up a PPO (https://arxiv.org/abs/1707.06347) model and roll it out; the original training loop is kept commented out below.

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is the beginning of
                                      training and 0 is the end of training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or a schedule function [0,1] -> R+ where 1 is the beginning of
                                      training and 0 is the end of training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network.
                                      For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
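
    # Note (an assumption, following the constfn pattern commented out a few lines
    # below): `lr` and `cliprange` may be plain floats or schedule functions of the
    # remaining-progress fraction `frac` (1.0 at the start of training, 0.0 at the
    # end), e.g. lr = lambda frac: 3e-4 * frac; floats would be wrapped with constfn
    # so that they can always be called as lr(frac).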

    set_global_seeds(seed)

    # if isinstance(lr, float): lr = constfn(lr)
    # else: assert callable(lr)
    # if isinstance(cliprange, float): cliprange = constfn(cliprange)
    # else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    # runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    # if eval_env is not None:
    #     eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    # epinfobuf = deque(maxlen=100)
    # if eval_env is not None:
    #     eval_epinfobuf = deque(maxlen=100)

    # if init_fn is not None:
    #     init_fn()

    # # Start total timer
    # tfirststart = time.perf_counter()

    # nupdates = total_timesteps//nbatch
    # for update in range(1, nupdates+1):
    #     assert nbatch % nminibatches == 0
    #     # Start timer
    #     tstart = time.perf_counter()
    #     frac = 1.0 - (update - 1.0) / nupdates
    #     # Calculate the learning rate
    #     lrnow = lr(frac)
    #     # Calculate the cliprange
    #     cliprangenow = cliprange(frac)

    #     if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...')

    #     # Get minibatch
    #     obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
    #     if eval_env is not None:
    #         eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

    #     if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

    #     epinfobuf.extend(epinfos)
    #     if eval_env is not None:
    #         eval_epinfobuf.extend(eval_epinfos)

    #     # Here what we're going to do is for each minibatch calculate the loss and append it.
    #     mblossvals = []
    #     if states is None: # nonrecurrent version
    #         # Index of each element of batch_size
    #         # Create the indices array
    #         inds = np.arange(nbatch)
    #         for _ in range(noptepochs):
    #             # Randomize the indexes
    #             np.random.shuffle(inds)
    #             # 0 to batch_size with batch_train_size step
    #             for start in range(0, nbatch, nbatch_train):
    #                 end = start + nbatch_train
    #                 mbinds = inds[start:end]
    #                 slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices))
    #     else: # recurrent version
    #         assert nenvs % nminibatches == 0
    #         envsperbatch = nenvs // nminibatches
    #         envinds = np.arange(nenvs)
    #         flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
    #         for _ in range(noptepochs):
    #             np.random.shuffle(envinds)
    #             for start in range(0, nenvs, envsperbatch):
    #                 end = start + envsperbatch
    #                 mbenvinds = envinds[start:end]
    #                 mbflatinds = flatinds[mbenvinds].ravel()
    #                 slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mbstates = states[mbenvinds]
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

    #     # Feedforward --> get losses --> update
    #     lossvals = np.mean(mblossvals, axis=0)
    #     # End timer
    # tnow = time.perf_counter()
    # # Calculate the fps (frame per second)
    # fps = int(nbatch / (tnow - tstart))

    # The update_fn hook relies on `update` from the training loop above, which is
    # commented out here, so the call is left disabled as well.
    # if update_fn is not None:
    #     update_fn(update)

    rewards = []
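    # Evaluation rollouts: build a fresh Procgen VecEnv each iteration, run the trained
    # policy until every sub-environment reports done, and log the accumulated reward.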
    for i in range(num_steps):
        env = ProcgenEnv(num_envs=num_envs,
                         env_name=env_name,
                         num_levels=num_levels,
                         start_level=start_level,
                         distribution_mode=distribution_mode)
        env = VecExtractDictObs(env, "rgb")

        env = VecMonitor(
            venv=env,
            filename=None,
            keep_buf=100,
        )

        env = VecNormalize(venv=env, ob=False)
        obs = env.reset()
        done = False
        reward = 0.0
        timesteps = 0
        while not done:
            # action = env.action_space.sample()
            # print("example of an action: ", action)
            # print("\n\n")
            # print("my action: ")
            actions, _, _, _ = model.step(obs)
            # print(actions.shape)
            # print("obs shape: ", obs.shape)
            # print(actions[0])
            # Step every sub-environment with the full action array from the model.
            obs, r, done, _ = env.step(actions)
            done = done.all()
            reward += r
            timesteps += 1
        rewards.append(reward)

        # Log the rollout index, the episode length in env steps, and the mean episode reward.
        logger.logkv("numsteps", i)
        logger.logkv("timesteps", timesteps)
        logger.logkv("episode_reward_mean", safemean(reward))
        logger.dumpkvs()

    # if update % log_interval == 0 or update == 1:
    #     # Calculates if value function is a good predicator of the returns (ev > 1)
    #     # or if it's just worse than predicting nothing (ev =< 0)
    #     ev = explained_variance(values, returns)
    #     logger.logkv("misc/serial_timesteps", update*nsteps)
    #     logger.logkv("misc/nupdates", update)
    #     logger.logkv("misc/total_timesteps", update*nbatch)
    #     logger.logkv("fps", fps)
    #     logger.logkv("misc/explained_variance", float(ev))
    #     logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    #     logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    #     if eval_env is not None:
    #         logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
    #         logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
    #     logger.logkv('misc/time_elapsed', tnow - tfirststart)
    #     for (lossval, lossname) in zip(lossvals, model.loss_names):
    #         logger.logkv('loss/' + lossname, lossval)

    #     logger.dumpkvs()
    # if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
    #     checkdir = osp.join(logger.get_dir(), 'checkpoints')
    #     os.makedirs(checkdir, exist_ok=True)
    #     savepath = osp.join(checkdir, '%.5i'%update)
    #     print('Saving to', savepath)
    #     model.save(savepath)

    return model
Example 5
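# Fragment: load a saved PPO2-style checkpoint ('000002400.ckpt'), then roll the policy
# out in the training env while rendering, tracking per-step rewards.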
                 vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight)

load_path = '000002400.ckpt'
if load_path is not None:
    model.load(load_path)

obs = training_env.reset()
dones = [False]
states = model.initial_state
import numpy as np
step = 0

rew = []
for _ in range(video_interval + video_length + 1):
    actions, values, states, _ = model.step(obs, S=states, M=dones)
    obs[:], rewards, dones, infos = training_env.step(actions)
    rew.append(rewards)
    step += 1
    print(f"Steps: {step}")
    training_env.render()
    if dones[0]:
        break

print(np.mean(rew))
training_env.close()

# recorded_video = glob.glob(os.path.join('./recordings', "*.mp4"))