Python SubprocVecEnv.step Examples

Programming Language: Python

Namespace/Package Name: wrappers

Class/Type: SubprocVecEnv

Method/Function: step

Examples at hotexamples.com: 2

Python SubprocVecEnv.step - 2 examples found. These are the top-rated real-world Python examples of wrappers.SubprocVecEnv.step, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

SubprocVecEnv(5)

reset(2)

step(2)

render(1)

Example #1

Show file

print('Collecting experience...')

# Bounded buffer holding the 100 most recent 'episode' info entries
# reported by the environments (older entries are dropped automatically).
epinfobuf = deque(maxlen=100)
# Wall-clock start time, kept so the elapsed learning time can be measured.
start_time = time.time()

# Reset the environment(s) and keep the initial observations as an array.
s = np.array(env.reset())

# Training loop: STEP_NUM // N_ENVS iterations, numbered from 1.
# NOTE(review): presumably each env.step() advances N_ENVS environments at
# once, which is why the total step budget is divided by N_ENVS - confirm.
for step in range(1, STEP_NUM // N_ENVS + 1):
    a = quota.choose_action(s, EPSILON, EPSILON_O)

    # Take the action; get next states, rewards, done flags and info dicts.
    s_, r, done, infos = env.step(a)
    # Keep every non-empty 'episode' entry found in the per-step info dicts.
    # NOTE(review): presumably this entry only appears when an episode has
    # just finished - verify against the environment wrapper.
    for info in infos:
        maybeepinfo = info.get('episode')
        if maybeepinfo: epinfobuf.append(maybeepinfo)
    s_ = np.array(s_)

    # Clip rewards to their sign (-1, 0 or +1) for numerical stability.
    clip_r = np.sign(r)

    # Store one transition per environment index, together with the option
    # value currently held in quota.options for that index.
    for i in range(N_ENVS):
        quota.store_transition(s[i], a[i], clip_r[i], s_[i], done[i],
                               quota.options[i].item())

    # Anneal epsilon (exploration strategy); the excerpt ends here.

Example #2

Show file

def train(args):
    """Train an actor-critic policy on a set of subprocess environments.

    Repeatedly collects a rollout of up to ``args.num_steps`` vectorised
    steps, turns it into returns with the selected return estimator, and
    takes one Adam step on the policy-gradient loss. Every
    ``args.test_every`` steps the policy is evaluated on a separate test
    environment.

    Args:
        args: Namespace-like object. The fields read here are ``seed``,
            ``return_function`` (one of ``"GAE"``, ``"Q"``, ``"A"``),
            ``num_steps``, ``env``, ``num_envs``, ``lr``, ``weight_decay``,
            ``max_steps`` and ``test_every``.

    Returns:
        A list with one entry per evaluation: the mean of 10 ``test`` calls
        on the test environment.

    Raises:
        ValueError: If ``args.return_function`` is not a supported name or
            ``args.num_steps`` is smaller than 1.
    """
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Resolve the return estimator before any environment is created, so a
    # bad name fails immediately instead of surfacing as an unbound-variable
    # error after worker processes were spawned and a rollout was run.
    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A
    else:
        raise ValueError(
            f"Unknown return_function {args.return_function!r}; "
            "expected one of 'GAE', 'Q' or 'A'")

    # An empty rollout would leave the loop state below undefined.
    if args.num_steps < 1:
        raise ValueError(
            f"num_steps must be at least 1, got {args.num_steps!r}")

    # A rollout length of exactly 200 switches on Monte Carlo mode.
    # NOTE(review): 200 presumably matches the environment's episode length
    # - confirm against the environment being trained on.
    monte_carlo = args.num_steps == 200

    # NOTE(review): the second make_env argument is derived from num_envs
    # only and does not involve args.seed - confirm this is intended.
    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        monte_carlo)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)
    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    # Counts envs.step() calls (vectorised steps), starting from 1.
    steps = 1

    obs = torch.from_numpy(envs.reset())
    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []
        all_done = False

        for _ in range(args.num_steps):
            # Call the module itself rather than .forward() so that
            # nn.Module hooks run, matching the bootstrap call below.
            probs, state_value = policy(obs)
            dist = Categorical(probs)
            action = dist.sample()

            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            # Mask is 0 where an environment reported done, 1 otherwise.
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: and {test_reward}")

            # Stop the rollout early once every environment is done.
            all_done = (1 - done).sum() == 0
            if all_done:
                break

        # Bootstrap from the value of the last observation unless every
        # environment has terminated, in which case the tail value is 0.
        next_value = 0
        if not all_done:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values,
                                  args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()

        optim.step()
        # if monte carlo, we need to reset the environment by hand
        if monte_carlo:
            obs = torch.from_numpy(envs.reset())
    return test_rewards