Example #1
0
print('Collecting experience...')

# Bounded buffer holding the 100 most recent 'episode' info entries reported
# by the environment (older entries are dropped automatically by the deque).
epinfobuf = deque(maxlen=100)
# Wall-clock timestamp taken before the loop, for measuring learning time.
start_time = time.time()

# Reset the environment and keep the initial observation as an ndarray.
s = np.array(env.reset())

# Training loop: STEP_NUM // N_ENVS iterations, `step` counting from 1.
# Presumably each iteration advances all N_ENVS environments at once -- confirm.
for step in range(1, STEP_NUM // N_ENVS + 1):
    a = quota.choose_action(s, EPSILON, EPSILON_O)

    # Take the action and receive next state, reward, done flag and infos.
    s_, r, done, infos = env.step(a)
    # Collect the 'episode' entry of every info dict that has a truthy one.
    for info in infos:
        maybeepinfo = info.get('episode')
        if maybeepinfo: epinfobuf.append(maybeepinfo)
    s_ = np.array(s_)

    # Clip rewards to their sign (-1, 0 or +1) for numerical stability.
    clip_r = np.sign(r)

    # Store one transition per environment index, together with the value
    # of quota.options[i] for that index.
    for i in range(N_ENVS):
        quota.store_transition(s[i], a[i], clip_r[i], s_[i], done[i],
                               quota.options[i].item())

# Anneal the epsilon (exploration strategy).
Example #2
0
def train(args):
    """Train an actor-critic policy with policy gradients on vectorized envs.

    Rollouts of at most ``args.num_steps`` steps are collected from
    ``args.num_envs`` environments stepped together. After each rollout the
    returns are computed with the selected return function and one Adam step
    is taken on the loss produced by ``policy_gradient``. Whenever the step
    counter (incremented once per ``envs.step`` call) is a multiple of
    ``args.test_every``, the policy is evaluated with ``test`` on a separate
    environment and the mean over 10 runs is recorded.

    Args:
        args: namespace providing ``seed``, ``return_function`` (one of
            ``"GAE"``, ``"Q"``, ``"A"``), ``num_steps``, ``env``,
            ``num_envs``, ``lr``, ``weight_decay``, ``max_steps`` and
            ``test_every``. It is also forwarded to the return function.

    Returns:
        list: the mean test reward recorded at each evaluation point.

    Raises:
        ValueError: if ``args.num_steps`` is smaller than 1 or
            ``args.return_function`` is not a known name.
    """
    print(args)

    # Fail fast on bad configuration. Without these checks the function only
    # crashed with an UnboundLocalError after the environments were created
    # (on `done` for an empty rollout, on `return_function` for an unknown
    # name).
    if args.num_steps < 1:
        raise ValueError(
            f"num_steps must be at least 1, got {args.num_steps!r}")

    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A
    else:
        raise ValueError(
            f"unknown return_function {args.return_function!r}; "
            "expected 'GAE', 'Q' or 'A'")

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # A rollout length of exactly 200 switches on Monte Carlo mode.
    # NOTE(review): 200 presumably matches the env's episode limit -- confirm.
    monte_carlo = args.num_steps == 200

    # NOTE(review): the second argument given to make_env is i + num_envs and
    # does not involve args.seed -- confirm this is intended.
    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        monte_carlo)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)
    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1

    obs = torch.from_numpy(envs.reset())
    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

        for _ in range(args.num_steps):
            # Call the module itself (not .forward) so that any registered
            # hooks run; this also matches the bootstrap call below.
            probs, state_value = policy(obs)
            dist = Categorical(probs)
            action = dist.sample()

            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            # mask is 0 where an env reported done, 1 otherwise
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: and {test_reward}")

            # Stop the rollout early once every environment reported done.
            all_done = (1 - done).sum() == 0
            if all_done:
                break

        # Bootstrap from the value of the last observation unless every
        # environment finished, in which case the bootstrap value is 0.
        next_value = 0
        if not all_done:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values,
                                  args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()

        optim.step()
        # if monte carlo, we need to reset the environment by hand
        if monte_carlo:
            obs = torch.from_numpy(envs.reset())
    return test_rewards