print('Collecting experience...')

# buffer of recent episode infos, used to track accumulated reward
epinfobuf = deque(maxlen=100)

# record the training start time
start_time = time.time()

# reset the environments
s = np.array(env.reset())

# training loop
for step in range(1, STEP_NUM // N_ENVS + 1):
    a = quota.choose_action(s, EPSILON, EPSILON_O)

    # take the action and get the next state
    s_, r, done, infos = env.step(a)

    # collect episode statistics for logging
    for info in infos:
        maybeepinfo = info.get('episode')
        if maybeepinfo:
            epinfobuf.append(maybeepinfo)
    s_ = np.array(s_)

    # clip rewards to {-1, 0, +1} for numerical stability
    clip_r = np.sign(r)

    # store the transition for each parallel environment
    for i in range(N_ENVS):
        quota.store_transition(s[i], a[i], clip_r[i], s_[i], done[i],
                               quota.options[i].item())

    # anneal epsilon (exploration strategy)
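# --- Hypothetical illustration (not part of the original source) ---
# The comment above refers to annealing EPSILON, but the snippet is truncated
# there. A common choice is a linear schedule that decays the exploration rate
# from a starting value to a floor over a fixed number of environment steps.
# EPSILON_START, EPSILON_MIN, and EXPLORE_STEPS are assumed names used only
# for this sketch; N_ENVS comes from the snippet above.
EPSILON_START, EPSILON_MIN, EXPLORE_STEPS = 1.0, 0.01, 1_000_000

def linear_epsilon(step):
    # `step` counts loop iterations, each of which advances N_ENVS env steps
    frac = min(1.0, step * N_ENVS / EXPLORE_STEPS)
    return EPSILON_START + frac * (EPSILON_MIN - EPSILON_START)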
def train(args):
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # pick the return estimator used for the policy-gradient targets
    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A

    MONTE_CARLO = args.num_steps == 200

    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        MONTE_CARLO)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)

    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(), lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1
    obs = torch.from_numpy(envs.reset())

    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

        # collect a rollout of up to num_steps transitions
        for _ in range(args.num_steps):
            probs, state_value = policy.forward(obs)
            dist = Categorical(probs)
            action = dist.sample()
            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
            obs = torch.from_numpy(obs)

            steps += 1
            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: {test_reward}")

            # stop the rollout early if every environment has finished
            if (1 - done).sum() == 0:
                break

        # bootstrap from the value of the last observation unless all envs are done
        next_value = 0
        if not (1 - done).sum() == 0:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values, args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # if Monte Carlo, we need to reset the environments by hand
        if MONTE_CARLO:
            obs = torch.from_numpy(envs.reset())

    return test_rewards
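# --- Hypothetical illustration (not part of the original source) ---
# train() calls return_function(next_value, rewards, masks, state_values, args)
# but the GAE estimator itself is not shown here. Below is a minimal sketch of
# what such a Generalized Advantage Estimation helper could look like under
# the assumptions that args.gamma (discount factor) and args.lam (GAE lambda)
# exist, and that each element of rewards/masks/state_values is a tensor of
# shape (num_envs, 1) collected during the rollout. The actual implementation
# in the source may differ.
import torch


def GAE(next_value, rewards, masks, state_values, args):
    # append the bootstrapped value of the final observation (0 if all done)
    values = state_values + [next_value]
    gae = 0
    advantages = []
    # work backwards through the rollout, accumulating the exponentially
    # weighted sum of TD errors: A_t = delta_t + gamma * lam * mask_t * A_{t+1}
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + args.gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + args.gamma * args.lam * masks[t] * gae
        advantages.insert(0, gae)
    # one advantage tensor per rollout step; policy_gradient presumably weights
    # the stored log-probabilities by these estimates
    return advantages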