Example #1
            p.append(out[i][0])  # collect each agent's action
    # step the environment with the joint action of all agents
    next_obs, reward, terminated, info = env.step(np.hstack(p))
    setps += 1
    ep_len += 1
    # store the transition in every agent's replay buffer
    for i in range(n_ant):
        buff[i].add(obs, p[i], reward, next_obs, terminated)
    obs = next_obs

    # reset the environment when the episode ends or hits the length limit
    if (terminated) | (ep_len == max_ep_len):
        obs = env.reset()
        terminated = False
        ep_len = 0

    # periodically evaluate the current policy
    if setps % 10000 == 0:
        print(test_agent())

    # warm up the buffers first, then update only every 50 environment steps
    if (setps < 1000) | (setps % 50 != 0):
        continue

    for e in range(50):

        # sample a mini-batch from each agent's replay buffer
        for i in range(n_ant):
            X[i], A[i], R[i], next_X[i], D[i] = buff[i].getBatch(batch_size)
        q_e = agents.compute_target([next_X[i] for i in range(n_ant)])
        # bootstrapped target: the second half of q_e supplies an alpha-scaled
        # correction term subtracted from each agent's target value
        for i in range(n_ant):
            Q_target[i] = R[i] + (q_e[i] - alpha * q_e[i + n_ant]) * gamma * (1 - D[i])

        agents.train_critics(X, A, Q_target)
        agents.train_actors(X)
        agents.update()
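
In the update above, each agent's critic target is the reward plus a discounted bootstrapped value, with an alpha-scaled correction term subtracted before discounting. A minimal, self-contained sketch of that arithmetic on dummy NumPy data, assuming (as the indexing suggests) that compute_target returns the n_ant target values followed by n_ant correction terms:

import numpy as np

# Dummy shapes and hyperparameters, chosen only for illustration.
n_ant, batch_size = 3, 4
gamma, alpha = 0.95, 0.2

R = [np.random.rand(batch_size, 1) for _ in range(n_ant)]        # per-agent reward batches
D = [np.zeros((batch_size, 1)) for _ in range(n_ant)]            # per-agent done flags
q_e = [np.random.rand(batch_size, 1) for _ in range(2 * n_ant)]  # n_ant targets, then n_ant correction terms

Q_target = [R[i] + (q_e[i] - alpha * q_e[i + n_ant]) * gamma * (1 - D[i]) for i in range(n_ant)]
print(Q_target[0].shape)  # (4, 1)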
Example #2
	sum_reward += reward
	setps += 1
	# store the transition in the shared buffer
	buff.add(obs, p, reward, next_obs, terminated)
	obs = next_obs

	# at the end of an episode: reset, log the episode return, and
	# run updates once the buffer holds enough transitions
	if terminated:
		obs = env.reset()
		terminated = False
		reward_list.append(sum_reward)
		sum_reward = 0
		if buff.pointer > buffer_size:

			print(np.mean(reward_list))
			reward_list = []

			# several critic updates on sampled mini-batches
			for k in range(num_ite):
				states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
				Q_target = agents.compute_target([next_states])[0]
				# bootstrapped target from returns, discount factors, and done flags
				Q_target = returns + Q_target*gammas*(1 - dones)
				agents.train_critic(states, actions, Q_target)
				agents.update()

			# one actor update on a larger batch
			states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
			advantages = agents.compute_advantage([states]+[actions[i] for i in range(n_ant)])
			if advantage_norm:
				# standardize each agent's advantages (zero mean, unit variance)
				for i in range(n_ant):
					advantages[i] = (advantages[i] - advantages[i].mean())/(advantages[i].std()+1e-8)
			agents.train_actors(states, actions, advantages)

			buff.reset()
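
The actor step above optionally standardizes each agent's advantages before training. A minimal sketch of that normalization on dummy NumPy data (the shapes are assumptions for illustration):

import numpy as np

# Dummy per-agent advantage batches, shapes chosen only for illustration.
n_ant, batch = 3, 2000
advantages = [np.random.randn(batch, 1) for _ in range(n_ant)]

for i in range(n_ant):
    # zero mean, unit variance per agent; 1e-8 guards against a zero std
    advantages[i] = (advantages[i] - advantages[i].mean()) / (advantages[i].std() + 1e-8)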