Example #1
	sum_reward += reward
	steps += 1
	buff.add(obs, p, reward, next_obs, terminated)
	obs = next_obs

	if terminated:
		obs = env.reset()
		terminated = False
		reward_list.append(sum_reward)
		sum_reward = 0
		# once enough transitions have been collected, run a training phase
		if buff.pointer > buffer_size:

			print(np.mean(reward_list))
			reward_list = []

			# critic: several mini-batch updates towards the bootstrapped target
			for k in range(num_ite):
				states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
				Q_target = agents.compute_target([next_states])[0]
				Q_target = returns + Q_target*gammas*(1 - dones)
				agents.train_critic(states, actions, Q_target)
				agents.update()

			# actors: one update on a larger batch, with optional advantage normalization
			states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
			advantages = agents.compute_advantage([states]+[actions[i] for i in range(n_ant)])
			if advantage_norm:
				for i in range(n_ant):
					advantages[i] = (advantages[i] - advantages[i].mean())/(advantages[i].std()+1e-8)
			agents.train_actors(states, actions, advantages)

			buff.reset()
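
The loop above relies on a replay buffer that exposes add, getBatch, pointer and reset. The original buffer class is not shown, so the following is only a minimal sketch under assumptions: it stores one-step rewards and a constant discount, whereas the real buffer may compute multi-step returns and per-transition gamma powers; actions are kept as one list per agent so that actions[i] indexes agent i, matching the compute_advantage call.

# Sketch only: the names add, getBatch, pointer and reset are taken from the
# calls above; the internals (one-step returns, constant gamma) are assumptions.
import numpy as np

class ReplayBufferSketch:
    def __init__(self, n_ant, gamma=0.99):
        self.n_ant = n_ant
        self.gamma = gamma
        self.pointer = 0
        self.obs, self.acts, self.rews, self.next_obs, self.dones = [], [], [], [], []

    def add(self, obs, actions, reward, next_obs, done):
        # store a single transition; actions holds one entry per agent
        self.obs.append(obs)
        self.acts.append(actions)
        self.rews.append(reward)
        self.next_obs.append(next_obs)
        self.dones.append(float(done))
        self.pointer += 1

    def getBatch(self, batch_size):
        # sample with replacement and return per-agent action arrays
        idx = np.random.randint(0, self.pointer, batch_size)
        states = np.array([self.obs[i] for i in idx])
        actions = [np.array([self.acts[i][a] for i in idx]) for a in range(self.n_ant)]
        returns = np.array([self.rews[i] for i in idx])
        next_states = np.array([self.next_obs[i] for i in idx])
        dones = np.array([self.dones[i] for i in idx])
        gammas = np.full(batch_size, self.gamma)
        return states, actions, returns, next_states, dones, gammas

    def reset(self):
        self.pointer = 0
        self.obs, self.acts, self.rews, self.next_obs, self.dones = [], [], [], [], []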
Example #2
import numpy as np

target_update_freq = 200
gamma = 0.04
explor_period = 10000

env = Environ3D(seed)
buffer = ReplayBuffer(buffer_size, env)
dqn = DoubleDQN(len(env.action_Space), buffer, buffer_size, batch_size, training_freq,\
               target_update_freq, gamma, explor_period, seed, env)

env.reset()

#######
# prefill the replay buffer with random-action transitions

prefill_buffer_size = 50000
buffer.reset()

for _ in range(prefill_buffer_size):

    action = np.random.randint(0, len(env.action_Space))
    current_state = np.copy(env.state)
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done, prefill=True)

    if done:
        env.reset()

# reset the environment once prefilling is done
env.reset()

###########
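
After prefilling, the agent would typically interact with the environment and train from the buffer. The original example stops here, so the loop below is only a hedged sketch: dqn.select_action and dqn.train_step are hypothetical method names (the real DoubleDQN class may expose a different interface), and num_steps is an assumed budget; only env.reset, env.step, env.state and buffer.store appear above.

# Hedged sketch of the interaction/training loop that could follow the prefill.
# dqn.select_action and dqn.train_step are hypothetical; buffer.store is
# assumed to default to prefill=False when the keyword is omitted.
num_steps = 100000  # assumed training budget

env.reset()
for step in range(num_steps):
    current_state = np.copy(env.state)
    action = dqn.select_action(current_state, step)  # hypothetical epsilon-greedy policy
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done)

    if step % training_freq == 0:
        dqn.train_step()  # hypothetical single gradient update

    if done:
        env.reset()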