sum_reward += reward
steps += 1
buff.add(obs, p, reward, next_obs, terminated)
obs = next_obs

if terminated:
    obs = env.reset()
    terminated = False
    reward_list.append(sum_reward)
    sum_reward = 0

# Once the buffer is full, log the mean episode reward and run a training phase.
if buff.pointer > buffer_size:
    print(np.mean(reward_list))
    reward_list = []
    # Critic updates: bootstrap targets from the target network.
    for k in range(num_ite):
        states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
        Q_target = agents.compute_target([next_states])[0]
        Q_target = returns + Q_target * gammas * (1 - dones)
        agents.train_critic(states, actions, Q_target)
        agents.update()
    # Actor updates: one large batch, with per-agent advantage normalization.
    states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
    advantages = agents.compute_advantage([states] + [actions[i] for i in range(n_ant)])
    if advantage_norm:
        for i in range(n_ant):
            advantages[i] = (advantages[i] - advantages[i].mean()) / (advantages[i].std() + 1e-8)
    agents.train_actors(states, actions, advantages)
    buff.reset()
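# --- Illustrative sketch (not from the source): one plausible shape for the
# `buff` object used above. The real buffer likely stores n-step returns
# (hence the per-sample `gammas`); this minimal one-step version only shows
# the add/getBatch/pointer/reset contract the loop relies on. The class name
# and the 0.99 discount are assumptions.
import random
import numpy as np

class ReplayBufferSketch:
    def __init__(self, capacity, gamma=0.99):
        self.capacity = capacity
        self.gamma = gamma
        self.data = []
        self.pointer = 0   # compared against buffer_size in the loop above

    def add(self, obs, action, reward, next_obs, done):
        self.data.append((obs, action, reward, next_obs, float(done)))
        self.pointer += 1

    def getBatch(self, batch_size):
        batch = random.sample(self.data, min(batch_size, len(self.data)))
        obs, actions, rewards, next_obs, dones = map(np.asarray, zip(*batch))
        # One-step case: the "return" is the immediate reward and every sample
        # shares the same discount; an n-step buffer would accumulate rewards
        # over n steps and use gamma**n here instead.
        gammas = np.full(len(dones), self.gamma)
        return obs, actions, rewards, next_obs, dones, gammas

    def reset(self):
        self.data = []
        self.pointer = 0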
target_update_freq = 200
gamma = 0.04            # discount factor
explor_period = 10000   # exploration period (in steps)
env = Environ3D(seed)
buffer = ReplayBuffer(buffer_size, env)
dqn = DoubleDQN(len(env.action_Space), buffer, buffer_size, batch_size, training_freq,
                target_update_freq, gamma, explor_period, seed, env)
env.reset()

#######
# Prefill the replay buffer with random-policy transitions before training.
prefill_buffer_size = 50000
buffer.reset()
for _ in range(prefill_buffer_size):
    action = np.random.randint(0, len(env.action_Space))
    current_state = np.copy(env.state)
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done, prefill=True)
    if done:
        env.reset()
# Reset the environment once prefilling is done.
env.reset()
###########
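# --- Illustrative sketch (assumption, not from the source): how training might
# proceed once the buffer is prefilled. `select_action`, `train_step`, and
# `num_training_steps` are hypothetical names; the real DoubleDQN class may
# expose a different API or drive this loop itself.
num_training_steps = 200000  # assumed training budget
for step in range(num_training_steps):
    current_state = np.copy(env.state)
    action = dqn.select_action(current_state)    # e.g. epsilon-greedy, annealed over explor_period (assumed)
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done)
    dqn.train_step()  # hypothetically: sample batch_size transitions every training_freq steps
                      # and sync the target network every target_update_freq steps
    if done:
        env.reset()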