noise = OUNoise(action_dim)
done = False
early_stop = False
train_epoch = 0
rewards = 0

while not early_stop:
    # Sample trajectories
    for step in range(NB_STEP):
        # Query the actor for an action and scale it by the action range
        action = model.actor.get_action(torch.FloatTensor(state)) * ACTION_BOUND
        # Add noise for exploration
        action = noise.get_action(action.detach().numpy(), step)
        # Step the environment
        next_state, reward, done, _ = env.step(action)

        # Record this time step in the replay buffer
        fifo.push(state, action, reward, next_state, done)
        rewards += reward
        state = next_state

        if len(fifo) > BATCH_SIZE:
            states_batch, actions_batch, rewards_batch, next_states_batch, done_batch = fifo.sample(BATCH_SIZE)
            states_batch = torch.FloatTensor(states_batch)
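The loop above (and the multi-process variant that follows) relies on two helpers defined elsewhere: an Ornstein-Uhlenbeck exploration process with a reset() / get_action(action, step) interface, and a replay buffer exposing push, sample, and __len__. A minimal sketch of both is given here for reference; the hyperparameters (mu, theta, sigma, decay_period) and the buffer capacity are illustrative assumptions, not the values used in the original code.

import random
from collections import deque

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100_000):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def get_action(self, action, t=0):
        # Mean-reverting update: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        # Anneal sigma from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma \
            - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return action + self.state


class ReplayBuffer:
    """FIFO experience buffer with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.stack, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)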
max_frames = 12000 * NUM_PROCESSES
max_steps = 500
frame_idx = 0
episode_rewards = []
batch_size = 128

if __name__ == "__main__":
    # Start from the initial state
    while frame_idx < max_frames:
        state = envs.reset()
        ou_noise.reset()
        episode_reward = 0

        for step in range(max_steps):
            action = policy_net.get_action(state)
            action = ou_noise.get_action(action, step)
            next_state, reward, done, _ = envs.step(action)
            replay_buffer.push(state, action, reward, next_state, done)

            if len(replay_buffer) > batch_size:
                ddpg.update(batch_size, replay_buffer)

            state = next_state
            episode_reward += reward
            frame_idx += NUM_PROCESSES

            if frame_idx % (NUM_PROCESSES * 100) == 0:
                # plot(frame_idx, rewards)
                # rewards_tmp = np.array(rewards)
                if episode_rewards:
                    print("finished frames {}, {:.1f}".format(