obs_img = np.zeros((50, 50))
obs_dis = last_distance
obs_ori = orientation
new_obs_img = np.zeros((50, 50))
new_obs_dis = last_distance
new_obs_ori = orientation

##################################################
''' We create the policy network (the Actor model) '''
policy = TD3(state_dim, action_dim, max_action)

##################################################
''' We create the Experience Replay memory '''
replay_buffer = ReplayBuffer(sample_size=sample_size)

##################################################
im = CoreImage("./images/MASK1.png")
imgCV2 = cv2.imread('./images/MASK1.png')
rows, cols, dims = imgCV2.shape

# Initializing the map
first_update = True

def init():
    global sand
    global goal_x
    global goal_y
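    # Assumption based on how these globals are used elsewhere in the project:
    # init() is expected to fill the global `sand` array from the MASK1 image
    # loaded above and to place the goal coordinates (goal_x, goal_y) on the map.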
episode_reward = 0
max_episode_timesteps = 500
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = 5
action_dim = 1
max_action = 5
min_action = -5

# Getting our AI, which we call "brain"; it contains the neural network that represents our Q-function
# brain = Dqn(5, 3, 0.9)
action2rotation = [0, 5, -5]
# spacenetwork = ObsSpaceNetwork()
policy = TD3(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
last_reward = 0
scores = []
im = CoreImage("./images/MASK1.png")
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
episode_timesteps = 0
done = True
t0 = time.time()

# textureMask = CoreImage(source="./kivytest/simplemask1.png")

# Initializing the map
first_update = True
i = 0
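# Note on the interfaces assumed by the training loop below (their definitions
# are not shown in this section): TD3 is used through select_action(obs),
# train(...) and save(), and ReplayBuffer through add(transition) for the
# (obs, new_obs, action, reward, done) tuples stored during rollout, with a
# sampling method presumably called inside policy.train().
# action2rotation appears to be a leftover of the earlier discrete DQN setup
# (see the commented brain = Dqn(5, 3, 0.9) above); with TD3 the single
# continuous action appears to play the role of the steering rotation,
# bounded by min_action/max_action.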
def update(self, dt):
    global longueur
    global largeur
    longueur = self.width
    largeur = self.height
    if first_update:
        init()


def evaluate_policy(policy, eval_episodes=10):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = reset()
        done = False
        while not done:
            action = policy.select_action(obs)
            obs, reward, done, _ = move(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("---------------------------------------")
    print("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print("---------------------------------------")
    return avg_reward


file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print("---------------------------------------")
print("Settings: %s" % (file_name))
print("---------------------------------------")

if not os.path.exists("./results"):
    os.makedirs("./results")
if save_models and not os.path.exists("./pytorch_models"):
    os.makedirs("./pytorch_models")

torch.manual_seed(seed)
np.random.seed(seed)
state_dim = [32, 32, 1]
action_dim = 1
max_action = 5
policy = TD3(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
evaluations = [evaluate_policy(policy)]


def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = 400

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

# We start the main training loop, which runs for max_timesteps timesteps
while total_timesteps < max_timesteps:

    # If the episode is done
    if done:

        # If we are not at the very beginning, we start the training process of the model
        if total_timesteps != 0 and total_timesteps > batch_size:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # We evaluate the episode and we save the policy
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            policy.save(file_name, directory="./pytorch_models")
            np.save("./results/%s" % (file_name), evaluations)

        # When the training step is done, we reset the state of the environment
        obs = reset()

        # Set done to False
        done = False

        # Set the rewards and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = np.random.normal(0, 1, size=1).clip(-1, 1).astype(np.float32)
    else:
        # After start_timesteps timesteps, we switch to the model
        action = policy.select_action(obs)
        # If the expl_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=1)).clip(-1, 1)

    # The agent performs the action in the environment, then reaches the next state and receives the reward
    new_obs, reward, done, _ = move(action)

    # We check if the episode is done
    # done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
    if episode_timesteps + 1 == max_episode_steps:
        done = True
    done_bool = float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition into the Experience Replay memory (ReplayBuffer)
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# After the training loop: report the total training time, run a final evaluation, and save the policy
t1 = time.time()
print("Total time taken: {}".format(t1 - t0))
evaluations.append(evaluate_policy(policy))
if save_models:
    policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)
CarApp().stop()
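# To reuse the weights saved above for inference, the matching call would
# presumably be policy.load(file_name, directory="./pytorch_models"),
# assuming the TD3 class defines a load() symmetric to the save() used here.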