def test(n_epi):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i in range(1, n_epi + 1):
        score = 0                                           # initialize the score
        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]             # get the current state
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]          # send the action to the environment
            next_state = env_info.vector_observations[0]     # get the next state
            reward = env_info.rewards[0]                     # get the reward
            score += reward                                  # update the score
            done = env_info.local_done[0]                    # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state                               # roll over the state to next time step
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i, np.mean(scores_window)))
    env.close()
def dqn(LR, GAMMA, TAU, BUFF, UPD,
        n_episodes=1000, max_t=100,
        eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        LR (float): learning rate
        GAMMA (float): discount factor
        TAU (float): soft-update interpolation factor
        BUFF (int): replay buffer size
        UPD (int): how often to update the network
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size, action_size, LR, GAMMA, TAU, BUFF, UPD, seed=0)
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            # if np.mean(scores_window) >= 13.0:
            #     print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
    # return scores
    return np.mean(scores_window)
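# The loops above and below all anneal epsilon the same way: multiply by
# eps_decay once per episode and floor the result at eps_end. A minimal,
# standalone sketch (the helper name episodes_until_floor is ours, not from
# any of the snippets) of how long the common default schedule takes to
# reach its floor:
def episodes_until_floor(eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Count episodes until the per-episode epsilon schedule hits eps_end."""
    eps, n = eps_start, 0
    while eps > eps_end:
        eps = max(eps_end, eps_decay * eps)
        n += 1
    return n

# With the defaults used throughout this file (1.0 -> 0.01 at 0.995/episode)
# this is roughly 919 episodes, i.e. exploration stays noticeable for most
# of a typical 1000-2000 episode training run.
# print(episodes_until_floor())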
def dqn(args, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        args : command line arguments
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    state_size = 37
    action_size = 4
    agent = Agent(state_size, action_size, 1)
    for i_episode in range(1, args.num_episodes + 1):
        # resetting the environment for a new episode
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        cnt = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            cnt += 1
            if done:
                break
        scores_window.append(score)          # save most recent score in the 100 episode window
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score in the last 100 episodes: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % args.save_every == 0:
            print('\nSaving Checkpoint for {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       os.path.join(args.save_checkpoint_path,
                                    'checkpoint_' + str(i_episode) + '.pth'))
    return scores
def trainFunction(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0, priority=True)
    epsilons = []
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action.astype(np.int32))[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        epsilons.append(eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
    # if np.mean(scores_window) >= 13.0:
    print('\nEnvironment finished in {:d} episodes!\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_window)))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return scores, epsilons
def dqn(agent: Agent, params: Params):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = params.eps_start             # initialize epsilon
    for i_episode in range(1, params.n_episodes + 1):
        agent.init_episode()
        score = 0  # initialize the score
        for t in range(params.max_t):
            agent.act(eps)  # to be defined in the agent
            agent.step()    # to be defined in the agent
            score += agent.get_reward()
            if agent.get_done():
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        eps = max(params.eps_end, params.eps_decay * eps)  # decrease epsilon
        # print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores
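# The dqn() loop above delegates everything to the Agent: it owns the
# environment and exposes init_episode/act/step/get_reward/get_done (plus
# qnetwork_local for checkpointing). The author's Agent is not shown here;
# the class below is only an illustrative sketch of that interface, assuming
# a classic Gym-style env held by the agent. The name EnvWrappingAgent and
# the random action placeholder are ours.
class EnvWrappingAgent:
    def __init__(self, env):
        self.env = env
        self.state = None
        self.last_reward = 0.0
        self.last_done = False
        self.pending_action = None

    def init_episode(self):
        # start a new episode and clear the per-step bookkeeping
        self.state = self.env.reset()
        self.last_reward, self.last_done = 0.0, False

    def act(self, eps):
        # epsilon-greedy selection over the Q-network would go here;
        # a random action stands in for it in this sketch
        self.pending_action = self.env.action_space.sample()

    def step(self):
        # apply the chosen action; a learning update on
        # (state, action, reward, next_state, done) would also go here
        next_state, reward, done, _ = self.env.step(self.pending_action)
        self.state, self.last_reward, self.last_done = next_state, reward, done

    def get_reward(self):
        return self.last_reward

    def get_done(self):
        return self.last_done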
def train(n_episodes=2000, eps_start=1.0, eps_end=0.025, eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0                                          # initialize the score
        while True:
            action = agent.act(state, eps)                # select an action
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward                               # update the score
            state = next_state                            # roll over the state to next time step
            if done:                                      # exit loop if episode finished
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_Nav_V01_13.pth')
            env.close()
            break
    return scores
def train_banana_collector(env, brain_name, maxEpisodes, threshold, \ eps_start, eps_end, eps_decay, seed, filename, memory_type): # reset the environment env_info = env.reset(train_mode=True)[brain_name] brain = env.brains[brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) env_info = env.reset(train_mode=True)[brain_name] agent = Agent(state_size=state_size, action_size=action_size, seed=seed, memory_type=memory_type) state = env_info.vector_observations[0] # get the current state # initialize the score score = 0 # current score within an episode scores = [] # list containing scores from each episode scores_window = deque(maxlen=100) # last 100 scores # initialize epsilon eps = eps_start # now execute up to maximum "maxEpisodes" episodes for i_episode in range(1, maxEpisodes): # 1.Step: reset the environment - set the train_mode to True !! env_info = env.reset(train_mode=True)[brain_name] # 2. Step: get the current state state = env_info.vector_observations[0] # 3.Step: set the score of the current episode to 0 score = 0 # 4.Step: while episode has not ended (done = True) repeat while True: # 5.Step: Calculate the next action from agent with epsilon eps action = agent.act(state, eps) #print("Action = " , action) # 6.Step: Tell the environment about this action and get result env_info = env.step(action)[brain_name] # 7.Step: now let's get the state observation from observation next_state = env_info.vector_observations[0] # 8.Step: now let's get the reward observation from observation reward = env_info.rewards[0] #print("Reward = " , reward) # 9.Step: now let's get the done observation from observation done = env_info.local_done[0] # 10.Step: Add the reward of the last action-state result score += reward # 11.Step: Execute a training step of the agent agent.step(state, action, reward, next_state, done) # 12.Step: Continue while-loop with next_state as current state state = next_state # 13.Step: in case of end of episode print the result and break loop if done: #print("Episode " , i_episode , " has ended with score: " , score) break # 14.Step: Finally append the score of last epsisode to the overall scores scores_window.append(score) scores.append(score) # 15.Step: Calculate next epsilon eps = max(eps_end, eps_decay * eps) # decrease epsilon print('\rEpisode {}\tAverage Score: {:.2f} , epsilon: {}'.format( i_episode, np.mean(scores_window), eps), end="") # 16.Step: Print results every 100 episodes if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) # 17.Step: In case the performance "threshold" is exceeded --> stop and save the current agents neural network if np.mean(scores_window) >= threshold and len(scores_window) == 100: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode, np.mean(scores_window))) torch.save(agent.qnn_local.state_dict(), filename) break return scores
class DQN(): def __init__(self, state_size, action_size, env): self.agent = Agent(state_size=state_size, action_size=action_size, seed=0) self.env = env self.saved_network = 'VisualBanana_DQN_chkpt.pth' def train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, score_window_size=100, target_score=13.0, save=True, verbose=True): """Deep Q-Learning. Params ====== n_episodes (int): max. number of training episodes max_t (int): max. number of timesteps per episode eps_start (float): starting value of epsilon, for epsilon-greedy action selection eps_end (float): min. value of epsilon eps_decay (float): multiplicative factor (per episode) for decreasing epsilon """ moving_avgs = [ ] # list containing moving average scores (over last 100 episodes) scores = [] # list containing scores from each episode scores_window = deque( maxlen=score_window_size) # last score_window_size scores eps = eps_start # initialize epsilon save12 = False start = time.time() for i_episode in range(1, n_episodes + 1): state = self.env.reset() score = 0 for t in range(max_t): action = self.agent.act(state, eps) next_state, reward, done, _ = self.env.step(action) self.agent.step(state, action, reward, next_state, done) state = next_state score += reward if done: break scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(eps_end, eps_decay * eps) # decrease epsilon avg_score = np.mean(scores_window) moving_avgs.append(avg_score) if (avg_score >= 13.0) and not save12: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) np.save('VisualBanana_Scores.npy', np.array(scores)) save12 = True if (avg_score >= target_score) and (i_episode > 100): print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'\ .format(i_episode-100, np.mean(scores_window))) self.solved = True if save: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) break print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window)), end="") if (i_episode % 100 == 0): print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) if (i_episode % 100 == 0): end = time.time() elapsed = (end - start) / 60.0 print('\tElapsed: {:3.2f} mins.'.format(elapsed)) if save: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) end = time.time() elapsed = (end - start) / 60.0 print('\n*** TOTAL ELAPSED: {:3.2f} mins. ***'.format(elapsed)) return scores, moving_avgs
def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.05, eps_decay=0.995, train_mode=True):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train_mode (bool): set environment into training mode if True.
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    env = UnityEnvironment(file_name="Banana/Banana.exe", base_port=64738, no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]
    state_size = len(env_info.vector_observations[0])
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)
    for i_episode in range(1, n_episodes + 1):
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = np.int32(agent.act(state, eps))
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                # keep the reset info so the next episode starts from a fresh state
                env_info = env.reset(train_mode=train_mode)[brain_name]
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) > 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_vanilla.pth')
            break
    return scores
def dqn(agent: Agent, env: UnityEnvironment, n_episodes: int, max_t: int,
        eps_start: float, eps_end: float, eps_decay: float) -> None:
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores: List[float] = []  # list containing scores from each episode
    scores_window: Deque[float] = deque(
        maxlen=settings.score_window_size)  # last settings.score_window_size scores
    eps: float = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info: BrainInfo = env.reset(train_mode=True)[brain_name]  # reset the environment
        state: np.ndarray = env_info.vector_observations[0]           # get the current state
        score: float = 0
        for t in range(max_t):
            action: int = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state: np.ndarray = env_info.vector_observations[0]  # get the next state
            reward: float = env_info.rewards[0]                       # get the reward
            done: bool = env_info.local_done[0]                       # see if episode has finished
            agent.step(Experience(state, action, reward, next_state, done))
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % settings.score_window_size == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= settings.solved_at:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - settings.score_window_size, np.mean(scores_window)))
            # The env is solved, save the outputs.
            create_output_files(agent.qnetwork_local, scores, i_episode,
                                Path() / settings.output_dir, settings.checkpoints_dir)
            env.close()
            break
    # episodes reached, save the outputs.
    create_output_files(agent.qnetwork_local, scores, n_episodes,
                        Path() / settings.output_dir, settings.checkpoints_dir)
    env.close()
def demo4_LearningPathPlanning(setting): n_sample = 100 # Environment env = FireEnvironment(64, 64) # Vehicle to generate observation mask vehicle = Vehicle(n_time_windows=512, grid_size=(64, 64), planner_type='Default') # Trainer and Estimator dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size=(env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=16, gru_hidden_dim=16) ### DQN agent dqn_agent = DQN_Agent(state_size=16, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0) # Train Data Buffer memory = SingleTrajectoryBuffer(N_MEMORY_SIZE) # Train Iteration Logger writer = SummaryWriter() # Video Writier video_writer1 = ImageStreamWriter('LearningPlanner.avi', FPS, image_size=(1200, 820)) # Add concat. text setting_text = '' for k, v in setting.items(): setting_text += k setting_text += ':' setting_text += str(v) setting_text += '\t' writer.add_text('setting', setting_text) ######################################## ### Interacting with the Environment ### ######################################## mask_obs, obs, state = env.reset() state_est_grid = dyn_autoencoder.u_k ### Loss Monitors ### list_loss = [] list_cross_entropy_loss = [] list_entropy_loss = [] list_rewards = [] list_new_fire_count = [] list_action = [] ### Filling the Data Buffer ### for i in tqdm.tqdm(range(N_TRAIN_WAIT)): map_visit_mask, img_resized = vehicle.full_mask() mask_obs, obs, state, reward, info = env.step(map_visit_mask) memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long()) for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)): # determine epsilon-greedy action from current sate h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() epsilon = 0.1 action = dqn_agent.act(h_k, epsilon) list_action.append(action) ### Collect Data from the Env. 
### map_visit_mask, img_resized = vehicle.plan_a_trajectory( state_est_grid, n_sample, action) mask_obs, obs, state, reward, info = env.step(map_visit_mask) memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long()) ### Run the Estimator ### state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask) h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() #### Update the reinforcement learning agent ### dqn_agent.step(h_k, action, reward, h_kp1, done=False) list_rewards.append(reward) list_new_fire_count.append(info['new_fire_count']) ################################ ### Rendering and Save Video ### ################################ img_env = env.output_image() img_agent = dyn_autoencoder.output_image(state_est_grid) # State Est #blank = np.zeros((400, 200, 3)) img_top = img_env #np.concatenate((blank, img_env[:,:800], blank), axis=1) blank = np.zeros((20, 1200, 3)) img_top = np.concatenate((img_top, blank), axis=0) img_top = (img_top * 255).astype('uint8') img_state_est_grid_uint8 = (img_agent * 255).astype('uint8') backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB) img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0) #<-- to be saved render('Dynamic Auto Encoder', img_bayes_uint8, 1) # Save video # video_writer1.write_image_frame(img_bayes_uint8) ### Training ### loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update( memory, N_TRAIN_BATCH, N_TRAIN_WINDOW) list_loss.append(loss_val) list_cross_entropy_loss.append(loss_val_cross) list_entropy_loss.append(loss_val_ent) if i % N_LOGGING_PERIOD == 0: avg_loss = np.mean(np.array(list_loss)) list_loss = [] writer.add_scalar('dynautoenc/loss', avg_loss, i) avg_loss_cross = np.mean(np.array(list_cross_entropy_loss)) list_cross_entropy_loss = [] writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i) avg_loss_entropy = np.mean(np.array(list_entropy_loss)) list_entropy_loss = [] writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i) avg_reward = np.mean(np.array(list_rewards)) list_rewards = [] writer.add_scalar('perform/rewards', avg_reward, i) avg_new_fire_count = np.mean(np.array(list_new_fire_count)) list_new_fire_count = [] writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i) writer.add_scalar('perform/pc_coverd_new_fire', avg_reward / avg_new_fire_count, i) action_0_count = list_action.count(0) action_1_count = list_action.count(1) action_2_count = list_action.count(2) action_3_count = list_action.count(3) writer.add_scalar('action_count/0', action_0_count / len(list_action), i) writer.add_scalar('action_count/1', action_1_count / len(list_action), i) writer.add_scalar('action_count/2', action_2_count / len(list_action), i) writer.add_scalar('action_count/3', action_3_count / len(list_action), i) list_action = [] writer.add_scalar('obs_state0/o00', O_np_val[0][0], i) writer.add_scalar('obs_state1/o01', O_np_val[0][1], i) writer.add_scalar('obs_state2/o02', O_np_val[0][2], i) writer.add_scalar('obs_state0/o10', O_np_val[1][0], i) writer.add_scalar('obs_state1/o11', O_np_val[1][1], i) writer.add_scalar('obs_state2/o12', O_np_val[1][2], i) writer.add_scalar('obs_state0/o20', O_np_val[2][0], i) writer.add_scalar('obs_state1/o21', O_np_val[2][1], i) writer.add_scalar('obs_state2/o22', O_np_val[2][2], i) print( 'losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f' % (i, avg_loss, avg_loss_cross, avg_loss_entropy)) print('memory size at iteration: %d, size: %d' % (i, len(memory.obs_memory))) if (i + 1) % 
N_SAVING_PERIOD == 0: f_name = setting['name'] dyn_autoencoder.save_the_model(i, f_name) dqn_agent.save_the_model(i, f_name) video_writer1.close()
    while True:
        # determine the epsilon-greedy action from the current state
        actions = agent.act(state, epsilon)
        converted_actions = [convert_action(a) for a in actions]
        # print("CONVERTED_ACTIONS", actions, converted_actions)

        # send the action to the environment and receive the resultant environment information
        env_info = env.step(converted_actions)[brain_name]
        next_state = env_info.vector_observations  # get the next state
        rewards = env_info.rewards                 # get the reward
        dones = env_info.local_done                # see if episode has finished

        # Send (S, A, R, S') info to the DQN agent for a neural network update
        agent.step(state, actions, rewards, next_state, dones)

        # set new state to current state for determining next action
        state = next_state

        # Update episode score
        scores += rewards

        # If Unity indicates that the episode is done,
        # then exit the episode loop, to begin a new episode
        if all(dones):
            break

    print("Scores", scores)
    # Add episode score to Scores and...
def dqn(n_episodes=4000, max_t=3000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size=3, action_size=8, seed=0)
    start_pos = (200, 600)
    end_pos = (800, 375)
    env = environment(MAP, start_pos, end_pos)
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state, _, _ = env.reset(start_pos, end_pos)
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                # print(state)
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        # if np.mean(scores_window) >= 200.0:
        #     print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, np.mean(scores_window)))
        if i_episode % 200 == 0:
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint' + str(i_episode) + '.pth')
            # torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
    return scores
def train(fullcover, name, setting): n_sample = 20 # Environment env = FireEnvironment(64, 64) # Vehicle to generate observation mask vehicle = Vehicle(n_time_windows=1000, grid_size=(64,64), planner_type=setting['planner_type']) # Trainer and Estimator dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=16, gru_hidden_dim=16) # Train Data Buffer memory = SingleTrajectoryBuffer(N_MEMORY_SIZE) ### DQN agent dqn_agent = DQN_Agent(state_size=16, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0) # Train Iteration Logger from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() # Add concat. text setting_text = '' for k,v in setting.items(): setting_text += k setting_text += str(v) setting_text += '\t' writer.add_text('setting', setting_text) ######################################## ### Interacting with the Environment ### ######################################## mask_obs, obs, state = env.reset() map_visit_mask, img_resized = vehicle.full_mask() state_est_grid = dyn_autoencoder.u_k ### Loss Monitors ### list_loss = [] list_cross_entropy_loss = [] list_entropy_loss = [] list_rewards = [] list_count_fire_visit = [] list_count_all_fire = [] list_action = [] ### Filling the Data Buffer ### for i in tqdm.tqdm(range(N_TRAIN_WAIT)): if fullcover: map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) else: map_visit_mask, img_resized = vehicle.full_mask() mask_obs, obs, state, reward = env.step(map_visit_mask) memory.add(mask_obs, state, map_visit_mask) for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)): # determine epsilon-greedy action from current sate h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() epsilon = 0.1 action = dqn_agent.act(h_k, epsilon) list_action.append(action) ### Collect Data from the Env. ### if fullcover: map_visit_mask, img_resized = vehicle.full_mask() else: map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) mask_obs, obs, state, reward = env.step(map_visit_mask) memory.add(mask_obs, state, map_visit_mask) ### Run the Estimator ### state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask) h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() #### Update the reinforcement learning agent ### dqn_agent.step(h_k, action, reward, h_kp1, done=False) list_rewards.append(reward) fire_count = (torch.sum(state[2])).item() fire_visit = (torch.sum(mask_obs.permute(2,0,1) * state[2].unsqueeze(0))).item() if fire_count < 1: print('no fire') else: list_count_fire_visit.append(fire_visit) list_count_all_fire.append(fire_count) ### Render the Env. and the Est. 
### if i % N_RENDER_PERIOD == 0: img_env = env.output_image() img_state_est_grid = dyn_autoencoder.output_image(state_est_grid) render('env', img_env, 1) render('img_state_est_grid', img_state_est_grid, 1) ### Training ### loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW) list_loss.append(loss_val) list_cross_entropy_loss.append(loss_val_cross) list_entropy_loss.append(loss_val_ent) if i%N_LOGGING_PERIOD == 0: avg_loss = np.mean(np.array(list_loss)) list_loss = [] writer.add_scalar('dynautoenc/loss', avg_loss, i) avg_loss_cross = np.mean(np.array(list_cross_entropy_loss)) list_cross_entropy_loss = [] writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i) avg_loss_entropy = np.mean(np.array(list_entropy_loss)) list_entropy_loss = [] writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i) avg_reward = np.mean(np.array(list_rewards)) list_rewards = [] writer.add_scalar('perform/rewards', avg_reward, i) avg_count_fire_visit = np.mean(np.array(list_count_fire_visit)) list_count_fire_visit = [] writer.add_scalar('perform/avg_count_fire_visit', avg_count_fire_visit, i) avg_count_all_fire = np.mean(np.array(list_count_all_fire)) list_count_all_fire = [] writer.add_scalar('perform/avg_count_all_fire', avg_count_all_fire, i) action_0_count = list_action.count(0) action_1_count = list_action.count(1) action_2_count = list_action.count(2) action_3_count = list_action.count(3) list_action = [] if setting['planner_type'] == 'Default': writer.add_scalar('action_count/0', action_0_count, i) writer.add_scalar('action_count/1', action_1_count, i) writer.add_scalar('action_count/2', action_2_count, i) writer.add_scalar('action_count/3', action_3_count, i) writer.add_scalar('obs_state0/o00', O_np_val[0][0], i) writer.add_scalar('obs_state1/o01', O_np_val[0][1], i) writer.add_scalar('obs_state2/o02', O_np_val[0][2], i) writer.add_scalar('obs_state0/o10', O_np_val[1][0], i) writer.add_scalar('obs_state1/o11', O_np_val[1][1], i) writer.add_scalar('obs_state2/o12', O_np_val[1][2], i) writer.add_scalar('obs_state0/o20', O_np_val[2][0], i) writer.add_scalar('obs_state1/o21', O_np_val[2][1], i) writer.add_scalar('obs_state2/o22', O_np_val[2][2], i) print('losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f' % (i, avg_loss, avg_loss_cross, avg_loss_entropy)) print('memory size at iteration: %d, size: %d' % (i, len(memory.obs_memory))) if (i+1)%N_SAVING_PERIOD==0: f_name = name dyn_autoencoder.save_the_model(i, f_name)
def train_dqn(dev, weights_file, n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        dev (string): cpu or gpu
        weights_file (string): name of the file to save the weights
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []    # list containing scores from each episode
    averages = []  # averages of the scores; position i (1-indexed) has the average of the last min(i, 100) episodes
    scores_window = deque(maxlen=100)  # last 100 scores
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    state_size = len(env_info.vector_observations[0])
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, seed=0, device=dev)
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        averages.append(np.mean(scores_window))
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        if i_episode % 100 != 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, averages[i_episode - 1]), end="")
        else:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, averages[i_episode - 1]))
        if averages[i_episode - 1] >= 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, averages[i_episode - 1]))
            torch.save(agent.qnetwork_local.state_dict(), weights_file)
            break
    env.close()
    return scores, averages
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        max_score=200.0, layers_neurones=64):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size=state_size, action_size=action_size, seed=0, layers_neurones=layers_neurones)
    filename = (f'./results/n_episodes={n_episodes}, max_t={max_t}, eps_start={eps_start}, '
                f'eps_end={eps_end}, eps_decay={eps_decay}, max_score = {max_score}, '
                f'layers_neurones = {layers_neurones}')
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        # state = env.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        score = 0
        for t in range(max_t):
            state = env_info.vector_observations[0]
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode has finished
            # next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            with open(f'{filename}.json', 'w') as filehandle:
                json.dump(scores, filehandle)
        if np.mean(scores_window) >= max_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), f'{filename}.pth')
            with open(f'{filename}.json', 'w') as filehandle:
                json.dump(scores, filehandle)
            break
    torch.save(agent.qnetwork_local.state_dict(), f'{filename}.pth')
    return scores
def train(agent_config, n_episodes=2000, max_t=1000, base_port=5005, save_path=None, name=None):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    env = UnityEnvironment(file_name="Banana_Linux_NoVis/Banana.x86_64",
                           no_graphics=True, base_port=base_port)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    eps_start = agent_config.get('eps_start', 1.0)
    eps_end = agent_config.get('eps_end', 0.01)
    eps_decay = agent_config.get('eps_decay', 0.995)
    lr = agent_config.get('lr', 1e-3)
    lr_decay = agent_config.get('lr_decay', 1)
    agent = Agent(seed=0, **agent_config)
    # reset
    env_info = env.reset(train_mode=True)[brain_name]
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    with trange(n_episodes, desc='episode') as episode_bar:
        for episode in episode_bar:
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = agent.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)          # save most recent score
            scores.append(score)                 # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            lr = lr * lr_decay                   # decrease learning rate
            for g in agent.optimizer.param_groups:
                g['lr'] = lr
            episode_bar.set_postfix(avg_score=np.mean(scores_window))
    if save_path:
        torch.save(agent.qnetwork_local.state_dict(), save_path)
    env.close()
    return pd.Series(scores, name=name)
def run(self, run_id=1, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        lr=5e-4, use_double_dqn=False, use_soft_update=True):
    start = time.time()
    agent = Agent(state_size=37, action_size=4, seed=0, lr=lr,
                  use_double_dqn=use_double_dqn, use_soft_update=use_soft_update)
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        # reset the environment
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        # get the current state
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            # print("action: ", action)
            # send the action to the environment
            env_info = self.env.step(action)[self.brain_name]
            # get the next state
            next_state = env_info.vector_observations[0]
            # get the reward
            reward = env_info.rewards[0]
            # see if episode has finished
            done = env_info.local_done[0]
            # let the agent store the transition and learn from it
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 14.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            end = time.time()
            elapsed = end - start
            print("\nTime taken to solve: {:.2f} minutes".format(elapsed / 60.0))
            run_dir = "results/{}".format(run_id)
            os.mkdir(run_dir)
            torch.save(agent.qnetwork_local.state_dict(), "{}/checkpoint.pth".format(run_dir))
            break
    return scores
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    # Get environment instance
    env = UnityEnvironment(file_name=BANANA_FILE)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # Reset environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Get initial state, state size and action size
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    # Setup agent
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)
    # Train!
    max_avg_score = -100000            # max avg score over 100 episodes
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0 and np.mean(scores_window) > max_avg_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            # break
            max_avg_score = np.mean(scores_window)
    # Close environment
    env.close()
    return scores
def run(experiment_name, num_iterations, learning_rate, buffer_size, batch_size, gamma, epsilon, epsilod_decay, epsilon_min, stack_size, device, is_ddqn, evaluation_rate, log_directory): scores = [] episodic_accum = 0 epsidoic_rewards = [] iteration_rewards = [] episode = 1 agent = Agent(env=env, state_space=state_space, action_space=action_space, learning_rate=learning_rate,\ buffer_size=buffer_size, batch_size=batch_size, gamma=gamma,\ device=device, in_channels=stack_size, is_ddqn = is_ddqn) #initializing log directory for tensorboard if not os.path.exists(log_directory): os.makedirs(log_directory) tb_writer = SummaryWriter('{}/{}'.format(log_directory, experiment_name)) frame_count = 0 epoch_plot_count = 0 stop = False prev_iteration = None while agent.num_train_updates < num_iterations + 1 and not stop: state = env.reset() done = False # current state & 3-previous states state_frames = deque(maxlen=stack_size) episode_reward = [] while not done: frame_count += 1 _state = preprocess_state(state) state = torch.from_numpy(_state).float() # if it's the first frame, copy the same state multiple time in the stack if len(state_frames) < stack_size: for i in range(stack_size): state_frames.append(state) else: state_frames.append(state) state_stack = torch.stack(list(state_frames)).unsqueeze(dim=0) action = agent.act(state_stack, epsilon) prev_action = action next_state, reward, done, info = env.step(action) _next_state = preprocess_state(next_state) _next_state = torch.from_numpy(_next_state).float() agent.step(state_frames.__copy__(), action, reward, _next_state, done) state = next_state episodic_accum += reward iteration_rewards.append(reward) if agent.num_train_updates > 0: # evaluate every 1M steps and decay epsilon (based on paper) if agent.num_train_updates % evaluation_rate == 0 and prev_iteration != agent.num_train_updates: epsilon = max(epsilon_min, epsilon * epsilod_decay) prev_iteration = agent.num_train_updates if agent.num_train_updates > num_iterations: stop = True episode += 1 epsidoic_rewards.append(episodic_accum) episodic_accum = 0. if episode % 100 == 0 and len(epsidoic_rewards) > 20: tb_writer.add_scalar('Episode Accum score', np.mean(epsidoic_rewards[-20:]), episode) print('episode_num:{}\tepisode_score:{}\tepsilon:{}\tmemory_size:{}'.format(\ episode, np.mean(epsidoic_rewards[-20:]), epsilon,len(agent.memory))) torch.save(agent.QNetwork_local.state_dict(), '{}_checkpoint.pth'.format(experiment_name)) return epsidoic_rewards
env = gym.make('Breakout-v0')
# env = gym.make('CarRacing-v0')
env.seed(SEED)
obs_size, action_size = env.observation_space.shape, env.action_space.n
print('State shape: ', obs_size)
print('Number of actions: ', action_size)

state_size = 1024
h_size = 128
agent = Agent(action_size, state_size, h_size=h_size, seed=SEED)

episodes = 100
steps = 150
for i_episode in range(episodes):
    obs = env.reset()
    obs = resize(obs, size=64)
    score = 0
    for t in range(steps):
        action = agent.act(obs)
        next_obs, reward, done, _ = env.step(action)
        # print(reward)  # debug: per-step reward
        next_obs = resize(next_obs, size=64)
        agent.step(obs, action, reward, next_obs, done)
        obs = next_obs
        score += reward
        if done:
            break
class DQN(): # env assumption: env.reset(), env.render(), env.step(), env.close() def __init__(self, name, state_size, action_size, env, load_net=False): self.agent = Agent(name, state_size=state_size, action_size=action_size, seed=0) self.env = env self.saved_network = name + '_dqn_checkpoint.pth' self.load_net = load_net if load_net: print('Loading pretrained network...') self.agent.qnetwork_local.load_state_dict( torch.load(self.saved_network)) self.agent.qnetwork_target.load_state_dict( torch.load(self.saved_network)) print('Loaded.') def train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, score_window_size=100, target_score=13.0, save=True, verbose=True): """Deep Q-Learning. Params ====== n_episodes (int): maximum number of training episodes max_t (int): maximum number of timesteps per episode eps_start (float): starting value of epsilon, for epsilon-greedy action selection eps_end (float): minimum value of epsilon eps_decay (float): multiplicative factor (per episode) for decreasing epsilon """ scores = [] # list containing scores from each episode scores_window = deque( maxlen=score_window_size) # last score_window_size scores eps = eps_start # initialize epsilon save12 = False for i_episode in range(1, n_episodes + 1): state = self.env.reset() score = 0 for t in range(max_t): action = self.agent.act(state, eps) next_state, reward, done, _ = self.env.step(action) self.agent.step(state, action, reward, next_state, done) state = next_state score += reward if done: break scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(eps_end, eps_decay * eps) # decrease epsilon avg_score = np.mean(scores_window) if avg_score > 13.0 and not save12 and not self.load_net: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) np.save('scores13_0824.npy', np.array(scores)) save12 = True if avg_score >= target_score and i_episode > 100: if verbose: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode, np.mean(scores_window))) self.solved = True if save: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) break if verbose: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window)), end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) if save: torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network) return scores def play(self, trials=3, steps=200, load=False): if load: self.agent.qnetwork_local.load_state_dict( torch.load(self.saved_network)) for i in range(trials): total_reward = 0 print('Start Trial...') state = self.env.reset() for j in range(steps): action = self.agent.act(state) self.env.render() state, reward, done, _ = self.env.step(action) total_reward += reward if reward != 0: print("Current Reward:", reward, "Total Reward:", total_reward) if done: print('Done.') break self.env.close()
def demo5_ComparePolicies(setting, env): n_sample = 2048 # Vehicle to generate observation mask vehicle = Vehicle(n_time_windows=64, grid_size=(64,64), planner_type='Default') # Trainer and Estimator dyn_autoencoder = DynamicAutoEncoder(SETTING, grid_size = (env.map_width, env.map_height), n_state=3, n_obs=3, encoding_dim=4, gru_hidden_dim=4) ### DQN agent dqn_agent = DQN_Agent(state_size=4, action_size=4, replay_memory_size=1000, batch_size=64, gamma=0.99, learning_rate=0.01, target_tau=0.01, update_rate=1, seed=0) # Train Data Buffer memory = SingleTrajectoryBuffer(N_MEMORY_SIZE) # Video Writier ''' video_f_name = 'UsePlanner'+ '_' + setting['name'] + '_' + setting['policy_type'] + '.avi' video_writer1 = ImageStreamWriter(video_f_name, FPS, image_size=(1200,820)) ''' # Train Iteration Logger writer = SummaryWriter() # Add concat. text setting_text = '' for k,v in setting.items(): setting_text += k setting_text += ':' setting_text += str(v) setting_text += '\t' writer.add_text('setting', setting_text) ######################################## ### Interacting with the Environment ### ######################################## ### Loss Monitors ### list_rewards = [] list_new_fire_count = [] list_action = [] list_loss = [] ### Filling the Data Buffer ### for i in tqdm.tqdm(range(N_TRAIN_WAIT)): map_visit_mask, img_resized = vehicle.full_mask() mask_obs, obs, state, reward, info = env.step(map_visit_mask) memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long()) mask_obs, obs, state = env.reset() state_est_grid = dyn_autoencoder.u_k for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)): # determine epsilon-greedy action from current sate h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() epsilon = 0.1 action = dqn_agent.act(h_k, epsilon) ### Collect Data from the Env. 
### # Plan a trajectory policy_type = setting['policy_type'] if policy_type == 'Default': map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) elif policy_type == 'Random': action = 777 map_visit_mask, img_resized = vehicle.generate_a_random_trajectory() elif policy_type == 'Act0': action = 0 map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) elif policy_type == 'Act1': action = 1 map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) elif policy_type == 'Act2': action = 2 map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) else: action = 3 map_visit_mask, img_resized = vehicle.plan_a_trajectory(state_est_grid, n_sample, action) list_action.append(action) # Collect the masked observation mask_obs, obs, state, reward, info = env.step(map_visit_mask) memory.add(mask_obs.detach().long(), state.detach().long(), map_visit_mask.detach().long()) ### Run the Estimator ### state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask) h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy() list_rewards.append(reward) list_new_fire_count.append(info['new_fire_count']) update = True #### Update the reinforcement learning agent and Dyn Auto Enc ### if policy_type != 'Random': dqn_agent.step(h_k, action, reward, h_kp1, False, update) loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(memory, N_TRAIN_BATCH, N_TRAIN_WINDOW, update) list_loss.append(loss_val) ################################ ### Rendering and Save Video ### ################################ img_env = env.output_image() img_agent = dyn_autoencoder.output_image(state_est_grid) # State Est #blank = np.zeros((400, 200, 3)) img_top = img_env #np.concatenate((blank, img_env[:,:800], blank), axis=1) blank = np.zeros((20, 1200, 3)) img_top = np.concatenate((img_top, blank), axis=0) img_top = (img_top*255).astype('uint8') img_state_est_grid_uint8 = (img_agent*255).astype('uint8') backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB) img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0) #<-- to be saved render('Dynamic Auto Encoder', img_bayes_uint8, 1) # Save video # #video_writer1.write_image_frame(img_bayes_uint8) if i%N_LOGGING_PERIOD == 0: avg_reward = np.mean(np.array(list_rewards)) list_rewards = [] writer.add_scalar('perform/rewards', avg_reward, i) avg_new_fire_count = max(np.mean(np.array(list_new_fire_count)), 1) # to avoid division by zero list_new_fire_count = [] writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i) writer.add_scalar('perform/pc_coverd_new_fire', avg_reward/avg_new_fire_count, i) if policy_type != 'Random': avg_loss = np.mean(np.array(list_loss)) list_loss = [] writer.add_scalar('dynautoenc/loss', avg_loss, i) action_0_count = list_action.count(0) action_1_count = list_action.count(1) action_2_count = list_action.count(2) action_3_count = list_action.count(3) writer.add_scalar('action_count/0', action_0_count/len(list_action), i) writer.add_scalar('action_count/1', action_1_count/len(list_action), i) writer.add_scalar('action_count/2', action_2_count/len(list_action), i) writer.add_scalar('action_count/3', action_3_count/len(list_action), i) list_action = [] writer.add_scalar('obs_state0/o00', O_np_val[0][0], i) writer.add_scalar('obs_state1/o01', O_np_val[0][1], i) writer.add_scalar('obs_state2/o02', O_np_val[0][2], i) writer.add_scalar('obs_state0/o10', O_np_val[1][0], i) writer.add_scalar('obs_state1/o11', 
O_np_val[1][1], i) writer.add_scalar('obs_state2/o12', O_np_val[1][2], i) writer.add_scalar('obs_state0/o20', O_np_val[2][0], i) writer.add_scalar('obs_state1/o21', O_np_val[2][1], i) writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)
        elif round(action) == 2:
            converted_action = np.array([[0, 0, 1, 0]])  # rotate counterclockwise
        elif round(action) == 3:
            converted_action = np.array([[0, 0, 2, 0]])  # rotate clockwise
        # converted_action = np.column_stack([np.random.randint(0, converted_action_size[i], size=(converted_agent_num)) for i in range(len(converted_action_size))])
        # converted_action = np.array([[1, 0, 0, 0]])

        # send the action to the environment and receive the resultant environment information
        env_info = env.step(converted_action)[brain_name]
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if episode has finished

        # Send (S, A, R, S') info to the DQN agent for a neural network update
        agent.step(state, action, reward, next_state, done)

        # set new state to current state for determining next action
        state = next_state

        # Update episode score
        score += reward

        # If Unity indicates that episode is done,
        # then exit episode loop, to begin new episode
        if done:
            break

    # Add episode score to Scores and...
    # Calculate mean score over last 100 episodes
    # Mean score is calculated over current episodes until i_episode > 100
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]                    # get the current state
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            action = int(action)  ### FIX
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= TARGET_SCORE:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), "ckpt/{}".format(CHECKPOINT_NAME))
            break
    return scores
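# Most of the training loops above return the per-episode scores and report a
# 100-episode moving average while running. A small sketch of how those
# returned scores might be plotted against that same moving average (assumes
# matplotlib is available; plot_scores is our helper name, not part of any
# snippet above):
import numpy as np
import matplotlib.pyplot as plt

def plot_scores(scores, window=100):
    """Plot raw episode scores and their trailing `window`-episode average."""
    moving_avg = [np.mean(scores[max(0, i - window + 1):i + 1])
                  for i in range(len(scores))]
    plt.plot(scores, alpha=0.4, label='score per episode')
    plt.plot(moving_avg, label='{}-episode moving average'.format(window))
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.show()

# Example usage (assuming one of the dqn()/train() functions above has run):
# scores = dqn()
# plot_scores(scores)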