def main(unused_argv):
    del unused_argv
    configs = extract_configs(*FLAGS.eval_config)

    # Instantiate the Banana environment.
    env = BananaWrapper(file_name="./Banana")
    state_size = env.observation_size
    action_size = env.action_size

    # Instantiate the agent.
    agent = Agent(state_size=state_size, action_size=action_size, configs=configs)

    # Load the trained model.
    agent.qnetwork_local.load_state_dict(
        torch.load("results/checkpoints/DoubleDQN.pth"))

    horizon = 1000
    episodes = 5
    for _ in range(episodes):
        state = env.reset()
        for _ in range(horizon):
            # Perform the action chosen by the trained policy.
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            time.sleep(0.05)  # slow the rollout down for viewing
            if done:
                break
            state = next_state

    # Close the environment.
    env.close()
def main():
    ################################################
    # Components required from main_02.py
    ################################################
    # Spin up the environment.
    env = gym.make('LunarLander-v2')
    env.seed(0)

    # Spin up the agent (with its underlying neural-network model).
    agent = Agent(state_size=8, action_size=4, seed=0)

    ################################################
    # Import the trained agent and render performance
    ################################################
    # Load the weights from file.
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(10):
        state = env.reset()
        img = plt.imshow(env.render(mode='rgb_array'))
        for j in range(400):
            action = agent.act(state)
            img.set_data(env.render(mode='rgb_array'))
            plt.axis('off')
            state, reward, done, _ = env.step(action)
            if done:
                break

    env.close()
def DQN_gif(file_name):
    env = gym.make('LunarLander-v2')
    env.seed(0)
    agent = Agent(state_size=8, action_size=4, seed=0)
    agent.qnetwork_local.load_state_dict(
        torch.load('checkpoint.pth',
                   map_location=lambda storage, loc: storage))

    images = []
    state = env.reset()
    for j in range(200):
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        frame = env.render(mode='rgb_array')
        # Overlay the step count and reward on the frame.
        pil_img = Image.fromarray(frame)
        draw = ImageDraw.Draw(pil_img)
        text = 'Step = {}\nReward = {}'.format(j + 1, reward)
        draw.text((20, 20), text, (255, 255, 255))
        images.append(np.asarray(pil_img))
        if done:
            break
    imageio.mimsave(file_name, images)
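
# A one-line usage sketch; the output path is illustrative.
DQN_gif('Media/lander_demo.gif')  # writes the annotated rollout as a GIF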
def main():
    env = UnityEnvironment(file_name="./../Banana.app")

    # Get the default brain.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Instantiate the agent from the observed state and action sizes.
    env_info = env.reset(train_mode=False)[brain_name]
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    # Load the weights from file.
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(200):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            if done:
                break
            state = next_state

    env.close()
def test(self, run_id=5):
    agent = Agent(state_size=37, action_size=4, seed=0)
    run_dir = "results/{}".format(run_id)

    # Load the weights from file.
    agent.qnetwork_local.load_state_dict(
        torch.load("{}/checkpoint.pth".format(run_dir)))

    for i in range(5):
        # Reset the environment.
        env_info = self.env.reset(train_mode=False)[self.brain_name]
        state = env_info.vector_observations[0]
        for j in range(50):
            action = agent.act(state)
            # Send the action to the environment.
            env_info = self.env.step(action)[self.brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if the episode has finished
            if done:
                break
            state = next_state                            # roll the state over
def main(
    file_name="/Users/joshuaschoenfield/Downloads/Banana.app",
    weights_file="checkpoint_banana_2_LONG_SAFE.pth",
):
    with get_environment(file_name=file_name) as env:
        from dqn_agent import Agent

        agent = Agent(state_size=37, action_size=4, seed=0)
        agent.qnetwork_local.load_state_dict(torch.load(weights_file))

        scores = []
        num_iterations = 100
        for i in range(num_iterations):
            state = reset_and_get_first_state(env, train_mode=True)
            score = 0
            for j in range(2000):
                action = agent.act(state, eps=0)  # act greedily
                state, reward, done = get_next_state_reward_done(env, action)
                score += reward
                if done:
                    break
            scores.append(score)

        ax = plot_score_cumulative_distribution(scores)
        ax.figure.savefig("Media/validation_scores_cumulative.png")
        np.savetxt("validation_scores.txt", scores)
        return scores
def process(args):
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score

    action_size = brain.vector_action_space_size
    state_size = len(state)
    agent = Agent(state_size, action_size, 1, args.model_path)

    while True:
        action = agent.act(state, 0.0)                # select a greedy action
        env_info = env.step(action)[brain_name]       # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if the episode has finished
        score += reward                               # update the score
        state = next_state                            # roll the state over to the next time step
        if done:                                      # exit the loop when the episode finishes
            break

    print("Score: {}".format(score))
def test(n_epi):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i in range(n_epi):
        score = 0                      # initialize the score
        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]             # get the current state
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            score += reward                               # update the score
            done = env_info.local_done[0]                 # see if the episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state                            # roll the state over
            if done:
                break
        scores_window.append(score)  # save the most recent score
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i, np.mean(scores_window)))
    env.close()
def main():
    agent = Agent(state_size=3, action_size=8, seed=0)
    start_pos = (200, 600)
    end_pos = (800, 375)
    env = environment(MAP, start_pos, end_pos)

    # Load the weights from file.
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(1):
        path_x = [start_pos[0]]
        path_y = [start_pos[1]]
        state, _, _ = env.reset(start_pos, end_pos)
        for j in range(6000):
            action = agent.act(state)
            state, reward, done = env.step(action)
            path_x.append(state[0])
            path_y.append(state[1])
            if done:
                break

    # Plot the traversed path over the map.
    x_end, y_end = end_pos
    plt.figure(figsize=(10, 6), dpi=200)
    plt.plot(path_x, path_y, 'ro', markevery=20)
    plt.plot(x_end, y_end, 'bx')
    plt.contourf(np.array(MAP), linestyles='dashed')
    plt.gca().set_aspect('equal', adjustable='box')
    plt.colorbar()
    plt.show()
    env.close()
def dqn(LR, GAMMA, TAU, BUFF, UPD, n_episodes=1000, max_t=100,
        eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning, hyperparameter-sweep variant.

    Params
    ======
        LR (float): learning rate
        GAMMA (float): discount factor
        TAU (float): interpolation factor for the soft target-network update
        BUFF (int): replay buffer size
        UPD (int): how often (in steps) to update the network
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size, action_size, LR, GAMMA, TAU, BUFF, UPD, seed=0)
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save the most recent score
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return np.mean(scores_window)
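
# A minimal sketch of how this sweep variant might be invoked, assuming `env`,
# `brain_name`, `state_size`, and `action_size` are module-level globals as the
# body implies; the hyperparameter values below are illustrative.
for lr in (1e-4, 5e-4, 1e-3):
    final_avg = dqn(LR=lr, GAMMA=0.99, TAU=1e-3, BUFF=int(1e5), UPD=4)
    print('LR={}: average score over the last 100 episodes: {:.2f}'.format(
        lr, final_avg))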
def dqn(args, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        args: command-line arguments (num_episodes, save_every, save_checkpoint_path)
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    state_size = 37
    action_size = 4
    agent = Agent(state_size, action_size, 1)
    for i_episode in range(1, args.num_episodes + 1):
        # Reset the environment for a new episode.
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save the most recent score in the 100-episode window
        scores.append(score)         # save the most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score in the last 100 episodes: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % args.save_every == 0:
            print('\nSaving checkpoint at {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(),
                       os.path.join(args.save_checkpoint_path,
                                    'checkpoint_' + str(i_episode) + '.pth'))
    return scores
def test(dev, weights_file, n_episodes=100, max_t=1000):
    """Test the environment with the parameters stored in weights_file.

    Params
    ======
        dev (string): cpu or gpu
        weights_file (string): name of the file from which to load the weights
        n_episodes (int): number of test episodes to run
        max_t (int): maximum number of timesteps per episode
    """
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    state_size = len(env_info.vector_observations[0])
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, seed=0, device=dev)

    # Load the weights from file.
    print('Loading weights')
    try:
        checkpoint = torch.load(weights_file)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file))
        sys.exit(1)
    agent.qnetwork_local.load_state_dict(checkpoint)

    scores = []
    print('Running {} episodes'.format(n_episodes))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        score = 0
        state = env_info.vector_observations[0]
        for j in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            if done:
                break
        scores.append(score)
        if i_episode % 100 != 0:
            print('\rEpisode {}\tScore: {:.0f}\tAverage Score: {:.2f}'.format(
                i_episode, score, np.mean(scores)), end="")
        else:
            print('\rEpisode {}\tScore: {:.0f}\tAverage Score: {:.2f}'.format(
                i_episode, score, np.mean(scores)))
    env.close()
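
# A minimal invocation sketch; the device string and checkpoint name are
# illustrative rather than fixed by the function.
test('cpu', 'checkpoint.pth', n_episodes=10)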
def dqn(agent: Agent, params: Params):
    """Deep Q-Learning.

    Params
    ======
        agent (Agent): agent exposing init_episode, act, step, get_reward, get_done
        params (Params): carries n_episodes (maximum number of training episodes),
            max_t (maximum number of timesteps per episode), and the epsilon
            schedule (eps_start, eps_end, eps_decay)
    """
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = params.eps_start             # initialize epsilon
    for i_episode in range(1, params.n_episodes + 1):
        agent.init_episode()
        score = 0  # initialize the score
        for t in range(params.max_t):
            agent.act(eps)  # defined by the agent
            agent.step()    # defined by the agent
            score += agent.get_reward()
            if agent.get_done():
                break
        scores_window.append(score)  # save the most recent score
        scores.append(score)
        eps = max(params.eps_end, params.eps_decay * eps)  # decrease epsilon
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores
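
# For reference, a minimal sketch of the Params container this
# environment-agnostic loop assumes. The field names come straight from the
# attribute accesses above; the defaults are illustrative, not the author's.
from dataclasses import dataclass


@dataclass
class Params:
    n_episodes: int = 2000    # maximum number of training episodes
    max_t: int = 1000         # maximum number of timesteps per episode
    eps_start: float = 1.0    # initial epsilon for epsilon-greedy selection
    eps_end: float = 0.01     # floor on epsilon
    eps_decay: float = 0.995  # per-episode multiplicative decay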
def play_banana(isDoubleDQN=0):
    isDoubleDQN = int(isDoubleDQN)

    # Find the path to the environment; this differs across operating systems.
    env = UnityEnvironment(file_name=r"Banana_Windows_x86_64\Banana.exe")

    # Get the default brain.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment.
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of actions.
    action_size = brain.vector_action_space_size

    # Examine the state space.
    state = env_info.vector_observations[0]
    state_size = len(state)

    # Instantiate the agent.
    agent = Agent(state_size=state_size, action_size=action_size, seed=0,
                  isDoubleDQN=isDoubleDQN)

    # Load the weights from file.
    if isDoubleDQN == 1:
        print("Using Double DQN")
        agent.qnetwork_local.load_state_dict(torch.load('checkpoint_double_agent.pth'))
    else:
        print("Not Using Double DQN")
        agent.qnetwork_local.load_state_dict(torch.load('checkpoint_simple_agent.pth'))

    # Run the agent.
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, eps=0)              # select a greedy action
        env_info = env.step(action)[brain_name]       # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if the episode has finished
        score += reward                               # update the score
        state = next_state                            # roll the state over to the next time step
        if done:                                      # exit the loop when the episode finishes
            break
    print("Score: {}".format(score))
    env.close()
def trainFunction(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01,
                  eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0, priority=True)
    epsilons = []
    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action.astype(np.int32))[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if the episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save the most recent score
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        epsilons.append(eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
    print('\nTraining finished after {:d} episodes!\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_window)))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return scores, epsilons
def testFunction():
    agent = Agent(state_size=37, action_size=4, seed=0)
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    time_steps = 100000
    for t in range(time_steps):
        action = agent.act(state)  # select an action
        env_info = env.step(action.astype(np.int32))[brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if the episode has finished
        score += reward                               # update the score
        state = next_state                            # roll the state over to the next time step
        if done:                                      # exit the loop when the episode finishes
            break
    print("Score: {}".format(score))
def run(agent_source, location, n_episodes):
    source = AgentSource[agent_source.upper()]
    agent = Agent(state_size=8, action_size=4, seed=0,
                  learning_strategy=LearningStrategy.DQN)
    path_to_agent_checkpoint = retrieve_agent_checkpoint(source, location)
    agent.qnetwork_local.load_state_dict(
        torch.load(path_to_agent_checkpoint,
                   map_location=lambda storage, loc: storage))

    env = gym.make('LunarLander-v2')
    env.seed(0)
    print('State shape: ', env.observation_space.shape)
    print('Number of actions: ', env.action_space.n)

    for i in range(n_episodes):
        state = env.reset()
        for j in range(500):
            action = agent.act(state)
            time.sleep(0.1)  # slow the rollout down
            state, reward, done, _ = env.step(action)
            if done:
                break
    env.close()
def testAgent():
    print("Testing the Agent")
    agent = Agent(state_size=state_size, action_size=action_size, seed=0,
                  pretrainedWeightsFile='checkpoint.pth', train=False)

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state)                       # select an action
        env_info = env.step(action.item())[brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if the episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll the state over to the next time step
        if done:                                        # exit the loop when the episode finishes
            break
    print("Score: {}".format(score))
    return score
def train(n_episodes=2000, eps_start=1.0, eps_end=0.025, eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]

    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0                                          # initialize the score
        while True:
            action = agent.act(state, eps)                # select an action
            env_info = env.step(action)[brain_name]       # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if the episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward                               # update the score
            state = next_state                            # roll the state over to the next time step
            if done:                                      # exit the loop when the episode finishes
                break
        scores_window.append(score)          # save the most recent score
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_Nav_V01_13.pth')
            env.close()
            break
    return scores
def demo4_LearningPathPlanning(setting):
    n_sample = 100

    # Environment
    env = FireEnvironment(64, 64)

    # Vehicle to generate the observation mask
    vehicle = Vehicle(n_time_windows=512, grid_size=(64, 64),
                      planner_type='Default')

    # Trainer and estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width, env.map_height),
                                         n_state=3, n_obs=3,
                                         encoding_dim=16, gru_hidden_dim=16)

    # DQN agent
    dqn_agent = DQN_Agent(state_size=16, action_size=4,
                          replay_memory_size=1000, batch_size=64, gamma=0.99,
                          learning_rate=0.01, target_tau=0.01, update_rate=1,
                          seed=0)

    # Training data buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    # Training iteration logger
    writer = SummaryWriter()

    # Video writer
    video_writer1 = ImageStreamWriter('LearningPlanner.avi', FPS,
                                      image_size=(1200, 820))

    # Log the concatenated settings as text.
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_new_fire_count = []
    list_action = []

    ### Fill the data buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):
        # Determine the epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect data from the environment ###
        map_visit_mask, img_resized = vehicle.plan_a_trajectory(
            state_est_grid, n_sample, action)
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        ### Update the reinforcement-learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)
        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        ################################
        ### Rendering and video save ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)  # state estimate

        img_top = img_env
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0)  # frame to save

        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save the video frame.
        video_writer1.write_image_frame(img_bayes_uint8)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = np.mean(np.array(list_new_fire_count))
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)
            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            writer.add_scalar('action_count/0', action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1', action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2', action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3', action_3_count / len(list_action), i)
            list_action = []

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print('losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f'
                  % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d'
                  % (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = setting['name']
            dyn_autoencoder.save_the_model(i, f_name)
            dqn_agent.save_the_model(i, f_name)

    video_writer1.close()
class DQN():
    # Environment assumptions: env.reset(), env.render(), env.step(), env.close()
    def __init__(self, name, state_size, action_size, env, load_net=False):
        self.agent = Agent(name, state_size=state_size, action_size=action_size,
                           seed=0)
        self.env = env
        self.saved_network = name + '_dqn_checkpoint.pth'
        self.load_net = load_net
        if load_net:
            print('Loading pretrained network...')
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))
            self.agent.qnetwork_target.load_state_dict(
                torch.load(self.saved_network))
            print('Loaded.')

    def train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01,
              eps_decay=0.995, score_window_size=100, target_score=13.0,
              save=True, verbose=True):
        """Deep Q-Learning.

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []  # scores from each episode
        scores_window = deque(maxlen=score_window_size)  # last score_window_size scores
        eps = eps_start      # initialize epsilon
        saved_at_13 = False  # whether the intermediate checkpoint has been written
        for i_episode in range(1, n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)          # save the most recent score
            scores.append(score)
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            avg_score = np.mean(scores_window)
            # Save an intermediate checkpoint the first time the average crosses 13.
            if avg_score > 13.0 and not saved_at_13 and not self.load_net:
                torch.save(self.agent.qnetwork_local.state_dict(),
                           self.saved_network)
                np.save('scores13_0824.npy', np.array(scores))
                saved_at_13 = True
            if avg_score >= target_score and i_episode > 100:
                if verbose:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                          .format(i_episode, np.mean(scores_window)))
                self.solved = True
                if save:
                    torch.save(self.agent.qnetwork_local.state_dict(),
                               self.saved_network)
                break
            if verbose:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)), end="")
                if i_episode % 100 == 0:
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores_window)))
        if save:
            torch.save(self.agent.qnetwork_local.state_dict(), self.saved_network)
        return scores

    def play(self, trials=3, steps=200, load=False):
        if load:
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))
        for i in range(trials):
            total_reward = 0
            print('Start Trial...')
            state = self.env.reset()
            for j in range(steps):
                action = self.agent.act(state)
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if reward != 0:
                    print("Current Reward:", reward, "Total Reward:", total_reward)
                if done:
                    print('Done.')
                    break
        self.env.close()
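
# A minimal usage sketch under the class's stated env assumptions
# (reset/render/step/close); the gym environment, sizes, and target score are
# illustrative.
import gym

env = gym.make('LunarLander-v2')
dqn = DQN('lunar', state_size=8, action_size=4, env=env)
scores = dqn.train(n_episodes=2000, target_score=200.0)  # train until solved
dqn.play(trials=3, steps=1000, load=True)                # replay saved weights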
print('Number of actions: ', env.action_space.n)

# Please refer to the instructions in `Deep_Q_Network.ipynb` if you would like
# to write your own DQN agent. Otherwise, run the code cell below to load the
# solution files.

# In[4]:

from dqn_agent import Agent

agent = Agent(state_size=8, action_size=4, seed=0)

# Watch an untrained agent.
state = env.reset()
for j in range(200):
    action = agent.act(state)
    env.render()
    state, reward, done, _ = env.step(action)
    if done:
        break

env.close()

# ### 3. Train the Agent with DQN
#
# Run the code cell below to train the agent from scratch. You are welcome to
# amend the supplied values of the parameters in the function, to try to see if
# you can get better performance!
#
# Alternatively, you can skip to the next step below (**4. Watch a Smart Agent!**),
# to load the saved model weights from a pre-trained agent.

# In[5]:
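
# (A sketch, not the notebook's original In[5] cell: that cell defines a
# dqn(...) training loop like the ones elsewhere in this file. It assumes the
# `agent` and `env` created above; the hyperparameters are illustrative.)
from collections import deque

import numpy as np
import torch


def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)
        if np.mean(scores_window) >= 200.0:  # LunarLander-v2 counts as solved at 200
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores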
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
agent = Agent(state_size=brain.vector_observation_space_size,
              action_size=brain.vector_action_space_size,
              seed=0)

# Load the trained model weights.
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]             # get the current state
score = 0                                           # initialize the score
eps = 0.0                                           # act greedily during evaluation
while True:
    action = agent.act(state, eps)                # select an action with the trained policy
    env_info = env.step(action)[brain_name]       # send the action to the environment
    next_state = env_info.vector_observations[0]  # get the next state
    reward = env_info.rewards[0]                  # get the reward
    done = env_info.local_done[0]                 # see if the episode has finished
    score += reward                               # update the score
    state = next_state                            # roll the state over to the next time step
    if done:                                      # exit the loop when the episode finishes
        break

print(f"Score: {score}")

# When finished testing, close the environment.
env.close()
def run(experiment_name, num_iterations, learning_rate, buffer_size, batch_size,
        gamma, epsilon, epsilon_decay, epsilon_min, stack_size, device, is_ddqn,
        evaluation_rate, log_directory):
    episodic_accum = 0
    episodic_rewards = []
    iteration_rewards = []
    episode = 1
    agent = Agent(env=env, state_space=state_space, action_space=action_space,
                  learning_rate=learning_rate, buffer_size=buffer_size,
                  batch_size=batch_size, gamma=gamma, device=device,
                  in_channels=stack_size, is_ddqn=is_ddqn)

    # Initialize the log directory for TensorBoard.
    if not os.path.exists(log_directory):
        os.makedirs(log_directory)
    tb_writer = SummaryWriter('{}/{}'.format(log_directory, experiment_name))

    frame_count = 0
    stop = False
    prev_iteration = None
    while agent.num_train_updates < num_iterations + 1 and not stop:
        state = env.reset()
        done = False
        # Current state plus the (stack_size - 1) previous states.
        state_frames = deque(maxlen=stack_size)
        while not done:
            frame_count += 1
            _state = preprocess_state(state)
            state = torch.from_numpy(_state).float()
            # On the first frame, fill the stack with copies of the same state.
            if len(state_frames) < stack_size:
                for i in range(stack_size):
                    state_frames.append(state)
            else:
                state_frames.append(state)
            state_stack = torch.stack(list(state_frames)).unsqueeze(dim=0)
            action = agent.act(state_stack, epsilon)
            next_state, reward, done, info = env.step(action)
            _next_state = preprocess_state(next_state)
            _next_state = torch.from_numpy(_next_state).float()
            agent.step(state_frames.__copy__(), action, reward, _next_state, done)
            state = next_state
            episodic_accum += reward
            iteration_rewards.append(reward)
            if agent.num_train_updates > 0:
                # Decay epsilon every `evaluation_rate` training updates
                # (based on the paper).
                if (agent.num_train_updates % evaluation_rate == 0
                        and prev_iteration != agent.num_train_updates):
                    epsilon = max(epsilon_min, epsilon * epsilon_decay)
                    prev_iteration = agent.num_train_updates
                if agent.num_train_updates > num_iterations:
                    stop = True
        episode += 1
        episodic_rewards.append(episodic_accum)
        episodic_accum = 0.
        if episode % 100 == 0 and len(episodic_rewards) > 20:
            tb_writer.add_scalar('Episode Accum score',
                                 np.mean(episodic_rewards[-20:]), episode)
            print('episode_num:{}\tepisode_score:{}\tepsilon:{}\tmemory_size:{}'.format(
                episode, np.mean(episodic_rewards[-20:]), epsilon,
                len(agent.memory)))
            torch.save(agent.QNetwork_local.state_dict(),
                       '{}_checkpoint.pth'.format(experiment_name))
    return episodic_rewards
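
# A tiny standalone sketch of the frame-stack bookkeeping used above: a deque
# with maxlen evicts the oldest frame automatically, and the first observation
# is replicated to fill the stack. The 84x84 frame size is illustrative.
from collections import deque

import torch

stack_size = 4
state_frames = deque(maxlen=stack_size)
frame = torch.zeros(84, 84)              # placeholder preprocessed frame
for _ in range(stack_size):              # first frame: replicate to fill
    state_frames.append(frame)
state_frames.append(torch.ones(84, 84))  # later frames push out the oldest
state_stack = torch.stack(list(state_frames)).unsqueeze(dim=0)
print(state_stack.shape)                 # torch.Size([1, 4, 84, 84])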
def dqn(n_episodes=4000, max_t=3000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    agent = Agent(state_size=3, action_size=8, seed=0)
    start_pos = (200, 600)
    end_pos = (800, 375)
    env = environment(MAP, start_pos, end_pos)

    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state, _, _ = env.reset(start_pos, end_pos)
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save the most recent score
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if i_episode % 200 == 0:
            torch.save(agent.qnetwork_local.state_dict(),
                       'checkpoint' + str(i_episode) + '.pth')
    return scores
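
# Sanity check on the epsilon schedule these functions share: with
# eps_start = 1.0 and eps_decay = 0.995, epsilon after episode k is
# max(eps_end, 0.995 ** k), so the 0.01 floor is reached after
# ceil(ln 0.01 / ln 0.995) = 919 episodes. A minimal standalone sketch:
import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995
episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)  # 919; epsilon stays at the floor afterwards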
state = env_info.vector_observations[0]

# Set the initial episode score to zero.
score = 0

# Run the episode training loop:
# at each step, take an epsilon-greedy action as a function of the current
# state observations, then use the resulting environment state (next_state)
# and the reward received to update the agent's network.
# If the environment episode is done, exit the loop; otherwise repeat until
# done == True.
converted_action_size = brain.vector_action_space_size
converted_agent_num = len(env_info.agents)
while True:
    # Determine the epsilon-greedy action from the current state.
    action = agent.act(state, epsilon)
    # Map the discrete action index onto the environment's action vector.
    if round(action) == 0:
        converted_action = np.array([[1, 0, 0, 0]])  # forward
    elif round(action) == 1:
        converted_action = np.array([[2, 0, 0, 0]])  # backward
    elif round(action) == 2:
        converted_action = np.array([[0, 0, 1, 0]])  # counterclockwise
# Examine the state space.
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

agent = Agent(state_size=state_size, action_size=action_size)
agent.qnetwork_local.load_state_dict(torch.load(dirpath + "/checkpoint.pth"))

max_t = 1000
for i in range(10):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]
    score = 0
    for t in range(max_t):
        action = agent.act(state, 0.01)
        # Send the action to the environment.
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if the episode has finished
        score += reward
        state = next_state
        if done:
            break
    print(score)

env.close()
stockData = list(df['收盘'])  # closing-price column
l = len(stockData) - 1
window_size = 10
state = getState(stockData, 0, window_size + 1)

agent.inventory = []
action_list = []
pos_list = []
pos_old = 0
total_share = 0
cost = 0
money_initial = 10000
money = money_initial

for t in range(l):
    action = agent.act(state, eps=0, is_eval=True)
    next_state = getState(stockData, t + 1, STATE_SIZE + 1)
    if action == 1:    # buy: raise the target position by 20%, capped at 100%
        pos_new = min(pos_old + 0.2, 1)
        total_share += money * (pos_new - pos_old) / stockData[t]
    elif action == 2:  # sell: lower the target position by 20%, floored at 0%
        pos_new = max(pos_old - 0.2, 0)
        total_share += money * (pos_new - pos_old) / stockData[t]
    else:              # hold: keep the current position
        pos_new = pos_old
    money = money_calculate(money, total_share, stockData[t], pos_new)
def run(self, run_id=1, n_episodes=2000, max_t=1000, eps_start=1.0,
        eps_end=0.01, eps_decay=0.995, lr=5e-4, use_double_dqn=False,
        use_soft_update=True):
    start = time.time()
    agent = Agent(state_size=37, action_size=4, seed=0, lr=lr,
                  use_double_dqn=use_double_dqn, use_soft_update=use_soft_update)

    scores = []                        # scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        # Reset the environment.
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        # Get the current state.
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            # Send the action to the environment.
            env_info = self.env.step(action)[self.brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if the episode has finished
            # Store the experience and learn from a sampled batch.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        # Save the most recent score.
        scores_window.append(score)
        scores.append(score)
        # Decrease epsilon.
        eps = max(eps_end, eps_decay * eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 14.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            end = time.time()
            elapsed = end - start
            print("\nTime taken to solve: {:.2f} minutes".format(elapsed / 60.0))
            run_dir = "results/{}".format(run_id)
            os.mkdir(run_dir)
            torch.save(agent.qnetwork_local.state_dict(),
                       "{}/checkpoint.pth".format(run_dir))
            break
    return scores
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'weights.pth')
            break
    return scores


scores = dqn()

# Plot the scores.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# Load the weights from file.
agent.qnetwork_local.load_state_dict(torch.load('weights.pth'))

for i in range(3):
    state = env.reset()
    for j in range(200):
        action = agent.act(state)
        env.render()
        state, reward, done, _ = env.step(action)
        if done:
            break

env.close()
def objective(trial):
    file = pd.ExcelFile(r'sasd_a2c.xlsx')
    state_index_oh = file.parse('state_index')
    MaxEpisodes = 2000
    Env = env()
    EPSILON = 1
    Total_Reward = []
    Avg_Rewards = []

    # Hyperparameters suggested by Optuna.
    fc1_dims = trial.suggest_categorical('fc1_dims', [15, 20, 30])
    lr = trial.suggest_uniform("lr", 5e-6, 1e-4)
    gamma = trial.suggest_categorical("gamma", [0.97, 0.98, 0.99])
    lr_ns = trial.suggest_uniform("lr_ns", 1e-4, 1e-2)
    lr_r = trial.suggest_uniform("lr_r", 1e-4, 5e-3)
    lr_d = trial.suggest_uniform("lr_d", 1e-4, 1e-3)

    agent1 = Agent(state_size=9, action_size=10, fc1_dims=fc1_dims, lr=lr,
                   batch_size=64, buffer_size=100000, gamma=gamma, tau=0.002,
                   lr_ns=lr_ns, lr_r=lr_r, lr_d=lr_d)
    agent2 = Agent(state_size=9, action_size=10, fc1_dims=fc1_dims, lr=lr,
                   batch_size=64, buffer_size=100000, gamma=gamma, tau=0.002,
                   lr_ns=lr_ns, lr_r=lr_r, lr_d=lr_d)

    writer = SummaryWriter()
    writer.add_graph(
        agent1.q_network,
        torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())
    writer.add_graph(
        agent2.q_network,
        torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())
    writer.close()

    agent1.memory.buffer_reset()
    agent2.memory.buffer_reset()

    for ep in range(MaxEpisodes):
        state = Env.reset()
        done = False
        stepscounter = 0
        ep_reward = 0
        state_OH = state_index_oh.iloc[state.int().numpy(), 2:].values.reshape(-1)
        while not done:
            stepscounter += 1

            # Agent 1 acts first.
            action1 = agent1.act(state_OH, EPSILON)
            action = action1 * 10
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[new_state.int().numpy(),
                                               2:].values.reshape(-1)
            # Rewards are rescaled from [-50, 50] to [-1, 1].
            agent1.memory.store_transition(
                state_OH, action1, 2 * ((reward.item() + 50) / 100) - 1,
                new_state, done)
            state_OH = new_state_OH
            if done:
                break

            # Agent 2 acts second; skip duplicate actions.
            action2 = agent2.act(state_OH, EPSILON)
            if action2 == action1:
                continue
            action = action2
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[new_state.int().numpy(),
                                               2:].values.reshape(-1)
            agent2.memory.store_transition(
                state_OH, action2, 2 * ((reward.item() + 50) / 100) - 1,
                new_state, done)
            state_OH = new_state_OH

        agent1.learn()
        agent2.learn()
        update_model1 = agent1.train_model(1)
        update_model2 = agent2.train_model(2)
        for _ in range(5):
            agent1.sim_learn(1)
            agent2.sim_learn(2)

        output1 = obs[0].item()
        output2 = obs[1].item()
        input1 = obs[2].item()
        input2 = obs[3].item()

        EPSILON = epsilon_decay(eps=EPSILON)
        Total_Reward.append(ep_reward)
        avg_reward = np.mean(Total_Reward[-100:])
        Avg_Rewards.append(avg_reward)

        totalresult = ('episode: ' + str(ep + 1)
                       + ' Total_Reward %.2f' % ep_reward
                       + ' Average_Reward %.2f' % avg_reward
                       + ' Steps ' + str(stepscounter)
                       + ' Model Training Data: '
                       + str(update_model1) + str(update_model2))
        print(f'\r{totalresult}', end='\r')

        writer.add_scalar('reward/episode', ep_reward, ep)
        writer.add_scalar('Avgreward/episode', avg_reward, ep)
        writer.add_scalar('output1/episode', output1, ep)
        writer.add_scalar('output2/episode', output2, ep)
        writer.add_scalar('input1/episode', input1, ep)
        writer.add_scalar('input2/episode', input2, ep)

        # Report to Optuna and prune unpromising trials early.
        trial.report(avg_reward, ep)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return avg_reward
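
# A minimal sketch of how this objective might be driven with the standard
# Optuna API; the pruner choice and trial count are illustrative. MedianPruner
# pairs with the trial.report()/should_prune() calls inside objective().
import optuna

study = optuna.create_study(direction='maximize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print('Best value:', study.best_value)
print('Best params:', study.best_params)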