from collections import deque

import gym

# DQNAgent is imported from elsewhere in this project.


def main(num_episodes, render=False):
    # Initialize the gym environment and the agent.
    # env = gym.make('SpaceInvaders-v0')
    env = gym.make('Breakout-v0')
    state = env.reset()

    # The agent sees a stack of the 5 most recent frames, so the channel
    # dimension of the observation shape is multiplied by 5.
    state_shape = list(state.shape)
    state_shape[-1] = state_shape[-1] * 5
    agent = DQNAgent(state_shape, env.action_space.n)
    states = deque(maxlen=5)
    max_train_time = 800

    # Iterate the game.
    for e in range(num_episodes):
        # Reset state at the beginning of each game and fill the frame
        # stack with copies of the initial observation.
        state = env.reset()
        for i in range(5):
            states.appendleft(state)

        # time_t represents each frame of the game.
        num_random = 0
        total_reward = 0.
        for time_t in range(max_train_time):
            # Turn this on if you want to render.
            if render:
                env.render()

            # Decide action.
            action = agent.act(states)
            if agent.acted_randomly:
                num_random += 1

            # Advance the game to the next frame based on the action.
            next_state, reward, done, info = env.step(action)
            total_reward += reward

            # Remember the previous state, action, reward, and done flag.
            agent.remember(states.copy(), action, reward, next_state, done)

            # Make next_state the new current state for the next frame.
            states.appendleft(next_state)

            # done becomes True when the game ends.
            if done:
                # Print the score and break out of the loop.
                rand_perc = num_random / float(time_t + 1) * 100.  # Percentage of random actions.
                print("episode: {}/{}, training_time: {}, summed_reward: {}, "
                      "random_actions: {}%, eps: {}"
                      .format(e, num_episodes, time_t, total_reward,
                              rand_perc, agent.epsilon))
                # Train the agent with the experience of the episode.
                agent.replay(min(100, time_t))
                break

        # print("epsilon {}".format(agent.epsilon))
        if e % 1000 == 0:
            agent.save("./deep_q_model.h5")
            print("saved model")
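# Illustrative usage (not part of the original script): a minimal CLI entry
# point for main() above. The flag names and defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a DQN agent on Breakout.')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='number of training episodes')
    parser.add_argument('--render', action='store_true',
                        help='render the environment while training')
    args = parser.parse_args()
    main(args.episodes, render=args.render)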
import matplotlib.pyplot as plt
import torch

# run_id, graph, and DQNAgent are defined elsewhere in this project.


def print_progress(agent: DQNAgent, data: dict):
    percent = data['percent']
    progress = '=' * int(percent)
    progress += '>'
    left = ' ' * (100 - percent)
    progress = f'{percent}% [{progress + left}]'

    reward, steps = data['stats']
    mean = round(reward.mean(), 1)
    std = round(reward.std(), 1)
    positive = reward[reward > 0].size
    total = reward.size
    steps = steps.sum()
    losses = data['losses']

    # Periodically write out reward and loss figures.
    if total > 50:
        graph(reward, verbose=True)
        plt.savefig(f'figures/{run_id}_training.png')
    if len(losses) > 10:
        graph(losses.detach().numpy(), xlabel='Replays', ylabel='Loss', window=5)
        plt.savefig(f'figures/{run_id}_losses.png')

    # print(progress + f' μ: {mean}, σ: {std}; +{positive}/{total}, steps: {steps}', end='\r')
    # if percent % 5 != 0:
    #     return

    last100 = reward[-100:]
    last_mean = round(last100.mean(), 2)
    last_std = round(last100.std(), 1)
    verbose = data['verbose']

    # Checkpoint when the recent-average reward clears 200.
    if percent % 2 == 0 and last_mean > 200:
        print(' ' * 100, end='\r')
        if verbose:
            print('Last 100 episodes average over 200! ', end='')
        agent.save(f'{run_id}_{percent}p', str(round(last_mean, 0)))

    # rar = f'rar: {round(data["rar"], 5)}' if verbose else ''
    # Spaces at the end are to clean up the progress bar.
    print(f'Total mean: {mean}, std: {std}; '
          f'Last 100 mean: {last_mean}, std: {last_std}; '
          f'Positive: {positive}/{total} '
          f'Steps: {steps} ',
          # rar,
          " " * 20)

    if verbose:
        if len(losses) > 1:
            mean = round(losses.mean().item(), 3)
            std = round(torch.std(losses).item(), 3)
            print(f'Recent Losses: {losses[-5:]}, mean: {mean}, std: {std}')
    print(progress, end='\r')
import numpy as np

# Method of an experiment/runner class defined elsewhere in this project.


def on_progress(self, agent: DQNAgent, data):
    """
    The agent calls this after each 1% of the total iterations completes.
    This is an opportunity to decide whether it is time to quit early.
    """
    percent: int = data['percent']
    reward, steps = data['stats']
    rar = data['rar']

    if len(reward) >= 100:
        last100 = reward[-100:]
        mean = np.round(last100.mean())
        if mean >= 200:
            print("Successfully completed goal")
            self.success = True
            self.exit_early = True
            agent.end_training_early()
        elif mean >= 50 and percent % 5 == 0:
            print("\nGood performance found, saving checkpoint")
            epoch = int(self.episodes * percent / 100)
            agent.save(f'{self.id}', f'{epoch}_{mean}')

    if self.verbose and percent % 10 == 0:
        # TODO: Print additional info
        print(f"\n{percent}% "
              f"\tTotal reward={round(reward.mean(), 3)} "
              f"steps={steps.sum()} "
              f"rar={round(rar, 3)}")
        # Look at the last several episodes.
        reward = reward[-self.percent_size:]
        print(f"\t\tRecent reward={round(reward.mean(), 3)}, "
              f"max={round(reward.max(), 3)}")

    if self.verbose:
        print(f'{percent}% ... ', end="")
    else:
        progress = '=' * int(percent)
        progress += '>'
        left = ' ' * (100 - percent)
        print(f'{percent}% [{progress + left}]', end='\r')
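# Illustrative sketch (not from the original project): how a training loop
# might drive a progress callback such as on_progress above, using the dict
# keys it reads ('percent', 'stats', 'rar'). run_one_episode is a hypothetical
# helper, and using agent.epsilon as the random-action rate is an assumption;
# print_progress would additionally expect 'losses' and 'verbose' keys.
import numpy as np


def run_with_progress(agent, env, episodes, callback):
    rewards, steps = [], []
    report_every = max(1, episodes // 100)  # fire the callback each ~1%
    for e in range(episodes):
        ep_reward, ep_steps = run_one_episode(agent, env)  # hypothetical helper
        rewards.append(ep_reward)
        steps.append(ep_steps)
        if (e + 1) % report_every == 0:
            callback(agent, {
                'percent': int(100 * (e + 1) / episodes),
                'stats': (np.array(rewards), np.array(steps)),
                'rar': agent.epsilon,  # random-action rate (attribute name assumed)
            })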
        discount=0.99, target_freq=10, verbose=True, print_every=10)
'''
# The lines above close a commented-out (triple-quoted) alternative
# configuration whose opening ''' lies before this excerpt. env, episodes,
# reward_path, agent_path, file_name, and the imports (np, os, live, DQNAgent,
# mountain_car_reward_function, MountainCarIdentityFeature) are defined
# earlier in the file.

agent = DQNAgent(action_set=[0, 1, 2],
                 reward_function=mountain_car_reward_function,
                 feature_extractor=MountainCarIdentityFeature(),
                 hidden_dims=[50, 50],
                 learning_rate=5e-4,
                 buffer_size=50000,
                 batch_size=64,
                 num_batches=100,
                 starts_learning=5000,
                 final_epsilon=0.02,
                 discount=0.99,
                 target_freq=10,
                 verbose=True,
                 print_every=10)

_, _, rewards = live(agent=agent,
                     environment=env,
                     num_episodes=episodes,
                     max_timesteps=200,
                     verbose=True,
                     print_every=50)

np.save(os.path.join(reward_path, file_name), rewards)
agent.save(path=os.path.join(agent_path, file_name + '.pt'))
import functools

import numpy as np
import torch

# ForexEnv, Forex_reward_function, ForexIdentityFeature, DQNAgent, and live
# are imported from this project's modules.

np.random.seed(0)
torch.manual_seed(0)

env = ForexEnv()
agent = DQNAgent(
    action_set=[0, 1, 2],
    # functools.partial with no bound arguments is equivalent to passing
    # Forex_reward_function directly.
    reward_function=functools.partial(Forex_reward_function),
    feature_extractor=ForexIdentityFeature(),
    hidden_dims=[50, 50],
    learning_rate=5e-4,
    buffer_size=5000,
    batch_size=12,
    num_batches=100,
    starts_learning=5000,
    final_epsilon=0.02,
    discount=0.99,
    target_freq=10,
    verbose=False,
    print_every=10)

observation_data, action_data, rewards = live(
    agent=agent,
    environment=env,
    num_episodes=5,
    max_timesteps=5,
    verbose=True,
    print_every=50)

agent.save('./dqn.pt')