import datetime
import math
import sys

import numpy as np
import torch


# A2C variant of the training loop.
def main():
    # Use the ROM given through argv, falling back to the bundled one.
    filename = sys.argv[1] if len(sys.argv) > 1 else './Super_Mario_Land_World.gb'
    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()

    agent = A2C_Agent(discount=0.99, epsilon=0.9, learning_rate=1e-3)
    agent_is_setup = False
    entropy_term = 0
    all_rewards = []
    all_lengths = []
    average_lengths = []

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        print("Epoch {}/{}".format(episode + 1, N_EPOCHS))
        env.reset()
        state = env.obs()
        log_probs = []
        values = []
        rewards = []

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent. The log-prob and entropy must stay
            # attached to the autograd graph for the policy-gradient update,
            # so the call is NOT wrapped in torch.no_grad().
            action, log_prob, entropy, value = agent.get_action(state, TRAINING)
            value = value.detach().numpy()[0, 0]
            new_state, reward, done = env.step(action, steps)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy

            # Set obs to the new state
            state = new_state

            if done or steps == N_STEPS - 1:
                # Bootstrap the return from the critic's value of the last state.
                Qval, _ = agent.model.forward(torch.Tensor(new_state))
                Qval = Qval.detach().numpy()[0, 0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    sys.stdout.write(
                        "episode: {}, reward: {}, total length: {}, average length: {} \n".format(
                            episode, np.sum(rewards), steps, average_lengths[-1]))
                break

        print("Loss :", agent.train(values, rewards, log_probs, Qval, entropy_term))

    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = '{}_{}_{}_{}.h5'.format(date.day, date.month, date.hour, agent.name)
        agent.save_model(model_name)

    env.stop()
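
# The block below is a minimal sketch of the update that `agent.train`
# above is assumed to perform: discounted returns bootstrapped from Qval,
# an advantage-weighted policy loss, a squared-error critic loss, and an
# entropy bonus. The function name `a2c_update`, the `optimizer` argument,
# and the 0.5 / 0.001 loss weights are illustrative assumptions, not the
# project's actual API.
def a2c_update(optimizer, values, rewards, log_probs, Qval, entropy_term,
               discount=0.99):
    # Discounted returns, working backwards from the bootstrap value.
    qvals = np.zeros(len(rewards), dtype=np.float32)
    for t in reversed(range(len(rewards))):
        Qval = rewards[t] + discount * Qval
        qvals[t] = Qval

    values = torch.FloatTensor(values)
    qvals = torch.FloatTensor(qvals)
    log_probs = torch.stack(log_probs)

    # The advantage is a constant for the actor (values were detached in
    # the loop above). That also means the critic term below carries no
    # gradient as written; keeping the value tensors attached to the graph,
    # or recomputing them here, is needed for the critic to actually learn.
    advantage = qvals - values
    actor_loss = (-log_probs * advantage).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    loss = actor_loss + critic_loss + 0.001 * entropy_term

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()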
# DQN variant of the training loop. Note that it shadows the A2C main()
# above if both are kept in the same module.
def main():
    # Use the ROM given through argv, falling back to the bundled one.
    filename = sys.argv[1] if len(sys.argv) > 1 else './Super_Mario_Land_World.gb'
    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()

    agent = DQN_Agent(discount=0.9, epsilon=0.9, learning_rate=1e-5)
    avg_loss = None
    agent_is_setup = False
    # With min_epsilon == max_epsilon, the log-decay schedule at the bottom
    # of the episode loop is effectively constant at 0.001; raise
    # max_epsilon (e.g. to 1.0) to actually anneal exploration.
    min_epsilon = 0.001
    max_epsilon = 0.001

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        env.reset()

        # Bootstrap the 3-frame history by stacking the first frame three times.
        state = torch.Tensor(env.obs())
        old_state = state
        old_old_state = state
        is_a_released = torch.ones(1)
        states = [
            torch.cat((state, old_state, old_old_state), 0).view(3, 16, 20),
            is_a_released,
            env.mario_size
        ]
        episode_reward = 0

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent
            actions = agent.get_action(states, TRAINING)
            new_state, reward, done = env.step(actions)
            # env.print_obs(new_state.numpy().astype(int))

            # Track whether the A button is held, so the network can tell
            # the start of a jump from a held jump.
            if actions[1] == 0:
                is_a_released = torch.zeros(1)
            else:
                is_a_released = torch.ones(1)

            if steps + 1 == N_STEPS:
                done = True
            episode_reward += reward

            # Shift the frame history: new frame plus the two most recent ones.
            new_states = [
                torch.cat((new_state, states[0][0, :, :], states[0][1, :, :]), 0).view(3, 16, 20),
                is_a_released,
                env.mario_size
            ]
            agent.update_replay_memory(states, actions, reward, new_states, done)

            # Train the neural network
            if TRAINING:
                loss = agent.train(done)
                # Exponential moving average of the loss, for logging only.
                if avg_loss is None:
                    avg_loss = loss
                else:
                    avg_loss = 0.99 * avg_loss + 0.01 * loss
            else:
                avg_loss = 0

            states = new_states

            if (steps + 1) % 20 == 0:
                print("\rAverage loss : {:.5f} --".format(avg_loss),
                      "Episode rewards: {} --".format(episode_reward),
                      "epochs {}/{} --".format(episode, N_EPOCHS),
                      "steps {}/{}".format(steps + 1, N_STEPS),
                      end="")
            if done:
                print("\n", env.level_progress_max)
                break

        agent.epsilon = max(min_epsilon,
                            min(max_epsilon, 1.0 - math.log10((episode + 1) / 5)))

    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = '{}_{}_{}_{}.h5'.format(date.day, date.month, date.hour, agent.name)
        agent.save_model(model_name)

    env.stop()
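
# The block below is a minimal sketch of the replay-memory update behind
# `agent.train(done)`: sample a minibatch, compute the bootstrapped target
# r + discount * max_a' Q_target(s', a'), and regress Q(s, a) towards it.
# `policy_net`, `target_net`, `replay_memory`, and the batch size are
# illustrative assumptions, and the state/action are simplified to a single
# frame stack and one discrete action index; the project's composite state
# (frame stack, A-button flag, Mario size) would need a matching network head.
import random

import torch.nn.functional as F


def dqn_update(policy_net, target_net, optimizer, replay_memory,
               batch_size=32, discount=0.9):
    if len(replay_memory) < batch_size:
        return 0.0  # Not enough transitions to train on yet.

    batch = random.sample(replay_memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    states = torch.stack(states)          # (B, 3, 16, 20)
    next_states = torch.stack(next_states)
    actions = torch.tensor(actions, dtype=torch.int64)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target uses a frozen copy of the network and zeroes the bootstrap
    # term on terminal transitions.
    with torch.no_grad():
        next_q = target_net(next_states).max(1).values
    target = rewards + discount * next_q * (1.0 - dones)

    loss = F.smooth_l1_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()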