def create_environment():
    """Build the Mario level environment with the standard wrapper stack.

    Applies the discrete joypad action space and the project's frame
    wrapper, then returns the fully wrapped environment.
    """
    env = gym_super_mario_bros.make(LEVEL_NAME)
    env = JoypadSpace(env, ACTION_SPACE)
    return wrapper(env, FRAME_DIM)
def record_one_episode(agent, episode):
    """Play one episode with *agent*, record a video of it, and return the reward.

    Args:
        agent: policy object exposing ``get_action(state)``.
        episode: episode index, used to name the video file under ``./videos``.

    Returns:
        The total (undiscounted) reward accumulated over the episode.
    """
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './videos/video-episode-{0:05d}'.format(episode), force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM, FRAME_SKIP)

    state = lazy_frame_to_tensor(tmp_env.reset())
    total_reward = 0
    while True:
        action = agent.get_action(state)
        next_state, reward, done, info = tmp_env.step(action)
        # Accumulate BEFORE the done-check; the original dropped the final
        # step's reward by breaking out of the loop first.
        total_reward += reward
        if done:
            break
        state = lazy_frame_to_tensor(next_state)
    # Close the env so the Monitor wrapper flushes the recording to disk.
    tmp_env.close()
    return total_reward
def record_one_episode(agent):
    """Play one episode with *agent*, record it to ``./video``, return the reward.

    Args:
        agent: policy object exposing ``select_action_based_on_state(state)``,
            which returns ``(action, extra)``.

    Returns:
        The total (undiscounted) reward accumulated over the episode.
    """
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './video', force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM)

    state = lazy_frame_to_tensor(tmp_env.reset())
    total_reward = 0
    while True:
        action, _ = agent.select_action_based_on_state(state)
        next_state, reward, done, info = tmp_env.step(action)
        # Accumulate BEFORE the done-check; the original dropped the final
        # step's reward by breaking out of the loop first.
        total_reward += reward
        if done:
            break
        state = lazy_frame_to_tensor(next_state)
    # Close the env so the Monitor wrapper flushes the recording to disk.
    tmp_env.close()
    return total_reward
"""Script setup: build the Mario environment and a double-DQN agent.

Fix: ``time.time()`` is used below but ``time`` was never imported,
which raised a NameError at runtime.
"""
import time

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace

from agent import DQNAgent
from utils import get_args
from wrappers import wrapper

# Command-line arguments (environment id, etc.)
arg = get_args()

# Build env (first level, right-only action set)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)

# Parameters: 84x84 frames, stacked 4 deep; action count taken from the env
states = (84, 84, 4)
actions = env.action_space.n

# Agent (double Q-learning with a 100k-transition replay memory)
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
episodes = 101
rewards = []

# Timing: wall-clock start and global step counter for throughput stats
start = time.time()
step = 0
import wrappers

# Identifier of the agent handed to the project's wrapper factory.
_AGENT_ID = "mind2"
act = wrappers.wrapper(_AGENT_ID)
def train_model(parameters):
    """Train (or evaluate) a double-DQN Mario agent.

    Args:
        parameters: dict with at least the keys ``'environment'`` (gym env id),
            ``'train'`` (bool: train vs. load-weights-and-watch),
            ``'episodes_to_play'`` (int) and ``'render'`` (bool).

    Side effects: writes TensorBoard scalars under ``logs/mario/``, draws
    reward/position plots and saves agent weights after training.
    """
    # Initialization of environment and agent
    env = gym_super_mario_bros.make(parameters['environment'])
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)
    states = (84, 84, 4)  # 84x84 frames stacked 4 deep
    actions = env.action_space.n
    agent = DDQNagent(parameters, states, actions)
    if parameters['train']:
        # TensorBoard: one timestamped run directory per training session
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/mario/' + current_time + '/10k'
        summary_writer = tf.summary.create_file_writer(log_dir)
        summary_writer.set_as_default()
        maxXpos = 0  # maximum x position the agent has reached
        max_reward = 0  # maximum episode reward seen so far
        start_time = time.time()  # training start time (for elapsed-time prints)
        # Initialization of variables for the plots
        graph_reward = np.zeros(parameters['episodes_to_play'])  # per-episode reward
        graph_pos = np.zeros(parameters['episodes_to_play'])  # per-episode final x position
        graph_mean_reward = np.zeros(parameters['episodes_to_play'])  # running mean reward
        episodes = parameters['episodes_to_play']  # number of episodes to train
        rewards = []  # per-episode mean-reward-per-step history
        start = time.time()  # window start for frames/sec calculation
        step = 0  # agent step count at the start of the current window
        # Learning cycle
        for e in range(episodes):
            # Reset to the default state of the environment
            state = env.reset()
            total_reward = 0  # reward gained during the current episode
            # NOTE(review): 'iter' shadows the builtin — consider renaming.
            iter = 0
            while True:
                # Select an action (epsilon-greedy inside agent.run)
                action = agent.run(state)
                # Apply the action to the environment
                next_state, reward, done, info = env.step(action)
                # Store the transition in replay memory
                agent.update_memory(experience=(state, next_state, action, reward, done))
                # One learning step on the replay memory
                agent.learn()
                # Sum of rewards for every action
                total_reward += reward
                # Advance to the next state
                state = next_state
                iter += 1
                # Optional on-screen rendering
                if parameters['render']:
                    env.render()
                # Episode finish condition: death/timeout or flag reached
                if done or info['flag_get']:
                    break
            # Record mean reward per step for the plots
            rewards.append(total_reward / iter)
            # Update running maxima
            if maxXpos < info['x_pos']:
                maxXpos = info['x_pos']
            if max_reward < total_reward:
                max_reward = total_reward
            # NOTE(review): '== True' is redundant — 'if info["flag_get"]:' suffices.
            if info['flag_get'] == True:
                agent.flag_reached = agent.flag_reached + 1
            # Epsilon decay (linear, once per episode)
            if agent.eps >= 0.0:
                agent.eps = agent.eps - agent.eps_decay
            # Update variables for the plots
            graph_reward[e] = total_reward
            graph_pos[e] = info['x_pos']
            graph_mean_reward[e] = np.mean(graph_reward)
            # TensorBoard scalars, one point per episode
            tf.summary.scalar("Rewards", total_reward, step=e)
            tf.summary.scalar("Position", info['x_pos'], step=e)
            tf.summary.scalar("Mean reward", np.mean(graph_reward), step=e)
            tf.summary.scalar("Flags", agent.flag_reached, step=e)
            tf.summary.scalar("Loss", agent.loss, step=e)
            # Console information
            print("Episode reward: " + str(total_reward) + ' - Pos: ' + str(info['x_pos']))
            # Periodic progress report every 10 episodes
            if e % 10 == 0:
                end = time.time()
                print('Flags reached: ' + str(agent.flag_reached) + ' - Max reward: ' + str(max_reward))
                print('Episode {e} - '
                      'Frame {f} - '
                      'Frames/sec {fs} - '
                      'Epsilon {eps} - '
                      'Mean Reward {r} - '
                      'Time {t} sec - '
                      'Max pos {pos}'.format(e=e,
                                             f=agent.step,
                                             fs=np.round((agent.step - step) / (time.time() - start)),
                                             eps=np.round(agent.eps, 4),
                                             r=np.mean(rewards[-100:]),
                                             t=round(end - start_time),
                                             pos=maxXpos))
                # Reset the frames/sec measurement window
                start = time.time()
                step = agent.step
        # After learning: draw plots and save weights
        draw_graph(graph_reward, 'Rewards')
        draw_graph(graph_pos, 'Position')
        draw_graph(graph_mean_reward, 'Mean reward')
        agent.save_weights()
        env.close()
    else:
        # Evaluation mode: load saved weights and watch the agent play
        print('Weights file path (hdf5): ')
        weights_name = input()
        try:
            agent.model_target.load_weights(weights_name)
            agent.model_test(env)
        # NOTE(review): bare 'except:' swallows every error (including
        # KeyboardInterrupt); narrow it to the load failure it intends to catch.
        except:
            print("Weights with this name or on this path not found")
        env.close()
current_states.append(current_state) target_predictions.append(target_prediction) # train the policy model based on the predictions of the target model policy_model.fit(np.asarray(current_states).squeeze(), np.asarray(target_predictions).squeeze(), batch_size=BATCH_SIZE, verbose=0) env = gym_super_mario_bros.make("SuperMarioBros-v0") env = JoypadSpace( env, ACTION_SPACE ) # An environment wrapper to convert binary to discrete action space. # apply the wrapper env = wrapper(env, FRAME_DIM) # create the network policy_net = DQNetwork(stacked_frame_dim=FRAME_DIM, num_actions=env.action_space.n) target_net = DQNetwork(stacked_frame_dim=FRAME_DIM, num_actions=env.action_space.n) # create the replay memory replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY) # play the episodes current_exploration = EXPLORATION_MAX total_steps = 0 reward_history = [] mean_reward_history = []
import wrappers

# Identifier of the agent handed to the project's wrapper factory.
_AGENT_ID = "mind1"
act = wrappers.wrapper(_AGENT_ID)
import wrappers

# Identifier of the agent handed to the project's wrapper factory.
_AGENT_ID = "ben"
act = wrappers.wrapper(_AGENT_ID)