def play_hill_climber(self, env, steps):
    _NOP = 0
    steps = 100  # NOTE: overrides the `steps` argument
    env = JoypadSpace(env, self.actions)
    change_button_interval = 6  # change the held action every 6 steps
    actions_in_sequence = int(steps / change_button_interval) + 1
    best_action_sequence = [
        self.sample_no_start(env) for _ in range(actions_in_sequence)
    ]
    env.reset()
    best_score = self.evaluate_action_sequence(
        env, steps, change_button_interval, best_action_sequence)
    while True:  # climbs until interrupted; the close() below is never reached
        env.reset()
        new_action_sequence = self.get_modified_actions(
            env, best_action_sequence, 0.2)
        new_score = self.evaluate_action_sequence(
            env, steps, change_button_interval, new_action_sequence)
        print('eval seq:', new_action_sequence)
        print('got score:', new_score, 'vs best score:', best_score)
        if new_score > best_score:
            best_score, best_action_sequence = new_score, new_action_sequence
    env.close()
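# The hill climber above relies on `evaluate_action_sequence`, which is not
# shown. A minimal sketch of such a scorer (an assumption, not the author's
# implementation -- written as a free function rather than a method): it
# replays the sequence, holding each action for `change_button_interval`
# frames, and returns the cumulative reward as the score.
def evaluate_action_sequence(env, steps, change_button_interval,
                             action_sequence):
    total_reward = 0.0
    for t in range(steps):
        # pick the action scheduled for this slice of the episode
        action = action_sequence[t // change_button_interval]
        _, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward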
def main(): """ Main entry point function for program. """ env = gym_super_mario_bros.make('SuperMarioBros-v0') env = JoypadSpace(env, RIGHT_ONLY) action_size = len(RIGHT_ONLY) cdqn = CDQN(action_size, memory_size=10000, image_shape=(45, 64, 1)) batch_size = 1024 games = 10000 skip = 100 beaten = False for game in range(games): print("Game: {}".format(game + 1), end=" ") done = True total_reward = 0 for step in range(8000): # Preprocess first image if done: state = env.reset() state = preprocess_image(state)[..., tf.newaxis] # Play move action = cdqn.act(state) next_state, reward, done, info = env.step(action) total_reward += reward # Remember move next_state = preprocess_image(next_state)[..., tf.newaxis] cdqn.remember(state, action, total_reward, next_state, done) state = next_state # Render game env.render() if done: break # Train when there are enough examples in memory #if len(cdqn.memory) >= batch_size and step % skip == 0: print("Reward: {}".format(total_reward)) for e in range(5): print('Epoch {}'.format(e + 1)) cdqn.experience_replay(batch_size) if game % 10 == 0: cdqn.update_target_model() print("Reward: {}".format(total_reward)) tf.saved_model.save(cdqn.network, "model.sav") env.close()
def fitness_func(self, genome, config, o):
    # create the environment
    game = gym_super_mario_bros.make('SuperMarioBros-v2')
    env = JoypadSpace(game, SIMPLE_MOVEMENT)
    try:
        # reset environment and create network from config file
        state = env.reset()
        neural_net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)
        # frame count
        i = 0
        # starting mario position
        start_mario_distance = 40
        done = False
        # get shape of pixels
        inx, iny, inc = env.observation_space.shape
        inx, iny = int(inx / 8), int(iny / 8)
        while not done:
            # env.render()  # uncomment this to see mario play
            # resize image array and convert to grayscale
            state = cv2.resize(state, (inx, iny))
            state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
            state = np.reshape(state, (inx, iny))
            # flatten the array so the network can consume it
            state = state.flatten()
            # feed the state through the network and take the max output
            output = neural_net.activate(state)
            action = output.index(max(output))
            # do the action chosen by the net
            observation, reward, done, info = env.step(action)
            state = observation
            # increase frame count
            i += 1
            # every 50 frames, check whether Mario has moved; if he hasn't,
            # break from the loop to restart
            if i % 50 == 0:
                if start_mario_distance == info['x_pos']:
                    break
                else:
                    start_mario_distance = info['x_pos']
        # give a negative reward if Mario didn't move, else reward the distance he moved
        fitness = -1 if info['x_pos'] <= 40 else info['x_pos']
        # if at the end of the level, dump the current genome to file
        if fitness >= 4000:
            pickle.dump(genome, open("winning_genome.pkl", "wb"))
        # put the current fitness into the queue
        o.put(fitness)
        env.close()
    except KeyboardInterrupt:
        env.close()
        sys.exit()
def play(self):
    env = gym_tetris.make('TetrisA-v0')
    env = JoypadSpace(env, MOVEMENT)
    state = env.reset()
    model = self.global_model
    model_path = os.path.join(self.save_dir, 'model_{}.h5'.format('Tetris'))
    print('Loading model from: {}'.format(model_path))
    model.load_weights(model_path)
    done = False
    step_counter = 0
    reward_sum = 0
    pieza_colocada = True
    informacion = env.get_info()
    antiguo_statistics = informacion['statistics']
    state = [0, 0, 0, 0]
    while not done:
        env.render()
        if pieza_colocada:
            pieza_colocada = False
            pos = 5
            giro = 0
            u = -1
            state = [state]
            policy, value = model(
                tf.convert_to_tensor(state, dtype=tf.float32))
            policy = tf.nn.softmax(policy)
            action = np.argmax(policy)
            pos_objetivo = action % 10
            giro_objetivo = action // 10
        # guard giro_objetivo == 0: the original `giro % giro_objetivo`
        # raised ZeroDivisionError whenever no rotation was requested
        if giro_objetivo != 0 and (giro % giro_objetivo) != 0 and not done:
            state, reward, done, info = env.step(1)
            accion = 0
            giro = giro + 1
        elif pos > pos_objetivo and not done:
            state, reward, done, info = env.step(6)
            pos = pos - 1
            accion = 0
        elif pos < pos_objetivo and not done:
            state, reward, done, info = env.step(3)
            pos = pos + 1
            accion = 0
        elif not done and not pieza_colocada:
            state, reward, done, info = env.step(9)
            accion = 9
        else:
            accion = 0
        if not done:
            state, reward, done, info = env.step(accion)
            env.render()
            informacion = env.get_info()
            if antiguo_statistics != informacion['statistics']:
                # the piece statistics change when a new piece spawns, which
                # signals that the previous piece was placed (this flag was
                # never re-set in the original, so only the first piece was
                # ever steered by the policy)
                antiguo_statistics = informacion['statistics']
                pieza_colocada = True
                step_counter += 1
    env.close()
def main():
    stats_gen = StatsGenerator(1, 'results/final_tats.txt')
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    alpha, gamma, epsilon = 0.1, 1, 0.3
    marioQLearner = MarioQLearner(env, alpha, gamma, epsilon, stats_gen)
    marioQLearner.action()
    env.close()
def run_player(self, member):
    env = gym_super_mario_bros.make(self.env)
    env = JoypadSpace(env, self.actions)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    player = MarioPlayer(self.num_of_actions, member.genes)
    if self.record:
        rec_output_path = os.path.join(
            self.current_gen_output_dir, "vid",
            "{name}.mp4".format(name=member.get_name()))
        rec = monitor.video_recorder.VideoRecorder(env, path=rec_output_path)
    state = env.reset()
    done = False
    last_x_pos = 0
    same_x_pos_count = 0
    for step in range(self.steps_scale):
        if done:
            break
        action = player.act(state)
        state, reward, done, info = env.step(action)
        if self.record:
            rec.capture_frame()
        if self.render:
            env.render()
        player.update_info(info)
        player.update_reward(reward)
        if last_x_pos == info['x_pos']:
            same_x_pos_count += 1
        else:
            same_x_pos_count = 0
            last_x_pos = info['x_pos']
        if same_x_pos_count > self.standing_steps_limit:
            # end the run if the player doesn't advance
            done = True
        if not self.allow_death and info['life'] < INITIAL_LIFE:
            # death would just repeat, so there is no point trying further
            done = True
        if info['flag_get']:
            # reaching the flag ends the run
            done = True
    if self.record:
        rec.close()
    env.close()
    member.set_fitness_score(player.calculate_fitness())
    outcome = player.get_run_info()
    outcome['generation'] = self.generation
    outcome['index'] = member.get_name()
    return outcome
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    for e in range(100):
        state = env.reset()
        # reset the flag each episode; the original set it once outside the
        # loop, so every episode after the first was skipped
        done = False
        while not done:
            env.render()
            state, reward, done, info = env.step(env.action_space.sample())
    env.close()
def contra_game_render():
    env = gym.make('Contra-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    print("actions", env.action_space)
    print("observation_space ", env.observation_space.shape)
    done = False
    env.reset()
    for step in range(5000):
        if done:
            print("Over")
            break
        state, reward, done, info = env.step(env.action_space.sample())
        env.render()
    env.close()
def main():
    env = gym.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    obs_shape = env.observation_space.shape
    obs_size = reduce(operator.mul, obs_shape, 1)
    action_size = env.action_space.n
    q = MLP(obs_size, action_size)
    q_target = MLP(obs_size, action_size)
    q_target.load_state_dict(q.state_dict())
    if torch.cuda.is_available():
        q = q.cuda()
        q_target = q_target.cuda()
    memory = ReplayBuffer()
    print_interval = 20
    score = 0.0
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)
    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))  # Linear annealing from 8% to 1%
        s = env.reset()
        done = False
        while not done:
            a = q.sample_action(torch.from_numpy(np.array(s)).float(), epsilon)
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s, a, r / 100.0, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
        if memory.size() > 2000:
            train(q, q_target, memory, optimizer)
        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                n_epi, score / print_interval, memory.size(), epsilon * 100))
            score = 0.0
    env.close()
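# The DQN loop above assumes a `ReplayBuffer` with put/sample/size. A minimal
# sketch using a bounded deque (an assumption -- the original class is not
# shown; `sample` here returns mini-batch tensors in the usual
# (s, a, r, s', done_mask) layout expected by a train() step):
import collections
import random

import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, buffer_limit=50000):
        # old transitions are evicted automatically once the limit is hit
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s, a, r, s_prime, done_mask = zip(*mini_batch)
        return (torch.tensor(np.array(s), dtype=torch.float),
                torch.tensor(a).unsqueeze(1),
                torch.tensor(r, dtype=torch.float).unsqueeze(1),
                torch.tensor(np.array(s_prime), dtype=torch.float),
                torch.tensor(done_mask, dtype=torch.float).unsqueeze(1))

    def size(self):
        return len(self.buffer)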
class agent:
    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.size = self.env.observation_space.shape
        self.options = self.env.action_space.n
        self.baseline = 0

    def get_screen(self):
        self.env.render()

    def close(self):
        self.env.close()

    def doStep(self, a):
        sP, r, done, info = self.env.step(a)
        return r, done, sP
def play_random_custom(env, steps):
    _NOP = 0
    actions = [['start'], ['NOOP'], ['right', 'A'], ['left', 'A'],
               ['left', 'B'], ['right', 'B'], ['up'], ['down'], ['A'], ['B']]
    env = JoypadSpace(env, actions)
    env.reset()
    action = 0
    start = time.time()
    # play_human
    for t in range(0, steps):
        # get the mapping of keyboard keys to actions in the environment
        if hasattr(env, 'get_keys_to_action'):
            keys_to_action = env.get_keys_to_action()
        elif hasattr(env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.unwrapped.get_keys_to_action()
        else:
            raise ValueError('env has no get_keys_to_action method')
        # change action every 6 frames
        if t % 6 == 0:
            action = env.action_space.sample()
            # after 500 timesteps, stop pressing the start button
            if t > 500:
                while action == 0:
                    action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        # print("---------------------------t: ", t)
        # print("action space: ", action, env.action_space)
        # print("obs: ", observation)
        # print("reward: ", reward)
        # print("info: ", info)
        # runs game at about 60fps
        time.sleep(0.016667)
        env.render()
    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds for ", steps, "steps")
def run_random_actions():
    """Randomly take 1 of the 12 COMPLEX_MOVEMENT actions and print the
    action, reward, and done flag."""
    env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'),
                      COMPLEX_MOVEMENT)
    done = True
    for step in range(50):
        if done:
            env.reset()
        # randomly take an action from the action_space
        random_action = env.action_space.sample()
        # info returns metadata incl. coins, life, score, etc.
        # state is an RGB image (240, 256, 3)
        state, reward, done, info = env.step(random_action)
        print('# {}: Action: {}, Reward: {}, Done: {}'.format(
            step, random_action, reward, done))
    env.close()
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = JoypadSpace(env, USE_MOVEMENT)
    interval = 20
    q = QNetWork()
    q_target = QNetWork()
    input_shape = (batch_size, 240, 256, 3)
    q.build(input_shape=input_shape)
    q_target.build(input_shape=input_shape)
    for src, dest in zip(q.variables, q_target.variables):
        dest.assign(src)
    memory = ReplayBuffer()
    score = 0.
    optimizer = optimizers.Adam(lr=learning_rate)
    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
        s = env.reset()
        for t in range(10000):
            a = q.sample_action(s, epsilon)
            s_prime, r, done, _ = env.step(a)
            env.render()
            # store a done *mask* (0 when the episode ended) instead of
            # overwriting `done` itself, which inverted the loop-exit test
            done_mask = 0. if done else 1.
            memory.put((s, a, r, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
        print("episode : {} ".format(n_epi))
        if memory.size() > 100:
            train(q, q_target, memory, optimizer)
            # print("22, ", tf.size(q), tf.size(q))
        if n_epi % interval == 0 and n_epi != 0:
            # print(q.variables, q_target.variables)
            # copy the Q-network weights into the target (shadow) network
            for src, dest in zip(q.variables, q_target.variables):
                dest.assign(src)
            print(" # of episode {}, avg_score {}, buffer size {}".format(
                n_epi, score / interval, memory.size()))
            score = 0.
        # was `and not n_epi`, which is only true at episode 0, so the
        # checkpoint was never written during training
        if n_epi % 200 == 0 and n_epi != 0:
            q_target.network.save_weights(
                'dqn_weights{}.ckpt'.format(int(n_epi / 200)))
    env.close()
def eval_genome(genome):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    done = False
    timeout = 100
    state = env.reset()
    rewards = 0
    while not done and timeout > 0:
        state_resized = resize(state,
                               (state.shape[0] // 8, state.shape[1] // 8),
                               anti_aliasing=False)
        state_resized = np.apply_along_axis(
            rgb2dec, 1,
            (np.reshape(state_resized,
                        (state_resized.shape[0] * state_resized.shape[1], 3)) * 255),
        )
        state, reward, done, info = env.step(
            np.argmax(genome.evaluate(state_resized)))
        rewards += reward
        if reward <= 0:
            timeout -= 1
        else:
            timeout += 1
        env.render()
    env.close()
    return rewards
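# `rgb2dec` above is applied row-wise to pack each RGB triplet into a single
# number before feeding the genome. A plausible sketch (an assumption; the
# original helper is not shown) packs the channels exactly like a 24-bit
# color code:
def rgb2dec(rgb):
    r, g, b = int(rgb[0]), int(rgb[1]), int(rgb[2])
    return (r << 16) | (g << 8) | b  # e.g. (255, 0, 0) -> 16711680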
def run(file):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         'config-feedforward')
    genome = pickle.load(open(file, 'rb'))
    # print(genome)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
    env = JoypadSpace(env, RIGHT_ONLY)
    env1 = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env1 = JoypadSpace(env1, RIGHT_ONLY)
    net = neat.nn.FeedForwardNetwork.create(genome, config)
    try:
        obs = env.reset()
        env1.reset()
        inx = int(obs.shape[0] / 8)
        iny = int(obs.shape[1] / 8)
        done = False
        while not done:
            # env.render()
            env1.render()
            obs = cv2.resize(obs, (inx, iny))
            obs = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
            obs = np.reshape(obs, (inx, iny))
            imgarray = np.ndarray.flatten(obs)
            actions = net.activate(imgarray)
            action = np.argmax(actions)
            _, _, _, info1 = env1.step(action)
            s, reward, done, info = env.step(action)
            xpos = info['x_pos']
            print(done, action, xpos)
            obs = s
        env1.close()
        env.close()
    except KeyboardInterrupt:
        env.close()
        env1.close()
        exit()
def play_model(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")
    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get number of actions from gym action space
    args.n_actions = env.action_space.n
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    with torch.no_grad():
        i = 0
        observation = env.reset()
        while i < 5000:
            env.render()
            state = get_screen(env, device)
            action = int(target_net(state).max(1)[1].view(1, 1))
            observation, reward, done, info = env.step(action)
            if done:
                break
            i += 1
    env.close()
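# `get_screen` is used by play_model (and by train_agent further below) but
# never defined in these snippets. A minimal sketch, assuming it renders the
# frame, converts it to a CHW float tensor, and returns it batched on
# `device` (hypothetical; the crop/resize choices are guesses):
import numpy as np
import torch
import torchvision.transforms as T

def get_screen(env, device):
    # HWC uint8 frame from the emulator -> CHW float in [0, 1]
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    resize = T.Compose([T.ToPILImage(), T.Resize(84), T.ToTensor()])
    return resize(screen).unsqueeze(0).to(device)  # shape: (1, C, H, W)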
class Environment():
    def __init__(self, input_mode=RIGHT_ONLY, level_mode=0):
        # input_mode = RIGHT_ONLY, SIMPLE_MOVEMENT, or COMPLEX_MOVEMENT
        # level_mode = X in 'SuperMarioBros-vX'
        self.env = gym_super_mario_bros.make(f"SuperMarioBros-v{level_mode}")
        self.env = JoypadSpace(self.env, input_mode)
        self.env.reset()

    # Performs an action and returns the result
    def input_action(self, action):
        return self.env.step(action)

    # Renders the environment
    def render(self):
        self.env.render()

    # Resets the environment
    def reset(self):
        self.env.reset()

    # Closes the environment
    def close(self):
        self.env.close()
def play_random_custom(env, steps):
    _NOP = 0
    env = JoypadSpace(env, actions)
    env.reset()
    action = 0
    start = time.time()
    if SHOULD_TRAIN:
        init_screen = get_screen()
        _, _, screen_height, screen_width = init_screen.shape
        # INIT Neural Network
        policy = Policy(screen_height, screen_width, len(actions))
        if SHOULD_LOAD_STATE:
            print("Loading model from: ", DATA_PATH)
            policy.load_state_dict(torch.load(DATA_PATH))
        optimizer = optim.Adam(policy.parameters(), lr=1e-2)
        eps = np.finfo(np.float32).eps.item()

        # Helper functions
        def select_action(state):
            global steps_done
            sample = random.random()
            eps_threshold = reward_threshold
            # eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            #     math.exp(-1. * steps_done / EPS_DECAY)
            steps_done += 1
            if sample > eps_threshold:
                with torch.no_grad():
                    # t.max(1) returns the largest column value of each row;
                    # the second column of the result is the index of the max
                    # element, so we pick the action with the larger expected
                    # reward.
                    return policy(state).max(1)[1].view(1, 1)
            else:
                return torch.tensor([[random.randrange(len(actions))]],
                                    device=device, dtype=torch.long)

        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
            for r in policy.rewards[::-1]:
                R = r + GAMMA * R
                returns.insert(0, R)
            returns = torch.tensor(returns)
            returns = (returns - returns.mean()) / (returns.std() + eps)
            for log_prob, R in zip(policy.saved_log_probs, returns):
                policy_loss.append(-log_prob * R)
            optimizer.zero_grad()
            print("POLICY LOSS: ", policy_loss)
            # these two lines were commented out in the original; without
            # them no gradients are computed and optimizer.step() is a no-op
            policy_loss = torch.cat(policy_loss).sum()
            policy_loss.backward()
            optimizer.step()
            torch.save(policy.state_dict(), DATA_PATH)
            del policy.rewards[:]
            del policy.saved_log_probs[:]

        running_reward = 10
        for i_episode in count(1):
            print("Episode: ", i_episode)
            state, ep_reward = env.reset(), 0
            # Don't infinite loop while learning
            for t in range(1, num_steps_per_episode):
                action = select_action(state).data.cpu().numpy()[0][0]
                # print("ACTION:", action)
                state, reward, done, info = env.step(action)
                if SHOULD_RENDER:
                    env.render()
                policy.rewards.append(reward)
                ep_reward += reward
                if done:
                    break
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            finish_episode()
            if i_episode % log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                      .format(i_episode, ep_reward, running_reward))
                print("Running reward: ", running_reward)
            if running_reward > reward_threshold:
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(
                          running_reward, t))
                break
    else:
        # PLAY RANDOMLY
        for t in range(0, steps):
            # get the mapping of keyboard keys to actions in the environment
            if hasattr(env, 'get_keys_to_action'):
                keys_to_action = env.get_keys_to_action()
            elif hasattr(env.unwrapped, 'get_keys_to_action'):
                keys_to_action = env.unwrapped.get_keys_to_action()
            else:
                raise ValueError('env has no get_keys_to_action method')
            # change action every 6 frames
            if t % 6 == 0:
                action = env.action_space.sample()
                # after 500 timesteps, stop pressing the start button
                if t > 500:
                    while action == 0:
                        action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print("---------------------------t: ", t)
            print("action space: ", action, env.action_space)
            print("obs: ", observation.shape)
            print("reward: ", reward)
            print("info: ", info)
            # runs game at about 60fps
            time.sleep(0.016667)
            env.render()
    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds for ", steps, "steps")
from nes_py.wrappers import JoypadSpace
from my_gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from my_gym_super_mario_bros import make

env = make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# Equivalent setup using the stock package:
# from nes_py.wrappers import JoypadSpace
# import gym_super_mario_bros
# from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
# env = gym_super_mario_bros.make('SuperMarioBros-v0')
# env = JoypadSpace(env, SIMPLE_MOVEMENT)

done = True
for step in range(100):
    if done:
        state = env.reset()
    cur_act = env.action_space.sample()
    state, reward, done, info = env.step(cur_act)
    print("Current Step State:", state.shape)
    print("Current Step Reward:", reward)
    # env.render()
env.close()
# (fragment: tail of an A3C agent class -- the end of start_threads, the
# weight save/restore methods, and the module-level driver code)
            t.start()
            time.sleep(0.5)
        try:
            [t.join() for t in threads]  # wait for threads to finish
        except KeyboardInterrupt:
            print("Exiting threads!")

    def save_weights(self):
        print("Saving Weights")
        self.global_network.save_weights("A3CMarioWeights.h5")

    def restore_weights(self):
        print("Restoring Weights!")
        self.global_network.load_weights("A3CMarioWeights.h5")


test_env = gym_super_mario_bros.make(env_name)
test_env = JoypadSpace(test_env, SIMPLE_MOVEMENT)
test_env = atari_wrapper.wrap_dqn(test_env)
NUM_ACTIONS = test_env.action_space.n
OBS_SPACE = test_env.observation_space.shape[0]
state = test_env.reset()
state = np.expand_dims(state, axis=0)
stats = Stats()
agent = A3CAgent()
agent.start_threads()
test_env.close()
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one-hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())
        # reset graph
        tf.reset_default_graph()
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=np.int) for i in range(stack_size)],
            maxlen=4)
        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)
            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)
            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # our new state is now the next_state
                state = next_state
        # saver will help us save our model
        self.saver = tf.train.Saver()
        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")
        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()
        explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(
            -decay_rate * decay_step)
        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Q values of the state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={self.DQNetwork.inputs_:
                                     state.reshape((1, *state.shape))})
            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]
        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """Displays a list of frames as a gif, with controls."""
            # plt.figure(figsize=(frames[0].shape[1] / 72.0,
            #                     frames[0].shape[0] / 72.0), dpi=72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate,
                                           frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []
            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []
            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            # self.env = wrap_env(self.env)
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())
            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0
            # avoid a NameError if the first episode ends before any
            # learning step has produced a loss
            loss = float('nan')
            for episode in range(total_episodes):
                # set step to 0
                step = 0
                # initialize rewards of episode
                episode_rewards = []
                # make a new episode and observe the first state
                state = self.env.reset()
                # remember the stack_frames function
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("Episode:", episode)
                while step < max_steps:
                    step += 1
                    # increase decay_step
                    decay_step += 1
                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    if episode_render:
                        self.env.render()
                    # add the reward to total reward
                    episode_rewards.append(reward)
                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends, so there is no next state
                        next_state = np.zeros((110, 84), dtype=np.int)
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # set step = max_steps to end the episode
                        step = max_steps
                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)
                        print("Episode:", episode,
                              "Total reward:", total_reward,
                              "Explore P:", explore_probability,
                              "Training Loss:", loss)
                        # rewards_list.append((episode, total_reward))
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain a random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []
                    # get Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})
                    # set Q_target = r if the episode ends at s+1
                    for i in range(len(batch)):
                        terminal = dones_mb[i]
                        # in a terminal state the target only equals the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)
                    targets_mb = np.array([each for each in target_Qs_batch])
                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={self.DQNetwork.inputs_: states_mb,
                                   self.DQNetwork.target_Q: targets_mb,
                                   self.DQNetwork.actions_: actions_mb})
                    # write tf summaries
                    summary = sess.run(
                        self.write_op,
                        feed_dict={self.DQNetwork.inputs_: states_mb,
                                   self.DQNetwork.target_Q: targets_mb,
                                   self.DQNetwork.actions_: actions_mb})
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()
                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
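# `stack_frames` is used throughout the Agent class but not defined in these
# snippets. A minimal sketch (an assumption, not the original helper),
# assuming `frame` is already a preprocessed 2-D image: it keeps a deque of
# the last `stack_size` frames and returns them stacked along the channel axis.
from collections import deque

import numpy as np

def stack_frames(stacked_frames, frame, is_new_episode, stack_size=4):
    if is_new_episode:
        # start the stack over with copies of the first frame
        stacked_frames = deque([frame for _ in range(stack_size)],
                               maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=2)  # H x W x stack_size
    return stacked_state, stacked_frames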
def train_agent(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")
    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get number of actions from gym action space
    args.n_actions = env.action_space.n
    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    if args.policyNet:
        # load the policy checkpoint into the policy net (the original loaded
        # it into target_net, which the next line immediately overwrote)
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)
    args.steps_done = 0
    num_episodes = 1
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)
            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None
            # Store the transition in memory
            memory.push(state, action, next_state, reward)
            # Move to the next state
            state = next_state
            # Perform one step of the optimization (on the policy network)
            optimize_model(optimizer, memory, policy_net, target_net, args, device)
            if done:
                episode_durations.append(t + 1)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)
        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')
    print('Complete')
    env.close()
    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
class NesEnv():
    def __init__(self, env, seed, max_episode_length, action_repeat,
                 bit_depth, args):
        from nes_py.wrappers import JoypadSpace
        import gym_tetris
        from gym_tetris.actions import SIMPLE_MOVEMENT
        self._env = gym_tetris.make(env, skip_level=True)
        self._env.seed(seed)
        self._env = JoypadSpace(self._env, SIMPLE_MOVEMENT)
        self.max_episode_length = max_episode_length
        self.action_repeat = action_repeat
        self.bit_depth = bit_depth
        self.small_image = args.small_image
        self.add_reward = args.add_reward
        self.typeb = "1" in env
        self.acc = 0.03 if self.typeb else 3
        self.living = 0.003 if self.typeb else 0.3
        self.dim = 1 if args.binary_image else 3
        if args.binary_image:
            self._process_obs = _images_to_observation_binary
        else:
            self._process_obs = _images_to_observation
        self.one_skip = False
        if not args.add_reward:
            self.acc = 0
            self.living = 0

    def reset(self):
        self.t = 0  # Reset internal timer
        state = self._env.reset()
        # hack the memory of the nes env, setting level to 29
        # self._env.ram[0x0064] = 29
        # skip some frames
        for i in range(85):
            state, r, d, _ = self._env.step(0)
        # print(self.observation_size)
        observation = self._process_obs(state, self.bit_depth,
                                        self.observation_size)  # NxCxHxW
        return observation

    def step(self, action):
        action = action.argmax().item()  # convert one-hot action to int
        reward = 0
        state, done = None, None
        # NOTE: `total` is computed here but the loop below uses a fixed 3
        total = 3 if self._env.ram[0x0068] < 2 else 1
        for k in range(3):
            # print(f"Timer: {self._env.ram[0x0065]},State {self._env.ram[0x0068]}")
            state, reward_k, done, info = self._env.step(action if k == 0 else 0)
            reward += reward_k
            self.t += 1  # Increment internal timer
            done = done or self.t == self.max_episode_length
            if done:
                break
        flag = False
        while self._env.ram[0x0065] > 0 and self._env.ram[0x0068] >= 2 and not done:
            flag = True
            o, r, d, info = self._env.step(0)
            reward += r
            done = d or done
        if flag and self.one_skip:
            o, r, d, info = self._env.step(0)
            reward += r
            done = d or done
            state = o
        if flag:
            reward += self.acc
            if info['board_height'] > 10:
                reward -= self.acc
        reward += self.living
        observation = self._process_obs(state, self.bit_depth,
                                        self.observation_size)
        return observation, reward, done

    def render(self):
        self._env.render()

    def close(self):
        self._env.close()

    @property
    def observation_size(self):
        # self._env.observation_space.shape: H x W x C (240x256x3)
        return (self.dim, 96, 96) if self.small_image else (self.dim, 128, 128)  # C x H x W
        # return (3, 120, 128)  # C x H x W  # TODO: Lixin

    @property
    def action_size(self):
        return self._env.action_space.n

    def sample_random_action(self):
        indices = torch.tensor(self._env.action_space.sample())
        return F.one_hot(indices, self.action_size).float()
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())
        tf.compat.v1.reset_default_graph()
        self.DQNet = DQNet(state_size, action_size, learning_rate)
        self.memory = Memory(max_size=memory_size)
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=np.int) for i in range(stack_size)],
            maxlen=4)
        for i in range(pretrain_length):
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)
            next_state, self.stacked_frames = stack_frame(
                self.stacked_frames, next_state, False)
            if done:
                next_state = np.zeros(state.shape)
                self.memory.add((state, action, reward, next_state, done))
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
            else:
                self.memory.add((state, action, reward, next_state, done))
                state = next_state
        self.saver = tf.compat.v1.train.Saver()
        self.writer = tf.compat.v1.summary.FileWriter("logs/")
        tf.summary.scalar("Loss", self.DQNet.loss)
        self.write_op = tf.compat.v1.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        exp_exp_tradeoff = np.random.rand()
        explore_probs = explore_stop + (explore_start - explore_stop) * np.exp(
            -decay_rate * decay_step)
        if explore_probs > exp_exp_tradeoff:
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            QS = sess.run(self.DQNet.output,
                          feed_dict={self.DQNet.inputs:
                                     state.reshape((1, *state.shape))})
            choice = np.argmax(QS)
            action = self.possible_actions[choice]
        return action, choice, explore_probs

    def play_note(self):
        import matplotlib.pyplot as plt
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        def display_frame_gif(frames):
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate,
                                           frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        # was `tf.compat.v1.Session as sess`, which binds the class itself
        # instead of opening a session
        with tf.compat.v1.Session() as sess:
            total_test_rewards = []
            self.saver.restore(sess, "model/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("*************************************")
                print('EPISODE', episode)
                while True:
                    state = state.reshape((1, *state_size))
                    QS = sess.run(self.DQNet.output,
                                  feed_dict={self.DQNet.inputs: state})
                    choice = np.argmax(QS)
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward  # was missing, so the score never accumulated
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frame(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
        # show the collected frames (the original never called the helper)
        display_frame_gif(frames)

    def play(self):
        with tf.compat.v1.Session() as sess:
            total_test_rewards = []
            self.saver.restore(sess, "model/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("*************************************")
                print('EPISODE', episode)
                while True:
                    state = state.reshape((1, *state_size))
                    QS = sess.run(self.DQNet.output,
                                  feed_dict={self.DQNet.inputs: state})
                    choice = np.argmax(QS)
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frame(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            decay_step = 0
            # avoid a NameError if the first episode ends before any
            # learning step has produced a loss
            loss = float('nan')
            for episode in range(total_episodes):
                step = 0
                episodes_rewards = []
                state = self.env.reset()
                state, self.stacked_frames = stack_frame(
                    self.stacked_frames, state, True)
                print("EPISODE", episode)
                while step < max_steps:
                    step += 1
                    decay_step += 1
                    action, choice, explore_probs = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)
                    next_state, reward, done, _ = self.env.step(choice)
                    if episode_render:
                        self.env.render()
                    episodes_rewards.append(reward)
                    if done:
                        print('done')
                        next_state = np.zeros((100, 128), dtype=np.int)
                        next_state, self.stacked_frames = stack_frame(
                            self.stacked_frames, next_state, False)
                        step = max_steps
                        total_rewards = np.sum(episodes_rewards)
                        print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_rewards),
                              'Explore P: {:.4f}'.format(explore_probs),
                              'Training Loss {:.4f}'.format(loss))
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        next_state, self.stacked_frames = stack_frame(
                            self.stacked_frames, next_state, False)
                        self.memory.add((state, action, reward, next_state, done))
                        state = next_state
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_state_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []
                    Qs_next_state = sess.run(
                        self.DQNet.output,
                        feed_dict={self.DQNet.inputs: next_state_mb})
                    for i in range(len(batch)):
                        terminal = dones_mb[i]
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            # was np.max(Qs_next_state), which took the max
                            # over the whole batch instead of sample i
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)
                    target_mb = np.array([each for each in target_Qs_batch])
                    loss, _ = sess.run(
                        [self.DQNet.loss, self.DQNet.optimizer],
                        feed_dict={
                            self.DQNet.inputs: states_mb,
                            self.DQNet.target_q: target_mb,
                            self.DQNet.action: actions_mb
                        })
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNet.inputs: states_mb,
                                           self.DQNet.target_q: target_mb,
                                           self.DQNet.action: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()
                if episode % 5 == 0:
                    self.saver.save(sess,
                                    "models/{0}.cpkt".format(self.level_name))
                    print("model Saved")
class SimpleMario():
    def __init__(self):
        self.BASE_DIR = os.getcwd()
        self.SAVE_DESTINATION = os.path.join(self.BASE_DIR, "saved_model")
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.valid_move_indx = set(range(7))
        self.fresh_start()

    def get_action_set(self) -> dict:
        """Dict of all possible actions
        - key[int]: input for the action
        - value: action

        Returns:
            dict: dict of all actions
        """
        # ['NOOP', 'right', 'right A', 'right B', 'right A B', 'A', 'left']
        return dict({
            0: "stay",
            1: "forward",
            2: "forward , A ,",
            3: "forward , B ",
            4: "forward , A , B",
            5: "jump",
            6: "backward"
        })

    def close_env(self):
        """Cleans in-memory data for the environment - CLEAN UP AFTER YOU ARE DONE"""
        self.env.close()

    def fresh_start(self):
        """Reset whenever you need a fresh start - e.g. when you die or want
        to restart from the beginning"""
        self.env.reset()

    def get_env_state(self) -> dict:
        """Get environment details (note: advances the emulator one NOOP frame)
        {
            'state': env.state,
            'reward': env.reward,
            'isdead': env.isdead,
            'info': env.info,
        }

        Returns:
            dict: environment details
        """
        respTuple = self.env.step(0)
        respData = dict()
        respData['state'] = respTuple[0]
        respData['reward'] = respTuple[1]
        respData['isdead'] = respTuple[2]
        respData['info'] = respTuple[3]
        return respData

    # TODO: make it happen
    def get_miv_env(self):
        """Still in progress"""
        envConfig = self.get_env_state()
        envState = envConfig['state']
        envState = [np.argmax(pixel) for pixel in envState]
        return envState

    def make_move(self, move):
        """Returns a null result for invalid input.
        Valid input: keys of self.get_action_set()

        Args:
            move (int): move index

        Returns:
            tuple:
            - (numpy.ndarray) the state as a result of the action
            - (float) the reward achieved by taking the action
            - (bool) a flag denoting whether the episode has ended
            - (dict) a dictionary of extra information
        """
        if move in self.valid_move_indx:
            return self.env.step(move)
        return [None, None, True, None]

    def play_game(self, moveCount: int = 0):
        """Play the game interactively - after playing, make sure to restart.

        Key input [int]: Enter your move: range(0, 7)
        Do you want to restart: {
            -1: do nothing
             0: shut down the preview
          else: reset and restart
        }

        Args:
            moveCount (int, optional): number of moves to take; each move is a
                key from self.get_action_set(). Defaults to 0.
        """
        self.fresh_start()
        self.env.render()
        for cou in range(moveCount):
            move_indx = int(input("Enter your move: "))
            for _ in range(30):
                state, reward, done, info = self.make_move(move_indx)
                self.env.render()
                if done:
                    self.fresh_start()
            restart = int(input("Do you want to restart: "))
            if restart == -1:
                pass
            elif restart == 0:
                break

    def generate_random_file_name(self):
        if not os.path.isdir(self.SAVE_DESTINATION):
            os.makedirs(self.SAVE_DESTINATION)
        fileContentCount = len(os.listdir(self.SAVE_DESTINATION))
        return f"MarioEnv{fileContentCount}.npy"

    def save_env(self, destination: str = ""):
        """Not strictly needed"""
        if destination == "":
            if not os.path.isdir(self.SAVE_DESTINATION):
                os.makedirs(self.SAVE_DESTINATION)
        else:
            self.SAVE_DESTINATION = destination
        fileDestination = os.path.join(self.SAVE_DESTINATION,
                                       self.generate_random_file_name())
        fileContent = self.get_env_state()
        # np.save opens the file itself; the original also opened it in
        # text mode, which was redundant and error-prone
        np.save(fileDestination,
                np.array(list(fileContent.items()), dtype=object))

    def load_env(self, fileLocation: str = ""):
        """Not strictly needed"""
        if fileLocation == "":
            raise Exception("File Location Not Provided")
        elif not os.path.isfile(fileLocation):
            raise Exception("Invalid File Location")
        fileContent = dict()
        with open(fileLocation, 'rb') as fileObj:
            fileListContent = np.load(fileObj, allow_pickle=True)
        for item in fileListContent:
            fileContent[item[0]] = item[1]
        return fileContent
def train_model(parameters):
    # Initialization of environment and agent
    env = gym_super_mario_bros.make(parameters['environment'])
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)
    states = (84, 84, 4)
    actions = env.action_space.n
    agent = DDQNagent(parameters, states, actions)
    if parameters['train']:
        # TENSORBOARD
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/mario/' + current_time + '/10k'
        summary_writer = tf.summary.create_file_writer(log_dir)
        summary_writer.set_as_default()
        maxXpos = 0      # Maximum X position of the agent
        max_reward = 0   # Maximum reward
        start_time = time.time()  # Start time
        # Initialization of variables for plots
        graph_reward = np.zeros(parameters['episodes_to_play'])       # Reward
        graph_pos = np.zeros(parameters['episodes_to_play'])          # Position
        graph_mean_reward = np.zeros(parameters['episodes_to_play'])  # Mean reward
        episodes = parameters['episodes_to_play']  # Number of episodes to train
        rewards = []  # Rewards array
        start = time.time()  # Time for calculating processed frames per second
        step = 0  # Total steps
        # Learning cycle
        for e in range(episodes):
            # Default state of the environment
            state = env.reset()
            total_reward = 0  # Reward gained in the current episode
            iter = 0
            while True:
                # Select an action
                action = agent.run(state)
                # Apply the action to the environment
                next_state, reward, done, info = env.step(action)
                # Write new data to memory
                agent.update_memory(
                    experience=(state, next_state, action, reward, done))
                # Learn
                agent.learn()
                # Sum of rewards for every action
                total_reward += reward
                # Change the current state to the next one
                state = next_state
                iter += 1
                # Render
                if parameters['render']:
                    env.render()
                # Check the finish condition
                if done or info['flag_get']:
                    break
            # New data point for the plots
            rewards.append(total_reward / iter)
            # Update info
            if maxXpos < info['x_pos']:
                maxXpos = info['x_pos']
            if max_reward < total_reward:
                max_reward = total_reward
            if info['flag_get'] == True:
                agent.flag_reached = agent.flag_reached + 1
            # Epsilon decay
            if agent.eps >= 0.0:
                agent.eps = agent.eps - agent.eps_decay
            # Update variables for plots
            graph_reward[e] = total_reward
            graph_pos[e] = info['x_pos']
            graph_mean_reward[e] = np.mean(graph_reward)
            # TENSORBOARD
            tf.summary.scalar("Rewards", total_reward, step=e)
            tf.summary.scalar("Position", info['x_pos'], step=e)
            tf.summary.scalar("Mean reward", np.mean(graph_reward), step=e)
            tf.summary.scalar("Flags", agent.flag_reached, step=e)
            tf.summary.scalar("Loss", agent.loss, step=e)
            # Console information
            print("Episode reward: " + str(total_reward) +
                  ' - Pos: ' + str(info['x_pos']))
            if e % 10 == 0:
                end = time.time()
                print('Flags reached: ' + str(agent.flag_reached) +
                      ' - Max reward: ' + str(max_reward))
                print('Episode {e} - '
                      'Frame {f} - '
                      'Frames/sec {fs} - '
                      'Epsilon {eps} - '
                      'Mean Reward {r} - '
                      'Time {t} sec - '
                      'Max pos {pos}'.format(
                          e=e,
                          f=agent.step,
                          fs=np.round((agent.step - step) / (time.time() - start)),
                          eps=np.round(agent.eps, 4),
                          r=np.mean(rewards[-100:]),
                          t=round(end - start_time),
                          pos=maxXpos))
                start = time.time()
                step = agent.step
        # After learning, draw the plots and save the weights
        draw_graph(graph_reward, 'Rewards')
        draw_graph(graph_pos, 'Position')
        draw_graph(graph_mean_reward, 'Mean reward')
        agent.save_weights()
        env.close()
    else:
        # If train is false, load weights and observe the result
        print('Weights file path (hdf5): ')
        weights_name = input()
        try:
            agent.model_target.load_weights(weights_name)
            agent.model_test(env)
        except Exception:
            print("Weights with this name or on this path not found")
        env.close()
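# `wrapper(env)` above is expected to yield stacked 84x84x4 grayscale
# observations. A plausible composition using gym's stock wrappers (an
# assumption -- the project's own wrapper is not shown; note that gym's
# FrameStack stacks on the leading axis, so the resulting observations are
# (4, 84, 84) and the agent may need a transpose to match states=(84, 84, 4)):
from gym.wrappers import FrameStack, GrayScaleObservation, ResizeObservation

def wrapper(env):
    env = GrayScaleObservation(env)          # (240, 256) grayscale
    env = ResizeObservation(env, (84, 84))   # (84, 84)
    env = FrameStack(env, 4)                 # stack of the last 4 frames
    return env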
class MarioManager():
    '''Initialize the environment. The class contains basic OpenAI Gym
    operations along with screen-processing operations done with PyTorch.'''

    def __init__(self, device):
        self.device = device
        self.env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'),
                               RIGHT_ONLY)
        self.env.reset()
        self.current_screen = None
        self.done = False
        self.current_score = 0
        self.current_coins = 0
        self.x = -9999999
        self.coins = 0
        self.score = 0
        self.count_same_posn = 0

    def reset(self):
        self.env.reset()
        self.current_screen = None

    def close(self):
        self.env.close()

    def render(self, mode='human'):
        return self.env.render(mode)

    def num_actions(self):
        return self.env.action_space.n

    def take_act(self, action):
        # uses action.item()
        observation, reward, self.done, info = self.env.step(action.item())
        # if new coins were collected, add the delta to the reward
        if self.coins != info['coins']:
            reward += int(info['coins']) - self.coins
            self.coins = int(info['coins'])
        # if an enemy was killed (score changed), add the score delta
        if self.score != info['score']:
            reward += int(info['score']) - self.score
            self.score = int(info['score'])
        # the same position means he is stuck; count for how long
        if self.x == info['x_pos']:
            self.count_same_posn += 1
        # if he moved after being stuck, give a reward
        elif self.count_same_posn > 0 and self.x != info['x_pos']:
            self.count_same_posn = 0
            reward += 15
        # else reset the count to 0
        else:
            self.count_same_posn = 0
        # if reward == 0:
        #     reward -= 1  # make a zero reward negative
        # kill him after the first life to speed up training
        # if info['life'] < 2:
        #     self.done = True
        # check that he actually moved to the right
        if self.x < info['x_pos']:
            reward += 0
        # he didn't move right by taking the action, so penalize
        else:
            reward -= 1
        if info['x_pos'] != 40:
            self.x = info['x_pos']
        return torch.tensor([reward], device=self.device)

    def return_count(self):
        return self.count_same_posn

    def return_posn(self):
        return self.x

    def is_starting(self):
        return self.current_screen is None

    def state(self):
        if self.is_starting() or self.done:
            self.current_screen = self.get_proccessed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            # screen = self.current_screen
            # next_screen = self.get_proccessed_screen()
            # self.current_screen = next_screen
            self.current_screen = self.get_proccessed_screen()
            return self.current_screen

    def screen_height(self):
        return self.get_proccessed_screen().shape[2]

    def screen_width(self):
        return self.get_proccessed_screen().shape[3]

    def get_proccessed_screen(self):
        screen = self.render('rgb_array').transpose((2, 0, 1))
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)

    def crop_screen(self, screen):
        screen_height = screen.shape[1]
        top = int(screen_height * 0.5)
        bottom = int(screen_height * 0.9)
        screen = screen[:, top:bottom, :]
        return screen

    def transform_screen_data(self, screen):
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        size = t.Compose([
            t.ToPILImage(),
            t.Resize((15, 40)),
            # t.Grayscale(num_output_channels=1),
            t.ToTensor()
        ])
        return size(screen).unsqueeze(0).to(self.device)
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]], np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode
        env_name = (f"SuperMarioBros" if world_and_level is None
                    else "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"
        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])
        self._actions_queue = []
        self._colorful_env = None
        if (grayscale or black_background) and colorful_rendering:
            # NOTE: colorful rendering assumes `world_and_level` is provided
            self._colorful_env = gym_super_mario_bros.make(
                "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])
        self._stack_func = stack_func
        self._grayscale = grayscale
        self._score_weight = in_game_score_weight
        self._idle_frames_threshold = idle_frames_threshold
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0
        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames
        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0
        self._actions_queue = []
        if self._colorful_env is not None:
            self._colorful_env.reset()
        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False
        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x
        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold
        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be representing
        # the same, final frame!), so always copy the arrays when doing that.
        # The observation arrays are already being copied inside
        # `self._process_img`, so no explicit copying is needed here.
        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)
        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)
                if self._stack_mode == "all" or skip_count == self._skip_frames:
                    imgs.append(self._process_img(last_img))
            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)
        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward +
                                  self._score_weight * score_diff)
        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]
        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)
        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS
        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")
        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)
        # Colorful rendering:
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())
        self._actions_queue = []
        return img_list if return_all_imgs else self._colorful_env.render(mode)

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
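# Two names used by MarioEnvironment are defined elsewhere in its project.
# Plausible definitions (assumptions, shown only so the class can run
# stand-alone): MOVEMENTS_TYPES maps the `movement_type` argument to a
# nes_py button list, and RGB2GRAY_COEFFICIENTS holds the standard luma
# weights used by `_process_img` for grayscale conversion.
import numpy as np
from gym_super_mario_bros.actions import (RIGHT_ONLY, SIMPLE_MOVEMENT,
                                          COMPLEX_MOVEMENT)

MOVEMENTS_TYPES = {
    "right_only": RIGHT_ONLY,
    "simple": SIMPLE_MOVEMENT,
    "complex": COMPLEX_MOVEMENT,
}
RGB2GRAY_COEFFICIENTS = np.array([0.299, 0.587, 0.114])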