def main():
    # env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    timestart = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    # env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video/final", str(timestart), 1)
    env = DownsampleEnv(env, (84, 84))
    env = PenalizeDeathEnv(env, penalty=-25)
    env = FrameStackEnv(env, 4)

    # good
    # act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-13:00:58.pkl")
    # better
    act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-19:21:50.pkl")

    episode = 0
    while True:
        obs, done = env.reset(), False
        stepnr = 0
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            if stepnr % 20 == 0:
                plot_obs(obs)
            episode_rew += rew
            stepnr += 1
        print("Episode reward", episode_rew, episode)
        episode += 1
def run(self, solution, level, render, mode):
    env = gym_super_mario_bros.make(level)
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    done = True
    reason_finish = "no_more_commands"
    pos = 0
    total_r = 0
    for step in range(len(solution)):
        if done:
            state = env.reset()
        state, reward, done, info = env.step(solution[pos])
        pos += 1
        if reward == -15:  # Mario died
            reason_finish = "death"
            break
        if mode == "level" and info['flag_get'] == True:
            reason_finish = "win"
            break
        total_r = total_r + reward
        if render == "true":
            env.render()
    env.close()
    return total_r, pos, info, reason_finish
def main(path="./models/deepq/mario_reward_1736.7.pkl"):
    step_mul = 16
    steps = 200

    FLAGS = flags.FLAGS
    flags.DEFINE_string("env", "SuperMarioBros-v0", "RL environment to train.")
    flags.DEFINE_string("algorithm", "deepq", "RL algorithm to use.")
    FLAGS(sys.argv)

    # 1. Create gym environment
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    act = deepq.load(path)
    nstack = 4
    nh, nw, nc = env.observation_space.shape
    history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

    obs, done = env.reset(), False
    # history = update_history(history, obs)
    episode_rew = 0
    while not done:
        env.render()
        action = act([obs])[0]
        obs, rew, done, _ = env.step(action)
        # history = update_history(history, obs)
        episode_rew += rew
        print("action : %s reward : %s" % (action, rew))
    print("Episode reward", episode_rew)
class Environment:
    actionMap = {
        0: 'NOOP',
        1: 'Right',
        2: 'Right-Jump',
        3: 'Right-Sprint',
        4: 'Right-Jump-Sprint',
        5: 'Jump',
        6: 'Left'
    }

    def __init__(self, rows=19, columns=16, verbose=True, raw=True, variant=1):
        self.verbose = verbose
        self.raw = raw
        self.variant = variant
        # pass the constructor arguments through instead of hardcoding 19x16
        self.img2state = Img2State(rows=rows, columns=columns)
        self.game = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
        self.state = self.img2state.transfrom(self.game.reset(), raw=self.raw, variant=self.variant)
        self.reward = 0
        # Actions
        self.A = list(Environment.actionMap.keys())

    def step(self, action: int):
        if action not in self.A:
            raise Exception('Wrong Action...')
        state, self.reward, done, info = self.game.step(action)
        self.state = self.img2state.transfrom(state, raw=self.raw, variant=self.variant)
        if done and self.state[8]:
            self.reward = 100
        elif self.state[8]:
            self.reward = 30
        elif self.state[9]:
            self.reward = 15
        if self.verbose:
            self.game.render()
        return done

    def reset(self):
        self.state = self.img2state.transfrom(self.game.reset(), raw=self.raw, variant=self.variant)
        self.reward = 0
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render
        # Make the Super Mario gym environment and apply wrappers
        self.env = gym.make(ENV)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.env = preprocess.GrayScaleImage(self.env, height=HIGHT, width=WIDTH, grayscale=True)
        # self.env = wrappers.Monitor(self.env, "./Super_Mario_AI/videos", force=True, write_upon_reset=True)
        self.agent = Agent(TEMPERATURE)

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield
            if self.render:
                self.env.render()
            a = self.agent.act(s)
            s_, r, done, info = self.env.step(a)
            if done:  # terminal state
                s_ = None
            self.agent.train(s, a, r, s_)
            s = s_
            R += r
            if done or self.stop_signal:
                break
        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
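# A minimal sketch of how these worker threads are typically driven in an
# A3C setup; THREADS and RUN_TIME are assumed constants, and the optimizer
# threads that usually run alongside the workers are omitted.
import time

THREADS = 8      # assumed
RUN_TIME = 60    # assumed, seconds of training

envs = [Environment() for _ in range(THREADS)]
for e in envs:
    e.start()

time.sleep(RUN_TIME)

for e in envs:
    e.stop()
for e in envs:
    e.join()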
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim", self.obs_dim)
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # chain the two transforms together in a list
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
            np.reshape(reward, -1), \
            np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # on Windows, ascii=True keeps tqdm from breaking the line
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " Game over, hero - try again! " + str(info))
        env.render()
    env.close()
    qbar.close()
class MarioEnv(Process):
    def __init__(self, env_id, idx, child_conn, queue, n_step, is_render=False):
        super(MarioEnv, self).__init__()
        self.idx = idx
        self.env_id = env_id
        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

    def run(self):
        super(MarioEnv, self).run()
        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')
        self.request_action(0, False)
        while True:
            action = self.child_conn.recv()
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)
            if self.is_render and self.idx == 0:
                self.env.render()
            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)
            if done:
                self.send_result(info['x_pos'])
                self.reset()
                self.request_action(reward, True)
            else:
                self.request_action(reward, False)

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)
        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
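# rgb2dataset is used by the worker above but not defined in this snippet;
# a plausible sketch, assuming it produces the 84x84 grayscale frames the
# 4-frame transition buffer expects:
import cv2
import numpy as np

def rgb2dataset(frame, h=84, w=84):
    # grayscale, resize, and scale to [0, 1]
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (w, h))
    return np.float32(resized) / 255.0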
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared constant is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, assert final_height < resize_height
    # and the image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000
    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            train(model, device, optimizer, data.get_batch(model, device, gamma))

            state = next_state
            env.render()
            # time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))
                break

        epsilon -= (1 / num_eps)

        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
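# preprocess and maxQ are assumed above but not shown. A sketch of what
# preprocess plausibly does, given the [resize_height, width] argument and
# the final_height crop (the exact crop location is an assumption):
import cv2
import numpy as np
import torch

def preprocess(frame, resize_shape, final_height):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (resize_shape[1], resize_shape[0]))  # cv2 wants (w, h)
    cropped = resized[:final_height, :]  # keep the top final_height rows
    return torch.from_numpy(np.float32(cropped) / 255.0).unsqueeze(0)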
class Simulator:
    def __init__(self, movements, max_steps):
        """
        Creates a new Simulator. The Simulator lets individuals play the game
        and assigns their resulting fitness to them.

        :param movements: a list of movements the individuals are allowed to make
        :param max_steps: the maximum number of simulation steps an individual is allowed to use
        """
        self.movements = movements
        self.max_steps = max_steps
        # TODO maybe another name for "env_expanded"?
        self.env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(
            frames_per_step=1, rom_mode='vanilla')
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env_expanded, self.movements)
        # self.env.metadata['video.frames_per_second'] = 120
        # self.env_expanded.metadata['video.frames_per_second'] = 120
        self._log = logging.getLogger('MLProject.Simulator')

    def _simulate_individual(self, individual: Individual, render):
        """
        Simulates a single individual and assigns its fitness score.
        This involves letting the individual play a game of Mario and
        assigning the resulting fitness to the individual.

        :param individual:
        """
        state = self.env.reset()
        x_pos = 0
        last_x_pos = 0
        reward_final = 0
        accumulated_fitness = 0
        died = False
        last_fps_time = time.time()
        frames = 0
        steps_standing_still = 0
        number_of_steps_standing_still_before_kill = 200

        for step in range(self.max_steps):
            self.state_downscaled = get_sensor_map(self.env_expanded)
            action = individual.agent.act(self.state_downscaled)
            # print('\r', _vectofixedstr(action, 12), end=' ')
            action = np.argmax(action)
            state, reward, done, info = self.env.step(action)
            if info['flag_get']:
                accumulated_fitness += x_pos
            x_pos = info['x_pos'] + accumulated_fitness
            reward_final += reward

            # Check whether Mario stood (roughly) still since the last step
            if last_x_pos - 1 <= x_pos <= last_x_pos + 1:
                steps_standing_still += 1
                if steps_standing_still >= number_of_steps_standing_still_before_kill:
                    break
            else:
                steps_standing_still = 0
            last_x_pos = x_pos

            if render:
                self.env.render()
            if info["life"] <= 2:
                died = True
                break
            # now = time.time()
            frames += 1
            """
            if now - last_fps_time >= 1:
                fps = frames / (now - last_fps_time)
                self._log.debug('FPS: {}'.format(fps))
                last_fps_time = now
                frames = 0
            """

        fps = frames / (time.time() - last_fps_time)
        self._log.debug('Steps per second: {:.2f}'.format(fps))
        individual.fitness = x_pos
        # individual.fitness = reward_final
        if died:
            self._log.debug(
                'Individual {} died. It achieved fitness {}'.format(
                    individual.id, individual.fitness))
        else:
            self._log.debug(
                'Individual {} ran out of simulation steps. It achieved fitness {}'
                .format(individual.id, individual.fitness))

    def simulate_generation(self, generation: Generation, render=True):
        """
        Simulates the whole generation and assigns each individual a fitness score.

        :param generation:
        :param render:
        """
        for individual in generation.individuals:
            self._simulate_individual(individual, render)

    def shutdown(self):
        """
        Does nothing. Needed for compatibility with ParallelSimulator.
        """
        pass
print("Made save path at: {}".format(save_dir)) save_path = save_dir / AGENT_FILENAME if Path.is_file(save_path): print("Loading saved agent...") agent.load(save_path) done = False batch_size = 32 for e in range(1, EPISODES + 1): state = env.reset() state = np.reshape(state, [1, state_size]) time = 0 while True: env.render() action = agent.act(state) next_state, reward, done, _ = env.step(action) reward = reward if not done else -10 next_state = np.reshape(next_state, [1, state_size]) agent.remember(state, action, reward, next_state, done) state = next_state if done or time >= 500: print("episode: {}/{}, score: {}, e: {:.2}".format( e, EPISODES, time, agent.epsilon)) break time += 1 if len(agent.memory) > batch_size: agent.replay(batch_size) if e % 10 == 0: agent.save(save_path)
class MarioEnvironment(Process):
    def __init__(self, env_id, is_render, env_idx, child_conn,
                 history_size=4, h=84, w=84):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if life_done:
                # when Mario loses a life, treat the state as terminal
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range is -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward
            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} "
                      "Stage: {} current x:{} max x:{}".format(
                          self.episode, self.env_idx, self.steps, self.rall,
                          np.mean(self.recent_rlist), info['stage'],
                          info['x_pos'], self.max_pos))
                self.history = self.reset()

            # reply to the parent on every step, terminal or not
            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscale
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
class MarioEnvironment(Process):
    def __init__(self, env_id, is_render, env_idx, child_conn,
                 history_size=4, life_done=False, h=84, w=84,
                 movement=COMPLEX_MOVEMENT, sticky_action=True, p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        # use the movement argument rather than hardcoding COMPLEX_MOVEMENT
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = life_done

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            # 4-frame skip
            reward = 0.0
            done = None
            for i in range(4):
                obs, r, done, info = self.env.step(action)
                if self.is_render:
                    self.env.render()
                reward += r
                if done:
                    break

            # when Mario loses a life, treat the state as terminal
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range is -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = int(info.get('flag_get', False))

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} "
                      "Stage: {} current x:{} max x:{}".format(
                          self.episode, self.env_idx, self.steps, self.rall,
                          np.mean(self.recent_rlist), info['stage'],
                          info['x_pos'], self.max_pos))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscale
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
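# A sketch of the parent-side wiring these worker processes assume (not part
# of the original): one Pipe per worker; the parent sends an action index and
# receives [history, reward, force_done, done, log_reward] back each step.
from multiprocessing import Pipe

parent_conn, child_conn = Pipe()
worker = MarioEnvironment('SuperMarioBros-v0', False, 0, child_conn)
worker.start()

for _ in range(10):
    parent_conn.send(0)  # index into COMPLEX_MOVEMENT; 0 is NOOP
    history, reward, force_done, done, log_reward = parent_conn.recv()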
n_y=env.action_space.n,
    learning_rate=0.01,
    reward_decay=0.99)

for episodes in range(EPISODES):
    observation = env.reset()
    observation = np.array(observation).reshape(1, 240, 256, 3)
    episode_reward = 0
    print("episode", episodes)

    while True:
        if RENDER_ENV:
            env.render()

        action = PG.choose_action(observation)
        next_state, reward, done, info = env.step(action)
        PG.store_transition(next_state, action, reward)
        # feed the new frame into the next action choice
        observation = np.array(next_state).reshape(1, 240, 256, 3)

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            print(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

print(env.observation_space)
print(env.get_action_meanings())
print(env.get_keys_to_action())

done = True
totalReward = 0
maxReward = 0
for step in range(1000):
    if done:
        state = env.reset()
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    # print(f"State: {state.shape} {state}")
    # print(f"Info: {info}")
    totalReward += reward
    if totalReward > maxReward:
        maxReward = totalReward
    print(f"{step} ({400 - info['time']}s): {action} -> {reward} (total: {totalReward}, max: {maxReward})")
    env.render(mode='human')
env.close()
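# Note: newer nes_py releases renamed BinarySpaceToDiscreteSpaceEnv to
# JoypadSpace; if the import above fails, the equivalent setup would be:
from nes_py.wrappers import JoypadSpace

env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-1-1-v0'), SIMPLE_MOVEMENT)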
class MarioEnv(Process):
    def __init__(self, env_id, idx, child_conn, queue, s_dim, a_dim,
                 g_net, g_opt, update_iter=10, is_render=False, use_cuda=False):
        super(MarioEnv, self).__init__()
        self.idx = idx
        self.env_id = env_id
        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        # self.n_step = n_step
        self.update_iter = update_iter
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []
        self.use_cuda = use_cuda
        self.device = torch.device("cuda:0" if use_cuda else "cpu")
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.g_net = g_net
        self.g_opt = g_opt
        self.buffer_state = []
        self.buffer_action = []
        self.buffer_reward = []

    def run(self):
        super(MarioEnv, self).run()
        self.model = A3C(
            self.s_dim,
            self.a_dim,
            gamma=0.95,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_length=100000,
            use_cuda=self.use_cuda,
        )
        self.model.l_net.load_state_dict(self.g_net.state_dict())
        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')
        while True:
            if len(self.transition) != 4:
                action = self.model.get_action(self.transition, is_random=True)
            else:
                action = self.model.get_action(self.transition, is_random=False)
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)
            if self.is_render and self.idx == 0:
                self.env.render()

            self.buffer_state.append(self.transition)
            self.buffer_action.append(action)
            self.buffer_reward.append(reward)

            if len(self.buffer_state) > 0 and self.steps % self.update_iter == 0:
                next_transition = self.transition[1:]
                next_transition.append(next_state)
                self.train(next_transition, done)
                self.buffer_state.clear()
                self.buffer_action.clear()
                self.buffer_reward.clear()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)
        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])

    def train(self, next_transition, done):
        if done:
            v_s_ = 0.
        else:
            _, v = self.model.l_net.forward(
                torch.Tensor([next_transition]).to(self.device))
            v_s_ = v.cpu().detach().numpy()[0][0]

        prob, v = self.model.l_net.forward(
            torch.Tensor(self.buffer_state).to(self.device))

        buffer_v_target = []
        for r in self.buffer_reward[::-1]:
            v_s_ = r + self.model.gamma * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_v_target = torch.Tensor(np.array(buffer_v_target)).to(self.device)
        buffer_action = torch.Tensor(np.array(self.buffer_action)).to(self.device)

        # build the loss
        td_error = buffer_v_target - v
        loss_critic = td_error.pow(2)
        dist = torch.distributions.Categorical(prob)
        loss_actor = -dist.log_prob(buffer_action) * td_error.detach()
        loss = (loss_critic + loss_actor).mean()

        self.g_opt.zero_grad()
        loss.backward()
        for lp, gp in zip(self.model.l_net.parameters(), self.g_net.parameters()):
            gp._grad = lp.grad.clone().cpu()
        self.g_opt.step()
        self.model.l_net.load_state_dict(self.g_net.state_dict())
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True

            # Construct the multi-objective reward:
            # [x_pos, time, death, coin, enemy]
            moreward = []

            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where the x position resets after a death
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            if self.life_done:
                # when Mario loses a life, treat the state as terminal
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range is -15 ~ 15
            r = reward / 15
            self.rall += reward
            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1
            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print("[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\t"
                      "Recent MoReward: {}\tcoin: {}\tcurrent x:{}".format(
                          self.episode, self.env_idx, self.steps, score,
                          self.morall, np.mean(self.recent_morlist, axis=0),
                          info['coins'], info['x_pos']))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscale
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared constant is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, assert final_height < resize_height
    # and the image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000
    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    tau = 0
    max_tau = 10000
    decay_step = 0

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height, bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            tau += 1
            decay_step += 1
            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)

            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            if step == max_steps - 1:
                reward -= 10
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))

            state = next_state
            env.render()

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared constant is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, assert final_height < resize_height
    # and the image will be cropped
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width = 128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    gamma = 0.95
    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True

    model.load_state_dict(torch.load(model_file))
    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
        data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

    # initialize memory with 100 experiences
    done = True
    for i in range(100):
        if done:
            state = env.reset()
            state = preprocess(state, [resize_height, width], final_height)
            state = torch.cat((state, state, state, state))

        action = random.randint(0, len(movement) - 1)
        next_state, reward, done, info = env.step(int(action))

        # if reward > 0:
        #     reward = 1
        # else:
        #     reward = -1
        reward /= 15
        if reward == 0:
            reward = -0.1

        next_state = preprocess(next_state, [resize_height, width], final_height)
        next_state = torch.cat((state[1:, :, :], next_state))
        trans = transition(state, action, reward, next_state, done)
        data.add(trans)
        state = next_state

    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    # training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            tau += 1
            # epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(-epsilon_decay * decay_step)
            epsilon = start_epsilon * np.exp(1 - (1 / (cur_x / farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon

            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            cur_x = info['x_pos']
            if cur_x > farthest:
                farthest = cur_x

            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)
            data.update_batch(batch['idx'], np.squeeze(abs_err.numpy()))

            state = next_state
            env.render()
            # time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step

        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)

    env.close()
def process_image(image, x, y, h, w):
    image = image[y:y + h, x:x + w]
    image = convert_to_gray_scale(image)
    return image


# array = csv_train.get_values()
x_list = []
done = True
action = 0
for step in range(10):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(1)
    env.render()
    print(info['x_pos'])
    x = info['x_pos'] - 22
    y = env.unwrapped._y_position
    print(env.unwrapped._y_position)
    print(env.unwrapped._x_position)
    h = 40  # 52
    w = 60  # 20
    print(env.observation_space.shape)
    x_list.append(info['x_pos'])
    image = env.render('rgb_array')
    image = image[y:y + h, x:x + w]
    # image = cv2.resize(image, dsize=(128, 120), interpolation=cv2.INTER_CUBIC)
    print(image.shape)
    image = convert_to_gray_scale(image)
    print(image.shape)
# Build the Bellman target for the Q function
inputs[i:i + 1] = np.expand_dims(state, axis=0)
targets[i] = model.predict(state)
Q_sa = model.predict(state_new)
if done:
    targets[i, action] = reward
else:
    targets[i, action] = reward + gamma * np.max(Q_sa)

# Train the network to output the Q function
model.train_on_batch(inputs, targets)
print('Learning Finished')

# THIRD STEP: Play!
observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()  # render to watch the game running
    Q = model.predict(state)
    action = np.argmax(Q)
    observation, reward, done, info = env.step(action)
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
    tot_reward += reward
print('Game ended! Total reward: {}'.format(tot_reward))
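# The Bellman backup used above, isolated as a tiny worked example
# (gamma and the toy Q-values below are illustrative, not from the original):
import numpy as np

gamma = 0.99
reward = 1.0
Q_sa = np.array([[0.2, 0.5, 0.1]])       # toy Q-values for the next state
target = reward + gamma * np.max(Q_sa)   # 1.0 + 0.99 * 0.5 = 1.495
# this value is what gets written into targets[i, action] when not done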
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(
        frames_per_step=1, rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight,
              gene.innovation_number, gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()
    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):
        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()

        state_downscaled = get_sensor_map(env_expanded)
        action = genome.calculate_action(state_downscaled)
        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)
        state, reward, done, info = env.step(action)

        # filename = get_path_of('all_pictures/mario/')
        # imsave(filename + 'mario_' + str(_) + '.png', state)

        save_state = np.full((13, 10, 3), 255, dtype=np.int)
        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]
        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]
        save_state[(7, 2)] = COLORS[2]
        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))
        # make_controller(movements[action], _, gen)

        env.render()
        if info["life"] <= 2:
            died = True
            break

        ticked = True
        frames += 1
        unticked -= tick_interval

        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)

    env.close()
class MarioEnv(Process):
    def __init__(self, env_id, idx, child_conn, queue, n_step, is_render=False):
        super(MarioEnv, self).__init__()
        self.idx = idx
        self.env_id = env_id
        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = None
        self.prev_xpos = 0
        self.prev_life = 0

    def run(self):
        super(MarioEnv, self).run()
        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')
        self.request_action(0, False)
        while True:
            action = self.child_conn.recv()
            # print(SIMPLE_MOVEMENT[action])
            next_state, reward, done, info = self.env.step(action)
            force_done = False
            if reward == -15:
                force_done = True
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)
            if self.is_render and self.idx == 0:
                self.env.render()
            # make a transition
            self.transition[:3, :, :] = self.transition[1:, :, :]
            self.transition[3, :, :] = next_state
            if done:
                self.send_result(self.prev_xpos)
                self.reset()
                self.request_action(reward, force_done)
            else:
                self.request_action(reward, force_done)
            self.prev_xpos = info['x_pos']

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition = np.zeros([4, 84, 84])
        self.transition[-1, :] = state
        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared constant is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, assert final_height < resize_height
    # and the image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height, bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))
            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # set up the environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        # one-hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())
        # reset graph
        tf.reset_default_graph()
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=np.int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            # if it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

            # get the next state, the rewards, and done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # our new state is now the next_state
                state = next_state

        # the saver will help us save our model
        self.saver = tf.train.Saver()
        # set up the TensorBoard writer
        self.writer = tf.summary.FileWriter("logs/")
        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()
        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
        if explore_probability > exp_exp_tradeoff:
            # take a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Q values of the state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={
                              self.DQNetwork.inputs_: state.reshape((1, *state.shape))
                          })
            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]
        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render the env to a gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls.
            """
            # plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate,
                                           frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get the action from the Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get the action from the Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())
            # initialize the decay step (used to reduce epsilon)
            decay_step = 0
            for episode in range(total_episodes):
                # set step to 0
                step = 0
                # initialize the rewards of the episode
                episode_rewards = []
                # make a new episode and observe the first state
                state = self.env.reset()
                # remember the stack_frames function
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    # print("step:", step)
                    # increase decay_step
                    decay_step += 1
                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    if episode_render:
                        self.env.render()
                    # add the reward to the episode's rewards
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends, so there is no next state
                        next_state = np.zeros((110, 84), dtype=np.int)
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # set step = max_steps to end the episode
                        step = max_steps
                        # get the total reward of the episode
                        total_reward = np.sum(episode_rewards)
                        print("Episode:", episode,
                              "Total reward:", total_reward,
                              "Explore P:", explore_probability,
                              "Training Loss:", loss)
                        # rewards_list.append((episode, total_reward))
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack the frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain a random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []

                    # get the Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s+1
                    for i in range(len(batch)):
                        terminal = dones_mb[i]
                        # if we are in a terminal state, the target simply equals the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={
                            self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb
                        })

                    # write tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNetwork.inputs_: states_mb,
                                           self.DQNetwork.target_Q: targets_mb,
                                           self.DQNetwork.actions_: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save the model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
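# stack_frames is used throughout Agent but never defined here; a common
# implementation sketch (the 110x84 frame size and stack size of 4 are
# assumptions based on the shapes used above):
from collections import deque
import cv2
import numpy as np

def stack_frames(stacked_frames, frame, is_new_episode, stack_size=4, h=110, w=84):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) if frame.ndim == 3 else frame
    resized = cv2.resize(gray, (w, h)) / 255.0
    if is_new_episode:
        # fill the whole deque with the first frame
        stacked_frames = deque([resized] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(resized)
    stacked_state = np.stack(stacked_frames, axis=2)  # h x w x stack_size
    return stacked_state, stacked_frames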
def train(num_episodes, episode_length, learning_rate,
          scenario="deathmatch.cfg", map_path='map02', render=True):
    # discount factor
    discount_factor = 0.99
    # how often to update the experience buffer
    update_frequency = 5
    store_frequency = 50
    # how often to print output
    print_frequency = 1000

    # initialize variables for the total reward and total loss
    total_reward = 0
    total_loss = 0
    old_q_value = 0

    # initialize lists for the episodic rewards and losses
    rewards = []
    losses = []

    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    env.reset()

    actionDRQN = DRQN((240, 256, 3), 11, learning_rate)
    targetDRQN = DRQN((240, 256, 3), 11, learning_rate)

    # experience buffer
    experiences = ExperienceReplay(1000000)

    # model saving
    saver = tf.train.Saver({v.name: v for v in actionDRQN.parameters}, max_to_keep=1)

    # start training: initialize everything for sampling, then store
    # transitions into the buffer at the `store` interval and sample
    # from it at the `sample` interval
    sample = 10
    store = 100

    with tf.Session() as sess:
        # initialize all TensorFlow variables
        sess.run(tf.global_variables_initializer())

        for episode in range(num_episodes):
            # start a new episode
            state = env.reset()
            for frame in range(episode_length):
                env.render()

                action = actionDRQN.prediction.eval(
                    feed_dict={actionDRQN.input: state})
                next_state, reward, done, info = env.step(action)

                # update the reward
                total_reward += reward
                state = next_state

                # break when the game ends
                if done:
                    break

                # store the transition in the buffer
                if (frame % store) == 0:
                    experiences.appendToBuffer((state, action, reward))

                # sample from the buffer
                if (frame % sample) == 0:
                    memory = experiences.sample(1)
                    mem_frame = memory[0][0]
                    mem_reward = memory[0][2]

                    # train
                    Q1 = actionDRQN.output.eval(feed_dict={actionDRQN.input: state})
                    Q2 = targetDRQN.output.eval(feed_dict={targetDRQN.input: mem_frame})

                    # learning rate
                    learning_rate = actionDRQN.learning_rate.eval()

                    # compute the Q value
                    Qtarget = old_q_value + learning_rate * (
                        mem_reward + discount_factor * Q2 - old_q_value)
                    # update the old Q value
                    old_q_value = Qtarget

                    # compute the loss
                    loss = actionDRQN.loss.eval(
                        feed_dict={actionDRQN.target_vector: Qtarget,
                                   actionDRQN.input: mem_frame})
                    # update the total loss
                    total_loss += loss

                    # update both networks
                    actionDRQN.update.run(
                        feed_dict={actionDRQN.target_vector: Qtarget,
                                   actionDRQN.input: mem_frame})
                    targetDRQN.update.run(
                        feed_dict={targetDRQN.target_vector: Qtarget,
                                   targetDRQN.input: mem_frame})

            rewards.append((episode, total_reward))
            losses.append((episode, total_loss))
            total_reward = 0
            total_loss = 0
def run(self):
    global episode
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)

    step = 0

    while episode < EPISODES:
        done = False
        max_x = 40
        no_progress = 0
        score = 0

        state = env.reset()
        # Build the initial history by repeating action 0
        for _ in range(5):
            next_state = state
            state, _, _, _ = env.step(0)

        state = preprocess(state)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 88, 128, 4))

        while not done:
            # Rendering code; seems to cause an error on macOS
            if self.thread_num == 1:
                env.render()

            step += 1
            self.t += 1
            step_reward = 0

            action, policy = self.get_action(history)

            # Take 6 steps with the selected action, mimicking frame skip
            for _ in range(6):
                next_state, reward, done, info = env.step(action)
                score += reward
                step_reward += reward
                if done:
                    break

            # Kill Mario if he makes no progress for 10 seconds
            x_now = info.get('x_pos')
            # handle the x_pos = 65535 wraparound glitch
            if x_now == 65535:
                x_now = max_x
            if max_x <= x_now:
                max_x = x_now
                no_progress = 0
            else:
                no_progress += 1
            if no_progress == 150:
                done = True
                reward -= 1
                step_reward -= 1
                score -= 1
                print("#", self.thread_num, " STUCK")

            # Preprocess each state
            next_state = preprocess(next_state)
            next_state = np.reshape([next_state], (1, 88, 128, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Average policy max value
            self.avg_p_max += np.amax(
                self.actor.predict(np.float32(history / 255.)))

            # Append the sample
            self.append_sample(history, action, step_reward)
            history = next_history

            if self.t >= self.t_max or done:
                # if done:
                self.train_model(done)
                self.update_local_model()
                self.t = 0

            if done:
                # Record training information
                episode += 1
                print("#", self.thread_num, " episode:", episode,
                      " score:", format(score, '.2f'), " step:", step,
                      "max_x :", max_x)
                stats = [score, self.avg_p_max / float(step), step]
                for i in range(len(stats)):
                    self.sess.run(self.update_ops[i],
                                  feed_dict={
                                      self.summary_placeholders[i]: float(stats[i])
                                  })
                summary_str = self.sess.run(self.summary_op)
                self.summary_writer.add_summary(summary_str, episode + 1)
                self.avg_p_max = 0
                self.avg_loss = 0
                step = 0