from contextlib import contextmanager


@contextmanager
def timed(msg):
    # cf. /algo/trpo_mpi/200lines; executed ~12 times per training iteration.
    # This is purely an information-reporting step.
    if rank == 0:
        print(colorize(msg, color='magenta'))
        tstart = time.time()
        yield
        print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
    else:
        yield
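# A minimal usage sketch for `timed` (illustrative, not part of the original
# file); it assumes the module-level `rank` and `colorize` used above, and
# `time.sleep` merely stands in for real work such as a gradient computation.
def _timed_usage_example():
    with timed("doing some work"):
        time.sleep(0.5)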
def traj_segment_generator(pi, env, horizon, stochastic, visualize=False):
    # Initialize state variables
    t = 0
    ac = env.action_space.sample()
    new = True
    rew = 0.0
    ob = env.reset()

    cur_ep_ret = 0
    cur_ep_len = 0
    ep_rets = []
    ep_lens = []

    # Initialize history arrays
    obs = np.array([ob for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac for _ in range(horizon)])
    prevacs = acs.copy()

    save = False
    playerScore = opponentScore = 0
    wins = losses = ties = gamesTotal = totalPlayer = totalOpponent = 0

    while True:
        prevac = ac
        ac, vpred = pi.act(stochastic, ob)
        # Slight weirdness here because we need value function at time T
        # before returning segment [0, T-1] so we get the correct
        # terminal value
        if t > 0 and t % horizon == 0:
            yield {
                "ob": obs, "rew": rews, "vpred": vpreds, "new": news,
                "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
                "ep_rets": ep_rets, "ep_lens": ep_lens
            }
            _, vpred = pi.act(stochastic, ob)
            # Be careful!!! if you change the downstream algorithm to aggregate
            # several of these batches, then be sure to do a deepcopy
            ep_rets = []
            ep_lens = []
        i = t % horizon
        obs[i] = ob
        vpreds[i] = vpred
        news[i] = new
        acs[i] = ac
        prevacs[i] = prevac

        ob, rew, new, _ = env.step(ac)
        if visualize:
            env.render()
        if rew > 0:
            playerScore += abs(rew)
        else:
            opponentScore += abs(rew)

        rews[i] = rew
        cur_ep_ret += rew
        cur_ep_len += 1
        if new:
            msg = "End of game: score %d - %d" % (playerScore, opponentScore)
            print(colorize(msg, color='red'))
            gamesTotal += 1
            if playerScore - opponentScore > 100:
                save = True
            if playerScore > opponentScore:
                wins += 1
            elif opponentScore > playerScore:
                losses += 1
            else:
                ties += 1
            totalPlayer += playerScore
            totalOpponent += opponentScore
            playerScore = opponentScore = 0
            msg = ("Status so far: \nGames played - %d wins - %d losses - %d ties - %d\n"
                   " Total score: %d - %d"
                   % (gamesTotal, wins, losses, ties, totalPlayer, totalOpponent))
            print(colorize(msg, color='red'))

            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret = 0
            cur_ep_len = 0
            ob = env.reset()
        t += 1
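# Downstream, a learner turns these fixed-length segments into advantages.
# The sketch below mirrors the GAE(lambda) pass that baselines' trpo_mpi
# applies to exactly this segment layout; `gamma` and `lam` are the usual
# discount and GAE hyperparameters, and "nextvpred" provides the bootstrap
# value when a segment is cut mid-episode.
def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)  # the appended 0 only guards the terminal check
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]  # TD(lambda) value targets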
def traj_episode_generator(pi, env, horizon, stochastic, render, downsample):
    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode
    ob = env.reset()

    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode

    # Initialize history arrays
    obs = []
    rews = []
    news = []
    acs = []

    playerScore = opponentScore = 0
    wins = losses = ties = gamesTotal = totalPlayer = totalOpponent = 0

    while True:
        prevac = ac
        ac, vpred = pi.act(stochastic, ob)
        # Store the full-resolution screen when not downsampling; otherwise
        # store the (possibly downsampled) observation itself.
        if not downsample and hasattr(env.unwrapped, 'ale'):
            obs.append(env.unwrapped.ale.getScreenRGB2())
        else:
            obs.append(ob)
        news.append(new)
        acs.append(ac)

        ob, rew, new, _ = env.step(ac)
        rews.append(rew)
        # Track the running score: positive rewards go to the agent,
        # negative rewards to the opponent.
        if rew > 0:
            playerScore += abs(rew)
        else:
            opponentScore += abs(rew)

        cur_ep_ret += rew
        cur_ep_len += 1
        if render:
            env.render()
        if t > 0 and (new or t % horizon == 0):
            msg = "End of game: score %d - %d" % (playerScore, opponentScore)
            print(colorize(msg, color='red'))
            gamesTotal += 1
            if playerScore > opponentScore:
                wins += 1
            elif opponentScore > playerScore:
                losses += 1
            else:
                ties += 1
            totalPlayer += playerScore
            totalOpponent += opponentScore
            playerScore = opponentScore = 0
            msg = ("Status so far: \nGames played - %d wins - %d losses - %d ties - %d\n"
                   " Total score: %d - %d"
                   % (gamesTotal, wins, losses, ties, totalPlayer, totalOpponent))
            print(colorize(msg, color='red'))

            # convert lists into numpy arrays
            obs = np.array(obs)
            rews = np.array(rews)
            news = np.array(news)
            acs = np.array(acs)
            yield {
                "ob": obs, "rew": rews, "new": news, "ac": acs,
                "ep_ret": cur_ep_ret, "ep_len": cur_ep_len
            }
            ob = env.reset()
            cur_ep_ret = 0
            cur_ep_len = 0
            t = 0
            # Reinitialize history arrays
            obs = []
            rews = []
            news = []
            acs = []
        t += 1
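# A hedged sketch (not in the original file) of recording whole episodes with
# traj_episode_generator and pickling them in the format replayACS below
# expects: a list of trajectory dicts carrying an "ac" array. `pi`, `env`,
# the horizon, and the file name are illustrative assumptions.
def _record_trajectories_example(pi, env, n_episodes=5, horizon=100000,
                                 path="trajectories.pkl"):
    ep_gen = traj_episode_generator(pi, env, horizon, stochastic=False,
                                    render=False, downsample=True)
    trajectories = [next(ep_gen) for _ in range(n_episodes)]
    with open(path, 'wb') as wfp:
        pkl.dump(trajectories, wfp)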
def playAtari(env, agent, U, modelPath, transpose=True, stochastic=False, fps=30,
              zoom=None, delay=None):
    """
    Plays an Atari game with an agent acting according to a trained model
    :param env: Atari environment
    :param agent: agent's policy model (neural network)
    :param U: baselines' tf_util module
    :param modelPath: path to trained model
    :param transpose: whether to transpose the video frames
    :param stochastic: whether the policy acts stochastically
    :param fps: frames per second for rendering
    :param zoom: screen magnification factor (defaults to 1)
    :param delay: start delay in seconds
    :return:
    """
    global obs
    U.initialize()
    U.load_state(modelPath)

    # Unwrap the env until we find an ActionWrapper (if any), whose screen_space
    # describes the raw screen rather than the (possibly RAM) observation space.
    tempEnv = env
    while not isinstance(tempEnv, ActionWrapper):
        try:
            tempEnv = tempEnv.env
        except AttributeError:
            break
    if isinstance(tempEnv, ActionWrapper):
        obs_s = tempEnv.screen_space
    else:
        obs_s = env.observation_space

    assert type(obs_s) == Box
    assert len(obs_s.shape) == 2 or (len(obs_s.shape) == 3 and obs_s.shape[2] in [1, 3])

    if zoom is None:
        zoom = 1
    video_size = int(obs_s.shape[0] * zoom), int(obs_s.shape[1] * zoom)
    if transpose:
        video_size = tuple(reversed(video_size))

    # setup the screen using pygame
    flags = RESIZABLE | HWSURFACE | DOUBLEBUF
    screen = pygame.display.set_mode(video_size, flags)
    pygame.event.set_blocked(pygame.MOUSEMOTION)
    clock = pygame.time.Clock()
    # =================================================================================================================
    observation = env.reset()
    running = True
    envDone = False
    obs = None  # screen buffer; filled below once the env exposes image data
    playerScore = opponentScore = 0
    wins = losses = ties = gamesTotal = totalPlayer = totalOpponent = 0

    print("Get ready...")
    if delay is not None:
        time.sleep(delay)

    while running:
        pygame.event.get()
        if envDone:
            # results of game:
            msg = "End of game: score %d - %d" % (playerScore, opponentScore)
            print(colorize(msg, color='red'))
            gamesTotal += 1
            if playerScore > opponentScore:
                wins += 1
            elif opponentScore > playerScore:
                losses += 1
            else:
                ties += 1
            totalPlayer += playerScore
            totalOpponent += opponentScore
            playerScore = opponentScore = 0
            msg = ("Status so far: \nGames played - %d wins - %d losses - %d ties - %d\n"
                   " Total score: %d - %d"
                   % (gamesTotal, wins, losses, ties, totalPlayer, totalOpponent))
            print(colorize(msg, color='red'))
            envDone = False
            observation = env.reset()
        else:
            action, value_predicted = agent.act(stochastic, observation)
            if not isinstance(action, list):
                action = np.atleast_1d(action)
            observation, reward, envDone, info = env.step(action)
            # keep track of the player's score:
            if reward > 0:
                playerScore += abs(reward)
            else:
                opponentScore += abs(reward)

        # The policy may consume RAM observations, so fetch raw screen pixels
        # for rendering the game with pygame:
        if hasattr(env, 'getImage'):
            obs = env.getImage()
        elif hasattr(env.unwrapped, 'ale'):
            obs = env.unwrapped.ale.getScreenRGB2()

        if obs is not None:
            if len(obs.shape) == 2:
                obs = obs[:, :, None]
            if obs.shape[2] == 1:
                obs = obs.repeat(3, axis=2)
            display_arr(screen, obs, video_size, transpose)

        pygame.display.flip()
        clock.tick(fps)
    pygame.quit()
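# A hedged driver sketch for playAtari (the helper names are illustrative
# assumptions, not from the original file): build the env and policy the same
# way training did, open a TF session via baselines' tf_util, then hand
# everything over; playAtari itself initializes and loads the weights.
def _play_atari_example(make_env, make_policy, model_path):
    import baselines.common.tf_util as tf_util
    env = make_env()  # e.g. the wrapped gym Atari env used during training
    with tf_util.make_session(num_cpu=1):
        pi = make_policy(env.observation_space, env.action_space)
        playAtari(env, pi, tf_util, model_path, stochastic=False, fps=30, zoom=3)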
def replayACS(env, modelPath, transpose=True, fps=30, zoom=None):
    """
    Replays a game from recorded trajectories by re-executing the saved actions
    This method is not exact: it recovers environment states indirectly from
    actions, so it can drift out of sync and distort the real trajectory.
    :param env: Atari environment
    :param modelPath: path to the pickled trajectories
    :param transpose: whether to transpose the video frames
    :param fps: frames per second for rendering
    :param zoom: screen magnification factor (defaults to 1)
    :return:
    """
    global obs
    with open(modelPath, 'rb') as rfp:
        trajectories = pkl.load(rfp)

    U.make_session(num_cpu=1).__enter__()
    U.initialize()

    # Unwrap the env until we find an ActionWrapper (if any):
    tempEnv = env
    while not isinstance(tempEnv, ActionWrapper):
        try:
            tempEnv = tempEnv.env
        except AttributeError:
            break
    if isinstance(tempEnv, ActionWrapper):
        obs_s = tempEnv.screen_space
    else:
        obs_s = env.observation_space

    assert type(obs_s) == Box
    assert len(obs_s.shape) == 2 or (len(obs_s.shape) == 3 and obs_s.shape[2] in [1, 3])

    if zoom is None:
        zoom = 1
    video_size = int(obs_s.shape[0] * zoom), int(obs_s.shape[1] * zoom)
    if transpose:
        video_size = tuple(reversed(video_size))

    # setup the screen using pygame
    flags = RESIZABLE | HWSURFACE | DOUBLEBUF
    screen = pygame.display.set_mode(video_size, flags)
    pygame.event.set_blocked(pygame.MOUSEMOTION)
    clock = pygame.time.Clock()
    # =================================================================================================================
    running = True
    envDone = False
    playerScore = opponentScore = 0
    wins = losses = ties = gamesTotal = totalPlayer = totalOpponent = 0

    while running:
        trl = len(trajectories)
        for i in range(trl):
            obs = env.reset()
            print("\nRunning trajectory {}".format(i))
            print("Length {}".format(len(trajectories[i]['ac'])))
            for ac in tqdm(trajectories[i]['ac']):
                if not isinstance(ac, list):
                    ac = np.atleast_1d(ac)
                obs, reward, envDone, info = env.step(ac)
                # keep track of the player's score:
                if reward > 0:
                    playerScore += abs(reward)
                else:
                    opponentScore += abs(reward)

                if hasattr(env, 'getImage'):
                    obs = env.getImage()
                if obs is not None:
                    if len(obs.shape) == 2:
                        obs = obs[:, :, None]
                    if obs.shape[2] == 1:
                        obs = obs.repeat(3, axis=2)
                    display_arr(screen, obs, video_size, transpose)

                pygame.display.flip()
                clock.tick(fps)

            msg = "End of game: score %d - %d" % (playerScore, opponentScore)
            print(colorize(msg, color='red'))
            gamesTotal += 1
            if playerScore > opponentScore:
                wins += 1
            elif opponentScore > playerScore:
                losses += 1
            else:
                ties += 1
            totalPlayer += playerScore
            totalOpponent += opponentScore
            playerScore = opponentScore = 0
            msg = ("Status so far: \nGames played - %d wins - %d losses - %d ties - %d\n"
                   " Total score: %d - %d"
                   % (gamesTotal, wins, losses, ties, totalPlayer, totalOpponent))
            print(colorize(msg, color='red'))
    pygame.quit()
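# A hedged driver sketch for replayACS (the env id and pickle name are
# illustrative assumptions): replayACS opens its own TF session, so a raw env
# and the path to recorded trajectories are all it needs.
def _replay_acs_example():
    import gym
    env = gym.make("PongNoFrameskip-v4")
    replayACS(env, "trajectories.pkl", transpose=True, fps=30, zoom=3)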