def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    # NOTE: AttentionNet is batch first.
    initial_agent_state = tuple(
        torch.cat(ts, dim=0)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    )
    timings.time("device")
    return batch, initial_agent_state
def get_batch(
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    flags,
    timings,
    lock=threading.Lock(),
) -> typing.Dict[str, torch.Tensor]:
    with lock:
        timings.time('lock')
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time('dequeue')
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    timings.time('batch')
    for m in indices:
        free_queue.put(m)
    timings.time('enqueue')
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    timings.time('device')
    return batch
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    timings.time("device")
    return batch
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    # Stack the (h, c) pairs across actors; torch.stack takes dim, not axis.
    initial_agent_state = [
        torch.stack([initial_agent_state_buffers[m][i][0] for m in indices], dim=0)
        for i in range(2)
    ]
    # print("initial_agent_state[0].shape: ", initial_agent_state[0].shape)
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = [
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    ]
    timings.time("device")
    return batch, initial_agent_state
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers,
    initial_agent_state_buffers,
    lock=threading.Lock(),
):
    with lock:
        indices = [full_queue.get() for _ in range(flags.batch_size)]
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    initial_agent_state = (
        torch.cat(ts, dim=1)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    for m in indices:
        free_queue.put(m)
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    )
    return batch, initial_agent_state
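# ---------------------------------------------------------------------------
# Context: all of the get_batch() variants above are driven the same way from
# a learner thread. The sketch below is illustrative only; get_batch and learn
# stand for the variants in this file, and the monobeast-style flags
# (total_steps, batch_size, ...) are assumed rather than taken from any one
# snippet.
# ---------------------------------------------------------------------------
def batch_and_learn(flags, model, learner_model, optimizer, free_queue,
                    full_queue, buffers, initial_agent_state_buffers, timings):
    steps = 0
    while steps < flags.total_steps:
        batch, initial_agent_state = get_batch(
            flags, free_queue, full_queue, buffers,
            initial_agent_state_buffers, timings)
        # learn() is assumed to run the V-trace update on the batch and to
        # return the number of environment steps it consumed (T * B).
        steps += learn(flags, model, learner_model, batch,
                       initial_agent_state, optimizer)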
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
    level_name,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        # changed next line: create_env now takes the level name and seed.
        gym_env = create_env(flags, level_name, seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time("model")

                env_output = env.step(agent_output["action"])
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)

        gym_env = create_env(
            flags.env,
            savedir=flags.rundir,
            archivefile="nethack.%i.%%(pid)i.%%(time)s.zip" % actor_index,
        )
        env = ResettingEnvironment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                env_output = env.step(agent_output["action"])

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
            full_queue.put(index)
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise
def act(
    i: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    flags,
):
    try:
        logging.info('Actor %i started.', i)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = Net.create_env(flags)
        seed = i ^ int.from_bytes(os.urandom(4), byteorder='little')
        gym_env.seed(seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_output = model(env_output)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output = model(env_output)
                timings.time('model')

                env_output = env.step(agent_output['action'])
                timings.time('step')

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time('write')
            full_queue.put(index)

        if i == 0:
            logging.info('Actor %i: %s', i, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error('Exception in worker process %i', i)
        traceback.print_exc()
        print()
        raise e
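# ---------------------------------------------------------------------------
# Context: every act() variant above expects the same queue handshake. A
# minimal sketch of the main-process wiring, following the monobeast pattern;
# flags, model, buffers and initial_agent_state_buffers are assumed to exist
# as in the snippets above, and this function is illustrative, not part of
# any one of them.
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

def start_actors(flags, model, buffers, initial_agent_state_buffers):
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    # Hand every rollout-buffer index to the actors before they start.
    for m in range(flags.num_buffers):
        free_queue.put(m)

    actor_processes = []
    for actor_index in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(flags, actor_index, free_queue, full_queue, model, buffers,
                  initial_agent_state_buffers),
        )
        actor.start()
        actor_processes.append(actor)
    return free_queue, full_queue, actor_processes

# Shutdown works by sentinel: putting one None per actor onto free_queue makes
# free_queue.get() return None inside act(), which breaks the actor loop; the
# main process then joins the actor processes.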
def main():
    episode = 0
    path = "/home/michael/dev/fyp/AIGym/MP-Conv-Pong/"
    mp.set_start_method('spawn')
    worker_count = 3  # mp.cpu_count()
    learning_rate = 1e-3

    model = Model(2)
    criterion = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if episode > 0:
        model.load_state_dict(torch.load(path + "Models/" + str(episode)))
        optimizer.load_state_dict(torch.load(path + "Optimizers/" + str(episode)))

    model.cuda()
    model.share_memory()

    envs = [gym.make("Pong-v0") for i in range(worker_count)]
    epoch_size = 1
    batch_save = 128
    best_score = None
    running_reward = None
    reward_queue = SimpleQueue()

    # Start workers.
    workers = [
        Worker(envs[i], epoch_size, model, criterion, optimizer, reward_queue,
               str(i + 1))
        for i in range(worker_count)
    ]
    for w in workers:
        w.start()

    # Gather rewards.
    while True:
        reward = reward_queue.get()
        if isinstance(reward, Exception):
            print(reward)
        else:
            episode += 1
            if episode % batch_save == 0:
                torch.save(model.state_dict(), path + "Models/" + str(episode))
                torch.save(optimizer.state_dict(),
                           path + "Optimizers/" + str(episode))
            if best_score is None:
                best_score = reward
            elif reward > best_score:
                best_score = reward
            running_reward = (reward if running_reward is None
                              else running_reward * 0.95 + reward * 0.05)
            print("episode {:4.0f} complete - average reward = {:3.0f}, "
                  "last score was = {:3.0f}, best score is = {:3.0f}".format(
                      episode, running_reward, reward, best_score))
def _generate_parallel(self, iteration, network, device, num_workers):
    q, r = divmod(self.remaining_games, num_workers)
    num_active_workers = Value('i', num_workers)
    resign_threshold = Value('d', self.resign_mgr.threshold())
    evaluator_mgr = BulkEvaluatorManager([network], device, num_workers)
    output_queue = SimpleQueue()

    # start the workers
    workers = []
    for worker_id in range(num_workers):
        num_games = q + 1 if worker_id < r else q
        evaluator = evaluator_mgr.get_evaluator(worker_id, 0)
        worker = Process(
            target=self._worker_job,
            args=(worker_id, num_games, num_active_workers, resign_threshold,
                  evaluator, output_queue),
        )
        workers.append(worker)
        worker.start()

    # start evaluator server
    server = evaluator_mgr.get_server(num_active_workers)
    server.start()

    # collect the examples generated by workers
    while num_active_workers.value > 0 or not output_queue.empty():
        examples, resign_value_history, result = output_queue.get()
        self.example_pool += examples
        self.game_length.append(len(examples))

        # add the history into the resignation manager to update the threshold
        if resign_value_history is not None:
            self.resign_mgr.add(resign_value_history, result)
            resign_threshold.value = self.resign_mgr.threshold()

        self.remaining_games -= 1

        # periodically save the progress
        if (self.conf.GAMES_PER_ITERATION - self.remaining_games) \
                % self.conf.EXAMPLE_POOL_SAVE_FREQUENCY == 0:
            self.save(iteration)
            log.info(
                f'[iter={iteration}] ExamplePool: checkpoint saved, '
                f'{self.remaining_games} games remaining'
            )

    for worker in workers:
        worker.join()
    server.join()
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    # need to make sure that we wait until batch_size trajectories/rollouts
    # have been put into the queue
    with lock:
        timings.time("lock")
        # get the indices of actors "offering" trajectories/rollouts to be
        # processed by the learner
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    # create the batch as a dictionary for all the data in the buffers (see the
    # act() function for the list of keys), where each entry is a tensor of
    # these values stacked across actors along the first dimension, which I
    # believe should be the "batch dimension" (see _format_frame())
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    # similar for the initial agent states, where I think the tuples are
    # concatenated to become torch tensors
    initial_agent_state = (
        torch.cat(ts, dim=1)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    timings.time("batch")
    # once the data has been "transferred" into batch and initial_agent_state,
    # signal to the actors that the data has been processed
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    # move the data to the right device (e.g. GPU)
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    )
    timings.time("device")
    return batch, initial_agent_state
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        # TODO: Check whether emptying full_queue and then re-adding to it takes
        # very long; it seems like the only way to ensure a batch of
        # similar-length elements. One problem with doing this is that if we get
        # a really short trajectory, we may never end up using it.
        # DON'T CHANGE THIS FOR NOW.
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    initial_agent_state = (
        torch.cat(ts, dim=1)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    )
    timings.time("device")
    return batch, initial_agent_state
def act(
    i: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
    flags,
):
    try:
        log.info('Actor %i started.', i)
        timings = prof.Timings()

        gym_env = create_env(flags)
        seed = i ^ int.from_bytes(os.urandom(4), byteorder='little')
        gym_env.seed(seed)
        if flags.num_input_frames > 1:
            gym_env = FrameStack(gym_env, flags.num_input_frames)
        env = Environment(gym_env, fix_seed=flags.fix_seed, env_seed=flags.env_seed)

        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # Use a distinct loop variable here: reusing `i` would shadow the
            # actor index and break the `if i == 0` logging below.
            for j, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][j][...] = tensor

            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time('model')

                env_output = env.step(agent_output['action'])
                timings.time('step')

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time('write')
            full_queue.put(index)

        if i == 0:
            log.info('Actor %i: %s', i, timings.summary())
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logging.error('Exception in worker process %i', i)
        traceback.print_exc()
        print()
        raise e
def act(
    flags,
    gym_env,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    actor_buffers: Buffers,
    actor_model_queues: List[mp.SimpleQueue],
    actor_env_queues: List[mp.SimpleQueue],
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        # seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        # gym_env.seed(seed)
        if flags.agent in ["CNN"]:
            env = environment.Environment(gym_env, "image")
        elif flags.agent in ["NLM", "KBMLP", "GCN"]:
            if flags.state in ["relative", "integer", "block"]:
                env = environment.Environment(gym_env, "VKB")
            elif flags.state == "absolute":
                env = environment.Environment(gym_env, "absVKB")
        env_output = env.initial()
        for key in env_output:
            actor_buffers[key][actor_index][0] = env_output[key]
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in actor_buffers:
                buffers[key][index][0] = actor_buffers[key][actor_index][0]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                actor_model_queues[actor_index].put(actor_index)
                env_info = actor_env_queues[actor_index].get()
                if env_info == "exit":
                    return
                timings.time("model")

                env_output = env.step(actor_buffers["action"][actor_index][0])
                timings.time("step")

                for key in actor_buffers:
                    buffers[key][index][t + 1] = actor_buffers[key][actor_index][0]
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in env_output:
                    actor_buffers[key][actor_index][0] = env_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
            epoch_reward += r
            history["observation"].append(x.cpu())
            history["reward"].append(torch.FloatTensor(1).fill_(r))

            self.reward_queue.put(epoch_reward)
            dataset = compileHistory(history)
            train(self.model, self.criterion, self.optimizer, dataset)
        except Exception as identifier:
            self.reward_queue.put(identifier)
            self.reward_queue.put(traceback.format_exc())


if __name__ == '__main__':
    mp.set_start_method('spawn')
    learning_rate = 1e-3

    model = Model(2)
    model.cuda()
    model.share_memory()
    criterion = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    env = gym.make("Pong-v0")
    epoch_size = 1
    queue = SimpleQueue()

    worker = Worker(env, epoch_size, model, criterion, optimizer, queue, "test")
    worker.start()

    while True:
        print(queue.get())
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    normalizers = [StaticNormalizer(config.state_dim)
                   for _ in range(config.num_workers)]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [Worker(id, normalizers[id], task_queue, result_queue, stop, config)
               for id in range(config.num_workers)]
    for w in workers:
        w.start()

    opt = cma.CMAOptions()
    opt['tolfun'] = -config.target
    opt['popsize'] = config.pop_size
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    opt['maxiter'] = sys.maxsize
    es = cma.CMAEvolutionStrategy(config.initial_weight, config.sigma, opt)

    total_steps = 0
    initial_time = time.time()
    training_rewards = []
    training_steps = []
    training_timestamps = []
    test_mean, test_ste = test(config, config.initial_weight, stats)
    logger.info('total steps %d, %f(%f)' % (total_steps, test_mean, test_ste))
    training_rewards.append(test_mean)
    training_steps.append(0)
    training_timestamps.append(0)
    while True:
        solutions = es.ask()
        for id, solution in enumerate(solutions):
            task_queue.put((id, solution))
        while not task_queue.empty():
            continue
        result = []
        while len(result) < len(solutions):
            if result_queue.empty():
                continue
            result.append(result_queue.get())
        result = sorted(result, key=lambda x: x[0])
        total_steps += np.sum([r[2] for r in result])
        cost = [r[1] for r in result]
        best_solution = solutions[np.argmin(cost)]
        elapsed_time = time.time() - initial_time
        test_mean, test_ste = test(config, best_solution, stats)
        logger.info('total steps %d, test %f(%f), best %f, elapsed time %f' %
                    (total_steps, test_mean, test_ste, -np.min(cost), elapsed_time))
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        # with open('data/%s-best_solution_%s.bin' % (TAG, config.task), 'wb') as f:
        #     pickle.dump(solutions[np.argmin(result)], f)
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break
        cost = fitness_shift(cost)
        es.tell(solutions, cost)
        # es.disp()
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

    stop.value = True
    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
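# ---------------------------------------------------------------------------
# Context: the evolutionary train() loops (above and further below) assume
# workers that pop candidate solutions from task_queue and push
# (id, cost, steps) tuples to result_queue. The Worker below is an
# illustrative sketch matching the CMA-ES variant's protocol, not the
# repository's actual class; evaluate() is an assumed rollout helper.
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

class Worker(mp.Process):
    def __init__(self, id, normalizer, task_queue, result_queue, stop, config):
        super().__init__()
        self.id = id
        self.normalizer = normalizer
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.stop = stop
        self.config = config

    def run(self):
        # Poll for tasks until the trainer raises the shared stop flag.
        while not self.stop.value:
            if self.task_queue.empty():
                continue
            id, solution = self.task_queue.get()
            # evaluate() is assumed to return (episode_return, env_steps).
            reward, steps = evaluate(solution, self.normalizer, self.config)
            # CMA-ES minimizes, so report the negated return as cost.
            self.result_queue.put((id, -float(reward), steps))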
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        mems, mem_padding = None, None
        agent_output, unused_state, mems, mem_padding, _ = model(
            env_output, agent_state, mems, mem_padding)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # No need to explicitly set 'done' to False here: we now step out of
            # the done state as soon as we arrive at 'done' (see below).
            # env_output['done'] = torch.tensor([0], dtype=torch.uint8)

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do one new rollout, until flags.unroll_length or the episode ends.
            t = 0
            while t < flags.unroll_length and not env_output['done'].item():
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state, mems, mem_padding, _ = model(
                        env_output, agent_state, mems, mem_padding)
                timings.time("model")

                # TODO: Shakti add action repeat?
                env_output = env.step(agent_output["action"])
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")
                t += 1

            # The in-loop `if done: mems = None` was removed since it could never
            # trigger there; the memory reset is handled here instead.
            if env_output['done'].item():
                mems = None
                # Take an arbitrary step to reset the environment.
                env_output = env.step(torch.tensor([2]))

            if t != flags.unroll_length:
                # TODO: I checked and this seems good, but Shakti can you check as well?
                buffers['done'][index][t + 1:] = torch.tensor(
                    [True]).repeat(flags.unroll_length - t)

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        # print()
        raise e
def act(
    args,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = Timings()  # Keep track of how fast things are.

        gym_env = create_env(args)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = Environment(gym_env)

        def make_env(args):
            def thunk():
                env = create_env(args)
                return env
            return thunk

        envs = DummyVecEnv([make_env(args) for i in range(1)])

        env_output = env.initial()
        envs.reset()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(args.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                # timings.time("model")

                env_output = env.step(agent_output["action"])
                # envs.step((torch.randint(0, envs.action_space.n, (envs.num_envs,))).numpy())
                assert agent_output["action"] == env_output["last_action"]
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params['env'], flags.map_name, random_seed=seed)
        obs_processer = IMPALA_ObsProcesser(action_table=model.action_table,
                                            **game_params['obs_processer'])
        env = environment.Environment(sc_env, obs_processer, seed)

        # initial rollout starts here
        env_output = env.initial()
        with torch.no_grad():
            agent_output = model.actor_step(env_output)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key not in ['sc_env_action']:  # no need to save this key on buffers
                    buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])
                timings.time("step")

                with torch.no_grad():
                    agent_output = model.actor_step(env_output)
                timings.time("model")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    if key not in ['sc_env_action']:  # no need to save this key on buffers
                        buffers[key][index][t + 1, ...] = agent_output[key]

                # env_output will be like
                #   s_{0}, ..., s_{T}
                #   act_mask_{0}, ..., act_mask_{T}
                #   discount_{0}, ..., discount_{T}
                #   r_{-1}, ..., r_{T-1}
                # agent_output will be like
                #   a_0, ..., a_T with a_t ~ pi(.|s_t)
                #   log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    param = torch.FloatTensor(torch.from_numpy(config.initial_weight))
    param.share_memory_()
    n_params = len(param.numpy().flatten())
    if config.args.noise_type == 'lss':
        noise_sizes = [
            config.state_dim * config.hidden_size,
            config.hidden_size * config.hidden_size,
            config.hidden_size * config.action_dim,
        ]
    else:
        noise_sizes = None
    noise_generator = NoiseGenerator(n_params, config.pop_size,
                                     config.args.noise, noise_sizes=noise_sizes)
    normalizers = [
        StaticNormalizer(config.state_dim) for _ in range(config.num_workers)
    ]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [
        Worker(id, param, normalizers[id], task_queue, result_queue, stop,
               noise_generator, config) for id in range(config.num_workers)
    ]
    for w in workers:
        w.start()

    training_rewards = []
    training_steps = []
    training_timestamps = []
    initial_time = time.time()
    total_steps = 0
    iteration = 0
    while not stop.value:
        test_mean, test_ste = test(config, param.numpy(), stats)
        elapsed_time = time.time() - initial_time
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        logger.info('Test: total steps %d, %f(%f), elapsed time %d' %
                    (total_steps, test_mean, test_ste, elapsed_time))

        for i in range(config.pop_size):
            task_queue.put(i)
        rewards = []
        epsilons = []
        steps = []
        while len(rewards) < config.pop_size:
            if result_queue.empty():
                continue
            epsilon, fitness, step = result_queue.get()
            epsilons.append(epsilon)
            rewards.append(fitness)
            steps.append(step)
        total_steps += np.sum(steps)
        r_mean = np.mean(rewards)
        r_std = np.std(rewards)
        # rewards = (rewards - r_mean) / r_std
        logger.info('Train: iteration %d, %f(%f)' %
                    (iteration, r_mean, r_std / np.sqrt(config.pop_size)))
        iteration += 1
        # if r_mean > config.target:
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break

        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

        if config.args.reward_type == 'rank':
            rewards = fitness_shift(rewards)
        gradient = np.asarray(epsilons) * np.asarray(rewards).reshape((-1, 1))
        gradient = np.mean(gradient, 0) / config.sigma
        gradient -= config.weight_decay * gradient
        if config.args.opt == 'adam':
            gradient = config.opt.update(gradient)
        gradient = torch.FloatTensor(gradient)
        param.add_(config.learning_rate * gradient)

    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
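# ---------------------------------------------------------------------------
# Context: the parameter update above is the plain evolution-strategies
# gradient estimator: average epsilon_i * r_i over the population, scaled by
# 1/sigma. A tiny self-contained numeric check with made-up values:
# ---------------------------------------------------------------------------
import numpy as np

sigma = 0.1
epsilons = np.asarray([[0.2, -0.1],
                       [-0.3, 0.4]])      # one perturbation per row
rewards = np.asarray([1.0, -1.0])         # (possibly rank-shaped) returns

gradient = np.mean(epsilons * rewards.reshape((-1, 1)), axis=0) / sigma
# gradient == array([ 2.5, -2.5]): the step points toward the perturbation
# that scored better, which is what param.add_(learning_rate * gradient) uses.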
def act(
    flags,
    env: str,
    task: int,
    full_action_space: bool,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        # create the environment from command line parameters
        # => could also create a special one which operates on a list of games
        #    (which we need)
        gym_env = create_env(
            env,
            frame_height=flags.frame_height,
            frame_width=flags.frame_width,
            gray_scale=(flags.aaa_input_format == "gray_stack"),
            full_action_space=full_action_space,
            task=task,
        )

        # generate a seed for the environment (NO HUMAN STARTS HERE!); could just
        # use this for all games wrapped by the environment for our application
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)

        # wrap the environment; this is actually probably the point where we
        # could use multiple games, because the other environment is still one
        # from Gym
        env = environment.Environment(gym_env)

        # get the initial frame, reward, done, return, step, last_action
        env_output = env.initial()

        # perform the first step
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            # get a buffer index from the queue of free buffers (?)
            index = free_queue.get()
            # termination signal (?) for breaking out of this loop
            if index is None:
                break

            # Write old rollout end.
            # the keys here are (frame, reward, done, episode_return,
            # episode_step, last_action)
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            # here the keys are (policy_logits, baseline, action)
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # I think the agent_state is just the RNN/LSTM state (which will be
            # the "initial" state for the next step); not sure why it's needed
            # though, because it really just seems to be the initial state before
            # starting to act; however, it might be randomly initialised, which
            # is why we might want it...
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                # forward pass without keeping track of gradients to get the
                # agent action
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time("model")

                # agent acting in the environment
                env_output = env.step(agent_output["action"])
                timings.time("step")

                # write the respective outputs of the current step (see above
                # for the list of keys)
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")

            # after finishing a trajectory, put the index in the "full queue",
            # presumably so that the data can be processed/sent to the learner
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params['env'], flags.map_name, random_seed=seed)
        obs_processer = IMPALA_ObsProcesser_v2(env=sc_env,
                                               action_table=model.action_table,
                                               **game_params['obs_processer'])
        env = environment.Environment_v2(sc_env, obs_processer, seed)

        # initial rollout starts here
        env_output = env.initial()
        new_res = model.spatial_processing_block.new_res
        agent_state = model.spatial_processing_block.conv_lstm._init_hidden(
            batch_size=1, image_size=(new_res, new_res))
        with torch.no_grad():
            agent_output, new_agent_state = model.actor_step(env_output,
                                                             *agent_state[0])
        # _init_hidden yields [(h, c)], whereas actor_step takes only (h, c)
        agent_state = agent_state[0]
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key not in ['sc_env_action']:  # no need to save this key on buffers
                    buffers[key][index][0, ...] = agent_output[key]
            # keep the lstm state in sync with the environment / input to the
            # agent; that's why agent_state = new_agent_state gets executed
            # afterwards
            initial_agent_state_buffers[index][0][...] = agent_state[0]
            initial_agent_state_buffers[index][1][...] = agent_state[1]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])
                timings.time("step")

                # update state
                agent_state = new_agent_state
                with torch.no_grad():
                    agent_output, new_agent_state = model.actor_step(env_output,
                                                                     *agent_state)
                timings.time("model")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    if key not in ['sc_env_action']:  # no need to save this key on buffers
                        buffers[key][index][t + 1, ...] = agent_output[key]

                # env_output will be like
                #   s_{0}, ..., s_{T}
                #   act_mask_{0}, ..., act_mask_{T}
                #   discount_{0}, ..., discount_{T}
                #   r_{-1}, ..., r_{T-1}
                # agent_output will be like
                #   a_0, ..., a_T with a_t ~ pi(.|s_t)
                #   log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())
    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def act(
    i: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    episode_state_count_dict: dict,
    train_state_count_dict: dict,
    initial_agent_state_buffers,
    flags,
):
    try:
        log.info('Actor %i started.', i)
        timings = prof.Timings()

        gym_env = create_env(flags)
        seed = i ^ int.from_bytes(os.urandom(4), byteorder='little')
        gym_env.seed(seed)
        if flags.num_input_frames > 1:
            gym_env = FrameStack(gym_env, flags.num_input_frames)
        env = Environment(gym_env, fix_seed=flags.fix_seed, env_seed=flags.env_seed)

        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # Use a distinct loop variable: reusing `i` would shadow the actor
            # index used for logging below.
            for j, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][j][...] = tensor

            # Update the episodic state counts.
            episode_state_key = tuple(env_output['frame'].view(-1).tolist())
            if episode_state_key in episode_state_count_dict:
                episode_state_count_dict[episode_state_key] += 1
            else:
                episode_state_count_dict.update({episode_state_key: 1})
            buffers['episode_state_count'][index][0, ...] = \
                torch.tensor(1 / np.sqrt(episode_state_count_dict.get(episode_state_key)))

            # Reset the episodic state counts when the episode is over.
            if env_output['done'][0][0]:
                episode_state_count_dict = dict()

            # Update the training state counts if you're doing count-based exploration.
            if flags.model == 'count':
                train_state_key = tuple(env_output['frame'].view(-1).tolist())
                if train_state_key in train_state_count_dict:
                    train_state_count_dict[train_state_key] += 1
                else:
                    train_state_count_dict.update({train_state_key: 1})
                buffers['train_state_count'][index][0, ...] = \
                    torch.tensor(1 / np.sqrt(train_state_count_dict.get(train_state_key)))

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time('model')

                env_output = env.step(agent_output['action'])
                timings.time('step')

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                # Update the episodic state counts.
                episode_state_key = tuple(env_output['frame'].view(-1).tolist())
                if episode_state_key in episode_state_count_dict:
                    episode_state_count_dict[episode_state_key] += 1
                else:
                    episode_state_count_dict.update({episode_state_key: 1})
                buffers['episode_state_count'][index][t + 1, ...] = \
                    torch.tensor(1 / np.sqrt(episode_state_count_dict.get(episode_state_key)))

                # Reset the episodic state counts when the episode is over.
                if env_output['done'][0][0]:
                    episode_state_count_dict = dict()

                # Update the training state counts if you're doing count-based exploration.
                if flags.model == 'count':
                    train_state_key = tuple(env_output['frame'].view(-1).tolist())
                    if train_state_key in train_state_count_dict:
                        train_state_count_dict[train_state_key] += 1
                    else:
                        train_state_count_dict.update({train_state_key: 1})
                    buffers['train_state_count'][index][t + 1, ...] = \
                        torch.tensor(1 / np.sqrt(train_state_count_dict.get(train_state_key)))

                timings.time('write')
            full_queue.put(index)

        if i == 0:
            log.info('Actor %i: %s', i, timings.summary())
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logging.error('Exception in worker process %i', i)
        traceback.print_exc()
        print()
        raise e
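# ---------------------------------------------------------------------------
# Context: all act()/get_batch() variants above assume the same shared-memory
# layout: Buffers maps each key to num_buffers tensors of shape
# (unroll_length + 1, ...). The sketch below follows the upstream monobeast
# create_buffers() pattern; the exact spec keys vary per variant, and some
# variants add keys such as 'episode_state_count'.
# ---------------------------------------------------------------------------
import typing
import torch

Buffers = typing.Dict[str, typing.List[torch.Tensor]]

def create_buffers(flags, obs_shape, num_actions) -> Buffers:
    T = flags.unroll_length
    specs = dict(
        frame=dict(size=(T + 1, *obs_shape), dtype=torch.uint8),
        reward=dict(size=(T + 1,), dtype=torch.float32),
        done=dict(size=(T + 1,), dtype=torch.bool),
        episode_return=dict(size=(T + 1,), dtype=torch.float32),
        episode_step=dict(size=(T + 1,), dtype=torch.int32),
        policy_logits=dict(size=(T + 1, num_actions), dtype=torch.float32),
        baseline=dict(size=(T + 1,), dtype=torch.float32),
        last_action=dict(size=(T + 1,), dtype=torch.int64),
        action=dict(size=(T + 1,), dtype=torch.int64),
    )
    buffers: Buffers = {key: [] for key in specs}
    for _ in range(flags.num_buffers):
        for key in buffers:
            # share_memory_() lets actor processes write rollouts in place.
            buffers[key].append(torch.empty(**specs[key]).share_memory_())
    return buffers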