def batch_and_learn(i, lock=threading.Lock()): """Thread target for the learning process.""" nonlocal step, stats timings = prof.Timings() while step < flags.total_steps: timings.reset() batch, agent_state = get_batch( flags, free_queue, full_queue, buffers, initial_agent_state_buffers, timings, ) stats = learn(flags, model, learner_model, batch, agent_state, optimizer, scheduler) timings.time("learn") with lock: to_log = dict(step=step) to_log.update({k: stats[k] for k in stat_keys}) plogger.log(to_log) step += T * B if i == 0: logging.info("Batch and learn: %s", timings.summary())
def batch_and_learn(i, lock=threading.Lock()): """Thread target for the learning process.""" nonlocal step, stats timings = prof.Timings() while step < flags.total_steps: timings.reset() batch, agent_state = get_batch( flags, free_queue, full_queue, buffers, initial_agent_state_buffers, timings, ) # print('Before Learn') stats = learn(flags, model, learner_model, batch, agent_state, optimizer, scheduler) # print('After Learn') timings.time("learn") with lock: # step-wise learning rate annealing # TODO : How to perform annealing here exactly, we dont have access to the train_step ! if flags.scheduler in ['cosine', 'constant', 'dev_perf']: # linear warmup stage if step < flags.warmup_step: curr_lr = flags.lr * step / flags.warmup_step optimizer.param_groups[0]['lr'] = curr_lr else: if flags.scheduler == 'cosine': scheduler.step() elif flags.scheduler == 'inv_sqrt': scheduler.step() to_log = dict(step=step) to_log.update({k: stats[k] for k in stat_keys}) plogger.log(to_log) # print('updating step from {} to {}'.format(step, step+(T*B))) step += T * B if i == 0: logging.info("Batch and learn: %s", timings.summary())
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    """Actor loop: fill rollout buffers with environment/agent transitions.

    Creates an environment, then repeatedly takes a buffer index from
    ``free_queue``, writes a rollout of ``flags.unroll_length`` steps into
    that buffer slot and hands the index back through ``full_queue``.
    Receiving ``None`` from ``free_queue`` ends the loop.

    Args:
        flags: Run configuration; passed to ``create_env`` and read for
            ``unroll_length``.
        actor_index: Index of this actor; mixed into the environment seed
            and used in log messages. Only actor 0 logs the timing summary.
        free_queue: Source of buffer indices that may be overwritten.
        full_queue: Receives each index once its rollout has been written.
        model: Called as ``model(env_output, agent_state)`` and expected to
            return ``(agent_output, agent_state)``.
        buffers: Rollout storage indexed as ``buffers[key][index][t, ...]``.
        initial_agent_state_buffers: Per-index storage for the agent state
            at the start of each rollout.

    Raises:
        Exception: Any non-``KeyboardInterrupt`` exception is logged, its
            traceback printed, and then re-raised.
    """

    def write_outputs(slot, row, env_out, agent_out):
        """Copy every env and agent output into row ``row`` of buffer ``slot``."""
        for source in (env_out, agent_out):
            for key in source:
                buffers[key][slot][row, ...] = source[key]

    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        # Mix the actor index with OS randomness so each actor is seeded
        # differently.
        random_bits = int.from_bytes(os.urandom(4), byteorder="little")
        seed = actor_index ^ random_bits
        gym_env.seed(seed)

        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        # The state returned by this first call is discarded; the rollout
        # starts from the initial agent state.
        agent_output, _ = model(env_output, agent_state)

        while True:
            index = free_queue.get()
            if index is None:
                break

            # Row 0 holds the last transition of the previous rollout.
            write_outputs(index, 0, env_output, agent_output)
            for slot, state_tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][slot][...] = state_tensor

            # Collect a fresh rollout into rows 1..unroll_length.
            for step_idx in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time("model")

                env_output = env.step(agent_output["action"])
                timings.time("step")

                write_outputs(index, step_idx + 1, env_output, agent_output)
                timings.time("write")

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as exc:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise exc
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    """Actor loop for a memory-carrying model; rollouts stop at episode end.

    Creates an environment, then repeatedly takes a buffer index from
    ``free_queue``, writes a rollout of at most ``flags.unroll_length`` steps
    into that buffer slot and hands the index back through ``full_queue``.
    A rollout is cut short as soon as the environment reports ``done``; the
    remaining ``done`` entries of the slot are then filled with ``True``.
    Receiving ``None`` from ``free_queue`` ends the loop.

    Args:
        flags: Run configuration; passed to ``create_env`` and read for
            ``unroll_length``.
        actor_index: Index of this actor; mixed into the environment seed
            and used in log messages. Only actor 0 logs the timing summary.
        free_queue: Source of buffer indices that may be overwritten.
        full_queue: Receives each index once its rollout has been written.
        model: Called as ``model(env_output, agent_state, mems, mem_padding)``
            and expected to return a 5-tuple whose first four items are
            ``(agent_output, agent_state, mems, mem_padding)``; the fifth
            item is ignored here.
        buffers: Rollout storage indexed as ``buffers[key][index][t, ...]``.
        initial_agent_state_buffers: Per-index storage for the agent state
            at the start of each rollout.

    Raises:
        Exception: Any non-``KeyboardInterrupt`` exception is logged, its
            traceback printed, and then re-raised.
    """

    def write_outputs(slot, row, env_out, agent_out):
        """Copy every env and agent output into row ``row`` of buffer ``slot``."""
        for source in (env_out, agent_out):
            for key in source:
                buffers[key][slot][row, ...] = source[key]

    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        # Mix the actor index with OS randomness so each actor is seeded
        # differently.
        random_bits = int.from_bytes(os.urandom(4), byteorder="little")
        seed = actor_index ^ random_bits
        gym_env.seed(seed)

        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        mems = None
        mem_padding = None
        # The agent state returned by this first call is discarded; the
        # memory outputs are kept.
        agent_output, _unused_state, mems, mem_padding, _ = model(
            env_output, agent_state, mems, mem_padding
        )

        while True:
            index = free_queue.get()
            if index is None:
                break

            # `done` does not need clearing here: when an episode ends, the
            # extra environment step taken after the rollout replaces the
            # terminal output before the next rollout begins.

            # Row 0 holds the last transition of the previous rollout.
            write_outputs(index, 0, env_output, agent_output)
            for slot, state_tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][slot][...] = state_tensor

            # Collect one rollout: up to unroll_length steps, stopping early
            # once the environment reports `done`.
            steps_taken = 0
            while steps_taken < flags.unroll_length:
                if env_output['done'].item():
                    break
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state, mems, mem_padding, _ = model(
                        env_output, agent_state, mems, mem_padding
                    )
                timings.time("model")

                # TODO: Shakti add action repeat?
                env_output = env.step(agent_output["action"])
                timings.time("step")

                write_outputs(index, steps_taken + 1, env_output, agent_output)
                timings.time("write")
                steps_taken += 1

            if env_output['done'].item():
                # Episode finished: drop the model memory and take an
                # arbitrary step so the next rollout does not start from a
                # `done` output.
                mems = None
                env_output = env.step(torch.tensor([2]))

            if steps_taken != flags.unroll_length:
                # Rollout ended early: mark every unwritten row as done.
                # TODO: verified once, but Shakti should double-check this.
                pad_len = flags.unroll_length - steps_taken
                buffers['done'][index][steps_taken + 1:] = torch.tensor(
                    [True]).repeat(pad_len)

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as exc:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        raise exc