# def main(flags):
#     if flags.mode == "train":
#         train(flags)
#     else:
#         test(flags)


# In[]:


flags = parser.parse_args()
# main(flags)

if flags.xpid is None:
    flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
plogger = file_writer.FileWriter(
    xpid=flags.xpid, xp_args=flags.__dict__, rootdir=flags.savedir
)
checkpointpath = os.path.expandvars(
    os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
)

if flags.num_buffers is None:  # Set sensible default for num_buffers.
    flags.num_buffers = max(2 * flags.num_actors, flags.batch_size)
if flags.num_actors >= flags.num_buffers:
    raise ValueError("num_buffers should be larger than num_actors")
if flags.num_buffers < flags.batch_size:
    raise ValueError("num_buffers should be larger than batch_size")

T = flags.unroll_length
B = flags.batch_size
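
# Illustration only (hypothetical numbers, not part of the original script):
# the default above gives every actor roughly two rollout buffers while still
# covering a full learner batch, which is exactly what the two ValueError
# checks require.
_demo_actors, _demo_batch = 4, 8
_demo_buffers = max(2 * _demo_actors, _demo_batch)
assert _demo_actors < _demo_buffers and _demo_buffers >= _demo_batch
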

def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
    plogger = file_writer.FileWriter(
        xpid=flags.xpid, xp_args=flags.__dict__, rootdir=flags.savedir
    )
    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
    )

    if flags.num_buffers is None:  # Set sensible default for num_buffers.
        flags.num_buffers = max(2 * flags.num_actors, flags.batch_size)
    if flags.num_actors >= flags.num_buffers:
        raise ValueError("num_buffers should be larger than num_actors")
    if flags.num_buffers < flags.batch_size:
        raise ValueError("num_buffers should be larger than batch_size")

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logging.info("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)

    model = Net(env.observation_space.shape, env.action_space.n, flags.use_lstm)
    buffers = create_buffers(flags, env.observation_space.shape, model.num_actions)

    model.share_memory()

    # Add initial RNN state.
    initial_agent_state_buffers = []
    for _ in range(flags.num_buffers):
        state = model.initial_state(batch_size=1)
        for t in state:
            t.share_memory_()
        initial_agent_state_buffers.append(state)

    actor_processes = []
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(
                flags,
                i,
                free_queue,
                full_queue,
                model,
                buffers,
                initial_agent_state_buffers,
            ),
        )
        actor.start()
        actor_processes.append(actor)

    learner_model = Net(
        env.observation_space.shape, env.action_space.n, flags.use_lstm
    ).to(device=flags.device)

    optimizer = torch.optim.RMSprop(
        learner_model.parameters(),
        lr=flags.learning_rate,
        momentum=flags.momentum,
        eps=flags.epsilon,
        alpha=flags.alpha,
    )

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_steps) / flags.total_steps

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger = logging.getLogger("logfile")
    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger.info("# Step\t%s", "\t".join(stat_keys))

    step, stats = 0, {}

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal step, stats
        timings = prof.Timings()
        while step < flags.total_steps:
            timings.reset()
            batch, agent_state = get_batch(
                flags,
                free_queue,
                full_queue,
                buffers,
                initial_agent_state_buffers,
                timings,
            )
            stats = learn(
                flags, model, learner_model, batch, agent_state, optimizer, scheduler
            )
            timings.time("learn")
            with lock:
                to_log = dict(step=step)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                step += T * B

        if i == 0:
            logging.info("Batch and learn: %s", timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_learner_threads):
        thread = threading.Thread(
            target=batch_and_learn, name="batch-and-learn-%d" % i, args=(i,)
        )
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "flags": vars(flags),
            },
            checkpointpath,
        )

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while step < flags.total_steps:
            start_step = step
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            sps = (step - start_step) / (timer() - start_time)
            if stats.get("episode_returns", None):
                mean_return = (
                    "Return per episode: %.1f. " % stats["mean_episode_return"]
                )
            else:
                mean_return = ""
            total_loss = stats.get("total_loss", float("inf"))
            logging.info(
                "Steps %i @ %.1f SPS. Loss %f. %sStats:\n%s",
                step,
                sps,
                total_loss,
                mean_return,
                pprint.pformat(stats),
            )
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info("Learning finished after %d steps.", step)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()
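
# Illustrative sketch (not part of the original training code): the LambdaLR
# factor used in train() above decays the learning rate linearly from 1x to 0x
# over flags.total_steps, because each scheduler "epoch" corresponds to one
# learner batch of T * B environment steps. The numbers below are hypothetical
# and only demonstrate the arithmetic.
def _demo_lr_lambda(epoch, T=80, B=32, total_steps=50_000_000):
    return 1 - min(epoch * T * B, total_steps) / total_steps


assert _demo_lr_lambda(0) == 1.0                # full learning rate at step 0
assert abs(_demo_lr_lambda(9766) - 0.5) < 1e-3  # roughly halved midway through
assert _demo_lr_lambda(20000) == 0.0            # clamped to zero once total_steps is reached
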

def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    # Load the previous config if use_pretrained is true.
    if flags.use_pretrained:
        logging.info('Using Pretrained Model')

        class Bunch(object):
            def __init__(self, adict):
                self.__dict__.update(adict)

        model_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'logs/torchbeast/' + flags.xpid + '/model.tar',
        )
        pretrained_model = torch.load(
            model_path, map_location='cpu' if flags.disable_cuda else 'cuda'
        )
        flags = Bunch(pretrained_model['flags'])
        flags.use_pretrained = True

    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
    plogger = file_writer.FileWriter(
        xpid=flags.xpid, xp_args=flags.__dict__, rootdir=flags.savedir
    )
    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
    )

    if flags.num_buffers is None:  # Set sensible default for num_buffers.
        flags.num_buffers = max(2 * flags.num_actors, flags.batch_size)
    if flags.num_actors >= flags.num_buffers:
        raise ValueError("num_buffers should be larger than num_actors")
    if flags.num_buffers < flags.batch_size:
        raise ValueError("num_buffers should be larger than batch_size")

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logging.info("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)

    # `model` is shared with the actor processes, which run in parallel and
    # fill the rollout buffers (see the ctx.Process(...) block below).
    model = Net(env.observation_space.shape, env.action_space.n)
    buffers = create_buffers(flags, env.observation_space.shape, model.num_actions)

    model.share_memory()

    # Add initial RNN state.
    initial_agent_state_buffers = []
    for _ in range(flags.num_buffers):
        state = model.initial_state(batch_size=1)
        for t in state:
            t.share_memory_()
        initial_agent_state_buffers.append(state)

    actor_processes = []
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(
                flags,
                i,
                free_queue,
                full_queue,
                model,
                buffers,
                initial_agent_state_buffers,
            ),
        )
        actor.start()
        actor_processes.append(actor)

    # `learner_model` is the central learner, which consumes the actors'
    # experience batches and updates itself.
    learner_model = Net(env.observation_space.shape, env.action_space.n).to(
        device=flags.device
    )

    optimizer = get_optimizer(flags, learner_model.parameters())
    if optimizer is None:
        # Fall back to the default optimizer used in monobeast.
        optimizer = torch.optim.RMSprop(
            learner_model.parameters(),
            lr=flags.learning_rate,
            momentum=flags.momentum,
            eps=flags.epsilon,
            alpha=flags.alpha,
            weight_decay=flags.weight_decay,
        )

    try:
        from apex.fp16_utils import FP16_Optimizer
    except ImportError:
        print('WARNING: apex not installed, ignoring --fp16 option')
        flags.fp16 = False

    if not flags.disable_cuda and flags.fp16:
        # If flags.dynamic_loss_scale is False, static_loss_scale will be used.
        # If flags.dynamic_loss_scale is True, it takes precedence over static_loss_scale.
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=flags.static_loss_scale,
            dynamic_loss_scale=flags.dynamic_loss_scale,
            dynamic_loss_args={'init_scale': 2 ** 16},
        )

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_steps) / flags.total_steps

    scheduler = get_scheduler(flags, optimizer)
    if scheduler is None:
        # Fall back to the default scheduler used in monobeast.
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger = logging.getLogger("logfile")
    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger.info("# Step\t%s", "\t".join(stat_keys))

    step, stats = 0, {}

    if flags.use_pretrained:
        logging.info(
            'Using Pretrained Model -> loading learner_model, optimizer, scheduler states'
        )
        learner_model.load_state_dict(pretrained_model['model_state_dict'])
        optimizer.load_state_dict(pretrained_model['optimizer_state_dict'])
        scheduler.load_state_dict(pretrained_model['scheduler_state_dict'])

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal step, stats
        timings = prof.Timings()
        while step < flags.total_steps:
            timings.reset()
            batch, agent_state = get_batch(
                flags,
                free_queue,
                full_queue,
                buffers,
                initial_agent_state_buffers,
                timings,
            )
            stats = learn(
                flags, model, learner_model, batch, agent_state, optimizer, scheduler
            )
            timings.time("learn")
            with lock:
                # Step-wise learning rate annealing.
                # TODO: How to perform annealing here exactly? We don't have
                # access to the train_step!
                if flags.scheduler in ['cosine', 'constant', 'dev_perf']:
                    # Linear warmup stage.
                    if step < flags.warmup_step:
                        curr_lr = flags.lr * step / flags.warmup_step
                        optimizer.param_groups[0]['lr'] = curr_lr
                    else:
                        if flags.scheduler == 'cosine':
                            scheduler.step()
                elif flags.scheduler == 'inv_sqrt':
                    scheduler.step()

                to_log = dict(step=step)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                step += T * B

        if i == 0:
            logging.info("Batch and learn: %s", timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_learner_threads):
        thread = threading.Thread(
            target=batch_and_learn, name="batch-and-learn-%d" % i, args=(i,)
        )
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "flags": vars(flags),
            },
            checkpointpath,
        )

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while step < flags.total_steps:
            start_step = step
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            sps = (step - start_step) / (timer() - start_time)
            if stats.get("episode_returns", None):
                mean_return = (
                    "Return per episode: %.1f. " % stats["mean_episode_return"]
                )
            else:
                mean_return = ""
            total_loss = stats.get("total_loss", float("inf"))
            # TODO: We should also save the model when the loss is the best
            # seen so far, e.g. by calling checkpoint() here with a different
            # prefix (see the sketch after this function). Reference snippet:
            # if not best_val_loss or val_loss < best_val_loss:
            #     if not args.debug:
            #         with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f:
            #             torch.save(model, f)
            #         with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f:
            #             torch.save(optimizer.state_dict(), f)
            #     best_val_loss = val_loss
            logging.info(
                "Steps %i @ %.1f SPS. Loss %f. %sStats:\n%s",
                step,
                sps,
                total_loss,
                mean_return,
                pprint.pformat(stats),
            )
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info("Learning finished after %d steps.", step)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()