def get_criterion(opt, model, nll_weight):
    """ Return a suitable loss function. """
    if opt.mode == "SVI":
        rlog.info("\nLoss: NLL + KL")
        return SVILoss(model.get_kl_div, nll_weight=nll_weight)
    rlog.info("Loss: NLL \n")
    return nn.NLLLoss()

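# A minimal sketch of what an SVI-style criterion could look like, assuming
# SVILoss combines a weighted NLL with a KL term supplied by a callable such
# as `model.get_kl_div` above (the actual SVILoss may differ).
import torch
import torch.nn as nn


class SVILossSketch(nn.Module):
    """Hypothetical ELBO-style loss: nll_weight * NLL(log_probs, targets) + KL."""

    def __init__(self, kl_fn, nll_weight=1.0):
        super().__init__()
        self.kl_fn = kl_fn  # callable returning the KL divergence term
        self.nll_weight = nll_weight
        self.nll = nn.NLLLoss()

    def forward(self, log_probs, targets):
        # log_probs: (batch, classes) log-probabilities, targets: (batch,)
        return self.nll_weight * self.nll(log_probs, targets) + self.kl_fn()
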
def run(opt):
    """ Entry Point. """
    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt = game_settings_(opt)
    env, agent = experiment_factory(opt)

    rlog.info(ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    steps = 0
    for ep in range(1, opt.env.episodes + 1):
        steps = train_one_ep(env, agent, steps, opt.update_freq, opt.target_update_freq)
        if ep % opt.valid_freq == 0:
            rlog.traceAndLog(ep)
            validate(env, agent, opt.valid_episodes)
            rlog.traceAndLog(ep)

def experiment_factory(opt, only_env=False):
    env = gym_wrapper.GymFromDMEnv(bsuite.load_from_id(opt.env.name))
    env = TorchWrapper(env, opt.device)
    if only_env:
        return env

    replay = ExperienceReplay(**opt.replay)
    layers = [
        reduce(lambda x, y: x * y, env.observation_space.shape),  # input
        *opt.estimator["layers"],  # hidden
        env.action_space.n,  # output
    ]
    estimator = MLP(layers, spectral=opt.spectral, **opt.estimator)
    estimator.to(opt.device)
    optimizer = getattr(torch.optim, opt.optim.name)(
        estimator.parameters(), **opt.optim.kwargs
    )
    policy_improvement = C51PolicyImprovement(
        estimator, opt.epsilon, env.action_space.n
    )
    policy_evaluation = C51PolicyEvaluation(estimator, optimizer, opt.gamma)
    rlog.info(replay)
    rlog.info(estimator)
    return env, (replay, policy_improvement, policy_evaluation)

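# The `reduce(lambda x, y: x * y, shape)` above flattens the observation
# shape into a single input width; on Python 3.8+ `math.prod` is an
# equivalent, more direct spelling:
import math
from functools import reduce

shape = (4, 84, 84)  # an example observation shape
assert reduce(lambda x, y: x * y, shape) == math.prod(shape) == 28224
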
def get_model(opt, num_labels):
    """ Configure and return a model. """
    if opt.model == "baseline":
        model = Baseline(
            Head(),
            nn.LSTM(44 ** 2, hidden_size=opt.hidden_size),
            nn.Linear(opt.hidden_size, num_labels),
        )
    else:
        lstm_in = opt.window ** 2 * opt.topk
        idx2xy_partial = None
        if opt.use_coords:
            # increase the input for the xy coords
            lstm_in = (opt.window ** 2 + 2) * opt.topk
            idx2xy_partial = partial(idx2xy, k=opt.window, s=4)
        model = Glimpsy(
            partial(unfold, window=opt.window, stride=4),
            SparseAttention(opt.window ** 2, topk=opt.topk),
            nn.LSTM(lstm_in, hidden_size=opt.hidden_size),
            nn.Linear(opt.hidden_size, num_labels),
            head=Head(),
            idx2xy=idx2xy_partial,
        )
    rlog.info(
        summary(
            model,
            torch.zeros((opt.batch_size, 10, 1, 64, 64)),
            show_input=True,
            show_hierarchical=True,
        ),
    )
    return model

def get_optimizer(opt, estimator):
    # Create custom param groups
    if hasattr(opt.optim, "div_by_rho") and opt.optim.div_by_rho:
        assert (
            opt.estimator.args["spectral"] is not None
        ), "When dividing by rho you should hook at least a layer."
        assert all(
            s[-1] == "L" for s in str(opt.estimator.args["spectral"]).split(",")
        ), "Spectral norm layers should not be active when dividing the optim step."
        param_groups = [
            {"params": p, "name": n, "lr": opt.optim.args["lr"], "rho_idx": None}
            for n, p in estimator.named_parameters()
        ]
        param_groups_ = [g for g in param_groups if "weight" in g["name"]]
        for k in estimator.get_spectral_norms().keys():
            param_groups_[int(k)]["rho_idx"] = k
    else:
        param_groups = estimator.parameters()

    optimizer = getattr(O, opt.optim.name)(param_groups, **opt.optim.args)

    if hasattr(opt.optim, "div_by_rho") and opt.optim.div_by_rho:
        rlog.info("Checking the groups are alright, alright, alright...")
        for group in optimizer.param_groups:
            rlog.info("{:<36} rho_idx={}".format(group["name"], group["rho_idx"]))
    return optimizer

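# A hypothetical sketch of how such "rho_idx" groups might be consumed later,
# assuming a step rule that divides each tagged group's learning rate by the
# current spectral norm (rho) of its layer. Both the lookup dict and the
# scaling rule are assumptions for illustration, not the optimizer used above.
def scale_lr_by_rho(optimizer, spectral_norms, base_lr):
    """Divide each tagged group's lr by its layer's spectral norm."""
    for group in optimizer.param_groups:
        rho_idx = group.get("rho_idx")
        if rho_idx is not None:
            rho = max(spectral_norms[rho_idx], 1e-8)  # avoid division by zero
            group["lr"] = base_lr / rho


# would run before each optimizer.step(), e.g.:
# scale_lr_by_rho(optimizer, estimator.get_spectral_norms(), opt.optim.args["lr"])
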
def make_rlog(opt):
    """ Configure logger. """
    rlog.init("pff", path=opt.path, tensorboard=True)
    train_log = rlog.getLogger("pff.train")
    train_log.fmt = (
        "[{gen:03d}/{batch:04d}] acc={acc:2.2f}% | bestFit={bestFit:2.3f}"
        + ", unFit={unFit:2.3f} [μ={attnMean:2.3f}/σ={attnVar:2.3f}]"
    )
    if opt.model == "baseline":
        train_log.fmt = "[{batch:04d}] acc={acc:2.2f}%, loss={loss:2.3f}"
    msg = "Configuration:\n"
    for k, v in vars(opt).items():
        msg += f"  {k:16}: {v}\n"
    rlog.info(msg)
    return train_log

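# For illustration, the first format string above expands like this
# (the values are made up):
fmt = "[{gen:03d}/{batch:04d}] acc={acc:2.2f}% | bestFit={bestFit:2.3f}"
print(fmt.format(gen=7, batch=42, acc=91.3, bestFit=0.123))
# -> [007/0042] acc=91.30% | bestFit=0.123
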
def load_policy(env, ckpt_path, opt):
    opt.action_cnt = env.action_space.n
    estimator = get_estimator(opt, env)
    agent_args = opt.agent.args
    agent_args["epsilon"] = 0.0  # purely max policy
    policy = AGENTS[opt.agent.name]["policy_improvement"](
        estimator, opt.action_cnt, **agent_args
    )

    idx = int(ckpt_path.stem.split("_")[1])
    rlog.info(f"Loading {ckpt_path.stem}")
    ckpt = ioutil.load_checkpoint(
        ckpt_path.parent, idx=idx, verbose=False, device=torch.device(opt.device)
    )

    if opt.estimator.args["spectral"] is not None:
        ioutil.special_conv_uv_buffer_fix(policy.estimator, ckpt["estimator_state"])
    policy.estimator.load_state_dict(ckpt["estimator_state"])
    return policy, idx

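# The checkpoint index is parsed from the file name; given the
# `checkpoint_{crt_step:08d}.gz` naming scheme used by checkpoint_agent
# below, the parsing goes like this:
from pathlib import Path

stem = Path("checkpoint_00250000.gz").stem  # "checkpoint_00250000"
assert int(stem.split("_")[1]) == 250000
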
def run(opt):
    """ Entry point of the experiment. """
    # no need to run this for all the seeds
    if opt.run_id not in [0, 1, 2]:
        return

    # this is a bit of a hack, it would be nice to change it
    # when launching the experiment. It generally only affects the logger.
    if "JyxNorm" not in opt.experiment:
        opt.experiment += "--JyxNorm"

    rlog.init(opt.experiment, path=opt.out_dir, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("Jyx_norm_avg", metargs=["Jyx_norm", 1]),
        rlog.MaxMetric("Jyx_norm_max", metargs=["Jyx_norm"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt.device = "cuda" if torch.cuda.is_available() else "cpu"
    root = Path(opt.out_dir)
    ckpt_paths = sorted(root.glob("**/checkpoint*"))

    rlog.info("Begin empirical estimation of norm(Jyx).")
    rlog.info("Running experiment on {}.".format(opt.device))
    rlog.info("Found {:3d} checkpoints.".format(len(ckpt_paths)))

    if (Path(opt.out_dir) / "max_ckpt").exists():
        # use only the checkpoint marked as the best one
        ckpt_paths = [
            p
            for p in ckpt_paths
            if int(p.stem.split("_")[1])
            == int((Path(opt.out_dir) / "max_ckpt").read_text())
        ]
        rlog.info("IMPORTANT! Found max_ckpt @{}.".format(ckpt_paths[0]))
    else:
        # subsample the checkpoints
        if "MinAtar" in opt.game:
            ckpt_paths = ckpt_paths[0::3]
            rlog.warning("IMPORTANT! Sampling only every third checkpoint.")
        else:
            ckpt_paths = ckpt_paths[0::5]
            rlog.warning("IMPORTANT! Sampling only every fifth checkpoint.")

    for ckpt_path in ckpt_paths:
        env = get_env(opt, mode="testing")
        policy, step = load_policy(env, ckpt_path, deepcopy(opt))
        check_lipschitz_constant(policy, env, opt.valid_step_cnt)
        rlog.traceAndLog(step=step)

def run(opt):
    """ Run experiment. This function is being launched by liftoff. """
    U.configure_logger(opt)

    # set seed
    opt.seed = (opt.run_id + 1) * opt.base_seed
    torch.manual_seed(opt.seed)

    # configure env
    env = ActionWrapper(TorchWrapper(gym.make(opt.env_name)))
    env.seed(opt.seed)

    # build estimator
    estimator = ActorCriticEstimator(
        env.observation_space.shape[0],
        env.action_space,
        hidden_size=opt.hidden_size,
    )

    # load checkpoint and reset
    rlog.info("Loading model from %s", opt.model_state)
    estimator.load_state_dict(torch.load(opt.model_state)["policy"])
    estimator.reset_policy()
    rlog.info("Policy reset.")
    if opt.freeze_critic:
        estimator.freeze_critic()
        rlog.info("Froze feature extractor and critic.")

    # build the agent
    policy_improvement, policy_evaluation = build_agent(opt, env, estimator=estimator)

    # log
    rlog.info(f"\n{U.config_to_string(opt)}")
    rlog.info(policy_improvement)

    # train
    try:
        train(env, policy_improvement, policy_evaluation, opt)
    except Exception as err:
        rlog.error(clr(str(err), "red", attrs=["bold"]))
        raise err

def valid_stats(opt, model, dset):
    """ Stats on the validation data. """
    stats = {}
    stats["loss"], stats["acc"] = validate(
        DataLoader(dset, **vars(opt.val_loader)), model, opt.tst_mcs
    )
    if hasattr(opt, "log") and opt.log.mle_ish:
        # Use the means of the posterior to set a pseudo-MLE model.
        assert isinstance(
            model, SVIModel
        ), "This stat only makes sense for SVI models."
        model.sync_mle_model()
        rlog.info("Synced MLE model using means from posterior.")
        rlog.info("Compute accuracy with a pseudo-MLE model.")
        stats["lossMLE"], stats["accMLE"] = validate(
            DataLoader(dset, **vars(opt.val_loader)),
            model._mle_model,  # pylint: disable=protected-access
            0,
        )
    return stats

def train_stats(opt, model, dset):
    """ Stats on the training data. """
    stats = {}
    if isinstance(model, SVIModel):
        # Stats collected during training use a single sample from the
        # posterior. Therefore we check the accuracy once more using
        # the same no. of samples as on the validation set.
        stats["lossMC"], stats["accMC"] = validate(
            DataLoader(dset, **vars(opt.val_loader)), model, opt.tst_mcs
        )
    if hasattr(opt, "log") and opt.log.train_no_aug:
        # We also look at the accuracy on un-augmented training data.
        # This is done for both MLE and SVI models.
        rlog.info("Compute accuracy on un-augmented train data.")
        mc_samples = opt.tst_mcs if isinstance(model, SVIModel) else 0
        stats["lossNoAug"], stats["accNoAug"] = validate(
            DataLoader(get_unaugmented(dset), **vars(opt.val_loader)),
            model,
            mc_samples,
        )
    if hasattr(opt, "log") and opt.log.mle_ish:
        # Use the means of the posterior to set a pseudo-MLE model.
        assert isinstance(
            model, SVIModel
        ), "This stat only makes sense for SVI models."
        model.sync_mle_model()
        rlog.info("Synced MLE model using means from posterior.")
        rlog.info("Compute accuracy with a pseudo-MLE model.")
        stats["lossMLE"], stats["accMLE"] = validate(
            DataLoader(dset, **vars(opt.val_loader)),
            model._mle_model,  # pylint: disable=protected-access
            0,
        )
    return stats

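# For intuition, a minimal sketch of what `sync_mle_model` could do, assuming
# the variational posterior stores one mean tensor per weight (the real
# SVIModel internals may differ): copy each posterior mean into the matching
# parameter of a deterministic twin, so it predicts with the "average" weights.
import torch


@torch.no_grad()
def sync_mle_model_sketch(posterior_means, mle_model):
    """posterior_means: dict mapping parameter names to mean tensors."""
    for name, param in mle_model.named_parameters():
        param.copy_(posterior_means[name])
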
def checkpoint_agent(path, crt_step, save_replay=True, **kwargs):
    to_save = {"step": crt_step}
    replay_path = None
    for k, v in kwargs.items():
        if k == "replay":
            if save_replay:
                replay_path = v.save(path, crt_step, save_all=False)
        elif isinstance(v, (torch.nn.Module, torch.optim.Optimizer)):
            to_save[f"{k}_state"] = v.state_dict()
        elif isinstance(v, (Namespace, YamlNamespace)):
            to_save[k] = namespace_to_dict(v)
        else:
            to_save[k] = v

    with open(f"{path}/checkpoint_{crt_step:08d}.gz", "wb") as f:
        with GzipFile(fileobj=f) as outfile:
            torch.save(to_save, outfile)

    if replay_path is not None:
        shutil.copyfile(replay_path, Path(path) / "prev_replay.gz")

    rlog.info(
        "So, I have saved the agent's state,"
        f"{'' if replay_path is not None else ' not'} including the experience replay."
    )

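# A usage example mirroring the calls made in the training loops below
# (object names are illustrative): Modules and Optimizers are stored via
# their state_dict, namespaces become plain dicts, and the replay buffer
# is saved to its own file when requested.
checkpoint_agent(
    opt.out_dir,
    steps,
    estimator=policy_evaluation.estimator,
    optim=policy_evaluation.optimizer,
    cfg=opt,  # a YamlNamespace, saved as a dict
    replay=replay,  # saved separately
    save_replay=(epoch % 8 == 0),
)
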
def run(opt):
    """ Entry point. """
    if "sRank" not in opt.experiment:
        opt.experiment += "--sRank"

    rlog.init(opt.experiment, path=opt.out_dir, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("avg_rank", metargs=["rank", 1]),
        # rlog.ValueMetric("rank", metargs=["rank"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt.device = "cuda" if torch.cuda.is_available() else "cpu"
    root = Path(opt.out_dir)
    ckpt_paths = sorted(root.glob("**/checkpoint*"))

    rlog.info("Begin empirical estimation of feature matrix rank.")
    rlog.info("Running experiment on {}.".format(opt.device))
    rlog.info("Found {:3d} checkpoints.".format(len(ckpt_paths)))

    # subsample the checkpoints
    if "MinAtar" in opt.game:
        ckpt_paths = ckpt_paths[0::3]
        rlog.warning("IMPORTANT! Sampling only every third checkpoint.")
    else:
        ckpt_paths = ckpt_paths[0::5]
        rlog.warning("IMPORTANT! Sampling only every fifth checkpoint.")

    sampled_steps = min(opt.valid_step_cnt, opt.train_step_cnt)
    rlog.info("Sampling {:6d} steps from the environment.".format(sampled_steps))

    for ckpt_path in ckpt_paths:
        env = get_env(opt, mode="testing")
        policy, step = load_policy(env, ckpt_path, deepcopy(opt))
        check_effective_features_rank(policy, env, sampled_steps)
        rlog.traceAndLog(step=step)

def run(opt):
    """ Entry point of the program. """
    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            )
        )

    ioutil.create_paths(opt)

    sticky_schedule = OrderedDict(
        [(int(s), float(p)) for (s, p) in opt.sticky_schedule]
    )
    assert 1 in sticky_schedule

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    train_loggers = OrderedDict()
    for i, epoch in enumerate(sticky_schedule.keys()):
        train_loggers[epoch] = train_log = rlog.getLogger(f"{opt.experiment}.{i:d}")
        train_log.addMetrics(
            rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
            rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
            rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
            rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
            rlog.ValueMetric(
                "trn_sticky_action_prob", metargs=["trn_sticky_action_prob"]
            ),
            rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
            rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
            rlog.SumMetric("val_ep_cnt", metargs=["done"]),
            rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
            rlog.FPSMetric("val_fps", metargs=["val_frames"]),
            rlog.ValueMetric(
                "val_sticky_action_prob", metargs=["val_sticky_action_prob"]
            ),
        )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement, policy_evaluation) = experiment_factory(opt)

    rlog.info("\n\n{}\n\n{}\n\n{}".format(env, replay, policy_evaluation.estimator))
    rlog.info("\n\n{}\n\n{}".format(policy_improvement, policy_evaluation))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # if we loaded a checkpoint
    if Path(opt.out_dir).joinpath("replay.gz").is_file():
        # sometimes the experiment is interrupted while saving the replay
        # buffer and it gets corrupted. Therefore we attempt restoring
        # from the previous checkpoint and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.estimator, ckpt["estimator_state"]
        )
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.target_estimator, ckpt["target_estimator_state"]
        )
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"]
        )
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        # fast-forward the epsilon schedule to the checkpointed step
        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")

        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1

    # add some hardware and git info, log and save
    opt = ioutil.add_platform_info(opt)
    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training
    last_state = None  # used by train_one_epoch to know how to resume the episode
    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        last_sched_epoch = max(ep for ep in sticky_schedule if ep <= epoch)
        print(
            f"StickyActProb goes from {env.sticky_action_prob}"
            f" to {sticky_schedule[last_sched_epoch]}"
        )
        env.sticky_action_prob = sticky_schedule[last_sched_epoch]
        crt_logger = train_loggers[last_sched_epoch]

        # train for 250,000 steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            crt_logger,
            total_steps=steps,
            last_state=last_state,
        )
        crt_logger.put(trn_sticky_action_prob=env.sticky_action_prob)
        crt_logger.traceAndLog(epoch * opt.train_step_cnt)

        # validate for 125,000 steps
        for sched_epoch, eval_logger in train_loggers.items():
            eval_env = get_env(  # this doesn't work, damn wrappers
                opt, mode="testing", sticky_action_prob=sticky_schedule[sched_epoch]
            )
            eval_env.sticky_action_prob = sticky_schedule[sched_epoch]
            print(f"Evaluating on the env with sticky={eval_env.sticky_action_prob}.")
            validate(
                AGENTS[opt.agent.name]["policy_improvement"](
                    policy_improvement.estimator,
                    opt.action_cnt,
                    epsilon=opt.val_epsilon,
                ),
                eval_env,
                opt.valid_step_cnt,
                eval_logger,
            )
            eval_logger.put(val_sticky_action_prob=eval_env.sticky_action_prob)
            eval_logger.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )

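# The resume path above fast-forwards the epsilon schedule by draining the
# generator up to the checkpointed step. A minimal sketch of the pattern,
# with a hypothetical linear schedule standing in for the real one:
def epsilon_schedule(start=1.0, end=0.01, steps=100_000):
    """Yields a linearly decaying epsilon, then `end` forever."""
    for i in range(steps):
        yield start + (end - start) * i / steps
    while True:
        yield end


eps = epsilon_schedule()
for _ in range(50_000):  # e.g. resuming from step 50,000
    last_epsilon = next(eps)
# `eps` now continues from where the checkpointed run left off.
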
def freeze_critic(self):
    # freeze every named module outside the policy head; assumes each of
    # these modules exposes `weight` and `bias` attributes.
    for module_name, module in self.named_modules():
        if "policy" not in module_name and module_name != "":
            rlog.info("Freezing %s", module_name)
            module.weight.requires_grad = False
            module.bias.requires_grad = False

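# After freezing, a common companion idiom (not shown in the source) is to
# build the optimizer only over parameters that still require gradients:
import torch.optim as optim

trainable = [p for p in estimator.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable, lr=1e-3)  # lr is illustrative
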
def run(opt):
    torch.set_printoptions(precision=8, sci_mode=False)

    opt = augment_options(opt)
    configure_logger(opt)
    check_options_are_valid(opt)

    rlog.info(f"\n{config_to_string(opt)}")

    # configure the environment
    env = wrap_env(gym.make(opt.game), opt)

    # configure estimator and policy
    if hasattr(opt.estimator, "categorical"):
        _s = opt.estimator.categorical.support
        support = [_s.min, _s.max, _s.bin_no]
        estimator = MiniGridFF(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
            support=support,
        ).cuda()
    elif opt.estimator.ff:
        estimator = MiniGridFF(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
        ).cuda()
    else:
        estimator = MiniGridNet(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
        ).cuda()

    if hasattr(opt.estimator, "ensemble"):
        # Build Bootstrapped Ensembles objects.
        estimator = BootstrappedEstimator(estimator, **opt.estimator.ensemble.__dict__)
        policy_evaluation = BootstrappedPE(
            estimator, env.action_space.n, opt.exploration.__dict__, vote=True
        )
        if hasattr(opt.estimator, "categorical"):
            policy_improvement = BootstrappedPI(
                wt.CategoricalPolicyImprovement(
                    estimator,
                    optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
                    opt.gamma,
                ),
                categorical=True,
            )
        else:
            policy_improvement = BootstrappedPI(
                wt.DQNPolicyImprovement(
                    estimator,
                    optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
                    opt.gamma,
                    is_double=opt.double,
                )
            )
    elif hasattr(opt.estimator, "dropout"):
        # Build Variational Dropout objects.
        estimator = MiniGridDropnet(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
            p=opt.estimator.dropout,
            mc_samples=opt.estimator.mc_samples,
        ).cuda()
        policy_evaluation = DropPE(
            estimator,
            env.action_space.n,
            epsilon=opt.exploration.__dict__,
            thompson=opt.estimator.thompson,
        )
        policy_improvement = DropPI(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
            is_double=opt.double,
        )
    elif hasattr(opt.estimator, "categorical"):
        policy_evaluation = wt.EpsilonGreedyPolicy(
            estimator, env.action_space.n, epsilon=opt.exploration.__dict__
        )
        policy_improvement = wt.CategoricalPolicyImprovement(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
        )
    else:
        policy_evaluation = wt.EpsilonGreedyPolicy(
            estimator, env.action_space.n, epsilon=opt.exploration.__dict__
        )
        policy_improvement = wt.DQNPolicyImprovement(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
            is_double=opt.double,
        )

    policy = DQNPolicy(
        policy_evaluation,
        policy_improvement,
        wt.ExperienceReplay(**opt.er.__dict__)(),
        priority=opt.er.priority,
    )

    # additional info
    rlog.info(policy)
    rlog.info(estimator)

    # start training
    policy_iteration(env, policy, opt)

        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        rlog.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = rlog.summarize()
            rlog.trace(step=step, **summary)
            # rlog.info(
            #     "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f} | rw/ep{R_per_ep:8.2f}.".format(
            #         step, **summary
            #     )
            # )
            # rlog.reset()
            rlog.traceAndLog(step)
            mean += 1

    rlog.trace("But we can continue tracing stuff manually...")
    # including structured stuff as long as we provide a `step` kwarg
    rlog.trace(step=step, aux_loss=0.23)

    rlog.info("Run `tensorboard --logdir sota_results` to see the results.")


if __name__ == "__main__":
    main()

def run(opt):
    """ Entry point of the program. """
    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            )
        )

    ioutil.create_paths(opt)

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement, policy_evaluation) = experiment_factory(opt)

    guts = [
        env,
        replay,
        policy_evaluation.estimator,
        policy_evaluation.optimizer,
        policy_improvement,
        policy_evaluation,
    ]
    rlog.info(("\n\n{}" * len(guts)).format(*guts))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # if we loaded a checkpoint
    if Path(opt.out_dir).joinpath("replay.gz").is_file():
        # sometimes the experiment is interrupted while saving the replay
        # buffer and it gets corrupted. Therefore we attempt restoring
        # from the previous checkpoint and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.estimator, ckpt["estimator_state"]
        )
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.target_estimator, ckpt["target_estimator_state"]
        )
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"]
        )
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        # fast-forward the epsilon schedule to the checkpointed step
        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")

        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1

    # add some hardware and git info, log and save
    opt = ioutil.add_platform_info(opt)
    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training
    last_state = None  # used by train_one_epoch to know how to resume the episode
    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        # train for 250,000 steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            rlog.getRootLogger(),
            total_steps=steps,
            last_state=last_state,
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # validate for 125,000 steps
        validate(
            AGENTS[opt.agent.name]["policy_improvement"](
                policy_improvement.estimator, opt.action_cnt, epsilon=opt.val_epsilon
            ),
            get_env(opt, mode="testing"),
            opt.valid_step_cnt,
            rlog.getRootLogger(),
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )

def main():
    # get the root logger, preconfigured to log to the console,
    # to a text file, a pickle and a tensorboard protobuf.
    experiment_path = get_experiment_path()
    rlog.init("dqn", path=experiment_path, tensorboard=True)
    rlog.info("Logging application level stuff.")
    rlog.info("Log artifacts will be saved in %s", experiment_path)

    rlog.addMetrics(
        # counts each time it receives a `done=True`, aka counts episodes
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        # sums up all the `reward=value` it receives and divides it
        # by the number of `done=True`, aka mean reward per episode
        rlog.AvgMetric("R_per_ep", metargs=["reward", "done"]),
    )

    for step in range(5):
        # probably not a good idea to call this every step if it is a hot loop?
        # also this will not be logged to the console or to the text file
        # since the default log-level for these two is INFO.
        rlog.trace(step=step, aux_loss=7.23 - step)

    # but we can register metrics that will accumulate traced events
    # and summarize them. Each Metric accepts a name and some metargs
    # that tell it which arguments received by the `put` call below
    # to accumulate and summarize.
    rlog.addMetrics(
        # counts each time it receives a `done=True`, aka counts episodes
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        # sums up all the `reward=value` it receives and divides it
        # by the number of `done=True`, aka mean reward per episode
        rlog.AvgMetric("R_per_ep", metargs=["reward", "done"]),
        # same but keeps a running average instead (experimental)
        rlog.AvgMetric("RunR", eps=0.9, metargs=["reward", "done"]),
        # same as above but now we divide by the number of rewards
        rlog.AvgMetric("R_per_step", metargs=["reward", 1]),
        # same but with clipped rewards (to +- 1)
        rlog.AvgMetric("rw_per_ep", metargs=["clip(reward)", "done"]),
        # computes the no. of frames per second
        rlog.FPSMetric("train_fps", metargs=["frame_no"]),
        # caches all the values it receives and inserts them into a
        # tensorboard.summary.histogram every time you call `log.trace`
        rlog.ValueMetric("gaussians", metargs=["sample"], tb_type="histogram"),
    )

    mean = 0
    for step in range(1, 300_001):
        # make a step in the "environment"
        reward, done = reward_following_policy(step)

        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        rlog.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = rlog.summarize()
            rlog.trace(step=step, **summary)
            # rlog.info(
            #     "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f} | rw/ep{R_per_ep:8.2f}.".format(
            #         step, **summary
            #     )
            # )
            # rlog.reset()
            rlog.traceAndLog(step)
            mean += 1

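# For intuition, a minimal sketch of how an AvgMetric-style accumulator could
# work (an illustration of the metargs idea, not rlog's actual implementation):
# it sums the first metarg and divides by the sum of the second, so
# ("reward", "done") yields mean reward per episode while ("reward", 1)
# yields mean reward per step.
class AvgMetricSketch:
    def __init__(self, numerator_key, denominator_key):
        self.num_key, self.den_key = numerator_key, denominator_key
        self.num = self.den = 0.0

    def put(self, **kwargs):
        self.num += kwargs.get(self.num_key, 0.0)
        # a metarg of `1` counts events instead of reading a traced value
        self.den += 1.0 if self.den_key == 1 else kwargs.get(self.den_key, 0.0)

    def summarize(self):
        return self.num / max(self.den, 1e-8)
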
def run(opt):
    """ Run experiment. This function is being launched by liftoff. """
    # logging
    trn_log, val_log = set_logger(opt)

    # model related stuff
    device = torch.device("cuda")
    trn_set, val_set, wmp_set = get_dsets(opt)
    model = get_model(opt, device)
    optimizer = getattr(optim, opt.optim.name)(
        model.parameters(), **vars(opt.optim.args)
    )

    # batch size
    batch_size = opt.trn_loader.batch_size

    rlog.info(U.config_to_string(opt))
    rlog.info("Model: %s", str(model))
    rlog.info("Optimizer: %s \n", str(optimizer))

    # Warm-up the model on a partition of the training dataset
    if wmp_set is not None:
        rlog.info("Warming-up on dset of size %d", len(wmp_set))
        for epoch in range(opt.warmup.epochs):
            # train for one epoch
            trn_loss, trn_acc = train(
                DataLoader(wmp_set, **vars(opt.trn_loader)),
                model,
                optimizer,
                get_criterion(opt, model, len(wmp_set) // batch_size),
                mc_samples=opt.trn_mcs,
            )

            val_stats = valid_stats(opt, model, val_set)
            trn_stats = train_stats(opt, model, wmp_set)
            trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

            # to pickle and tensorboard
            val_log.trace(step=epoch, **val_stats)
            trn_log.trace(step=epoch, **trn_stats)

            # to console
            for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
                log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

            # extra logging
            model_stats(opt, epoch, model)

        # maybe reset optimizer after warmup
        if opt.warmup.reset_optim:
            rlog.info("\nWarmup ended. Resetting optimizer.")
            optimizer = getattr(optim, opt.optim.name)(
                model.parameters(), **vars(opt.optim.args)
            )

    # Train on the full training dataset
    if wmp_set is not None:
        epochs = range(opt.warmup.epochs, opt.warmup.epochs + opt.epochs)
    else:
        epochs = range(opt.epochs)

    rlog.info("\nTraining on dset: %s", str(trn_set))
    for epoch in epochs:
        trn_loss, trn_acc = train(
            DataLoader(trn_set, **vars(opt.trn_loader)),
            model,
            optimizer,
            get_criterion(opt, model, len(trn_set) // batch_size),
            mc_samples=opt.trn_mcs,
        )

        val_stats = valid_stats(opt, model, val_set)
        trn_stats = train_stats(opt, model, trn_set)
        trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

        # to pickle and tensorboard
        val_log.trace(step=epoch, **val_stats)
        trn_log.trace(step=epoch, **trn_stats)

        # to console
        for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
            log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

        # extra logging
        model_stats(opt, epoch, model)