def run_maml(args):
    """Meta-training driver.

    Builds the meta model, fans training out to remote Pyro4 workers,
    then loops forever: collect worker gradients, take one optimizer
    step on the meta weights, and checkpoint per the log config.

    Parameters
    ----------
    args : argparse-style namespace with ``config`` (path to the YAML/JSON
        config) and ``exp_name`` (experiment directory name).
    """
    config = utils.load_config(args.config)
    train_params = config["train_params"]
    log_cfg = config['log']

    # Open and immediately close an env — we only need its observation
    # and action spaces to size the policy network.
    env = sonic_utils.make_from_config(config['env_params'], True)
    env.close()

    # Build the meta policy (note: meta learning rate, not the per-task lr).
    model = CNNPolicy(
        env.observation_space,
        env.action_space,
        train_params["vf_coef"],
        train_params["ent_coef"],
        train_params["lr_meta"],
        train_params["max_grad_norm"],
    )

    # Discover workers, push the config and the initial weights to each,
    # and kick off their (asynchronous) run loops.
    workers = find_workers("worker")
    init_workers(workers, config, model.get_weights())
    workers_results = {worker: Pyro4.Future(worker.run)() for worker in workers}

    savedir = utils.prepare_exp_dir(config, args.exp_name)

    updates = 0
    while True:  # meta-training runs until externally stopped
        # Zero the meta gradients, accumulate the workers' gradients into
        # the model (wait_run_end blocks until results arrive), then step.
        model.optimizer.zero_grad()
        wait_run_end(workers_results, model)
        model.optimizer.step()
        updates += 1

        # Always-overwritten "latest" checkpoint, if enabled.
        if log_cfg['save_last']:
            model.save(savedir / 'last.pt')

        # Periodic numbered checkpoint (plus one right after the first update).
        if updates % log_cfg["save_interval"] == 0 or updates == 1:
            model.save(savedir / '{}.pt'.format(updates))
def train(config, exp_name='test'):
    """Run PPO-style training on a single environment.

    Repeatedly pulls rollout segments from a trajectory generator, adds
    GAE value targets, optimizes the policy over shuffled minibatches,
    logs running episode statistics, and checkpoints weights.

    Parameters
    ----------
    config : dict with ``train_params``, ``env_params`` and ``log`` sections.
    exp_name : str, experiment directory name used when logging is enabled.

    Returns
    -------
    collections.deque of episode-info dicts collected during training.
    """
    train_params = config['train_params']
    env_params = config['env_params']
    log_params = config["log"]

    # Checkpointing is optional: no log_dir means no save directory.
    savedir = None
    if log_params["log_dir"] is not None:
        savedir = utils.prepare_exp_dir(config, exp_name)

    env = sonic_utils.make_from_config(env_params)

    model = CNNPolicy(
        env.observation_space,
        env.action_space,
        train_params["vf_coef"],
        train_params["ent_coef"],
        train_params["lr"],
        train_params["max_grad_norm"],
    )

    # Optionally warm-start from saved weights (and optimizer state).
    if train_params["weights"] is not None:
        model.load(train_params["weights"], train_params["load_adam_params"])

    seg_gen = traj_segment_generator(
        model, env, train_params['n_steps'], sample=True)

    total_steps = 0
    updates = 0
    t0 = time()
    epinfobuf = deque(maxlen=train_params["ep_info_len"])
    seg_inds = np.arange(train_params['n_steps'])
    n_batches = train_params["n_steps"] // train_params["batch_size"]
    loss_vals = []

    while total_steps <= train_params['max_steps']:
        # Collect one rollout segment and attach GAE/TD(lambda) targets.
        seg = next(seg_gen)
        add_vtarg(seg, train_params['gamma'], train_params['lam'])
        epinfobuf.extend(seg['ep_infos'])

        # Several optimization epochs over shuffled minibatches of the segment.
        batch_size = train_params["batch_size"]
        for _ in range(train_params["n_opt_epochs"]):
            np.random.shuffle(seg_inds)
            for batch_idx in range(n_batches):
                inds = seg_inds[batch_idx * batch_size:(batch_idx + 1) * batch_size]
                losses = model.train(
                    train_params['cliprange'],
                    seg['ob'][inds],
                    seg['tdlamret'][inds],
                    seg['ac'][inds],
                    seg['vpred'][inds],
                    seg["ac_logits"][inds],
                )
                loss_vals.append([l.detach().numpy() for l in losses])

        total_steps += train_params['n_steps']
        updates += 1

        # Periodic logging of throughput, episode stats and mean losses.
        if log_params["log"] and (updates % log_params["log_interval"] == 0
                                  or updates == 1):
            tnow = time()
            fps = int(total_steps / (tnow - t0))
            # ev = explained_variance(values, returns)
            logger.logkv("total_steps", total_steps)
            logger.logkv("nupdates", updates)
            logger.logkv("fps", fps)
            logger.logkv(
                'eprewmean',
                np.mean([info['r'] for info in epinfobuf if 'r' in info]))
            logger.logkv(
                'eprewmean_exp',
                np.mean([info['r_exp'] for info in epinfobuf
                         if 'r_exp' in info]))
            logger.logkv(
                'eplenmean',
                np.mean([info['l'] for info in epinfobuf if 'l' in info]))
            logger.logkv('time_elapsed', tnow - t0)
            for loss_val, loss_name in zip(np.mean(loss_vals, axis=0),
                                           model.loss_names):
                logger.logkv(loss_name, loss_val)
            logger.dumpkvs()
            # Reset so the next report averages only the new interval.
            del loss_vals[:]

        # Always-overwritten "latest" checkpoint, if enabled.
        if log_params['save_last'] and savedir is not None:
            model.save(savedir / 'last.pt')

        # Periodic numbered checkpoint (plus one right after the first update).
        if (updates % log_params["save_interval"] == 0
                or updates == 1) and savedir is not None:
            model.save(savedir / '{}.pt'.format(updates))

    return epinfobuf