def __init__(self, index, env_name, env_kwargs, batch_size, observation_space,
             action_space, policy, baseline, seed, task_queue, train_queue,
             valid_queue, policy_lock):
    super(SamplerWorker, self).__init__()

    # Create environments for all batch_size tasks in the batch
    # (num_batches * batch_size in total)
    env_fns = [make_env(env_name, env_kwargs=env_kwargs)
               for _ in range(batch_size)]
    self.envs = SyncVectorEnv(env_fns,
                              observation_space=observation_space,
                              action_space=action_space)
    # Offset the seed by the worker index so each worker gets its own RNG stream
    self.envs.seed(None if (seed is None) else seed + index * batch_size)

    self.batch_size = batch_size
    self.policy = policy
    self.baseline = baseline

    # Queues and lock shared with the main process
    self.task_queue = task_queue
    self.train_queue = train_queue
    self.valid_queue = valid_queue
    self.policy_lock = policy_lock
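# A minimal wiring sketch (an assumption, not the repository's own sampler code):
# it shows how the queues and lock expected by SamplerWorker.__init__ could be
# created, assuming SamplerWorker subclasses multiprocessing.Process as the
# super().__init__() call suggests. `spawn_workers` and its parameters are
# hypothetical names used only for illustration.
import multiprocessing as mp

def spawn_workers(env_name, env_kwargs, batch_size, observation_space,
                  action_space, policy, baseline, seed, num_workers):
    task_queue = mp.JoinableQueue()   # tasks pushed by the main process
    train_queue = mp.Queue()          # training episodes returned by workers
    valid_queue = mp.Queue()          # validation episodes returned by workers
    policy_lock = mp.Lock()           # serializes access to the shared policy

    workers = [SamplerWorker(index, env_name, env_kwargs, batch_size,
                             observation_space, action_space,
                             policy, baseline, seed,
                             task_queue, train_queue, valid_queue, policy_lock)
               for index in range(num_workers)]
    for worker in workers:
        worker.daemon = True
        worker.start()
    return workers, task_queue, train_queue, valid_queue, policy_lock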
def __init__(self, env_name, env_kwargs, batch_size, observation_space,
             action_space, policy, baseline, seed, prior_policy, task):
    # Create environments for all batch_size tasks in the batch
    # (num_batches * batch_size in total)
    env_fns = [make_env(env_name, env_kwargs=env_kwargs)
               for _ in range(batch_size)]
    self.envs = SyncVectorEnv(env_fns,
                              observation_space=observation_space,
                              action_space=action_space)
    self.envs.seed(None if (seed is None) else seed + batch_size)

    self.batch_size = batch_size
    self.policy = policy
    self.baseline = baseline

    # Reset every environment in the batch to the given task
    self.envs.reset_task(task)
    self.task = task
def __init__(self, env_name, env_kwargs, batch_size, num_tasks, policy,
             baseline, env=None, seed=None):
    # Work on a private copy of the baseline so fitting it here
    # does not modify the shared one
    baseline = deepcopy(baseline)

    # Create environments for all batch_size tasks in the batch
    # (num_batches * batch_size in total)
    # env_fns = [make_env(env_name, env_kwargs=env_kwargs)
    #            for _ in range(batch_size)]
    # self.envs = SyncVectorEnv(env_fns,
    #                           observation_space=env.observation_space,
    #                           action_space=env.action_space)
    # self.envs.seed(None if (seed is None) else seed + index * batch_size)

    self.tasks = self.sample_tasks(env, num_tasks)

    env_fns = [make_env(env_name, env_kwargs=env_kwargs)]
    self.env = SyncVectorEnv(env_fns,
                             observation_space=env.observation_space,
                             action_space=env.action_space)
    self.env.seed(None if (seed is None) else seed)
    # self.env = env
    # self.env.seed(None if (seed is None) else seed)

    self.batch_size = batch_size
    self.policy = policy
    self.baseline = baseline
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = make_env(config['env-name'])()
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=None,
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers,
                               is_meta_test=True)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        print("train:", np.mean(get_returns(train_episodes[0])))
        valid_returns.append(get_returns(valid_episodes))
        print("valid:", np.mean(get_returns(valid_episodes)))

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
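# A hypothetical entry point for the evaluation main() above, inferred only from
# the attributes it reads (args.config, args.policy, args.output, args.meta_batch_size,
# args.num_batches, args.num_workers, args.seed, args.device). The actual script's
# flag names and defaults may differ; the defaults below are illustrative.
import argparse
import multiprocessing as mp

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate a meta-trained policy')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the JSON configuration file')
    parser.add_argument('--policy', type=str, required=True,
                        help='path to the saved policy checkpoint')
    parser.add_argument('--output', type=str, required=True,
                        help='path of the .npz file where returns are saved')
    parser.add_argument('--meta-batch-size', type=int, default=20,
                        help='number of tasks sampled per evaluation batch')
    parser.add_argument('--num-batches', type=int, default=10,
                        help='number of evaluation batches')
    parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 1,
                        help='number of sampler worker processes')
    parser.add_argument('--seed', type=int, default=None,
                        help='random seed')
    parser.add_argument('--device', type=str, default='cpu',
                        help='torch device, e.g. "cpu" or "cuda"')

    main(parser.parse_args())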
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    # Set tb_writer
    args.log_name = "env-name::%s_num-steps::%s_fast-lr::%s_log" % (
        config["env-name"], config["num-steps"], config["fast-lr"])
    tb_writer = SummaryWriter("./{0}/tb_{1}_logs".format(args.output_folder, args.log_name))
    log = set_log(args)

    # Set seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = make_env(config["env-name"])()
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    best_score = -np.inf
    for batch in range(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        metalearner.step(*futures,
                         max_kl=config['max-kl'],
                         cg_iters=config['cg-iters'],
                         cg_damping=config['cg-damping'],
                         ls_max_steps=config['ls-max-steps'],
                         ls_backtrack_ratio=config['ls-backtrack-ratio'])

        # For logging
        train_episodes, valid_episodes = sampler.sample_wait(futures)
        train_score = np.mean(get_returns(train_episodes[0]))
        val_score = np.mean(get_returns(valid_episodes))

        log[args.log_name].info("At iteration {}, train_reward: {:.3f}".format(batch, train_score))
        tb_writer.add_scalars("reward/", {"train": train_score}, batch)

        log[args.log_name].info("At iteration {}, valid_reward: {:.3f}".format(batch, val_score))
        tb_writer.add_scalars("reward/", {"val": val_score}, batch)

        # Save policy
        if val_score > best_score:
            best_score = val_score
            log[args.log_name].info("Saving best valid score: {:.3f}".format(best_score))
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
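# A hypothetical entry point for the training main() above, inferred only from
# the attributes it reads (args.config, args.output_folder, args.num_workers,
# args.seed, args.device; args.log_name is set inside main). The real script's
# flag names and defaults may differ, and set_log() may require extra arguments.
import argparse
import multiprocessing as mp

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train MAML-TRPO')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the YAML configuration file')
    parser.add_argument('--output-folder', type=str, default=None,
                        help='folder for policy.th, config.json and TensorBoard logs')
    parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 1,
                        help='number of sampler worker processes')
    parser.add_argument('--seed', type=int, default=None,
                        help='random seed')
    parser.add_argument('--device', type=str, default='cpu',
                        help='torch device, e.g. "cpu" or "cuda"')

    main(parser.parse_args())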