def test_sample(env_name, batch_size, num_tasks, num_steps, num_workers):
    # Environment
    env = gym.make(env_name)
    env.close()

    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {},  # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)
    tasks = sampler.sample_tasks(num_tasks=num_tasks)
    train_episodes, valid_episodes = sampler.sample(tasks, num_steps=num_steps)
    sampler.close()

    assert len(train_episodes) == num_steps
    assert len(train_episodes[0]) == num_tasks
    assert isinstance(train_episodes[0][0], BatchEpisodes)

    assert len(valid_episodes) == num_tasks
    assert isinstance(valid_episodes[0], BatchEpisodes)
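As a usage sketch, the test can also be driven directly with small settings. Note the environment id below is an assumption: it has to be one of the task-based environments (exposing sample_tasks / reset_task) registered by the package, and the batch/task/step counts are arbitrary small values chosen for illustration.

if __name__ == '__main__':
    # Hypothetical invocation; 'TabularMDP-v0' is an assumed task-based
    # environment id, not a value taken from the original test.
    test_sample(env_name='TabularMDP-v0',
                batch_size=4,
                num_tasks=2,
                num_steps=1,
                num_workers=2)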
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        # Load the trained policy parameters
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    # Test phase: for each batch of tasks, adapt the policy with inner-loop
    # gradient updates and collect pre- and post-adaptation returns
    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(tasks,
                                                        num_steps=config['num-steps'],
                                                        fast_lr=config['fast-lr'],
                                                        gamma=config['gamma'],
                                                        gae_lambda=config['gae-lambda'],
                                                        device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

    # For reference, get_returns is defined as:
    # def get_returns(episodes):
    #     return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
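main() only receives an argparse-style namespace, so the command-line interface is implicit. Below is a minimal sketch of a parser reconstructed from the attributes main() reads (config, policy, output, seed, device, num_batches, meta_batch_size, num_workers); the flag spellings and defaults are assumptions, not taken from the source.

if __name__ == '__main__':
    import argparse
    import multiprocessing as mp
    import torch

    parser = argparse.ArgumentParser(description='Evaluate a trained MAML policy')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the JSON configuration file')
    parser.add_argument('--policy', type=str, required=True,
                        help='path to the saved policy state_dict')
    parser.add_argument('--output', type=str, required=True,
                        help='path of the .npz file the returns are written to')
    parser.add_argument('--seed', type=int, default=None,
                        help='random seed')
    parser.add_argument('--num-batches', type=int, default=10,
                        help='number of task batches to evaluate')
    parser.add_argument('--meta-batch-size', type=int, default=20,
                        help='number of tasks sampled per batch')
    parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 1,
                        help='number of sampler worker processes')
    parser.add_argument('--use-cuda', action='store_true',
                        help='run on GPU if one is available')
    args = parser.parse_args()
    # main() reads args.device, so derive it from the --use-cuda flag
    args.device = 'cuda' if (args.use_cuda and torch.cuda.is_available()) else 'cpu'

    main(args)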
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # Use an empty dict when the config has no 'env-kwargs' entry
    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    # Added: track the returns after each inner gradient step (grad0, grad1, ...).
    # Grad_Steps is assumed to equal the number of inner-loop steps in the config,
    # since train_episodes contains one list of episodes per gradient step.
    Grad_Steps = config['num-steps']
    grad_returns = [[] for _ in range(Grad_Steps)]

    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(tasks,
                                                        num_steps=config['num-steps'],
                                                        fast_lr=config['fast-lr'],
                                                        gamma=config['gamma'],
                                                        gae_lambda=config['gae-lambda'],
                                                        device=args.device)

        logs['tasks'].extend(tasks)

        # Added: per-task returns after the i-th inner gradient step
        for i in range(Grad_Steps):
            grad_returns[i].append(get_returns(train_episodes[i]))

        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

    # Added: one 'grad{i}_returns' array per gradient step
    for i in range(Grad_Steps):
        logs['grad' + str(i) + '_returns'] = np.concatenate(grad_returns[i], axis=0)

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    # Added: print the mean return after each gradient step, followed by the
    # post-adaptation (validation) mean return
    value = [0] * (Grad_Steps + 1)
    for i in range(Grad_Steps):
        value[i] = logs['grad' + str(i) + '_returns'].mean()
    value[Grad_Steps] = logs['valid_returns'].mean()
    print(value)
    print(logs['valid_returns'].mean())

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
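Once main() has written the .npz file, the per-step averages can be recomputed offline. The sketch below only relies on the keys written above ('grad{i}_returns' and 'valid_returns'); the helper name summarize_returns and its arguments are hypothetical.

import numpy as np

def summarize_returns(path, grad_steps):
    # Hypothetical helper: report the mean return after each inner gradient
    # step (grad0 ... grad{grad_steps-1}) and after adaptation (valid_returns).
    logs = np.load(path)
    means = [logs['grad{0}_returns'.format(i)].mean() for i in range(grad_steps)]
    means.append(logs['valid_returns'].mean())
    for step, mean in enumerate(means):
        print('mean return after {0} gradient step(s): {1:.3f}'.format(step, mean))
    return means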