def test_sample(env_name, batch_size, num_tasks, num_steps, num_workers):
    # Environment
    env = gym.make(env_name)
    env.close()

    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {},  # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)

    tasks = sampler.sample_tasks(num_tasks=num_tasks)
    train_episodes, valid_episodes = sampler.sample(tasks, num_steps=num_steps)
    sampler.close()

    assert len(train_episodes) == num_steps
    assert len(train_episodes[0]) == num_tasks
    assert isinstance(train_episodes[0][0], BatchEpisodes)

    assert len(valid_episodes) == num_tasks
    assert isinstance(valid_episodes[0], BatchEpisodes)
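# Illustrative only: a minimal pytest harness for test_sample above. The environment
# id '2DNavigation-v0' and the parameter values are assumptions, not part of the
# original test module.
import pytest

@pytest.mark.parametrize('env_name', ['2DNavigation-v0'])
@pytest.mark.parametrize('num_workers', [1, 2])
def test_sample_smoke(env_name, num_workers):
    test_sample(env_name, batch_size=5, num_tasks=3,
                num_steps=1, num_workers=num_workers)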
def main(args, config):
    set_random_seed(args)

    # Environment
    env = get_environment(args, config)

    # Policy & Baseline
    policy = get_policy_for_env(args,
                                env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()  # this is done to share memory across processes for multiprocessing
    baseline = LinearFeatureBaseline(reduce(mul, env.observation_space.shape, 1))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    # Meta Model
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    # Solver
    solver = Solver(args, config, policy, sampler, metalearner)
    solver.train(args, config)
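# Illustrative only: the minimal set of config keys that main(args, config) above
# reads. The values shown are placeholders, not the repository's defaults.
example_config = {
    'env-name': '2DNavigation-v0',   # assumed environment id
    'env-kwargs': {},
    'hidden-sizes': (64, 64),
    'nonlinearity': 'tanh',
    'fast-batch-size': 20,           # trajectories per task
    'fast-lr': 0.1,                  # inner-loop (adaptation) learning rate
    'first-order': False,            # whether to use the first-order MAML approximation
}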
def main(args):
    args.output_folder = args.env_name  # TODO
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
                                            'AntPos-v0', 'HalfCheetahVel-v1',
                                            'HalfCheetahDir-v1',
                                            '2DNavigation-v0'])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    print(config)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),  # input size
            int(np.prod(sampler.envs.action_space.shape)),       # output size
            hidden_sizes=(args.hidden_size,) * args.num_layers)  # e.g. [100, 100]
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):  # number of meta-training iterations
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #                   total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #                   total_rewards([ep.rewards for _, ep in episodes]), batch)

        # # Save policy network
        # with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
        #     torch.save(policy.state_dict(), f)

        print(batch,
              total_rewards([ep.rewards for ep, _ in episodes]),
              total_rewards([ep.rewards for _, ep in episodes]))
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))  # load the trained weights
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    # Test phase: adapt the meta-trained policy on newly sampled tasks
    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))
        # Definition of get_returns (for reference):
        # def get_returns(episodes):
        #     return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
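# Illustrative only: reading back the evaluation logs written by main() above.
# 'results.npz' stands in for whatever path was passed as args.output.
import numpy as np

logs = np.load('results.npz', allow_pickle=True)  # 'tasks' is stored as an object array
print(logs['train_returns'].shape)   # roughly (num_batches * meta_batch_size, fast_batch_size)
print(logs['valid_returns'].mean())  # average return after adaptation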
def main(args): continuous_actions = True writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(args.num_batches): print("========== BATCH NUMBER {0} ==========".format(batch)) tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Save policy network with open( os.path.join(save_folder, 'policy-{0}.pt'.format(batch + 256)), 'wb') as f: torch.save(policy.state_dict(), f)
def main(args): continuous_actions = (args.env_name in [ 'AntVelEnv-v1', 'AntDirEnv-v1', 'HalfCheetahVelEnv-v1', 'HalfCheetahDirEnv-v1', '2DNavigation-v0' ]) save_folder = os.path.join('tmp', args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) # Load model with open(args.model, 'rb') as f: state_dict = torch.load(f) policy.load_state_dict(state_dict) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) args.meta_batch_size = 81 # velocities = np.linspace(-1., 3., num=args.meta_batch_size) # tasks = [{'velocity': velocity} for velocity in velocities] tasks = [{'direction': direction} for direction in [-1, 1]] for batch in range(args.num_batches): episodes = metalearner.sample(tasks) train_returns = [ep.rewards.sum(0).cpu().numpy() for ep, _ in episodes] valid_returns = [ep.rewards.sum(0).cpu().numpy() for _, ep in episodes] with open(os.path.join(save_folder, '{0}.npz'.format(batch)), 'wb') as f: np.savez(f, train=train_returns, valid=valid_returns) print('Batch {0}'.format(batch))
def hierarchical_meta_policy(env, skills_dim, sampler, output_size, net_size):
    # NOTE: relies on a module-level `args` for the hidden sizes of the higher policy
    higher_policy = CategoricalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        skills_dim,
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    observation_dim = int(np.prod(sampler.envs.observation_space.shape))
    action_dim = int(np.prod(sampler.envs.action_space.shape))
    hidden_size = net_size

    # Define the networks
    q_value_function_1 = FlattenMlp(hidden_sizes=[hidden_size, hidden_size],
                                    input_size=observation_dim + action_dim + skills_dim,
                                    output_size=output_size)
    q_value_function_2 = FlattenMlp(hidden_sizes=[hidden_size, hidden_size],
                                    input_size=observation_dim + action_dim + skills_dim,
                                    output_size=output_size)
    value_function = FlattenMlp(hidden_sizes=[hidden_size, hidden_size],
                                input_size=observation_dim,
                                output_size=output_size)
    discriminator_function = FlattenMlp(
        hidden_sizes=[hidden_size, hidden_size],
        input_size=observation_dim,
        output_size=skills_dim)
    policy = TanhGaussianPolicy(hidden_sizes=[hidden_size, hidden_size],
                                obs_dim=observation_dim + skills_dim,
                                action_dim=action_dim)

    # Define the empowerment skills algorithm
    env_pusher = PusherEnv()
    algorithm = EmpowermentSkills(env=env_pusher,
                                  policy=policy,
                                  higher_policy=higher_policy,
                                  discriminator=discriminator_function,
                                  q_value_function_1=q_value_function_1,
                                  q_value_function_2=q_value_function_2,
                                  value_function=value_function)
    lower_policy = algorithm

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    return higher_policy, lower_policy, baseline
def test_init(env_name, num_workers):
    batch_size = 10

    # Environment
    env = gym.make(env_name)
    env.close()

    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {},  # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)
    sampler.close()
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    post_policy = get_policy_for_env(args.device,
                                     args.log_var_init,
                                     env,
                                     hidden_sizes=config['hidden-sizes'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        post_policy.load_state_dict(state_dict)

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # tasks = env.unwrapped.sample_tasks(num_tasks)  # 'num_tasks' was undefined; tasks are sampled via the sampler below

    # NOTE: the sampler and log buffers below are assumptions, reconstructed to
    # follow the pattern of the other evaluation scripts in this collection.
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=post_policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)
    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
def main(args): group_name = ''.join([ random.choice(string.ascii_letters + string.digits) for n in range(4) ]) wandb.init(group=group_name, job_type='optimizer', tensorboard=True) wandb.config.update(args) device = torch.device(args.device) continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=device.type) json.dump(config, f, indent=2) sampler = BatchSampler(group_name, args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=device) for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f)
def train_pretrained_model(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0' ]) # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained')) save_folder = './saves/{0}'.format(args.output_folder + '_pretrained') if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) #batch_size=2*args.fast_batch_size to match the amount of data used in meta-learning sampler = BatchSampler(args.env_name, batch_size=2 * args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) #load pretrained model cont_from_batch = 0 if args.start_from_batch != -1: pretrained_model = os.path.join( save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1)) if os.path.exists(pretrained_model): policy.load_state_dict(torch.load(pretrained_model)) cont_from_batch = args.start_from_batch metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(cont_from_batch, args.num_batches): print('Currently processing Batch: {}'.format(batch + 1)) task_sampling_time = time.time() tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) task_sampling_time = time.time() - task_sampling_time episode_generating_time = time.time() episodes = metalearner.sample_for_pretraining( tasks, first_order=args.first_order) episode_generating_time = time.time() - episode_generating_time learning_step_time = time.time() params = metalearner.adapt(episodes, first_order=args.first_order) metalearner.policy.load_state_dict(params, strict=True) learning_step_time = time.time() - learning_step_time print('Tasking Sampling Time: {}'.format(task_sampling_time)) print('Episode Generating Time: {}'.format(episode_generating_time)) print('Learning Step Time: {}'.format(learning_step_time)) # Tensorboard # writer.add_scalar('total_rewards/before_update', # total_rewards([ep.rewards for ep, _ in episodes]), batch) # writer.add_scalar('total_rewards/after_update', # total_rewards([ep.rewards for _, ep in episodes]), batch) # experiment.log_metric("Avg Disc Reward (Pretrained)", total_rewards([episodes.rewards], args.gamma), batch+1) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(metalearner.policy.state_dict(), f) return
def main(args): env_name = 'RVONavigationAll-v0' #['2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0'] test_folder = './{0}'.format('test_nav') fast_batch_size = 40 # number of trajectories saved_policy_file = os.path.join( './TrainingResults/result3/saves/{0}'.format('maml-2DNavigation-dir'), 'policy-180.pt') sampler = BatchSampler(env_name, batch_size=fast_batch_size, num_workers=3) policy = NormalMLPPolicy(int(np.prod( sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(100, ) * 2) # Loading policy if os.path.isfile(saved_policy_file): policy_info = torch.load(saved_policy_file, map_location=lambda storage, loc: storage) policy.load_state_dict(policy_info) print('Loaded saved policy') else: sys.exit("The requested policy does not exist for loading") # Creating test folder if not os.path.exists(test_folder): os.makedirs(test_folder) # Generate tasks # goal = [[-0.8, 0.9]] # task = [{'goal': goal}][0] tasks = sampler.sample_tasks(num_tasks=1) task = tasks[0] # Start validation print("Starting to test...Total step = ", args.grad_steps) start_time = time.time() # baseline = LinearFeatureBaseline(int(np.prod(sampler.envs.observation_space.shape))) baseline = LinearFeatureBaseline(int(np.prod((2, )))) metalearner = MetaLearner(sampler, policy, baseline, gamma=0.9, fast_lr=0.01, tau=0.99, device='cpu') # test_episodes = metalearner.sample(tasks) # for train, valid in test_episodes: # total_reward, dist_reward, col_reward = total_rewards(train.rewards) # print(total_reward) # total_reward, dist_reward, col_reward = total_rewards(valid.rewards) # print(total_reward) test_episodes = metalearner.test(task, n_grad=args.grad_steps) print('-------------------') for n_grad, ep in test_episodes: total_reward, dist_reward, col_reward = total_rewards(ep.rewards) print(total_reward) # with open(os.path.join(test_folder, 'test_episodes_grad'+str(n_grad)+'.pkl'), 'wb') as f: # pickle.dump([ep.observations.cpu().numpy(), ep], f) # with open(os.path.join(test_folder, 'task.pkl'), 'wb') as f: # pickle.dump(task, f) print('Finished test. Time elapsed = {}'.format( time_elapsed(time.time() - start_time)))
def main(args): with open(args.config, 'r') as f: config = yaml.load(f, Loader=yaml.FullLoader) if args.output_folder is not None: if not os.path.exists(args.output_folder): os.makedirs(args.output_folder) config_filename = os.path.join(args.output_folder, str(args.seed) + '_config.json') policy_filename = os.path.join(args.output_folder, str(args.seed) + '_policy.th') result_filename_txt = os.path.join(args.output_folder, str(args.seed) + '_results.txt') result_filename_pickle = os.path.join( args.output_folder, str(args.seed) + '_results.pickle') with open(config_filename, 'w') as f: config.update(vars(args)) json.dump(config, f, indent=2) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) results = { 'train_costs': [], 'test_costs': [], 'train_costs_sum': [], # the cost 'test_costs_sum': [], 'train_costs_mean': [], 'test_costs_mean': [], # the evaluation for grid-world, key-door, and mountain-car problems 'train_returns': [], 'test_returns': [], 'train_returns_mean': [], 'test_returns_mean': [], # the evaluation for the treasure problem 'train_returns_std': [], 'test_returns_std': [], } # env = gym.make(config['env-name'], **config['env-kwargs']) env = gym.make(config['env-name']) env.close() # Policy policy = get_policy_for_env(env, hidden_sizes=config['hidden-sizes'], nonlinearity=config['nonlinearity']) policy.share_memory() # Baseline baseline = LinearFeatureBaseline(get_input_size(env)) # Sampler sampler = MultiTaskSampler(config['env-name'], env_kwargs=config['env-kwargs'], batch_size=config['fast-batch-size'], policy=policy, baseline=baseline, env=env, seed=args.seed, num_workers=args.num_workers) metalearner = MAMLTRPO(policy, fast_lr=config['fast-lr'], first_order=config['first-order'], device=args.device) num_iterations = 0 for batch in trange(config['num-batches']): tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size']) # print(tasks) futures = sampler.sample_async(tasks, num_steps=config['num-steps'], fast_lr=config['fast-lr'], gamma=config['gamma'], gae_lambda=config['gae-lambda'], device=args.device) # print(futures) logs = metalearner.step( *futures, max_kl=config['max-kl'], cg_iters=config['cg-iters'], cg_damping=config['cg-damping'], ls_max_steps=config['ls-max-steps'], ls_backtrack_ratio=config['ls-backtrack-ratio']) # print('logs') train_episodes, valid_episodes = sampler.sample_wait(futures) # print('train_episodes') num_iterations += sum( sum(episode.lengths) for episode in train_episodes[0]) num_iterations += sum( sum(episode.lengths) for episode in valid_episodes) logs.update(tasks=tasks, num_iterations=num_iterations, train_returns=get_returns(train_episodes[0]), valid_returns=get_returns(valid_episodes)) train_returns = get_discounted_returns(train_episodes[0]) test_returns = get_discounted_returns(valid_episodes) train_costs = get_costs(train_episodes[0]) test_costs = get_costs(valid_episodes) # Save results results['train_returns'].append(train_returns) results['test_returns'].append(test_returns) results['train_returns_mean'].append(np.mean(train_returns)) results['test_returns_mean'].append(np.mean(test_returns)) results['train_returns_std'].append(np.std(train_returns)) results['test_returns_std'].append(np.std(test_returns)) results['train_costs'].append(train_costs) results['test_costs'].append(test_costs) results['train_costs_sum'].append(np.sum(train_costs)) results['test_costs_sum'].append(np.sum(test_costs)) results['train_costs_mean'].append(np.mean(train_costs)) results['test_costs_mean'].append(np.mean(test_costs)) with 
open(result_filename_txt, "w") as file: file.write(str(results)) with open(result_filename_pickle, "wb") as file: dump(results, file, protocol=2) # Save policy if args.output_folder is not None: with open(policy_filename, 'wb') as f: torch.save(policy.state_dict(), f) print(np.sum(results['train_costs_sum']))
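# Illustrative only: reading the pickled results written by main() above, assuming
# the dump() call is pickle.dump. '0_results.pickle' stands in for the
# '<seed>_results.pickle' file created in args.output_folder.
import pickle

with open('0_results.pickle', 'rb') as f:
    results = pickle.load(f)
print(results['train_returns_mean'][-1])  # mean discounted train return of the last batch
print(results['test_costs_sum'][-1])      # summed test cost of the last batch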
def main(args): with open(args.config, 'r') as f: config = yaml.load(f, Loader=yaml.FullLoader) if args.env_name in all_envs: config.update({ "env-name": args.env_name + "-v0", "env-kwargs": {}, "fast-batch-size": 16, "num-batches": 2000, "meta-batch-size": 1 }) if args.output_folder is not None: if not os.path.exists(args.output_folder): os.makedirs(args.output_folder) policy_filename = os.path.join(args.output_folder, 'policy.th') config_filename = os.path.join(args.output_folder, 'config.json') with open(config_filename, 'w') as f: config.update(vars(args)) json.dump(config, f, indent=2) if args.seed is not None: torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) model_name = "maml" os.makedirs(f"{LOG_DIR}/{model_name}/{config['env-name']}/", exist_ok=True) run_num = len([ n for n in os.listdir(f"{LOG_DIR}/{model_name}/{config['env-name']}/") ]) log_path = f"{LOG_DIR}/{model_name}/{config['env-name']}/logs_{run_num}.txt" env = gym.make(config['env-name'], **config.get('env-kwargs', {})) env.close() policy = get_policy_for_env(env, hidden_sizes=config['hidden-sizes'], nonlinearity=config['nonlinearity']) policy.share_memory() baseline = LinearFeatureBaseline(get_input_size(env)) sampler = MultiTaskSampler(config['env-name'], env_kwargs=config.get('env-kwargs', {}), batch_size=config['fast-batch-size'], policy=policy, baseline=baseline, env=env, seed=args.seed, num_workers=args.num_workers) metalearner = MAMLTRPO(policy, fast_lr=config['fast-lr'], first_order=config['first-order'], device=args.device) num_iterations = 0 total_rewards = [] start = time.time() step = 0 # for batch in range(config['num-batches']+1): while step <= 500000: tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size']) futures = sampler.sample_async(tasks, num_steps=config['num-steps'], fast_lr=config['fast-lr'], gamma=config['gamma'], gae_lambda=config['gae-lambda'], device=args.device) logs = metalearner.step( *futures, max_kl=config['max-kl'], cg_iters=config['cg-iters'], cg_damping=config['cg-damping'], ls_max_steps=config['ls-max-steps'], ls_backtrack_ratio=config['ls-backtrack-ratio']) train_episodes, valid_episodes = sampler.sample_wait(futures) num_iterations += sum( sum(episode.lengths) for episode in train_episodes[0]) num_iterations += sum( sum(episode.lengths) for episode in valid_episodes) logs.update(tasks=tasks, num_iterations=num_iterations, train_returns=get_returns(train_episodes[0]), valid_returns=get_returns(valid_episodes)) # Save policy old_step = step step += 250 if args.env_name in all_envs else train_episodes[0][ 0].lengths[0] if old_step == 0 or step // 1000 > old_step // 1000: rollouts = logs["valid_returns"][0] reward = np.mean(rollouts, -1) ep = step // 1000 total_rewards.append(reward) string = f"Step: {int(1000*ep):7d}, Reward: {total_rewards[-1]:9.3f} [{np.std(rollouts):8.3f}], Avg: {np.mean(total_rewards, axis=0):9.3f} ({0.0:.3f}) <{get_time(start)}> ({{}})" print(string) with open(log_path, "a+") as f: f.write(f"{string}\n")
def main(args): logging.basicConfig(filename=args.debug_file, level=logging.WARNING, filemode='w') logging.getLogger('metalearner').setLevel(logging.INFO) continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) #if args.baseline == 'critic shared': # policy = NormalMLPPolicyA2C(int(np.prod(sampler.envs.observation_space.shape)), # int(np.prod(sampler.envs.action_space.shape)), # hidden_sizes=(args.hidden_size,) * args.num_layers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) if args.baseline == 'linear': baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) elif args.baseline == 'critic separate': baseline = CriticFunction( int(np.prod(sampler.envs.observation_space.shape)), 1, hidden_sizes=(args.hidden_size, ) * args.num_layers) #elif args.baseline == 'critic shared': # RANJANI TO DO metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device, baseline_type=args.baseline, cliprange=args.cliprange, noptepochs=args.noptepochs, usePPO=args.usePPO, nminibatches=args.nminibatches, ppo_lr=args.ppo_lr, useSGD=args.useSGD, ppo_momentum=args.ppo_momentum, grad_clip=args.grad_clip) for batch in range(args.num_batches): print("*********************** Batch: " + str(batch) + " ****************************") print("Creating tasks...") tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) print("Creating episodes...") episodes, grad_norm = metalearner.sample(tasks, first_order=args.first_order) print("Taking a meta step...") metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print("Writing results to tensorboard...") # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) if grad_norm: writer.add_scalar('PPO mb grad norm', np.average(grad_norm)) print(np.average(grad_norm)) print("Saving policy network...") # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) print("***************************************************")
def train_meta_learning_model(args): # import matplotlib.pyplot as plt # import matplotlib.animation as animation # from matplotlib import style # style.use('fivethirtyeight') # fig = plt.figure() # ax1 = fig.add_subplot(1,1,1) # xs = [] # ys = [] # def animate(i): # ax1.clear() # ax1.plot(xs, ys) rewards_before_ml = [] rewards_after_ml = [] continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0' ]) # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_metalearned')) save_folder = './saves/{0}'.format(args.output_folder + '_metalearned') if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) torch.manual_seed(args.random_seed) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) #load pretrained model cont_from_batch = 0 if args.start_from_batch != -1: metalearned_model = os.path.join( save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1)) if os.path.exists(metalearned_model): policy.load_state_dict(torch.load(metalearned_model)) cont_from_batch = args.start_from_batch metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(cont_from_batch, args.num_batches): print('Currently processing Batch: {}'.format(batch + 1)) task_sampling_time = time.time() tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size, sampling_type=args.sampling_type, points_per_dim=args.points_per_dim) task_sampling_time = time.time() - task_sampling_time episode_generating_time = time.time() episodes = metalearner.sample(tasks, first_order=args.first_order) episode_generating_time = time.time() - episode_generating_time learning_step_time = time.time() metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) learning_step_time = time.time() - learning_step_time print('Tasking Sampling Time: {}'.format(task_sampling_time)) print('Episode Generating Time: {}'.format(episode_generating_time)) print('Learning Step Time: {}'.format(learning_step_time)) reward_before_ml = total_rewards([ep.rewards for ep, _ in episodes], args.gamma) reward_after_ml = total_rewards([ep.rewards for _, ep in episodes], args.gamma) print('Before Update: {} After Update: {}'.format( reward_before_ml, reward_after_ml)) # experiment.log_metric("Avg Reward Before Update (MetaLearned)", reward_before_ml) experiment.log_metric("Avg Reward", reward_after_ml, batch + 1) rewards_before_ml.append(reward_before_ml) rewards_after_ml.append(reward_after_ml) # xs.append(batch+1) # ys.append(total_rewards([ep.rewards for _, ep in episodes], args.gamma)) # ani = animation.FuncAnimation(fig, animate, interval=1000) # 
plt.savefig('navg_baseline_monitor') # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(metalearner.policy.state_dict(), f) # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) # episodes = metalearner.sample(tasks, first_order=args.first_order) # print("Avg Reward After Update (MetaLearned)", total_rewards([ep.rewards for _, ep in episodes], args.gamma)) testing_sampler = BatchSampler(args.env_name, batch_size=args.testing_fbs, num_workers=args.num_workers) testing_metalearner = MetaLearner(testing_sampler, metalearner.policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) test_tasks = testing_sampler.sample_tasks(num_tasks=args.testing_mbs, sampling_type='rand', points_per_dim=-1) test_episodes = testing_metalearner.sample(test_tasks, first_order=args.first_order, no_update=True) test_reward = total_rewards([ep.rewards for ep in test_episodes], args.gamma) print('-------------------------------------------------') print('Test Time reward is: ' + str(test_reward)) print('-------------------------------------------------') pickle_reward_data_file = os.path.join(save_folder, 'reward_data.pkl') with open(pickle_reward_data_file, 'wb') as f: pickle.dump(rewards_before_ml, f) pickle.dump(rewards_after_ml, f) pickle_final_reward_file = os.path.join(save_folder, 'final_reward.pkl') with open(pickle_final_reward_file, 'wb') as f: pickle.dump(test_reward, f) return
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'Pusher' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) if not args.hierarchical: sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for i, batch in enumerate(range(args.num_batches)): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print('Total Rewards', str(total_rewards([ep.rewards for _, ep in episodes]))) # Tensorboard writer.add_scalar( 'total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar( 'total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) if (i + 1) % args.save_every == 0: # Save policy network with open( os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy, f) else: sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) # Get the policies higher_policy, lower_trainer, baseline = hierarchical_meta_policy( args.env_name, args.skills_dim, sampler=sampler, net_size=args.hidden_size, output_size=1) # Define the hierarchical meta learner hr_meta_learner = HierarchicalMetaLearner(sampler, higher_policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) # Training procedure for i, batch in enumerate(range(args.num_batches)): # Train the lower level policy lower_trainer.train() # Now freeze the lower level policy lower_networks = lower_trainer.networks lower_policy = lower_networks[0] lower_policy.trainable = False # Sample the different tasks tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) # Sample the different episodes for the different tasks episodes = hr_meta_learner.sample(tasks, lower_policy, first_order=args.first_order) hr_meta_learner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print('Total Rewards', str(total_rewards([ep.rewards for _, ep in episodes]))) lower_policy.trainable = True # Tensorboard writer.add_scalar( 'total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar( 'total_rewards/after_update', total_rewards([ep.rewards for _, ep in 
episodes]), batch) if (i + 1) % args.save_every == 0: # Save the policy networks with open( os.path.join(save_folder, 'h_policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(higher_policy, f) with open( os.path.join(save_folder, 'l_policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(lower_policy, f) with open(os.path.join(save_folder, 'baseline.pt'), 'wb') as f: torch.save(baseline, f)
def main(args, prior_policy=None, init_from_prior=True, verbose=1):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    args.output_size = reduce(mul, env.action_space.shape, 1)
    args.input_size = reduce(mul, env.observation_space.shape, 1)

    # # Policy
    # policy = get_policy_for_env(args,
    #                             env,
    #                             hidden_sizes=config['hidden-sizes'],
    #                             nonlinearity=config['nonlinearity'])

    # Get model: if a prior policy is available, initialize directly from it
    if prior_policy and init_from_prior:
        # Init from prior model: deepcopy creates an independent copy, whereas a
        # shallow copy would keep tracking changes to the original object
        post_policy = deepcopy(prior_policy).to(args.device)
    else:
        # Otherwise build a fresh policy
        post_policy = get_policy_for_env(args,
                                         env,
                                         hidden_sizes=config['hidden-sizes'])
    # The parameters can be used directly, without copying
    # post_policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Unpack parameters:
    # optim_func, optim_args, lr_schedule = \
    #     args.optim_func, args.optim_args, args.lr_schedule

    # Get optimizer (parameters to optimize):
    # optimizer = args.optim_func(post_policy.parameters(), **args.optim_args)

    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               num_tasks=config['meta-batch-size'],
                               policy=post_policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed)

    # tasks['goal']: fast-batch-size = 20 goal values (indices 0-19); each task is
    # a goal of the 2D navigation problem extracted from the batch
    # tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
    tasks = sampler.sample_tasks_return()

    # for index, task in enumerate(tasks):
    loss_train = []
    loss_test = []
    for index, task in enumerate(tasks):
        # For each task, sample fast-batch-size trajectories
        for batch in trange(config['num-batches']):
            train_episodes, train_loss, valid_episodes, valid_loss = sampler.sample(
                task,
                num_steps=config['num-steps'],
                fast_lr=config['fast-lr'],
                gamma=config['gamma'],
                gae_lambda=config['gae-lambda'],
                device=args.device)
            loss_train.append(train_loss)
            loss_test.append(valid_loss)

    # metalearner = MAMLTRPO(args,
    #                        post_policy,
    #                        fast_lr=config['fast-lr'],
    #                        first_order=config['first-order'])
    num_iterations = 0
def main(args): with open(args.config, 'r') as f: config = json.load(f) if args.seed is not None: torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # env = gym.make(config['env-name'], **config['env-kwargs']) env = gym.make(config['env-name'], **config.get('env-kwargs', {})) env.close() # Policy policy = get_policy_for_env(env, hidden_sizes=config['hidden-sizes'], nonlinearity=config['nonlinearity']) with open(args.policy, 'rb') as f: state_dict = torch.load(f, map_location=torch.device(args.device)) policy.load_state_dict(state_dict) policy.share_memory() # Baseline baseline = LinearFeatureBaseline(get_input_size(env)) # Sampler sampler = MultiTaskSampler(config['env-name'], env_kwargs=config.get('env-kwargs', {}), batch_size=config['fast-batch-size'], policy=policy, baseline=baseline, env=env, seed=args.seed, num_workers=args.num_workers) logs = {'tasks': []} train_returns, valid_returns = [], [] # to see the grad0 ~ multi gradient grad_returns = [] for i in range(Grad_Steps): grad_returns.append([]) # to see the grad0 ~ multi gradient for batch in trange(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) train_episodes, valid_episodes = sampler.sample( tasks, num_steps=config['num-steps'], fast_lr=config['fast-lr'], gamma=config['gamma'], gae_lambda=config['gae-lambda'], device=args.device) logs['tasks'].extend(tasks) # to see the grad0 ~ multi gradient for i in range(Grad_Steps): grad_returns[i].append(get_returns(train_episodes[i])) for i in range(Grad_Steps): logs['grad' + str(i) + '_returns'] = np.concatenate( grad_returns[i], axis=0) # to see the grad0 ~ multi gradient train_returns.append(get_returns(train_episodes[0])) valid_returns.append(get_returns(valid_episodes)) logs['train_returns'] = np.concatenate(train_returns, axis=0) logs['valid_returns'] = np.concatenate(valid_returns, axis=0) # to see the grad0 ~ multi gradient value = [0] * (Grad_Steps + 1) for i in range(Grad_Steps): value[i] = logs['grad' + str(i) + '_returns'].mean() value[Grad_Steps] = logs['valid_returns'].mean() print(value) print(logs['valid_returns'].mean()) # to see the grad0 ~ multi gradient with open(args.output, 'wb') as f: np.savez(f, **logs)
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0' ]) assert continuous_actions == True writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) log_traj_folder = './logs/{0}'.format(args.output_traj_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) if not os.path.exists(log_traj_folder): os.makedirs(log_traj_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) # log_reward_total_file = open('./logs/reward_total.txt', 'a') # log_reward_dist_file = open('./logs/reward_dist.txt', 'a') # log_reward_col_file = open('./logs/reward_col.txt', 'a') sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) # print(sampler.envs.observation_space.shape) # print(sampler.envs.action_space.shape) # eewfe if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) # baseline = LinearFeatureBaseline( # int(np.prod(sampler.envs.observation_space.shape))) baseline = LinearFeatureBaseline(int(np.prod((2, )))) resume_training = True if resume_training: saved_policy_path = os.path.join( './TrainingResults/result2//saves/{0}'.format( 'maml-2DNavigation-dir'), 'policy-180.pt') if os.path.isfile(saved_policy_path): print('Loading a saved policy') policy_info = torch.load(saved_policy_path) policy.load_state_dict(policy_info) else: sys.exit("The requested policy does not exist for loading") metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) start_time = time.time() for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # print("observations shape: ") # print(episodes[0][1].observations.shape) # ewerw # Tensorboard total_reward_be, dist_reward_be, col_reward_be = total_rewards( [ep.rewards for ep, _ in episodes]) total_reward_af, dist_reward_af, col_reward_af = total_rewards( [ep.rewards for _, ep in episodes]) log_reward_total_file = open('./logs/reward_total.txt', 'a') log_reward_dist_file = open('./logs/reward_dist.txt', 'a') log_reward_col_file = open('./logs/reward_col.txt', 'a') log_reward_total_file.write( str(batch) + ',' + str(total_reward_be) + ',' + str(total_reward_af) + '\n') log_reward_dist_file.write( str(batch) + ',' + str(dist_reward_be) + ',' + str(dist_reward_af) + '\n') log_reward_col_file.write( str(batch) + ',' + str(col_reward_be) + ',' + str(col_reward_af) + '\n') log_reward_total_file.close( ) # not sure if open and close immediantly will help save the appended logs in-place log_reward_dist_file.close() log_reward_col_file.close() writer.add_scalar('total_rewards/before_update', total_reward_be, batch) 
writer.add_scalar('total_rewards/after_update', total_reward_af, batch) writer.add_scalar('distance_reward/before_update', dist_reward_be, batch) writer.add_scalar('distance_reward/after_update', dist_reward_af, batch) writer.add_scalar('collison_rewards/before_update', col_reward_be, batch) writer.add_scalar('collison_rewards/after_update', col_reward_af, batch) if batch % args.save_every == 0: # maybe it can save time/space if the models are saved only periodically # Save policy network print('Saving model {}'.format(batch)) with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) if batch % 30 == 0: with open( os.path.join( log_traj_folder, 'train_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for ep, _ in episodes], f) with open( os.path.join( log_traj_folder, 'valid_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for _, ep in episodes], f) # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f) # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f) # save tasks # a sample task list of 2: [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}] with open( os.path.join(log_traj_folder, 'tasks_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump(tasks, f) else: # supposed to be overwritten for each batch with open( os.path.join(log_traj_folder, 'latest_train_episodes_observ.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for ep, _ in episodes], f) with open( os.path.join(log_traj_folder, 'latest_valid_episodes_observ.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for _, ep in episodes], f) # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f) # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f) with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'), 'wb') as f: pickle.dump(tasks, f) print('finished epoch {}; time elapsed: {}'.format( batch, time_elapsed(time.time() - start_time)))
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if args.env_name == 'AntVel-v1': param_bounds = {"goal": [0, 3]} if args.env_name == 'AntPos-v0': param_bounds = {"x": [-3, 3], "y": [-3, 3]} teacher = TeacherController(args.teacher, args.nb_test_episodes, param_bounds, seed=args.seed, teacher_params={}) tree = TreeLSTM(args.tree_hidden_layer, len(param_bounds.keys()), args.cluster_0, args.cluster_1, device=args.device) if continuous_actions: policy = NormalMLPPolicy(int( np.prod(sampler.envs.observation_space.shape) + args.tree_hidden_layer), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers, tree=tree) else: policy = CategoricalMLPPolicy(int( np.prod(sampler.envs.observation_space.shape) + args.tree_hidden_layer), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers, tree=tree) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape)) + args.tree_hidden_layer) metalearner = MetaLearner(sampler, policy, baseline, tree=tree, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) all_tasks = [] for batch in range(args.num_batches): print("starting iteration {}".format(batch)) tasks = [] for _ in range(args.meta_batch_size): if args.env_name == 'AntPos-v0': tasks.append( {"position": teacher.task_generator.sample_task()}) if args.env_name == 'AntVel-v1': tasks.append( {"velocity": teacher.task_generator.sample_task()[0]}) all_tasks.append(tasks) # tasks = np.array(tasks) # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) with open('./logs/{0}/task_list.pkl'.format(args.output_folder), 'wb') as pf: pickle.dump(all_tasks, pf) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) tr = [ep.rewards for _, ep in episodes] tr = [torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr] print("rewards:", tr) for t in range(args.meta_batch_size): if args.env_name == 'AntPos-v0': teacher.task_generator.update(tasks[t]["position"], tr[t]) if args.env_name == 'AntVel-v1': teacher.task_generator.update(np.array([tasks[t]["velocity"]]), tr[t]) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) # Save tree torch.save(tree, os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
# Fragment: assumes `args`, `sampler`, `continuous_actions`, and `batch` are
# already defined by the surrounding script, as in the training mains above.
if continuous_actions:
    the_model = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
else:
    the_model = CategoricalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        sampler.envs.action_space.n,
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

# Loading the model
save_folder = './saves/{0}'.format(args.output_folder)
the_model.load_state_dict(
    torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))

baseline = LinearFeatureBaseline(
    int(np.prod(sampler.envs.observation_space.shape)))

metalearner = MetaLearner(sampler, the_model, baseline, gamma=args.gamma,
                          fast_lr=args.fast_lr, tau=args.tau,
                          device=args.device)

env = gym.make(args.env_name)  # new task!
episodes = []
# randomly sample task
task_queue = mp.Queue()            # assumed: consumed by SamplerWorker below
train_episodes_queue = mp.Queue()  # assumed: consumed by SamplerWorker below
valid_episodes_queue = mp.Queue()
policy_lock = mp.Lock()

env_name = "2DNavigation-v0"
env_kwargs = {
    "low": -0.5,
    "high": 0.5,
    "task": {"goal": np.array([1, 1])}
}
env = gym.make(env_name, **env_kwargs)
print(env)

policy = get_policy_for_env(env, hidden_sizes=(64, 64), nonlinearity='tanh')
policy.share_memory()

baseline = LinearFeatureBaseline(get_input_size(env))

seed = None
worker = SamplerWorker(1,
                       env_name,
                       env_kwargs,
                       20,  # batch size
                       env.observation_space,
                       env.action_space,
                       policy,
                       deepcopy(baseline),
                       seed,
                       task_queue,
                       train_episodes_queue,
                       valid_episodes_queue,
                       policy_lock)
def run_meta_training(self, final_model_folder, policy=None):
    parser = argparse.ArgumentParser(
        description='Reinforcement learning with '
                    'Model-Agnostic Meta-Learning (MAML)')

    # General
    parser.add_argument('--env-name', type=str,
                        help='name of the environment', default='BiddingMDP-v0')
    parser.add_argument('--gamma', type=float, default=0.95,
                        help='value of the discount factor gamma')
    parser.add_argument('--tau', type=float, default=1.0,
                        help='value of the discount factor for GAE')
    parser.add_argument('--first-order', action='store_true',
                        help='use the first-order approximation of MAML')

    # Policy network (relu activation function)
    # parser.add_argument('--hidden-size', type=int, default=50,
    parser.add_argument('--hidden-size', type=int, default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--num-layers', type=int, default=2,
                        help='number of hidden layers')

    # Task-specific
    parser.add_argument('--fast-batch-size', type=int, default=20,
                        help='batch size for each individual task')
    parser.add_argument('--fast-lr', type=float, default=0.5,
                        help='learning rate for the 1-step gradient update of MAML')

    # Optimization
    # parser.add_argument('--num-batches', type=int, default=50,
    parser.add_argument('--num-batches', type=int, default=32,
                        help='number of batches')
    # parser.add_argument('--meta-batch-size', type=int, default=50,
    parser.add_argument('--meta-batch-size', type=int, default=2,
                        help='number of tasks per batch')
    parser.add_argument('--max-kl', type=float, default=1e-2,
                        help='maximum value for the KL constraint in TRPO')
    parser.add_argument('--cg-iters', type=int, default=10,
                        help='number of iterations of conjugate gradient')
    parser.add_argument('--cg-damping', type=float, default=1e-5,
                        help='damping in conjugate gradient')
    # parser.add_argument('--ls-max-steps', type=int, default=2,
    parser.add_argument('--ls-max-steps', type=int, default=15,
                        help='maximum number of iterations for line search')
    parser.add_argument('--ls-backtrack-ratio', type=float, default=0.8,
                        help='backtracking ratio for line search')

    # Miscellaneous
    parser.add_argument('--output-folder', type=str, default='maml',
                        help='name of the output folder')
    # parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 2,
    parser.add_argument('--num-workers', type=int, default=4,
                        help='number of workers for trajectories sampling')
    parser.add_argument('--device', type=str, default='cuda',
                        help='set the device (cpu or cuda)')

    args = parser.parse_args()

    self.fast_batch_size = args.fast_batch_size
    self.max_kl = args.max_kl
    self.cg_iters = args.cg_iters
    self.first_order = args.first_order
    self.cg_damping = args.cg_damping
    self.ls_max_steps = args.ls_max_steps
    self.ls_backtrack_ratio = args.ls_backtrack_ratio
    self.output_folder = args.output_folder
    self.num_batches = args.num_batches

    continuous_actions = (args.env_name in [
        'BiddingMDP-v0', 'AntVel-v1', 'AntDir-v1', 'AntPos-v0',
        'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    # Create logs and saves folders if they don't exist
    if not os.path.exists('./logs'):
        os.makedirs('./logs')
    if not os.path.exists('./saves'):
        os.makedirs('./saves')

    # Device
    # args.device = torch.device(args.device
    #                            if torch.cuda.is_available() else 'cpu')
    args.device = torch.device("cpu")

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if policy is None and continuous_actions:
        print("CREATING POLICY WHEN IT SHOULD NOT")
        exit()
        # Unreachable fallback kept from the original script:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
        self.policy = policy
    elif policy is None:
        print("CREATING POLICY WHEN IT SHOULD NOT")
        exit()
        # Unreachable fallback kept from the original script:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    self.policy = policy

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        torch.cuda.empty_cache()

        # Save policy network
        final_model_path = final_model_folder + \
            "meta_rl_gamma_policy_{}.pt".format(batch)
        with open(final_model_path, 'wb') as f:
            torch.save(policy.state_dict(), f)

    self.metalearner = metalearner
    return final_model_path
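# Usage sketch (assumption: run_meta_training above belongs to some trainer class
# that also exposes .policy; the names below are illustrative only). The method
# checkpoints the policy once per batch and returns the path of the last
# checkpoint, which can be restored with load_state_dict:
def reload_last_checkpoint(trainer, final_model_folder='./saves/maml/'):
    last_checkpoint = trainer.run_meta_training(final_model_folder=final_model_folder)
    state_dict = torch.load(last_checkpoint, map_location='cpu')
    trainer.policy.load_state_dict(state_dict)
    return trainer.policy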
def main(args):
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(args.log_name))  # Tensorboard logging
    log = set_log(args)

    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           args=args)

    # NOTE Observation space is a list with [predator0, predator1, ..., prey]
    # Thus using the index of 0
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))

    meta_learner = MetaLearner(sampler, policy, baseline,
                               gamma=args.gamma, fast_lr=args.fast_lr,
                               tau=args.tau, device=args.device,
                               args=args, log=log, tb_writer=tb_writer)
    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")

    meta_tester = MetaTester(sampler, policy, baseline,
                             gamma=args.gamma, fast_lr=args.fast_lr,
                             tau=args.tau, device=args.device,
                             args=args, log=log, tb_writer=tb_writer)

    prey = Prey(env=sampler._env, args=args, log=log,
                tb_writer=tb_writer, name="prey", i_agent=0)

    # Meta-train starts
    iteration = 0
    while True:
        # Sample train and validation episodes
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size, test=False)
        episodes = meta_learner.sample(tasks, prey,
                                       first_order=args.first_order,
                                       iteration=iteration)

        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)

        # Test meta-policy
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration,
                                            prey=prey)

        if iteration % 100 == 0:
            meta_learner.save(iteration)

        iteration += 1
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    print(config)
    writer = SummaryWriter(logdir='./log')

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(
            num_tasks=config['meta-batch-size'])  # (meta-batch-size, K-arm)
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'])

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))
        # print(logs)

        writer.add_scalar('MAML/Loss Before', logs['loss_before'].mean(),
                          num_iterations)
        writer.add_scalar('MAML/KL Before', logs['kl_before'].mean(),
                          num_iterations)
        if 'loss_after' in logs:
            writer.add_scalar('MAML/Loss After', logs['loss_after'].mean(),
                              num_iterations)
        if 'kl_after' in logs:
            writer.add_scalar('MAML/KL After', logs['kl_after'].mean(),
                              num_iterations)
        writer.add_scalar('MAML/Train Returns', logs['train_returns'].sum(),
                          num_iterations)
        writer.add_scalar('MAML/Valid Returns', logs['valid_returns'].sum(),
                          num_iterations)
        writer.add_scalar(
            'MAML/Train Cumulative Regret',
            sum([task['mean'].max() for task in logs['tasks']])
            * config['fast-batch-size'] - logs['train_returns'].sum(),
            num_iterations)
        writer.add_scalar(
            'MAML/Valid Cumulative Regret',
            sum([task['mean'].max() for task in logs['tasks']])
            * config['fast-batch-size'] - logs['valid_returns'].sum(),
            num_iterations)

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)

    writer.close()
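# Helper sketch (assumed factoring of the regret expressions logged above, not part
# of the original script): each task dict is expected to carry a 'mean' array of
# per-arm expected rewards, and `returns` is the array/tensor of per-episode returns
# produced by get_returns.
def cumulative_regret(tasks, returns, batch_size):
    # Best achievable return per task, scaled by the number of episodes sampled,
    # minus the return actually obtained.
    best = sum(task['mean'].max() for task in tasks) * batch_size
    return best - returns.sum()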
def main(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1', 'AntPos-v0',
                                            'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
                                            '2DNavigation-v0'])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler,
                                      policy,
                                      baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device,
                                      verbose=args.verbose)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

        start = time.time()
        episodes, kls, param_diffs = metalearner.sample(tasks,
                                                        first_order=args.first_order,
                                                        cg_iters=args.cg_iters)
        sample_time = time.time() - start

        start = time.time()
        if args.optimizer == 'sgd':  # string equality, not identity
            metalearner.step_sgd(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes,
                                  max_kl=args.max_kl,
                                  cg_iters=args.cg_iters,
                                  cg_damping=args.cg_damping,
                                  ls_max_steps=args.ls_max_steps,
                                  ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]), batch)
        writer.add_scalar('kl-mean between meta update',
                          torch.mean(torch.stack(kls)), batch)
        writer.add_scalar('kl-std between meta update',
                          torch.std(torch.stack(kls)), batch)
        writer.add_scalar('Euclidean-distance-mean between meta update',
                          torch.mean(torch.stack(param_diffs)), batch)
        writer.add_scalar('Euclidean-distance-std between meta update',
                          torch.std(torch.stack(param_diffs)), batch)

        print("Batch {}. before_update: {}, after_update: {}\n"
              " sample time {}, update_time {}".format(
                  batch,
                  total_rewards([ep.rewards for ep, _ in episodes]),
                  total_rewards([ep.rewards for _, ep in episodes]),
                  sample_time, update_time))
        print("Batch {}. kl-divergence between meta update: {}, kl std: {}".format(
            batch, torch.mean(torch.stack(kls)), torch.std(torch.stack(kls))))
        print("Batch {}. Euclidean-distance-mean meta update: {}, "
              "Euclidean-distance-std: {}".format(
                  batch,
                  torch.mean(torch.stack(param_diffs)),
                  torch.std(torch.stack(param_diffs))))

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args, prior_policy=None, init_from_prior=True):
    # *******************************************************************
    # Config and log filenames
    # 'r': read; 'w': write
    # *******************************************************************
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        # Create the output folder if it does not exist
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        # Folder path and file names
        policy_filename = os.path.join(args.output_folder, 'policy_2d_PAC_Bayes.th')
        config_filename = os.path.join(args.output_folder, 'config_2d_PAC_Bayes.json')

        # with open(config_filename, 'w') as f:
        #     config.update(vars(args))
        #     json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])  # to be tested
    env.seed(args.seed)
    env.close()

    """
    ************************************************************
    Additional parameters: sizes of the environment's action and
    observation spaces, used to build the stochastic (Bayesian) network.
    output_size = reduce(mul, env.action_space.shape, 1)
    input_size = reduce(mul, env.observation_space.shape, 1)
    ************************************************************
    """
    observation_space = env.observation_space
    action_space = env.action_space
    args.output_size = reduce(mul, env.action_space.shape, 1)
    args.input_size = reduce(mul, env.observation_space.shape, 1)

    """
    ************************************************************
    New model: stochastic network.
    device = ('cuda' if (torch.cuda.is_available() and args.use_cuda) else 'cpu')
    log_var_init = {'mean': -10, 'std': 0.1}
    ************************************************************
    """
    if prior_policy and init_from_prior:
        # Init from the prior model. deepcopy creates an independent copy,
        # whereas a shallow copy would still track changes to the original object.
        prior_policy = deepcopy(prior_policy).to(args.device)
    else:
        # Otherwise create a fresh model.
        prior_policy = get_policy_for_env(args.device,
                                          args.log_var_init,
                                          env,
                                          hidden_sizes=config['hidden-sizes'])
    # The parameters can be used without copying.
    # prior_policy.share_memory()

    """
    ************************************************************
    Prior and posterior policies and their parameters:
    prior_policy, posteriors_policies, prior_params, all_post_param, all_params
    ************************************************************
    """
    num_tasks = config['meta-batch-size']
    batch_size = config['fast-batch-size']

    # Unpack parameters: optimizer, optimizer arguments, learning-rate schedule
    optim_func, optim_args, lr_schedule = \
        args.optim_func, args.optim_args, args.lr_schedule

    posteriors_policies = [
        get_policy_for_env(args.device,
                           args.log_var_init,
                           env,
                           hidden_sizes=config['hidden-sizes'])
        for _ in range(num_tasks)
    ]
    all_post_param = sum([
        list(posterior_policy.parameters())
        for posterior_policy in posteriors_policies
    ], [])

    # Create one optimizer for all parameters (posteriors + prior)
    prior_params = list(prior_policy.parameters())
    all_params = all_post_param + prior_params
    all_optimizer = optim_func(all_params, **optim_args)

    """Generate a fixed set of tasks. Seeding is not fully solved yet,
    so reproducibility is imperfect."""
    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sample 'meta-batch-size' tasks.
    tasks = env.unwrapped.sample_tasks(num_tasks)
    # meta-batch-size: number of tasks in each batch of tasks.
    # With the PAC-Bayes approach the task set (type and number) is fixed:
    # in the 2D navigation task the goals stay fixed and only the sampled
    # trajectories differ between training iterations.
    # tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])

    avg_empiric_loss_per_task = torch.zeros(num_tasks, device=args.device)
    avg_reward_per_task = torch.zeros(num_tasks, device=args.device)
    complexity_per_task = torch.zeros(num_tasks, device=args.device)
    # Accounts for tasks having different numbers of training samples.
    n_samples_per_task = torch.zeros(num_tasks, device=args.device)

    Info_avg_reward = []
    Info_total_objective = []
    Info_last_reward = []
    Info_train_trajectories = []

    # Train for 'num-batches' batches.
    for batch in range(config['num-batches']):
        print(batch)
        # params_show_train = prior_policy.state_dict()

        # Hyper-prior term: divergence between the hyper-prior and the hyper-posterior.
        hyper_dvrg = get_hyper_divergnce(kappa_prior=args.kappa_prior,
                                         kappa_post=args.kappa_post,
                                         divergence_type=args.divergence_type,
                                         device=args.device,
                                         prior_model=prior_policy)
        # Meta-complexity term derived from hyper_dvrg
        # (arguments could also be passed positionally).
        meta_complex_term = get_meta_complexity_term(
            hyper_kl=hyper_dvrg,
            delta=args.delta,
            complexity_type=args.complexity_type,
            n_train_tasks=num_tasks)

        for i_task in range(num_tasks):
            sampler = SampleTest(config['env-name'],
                                 env_kwargs=config['env-kwargs'],
                                 batch_size=batch_size,
                                 observation_space=observation_space,
                                 action_space=action_space,
                                 policy=posteriors_policies[i_task],
                                 baseline=baseline,
                                 seed=args.seed,
                                 prior_policy=prior_policy,
                                 task=tasks[i_task])

            # Calculate the empirical error for each task.
            loss_per_task, avg_reward, last_reward, train_episodes = sampler.sample()

            complexity = get_task_complexity(
                delta=args.delta,
                complexity_type=args.complexity_type,
                device=args.device,
                divergence_type=args.divergence_type,
                kappa_post=args.kappa_post,
                prior_model=prior_policy,
                post_model=posteriors_policies[i_task],
                n_samples=batch_size,
                avg_empiric_loss=loss_per_task,
                hyper_dvrg=hyper_dvrg,
                n_train_tasks=num_tasks,
                noised_prior=True)

            avg_empiric_loss_per_task[i_task] = loss_per_task
            avg_reward_per_task[i_task] = avg_reward
            complexity_per_task[i_task] = complexity
            n_samples_per_task[i_task] = batch_size

        # Approximated total objective:
        if args.complexity_type == 'Variational_Bayes':
            # Note that avg_empiric_loss_per_task is estimated by an average over
            # batch samples, but its weight in the objective should reflect the
            # total number of samples in the task.
            total_objective = \
                (avg_empiric_loss_per_task * n_samples_per_task
                 + complexity_per_task).mean() * num_tasks \
                + meta_complex_term
            # total_objective = (avg_empiric_loss_per_task * n_samples_per_task
            #                    + complexity_per_task).mean() + meta_complex_term
        else:
            total_objective = \
                avg_empiric_loss_per_task.mean() + complexity_per_task.mean() \
                + meta_complex_term

        # Take a gradient step on the shared prior and all tasks' posteriors:
        grad_step(total_objective, all_optimizer, lr_schedule, args.lr)

        Info_avg_reward.append(avg_reward_per_task.mean())
        Info_total_objective.append(total_objective)
        Info_last_reward.append(last_reward)

    # *******************************************************************
    # Save policy
    # *******************************************************************
    # Save the prior policy's parameters to policy_filename (.th file).
    if args.output_folder is not None:
        with open(policy_filename, 'wb') as f:
            torch.save(prior_policy.state_dict(), f)

    # *******************************************************************
    # Test
    # learned policy   : prior_policy
    # saved parameters : 'policy_2d_PAC_Bayes.th'
    # *******************************************************************
    env_name = config['env-name']
    env_kwargs = config['env-kwargs']
    test_num = 10
    Info_test_loss = []
    Info_test_avg_reward = []
    Info_test_last_reward = []

    for test_batch in range(test_num):
        # Sample a new task, train on it, and evaluate the validation error.
        test_task = env.unwrapped.sample_tasks(1)
        post_policy = get_policy_for_env(args.device,
                                         args.log_var_init,
                                         env,
                                         hidden_sizes=config['hidden-sizes'])
        post_policy.load_state_dict(prior_policy.state_dict())
        # Based on prior_policy, train post_policy; then test the learned post_policy.
        test_loss_per_task, test_avg_reward, test_last_reward = run_test(
            task=test_task,
            prior_policy=prior_policy,
            post_policy=post_policy,
            baseline=baseline,
            args=args,
            env_name=env_name,
            env_kwargs=env_kwargs,
            batch_size=batch_size,
            observation_space=observation_space,
            action_space=action_space,
            n_train_tasks=num_tasks)

        Info_test_loss.append(test_loss_per_task)
        Info_test_avg_reward.append(test_avg_reward)
        Info_test_last_reward.append(test_last_reward)
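# Summary sketch (assumed helper, not part of the original script): aggregate the
# per-test statistics that main() collects in its Info_test_* lists above.
def summarize_test_results(test_losses, test_rewards):
    # float() handles both plain floats and 0-dim tensors.
    losses = [float(l) for l in test_losses]
    rewards = [float(r) for r in test_rewards]
    print('mean test loss   : {:.4f}'.format(sum(losses) / len(losses)))
    print('mean test reward : {:.4f}'.format(sum(rewards) / len(rewards)))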
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(args.log_dir, 'params.json'), 'w'),
              indent=2)

    sampler = BatchSamplerMultiworld(args)
    sampler_val = BatchSamplerMultiworld(args, val=True)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers,
            bias_transformation_size=args.bias_transformation_size,
            init_gain=args.init_gain,
        )
    else:
        raise NotImplementedError

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              entropy_coef=args.entropy_coef,
                              device=args.device)

    start_time = time.time()
    processes = []
    for batch in range(args.num_batches):
        metalearner.reset()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        if sampler.rewarder.fit_counter > 0:
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
        if batch % args.rewarder_fit_period == 0:
            sampler.fit_rewarder(logger)
        if args.rewarder == 'unsupervised':
            sampler.log_unsupervised(logger)

        log_main(logger, episodes, batch, args, start_time, metalearner)

        if batch % args.save_period == 0 or batch == args.num_batches - 1:
            save_model_maml(args, policy, batch)

        if batch % args.val_period == 0 or batch == args.num_batches - 1:
            val(args, sampler_val, policy, baseline, batch)

        if batch % args.vis_period == 0 or batch == args.num_batches - 1:
            if args.plot:
                p = Popen('python maml_rl/utils/visualize.py --log-dir {}'.format(
                    args.log_dir), shell=True)
                processes.append(p)

        logger.dumpkvs()
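# Cleanup sketch (assumed follow-up, not part of the original script): the
# visualization subprocesses launched with Popen above are collected in
# `processes` but never joined; a caller could wait for them once training ends.
def wait_for_plots(processes):
    for p in processes:
        p.wait()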
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()
    print(policy)

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers,
                               args=args)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'],
                                args=args)

        train_episodes, valid_episodes = sampler.sample_wait(futures)
        num_iterations += sum(
            sum(episode.lengths) for episode in train_episodes[0])
        num_iterations += sum(
            sum(episode.lengths) for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
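# Reload sketch (assumed follow-up, not part of the original script): restore the
# weights saved to policy_filename above, using the same helpers as in main().
def load_trained_policy(config, policy_filename, device='cpu'):
    env = gym.make(config['env-name'], **config['env-kwargs'])
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(policy_filename, 'rb') as f:
        policy.load_state_dict(torch.load(f, map_location=device))
    env.close()
    return policy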