def main(args):
    """Meta-train a MAML policy with TRPO-style outer steps.

    Logs pre/post-adaptation returns to TensorBoard and saves the policy
    state dict after every meta-batch.
    """
    # NOTE(review): hard-coded to True — sibling variants of this script
    # derive this flag from args.env_name; confirm discrete-action envs
    # are never passed here.
    continuous_actions = True
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration; torch.device is not JSON-serializable,
    # so it is replaced by its type string.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        # Gaussian policy: one output per action dimension.
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        # Categorical policy: one logit per discrete action.
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)
    for batch in range(args.num_batches):
        print("========== BATCH NUMBER {0} ==========".format(batch))
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # Inner-loop adaptation: yields (before-update, after-update)
        # episode pairs per task.
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        # Outer-loop meta-update (conjugate-gradient / line-search TRPO).
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        # Save policy network
        # NOTE(review): filenames are offset by 256 (batch 0 is saved as
        # policy-256.pt) — presumably to continue numbering from a previous
        # run; confirm this offset is intentional.
        with open(
                os.path.join(save_folder,
                             'policy-{0}.pt'.format(batch + 256)),
                'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Evaluate a saved MAML policy: adapt it to fixed tasks and dump
    per-task returns (before/after adaptation) as .npz files."""
    continuous_actions = (args.env_name in [
        'AntVelEnv-v1', 'AntDirEnv-v1', 'HalfCheetahVelEnv-v1',
        'HalfCheetahDirEnv-v1', '2DNavigation-v0'
    ])
    save_folder = os.path.join('tmp', args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    # Load model
    with open(args.model, 'rb') as f:
        state_dict = torch.load(f)
        policy.load_state_dict(state_dict)
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)
    # NOTE(review): meta_batch_size is overridden here, and the velocity
    # task grid is commented out in favor of two direction tasks — this
    # script is specialized for *Dir* environments; confirm before reuse.
    args.meta_batch_size = 81
    # velocities = np.linspace(-1., 3., num=args.meta_batch_size)
    # tasks = [{'velocity': velocity} for velocity in velocities]
    tasks = [{'direction': direction} for direction in [-1, 1]]
    for batch in range(args.num_batches):
        episodes = metalearner.sample(tasks)
        # Per-task episode returns: 'train' = before adaptation,
        # 'valid' = after adaptation.
        train_returns = [ep.rewards.sum(0).cpu().numpy()
                         for ep, _ in episodes]
        valid_returns = [ep.rewards.sum(0).cpu().numpy()
                         for _, ep in episodes]
        with open(os.path.join(save_folder, '{0}.npz'.format(batch)),
                  'wb') as f:
            np.savez(f, train=train_returns, valid=valid_returns)
        print('Batch {0}'.format(batch))
def clone_policy(policy, params=None, with_names=False):
    """Return a fresh policy of the same class as ``policy`` with the
    given parameters loaded into it.

    If ``params`` is omitted, the trainable variables of ``policy``
    itself are copied. Only ``CategoricalMLPPolicy`` and
    ``NormalMLPPolicy`` are supported.
    """
    if params is None:
        params = policy.get_trainable_variables()

    # Pick the concrete class to instantiate; both share the same
    # constructor signature.
    if isinstance(policy, CategoricalMLPPolicy):
        policy_cls = CategoricalMLPPolicy
    elif isinstance(policy, NormalMLPPolicy):
        policy_cls = NormalMLPPolicy
    else:
        raise NotImplementedError('Only `Categorical` and `Normal` '
                                  'policies are valid policies at the moment.')

    cloned_policy = policy_cls(input_size=policy.input_size,
                               output_size=policy.output_size,
                               hidden_sizes=policy.hidden_sizes,
                               nonlinearity=policy.nonlinearity)

    # Load the parameters into the fresh copy, by name if requested.
    if with_names:
        cloned_policy.set_params_with_name(params)
    else:
        cloned_policy.set_params(params)
    return cloned_policy
def main(args):
    """Meta-train a MAML policy; prints per-batch pre/post-update returns.

    TensorBoard logging and checkpointing are present but commented out in
    this variant — only the config.json and stdout output are produced.
    """
    args.output_folder = args.env_name  # TODO
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
                                            'AntPos-v0', 'HalfCheetahVel-v1',
                                            'HalfCheetahDir-v1',
                                            '2DNavigation-v0'])
    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        # config = {k: v for (k, v) in vars(args).iteritems() if k != 'device'}
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
        print(config)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),  # input shape
            int(np.prod(sampler.envs.action_space.shape)),  # output shape
            hidden_sizes=(args.hidden_size,) * args.num_layers)  # [100, 100]
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)
    for batch in range(args.num_batches):  # number of epoches
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # (before-update, after-update) episode pairs per task.
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        #
        # Save policy network
        # with open(os.path.join(save_folder,
        #         'policy-{0}.pt'.format(batch)), 'wb') as f:
        #     torch.save(policy.state_dict(), f)
        print(batch, total_rewards([ep.rewards for ep, _ in episodes]),
              total_rewards([ep.rewards for _, ep in episodes]))
def get_policy_for_env(env, hidden_sizes=(100, 100), nonlinearity='relu'):
    """Build an MLP policy matching the action space of ``env``.

    Continuous (Box) action spaces get a ``NormalMLPPolicy``; discrete
    spaces get a ``CategoricalMLPPolicy``.
    """
    input_size = get_input_size(env)
    # Resolve the activation by name on the torch module, e.g.
    # 'relu' -> torch.relu.
    activation = getattr(torch, nonlinearity)

    if isinstance(env.action_space, gym.spaces.Box):
        # One output per action dimension.
        action_dim = reduce(mul, env.action_space.shape, 1)
        return NormalMLPPolicy(input_size,
                               action_dim,
                               hidden_sizes=tuple(hidden_sizes),
                               nonlinearity=activation)

    # One logit per discrete action.
    return CategoricalMLPPolicy(input_size,
                                env.action_space.n,
                                hidden_sizes=tuple(hidden_sizes),
                                nonlinearity=activation)
def get_policy_for_env(env, hidden_sizes=(100, 100), nonlinearity='relu'):
    """Construct a policy network suited to ``env``'s action space."""
    # Continuous control (Box action space) vs. discrete control.
    continuous_actions = isinstance(env.action_space, gym.spaces.Box)
    # Input size is the flattened observation-space size.
    input_size = get_input_size(env)
    # Look up the activation function on the torch module by name.
    nonlinearity = getattr(torch, nonlinearity)

    if continuous_actions:
        # Output size is the product of the action-space dimensions.
        output_size = reduce(mul, env.action_space.shape, 1)
        policy_cls = NormalMLPPolicy
    else:
        output_size = env.action_space.n
        policy_cls = CategoricalMLPPolicy

    return policy_cls(input_size,
                      output_size,
                      hidden_sizes=tuple(hidden_sizes),
                      nonlinearity=nonlinearity)
def train_meta_learning_model(args):
    """Meta-train a MAML policy with resume support, then run a final
    no-update test pass and pickle the reward curves.

    Side effects: creates ./saves/<output_folder>_metalearned with
    config.json, per-batch policy checkpoints, reward_data.pkl and
    final_reward.pkl; logs a metric through the (externally created)
    `experiment` object.
    """
    # import matplotlib.pyplot as plt
    # import matplotlib.animation as animation
    # from matplotlib import style
    # style.use('fivethirtyeight')
    # fig = plt.figure()
    # ax1 = fig.add_subplot(1,1,1)
    # xs = []
    # ys = []
    # def animate(i):
    #     ax1.clear()
    #     ax1.plot(xs, ys)
    rewards_before_ml = []
    rewards_after_ml = []
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])
    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_metalearned'))
    save_folder = './saves/{0}'.format(args.output_folder + '_metalearned')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (device stored as its type string).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    torch.manual_seed(args.random_seed)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    #load pretrained model
    # Resume from the checkpoint preceding args.start_from_batch, if it
    # exists; otherwise start from batch 0.
    cont_from_batch = 0
    if args.start_from_batch != -1:
        metalearned_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(metalearned_model):
            policy.load_state_dict(torch.load(metalearned_model))
            cont_from_batch = args.start_from_batch
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)
    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))
        # Wall-clock timing of the three phases of a meta-batch.
        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     sampling_type=args.sampling_type,
                                     points_per_dim=args.points_per_dim)
        task_sampling_time = time.time() - task_sampling_time
        episode_generating_time = time.time()
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time
        learning_step_time = time.time()
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        learning_step_time = time.time() - learning_step_time
        print('Tasking Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))
        reward_before_ml = total_rewards([ep.rewards for ep, _ in episodes],
                                         args.gamma)
        reward_after_ml = total_rewards([ep.rewards for _, ep in episodes],
                                        args.gamma)
        print('Before Update: {} After Update: {}'.format(
            reward_before_ml, reward_after_ml))
        # experiment.log_metric("Avg Reward Before Update (MetaLearned)", reward_before_ml)
        # NOTE(review): `experiment` is not defined in this function —
        # presumably a module-level comet/wandb-style object; verify.
        experiment.log_metric("Avg Reward", reward_after_ml, batch + 1)
        rewards_before_ml.append(reward_before_ml)
        rewards_after_ml.append(reward_after_ml)
        # xs.append(batch+1)
        # ys.append(total_rewards([ep.rewards for _, ep in episodes], args.gamma))
        # ani = animation.FuncAnimation(fig, animate, interval=1000)
        # plt.savefig('navg_baseline_monitor')
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)
    # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    # episodes = metalearner.sample(tasks, first_order=args.first_order)
    # print("Avg Reward After Update (MetaLearned)", total_rewards([ep.rewards for _, ep in episodes], args.gamma))
    # Final evaluation on freshly sampled random tasks, without any
    # gradient update (no_update=True).
    testing_sampler = BatchSampler(args.env_name,
                                   batch_size=args.testing_fbs,
                                   num_workers=args.num_workers)
    testing_metalearner = MetaLearner(testing_sampler, metalearner.policy,
                                      baseline, gamma=args.gamma,
                                      fast_lr=args.fast_lr, tau=args.tau,
                                      device=args.device)
    test_tasks = testing_sampler.sample_tasks(num_tasks=args.testing_mbs,
                                              sampling_type='rand',
                                              points_per_dim=-1)
    test_episodes = testing_metalearner.sample(test_tasks,
                                               first_order=args.first_order,
                                               no_update=True)
    test_reward = total_rewards([ep.rewards for ep in test_episodes],
                                args.gamma)
    print('-------------------------------------------------')
    print('Test Time reward is: ' + str(test_reward))
    print('-------------------------------------------------')
    # Persist the reward curves (two sequential pickle records in one file)
    # and the final test reward.
    pickle_reward_data_file = os.path.join(save_folder, 'reward_data.pkl')
    with open(pickle_reward_data_file, 'wb') as f:
        pickle.dump(rewards_before_ml, f)
        pickle.dump(rewards_after_ml, f)
    pickle_final_reward_file = os.path.join(save_folder, 'final_reward.pkl')
    with open(pickle_final_reward_file, 'wb') as f:
        pickle.dump(test_reward, f)
    return
def main(args):
    """Meta-train a MAML policy with a configurable baseline (linear or
    separate critic) and optional PPO-style inner updates.

    Logs pre/post-adaptation returns (and, when available, the PPO
    minibatch gradient norm) to TensorBoard, and checkpoints the policy
    after every meta-batch.
    """
    logging.basicConfig(filename=args.debug_file, level=logging.WARNING,
                        filemode='w')
    logging.getLogger('metalearner').setLevel(logging.INFO)
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (device stored as its type string,
    # since torch.device is not JSON-serializable).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    #if args.baseline == 'critic shared':
    #    policy = NormalMLPPolicyA2C(int(np.prod(sampler.envs.observation_space.shape)),
    #        int(np.prod(sampler.envs.action_space.shape)),
    #        hidden_sizes=(args.hidden_size,) * args.num_layers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    # Baseline selection: a closed-form linear feature baseline, or a
    # separate learned critic network.
    if args.baseline == 'linear':
        baseline = LinearFeatureBaseline(
            int(np.prod(sampler.envs.observation_space.shape)))
    elif args.baseline == 'critic separate':
        baseline = CriticFunction(
            int(np.prod(sampler.envs.observation_space.shape)), 1,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    #elif args.baseline == 'critic shared':
    # RANJANI TO DO
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device,
                              baseline_type=args.baseline,
                              cliprange=args.cliprange,
                              noptepochs=args.noptepochs,
                              usePPO=args.usePPO,
                              nminibatches=args.nminibatches,
                              ppo_lr=args.ppo_lr, useSGD=args.useSGD,
                              ppo_momentum=args.ppo_momentum,
                              grad_clip=args.grad_clip)
    for batch in range(args.num_batches):
        print("*********************** Batch: " + str(batch) +
              " ****************************")
        print("Creating tasks...")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        print("Creating episodes...")
        episodes, grad_norm = metalearner.sample(tasks,
                                                 first_order=args.first_order)
        print("Taking a meta step...")
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        print("Writing results to tensorboard...")
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        if grad_norm:
            # Compute the average once instead of twice per batch.
            avg_grad_norm = np.average(grad_norm)
            # Fix: pass `batch` as global_step, consistent with the
            # reward scalars above (previously omitted, so all points
            # landed without a step index).
            writer.add_scalar('PPO mb grad norm', avg_grad_norm, batch)
            print(avg_grad_norm)
        print("Saving policy network...")
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        print("***************************************************")
def main(args):
    """Load a trained navigation policy and evaluate gradient-step
    adaptation (`metalearner.test`) on a single sampled task.

    Prints the total reward after each number of adaptation steps.
    """
    env_name = 'RVONavigationAll-v0'  #['2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0']
    test_folder = './{0}'.format('test_nav')
    fast_batch_size = 40  # number of trajectories
    saved_policy_file = os.path.join(
        './TrainingResults/result3/saves/{0}'.format('maml-2DNavigation-dir'),
        'policy-180.pt')
    sampler = BatchSampler(env_name, batch_size=fast_batch_size,
                           num_workers=3)
    policy = NormalMLPPolicy(int(np.prod(
        sampler.envs.observation_space.shape)),
                             int(np.prod(sampler.envs.action_space.shape)),
                             hidden_sizes=(100, ) * 2)
    # Loading policy
    # map_location forces CPU loading regardless of where the checkpoint
    # was saved.
    if os.path.isfile(saved_policy_file):
        policy_info = torch.load(saved_policy_file,
                                 map_location=lambda storage, loc: storage)
        policy.load_state_dict(policy_info)
        print('Loaded saved policy')
    else:
        sys.exit("The requested policy does not exist for loading")
    # Creating test folder
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)
    # Generate tasks
    # goal = [[-0.8, 0.9]]
    # task = [{'goal': goal}][0]
    tasks = sampler.sample_tasks(num_tasks=1)
    task = tasks[0]
    # Start validation
    print("Starting to test...Total step = ", args.grad_steps)
    start_time = time.time()
    # baseline = LinearFeatureBaseline(int(np.prod(sampler.envs.observation_space.shape)))
    # NOTE(review): baseline input size is hard-coded to 2 (the commented
    # line above derives it from the observation space) — presumably the
    # 2-D goal/position features; confirm.
    baseline = LinearFeatureBaseline(int(np.prod((2, ))))
    metalearner = MetaLearner(sampler, policy, baseline, gamma=0.9,
                              fast_lr=0.01, tau=0.99, device='cpu')
    # test_episodes = metalearner.sample(tasks)
    # for train, valid in test_episodes:
    #     total_reward, dist_reward, col_reward = total_rewards(train.rewards)
    #     print(total_reward)
    #     total_reward, dist_reward, col_reward = total_rewards(valid.rewards)
    #     print(total_reward)
    # Evaluate after 0..n_grad adaptation gradient steps.
    test_episodes = metalearner.test(task, n_grad=args.grad_steps)
    print('-------------------')
    for n_grad, ep in test_episodes:
        total_reward, dist_reward, col_reward = total_rewards(ep.rewards)
        print(total_reward)
        # with open(os.path.join(test_folder, 'test_episodes_grad'+str(n_grad)+'.pkl'), 'wb') as f:
        #     pickle.dump([ep.observations.cpu().numpy(), ep], f)
    # with open(os.path.join(test_folder, 'task.pkl'), 'wb') as f:
    #     pickle.dump(task, f)
    print('Finished test. Time elapsed = {}'.format(
        time_elapsed(time.time() - start_time)))
def main(args):
    """Meta-train a MAML policy with optional Q-function-based inner
    updates; logs reward and adaptation diagnostics to TensorBoard."""
    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.manual_seed(args.seed)
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    # The string flags are compared against 'true' — these CLI args are
    # plain strings, not booleans.
    # NOTE(review): 'q_residuce_gradient' looks like a typo for
    # 'q_residue_gradient', but it is MetaLearner's keyword — fix there.
    metalearner = MetaLearner(
        sampler,
        policy,
        baseline,
        gamma=args.gamma,
        fast_lr=args.fast_lr,
        tau=args.tau,
        q_inner=args.inner_q == 'true',
        q_residuce_gradient=args.inner_q_residue_gradient == 'true',
        q_soft=args.inner_q_soft == 'true',
        q_soft_temp=args.inner_q_soft_temp,
        device=args.device,
    )
    for batch in range(args.num_batches):
        # Release cached GPU memory between meta-batches.
        if args.device.type == 'cuda':
            torch.cuda.empty_cache()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes, adaptation_info = metalearner.sample(
            tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Tensorboard
        pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes])
        post_update_rewards = total_rewards(
            [ep.rewards for _, ep in episodes])
        writer.add_scalar('total_rewards/before_update', pre_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/after_update', post_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/rewards_improvement',
                          post_update_rewards - pre_update_rewards, batch)
        # Inner-loop adaptation diagnostics reported by the metalearner.
        writer.add_scalar('adaptation/pre_update_inner_loss',
                          adaptation_info.mean_pre_update_loss, batch)
        writer.add_scalar('adaptation/post_update_inner_loss',
                          adaptation_info.mean_post_update_loss, batch)
        writer.add_scalar('adaptation/inner_loss_improvement',
                          adaptation_info.mean_loss_improvment, batch)
        writer.add_scalar('adaptation/weight_change',
                          adaptation_info.mean_weight_change, batch)
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Meta-train a MAML policy (TensorFlow 2 variant).

    Uses a conjugate-gradient optimizer for the outer step, logs returns
    with tf.summary, and saves policy/baseline weights every
    ``args.save_iters`` batches.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    writer = tf.summary.create_file_writer('./logs/{0}'.format(
        args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # In this TF variant args.device is already a plain value (no .type).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # Create policy for the given task
    with tf.name_scope('policy') as scope:
        if continuous_actions:
            policy = NormalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                int(np.prod(sampler.envs.action_space.shape)),
                hidden_sizes=(args.hidden_size, ) * args.num_layers,
                name=scope)
        else:
            policy = CategoricalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                sampler.envs.action_space.n,
                hidden_sizes=(args.hidden_size, ) * args.num_layers,
                name=scope)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    optimizer = ConjugateGradientOptimizer(args.cg_damping, args.cg_iters,
                                           args.ls_backtrack_ratio,
                                           args.ls_max_steps, args.max_kl,
                                           policy)
    metalearner = MetaLearner(sampler, policy, baseline,
                              optimizer=optimizer, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau)
    optimizer.setup(metalearner)
    for batch in range(args.num_batches):
        print(f"----------Batch number {batch+1}----------")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes)
        # Scalars are written inside the writer's default context.
        with writer.as_default():
            return_before = total_rewards([ep.rewards for ep, _ in episodes])
            return_after = total_rewards([ep.rewards for _, ep in episodes])
            tf.summary.scalar('total_rewards/before_update', return_before,
                              batch)
            tf.summary.scalar('total_rewards/after_update', return_after,
                              batch)
            print(
                f"{batch+1}:: \t Before: {return_before} \t After: {return_after}"
            )
            writer.flush()
        # Periodic checkpointing of policy and baseline weights.
        if (batch + 1) % args.save_iters == 0:
            # Save policy network
            policy.save_weights(save_folder + f"/policy-{batch+1}",
                                overwrite=True)
            baseline.save_weights(save_folder + f"/baseline-{batch + 1}",
                                  overwrite=True)
            print(f"Policy saved at iteration {batch+1}")
def main(args):
    """Meta-train a MAML policy whose tasks are proposed by a teacher
    (curriculum) and whose networks are conditioned on a TreeLSTM task
    embedding.

    The teacher is updated with the post-adaptation return of every task;
    policy and tree are checkpointed each batch.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # Task-parameter bounds for the teacher, per environment.
    # NOTE(review): only these two envs define param_bounds — any other
    # env_name raises NameError below; confirm that is intended.
    if args.env_name == 'AntVel-v1':
        param_bounds = {"goal": [0, 3]}
    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}
    teacher = TeacherController(args.teacher, args.nb_test_episodes,
                                param_bounds, seed=args.seed,
                                teacher_params={})
    tree = TreeLSTM(args.tree_hidden_layer, len(param_bounds.keys()),
                    args.cluster_0, args.cluster_1, device=args.device)
    # Observations are augmented with the tree embedding, hence the
    # +args.tree_hidden_layer on every input size.
    if continuous_actions:
        policy = NormalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                 int(np.prod(
                                     sampler.envs.action_space.shape)),
                                 hidden_sizes=(args.hidden_size, ) *
                                 args.num_layers,
                                 tree=tree)
    else:
        policy = CategoricalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                      sampler.envs.action_space.n,
                                      hidden_sizes=(args.hidden_size, ) *
                                      args.num_layers,
                                      tree=tree)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)) +
        args.tree_hidden_layer)
    metalearner = MetaLearner(sampler, policy, baseline, tree=tree,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)
    all_tasks = []
    for batch in range(args.num_batches):
        print("starting iteration {}".format(batch))
        # Ask the teacher for a batch of task parameters instead of
        # sampling uniformly.
        tasks = []
        for _ in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                tasks.append(
                    {"position": teacher.task_generator.sample_task()})
            if args.env_name == 'AntVel-v1':
                tasks.append(
                    {"velocity": teacher.task_generator.sample_task()[0]})
        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # Rewrite the full task history each batch.
        with open('./logs/{0}/task_list.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        # Per-task mean post-adaptation return, fed back to the teacher.
        tr = [ep.rewards for _, ep in episodes]
        tr = [torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr]
        print("rewards:", tr)
        for t in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                teacher.task_generator.update(tasks[t]["position"], tr[t])
            if args.env_name == 'AntVel-v1':
                teacher.task_generator.update(np.array([tasks[t]["velocity"]]),
                                              tr[t])
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        # Save tree
        torch.save(tree, os.path.join(save_folder,
                                      'tree-{0}.pt'.format(batch)))
def main(args):
    """Meta-train a MAML policy on a multi-task (metaworld-style) benchmark
    with optional active task sampling, logging to wandb.

    Maintains a probability distribution over tasks (`task2prob`), updated
    from per-task success rates when ``args.active_learning`` is set, and
    periodically evaluates on held-out meta-test tasks.
    """
    save_folder = f'saves/{args.output_folder + get_date_str()}'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    print('Initializing samplers...')
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    test_sampler = BatchSampler(args.env_name,
                                test_env=True,
                                batch_size=args.fast_batch_size,
                                num_workers=max(1, args.num_workers // 2))
    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    print('Initializing meta-learners...')
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)  # noqa: E128
    # NOTE: we need this metalearner only to sample test tasks
    test_metalearner = MetaLearner(test_sampler, policy, baseline,
                                   gamma=args.gamma, fast_lr=args.fast_lr,
                                   tau=args.tau,
                                   device=args.device)  # noqa: E128
    print('Starting the training')
    # Initialize logging
    wandb.init()
    wandb.config.update(args)
    task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)}
    task_id2name = sampler._env._task_names
    # Start from a uniform distribution over tasks.
    task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks
    uniform = np.ones_like(task2prob) / sampler._env.num_tasks
    # outer loop (meta-training)
    for i in range(args.num_batches):
        print(f'Batch {i}')
        # sample trajectories from random tasks
        print(f'\tSampling a batch of {args.meta_batch_size} training tasks')
        # Mix in 1% uniform so no task's probability collapses to zero.
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     task2prob=0.99 * task2prob +
                                     0.01 * uniform)
        # Note: Dirty hack to overcome metaworld dirty hack
        task_names = [sampler._env._task_names[t['task']] for t in tasks]
        # inner loop (adaptation)
        # returns list of tuples (train_episodes, valid_episodes)
        print(f'\tTraining')
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        print(f'\tUpdating the meta-model')
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Logging
        # before: before parameters update
        # after: after parameters adaptation to the task
        r_before = total_rewards([ep.rewards for ep, _ in episodes])
        r_after = total_rewards([ep.rewards for _, ep in episodes])
        test_episode_infos = [ep._info_list for ep, _ in episodes]
        success_rate_before, task_success_rate_before = get_success_rate(
            test_episode_infos, task_names, per_task=True)
        test_episode_infos = [ep._info_list for _, ep in episodes]
        success_rate_after, task_success_rate_after = get_success_rate(
            test_episode_infos, task_names, per_task=True)
        wandb.log(
            {
                'total_rewards/before_update': r_before,
                'total_rewards/after_update': r_after,
                'success_rate/before_update': success_rate_before,
                'success_rate/after_update': success_rate_after,
                'success_rate/improvement':
                success_rate_after - success_rate_before,
                'success_rate/before_update_macro':
                np.mean(list(task_success_rate_before.values())),
                'success_rate/after_update_macro':
                np.mean(list(task_success_rate_after.values())),
            },
            step=i)
        wandb.log(
            {
                f'success_rate/after_update/{task}': rate
                for task, rate in task_success_rate_after.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/before_update/{task}': rate
                for task, rate in task_success_rate_before.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/imrovement/{task}':
                task_success_rate_after[task] -
                task_success_rate_before[task]
                for task in task_success_rate_before.keys()
            },
            step=i)
        # Number of tasks whose success rate exceeds each threshold.
        wandb.log(
            {
                f'n_acquired_tasks/before_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_before.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/after_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_after.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        if args.active_learning:
            # Re-weight the task distribution so harder tasks (lower
            # post-adaptation success) are sampled more often.
            new_task2prob = np.zeros_like(task2prob)
            if args.prob_f == 'linear':
                norm = 1e-7 + sum(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = 1. - rate / norm
            elif args.prob_f == 'softmax':
                # softmax(1 - rate)
                # numerical stability trick
                # http://cs231n.github.io/linear-classify/#softmax
                max_f = 1 - min(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    f = 1 - rate
                    new_task2prob[task_id] = np.exp(
                        (f - max_f) / args.temperature)
                new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob))
            elif args.prob_f == 'softmax2':
                # 1 - softmax(rate)
                max_f = max(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = np.exp(
                        (rate - max_f) / args.temperature)
                new_task2prob = 1. - new_task2prob / (1e-7 +
                                                      sum(new_task2prob))
            else:
                raise RuntimeError(
                    'prob-f should be either "softmax", "softmax2" or "linear"'
                )
            # Exponential smoothing between the old and new distributions.
            alpha = args.success_rate_smoothing
            task2prob = alpha * task2prob + (1 - alpha) * new_task2prob
            task2prob /= sum(task2prob)
            assert all(task2prob > 0)  # strictly!
        wandb.log(
            {
                f'task2prob/{task_id2name[task_id]}': prob
                for task_id, prob in enumerate(task2prob)
            },
            step=i)
        # meta-test
        if i % args.eval_every == 0:
            print(f'Evaluating on meta-test')
            # save policy network
            _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i))
            with open(_save_path, 'wb') as f:
                torch.save(policy.state_dict(), f)
            wandb.save(_save_path)
            # Evaluate on meta-test
            tasks = test_sampler.sample_tasks(num_tasks=2 *
                                              args.meta_batch_size)
            # Note: Dirty hack to overcome metaworld dirty hack
            task_names = [
                test_sampler._env._task_names[t['task']] for t in tasks
            ]
            episodes = test_metalearner.sample(tasks,
                                               first_order=args.first_order)
            r_before = total_rewards([ep.rewards for ep, _ in episodes])
            r_after = total_rewards([ep.rewards for _, ep in episodes])
            test_episode_infos = [ep._info_list for ep, _ in episodes]
            success_rate_before, task_success_rate_before = get_success_rate(
                test_episode_infos, task_names, per_task=True)
            test_episode_infos = [ep._info_list for _, ep in episodes]
            success_rate_after, task_success_rate_after = get_success_rate(
                test_episode_infos, task_names, per_task=True)
            wandb.log(
                {
                    'total_rewards_test/before_update': r_before,
                    'total_rewards_test/after_update': r_after,
                    'success_rate_test/before_update': success_rate_before,
                    'success_rate_test/after_update': success_rate_after,
                    'success_rate_test/improvement':
                    success_rate_after - success_rate_before
                },
                step=i)
            wandb.log(
                {
                    f'success_rate_test/after_update/{task}': rate
                    for task, rate in task_success_rate_after.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/before_update/{task}': rate
                    for task, rate in task_success_rate_before.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/imrovement/{task}':
                    task_success_rate_after[task] -
                    task_success_rate_before[task]
                    for task in task_success_rate_before.keys()
                },
                step=i)
    print('Saving the final model')
    # save final policy
    _save_path = os.path.join(save_folder, 'policy-final.pt')
    with open(_save_path, 'wb') as f:
        torch.save(policy.state_dict(), f)
    wandb.save(_save_path)
def main(args):
    """Meta-train a MAML policy with TRPO-style outer updates, logging to
    both `logger` (stdout/log/csv) and TensorBoard.

    Args:
        args: parsed command-line namespace providing env and optimizer
            hyper-parameters (`env_name`, `fast_batch_size`, `num_workers`,
            `hidden_size`, `num_layers`, `gamma`, `fast_lr`, `tau`,
            `device`, `num_batches`, `meta_batch_size`, `first_order`,
            TRPO line-search settings) plus `log_dir`.
    """
    # Envs with a continuous action space get a Gaussian policy; anything
    # else is assumed discrete and gets a categorical policy.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])
    writer = SummaryWriter(log_dir=args.log_dir)
    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    # Persist the run configuration. Fix: the original passed a bare
    # open(...) into json.dump, leaking the file handle; a context manager
    # guarantees the file is flushed and closed.
    # NOTE(review): vars(args) is dumped as-is — if args.device is a
    # torch.device this would not be JSON-serializable; confirm it is a str.
    with open(os.path.join(args.log_dir, 'params.json'), 'w') as f:
        json.dump(vars(args), f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # episodes: list of (pre-adaptation, post-adaptation) episode pairs.
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Compute each aggregate once instead of once per logging sink.
        return_pre = total_rewards([ep.rewards for ep, _ in episodes])
        return_post = total_rewards([ep.rewards for _, ep in episodes])
        # Tensorboard
        writer.add_scalar('total_rewards/before_update', return_pre, batch)
        writer.add_scalar('total_rewards/after_update', return_post, batch)
        logger.logkv('return_avg_pre', return_pre)
        logger.logkv('return_avg_post', return_post)
        logger.dumpkvs()
def main(args):
    """Meta-train with either vanilla MAML (``args.alg == 'simul'``) or a
    greedy multi-meta-policy scheme (``args.alg == 'greedy'``).

    Logs to TensorBoard under ./logs/<alg> and saves checkpoints/config
    under ./saves/<alg>.
    """
    set_random_seed(args.random)
    # Continuous-control benchmarks get a Gaussian policy; anything else
    # is treated as a discrete-action env.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run config; 'device' is replaced by its type string since
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           seed=args.random)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    if args.alg == 'simul':
        # Vanilla MAML: a single meta-policy, trained for
        # meta_policy_num * num_batches outer iterations so the total
        # compute matches the multi-policy branch below.
        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)
        for batch in range(args.meta_policy_num * args.num_batches):
            # First sample tasks under the task distribution.
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # episodes: (train episodes, test episodes after adaptation) pairs.
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
            # Tensorboard
            writer.add_scalar(
                'maml/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'maml/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)
            # Save policy network
            with open(
                    os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                    'wb') as f:
                torch.save(policy.state_dict(), f)
    elif args.alg == 'greedy':
        # Multi-policy MAML: optimize meta_policy_num meta-policies one at
        # a time, greedily, then visualize their behavior.
        metalearner = KPolicyMetaLearner(sampler,
                                         policy,
                                         baseline,
                                         args.meta_policy_num,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device)
        # Collected rollouts used to visualize the policies' behavior.
        trajectories = []
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            # Select which meta-policy subsequent step() calls will update.
            metalearner.optimize_policy_index(policy_idx)
            for batch in range(args.num_batches):
                print('batch num %d' % batch)
                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)
                episodes = metalearner.sample(tasks,
                                              first_order=args.first_order)
                # Loss is computed inside, then the policies are updated.
                metalearner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
                # Log per-task post-adaptation return for this meta-policy.
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' + str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)
            # Use a random task (no update here anyway) to visualize
            # the meta-policies.
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))
        plotTrajectories(trajectories)
def main(args):
    """MAML meta-training entry point tracked with Weights & Biases."""
    # Random 4-character id so related wandb runs can be grouped together.
    group_name = ''.join([
        random.choice(string.ascii_letters + string.digits) for n in range(4)
    ])
    wandb.init(group=group_name, job_type='optimizer', tensorboard=True)
    wandb.config.update(args)
    device = torch.device(args.device)
    # Continuous-control benchmarks get a Gaussian policy; anything else
    # is treated as a discrete-action env.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run config; 'device' is stored as its type string because
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=device.type)
        json.dump(config, f, indent=2)
    # NOTE(review): this BatchSampler variant takes the wandb group name as
    # its first positional argument — presumably to tag worker runs; confirm.
    sampler = BatchSampler(group_name,
                           args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=device)
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # episodes: list of (pre-adaptation, post-adaptation) episode pairs.
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        # Save policy network
        with open(
                os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Meta-train either a flat MAML policy or a hierarchical variant.

    With ``args.hierarchical`` false this is standard MAML + TRPO. With it
    true, a lower-level skill policy is trained and frozen each outer
    iteration while a higher-level meta-policy adapts on top of it.
    Checkpoints and config go to ./saves/<output_folder>.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Pusher'
    ])
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run config; 'device' is replaced by its type string since
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    if not args.hierarchical:
        sampler = BatchSampler(args.env_name,
                               batch_size=args.fast_batch_size,
                               num_workers=args.num_workers)
        if continuous_actions:
            policy = NormalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                int(np.prod(sampler.envs.action_space.shape)),
                hidden_sizes=(args.hidden_size, ) * args.num_layers)
        else:
            policy = CategoricalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                sampler.envs.action_space.n,
                hidden_sizes=(args.hidden_size, ) * args.num_layers)
        baseline = LinearFeatureBaseline(
            int(np.prod(sampler.envs.observation_space.shape)))
        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)
        # Fix: the original used `for i, batch in enumerate(range(...))`
        # where i == batch always; a single loop variable is equivalent.
        for batch in range(args.num_batches):
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            episodes = metalearner.sample(tasks,
                                          first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
            # Compute each aggregate once; shared by stdout and TensorBoard.
            reward_before = total_rewards(
                [ep.rewards for ep, _ in episodes])
            reward_after = total_rewards(
                [ep.rewards for _, ep in episodes])
            print('Total Rewards', str(reward_after))
            # Tensorboard
            writer.add_scalar('total_rewards/before_update', reward_before,
                              batch)
            writer.add_scalar('total_rewards/after_update', reward_after,
                              batch)
            if (batch + 1) % args.save_every == 0:
                # Save policy network (the whole module, not a state_dict,
                # matching the original behaviour).
                with open(
                        os.path.join(save_folder,
                                     'policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(policy, f)
    else:
        sampler = BatchSampler(args.env_name,
                               batch_size=args.fast_batch_size,
                               num_workers=args.num_workers)
        # Get the policies: higher-level meta-policy, lower-level skill
        # trainer, and a value baseline.
        higher_policy, lower_trainer, baseline = hierarchical_meta_policy(
            args.env_name,
            args.skills_dim,
            sampler=sampler,
            net_size=args.hidden_size,
            output_size=1)
        # Define the hierarchical meta learner
        hr_meta_learner = HierarchicalMetaLearner(sampler,
                                                  higher_policy,
                                                  baseline,
                                                  gamma=args.gamma,
                                                  fast_lr=args.fast_lr,
                                                  tau=args.tau,
                                                  device=args.device)
        # Training procedure (same enumerate fix as the flat branch).
        for batch in range(args.num_batches):
            # Train the lower level policy
            lower_trainer.train()
            # Now freeze the lower level policy while the meta-policy adapts
            lower_networks = lower_trainer.networks
            lower_policy = lower_networks[0]
            lower_policy.trainable = False
            # Sample the different tasks
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # Sample the different episodes for the different tasks
            episodes = hr_meta_learner.sample(tasks,
                                              lower_policy,
                                              first_order=args.first_order)
            hr_meta_learner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
            reward_before = total_rewards(
                [ep.rewards for ep, _ in episodes])
            reward_after = total_rewards(
                [ep.rewards for _, ep in episodes])
            print('Total Rewards', str(reward_after))
            # Unfreeze for the next round of lower-level training.
            lower_policy.trainable = True
            # Tensorboard
            writer.add_scalar('total_rewards/before_update', reward_before,
                              batch)
            writer.add_scalar('total_rewards/after_update', reward_after,
                              batch)
            if (batch + 1) % args.save_every == 0:
                # Save the policy networks
                with open(
                        os.path.join(save_folder,
                                     'h_policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(higher_policy, f)
                with open(
                        os.path.join(save_folder,
                                     'l_policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(lower_policy, f)
                with open(os.path.join(save_folder, 'baseline.pt'),
                          'wb') as f:
                    torch.save(baseline, f)
def k_shot_experiments(args):
    """Run K-shot adaptation experiments with a previously meta-learned
    policy, plot the testing curve, and save the raw returns tensor.

    Pretrained/random baselines are currently disabled (commented out) but
    their policies are still constructed below.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # NOTE(review): policy_pretrained and policy_random are only used by
    # the commented-out baselines below; they are kept (not deleted)
    # because constructing them consumes RNG draws, so removal would shift
    # downstream random state.
    if continuous_actions:
        policy_pretrained = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_metalearned = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_random = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy_pretrained = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_metalearned = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_random = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    # save_folder_pretrained = './saves/{0}'.format(args.output_folder + '_pretrained')
    # pretrained_model = os.path.join(save_folder_pretrained, 'policy-{0}.pt'.format(args.num_batches-1))
    # policy_pretrained.load_state_dict(torch.load(pretrained_model))
    # Load the final meta-learned checkpoint (last training batch).
    save_folder_metalearned = './saves/{0}'.format(args.output_folder +
                                                   '_metalearned')
    metalearned_model = os.path.join(
        save_folder_metalearned,
        'policy-{0}.pt'.format(args.num_batches - 1))
    policy_metalearned.load_state_dict(torch.load(metalearned_model))
    # metalearned_tester = k_shot_tester(args.K_shot_batch_num, policy_metalearned, args.K_shot_batch_size, args.K_shot_num_tasks, 'MetaLearned', args)
    # avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()
    # print('Metalearned KSHOT result: ', avg_discounted_returns_metalearned)
    # print('Mean: ', torch.mean(avg_discounted_returns_metalearned, 0))
    # Output locations for the figure and the raw result tensors.
    results_folder = './saves/{0}'.format(args.output_folder + '_results')
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    kshot_fig_path1 = os.path.join(results_folder, 'kshot_testing')
    # kshot_fig_path2 = os.path.join(results_folder, 'ml_pre_diff')
    result_data_path = os.path.join(results_folder, 'data_')
    metalearned_tester = k_shot_tester(args.K_shot_batch_num,
                                       policy_metalearned,
                                       args.K_shot_batch_size,
                                       args.K_shot_num_tasks, 'MetaLearned',
                                       args)
    # Rows: tasks/runs; columns: gradient-step index 0..K_shot_batch_num.
    avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()
    # pretrained_tester = k_shot_tester(args.K_shot_batch_num, policy_pretrained, args.K_shot_batch_size, args.K_shot_num_tasks, 'Pretrained', args)
    # avg_discounted_returns_pretrained = pretrained_tester.run_k_shot_exp()
    # random_tester = k_shot_tester(args.K_shot_batch_num, policy_random, args.K_shot_batch_size, args.K_shot_num_tasks, 'Random', args)
    # avg_discounted_returns_random = random_tester.run_k_shot_exp()
    plt.figure('K Shot: Testing Curves')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_pretrained, color=np.array([0.,0.,1.]), label='Pre-Trained')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_metalearned, color=np.array([0.,1.,0.]), label='Meta-Learned')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_random, color=np.array([0.,0.,0.]), label='Random')
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0.,0.,1.]), label='Pre-Trained', capsize=5, capthick=2)
    # Mean +/- std of the discounted return at each adaptation step.
    plt.errorbar([i for i in range(args.K_shot_batch_num + 1)],
                 torch.mean(avg_discounted_returns_metalearned, 0).tolist(),
                 torch.std(avg_discounted_returns_metalearned, 0).tolist(),
                 color=np.array([0., 1., 0.]),
                 label='Meta-Learned',
                 capsize=5,
                 capthick=2)
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_random, 0).tolist(), torch.std(avg_discounted_returns_random, 0).tolist(), color=np.array([0.,0.,0.]), label='Random', capsize=5, capthick=2)
    plt.xlabel('Gradient Descent Iteration Number')
    plt.ylabel('Average Discounted Return')
    plt.title('K Shot: Testing Curves')
    plt.legend(loc='upper left')
    plt.savefig(kshot_fig_path1)
    # plt.show()
    # plt.figure('K Shot: Difference between Metalearned and Pretrained')
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_metalearned-avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_metalearned-avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0.,0.,0.]), capsize=5, capthick=2)
    # plt.xlabel('Gradient Descent Iteration Number')
    # plt.ylabel('Average Discounted Return Difference')
    # plt.title('K Shot: Difference between Metalearned and Pretrained')
    # plt.savefig(kshot_fig_path2)
    # plt.show()
    # Save the raw torch tensor so results can be combined with other runs.
    # torch.save(avg_discounted_returns_pretrained, result_data_path + 'pretrained')
    torch.save(avg_discounted_returns_metalearned,
               result_data_path + 'metalearned')
    return
def train_pretrained_model(args):
    """Pretrain a single policy across tasks with no meta-objective: each
    batch, the policy weights are overwritten by their one-step adapted
    parameters. Supports resuming from a saved checkpoint."""
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])
    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run config; 'device' is stored as its type string because
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    # batch_size=2*args.fast_batch_size to match the amount of data used
    # in meta-learning (train + validation rollouts per task).
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    # Optionally resume: load the checkpoint preceding start_from_batch and
    # continue training from that batch index.
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch
    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)
    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))
        # Wall-clock timing of the three phases, printed per batch.
        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time
        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(
            tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time
        learning_step_time = time.time()
        # One adaptation step, then commit the adapted parameters as the
        # new policy weights (pretraining, not meta-learning).
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time
        print('Tasking Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))
        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)", total_rewards([episodes.rewards], args.gamma), batch+1)
        # Save policy network
        with open(
                os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)
    return
def main(args):
    """Meta-train a predator policy against a prey agent; periodically
    evaluates few-shot adaptation on held-out test tasks and checkpoints."""
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(
        args.log_name))  # Tensorboard logging
    log = set_log(args)
    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           args=args)
    # NOTE Observation space is a list with [predator0, predator1, ..., prey]
    # Thus using the index of 0
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))
    meta_learner = MetaLearner(sampler,
                               policy,
                               baseline,
                               gamma=args.gamma,
                               fast_lr=args.fast_lr,
                               tau=args.tau,
                               device=args.device,
                               args=args,
                               log=log,
                               tb_writer=tb_writer)
    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")
    # Shares the same policy object as meta_learner; used only for the
    # periodic few-shot evaluation below.
    meta_tester = MetaTester(sampler,
                             policy,
                             baseline,
                             gamma=args.gamma,
                             fast_lr=args.fast_lr,
                             tau=args.tau,
                             device=args.device,
                             args=args,
                             log=log,
                             tb_writer=tb_writer)
    prey = Prey(env=sampler._env,
                args=args,
                log=log,
                tb_writer=tb_writer,
                name="prey",
                i_agent=0)
    # Meta-train starts
    iteration = 0
    # NOTE(review): no termination condition — this loop runs until the
    # process is killed externally.
    while True:
        # Sample train and validation episode
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     test=False)
        episodes = meta_learner.sample(tasks,
                                       prey,
                                       first_order=args.first_order,
                                       iteration=iteration)
        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)
        # Test meta-policy every 10 iterations on held-out tasks.
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration,
                                            prey=prey)
        # Checkpoint every 100 iterations.
        if iteration % 100 == 0:
            meta_learner.save(iteration)
        iteration += 1
def main(args):
    """MAML variant with a learned critic (actor-critic meta-learner),
    tracked with Weights & Biases."""
    # Report only a whitelisted subset of hyper-parameters to wandb.
    wandb.config.update({
        k: v
        for k, v in vars(args).items()
        if k in ['env_name', 'tau', 'critic_lr']
    })
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run config; 'device' is stored as its type string because
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    # NOTE(review): this BatchSampler variant takes the seed positionally.
    sampler = BatchSampler(args.env_name,
                           args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    # State-value critic: observation -> scalar value estimate.
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)),
                    1,
                    hidden_sizes=(args.hidden_size, ) * args.num_layers)
    metalearner = ActorCriticMetaLearner(sampler,
                                         policy,
                                         critic,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device,
                                         critic_lr=args.critic_lr)
    # Track critic gradients/parameters in wandb.
    wandb.watch(metalearner.critic)
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        # step() returns the critic's meta-loss for logging.
        meta_critic_loss = metalearner.step(
            episodes,
            max_kl=args.max_kl,
            cg_iters=args.cg_iters,
            cg_damping=args.cg_damping,
            ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)
        # Logging
        wandb.log(
            {
                'total_rewards/before_update':
                total_rewards([ep.rewards for ep, _ in episodes])
            },
            step=batch)
        wandb.log(
            {
                'total_rewards/after_update':
                total_rewards([ep.rewards for _, ep in episodes])
            },
            step=batch)
        wandb.log({'meta critic loss': meta_critic_loss.detach().item()},
                  step=batch)
        # Save policy network
        with open(
                os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """MAML training entry point; with ``args.load_dir`` set it loads a
    saved policy, reports one evaluation batch, and exits."""
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DPointEnvCorner-v0'
    ])
    # Unless using a throwaway 'trial' folder, pick the first unused
    # numeric output-folder name so previous runs are not clobbered.
    # NOTE(review): this overwrites args.output_folder with a bare counter,
    # discarding the user-supplied name — confirm that is intended.
    save_folder = './saves/{0}'.format(args.env_name + '/' +
                                       args.output_folder)
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(args.env_name + '/' +
                                               args.output_folder)
    log_directory = './logs/{0}'.format(args.env_name + '/' +
                                        args.output_folder)
    # NOTE(review): os.makedirs raises if the folder already exists, which
    # can happen when reusing a 'trial' folder — confirm expected.
    os.makedirs(save_folder)
    writer = SummaryWriter('./logs/{0}'.format(args.env_name + '/' +
                                               args.output_folder))
    # Persist the run config; 'device' is stored as its type string because
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    # Optional warm start / evaluation checkpoint.
    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))
    # NOTE(review): this MetaLearner variant also takes `args` positionally.
    metalearner = MetaLearner(sampler, policy, baseline, args,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        print('total_rewards/before_update',
              total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update',
              total_rewards([ep.rewards for _, ep in episodes]), batch)
        # Plotting figure
        # plotting(episodes, batch, save_folder,args.num_plots)
        # In evaluation mode, stop after reporting the first batch.
        if args.load_dir is not None:
            sys.exit(0)
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        # Save policy network
        with open(
                os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """MAML training for (RVO) navigation environments; logs per-component
    rewards to text files and TensorBoard, and periodically dumps episode
    trajectories/tasks as pickles for offline visualization."""
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0',
        'RVONavigationAll-v0'
    ])
    # Only continuous-action envs are supported by this script.
    assert continuous_actions == True
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    log_traj_folder = './logs/{0}'.format(args.output_traj_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    if not os.path.exists(log_traj_folder):
        os.makedirs(log_traj_folder)
    # Persist the run config; 'device' is stored as its type string because
    # a torch.device is not JSON-serializable.
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    # log_reward_total_file = open('./logs/reward_total.txt', 'a')
    # log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
    # log_reward_col_file = open('./logs/reward_col.txt', 'a')
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # print(sampler.envs.observation_space.shape)
    # print(sampler.envs.action_space.shape)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    # baseline = LinearFeatureBaseline(
    #     int(np.prod(sampler.envs.observation_space.shape)))
    # NOTE(review): the baseline is hard-coded to 2-D features instead of
    # the full observation — presumably goal-relative position; confirm.
    baseline = LinearFeatureBaseline(int(np.prod((2, ))))
    # NOTE(review): resume is hard-coded on and points at a
    # machine-specific checkpoint path — parameterize before reuse.
    resume_training = True
    if resume_training:
        saved_policy_path = os.path.join(
            './TrainingResults/result2//saves/{0}'.format(
                'maml-2DNavigation-dir'), 'policy-180.pt')
        if os.path.isfile(saved_policy_path):
            print('Loading a saved policy')
            policy_info = torch.load(saved_policy_path)
            policy.load_state_dict(policy_info)
        else:
            sys.exit("The requested policy does not exist for loading")
    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)
    start_time = time.time()
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # print("observations shape: ")
        # print(episodes[0][1].observations.shape)
        # Tensorboard
        # NOTE(review): this env's total_rewards returns a
        # (total, distance, collision) reward triple, not a scalar.
        total_reward_be, dist_reward_be, col_reward_be = total_rewards(
            [ep.rewards for ep, _ in episodes])
        total_reward_af, dist_reward_af, col_reward_af = total_rewards(
            [ep.rewards for _, ep in episodes])
        # Re-open the reward logs each batch so appended rows reach disk
        # even if the run is killed mid-training.
        log_reward_total_file = open('./logs/reward_total.txt', 'a')
        log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
        log_reward_col_file = open('./logs/reward_col.txt', 'a')
        log_reward_total_file.write(
            str(batch) + ',' + str(total_reward_be) + ',' +
            str(total_reward_af) + '\n')
        log_reward_dist_file.write(
            str(batch) + ',' + str(dist_reward_be) + ',' +
            str(dist_reward_af) + '\n')
        log_reward_col_file.write(
            str(batch) + ',' + str(col_reward_be) + ',' +
            str(col_reward_af) + '\n')
        log_reward_total_file.close(
        )  # not sure if open and close immediately will help save the appended logs in-place
        log_reward_dist_file.close()
        log_reward_col_file.close()
        writer.add_scalar('total_rewards/before_update', total_reward_be,
                          batch)
        writer.add_scalar('total_rewards/after_update', total_reward_af,
                          batch)
        writer.add_scalar('distance_reward/before_update', dist_reward_be,
                          batch)
        writer.add_scalar('distance_reward/after_update', dist_reward_af,
                          batch)
        writer.add_scalar('collison_rewards/before_update', col_reward_be,
                          batch)
        writer.add_scalar('collison_rewards/after_update', col_reward_af,
                          batch)
        if batch % args.save_every == 0:
            # maybe it can save time/space if the models are saved only periodically
            # Save policy network
            print('Saving model {}'.format(batch))
            with open(
                    os.path.join(save_folder,
                                 'policy-{0}.pt'.format(batch)),
                    'wb') as f:
                torch.save(policy.state_dict(), f)
        # Every 30th batch archive trajectories/tasks under a batch-stamped
        # name; otherwise overwrite the rolling 'latest_*' files.
        if batch % 30 == 0:
            with open(
                    os.path.join(
                        log_traj_folder,
                        'train_episodes_observ_' + str(batch) + '.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for ep, _ in episodes],
                    f)
            with open(
                    os.path.join(
                        log_traj_folder,
                        'valid_episodes_observ_' + str(batch) + '.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for _, ep in episodes],
                    f)
            # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)
            # save tasks
            # a sample task list of 2: [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}]
            with open(
                    os.path.join(log_traj_folder,
                                 'tasks_' + str(batch) + '.pkl'),
                    'wb') as f:
                pickle.dump(tasks, f)
        else:
            # supposed to be overwritten for each batch
            with open(
                    os.path.join(log_traj_folder,
                                 'latest_train_episodes_observ.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for ep, _ in episodes],
                    f)
            with open(
                    os.path.join(log_traj_folder,
                                 'latest_valid_episodes_observ.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for _, ep in episodes],
                    f)
            # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)
            with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'),
                      'wb') as f:
                pickle.dump(tasks, f)
        print('finished epoch {}; time elapsed: {}'.format(
            batch, time_elapsed(time.time() - start_time)))
#ant flips over way too easily.. seems to need to keep kl and fast_lr both small #try giving it more episodes continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0' ]) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: the_model = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: the_model = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) #loading the model save_folder = './saves/{0}'.format(args.output_folder) the_model.load_state_dict( torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)))) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape)))
def main(args):
    """Meta-train a policy with the NG-LVC-VPG meta-learner.

    Builds the sampler/policy/baseline from ``args``, then for each of
    ``args.num_batches`` meta-iterations: samples tasks, adapts and collects
    episodes, takes a meta-step (SGD or Adam variant per ``args.optimizer``),
    logs reward/KL/parameter-distance statistics to TensorBoard, and
    checkpoints the policy to ``./saves/<output_folder>/policy-<batch>.pt``.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (minus the non-serializable device object).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        # Gaussian policy sized from the flattened obs/action spaces.
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler, policy, baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device,
                                      verbose=args.verbose)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

        start = time.time()
        # Inner-loop adaptation + rollout; also returns per-task KL divergences
        # and parameter-distance diagnostics between pre/post-update policies.
        episodes, kls, param_diffs = metalearner.sample(
            tasks, first_order=args.first_order, cg_iters=args.cg_iters)
        sample_time = time.time() - start

        start = time.time()
        # BUG FIX: was `args.optimizer is 'sgd'` — identity comparison against a
        # string literal is implementation-dependent (and a SyntaxWarning on
        # CPython >= 3.8); use equality so the SGD branch is actually taken.
        if args.optimizer == 'sgd':
            metalearner.step_sgd(episodes, max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes, max_kl=args.max_kl,
                                  cg_iters=args.cg_iters,
                                  cg_damping=args.cg_damping,
                                  ls_max_steps=args.ls_max_steps,
                                  ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        writer.add_scalar('kl-mean between meta update',
                          torch.mean(torch.stack(kls)), batch)
        writer.add_scalar('kl-std between meta update',
                          torch.std(torch.stack(kls)), batch)
        writer.add_scalar('Euclidean-distance-mean between meta update',
                          torch.mean(torch.stack(param_diffs)), batch)
        writer.add_scalar('Euclidean-distance-std between meta update',
                          torch.std(torch.stack(param_diffs)), batch)

        print("Batch {}. before_update: {}, after_update: {}\n sample time {}, update_time {}".format(
            batch,
            total_rewards([ep.rewards for ep, _ in episodes]),
            total_rewards([ep.rewards for _, ep in episodes]),
            sample_time, update_time))
        print("Batch {}. kl-divergence between meta update: {}, kl std: {}".format(
            batch, torch.mean(torch.stack(kls)), torch.std(torch.stack(kls))))
        print("Batch {}. Euclidean-distance-mean meta update: {}, Euclidean-distance-std: {}".format(
            batch, torch.mean(torch.stack(param_diffs)),
            torch.std(torch.stack(param_diffs))))

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def eval(args):
    """Evaluate saved policy/tree checkpoints over successive training batches.

    For each batch index, loads ``policy-<batch>.pt`` and ``tree-<batch>.pt``
    from the save folder, samples ``args.meta_batch_size`` tasks, rolls out the
    policy on each task, and appends the mean per-task return to
    ``reward_list_eval.pkl``. Iteration stops (with a final dump) at the first
    missing checkpoint.

    NOTE(review): shadows the ``eval`` builtin — name kept because callers
    depend on it.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])
    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    log_folder = './logs/{0}'.format(args.output_folder)
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if args.env_name == 'AntPos-v0':
        # Task-parameter bounds for the position task; the TreeLSTM embeds the
        # 2-D task parameter into a hidden vector consumed by the policy.
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}
        tree = TreeLSTM(args.tree_hidden_layer,
                        len(param_bounds.keys()),
                        args.cluster_0,
                        args.cluster_1,
                        device=args.device)
    if continuous_actions:
        # Observation is concatenated with the tree embedding, hence the
        # enlarged input size.
        policy = NormalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                 int(np.prod(sampler.envs.action_space.shape)),
                                 hidden_sizes=(args.hidden_size, ) *
                                 args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    policy.eval()
    # NOTE(review): `tree` is only bound in the AntPos-v0 branch above, so this
    # raises NameError for every other env — looks like this function is only
    # used with AntPos-v0; confirm before calling it with other envs.
    tree.eval()
    all_tasks = []
    # torch.autograd.set_detect_anomaly(True)
    reward_list = []
    for batch in range(args.num_batches + 1):
        print("starting iteration {}".format(batch))
        try:
            policy.load_state_dict(
                torch.load(
                    os.path.join(save_folder,
                                 'policy-{0}.pt'.format(batch))))
            # The tree checkpoint is a whole pickled module, not a state_dict.
            tree = torch.load(
                os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
            tree.eval()
        except Exception:
            # A missing checkpoint means training did not reach this batch:
            # dump what was collected so far and stop.
            with open(
                    './logs/{0}/reward_list_eval.pkl'.format(
                        args.output_folder), 'wb') as pf:
                pickle.dump(reward_list, pf)
            print(reward_list)
            return
        # tree.load_state_dict(torch.load(os.path.join(save_folder,
        # 'tree-{0}.pt'.format(batch))))
        tasks = sampler.sample_tasks(args.meta_batch_size)
        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # Rewrite the cumulative task log each batch so a crash loses nothing.
        with open('./logs/{0}/task_list_eval.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)
        # NOTE(review): `.format(batch)` here is a no-op (no placeholder in the
        # literal) — left untouched.
        print("evaluating...".format(batch))
        all_rewards = []
        for task in tasks:
            print(task["position"])
            episodes = sampler.sample(policy, task, tree=tree)
            # print("training...".format(batch))
            # tr = [ep.rewards for ep in episodes]
            # tr = np.mean([torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr])
            all_rewards.append(total_rewards(episodes.rewards))
        # Mean return across this batch's tasks; persisted every iteration.
        reward_list.append(np.mean(all_rewards))
        with open(
                './logs/{0}/reward_list_eval.pkl'.format(args.output_folder),
                'wb') as pf:
            pickle.dump(reward_list, pf)
    print(reward_list)
def main(args):
    """Meta-train with a learned rewarder on multiworld environments.

    Builds train/val samplers and a Gaussian policy, then for each meta-batch:
    samples tasks, collects adapted episodes, takes a TRPO-style meta-step
    (only once the rewarder has been fit at least once), periodically refits
    the rewarder, logs, checkpoints the policy, runs validation, and
    optionally spawns a plotting subprocess.
    """
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    # BUG FIX: the original passed a bare `open(...)` into json.dump, leaking
    # the file handle (and risking an unflushed write on non-CPython runtimes).
    # A `with` block guarantees the params file is closed.
    with open(os.path.join(args.log_dir, 'params.json'), 'w') as f:
        json.dump(vars(args), f, indent=2)

    sampler = BatchSamplerMultiworld(args)
    sampler_val = BatchSamplerMultiworld(args, val=True)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers,
            bias_transformation_size=args.bias_transformation_size,
            init_gain=args.init_gain,
        )
    else:
        # Discrete-action variant not supported by this training script.
        raise NotImplementedError
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              entropy_coef=args.entropy_coef,
                              device=args.device)

    start_time = time.time()
    processes = []
    for batch in range(args.num_batches):
        metalearner.reset()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        # Skip the policy meta-update until the rewarder has been fit at least
        # once — before that, episode rewards are meaningless.
        if sampler.rewarder.fit_counter > 0:
            metalearner.step(episodes, max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
        if batch % args.rewarder_fit_period == 0:
            sampler.fit_rewarder(logger)
        if args.rewarder == 'unsupervised':
            sampler.log_unsupervised(logger)

        log_main(logger, episodes, batch, args, start_time, metalearner)

        if batch % args.save_period == 0 or batch == args.num_batches - 1:
            save_model_maml(args, policy, batch)
        if batch % args.val_period == 0 or batch == args.num_batches - 1:
            val(args, sampler_val, policy, baseline, batch)
        if batch % args.vis_period == 0 or batch == args.num_batches - 1:
            if args.plot:
                # Fire-and-forget plotting; handles are kept in `processes`
                # but never joined here (matches original behavior).
                p = Popen(
                    'python maml_rl/utils/visualize.py --log-dir {}'.format(
                        args.log_dir),
                    shell=True)
                processes.append(p)

        logger.dumpkvs()
main(args) else: env = gym.make(id) # maml = [] #indexes = [e for e in range(400) if e % 10 == 9] #indexes = [0] + indexes indexes = [399] num_test_tasks = 100 buckets = 1 successes = [] for index in indexes: sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) model = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) checkpoint = torch.load( '../final_models/meta/{0}/policy-{1}.pt'.format( args.to_pickle, index)) model.load_state_dict(checkpoint) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, model, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device)