print("new task: ", test_task[0], ", where 1 is forward") #task = env.unwrapped.sample_tasks(1) env.unwrapped.reset_task(test_task[0]) observations = env.reset() print("new task: ", env.step([1])[3]['task'], ", where 1 is forward") _theta = env.step([1])[3]['task'] degrees = 180 * _theta['theta'] / np.pi print("new task in degrees: ", degrees) train_episodes = metalearner.sampler.sample(the_model, gamma=args.gamma, device=args.device) print("len of train episoid: ", len(train_episodes)) print(train_episodes) params = metalearner.adapt(train_episodes, first_order=args.first_order) valid_episodes = metalearner.sampler.sample(the_model, params=params, gamma=args.gamma, device=args.device) episodes.append((train_episodes, valid_episodes)) for param in [None, params]: for i in np.arange(1): observations = env.reset() if param == None: print("New episode before gradient update") else: print("New episode after one gradient update") rewards = 0
def train_pretrained_model(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
        'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
        '2DNavigation-v0', 'MountainCarContinuousVT-v0'])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # batch_size=2*args.fast_batch_size to match the amount of data used
    # in meta-learning
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load a previously saved policy when resuming from a given batch
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(
            tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)",
        #     total_rewards([episodes.rewards], args.gamma), batch+1)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    return
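# Usage sketch (not from the original source): `train_pretrained_model` expects
# an argparse-style namespace. The attribute names below mirror exactly what the
# function reads; the concrete values are illustrative assumptions only.
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(
        env_name='HalfCheetahDir-v1',    # one of the continuous-action envs
        output_folder='halfcheetah-dir',
        fast_batch_size=20, num_workers=4,
        hidden_size=100, num_layers=2,
        gamma=0.95, fast_lr=0.1, tau=1.0,
        first_order=False,
        meta_batch_size=40, num_batches=200,
        start_from_batch=-1,             # -1 means train from scratch
        device=torch.device('cpu'))
    train_pretrained_model(args)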