if __name__ == "__main__":
    # CLI layout: argv[1]=env name, argv[2]=num epochs, argv[3]=reward scale,
    # argv[5]=seed, argv[6]=run tag (argv[4] is not read here).
    env_name = sys.argv[1]
    reward_scale = float(sys.argv[3])
    seed = int(sys.argv[5])

    variant = dict(
        algo_params=dict(
            num_epochs=int(sys.argv[2]),
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            batch_size=128,
            max_path_length=999,
            discount=0.99,
            reward_scale=reward_scale,
            soft_target_tau=0.001,
            policy_lr=3E-4,
            qf_lr=3E-4,
            vf_lr=3E-4,
        ),
        net_size=300,
        env=env_name,
        algo_name="virel",
        algo_seed=seed,
    )

    # Seed the global RNGs before anything stochastic runs.
    random.seed(seed)
    np.random.seed(seed)

    run_name = ("virel_" + "_" + sys.argv[1] + "_" + sys.argv[5] + "_" +
                sys.argv[3] + "_" + sys.argv[6])
    setup_logger(run_name, variant=variant)
    ptu.set_gpu_mode(True)
    experiment(variant)
# NOTE(review): this chunk begins mid-expression — the leading ")" closes a
# call whose opening parenthesis is outside this view; the SAC wiring below is
# the tail of an `experiment` function defined above this chunk.
)
algorithm = SoftActorCritic(env=env,
                            policy=policy,
                            qf=qf,
                            vf=vf,
                            **variant['algo_params'])
algorithm.to(ptu.device)  # move all networks to the configured torch device
algorithm.train()


if __name__ == "__main__":
    # noinspection PyTypeChecker
    # Fixed hyperparameters for a single SAC run; forwarded verbatim into
    # SoftActorCritic via `variant['algo_params']`.
    variant = dict(
        algo_params=dict(
            num_epochs=1000,
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            batch_size=128,
            max_path_length=999,
            discount=0.99,
            reward_scale=1,
            soft_target_tau=0.001,
            policy_lr=3E-4,
            qf_lr=3E-4,
            vf_lr=3E-4,
        ),
        net_size=300,
    )
    setup_logger('name-of-experiment', variant=variant)
    experiment(variant)
def experiment(log_dir, variant_overwrite, cpu=False):
    """Load a trained model from ``log_dir`` and run evaluation episodes.

    ``variant_overwrite`` patches the stored variant (e.g. shaped rewards);
    ``cpu=True`` skips enabling the GPU.
    """
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Restore environment, snapshot data and the (patched) variant.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    goal_prior = variant['env_kwargs']['goal_prior']
    assert all(a == b for a, b in zip(env.sampled_goal, goal_prior))

    # Build a unique evaluation sub-directory beneath the training log dir.
    run_id = create_exp_name('eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    ))
    output_dir = os.path.join(log_dir, run_id)
    print('Logging to:', output_dir)
    setup_logger(
        log_dir=output_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Re-assemble the algorithm around the restored networks.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        SMMHook(base_algorithm=algorithm,
                discriminator=data['discriminator'],
                density_model=data['density_model'],
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(exp_specs):
    """Train a variational recurrent world model on replay-buffer frames.

    Posterior q(z | h, o, a) and prior p(z | h, a) are encoded with MLPs over
    a GRU hidden state; the decoder reconstructs the observation.  Every
    ``freq_bptt`` iterations the accumulated reconstruction + KL loss is
    backpropagated; every ``freq_val`` iterations a held-out rollout is scored.

    BUG FIX vs. original: the validation loop previously pulled batches from
    the *training* ``data_loader`` (leaving ``val_data_loader`` unused) and
    threaded the *training* hidden state ``prev_h_batch`` through its steps,
    both corrupting training state and "validating" on training data.  It now
    uses ``val_data_loader`` and ``val_prev_h_batch`` throughout.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])

    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    # First 40k transitions train; the remainder is the validation stream.
    data_loader = BasicDataLoader(next_obs_array[:40000], acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU())
    ae_dim = 128
    z_dim = 128
    pre_gru = nn.Sequential(nn.Linear(288 + z_dim + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_fc = nn.Sequential(nn.Linear(ae_dim + 288 + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    post_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_fc = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU(),
                             nn.Linear(ae_dim, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU())
    prior_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    gru = nn.GRUCell(ae_dim, ae_dim, bias=True)
    fc_decoder = nn.Sequential(
        nn.Linear(ae_dim + z_dim + 4, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim), nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim), nn.ReLU(),
        nn.Linear(ae_dim, 288, bias=False),
        nn.BatchNorm1d(288), nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(32, 32, 1, stride=1, padding=0,
                           output_padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU(),
        nn.ConvTranspose2d(32, 32, 1, stride=1, padding=0,
                           output_padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 3, 1, stride=1, padding=0, bias=True),
        nn.Sigmoid())

    # Every trainable module, in the same order the original optimizer used.
    all_modules = [
        pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc,
        post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc,
        prior_mean_fc
    ]
    if ptu.gpu_enabled():
        for module in all_modules:
            module.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam([p for m in all_modules for p in m.parameters()],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    KLs = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss = loss + total_KL
                loss.backward()
                model_optim.step()
            loss = 0
            total_KL = 0
            # Fresh hidden state at every truncation boundary.
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
        if iter_num % exp_specs['freq_val'] == 0:
            train_loss_print = '\t'.join(losses)
            train_KLs_print = '\t'.join(KLs)
            losses = []
            KLs = []

        # One training step: encode, infer posterior/prior, decode, recur.
        obs_batch, act_batch = data_loader.get_next_batch()
        enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
        hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1))
        post_mean = post_mean_fc(hidden)
        post_log_cov = post_log_cov_fc(hidden)
        hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1))
        prior_mean = prior_mean_fc(hidden)
        prior_log_cov = prior_log_cov_fc(hidden)
        recon = fc_decoder(
            torch.cat([prev_h_batch, act_batch, post_mean],
                      1)).view(obs_batch.size(0), 32, 3, 3)
        recon = conv_decoder(recon)
        hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
        prev_h_batch = gru(hidden, prev_h_batch)

        KL = compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov)
        # Skip the loss at episode boundaries (fresh state, stale target).
        if iter_num % episode_length != 0:
            loss = loss + torch.sum(
                (obs_batch.view(obs_batch.size(0), -1) -
                 recon.view(obs_batch.size(0), -1))**2, 1).mean()
            total_KL = total_KL + KL
        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        KLs.append('%.4f' % KL)

        if iter_num % (50 * exp_specs['episode_length']) in range(
                2 * exp_specs['episode_length']):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_recon.png' % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_obs.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            for module in all_modules:
                module.eval()
            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
            val_losses = []
            val_KLs = []
            for i in range(freq_bptt):
                # FIX: draw from the validation loader and thread the
                # validation hidden state (the original used data_loader and
                # prev_h_batch here).
                obs_batch, act_batch = val_data_loader.get_next_batch()
                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
                hidden = post_fc(
                    torch.cat([val_prev_h_batch, enc, act_batch], 1))
                post_mean = post_mean_fc(hidden)
                post_log_cov = post_log_cov_fc(hidden)
                hidden = prior_fc(torch.cat([val_prev_h_batch, act_batch], 1))
                prior_mean = prior_mean_fc(hidden)
                prior_log_cov = prior_log_cov_fc(hidden)
                recon = fc_decoder(
                    torch.cat([val_prev_h_batch, act_batch, post_mean],
                              1)).view(obs_batch.size(0), 32, 3, 3)
                recon = conv_decoder(recon)
                hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
                val_prev_h_batch = gru(hidden, val_prev_h_batch)
                val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())
                val_KL = compute_KL(prior_mean, prior_log_cov, post_mean,
                                    post_log_cov)
                val_KLs.append('%.4f' % val_KL)
            val_loss_print = '\t'.join(val_losses)
            val_KLs_print = '\t'.join(val_KLs)
            print('Val MSE:\t' + val_loss_print)
            print('Train MSE:\t' + train_loss_print)
            print('Val KL:\t\t' + val_KLs_print)
            print('Train KL:\t' + train_KLs_print)
            for module in all_modules:
                module.train()
def experiment(variant):
    """Build and train a PEARL-style meta-RL agent plus an exploration agent."""
    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    # Context transitions are (o, a, r) or (o, a, r, o') depending on the flag.
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    # With an information bottleneck the encoder emits mean and variance terms.
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    hidden_sizes = [200, 200, 200]
    if variant['algo_params']['snail']:
        encoder_model = SnailEncoder
        hidden_sizes = [20]
    context_encoder = encoder_model(
        hidden_sizes=hidden_sizes,
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    context_encoder.use_next_obs_in_context = variant['algo_params'][
        'use_next_obs_in_context']
    # Task-conditioned critic/value/policy: inputs include the latent z.
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = PEARLTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # Exploration agent networks condition on the raw observation only.
    qf1_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy_exp = PEARLTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        latent_dim=latent_dim)
    agent_exp = ExpAgentSimple(latent_dim, context_encoder, policy_exp,
                               **variant['algo_params'])
    algorithm = ExpSACSimple(env=env,
                             train_tasks=list(
                                 tasks[:variant['n_train_tasks']]),
                             eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
                             nets=[agent, qf1, qf2, vf],
                             nets_exp=[agent_exp, qf1_exp, qf2_exp, vf_exp],
                             encoder=context_encoder,
                             **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        # NOTE(review): assumes the target vf sits at index -6 of
        # algorithm.networks — confirm against ExpSACSimple.networks ordering.
        algorithm.networks[-6].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        device = torch.device('cuda:0')
        print(device)
        algorithm.to(device)
        context_encoder.to(device)

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
# NOTE(review): this chunk starts mid-dict — the opening `variant = dict(` and
# the enclosing `algorithm_kwargs=dict(` live above this view.
        num_eval_steps_per_epoch=500 * 5,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=500,
        batch_size=256,
    ),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
    # Robosuite environment configuration (state-based, no rendering).
    env_kwargs=dict(
        robots="Panda",
        has_renderer=False,
        has_offscreen_renderer=False,
        use_camera_obs=False,
        camera_heights=64,
        camera_widths=64,
        reward_shaping=True,
    ),
    env_name="Lift",
)
setup_logger("name-of-experiment", variant=variant)
ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
experiment(variant)
def experiment(exp_specs):
    """Train a convolutional-GRU world model on a partially observed grid maze."""
    ptu.set_gpu_mode(exp_specs['use_gpu'])

    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor,
                                                 exp_specs['episode_length'],
                                                 exp_specs['batch_size'],
                                                 use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor,
        exp_specs['episode_length'],
        exp_specs['batch_size'],
        use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    conv_channels = 32
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels, conv_channels, 4, stride=2, padding=1,
                  bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU())
    gru_channels = 128
    inter_h = 5
    act_channels = 4
    # Project the 4-dim action into a per-pixel action feature map.
    act_proc = nn.Linear(4, act_channels * inter_h * inter_h, bias=True)
    pre_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + conv_channels, conv_channels, 3, stride=1,
                  padding=1, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    gru = ConvGRUCell(conv_channels, gru_channels, 3)
    post_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + gru_channels, conv_channels, 3, stride=1,
                  padding=1, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(conv_channels, conv_channels, 4, stride=2,
                           padding=1, output_padding=0, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False),
        # nn.BatchNorm2d(conv_channels),
        # nn.ReLU(),
        nn.ConvTranspose2d(conv_channels, conv_channels, 4, stride=2,
                           padding=1, output_padding=0, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False),
        # nn.BatchNorm2d(conv_channels),
        # nn.ReLU(),
    )
    mean_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True),
        nn.Sigmoid())
    log_cov_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), )
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        pre_gru_conv.cuda()
        gru.cuda()
        post_gru_conv.cuda()
        conv_decoder.cuda()
        mean_decoder.cuda()
        log_cov_decoder.cuda()
        act_proc.cuda()

    # Optimizer ---------------------------------------------------------------
    # NOTE(review): act_proc's parameters are absent from this list, so the
    # action projection is never optimized — confirm whether that is
    # intentional (it IS toggled by the eval()/train() calls below).
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder,
            mean_decoder, log_cov_decoder
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                # Truncate BPTT: cut the hidden state from the old graph.
                prev_h_batch = prev_h_batch.detach()
            loss = 0
        if iter_num % episode_length == 0:
            # Fresh recurrent state at each episode boundary.
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        act_batch = act_proc(act_batch).view(act_batch.size(0), act_channels,
                                             inter_h, inter_h)
        # Predict the next observation from previous hidden state + action.
        hidden = post_gru_conv(torch.cat([prev_h_batch, act_batch], 1))
        hidden = conv_decoder(hidden)
        recon = mean_decoder(hidden)
        log_cov = log_cov_decoder(hidden)
        log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)
        # Encode the actual observation and advance the recurrent state.
        enc = conv_encoder(obs_batch)
        enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the loss at episode boundaries (state was just re-initialized).
        if iter_num % episode_length != 0:
            loss = loss + (
                (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])
            # loss = loss + compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/onthefly_conv_gru_pogrid_len_8_scale_4/rnn_recon_%d.png'
                % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/onthefly_conv_gru_pogrid_len_8_scale_4/rnn_obs_%d.png'
                % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(
                map(lambda x: x.eval(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))
            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
            losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()
                act_batch = act_proc(act_batch).view(act_batch.size(0),
                                                     act_channels, inter_h,
                                                     inter_h)
                hidden = post_gru_conv(
                    torch.cat([val_prev_h_batch, act_batch], 1))
                hidden = conv_decoder(hidden)
                recon = mean_decoder(hidden)
                log_cov = log_cov_decoder(hidden)
                log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)
                enc = conv_encoder(obs_batch)
                enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
                val_prev_h_batch = gru(enc, val_prev_h_batch)
                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                losses.append('%.4f' % ((obs_batch - recon)**2).mean())
            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)
            list(
                map(lambda x: x.train(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))
# NOTE(review): this chunk starts inside an `experiment(exp_specs)` function
# whose `def` line is above this view — the `return 1` below belongs to it.
algorithm = MetaSoftActorCritic(env_sampler=env_sampler,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
if ptu.gpu_enabled():
    algorithm.cuda()
algorithm.train()
return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--experiment',
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects from the spec file — prefer yaml.safe_load
    # for config files (flagged only; behavior left unchanged here).
    exp_specs = yaml.load(spec_string)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
# Hyper-parameters for HER-TD3 on a goal-conditioned Fetch task.
variant = {
    'algorithm': 'HER-TD3',
    'version': 'normal',
    'algo_kwargs': {
        'batch_size': 256,
        'num_epochs': 100,
        'num_eval_steps_per_epoch': 5000,
        'num_expl_steps_per_train_loop': 1000,
        'num_trains_per_train_loop': 1000,
        'min_num_steps_before_training': 1000,
        'max_path_length': 50,
    },
    'td3_trainer_kwargs': {
        'discount': 0.95,
        'reward_scale': 1,
    },
    'replay_buffer_kwargs': {
        'max_size': int(1E6),
        # equal to k = 4 in HER paper
        'fraction_goals_rollout_goals': 0.2,
        'fraction_goals_env_goals': 0,
    },
    'qf_kwargs': {
        'hidden_sizes': [400, 300],
    },
    'policy_kwargs': {
        'hidden_sizes': [400, 300],
    },
}
setup_logger('her-td3-fetch-experiment', variant=variant)
experiment(variant)
def experiment(exp_specs):
    """Fit a deterministic (obs, act) -> (next_obs, reward) model offline.

    Loads a saved replay buffer, optionally strips/normalizes extra env-info
    observation dims, carves out a validation split, and trains a GenericMap
    with plain MSE; validation losses are logged every ``freq_val`` iters.

    BUG FIX vs. original: the validation residual path padded the reward
    column with ``torch.zeros(train_batch_size, 1)`` while the batch was of
    size ``val_batch_size`` — a shape mismatch whenever the two differ.  It
    now uses ``val_batch_size``.
    """
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Load the data -----------------------------------------------------------
    extra_data_path = exp_specs['extra_data_path']
    train_replay_buffer = joblib.load(extra_data_path)['replay_buffer']
    train_replay_buffer.change_max_size_to_cur_size()
    # Next observations never keep the extra env-info dims.
    train_replay_buffer._next_obs = \
        train_replay_buffer._next_obs[:, exp_specs['extra_obs_dim']:]
    if exp_specs['remove_env_info']:
        train_replay_buffer._observations = \
            train_replay_buffer._observations[:, exp_specs['extra_obs_dim']:]
    else:
        if exp_specs['normalize_env_info']:
            # Rescale the env-info dims into roughly [-1, 1].
            low, high = exp_specs['env_info_range'][0], exp_specs['env_info_range'][1]
            train_replay_buffer._observations[:, :exp_specs['extra_obs_dim']] -= (low + high) / 2.0
            train_replay_buffer._observations[:, :exp_specs['extra_obs_dim']] /= (high - low) / 2.0

    print('\nRewards: {} +/- {}'.format(
        np.mean(train_replay_buffer._rewards),
        np.std(train_replay_buffer._rewards)
    ))
    next_obs_mean = np.mean(train_replay_buffer._next_obs, 0)
    next_obs_std = np.std(train_replay_buffer._next_obs, 0)
    print('\nNext Obs:\n{}\n+/-\n{}'.format(
        next_obs_mean,
        next_obs_std
    ))
    print('\nAvg Next Obs Square Norm: {}'.format(
        np.mean(np.linalg.norm(train_replay_buffer._next_obs, axis=1)**2)
    ))

    sample_batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
    obs_dim = sample_batch['observations'].shape[-1]
    act_dim = sample_batch['actions'].shape[-1]

    # Carve the validation set out of the training buffer.
    val_replay_buffer = SimpleReplayBuffer(exp_specs['val_set_size'], obs_dim, act_dim)
    val_replay_buffer.set_buffer_from_dict(
        train_replay_buffer.sample_and_remove(exp_specs['val_set_size'])
    )
    if exp_specs['train_from_beginning_transitions']:
        # Take the chronologically first transitions rather than sampling.
        trans_dict = dict(
            observations=train_replay_buffer._observations[:exp_specs['train_set_size']],
            actions=train_replay_buffer._actions[:exp_specs['train_set_size']],
            rewards=train_replay_buffer._rewards[:exp_specs['train_set_size']],
            terminals=train_replay_buffer._terminals[:exp_specs['train_set_size']],
            next_observations=train_replay_buffer._next_obs[:exp_specs['train_set_size']],
        )
        train_replay_buffer.set_buffer_from_dict(trans_dict)
    else:
        train_replay_buffer.set_buffer_from_dict(
            train_replay_buffer.sample_and_remove(exp_specs['train_set_size'])
        )

    # Model Definitions -------------------------------------------------------
    if exp_specs['remove_env_info']:
        output_dim = [obs_dim + 1]  # next_obs + reward
    else:
        output_dim = [obs_dim - exp_specs['extra_obs_dim'] + 1]
    model = GenericMap(
        [obs_dim + act_dim],
        output_dim,
        siamese_input=False,
        siamese_output=False,
        num_hidden_layers=exp_specs['num_hidden_layers'],
        hidden_dim=exp_specs['hidden_dim'],
        act='relu',
        use_bn=True,
        deterministic=True
    )
    model_optim = Adam(model.parameters(), lr=float(exp_specs['lr']))

    # Train -------------------------------------------------------------------
    model.train()
    for iter_num in range(exp_specs['max_iters']):
        model_optim.zero_grad()
        batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
        batch = convert_numpy_dict_to_pytorch(batch)
        inputs = Variable(torch.cat([batch['observations'], batch['actions']], -1))
        outputs = Variable(torch.cat([batch['next_observations'], batch['rewards']], -1))
        preds = model([inputs])[0]
        if exp_specs['residual']:
            # residual for observations (the reward column gets no residual,
            # hence the zeros)
            preds = preds + Variable(
                torch.cat(
                    [
                        batch['observations'][:, exp_specs['extra_obs_dim']:],
                        torch.zeros(exp_specs['train_batch_size'], 1)
                    ],
                    1)
            )
        loss = torch.mean(torch.sum((outputs - preds)**2, -1))
        loss.backward()
        model_optim.step()

        if iter_num % exp_specs['freq_val'] == 0:
            model.eval()
            val_batch = val_replay_buffer.random_batch(exp_specs['val_batch_size'])
            val_batch = convert_numpy_dict_to_pytorch(val_batch)
            inputs = Variable(torch.cat([val_batch['observations'], val_batch['actions']], -1))
            outputs = Variable(torch.cat([val_batch['next_observations'], val_batch['rewards']], -1))
            preds = model([inputs])[0]
            if exp_specs['residual']:
                # residual for observations — FIX: the zero column must match
                # the *validation* batch size (was train_batch_size).
                preds = preds + Variable(
                    torch.cat(
                        [
                            val_batch['observations'][:, exp_specs['extra_obs_dim']:],
                            torch.zeros(exp_specs['val_batch_size'], 1)
                        ],
                        1)
                )
            loss = torch.mean(torch.sum((outputs - preds)**2, -1))
            next_obs_loss = torch.mean(torch.sum((outputs[:, :-1] - preds[:, :-1])**2, -1))
            rew_loss = torch.mean(torch.sum((outputs[:, -1:] - preds[:, -1:])**2, -1))

            print('\n')
            print('-' * 20)
            logger.record_tabular('Iter', iter_num)
            logger.record_tabular('Loss', loss.data[0])
            logger.record_tabular('Obs Loss', next_obs_loss.data[0])
            logger.record_tabular('Rew Loss', rew_loss.data[0])
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
            model.train()
if __name__ == "__main__":
    # Command-line entry point for resuming training from a saved snapshot.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "exp_dir",
        type=str,
        help="Experiment directory to load params and append logs")
    parser.add_argument('start_epoch',
                        type=int,
                        help="Start epoch for continue training logs")
    parser.add_argument("--params_fname", default="params.pkl", type=str)
    parser.add_argument('--gui', action='store_true')
    parser.add_argument('--no_gpu', action='store_true')
    args = parser.parse_args()

    # Patch the stored variant with the resume epoch and render mode.
    variant = load_variant(args.exp_dir)
    variant["start_epoch"] = args.start_epoch
    variant['headless'] = not args.gui

    if not args.no_gpu:
        ptu.enable_gpus("0")
        ptu.set_gpu_mode(True)

    params_data = load_params(os.path.join(args.exp_dir, args.params_fname))
    setup_logger(log_dir=args.exp_dir, variant=variant)
    experiment(variant, params_data)
def experiment(exp_specs):
    """Train a recurrent (LSTM) reconstruction model on on-the-fly maze data."""
    ptu.set_gpu_mode(exp_specs['use_gpu'])

    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor,
                                                 exp_specs['episode_length'],
                                                 exp_specs['batch_size'],
                                                 use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor,
        exp_specs['episode_length'],
        exp_specs['batch_size'],
        use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    model = RecurrentModel()
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                # Truncate BPTT: detach the LSTM state from the old graph.
                prev_h_batch = prev_h_batch.detach()
                prev_c_batch = prev_c_batch.detach()
            loss = 0
        if iter_num % episode_length == 0:
            # Fresh LSTM state at each episode boundary.
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
                prev_c_batch = prev_c_batch.cuda()
            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        recon, log_cov, prev_h_batch, prev_c_batch = model.forward(
            obs_batch, act_batch, prev_h_batch, prev_c_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the loss at episode boundaries (state was just re-initialized).
        if iter_num % episode_length != 0:
            # temp = (obs_batch - recon)**2 / 4.
            # temp[:,:,1:4,1:4] = temp[:,:,1:4,1:4] * 4.
            temp = (obs_batch - recon)**2
            loss = loss + temp.sum() / float(
                exp_specs['batch_size']) + model.reg_loss
            # loss = loss - compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/recurrent_deconv_stronger_2/rnn_recon_%d.png' %
                iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/recurrent_deconv_stronger_2/rnn_obs_%d.png' %
                iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            model.eval()
            # print(mask[0], torch.mean(mask, 1), torch.std(mask, 1), torch.min(mask, 1), torch.max(mask, 1))
            print('\nValidating Iter %d...' % iter_num)
            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            val_prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
                val_prev_c_batch = val_prev_c_batch.cuda()
            losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()
                recon, log_cov, val_prev_h_batch, val_prev_c_batch = model.forward(
                    obs_batch, act_batch, val_prev_h_batch, val_prev_c_batch)
                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                losses.append('%.4f' % ((obs_batch - recon)**2).mean())
            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)
            model.train()
def experiment(variant):
    """Build and train a hierarchical A2C agent (a LearnPlanPolicy wrapping
    a CNN-based WrappedPolicy) on vectorised environments.

    NOTE(review): the "missing: ..." / "added: ..." comments below track
    divergence from a reference implementation; treat them as TODOs rather
    than documentation of finished behavior.

    :param variant: dict of experiment settings; must provide env_name,
        seed, num_processes, gamma, log_dir, recurrent_policy, render,
        num_steps, replay_buffer_size, algorithm_kwargs and trainer_kwargs.
    """
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    # Per-run log directories; evaluation gets its own suffixed directory.
    log_dir = os.path.expanduser(variant["log_dir"])
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # missing - set torch seed and num threads=1
    # expl_env = gym.make(variant["env_name"])
    expl_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        variant["gamma"],
        variant["log_dir"],  # probably change this?
        ptu.device,
        False,
        pytorch=False,
    )
    # eval_env = gym.make(variant["env_name"])
    eval_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        variant["gamma"],
        variant["log_dir"],
        ptu.device,
        False,
        pytorch=False,
    )

    # Observation image is unpacked channel-first below
    # (channels, width, height).
    obs_shape = expl_envs.observation_space.image.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
    #     # convert WxHxC into CxWxH
    #     expl_env = TransposeImage(expl_env, op=[2, 0, 1])
    #     eval_env = TransposeImage(eval_env, op=[2, 0, 1])
    # obs_shape = expl_env.observation_space.shape
    channels, obs_width, obs_height = obs_shape
    action_space = expl_envs.action_space

    # Override the env action space with a 10-dim unbounded Box, matching
    # the policy heads built below (4 Bernoulli dims + 6 Gaussian dims).
    action_space = gym.spaces.Box(-np.inf, np.inf, (10, ))
    expl_envs.action_space = action_space
    # not sure if this works... lets see?!
    eval_envs.action_space = action_space

    base_kwargs = {
        "num_inputs": channels,
        "recurrent": variant["recurrent_policy"]
    }
    base = CNNBase(**base_kwargs)

    # Mixed action distribution: 4 Bernoulli dims + 6 continuous dims,
    # generated from the shared CNN base's output features.
    bernoulli_dist = distributions.Bernoulli(base.output_size, 4)
    continuous_dist = distributions.DiagGaussian(base.output_size, 6)
    dist = distributions.DistributionGeneratorTuple(
        (bernoulli_dist, continuous_dist))

    # Evaluation policy acts deterministically; the exploration policy
    # samples. Both share the same base network and distribution heads.
    eval_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=True,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )

    expl_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=False,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )

    # missing: at this stage, policy hasn't been sent to device, but happens later

    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

    # The trainer optimises only the low-level learner inside the
    # hierarchical exploration policy.
    trainer = A2CTrainer(actor_critic=expl_policy.learner,
                         **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )
    algorithm.to(ptu.device)
    # missing: device back in sync
    algorithm.train()
def experiment(exp_specs):
    """Train an AttentiveVAE on a 2-digit multi-MNIST dataset.

    Optimises the negative ELBO plus a mask-mean penalty, and periodically
    evaluates on a random test batch, saving input/reconstruction/mask
    images under ``path``.

    :param exp_specs: dict with keys use_gpu, exp_id, exp_name, seed,
        vae_specs (nested architecture specs), model_lr, model_wd, epochs,
        batch_size, freq_val.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])

    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    path = 'junk_vis/debug_att_vae_shallower_48_64_dim_0p1_kl_stronger_seg_conv'
    (X_train, Y_train), (X_test, Y_test) = multi_mnist(path,
                                                       max_digits=2,
                                                       canvas_size=48,
                                                       seed=42,
                                                       use_max=False)
    # Encode the per-canvas digit count as a 2-slot presence vector.
    convert_dict = {0: [0., 0.], 1: [1., 0.], 2: [1., 1.]}
    Num_train = np.array([convert_dict[a.shape[0]] for a in Y_train])
    Num_test = np.array([convert_dict[a.shape[0]] for a in Y_test])

    # Add a channel dimension and rescale pixels to [0, 1].
    X_train = X_train[:, None, ...]
    X_test = X_test[:, None, ...]
    X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor(
        X_test) / 255.0
    mask_train, mask_test = torch.FloatTensor(Num_train), torch.FloatTensor(
        Num_test)

    # BUG FIX: TensorDataset requires tensors, but the original passed the
    # raw numpy arrays (Num_train / Num_test) — crashing at construction —
    # while the tensor versions built above went unused. Use the tensors.
    # NOTE(review): the validation branch below still feeds numpy rows
    # (Num_test[idxs]) to the model; presumably the model accepts either —
    # confirm.
    train_ds = TensorDataset(X_train, mask_train)
    val_ds = TensorDataset(X_test, mask_test)

    # Model Definition --------------------------------------------------------
    model = AttentiveVAE([1, 48, 48], exp_specs['vae_specs']['z_dim'],
                         exp_specs['vae_specs']['x_encoder_specs'],
                         exp_specs['vae_specs']['z_seg_conv_specs'],
                         exp_specs['vae_specs']['z_seg_fc_specs'],
                         exp_specs['vae_specs']['z_obj_conv_specs'],
                         exp_specs['vae_specs']['z_obj_fc_specs'],
                         exp_specs['vae_specs']['z_seg_recon_fc_specs'],
                         exp_specs['vae_specs']['z_seg_recon_upconv_specs'],
                         exp_specs['vae_specs']['z_obj_recon_fc_specs'],
                         exp_specs['vae_specs']['z_obj_recon_upconv_specs'],
                         exp_specs['vae_specs']['recon_upconv_part_specs'])
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    global_iter = 0
    for epoch in range(exp_specs['epochs']):
        train_loader = DataLoader(train_ds,
                                  batch_size=exp_specs['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=False,
                                  drop_last=True)
        for iter_num, img_batch in enumerate(train_loader):
            img_batch, num_batch = img_batch[0], img_batch[1]
            if ptu.gpu_enabled():
                img_batch = img_batch.cuda()

            what_means, what_log_covs, where_means, where_log_covs, masks, recon_mean, recon_log_cov = model(
                img_batch, num_batch)
            elbo, KL = model.compute_ELBO(what_means + where_means,
                                          what_log_covs + where_log_covs,
                                          recon_mean,
                                          recon_log_cov,
                                          img_batch,
                                          average_over_batch=True)
            # Maximise the ELBO, plus an L1-style penalty on the attention
            # mask means.
            loss = -1. * elbo
            loss = loss + 1. * sum([m.mean() for m in masks])

            # BUG FIX: clear gradients from the previous minibatch; the
            # original never called zero_grad(), so gradients accumulated
            # across all iterations.
            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            if global_iter % exp_specs['freq_val'] == 0:
                with torch.no_grad():
                    print('\nValidating Iter %d...' % global_iter)
                    model.eval()

                    # Evaluate on a random test batch.
                    idxs = np.random.choice(int(X_test.size(0)),
                                            size=exp_specs['batch_size'],
                                            replace=False)
                    img_batch, num_batch = X_test[idxs], Num_test[idxs]
                    if ptu.gpu_enabled():
                        img_batch = img_batch.cuda()

                    what_means, what_log_covs, where_means, where_log_covs, masks, recon_mean, recon_log_cov = model(
                        img_batch, num_batch)
                    elbo, KL = model.compute_ELBO(
                        what_means + where_means,
                        what_log_covs + where_log_covs,
                        recon_mean,
                        recon_log_cov,
                        img_batch,
                        average_over_batch=True)
                    mse = ((recon_mean - img_batch)**2).mean()

                    print('ELBO:\t%.4f' % elbo)
                    print('MSE:\t%.4f' % mse)
                    print('KL:\t%.4f' % KL)

                    # Save a sample image triple (input / recon / mask 0).
                    for i in range(1):
                        save_pytorch_tensor_as_img(
                            img_batch[i].data.cpu(),
                            os.path.join(path,
                                         '%d_%d_img.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            recon_mean[i].data.cpu(),
                            os.path.join(path,
                                         '%d_%d_recon.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            masks[0][i].data.cpu(),
                            os.path.join(path,
                                         '%d_%d_mask_0.png' % (global_iter, i)))
                        # save_pytorch_tensor_as_img(masks[1][i].data.cpu(), os.path.join(path, '%d_%d_mask_1.png'%(global_iter, i)))

                    model.train()

            global_iter += 1
if __name__ == "__main__": # noinspection PyTypeChecker T = 2048 max_ep_len = 1000 epochs = 10 minibatch_size = 64 variant = dict( algorithm="PPO", version="normal", layer_size=64, replay_buffer_size=T, algorithm_kwargs=dict( num_iter=int(1e6 // T), num_eval_steps_per_epoch=max_ep_len, num_trains_per_train_loop=T // minibatch_size * epochs, num_expl_steps_per_train_loop=T, min_num_steps_before_training=0, max_path_length=max_ep_len, minibatch_size=minibatch_size, ), trainer_kwargs=dict( epsilon=0.2, reward_scale=1.0, lr=3e-4, ), ) setup_logger('PPOBipedalWalkerV2', variant=variant) #ptu.set_gpu_mode(True) # optionally set the GPU (default=False) experiment(variant)
qf_lr=args.qf_lr, reward_scale=1, # BEAR specific params mode='auto', kernel_choice=args.kernel_type, policy_update_style='0', mmd_sigma=args.mmd_sigma, target_mmd_thresh=args.target_mmd_thresh, ), ) setup_logger( exp_prefix='bear-' + args.env, variant=variant, text_log_file="debug.log", variant_log_file="variant.json", tabular_log_file="progress.csv", snapshot_mode="gap_and_last", snapshot_gap=100, log_tabular_only=False, log_dir=None, git_infos=None, script_name=None, # **create_log_dir_kwargs base_log_dir='./data', exp_id=9999, seed=0) ptu.set_gpu_mode(True) # optionally set the GPU (default=False) experiment(variant)
sup_lr=(args.lr if args.lr else 1e-3), ), load_kwargs=dict( load=args.load, load_dir=log_dir, ), ) if args.load: log_dir = log_dir + '_load' import os if not os.path.isdir(log_dir): os.makedirs(log_dir) with open(osp.join(log_dir, 'variant.json'), 'w') as out_json: import json json.dump(variant, out_json, indent=2) import sys cmd_input = 'python ' + ' '.join(sys.argv) + '\n' with open(osp.join(log_dir, 'cmd_input.txt'), 'a') as f: f.write(cmd_input) setup_logger(args.exp_name + '/' + main_dir, variant=variant, snapshot_mode=args.snapshot_mode, snapshot_gap=args.snapshot_gap, log_dir=log_dir) import numpy as np import torch np.random.seed(args.seed) torch.manual_seed(args.seed) # ptu.set_gpu_mode(True) # optionally set the GPU (default=False) experiment(variant)
def experiment(variant, args): # expl_env = NormalizedBoxEnv(gym.make(str(args.env))) # eval_env = NormalizedBoxEnv(gym.make(str(args.env))) print(os.getpid()) ptu.set_gpu_mode(True) # optionally set the GPU (default=False) set_seed(args.seed) setup_logger('DIAYN_' + str(args.skill_dim) + '_' + args.env + str(args.seed), variant=variant, snapshot_mode="last") expl_env = NormalizedBoxEnv(Mani2dEnv()) eval_env = NormalizedBoxEnv(Mani2dEnv()) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size skill_dim = args.skill_dim M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) df = FlattenMlp( input_size=obs_dim, output_size=skill_dim, hidden_sizes=[M, M], ) policy = SkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim, action_dim=action_dim, hidden_sizes=[M, M], skill_dim=skill_dim) eval_policy = MakeDeterministic(policy) eval_path_collector = DIAYNMdpPathCollector( eval_env, eval_policy, ) expl_step_collector = MdpStepCollector( expl_env, policy, ) replay_buffer = DIAYNEnvReplayBuffer( variant['replay_buffer_size'], expl_env, skill_dim, ) trainer = DIAYNTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, df=df, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = DIAYNTorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_step_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()