def _do_reward_training(self):
    '''
        Train the discriminator
    '''
    self.disc_optimizer.zero_grad()

    expert_batch = self.get_expert_batch(self.disc_optim_batch_size)
    expert_obs = expert_batch['observations']
    expert_actions = expert_batch['actions']

    policy_batch = self.get_policy_batch(self.disc_optim_batch_size)
    policy_obs = policy_batch['observations']
    policy_actions = policy_batch['actions']

    obs = torch.cat([expert_obs, policy_obs], dim=0)
    actions = torch.cat([expert_actions, policy_actions], dim=0)

    disc_logits = self.discriminator(obs, actions)
    disc_preds = (disc_logits > 0).type(torch.FloatTensor)
    disc_loss = self.bce(disc_logits, self.bce_targets)
    accuracy = (disc_preds == self.bce_targets).type(torch.FloatTensor).mean()

    if self.use_grad_pen:
        eps = Variable(torch.rand(self.disc_optim_batch_size, 1))
        if ptu.gpu_enabled():
            eps = eps.cuda()

        # interpolate between expert and policy samples; detach so the
        # interpolation itself is a leaf, then require grad for the penalty
        interp_obs = eps * expert_obs + (1 - eps) * policy_obs
        interp_obs = interp_obs.detach()
        interp_obs.requires_grad = True
        interp_actions = eps * expert_actions + (1 - eps) * policy_actions
        interp_actions = interp_actions.detach()
        interp_actions.requires_grad = True
        gradients = autograd.grad(
            outputs=self.discriminator(interp_obs, interp_actions).sum(),
            inputs=[interp_obs, interp_actions],
            # grad_outputs=torch.ones(exp_specs['batch_size'], 1).cuda(),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )
        total_grad = torch.cat([gradients[0], gradients[1]], dim=1)
        gradient_penalty = ((total_grad.norm(2, dim=1) - 1)**2).mean()

        disc_loss = disc_loss + gradient_penalty * self.grad_pen_weight

    disc_loss.backward()
    self.disc_optimizer.step()

    """
    Save some statistics for eval
    """
    if self.rewardf_eval_statistics is None:
        """
        Eval should set this to None.
        This way, these statistics are only computed for one batch.
        """
        self.rewardf_eval_statistics = OrderedDict()
        self.rewardf_eval_statistics['Disc Loss'] = np.mean(ptu.get_numpy(disc_loss))
        self.rewardf_eval_statistics['Disc Acc'] = np.mean(ptu.get_numpy(accuracy))
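# The gradient penalty above follows the WGAN-GP recipe: sample points on the
# line between expert and policy samples and push the discriminator's gradient
# norm at those points toward 1. A minimal standalone sketch of that idea is
# below; `disc` is a hypothetical discriminator taking (obs, actions), and the
# tensors are placeholders rather than anything from this codebase.
import torch
import torch.autograd as autograd


def gradient_penalty_sketch(disc, expert_obs, expert_acts, policy_obs, policy_acts):
    eps = torch.rand(expert_obs.size(0), 1, device=expert_obs.device)
    interp_obs = (eps * expert_obs + (1 - eps) * policy_obs).detach().requires_grad_(True)
    interp_acts = (eps * expert_acts + (1 - eps) * policy_acts).detach().requires_grad_(True)
    grads = autograd.grad(
        outputs=disc(interp_obs, interp_acts).sum(),
        inputs=[interp_obs, interp_acts],
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )
    total_grad = torch.cat(grads, dim=1)
    # (||grad|| - 1)^2, averaged over the batch
    return ((total_grad.norm(2, dim=1) - 1) ** 2).mean()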
def experiment(variant):
    env_sampler = MazeSampler(variant['env_specs'])
    env, _ = env_sampler()

    if variant['conv_input']:
        qf = ConvNet(
            kernel_sizes=variant['kernel_sizes'],
            num_channels=variant['num_channels'],
            strides=variant['strides'],
            paddings=variant['paddings'],
            hidden_sizes=variant['hidden_sizes'],
            input_size=env.observation_space.shape,
            output_size=env.action_space.n,
        )
    else:
        qf = Mlp(
            hidden_sizes=[variant['net_size'] for _ in range(variant['num_layers'])],
            input_size=int(np.prod(env.observation_space.shape)),
            output_size=env.action_space.n,
        )
    qf_criterion = nn.MSELoss()
    # Use this to switch to DoubleDQN
    # algorithm = DoubleDQN(
    algorithm = MetaDQN(
        env_sampler=env_sampler,
        qf=qf,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    expert_buffer = joblib.load(variant['exp_xy_data_path'])['xy_data']
    policy_buffer = joblib.load(variant['pol_xy_data_path'])['xy_data']

    # set up the discriminator models
    if variant['threeway']:
        disc_model_class = ThreeWayResNetAIRLDisc
    else:
        disc_model_class = ResNetAIRLDisc if variant['use_resnet_disc'] else StandardAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'],
    )
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AIRL algorithm
    alg_class = ThreeWayFixedDistDiscTrainAlg if variant['threeway'] else FixedDistDiscTrainAlg
    algorithm = alg_class(
        disc_model,
        expert_buffer,
        policy_buffer,
        **variant['algo_params']
    )
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
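# The SAC experiment above only reads `variant['net_size']` and
# `variant['algo_params']`. A minimal sketch of a compatible variant dict is
# shown below; the values and the keys inside `algo_params` are assumptions
# about what this SoftActorCritic implementation accepts, not settings taken
# from this repo.
example_sac_variant = dict(
    net_size=300,
    algo_params=dict(
        num_epochs=100,
        num_steps_per_epoch=1000,
        max_path_length=1000,
        batch_size=128,
        discount=0.99,
    ),
)
# experiment(example_sac_variant)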
def __init__(
        self,
        train_dataset,
        test_dataset,
        model,
        batch_size=128,
        log_interval=0,
        beta=0.5,
        beta_schedule=None,
        imsize=84,
        lr=1e-3,
        do_scatterplot=False,
        normalize=False,
        state_sim_debug=False,
        mse_weight=0.1,
        is_auto_encoder=False,
        lmbda=0.5,
        mu=1,
        gamma=0.2,
):
    self.log_interval = log_interval
    self.batch_size = batch_size
    self.beta = beta
    if is_auto_encoder:
        self.beta = 0
    self.beta_schedule = beta_schedule
    if self.beta_schedule is None:
        self.beta_schedule = ConstantSchedule(self.beta)
    self.imsize = imsize
    self.do_scatterplot = do_scatterplot
    self.lmbda = lmbda
    self.mu = mu
    self.gamma = gamma

    """
    I think it's a bit nicer if the caller makes this call, i.e.
    ```
    m = ConvVAE(representation_size)
    if ptu.gpu_enabled():
        m.cuda()
    t = ConvVAETrainer(train_data, test_data, m)
    ```
    However, I'll leave this here for backwards-compatibility.
    """
    if ptu.gpu_enabled():
        model.cuda()

    self.model = model
    self.representation_size = model.representation_size
    self.input_channels = model.input_channels
    self.imlength = model.imlength

    self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
    self.train_dataset, self.test_dataset = train_dataset, test_dataset
    self.normalize = normalize
    self.state_sim_debug = state_sim_debug
    self.mse_weight = mse_weight
    self.x_next_index = self.input_channels * self.imsize**2

    if self.normalize:
        self.train_data_mean = np.mean(self.train_dataset, axis=0)
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(user_variant):
    variant = default_variant.copy()
    variant.update(user_variant)

    if ptu.gpu_enabled():
        enable_gpus("0")

    env_id = variant["env"]
    env = build_env(env_id)

    agent_configs = variant["agent_configs"]
    agent = build_agent(env, env_id, agent_configs)
    agent.visualize = variant["visualize"]
    model_file = variant.get("model_file")
    # only load when a non-empty path is given (avoids `is not ""` identity check)
    if model_file:
        agent.load_model(model_file)

    log_dir = logger.get_snapshot_dir()
    if variant["train"]:
        agent.train(
            max_iter=variant["max_iter"],
            test_episodes=variant["test_episodes"],
            output_dir=log_dir,
            output_iters=variant["output_iters"],
        )
    else:
        agent.eval(num_episodes=variant["test_episodes"])

    return
def __init__(
        self,
        discriminator,
        exp_data,
        pol_data,
        disc_optim_batch_size=1024,
        num_update_loops_per_train_call=1,
        num_disc_updates_per_loop_iter=1,
        disc_lr=1e-3,
        disc_momentum=0.0,
        disc_optimizer_class=optim.Adam,
        use_grad_pen=True,
        grad_pen_weight=10,
        train_objective='airl',
):
    assert disc_lr != 1e-3, 'Just checking that this is being taken from the spec file'

    self.exp_data, self.pol_data = exp_data, pol_data

    self.discriminator = discriminator
    self.rewardf_eval_statistics = None
    self.disc_optimizer = disc_optimizer_class(
        self.discriminator.parameters(),
        lr=disc_lr,
        betas=(disc_momentum, 0.999),
    )
    print('\n\nDISC MOMENTUM: %f\n\n' % disc_momentum)

    self.disc_optim_batch_size = disc_optim_batch_size

    assert train_objective in ['airl', 'fairl', 'gail', 'w1']
    self.train_objective = train_objective

    self.bce = nn.BCEWithLogitsLoss()
    target_batch_size = self.disc_optim_batch_size
    self.bce_targets = torch.cat(
        [
            torch.ones(target_batch_size, 1),
            torch.zeros(target_batch_size, 1),
        ],
        dim=0,
    )
    self.bce_targets = Variable(self.bce_targets)
    if ptu.gpu_enabled():
        self.bce.cuda()
        self.bce_targets = self.bce_targets.cuda()

    self.use_grad_pen = use_grad_pen
    self.grad_pen_weight = grad_pen_weight

    self.num_update_loops_per_train_call = num_update_loops_per_train_call
    self.num_disc_updates_per_loop_iter = num_disc_updates_per_loop_iter

    # fixed grid of x-y points in [-d, d]^2, used for evaluating the reward
    d = 5.0
    self._d = d
    self._d_len = np.arange(-d, d + 0.25, 0.25).shape[0]
    self.xy_var = []
    for i in np.arange(-d, d + 0.25, 0.25):
        for j in np.arange(-d, d + 0.25, 0.25):
            self.xy_var.append([float(i), float(j)])
    self.xy_var = np.array(self.xy_var)
    self.xy_var = Variable(ptu.from_numpy(self.xy_var), requires_grad=False)
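# Note that the constructor above asserts `disc_lr != 1e-3` to force the
# learning rate to come from a spec file rather than the default. A minimal
# usage sketch is given below (commented out); `my_disc`, the data arrays,
# and all hyperparameter values are placeholders, not values from this repo.
# trainer = FixedDistDiscTrainAlg(
#     discriminator=my_disc,
#     exp_data=expert_xy_array,
#     pol_data=policy_xy_array,
#     disc_lr=3e-4,            # must differ from the 1e-3 default
#     disc_momentum=0.5,
#     disc_optim_batch_size=1024,
#     use_grad_pen=True,
#     grad_pen_weight=10,
#     train_objective='airl',
# )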
def experiment(variant):
    # make the disc model
    z_dim = variant['algo_params']['z_dim']

    # make the MLP
    # hidden_sizes = [variant['algo_params']['mlp_hid_dim']] * variant['algo_params']['mlp_layers']
    # mlp = Mlp(
    #     hidden_sizes,
    #     output_size=1,
    #     input_size=48 + z_dim,
    #     batch_norm=variant['algo_params']['mlp_use_bn']
    # )

    algorithm = FetchShapeTaskDesign(
        # mlp,
        **variant['algo_params']
    )

    # for _ in range(100):
    #     # print(algorithm._get_any())
    #     # print(algorithm._get_except(0,1))
    #     img = algorithm._get_image_without_object(1, 2)
    #     print('-------')
    #     print(img[:6])
    #     print(img[6:12])
    #     print(img[12:18])
    #     print(img[18:24])
    # 1/0

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_sampler = MazeSampler(env_specs)
    sample_env, _ = env_sampler()

    meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    if isinstance(sample_env.action_space, Discrete):
        action_dim = int(sample_env.action_space.n)
    else:
        action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=action_dim,
    )
    policy = DiscreteQWrapperPolicy(qf)
    algorithm = MetaSoftQLearning(
        env_sampler=env_sampler,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    # assert False, "Have not added new sac yet!"
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    from rlkit.core import logger
    import rlkit.torch.pytorch_util as ptu

    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data, info = generate_vae_dataset(**variant['get_data_kwargs'])
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        beta_schedule = PiecewiseLinearSchedule(**variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(
        representation_size,
        input_channels=3,
        **variant['conv_vae_kwargs']
    )
    if ptu.gpu_enabled():
        m.to(ptu.device)
    t = ConvVAETrainer(
        train_data,
        test_data,
        m,
        beta=beta,
        beta_schedule=beta_schedule,
        **variant['algo_kwargs']
    )
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(
            epoch,
            save_reconstruction=should_save_imgs,
            save_scatterplot=should_save_imgs,
        )
        if should_save_imgs:
            t.dump_samples(epoch)
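# A minimal sketch of a variant dict for the VAE experiment above. The keys
# mirror what the function reads; the values and the contents of the nested
# kwargs dicts are assumptions about what generate_vae_dataset, ConvVAE, and
# ConvVAETrainer accept in this codebase, not settings taken from it.
example_vae_variant = dict(
    beta=0.5,
    representation_size=16,
    num_epochs=100,
    save_period=10,
    get_data_kwargs=dict(),
    conv_vae_kwargs=dict(),
    algo_kwargs=dict(batch_size=128, lr=1e-3),
)
# experiment(example_vae_variant)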
def forward(self, obs_batch, act_batch, prev_h_batch, prev_c_batch):
    lstm_act_proc = self.lstm_act_proc_fc(act_batch)
    recon_act_proc = self.recon_act_proc_fc(act_batch)

    batch_size = obs_batch.size(0)
    att_prev_h_batch = Variable(torch.zeros(batch_size, self.flat_inter_img_dim))
    att_prev_c_batch = Variable(torch.zeros(batch_size, self.flat_inter_img_dim))
    if ptu.gpu_enabled():
        att_prev_h_batch = att_prev_h_batch.cuda()
        att_prev_c_batch = att_prev_c_batch.cuda()

    self.reg_loss = 0.
    att_input = torch.cat([prev_h_batch, recon_act_proc], 1)
    for _ in range(4):
        att_prev_h_batch, att_prev_c_batch = self.attention_lstm(
            att_input, (att_prev_h_batch, att_prev_c_batch))

    hidden = att_prev_h_batch.view(
        obs_batch.size(0), self.conv_channels, self.img_h, self.img_h)
    hidden = self.conv_decoder(hidden)
    recon = self.mean_decoder(hidden)
    log_cov = self.log_cov_decoder(hidden)
    log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

    enc = self.conv_encoder(obs_batch)
    enc = enc.view(obs_batch.size(0), -1)
    enc = self.fc_encoder(torch.cat([enc, lstm_act_proc], 1))
    prev_h_batch, prev_c_batch = self.lstm(enc, (prev_h_batch, prev_c_batch))

    return recon, log_cov, prev_h_batch, prev_c_batch
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    task_enc = encoder_model(
        hidden_sizes=[200, 200, 200],  # deeper net + higher dim space generalize better
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params']
    )

    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
def experiment(variant):
    algorithm = joblib.load(variant['ckpt_path'])['algorithm']
    if ptu.gpu_enabled():
        algorithm.cuda()

    tuner = FetchTuner(algorithm, **variant['algo_params'])
    tuner.train()

    return 1
def experiment(variant):
    task_mode = variant['task_mode']  # train, test, eval
    task_idx = variant['task_idx']

    if task_mode == 'train':
        task_sampler = WalkerTrainParamsSampler()
    elif task_mode == 'test':
        task_sampler = WalkerTestParamsSampler()
    else:
        raise NotImplementedError()
    task_params = task_sampler.get_task(task_idx)
    obs_task_params = task_sampler.get_obs_task_params(task_params)

    env = SingleTaskWalkerEnv(task_params, obs_task_params)
    training_env = SingleTaskWalkerEnv(task_params, obs_task_params)

    print(env.observation_space)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    print('Using simple model')
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    for k, v in env_specs.items():
        if isinstance(v, list):
            env_specs_vg.add(k, v)
        else:
            env_spec_constants[k] = v

    env_specs_list = []
    for es in env_specs_vg.variants():
        del es['_hidden_keys']
        es.update(env_spec_constants)
        env_specs_list.append(es)
    print(env_specs_list)
    print(env_specs_list[0])

    env_sampler = EnvSampler(env_specs_list)

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + meta_params_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + meta_params_dim,
        action_dim=action_dim,
    )
    algorithm = MetaSoftActorCritic(
        env_sampler=env_sampler,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def preload(self, batch_size):
    try:
        self.batch = self.random_batch(batch_size)
    except StopIteration:
        self.batch = None
        return
    if ptu.gpu_enabled():
        # with torch.cuda.stream(self.stream):
        for k in self.batch:
            self.batch[k] = self.batch[k].to(device=ptu.device, non_blocking=True)
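# preload() above moves a CPU batch to the GPU with non_blocking=True, which
# only overlaps with compute when the source tensors live in pinned memory
# (see the pin_memory() calls in the replay buffer __init__). A sketch of one
# plausible usage pattern is below; `buffer` and `train_on` are hypothetical
# and this is an assumption about how the caller is meant to use it:
#
#   buffer.preload(batch_size)        # kick off the (async) host-to-device copy
#   while buffer.batch is not None:
#       batch = buffer.batch
#       buffer.preload(batch_size)    # start fetching the next batch
#       train_on(batch)               # train on the current one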
def experiment(specs):
    with open(path.join(specs['specific_exp_dir'], 'variant.json'), 'r') as f:
        variant = json.load(f)
    variant['algo_params']['do_not_train'] = True
    variant['seed'] = specs['seed']

    policy = joblib.load(path.join(specs['specific_exp_dir'], 'params.pkl'))['exploration_policy']
    assert False, 'Do you really wanna make it deterministic?'
    policy = MakeDeterministic(policy)

    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    variant['algo_params']['replay_buffer_size'] = int(
        np.floor(specs['num_episodes'] *
                 variant['algo_params']['max_path_length'] /
                 specs['subsampling']))

    # Hack until I figure out how things are gonna be in general then I'll clean it up
    if 'policy_uses_pixels' not in variant['algo_params']:
        variant['algo_params']['policy_uses_pixels'] = False
    if 'policy_uses_task_params' not in variant['algo_params']:
        variant['algo_params']['policy_uses_task_params'] = False
    if 'concat_task_params_to_policy_obs' not in variant['algo_params']:
        variant['algo_params']['concat_task_params_to_policy_obs'] = False

    replay_buffer = ExpertReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        subsampling=specs['subsampling'],
        policy_uses_pixels=variant['algo_params']['policy_uses_pixels'],
        policy_uses_task_params=variant['algo_params']['policy_uses_task_params'],
        concat_task_params_to_policy_obs=variant['algo_params']['concat_task_params_to_policy_obs'],
    )
    variant['algo_params']['freq_saving'] = 1

    algorithm = ExpertTrajGeneratorAlgorithm(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        replay_buffer=replay_buffer,
        max_num_episodes=specs['num_episodes'],
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def __init__(
        self,
        env,
        policy,
        discriminator,
        policy_optimizer,
        expert_replay_buffer,
        disc_optim_batch_size=32,
        policy_optim_batch_size=1000,
        disc_lr=1e-3,
        disc_optimizer_class=optim.Adam,
        use_grad_pen=True,
        grad_pen_weight=10,
        plotter=None,
        render_eval_paths=False,
        eval_deterministic=True,
        **kwargs
):
    assert disc_lr != 1e-3, 'Just checking that this is being taken from the spec file'

    if eval_deterministic:
        eval_policy = MakeDeterministic(policy)
    else:
        eval_policy = policy

    super().__init__(
        env=env,
        exploration_policy=policy,
        eval_policy=eval_policy,
        expert_replay_buffer=expert_replay_buffer,
        policy_optimizer=policy_optimizer,
        **kwargs
    )

    self.discriminator = discriminator
    self.rewardf_eval_statistics = None
    self.disc_optimizer = disc_optimizer_class(
        self.discriminator.parameters(),
        lr=disc_lr,
    )
    self.disc_optim_batch_size = disc_optim_batch_size
    self.policy_optim_batch_size = policy_optim_batch_size

    self.bce = nn.BCEWithLogitsLoss()
    self.bce_targets = torch.cat(
        [
            torch.ones(self.disc_optim_batch_size, 1),
            torch.zeros(self.disc_optim_batch_size, 1),
        ],
        dim=0,
    )
    self.bce_targets = Variable(self.bce_targets)
    if ptu.gpu_enabled():
        self.bce.cuda()
        self.bce_targets = self.bce_targets.cuda()

    self.use_grad_pen = use_grad_pen
    self.grad_pen_weight = grad_pen_weight
def continue_experiment(args):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    extra = joblib.load(args.extra)
    algorithm = extra['algorithm']
    algorithm.farmlist_base = [('0.0.0.0', 15)]
    algorithm.refarm()
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(
        GoalXYPosAndVelAnt(
            goal_dim_weights=[0.1, 0.1, 0.9, 0.9],
            speed_weight=None,
        ))
    max_tau = variant['tdm_kwargs']['max_tau']
    # Normalizer isn't used unless you set num_pretrain_paths > 0
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=1,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        max_size=int(1E6),
    )
    algorithm = TemporalDifferenceModel(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        qf_criterion=HuberLoss(),
        tdm_normalizer=tdm_normalizer,
        **variant['tdm_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    env = ReacherEnv()
    training_env = ReacherEnv()

    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # NOTE: `exp_specs` is read here rather than `variant`, so it is expected
    # to be available at module level when this function runs.
    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[256, 256],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(GoalXVelHalfCheetah())
    max_tau = variant['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=1,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        max_size=int(1E6),
    )
    algorithm = TemporalDifferenceModel(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        tdm_normalizer=tdm_normalizer,
        qf_criterion=HuberLoss(),
        **variant['tdm_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def __init__(self, max_replay_buffer_size, env, env_info_sizes=None):
    observation_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)

    if env_info_sizes is None:
        if hasattr(env, 'info_sizes'):
            env_info_sizes = env.info_sizes
        else:
            env_info_sizes = dict()

    self._max_replay_buffer_size = max_replay_buffer_size
    self._observations = torch.zeros(
        (max_replay_buffer_size, observation_dim),
        dtype=torch.float).pin_memory()
    # It's a bit memory inefficient to save the observations twice,
    # but it makes the code *much* easier since you no longer have to
    # worry about termination conditions.
    self._next_obs = torch.zeros(
        (max_replay_buffer_size, observation_dim),
        dtype=torch.float).pin_memory()
    self._actions = torch.zeros(
        (max_replay_buffer_size, action_dim),
        dtype=torch.float).pin_memory()
    # Make everything a 2D np array to make it easier for other code to
    # reason about the shape of the data
    self._rewards = torch.zeros(
        (max_replay_buffer_size, 1), dtype=torch.float).pin_memory()
    # self._terminals[i] = a terminal was received at time i
    self._terminals = torch.zeros(
        (max_replay_buffer_size, 1), dtype=torch.float).pin_memory()
    # Define self._env_infos[key][i] to be the return value of env_info[key]
    # at time i
    self._env_infos = {}
    for key, size in env_info_sizes.items():
        self._env_infos[key] = torch.zeros(
            (max_replay_buffer_size, size), dtype=torch.float).pin_memory()
    self._env_info_keys = env_info_sizes.keys()

    self._top = 0
    self._size = 0

    if ptu.gpu_enabled():
        # self.stream = torch.cuda.Stream(ptu.device)
        self.batch = None
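# The buffers above are allocated with .pin_memory() so that the
# .to(device, non_blocking=True) copies in preload() can run asynchronously.
# A minimal self-contained illustration of that pairing (independent of this
# class, with placeholder shapes):
#
#   cpu_batch = torch.zeros(256, 17).pin_memory()
#   gpu_batch = cpu_batch.to('cuda', non_blocking=True)  # async host-to-device copy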
def __init__(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        model,
        batch_size=128,
        lr=3e-4,
        weight_decay=0,
        num_batches=128,
):
    self.batch_size = batch_size
    if ptu.gpu_enabled():
        model.to(ptu.device)
    self.model = model
    self.criterion = nn.MSELoss()
    self.optimizer = optim.Adam(
        model.parameters(), lr=lr, weight_decay=weight_decay)
    self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
    self.num_batches = num_batches
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    num_skills = variant['num_skills']

    # observation dim includes dim of latent variable
    obs_dim = int(np.prod(env.observation_space.shape)) + num_skills
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    # TODO: VERIFY THIS
    discrim = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim - num_skills,
        output_size=num_skills,
        output_activation=nn.Sigmoid(),
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = DIAYN(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        discrim=discrim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
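# In DIAYN the policy observation is the environment observation concatenated
# with a one-hot skill vector, which is why obs_dim above is padded by
# num_skills while the discriminator sees only obs_dim - num_skills. A minimal
# sketch of that augmentation (the helper name is hypothetical):
import numpy as np


def augment_obs_with_skill(obs, skill_idx, num_skills):
    # append a one-hot encoding of the current skill to the raw observation
    one_hot = np.zeros(num_skills, dtype=obs.dtype)
    one_hot[skill_idx] = 1.0
    return np.concatenate([obs, one_hot])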
def her_twin_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerTwinSac(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant, env=None):
    if env is None:
        # default setting of environment
        env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    # note: qf2 is constructed but not passed to DDPG below
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf1,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()