def __init__(
        self,
        env,
        qf,
        vf,
        sac_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
        give_terminal_reward=False,
):
    SoftActorCritic.__init__(
        self,
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **sac_kwargs,
        **base_kwargs
    )
    TemporalDifferenceModel.__init__(self, **tdm_kwargs)
    action_space_diff = (
        self.env.action_space.high - self.env.action_space.low
    )
    # TODO(vitchyr): Maybe add this to the main SAC code.
    # The bonus is the entropy of a uniform distribution over the action box:
    # sum_d -log(1 / (high_d - low_d)) = sum_d log(high_d - low_d).
    terminal_reward = 0
    for dim in range(action_space_diff.size):
        terminal_reward += -np.log(1. / action_space_diff[dim])
    self.terminal_bonus = float(terminal_reward)
    self.give_terminal_reward = give_terminal_reward
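# A quick numeric check of the terminal-bonus formula above (standalone
# sketch; the action ranges here are made up, not from any env): the loop is
# equivalent to summing log(high - low) over action dimensions.
import numpy as np

action_ranges = np.array([2.0, 0.5])  # hypothetical high - low per dimension
loop_bonus = sum(
    -np.log(1. / action_ranges[d]) for d in range(action_ranges.size)
)
assert np.isclose(loop_bonus, np.log(action_ranges).sum())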
def __init__(
        self,
        *args,
        observation_key=None,
        desired_goal_key=None,
        **kwargs
):
    HER.__init__(
        self,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    SoftActorCritic.__init__(self, *args, **kwargs)
    # HER relabels goals in stored transitions, so it needs the dict-based
    # relabeling buffer rather than a plain transition buffer.
    assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)
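# A sketch of wiring this HER + SAC class up (assumptions: the class is named
# HerSac here purely for illustration, and ObsDictRelabelingBuffer accepts the
# max_size/env/key kwargs shown -- check its actual signature in your copy of
# the repo; env, policy, qf, and vf are built as in the experiment functions
# below):
replay_buffer = ObsDictRelabelingBuffer(
    max_size=int(1e6),
    env=env,
    observation_key='observation',
    desired_goal_key='desired_goal',
)
algorithm = HerSac(
    env,
    policy=policy,
    qf=qf,
    vf=vf,
    replay_buffer=replay_buffer,
    observation_key='observation',
    desired_goal_key='desired_goal',
)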
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
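# An illustrative variant for the experiment above (the hyperparameter values
# are placeholder assumptions for this sketch, not tuned settings from the
# repo; net_size and algo_params are the two keys the function reads):
variant = dict(
    net_size=300,
    algo_params=dict(
        num_epochs=1000,
        num_steps_per_epoch=1000,
        num_steps_per_eval=1000,
        max_path_length=1000,
        batch_size=128,
        discount=0.99,
        reward_scale=1,
    ),
)
experiment(variant)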
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    # Profile the training run and dump a Chrome-trace file.
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")
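# The exported file is in Chrome trace format; it can be inspected by loading
# it into Chrome's chrome://tracing viewer.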
def experiment(variant):
    # env = normalize(GymEnv(
    #     'HalfCheetah-v1',
    #     force_reset=True,
    #     record_video=False,
    #     record_log=False,
    # ))
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False),
    )
    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf.cuda()
        vf.cuda()
        policy.cuda()
        algorithm.cuda()
    algorithm.train()
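# An illustrative flat variant for the experiment above (every key is one the
# function reads; the values are placeholder assumptions, not repo defaults,
# and env_class stands in for any Box-action-space env class):
variant = dict(
    env_class=SomeEnvClass,  # hypothetical env class for this sketch
    num_epochs=1000,
    num_steps_per_epoch=1000,
    num_steps_per_eval=1000,
    max_path_length=1000,
    min_num_steps_before_training=1000,
    batch_size=256,
    discount=0.99,
    replay_buffer_size=int(1e6),
    soft_target_tau=5e-3,
    target_update_period=1,
    train_policy_with_reparameterization=True,
    policy_lr=3e-4,
    qf_lr=3e-4,
    vf_lr=3e-4,
    reward_scale=1,
    use_automatic_entropy_tuning=True,
    layer_size=256,
)
experiment(variant)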
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = Pusher2DEnv()  # gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(ImageMujocoEnv(
        env,
        imsize=imsize,
        keep_prev=history - 1,
        init_camera=variant['init_camera'],
    ))
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    # )
    es = OUStrategy(action_space=env.action_space)  # note: unused by SAC below
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    vf = CNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        **variant['cnn_params']
    )
    policy = TanhCNNGaussianPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
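# An illustrative cnn_params dict for the image-based networks above (the key
# names are an assumption about this repo's CNN/MergedCNN constructors --
# verify against their signatures; the values are placeholders, not tuned
# settings):
cnn_params = dict(
    kernel_sizes=[5, 5],
    n_channels=[16, 32],
    strides=[2, 2],
    paddings=[0, 0],
    hidden_sizes=[400, 300],
)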
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # Visualizes Q-values and policy samples at fixed observations; pass it to
    # the algorithm (commented out below) to enable plotting during evaluation.
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def evaluate(self, epoch):
    # Defer entirely to the standard SAC evaluation.
    SoftActorCritic.evaluate(self, epoch)