def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(env_params)
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau'],
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        structure='norm_difference',
        tdm_normalizer=tdm_normalizer,
    )
    policy = TdmPolicy(
        env=env,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        tdm_normalizer=tdm_normalizer,
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = copy.deepcopy(variant['ddpg_tdm_kwargs'])
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
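
# --- Example usage (not from the original scripts) ---------------------------
# A minimal sketch of the `variant` dict the TDM-DDPG launcher above reads.
# Only the key structure is taken from the code; every concrete value here
# (max_tau, layer width, criterion class) is an illustrative assumption.
from torch import nn

example_tdm_ddpg_variant = dict(
    env_params=dict(),                    # forwarded to MultiTaskSawyerXYZReachingEnv
    hidden_sizes=300,                     # hidden-layer width reused for QF and policy
    ddpg_tdm_kwargs=dict(
        tdm_kwargs=dict(max_tau=10),      # TDM planning horizon (assumed value)
        ddpg_kwargs=dict(),               # qf_criterion is injected by the launcher
    ),
    es_kwargs=dict(),                     # OUStrategy options
    her_replay_buffer_kwargs=dict(),      # HerReplayBuffer options
    qf_criterion_class=nn.MSELoss,        # assumed choice of Q-function loss
)
# experiment(example_tdm_ddpg_variant)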
def experiment(variant):
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    max_tau = variant['sac_tdm_kwargs']['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized,
        max_tau=max_tau,
        **variant['tdm_normalizer_kwargs']
    )
    qf = TdmQf(
        env=env,
        vectorized=vectorized,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    vf = TdmVf(
        env=env,
        vectorized=vectorized,
        tdm_normalizer=tdm_normalizer,
        **variant['vf_kwargs']
    )
    policy = StochasticTdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
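
# --- Example usage (not from the original scripts) ---------------------------
# Sketch of the `variant` structure the TDM-SAC launcher above expects. Key
# names mirror the lookups in the function; the values are placeholder
# assumptions, and `env_class` stands in for whichever multitask goal env the
# experiment targets.
example_tdm_sac_variant = dict(
    env_class=None,                       # e.g. a multitask reaching env class
    env_kwargs=dict(),
    sac_tdm_kwargs=dict(
        tdm_kwargs=dict(
            vectorized=True,              # per-dimension (vectorized) distances
            max_tau=10,                   # assumed TDM horizon
        ),
    ),
    tdm_normalizer_kwargs=dict(),
    qf_kwargs=dict(),
    vf_kwargs=dict(),
    policy_kwargs=dict(),
    her_replay_buffer_kwargs=dict(),
)
# experiment(example_tdm_sac_variant)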
def experiment(variant):
    vectorized = variant['vectorized']
    norm_order = variant['norm_order']
    variant['ddpg_tdm_kwargs']['tdm_kwargs']['vectorized'] = vectorized
    variant['ddpg_tdm_kwargs']['tdm_kwargs']['norm_order'] = norm_order
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    max_tau = variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized,
        max_tau=max_tau,
        **variant['tdm_normalizer_kwargs']
    )
    qf = TdmQf(
        env=env,
        vectorized=vectorized,
        norm_order=norm_order,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_kwargs']
    )
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['ddpg_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    goal_normalizer = TorchFixedNormalizer(env.goal_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    distance_normalizer = TorchFixedNormalizer(env.goal_dim)
    tdm_normalizer = TdmNormalizer(
        env,
        obs_normalizer=obs_normalizer,
        goal_normalizer=goal_normalizer,
        action_normalizer=action_normalizer,
        distance_normalizer=distance_normalizer,
        max_tau=1,
        **variant['tdm_normalizer_kwargs']
    )
    qf = HerQFunction(
        env=env,
        **variant['qf_kwargs']
    )
    policy = HerPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_kwargs']
    )
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = HER(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['ddpg_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(**env_params)
    max_tau = variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=2,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['ddpg_tdm_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    max_tau = variant['sac_tdm_kwargs']['tdm_kwargs']['max_tau']
    qf = TdmQf(
        env,
        vectorized=vectorized,
        **variant['qf_kwargs']
    )
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized,
        max_tau=max_tau,
        **variant['tdm_normalizer_kwargs']
    )
    implicit_model = TdmToImplicitModel(
        env,
        qf,
        tau=0,
    )
    vf = TdmVf(
        env=env,
        vectorized=vectorized,
        tdm_normalizer=tdm_normalizer,
        **variant['vf_kwargs']
    )
    policy = StochasticTdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    goal_slice = env.ob_to_goal_slice
    lbfgs_mpc_controller = TdmLBfgsBCMC(
        implicit_model,
        env,
        goal_slice=goal_slice,
        multitask_goal_slice=goal_slice,
        **variant['mpc_controller_kwargs']
    )
    state_only_mpc_controller = TdmLBfgsBStateOnlyCMC(
        vf,
        policy,
        env,
        goal_slice=goal_slice,
        multitask_goal_slice=goal_slice,
        **variant['state_only_mpc_controller_kwargs']
    )
    es = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    if variant['explore_with'] == 'TdmLBfgsBCMC':
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=lbfgs_mpc_controller,
        )
        variant['sac_tdm_kwargs']['base_kwargs']['exploration_policy'] = (
            exploration_policy
        )
    elif variant['explore_with'] == 'TdmLBfgsBStateOnlyCMC':
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=state_only_mpc_controller,
        )
        variant['sac_tdm_kwargs']['base_kwargs']['exploration_policy'] = (
            exploration_policy
        )
    if variant['eval_with'] == 'TdmLBfgsBCMC':
        variant['sac_tdm_kwargs']['base_kwargs']['eval_policy'] = (
            lbfgs_mpc_controller
        )
    elif variant['eval_with'] == 'TdmLBfgsBStateOnlyCMC':
        variant['sac_tdm_kwargs']['base_kwargs']['eval_policy'] = (
            state_only_mpc_controller
        )
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
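
# --- Example usage (not from the original scripts) ---------------------------
# The MPC launcher above additionally selects exploration and evaluation
# policies by name. This sketch lists the extra keys it reads on top of a
# TDM-SAC style variant (see the sketch after the second launcher); the values
# shown are illustrative assumptions.
example_mpc_variant_extras = dict(
    explore_with='TdmLBfgsBCMC',              # or 'TdmLBfgsBStateOnlyCMC'
    eval_with='TdmLBfgsBStateOnlyCMC',        # or 'TdmLBfgsBCMC'
    mpc_controller_kwargs=dict(),             # forwarded to TdmLBfgsBCMC
    state_only_mpc_controller_kwargs=dict(),  # forwarded to TdmLBfgsBStateOnlyCMC
    es_kwargs=dict(),                         # GaussianStrategy options
)
# Note that the launcher mutates variant['sac_tdm_kwargs']['base_kwargs'] to
# install the chosen exploration_policy and eval_policy, so that nested dict
# must exist in the variant.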