def __init__(
        self,
        env,
        qf,
        exploration_policy,
        ddpg_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
):
    # Pin the TDM settings to the HER-style special case: goals are
    # relabeled with HER, tau is fixed at 0, the horizon is infinite,
    # and rewards are per-step indicator rewards, regardless of what
    # the caller passed in.
    tdm_kwargs.update(
        sample_rollout_goals_from='environment',
        sample_train_goals_from='her',
        vectorized=False,
        cycle_taus_for_rollout=False,
        max_tau=0,
        finite_horizon=False,
        dense_rewards=True,
        reward_type='indicator',
    )
    if isinstance(qf, TdmQf):
        assert qf.structure == 'none'
    TdmDdpg.__init__(
        self,
        env,
        qf,
        exploration_policy,
        ddpg_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=policy,
        replay_buffer=replay_buffer,
    )
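# Minimal, runnable sketch (plain Python, no rlkit needed) of the override
# behavior above: dict.update() silently replaces any caller-supplied TDM
# settings, so e.g. a caller's max_tau is discarded. Values are illustrative.
caller_tdm_kwargs = dict(max_tau=25, vectorized=True)
caller_tdm_kwargs.update(max_tau=0, vectorized=False, reward_type='indicator')
assert caller_tdm_kwargs['max_tau'] == 0
assert caller_tdm_kwargs['vectorized'] is False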
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    # env = NormalizedBoxEnv(env)
    # tdm_normalizer = TdmNormalizer(
    #     env,
    #     vectorized=True,
    #     max_tau=variant['algo_kwargs']['tdm_kwargs']['max_tau'],
    # )
    tdm_normalizer = None
    qf = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_kwargs'])
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['algo_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
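# A hypothetical variant for the experiment() above, shown for reference.
# The keys mirror exactly what the function reads; every value is an
# illustrative assumption, and env_class must be filled in with a multitask
# env class (one exposing goal_dim and gym-style spaces) before running.
from torch import nn

example_variant = dict(
    env_class=None,  # hypothetical placeholder: supply a multitask env class
    env_kwargs=dict(),
    qf_kwargs=dict(hidden_sizes=[300, 300]),
    policy_kwargs=dict(hidden_sizes=[300, 300]),
    es_kwargs=dict(theta=0.1, max_sigma=0.1, min_sigma=0.1),
    her_replay_buffer_kwargs=dict(max_size=int(1e6)),
    qf_criterion_class=nn.MSELoss,
    algo_kwargs=dict(
        base_kwargs=dict(
            num_epochs=100,
            num_steps_per_epoch=1000,
            max_path_length=100,
            batch_size=128,
        ),
        ddpg_kwargs=dict(tau=0.001),
        tdm_kwargs=dict(max_tau=10),
    ),
)
# experiment(example_variant)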
def experiment(variant):
    env = variant['env_class']()
    obs_dim = int(np.prod(env.observation_space.low.shape))
    # The action space is discrete here, so the Q-function takes
    # env.action_space.n rather than a continuous action dimension.
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    if vectorized:
        qf = VectorizedDiscreteQFunction(
            observation_dim=obs_dim,
            action_dim=env.action_space.n,
            goal_dim=env.goal_dim,
            **variant['qf_params']
        )
        policy = ArgmaxDiscreteTdmPolicy(qf, **variant['policy_params'])
    else:
        # +1 input for the scalar tau (remaining horizon).
        qf = FlattenMlp(
            input_size=obs_dim + env.goal_dim + 1,
            output_size=env.action_space.n,
            **variant['qf_params']
        )
        policy = ArgmaxDiscretePolicy(qf)
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_params'])
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    algo_params = variant['algo_params']
    algo_params['ddpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmDiscretePlotter(
        tdm=qf,
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['ddpg_kwargs']['plotter'] = plotter
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_params
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(**env_params)
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau'],
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        structure='norm_difference',
        tdm_normalizer=tdm_normalizer,
    )
    policy = TdmPolicy(
        env=env,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        tdm_normalizer=tdm_normalizer,
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    # Deep-copy before mutating so the variant dict itself stays untouched,
    # then pass the mutated copy to the algorithm.
    ddpg_tdm_kwargs = copy.deepcopy(variant['ddpg_tdm_kwargs'])
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['ddpg_tdm_kwargs']['tdm_kwargs']['vectorized']
    qf = StructuredQF(
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=env.goal_dim,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_kwargs']
    )
    # +1 input for the scalar tau (remaining horizon).
    policy = TanhMlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_kwargs']
    )
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
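# Runnable sketch (plain numpy) of the flat input layout assumed above:
# observation, goal, and the scalar tau concatenated along the last axis,
# which is where the "+ 1" in input_size comes from. Sizes are illustrative.
import numpy as np

demo_obs_dim, demo_goal_dim = 17, 3
obs = np.zeros(demo_obs_dim)
goal = np.zeros(demo_goal_dim)
tau = np.array([10.0])  # remaining horizon, fed as one extra input
flat_input = np.hstack((obs, goal, tau))
assert flat_input.shape == (demo_obs_dim + demo_goal_dim + 1,)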
def experiment(variant):
    vectorized = variant['vectorized']
    norm_order = variant['norm_order']
    variant['ddpg_tdm_kwargs']['tdm_kwargs']['vectorized'] = vectorized
    variant['ddpg_tdm_kwargs']['tdm_kwargs']['norm_order'] = norm_order
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    max_tau = variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized,
        max_tau=max_tau,
        **variant['tdm_normalizer_kwargs']
    )
    qf = TdmQf(
        env=env,
        vectorized=vectorized,
        norm_order=norm_order,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_kwargs'])
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_kwargs']
    )
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
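# Illustrative, runnable aside on norm_order: it selects the p-norm used for
# the goal distance, e.g. L1 vs. L2 on the same difference vector. This is a
# sketch of the semantics, not rlkit code.
import numpy as np

diff = np.array([3.0, 4.0])
assert np.linalg.norm(diff, ord=1) == 7.0  # norm_order=1: sum of |components|
assert np.linalg.norm(diff, ord=2) == 5.0  # norm_order=2: Euclidean length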
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(**env_params)
    max_tau = variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=2,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_kwargs'])
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['ddpg_tdm_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class']()
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    if variant['qf_type'] == 'onehot':
        qf = OneHotTauQF(
            observation_dim=obs_dim,
            action_dim=action_dim,
            goal_dim=env.goal_dim,
            output_size=env.goal_dim if vectorized else 1,
            **variant['qf_params']
        )
    elif variant['qf_type'] == 'structured':
        qf = StructuredQF(
            observation_dim=obs_dim,
            action_dim=action_dim,
            goal_dim=env.goal_dim,
            output_size=env.goal_dim if vectorized else 1,
            **variant['qf_params']
        )
    elif variant['qf_type'] == 'flat':
        # +1 input for the scalar tau (remaining horizon).
        qf = FlattenMlp(
            input_size=obs_dim + action_dim + env.goal_dim + 1,
            output_size=env.goal_dim if vectorized else 1,
            **variant['qf_params']
        )
    else:
        raise ValueError("Invalid qf type: {}".format(variant['qf_type']))
    policy = TanhMlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_params']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_params'])
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    algo_params = variant['algo_params']
    algo_params['ddpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmPlotter(
        tdm=qf,
        # location_lst=np.array([-10, 0, 10]),
        # goal_lst=np.array([-10, 0, 5]),
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['ddpg_kwargs']['plotter'] = plotter
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_params
    )
    algorithm.to(ptu.device)
    algorithm.train()
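# Runnable sketch (plain numpy) of what output_size encodes above: a
# vectorized TDM predicts one distance per goal dimension, while a scalar
# TDM predicts a single value. Numbers are illustrative.
import numpy as np

goal_dim = 3
state = np.array([0.0, 1.0, 2.0])
goal = np.array([1.0, 1.0, 0.0])
per_dim = -np.abs(state - goal)   # vectorized head: shape (goal_dim,)
scalar = per_dim.sum()            # scalar head: one number
assert per_dim.shape == (goal_dim,)
assert np.isclose(scalar, -3.0)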