def experiment(variant):
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

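# A minimal usage sketch for the launcher above. The hyperparameter names
# inside 'algo_params' are assumptions in the style of rlkit-like DDPG
# configs, not values taken from this file.
example_variant = dict(
    algo_params=dict(
        num_epochs=100,
        num_steps_per_epoch=1000,
        num_steps_per_eval=1000,
        max_path_length=1000,
        batch_size=128,
        discount=0.99,
    ),
)
# experiment(example_variant)
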
def __init__(
        self,
        env,
        qf,
        exploration_policy,
        ddpg_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
):
    DDPG.__init__(
        self,
        env=env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **ddpg_kwargs,
        **base_kwargs
    )
    super().__init__(**tdm_kwargs)
    # Not supporting these in this implementation
    assert self.qf_weight_decay == 0
    assert self.residual_gradient_weight == 0

def ddpg_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

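# A hedged example variant for ddpg_experiment. MyMultitaskEnv is a
# hypothetical placeholder for whatever multitask env class is actually
# supplied; the 'algo_kwargs' keys are likewise illustrative assumptions.
example_multitask_variant = dict(
    env_class=MyMultitaskEnv,  # hypothetical multitask env class
    env_kwargs=dict(),
    make_silent_env=True,
    normalize=True,
    exploration_type='gaussian',  # one of 'ou', 'gaussian', 'epsilon'
    algo_kwargs=dict(
        num_epochs=100,
        batch_size=128,
        discount=0.99,
    ),
)
# ddpg_experiment(example_multitask_variant)
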
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant, env=None):
    if env is None:
        # Default environment
        env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    # Note: qf2 is constructed but never used; DDPG takes a single Q function.
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf1,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

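# A sketch of calling the launcher above with an explicitly supplied env
# instead of the default HopperEnv, mirroring the gym.make example noted
# earlier; the 'algo_params' keys are illustrative assumptions.
# import gym
# custom_env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
# experiment(
#     dict(algo_params=dict(num_epochs=50, batch_size=128, discount=0.99)),
#     env=custom_env,
# )
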
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')
    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[net_size, net_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    es = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

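# A hedged example of the fully kwargs-driven variant this launcher expects.
# HopperEnv is reused from the snippets above as a stand-in env class; the
# sigma, hidden-size, and training values are assumptions for illustration.
example_kwargs_variant = dict(
    env_class=HopperEnv,
    env_kwargs=dict(),
    es_kwargs=dict(max_sigma=0.1, min_sigma=0.1),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    algo_kwargs=dict(num_epochs=100, batch_size=128, discount=0.99),
)
# experiment(example_kwargs_variant)
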
def experiment(variant):
    data = joblib.load(GOOD_DDPG_POLICY_PATH)
    expert_policy = data['policy']
    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=expert_policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        expert_policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()

def evaluate(self, epoch):
    DDPG.evaluate(self, epoch)

def __init__(self, *args, her_kwargs, ddpg_kwargs, **kwargs):
    HER.__init__(self, **her_kwargs)
    DDPG.__init__(self, *args, **kwargs, **ddpg_kwargs)
    assert isinstance(self.replay_buffer, RelabelingReplayBuffer) or isinstance(
        self.replay_buffer, ObsDictRelabelingBuffer
    )