def __init__(
        self,
        training_environment,
        evaluation_environment,
        policy,
        Qs,
        pool,
        static_fns,
        plotter=None,
        tf_summaries=False,

        lr=3e-4,
        reward_scale=1.0,
        target_entropy='auto',
        discount=0.99,
        tau=5e-3,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        store_extra_policy_info=False,

        deterministic=False,
        model_train_freq=250,
        model_train_slower=1,
        num_networks=7,
        num_elites=5,
        num_Q_elites=2,  # the size of the Q ensemble itself is set on the command line
        model_retain_epochs=20,
        rollout_batch_size=100e3,
        real_ratio=0.1,
        critic_same_as_actor=True,
        rollout_schedule=[20, 100, 1, 1],
        hidden_dim=200,
        max_model_t=None,
        dir_name=None,
        evaluate_explore_freq=0,
        num_Q_per_grp=2,
        num_Q_grp=1,
        cross_grp_diff_batch=False,

        model_load_dir=None,
        model_load_index=None,
        model_log_freq=0,

        **kwargs,
):
    """
    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy: A policy function approximator.
        initial_exploration_policy ('Policy'): A policy that we use
            for initial exploration which is not trained by the algorithm.
        Qs: Q-function approximators. The min of these approximators
            will be used. Using at least two Q-functions improves
            performance by reducing overestimation bias.
        pool (`PoolBase`): Replay pool to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval ('int'): Frequency at which target network
            updates occur, in iterations.
        reparameterize ('bool'): If True, we use a gradient estimator for
            the policy derived using the reparameterization trick; otherwise,
            we use a likelihood-ratio-based estimator.
        critic_same_as_actor ('bool'): If True, use the same sampling scheme
            (model-free or model-based) as the actor when training the critic.
            Otherwise, use model-free sampling to train the critic.
    """

    super(MBPO, self).__init__(**kwargs)

    if training_environment.unwrapped.spec.id.find("Fetch") != -1:
        # Fetch env: observations are a dict of goal-conditioned components.
        obs_dim = sum([i.shape[0] for i in training_environment.observation_space.spaces.values()])
        self.multigoal = 1
    else:
        obs_dim = np.prod(training_environment.observation_space.shape)
    # print("====", obs_dim, "========")
    act_dim = np.prod(training_environment.action_space.shape)

    # TODO: add variable scope to directly extract model parameters
    self._model_load_dir = model_load_dir
    print("============Model dir: ", self._model_load_dir)
    if model_load_index:
        latest_model_index = model_load_index
    else:
        latest_model_index = self._get_latest_index()
    self._model = construct_model(obs_dim=obs_dim,
                                  act_dim=act_dim,
                                  hidden_dim=hidden_dim,
                                  num_networks=num_networks,
                                  num_elites=num_elites,
                                  model_dir=self._model_load_dir,
                                  model_load_timestep=latest_model_index,
                                  load_model=True if model_load_dir else False)
    self._static_fns = static_fns
    self.fake_env = FakeEnv(self._model, self._static_fns)

    model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._model.name)
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    self._rollout_schedule = rollout_schedule
    self._max_model_t = max_model_t

    # self._model_pool_size = model_pool_size
    # print('[ MBPO ] Model pool size: {:.2E}'.format(self._model_pool_size))
    # self._model_pool = SimpleReplayPool(pool._observation_space, pool._action_space, self._model_pool_size)

    self._model_retain_epochs = model_retain_epochs

    self._model_train_freq = model_train_freq
    self._rollout_batch_size = int(rollout_batch_size)
    self._deterministic = deterministic
    self._real_ratio = real_ratio

    self._log_dir = os.getcwd()
    self._writer = Writer(self._log_dir)

    self._training_environment = training_environment
    self._evaluation_environment = evaluation_environment
    self._policy = policy

    self._Qs = Qs
    self._Q_ensemble = len(Qs)
    self._Q_elites = num_Q_elites
    self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs)

    self._pool = pool
    self._plotter = plotter
    self._tf_summaries = tf_summaries

    self._policy_lr = lr
    self._Q_lr = lr

    self._reward_scale = reward_scale
    self._target_entropy = (
        -np.prod(self._training_environment.action_space.shape)
        if target_entropy == 'auto'
        else target_entropy)
    print('[ MBPO ] Target entropy: {}'.format(self._target_entropy))

    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior
    self._reparameterize = reparameterize
    self._store_extra_policy_info = store_extra_policy_info

    observation_shape = self._training_environment.active_observation_shape
    action_shape = self._training_environment.action_space.shape

    assert len(observation_shape) == 1, observation_shape
    self._observation_shape = observation_shape
    assert len(action_shape) == 1, action_shape
    self._action_shape = action_shape

    # self._critic_train_repeat = kwargs["critic_train_repeat"]
    # The actor UTD ratio must be an integer multiple or divisor of the critic UTD ratio.
    assert self._actor_train_repeat % self._critic_train_repeat == 0 or \
        self._critic_train_repeat % self._actor_train_repeat == 0
    self._critic_train_freq = self._n_train_repeat // self._critic_train_repeat
    self._actor_train_freq = self._n_train_repeat // self._actor_train_repeat
    self._critic_same_as_actor = critic_same_as_actor
    self._model_train_slower = model_train_slower
    self._origin_model_train_epochs = 0

    self._dir_name = dir_name
    self._evaluate_explore_freq = evaluate_explore_freq

    # Qs within a group are trained on the same batch; Qs in different groups
    # are trained on different batches.
    self._num_Q_per_grp = num_Q_per_grp
    self._num_Q_grp = num_Q_grp
    self._cross_grp_diff_batch = cross_grp_diff_batch

    self._model_log_freq = model_log_freq

    self._build()
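# A minimal sketch of how the four-element rollout_schedule
# [min_epoch, max_epoch, min_length, max_length] is typically consumed in
# MBPO: the model rollout length is linearly interpolated from min_length to
# max_length as the epoch moves from min_epoch to max_epoch. The method name
# and the attributes self._epoch / self._rollout_length are assumptions based
# on the standard MBPO implementation; they are not defined in this section.
def _set_rollout_length(self):
    min_epoch, max_epoch, min_length, max_length = self._rollout_schedule
    if self._epoch <= min_epoch:
        y = min_length
    else:
        # Fraction of the way from min_epoch to max_epoch, clipped to [0, 1].
        dx = (self._epoch - min_epoch) / (max_epoch - min_epoch)
        dx = min(dx, 1.0)
        y = dx * (max_length - min_length) + min_length
    self._rollout_length = int(y)
    print('[ MBPO ] Rollout length: {} (epoch: {})'.format(
        self._rollout_length, self._epoch))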
def __init__(
        self,
        training_environment,
        evaluation_environment,
        policy,
        Qs,
        pool,
        static_fns,
        plotter=None,
        tf_summaries=False,

        lr=3e-4,
        reward_scale=1.0,
        target_entropy='auto',
        discount=0.99,
        tau=5e-3,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        store_extra_policy_info=False,

        deterministic=False,
        model_train_freq=250,
        num_networks=7,
        num_elites=5,
        model_retain_epochs=20,
        rollout_batch_size=100e3,
        real_ratio=0.1,
        rollout_schedule=[20, 100, 1, 1],
        hidden_dim=200,
        max_model_t=None,

        **kwargs,
):
    """
    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy: A policy function approximator.
        initial_exploration_policy ('Policy'): A policy that we use
            for initial exploration which is not trained by the algorithm.
        Qs: Q-function approximators. The min of these approximators
            will be used. Using at least two Q-functions improves
            performance by reducing overestimation bias.
        pool (`PoolBase`): Replay pool to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval ('int'): Frequency at which target network
            updates occur, in iterations.
        reparameterize ('bool'): If True, we use a gradient estimator for
            the policy derived using the reparameterization trick; otherwise,
            we use a likelihood-ratio-based estimator.
    """

    super(MBPO, self).__init__(**kwargs)

    obs_dim = np.prod(training_environment.observation_space.shape)
    act_dim = np.prod(training_environment.action_space.shape)

    self._model = construct_model(obs_dim=obs_dim,
                                  act_dim=act_dim,
                                  hidden_dim=hidden_dim,
                                  num_networks=num_networks,
                                  num_elites=num_elites)
    self._static_fns = static_fns
    self.fake_env = FakeEnv(self._model, self._static_fns)

    self._rollout_schedule = rollout_schedule
    self._max_model_t = max_model_t

    # self._model_pool_size = model_pool_size
    # print('[ MBPO ] Model pool size: {:.2E}'.format(self._model_pool_size))
    # self._model_pool = SimpleReplayPool(pool._observation_space, pool._action_space, self._model_pool_size)

    self._model_retain_epochs = model_retain_epochs

    self._model_train_freq = model_train_freq
    self._rollout_batch_size = int(rollout_batch_size)
    self._deterministic = deterministic
    self._real_ratio = real_ratio

    self._log_dir = os.getcwd()
    self._writer = Writer(self._log_dir)

    self._training_environment = training_environment
    self._evaluation_environment = evaluation_environment
    self._policy = policy

    self._Qs = Qs
    self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs)

    self._pool = pool
    self._plotter = plotter
    self._tf_summaries = tf_summaries

    self._policy_lr = lr
    self._Q_lr = lr

    self._reward_scale = reward_scale
    self._target_entropy = (
        -np.prod(self._training_environment.action_space.shape)
        if target_entropy == 'auto'
        else target_entropy)
    print('[ MBPO ] Target entropy: {}'.format(self._target_entropy))

    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior
    self._reparameterize = reparameterize
    self._store_extra_policy_info = store_extra_policy_info

    observation_shape = self._training_environment.active_observation_shape
    action_shape = self._training_environment.action_space.shape

    assert len(observation_shape) == 1, observation_shape
    self._observation_shape = observation_shape
    assert len(action_shape) == 1, action_shape
    self._action_shape = action_shape

    self._build()
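# A minimal sketch (not this class's own update method, which is defined
# elsewhere) of the soft target update implied by tau and
# target_update_interval: every target_update_interval training iterations,
# each target Q network is moved toward its source Q network by an
# exponential moving average with coefficient tau. The method name
# _update_target is an assumption.
def _update_target(self, tau=None):
    tau = tau if tau is not None else self._tau
    for Q, Q_target in zip(self._Qs, self._Q_targets):
        source_params = Q.get_weights()
        target_params = Q_target.get_weights()
        # new_target = tau * source + (1 - tau) * old_target, per weight tensor.
        Q_target.set_weights([
            tau * source + (1.0 - tau) * target
            for source, target in zip(source_params, target_params)
        ])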
def __init__(
        self,
        training_environment,
        evaluation_environment,
        policy,
        Qs,
        pool,
        static_fns,
        plotter=None,
        tf_summaries=False,

        lr=3e-4,
        reward_scale=1.0,
        target_entropy='auto',
        discount=0.99,
        tau=5e-3,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        store_extra_policy_info=False,

        deterministic=False,
        model_train_freq=250,
        num_networks=7,
        num_elites=5,
        model_retain_epochs=20,
        rollout_batch_size=1e3,
        real_ratio=0.1,
        rollout_schedule=[20, 100, 1, 1],
        hidden_dim=200,
        max_model_t=None,
        shape_reward=False,
        max_action=1.0,

        **kwargs,
):
    """
    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy: A policy function approximator.
        initial_exploration_policy ('Policy'): A policy that we use
            for initial exploration which is not trained by the algorithm.
        Qs: Q-function approximators. The min of these approximators
            will be used. Using at least two Q-functions improves
            performance by reducing overestimation bias.
        pool (`PoolBase`): Replay pool to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval ('int'): Frequency at which target network
            updates occur, in iterations.
        reparameterize ('bool'): If True, we use a gradient estimator for
            the policy derived using the reparameterization trick; otherwise,
            we use a likelihood-ratio-based estimator.
    """

    super(MBPO, self).__init__(**kwargs)

    # For a regular gym env:
    # obs_dim = np.prod(training_environment.observation_space.shape)
    # For Yuchen's modified env, observations live under the 'observation' key:
    obs_dim = np.prod(
        training_environment.observation_space['observation'].shape)
    act_dim = np.prod(training_environment.action_space.shape)

    self.obs_dim_tup = training_environment.observation_space[
        'observation'].shape
    self.act_dim_tup = training_environment.action_space.shape

    self._model = construct_model(obs_dim=obs_dim,
                                  act_dim=act_dim,
                                  hidden_dim=hidden_dim,
                                  num_networks=num_networks,
                                  num_elites=num_elites)
    self._static_fns = static_fns
    self.fake_env = FakeEnv(self._model, self._static_fns)

    self._rollout_schedule = rollout_schedule
    self._max_model_t = max_model_t

    # self._model_pool_size = model_pool_size
    # print('[ MBPO ] Model pool size: {:.2E}'.format(self._model_pool_size))
    # self._model_pool = SimpleReplayPool(pool._observation_space, pool._action_space, self._model_pool_size)

    self._model_retain_epochs = model_retain_epochs

    self._model_train_freq = model_train_freq
    self._rollout_batch_size = int(rollout_batch_size)
    self._deterministic = deterministic
    self._real_ratio = real_ratio

    self._log_dir = os.getcwd()
    self._writer = Writer(self._log_dir)

    self._training_environment = training_environment
    self._evaluation_environment = evaluation_environment
    self._policy = policy

    self._Qs = Qs
    self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs)

    self._pool = pool

    # TODO: Fix hard-coded demonstration path.
    # Only load demonstrations when we are shaping the reward.
    print("Are we shaping the reward: {0}".format(shape_reward))  # TODO: remove this line once debugging is done
    if shape_reward:
        demo_data = np.load("./mbpo/demonstration_data/demo_data_old.npz")
        # The demo data needs proper next observations.
        # TODO: Fix the skipped last trajectory. The data should contain
        # separate observations and next_observations.
        samples = {
            'observations': demo_data["o"].reshape(-1, 6)[:-40],
            'actions': demo_data["u"].reshape(-1, 4),
            'next_observations': demo_data["o"].reshape(-1, 6)[:-40],
            'rewards': demo_data["r"].reshape(-1, 1),
            'terminals': demo_data["done"].reshape(-1, 1)
        }
        self._demo_pool = SimpleReplayPool(
            pool._observation_space['observation'],
            pool._action_space,
            pool._max_size)
        self._demo_pool.add_samples(samples)

    self._plotter = plotter
    self._tf_summaries = tf_summaries

    self._policy_lr = lr
    self._Q_lr = lr

    self._reward_scale = reward_scale
    self._target_entropy = (
        -np.prod(self._training_environment.action_space.shape)
        if target_entropy == 'auto'
        else target_entropy)
    print('[ MBPO ] Target entropy: {}'.format(self._target_entropy))

    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior
    self._reparameterize = reparameterize
    self._store_extra_policy_info = store_extra_policy_info

    observation_shape = self._training_environment.active_observation_shape
    action_shape = self._training_environment.action_space.shape

    assert len(observation_shape) == 1, observation_shape
    self._observation_shape = observation_shape
    assert len(action_shape) == 1, action_shape
    self._action_shape = action_shape

    self.shape_reward = shape_reward
    self.max_action = max_action

    self._build()
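# A hypothetical helper (not part of the original code) addressing the TODO
# above: if the demonstration arrays are stored per episode, with
# demo_data["o"] of shape (n_episodes, horizon + 1, obs_dim) and the other
# arrays of shape (n_episodes, horizon, ...), separate observations and
# next_observations can be built by shifting within each episode instead of
# reusing the same flattened array and trimming its tail. The function name
# and the assumed array shapes are illustrative only.
def _demo_transitions_from_episodes(demo_data, obs_dim=6, act_dim=4):
    obs = demo_data["o"]  # assumed shape: (n_episodes, horizon + 1, obs_dim)
    return {
        'observations': obs[:, :-1].reshape(-1, obs_dim),
        'next_observations': obs[:, 1:].reshape(-1, obs_dim),
        'actions': demo_data["u"].reshape(-1, act_dim),
        'rewards': demo_data["r"].reshape(-1, 1),
        'terminals': demo_data["done"].reshape(-1, 1),
    }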