def __init__(self,
             policy,
             supervised_model=None,
             supervised_ground_truth='teacher',
             name="ppo",
             learning_rate=1e-3,
             clip_eps=0.2,
             max_epochs=5,
             max_epochs_r=20,
             entropy_bonus=0.,
             reward_predictor=None,
             reward_predictor_type='gaussian',
             grad_clip_threshold=None,
             **kwargs):
    # TODO: Check to avoid duplicates of variables and scopes
    self.reward_predictor = reward_predictor
    Serializable.quick_init(self, locals())
    super(PPO, self).__init__(policy)
    self.recurrent = getattr(self.policy, 'recurrent', False)
    self.supervised_model = supervised_model
    if self.recurrent:
        # recurrent policies are optimized with truncated backprop through time
        backprop_steps = kwargs.get('backprop_steps', 32)
        self.optimizer = RL2FirstOrderOptimizer(
            learning_rate=learning_rate,
            max_epochs=max_epochs,
            backprop_steps=backprop_steps,
            grad_clip_threshold=grad_clip_threshold)
        if self.reward_predictor is not None:
            # separate optimizer (with its own epoch budget, max_epochs_r)
            # for the reward predictor
            self.optimizer_r = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs_r,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
        if self.supervised_model is not None:
            # separate optimizer for the supervised model
            self.optimizer_s = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs_r,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
    else:
        self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                             max_epochs=max_epochs)
    # TODO: figure out what this does
    self._optimization_keys = [
        'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
        'env_infos'
    ]
    self._optimization_r_keys = [
        'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
        'env_infos'
    ]
    self.name = name
    self._clip_eps = clip_eps
    self.entropy_bonus = entropy_bonus
    self.supervised_ground_truth = supervised_ground_truth
    self.reward_predictor_type = reward_predictor_type
    self.build_graph()
def __init__(self,
             obs_dim,
             action_dim,
             name='v_fun',
             hidden_sizes=(256, 256),
             hidden_nonlinearity=tf.tanh,
             output_nonlinearity=None,
             **kwargs):
    # store the init args for serialization and call the super constructors
    Serializable.quick_init(self, locals())

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.name = name
    self.hidden_sizes = hidden_sizes
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity

    self.vfun_params = None
    self.input_var = None
    self.qval_var = None
    self.log_std_var = None
    self.action_var = None
    self._assign_ops = None

    self.build_graph()
def __init__(self,
             obs_dim,
             action_dim,
             name='policy',
             hidden_sizes=(32, 32),
             learn_std=True,
             hidden_nonlinearity=tf.tanh,
             output_nonlinearity=None,
             **kwargs):
    Serializable.quick_init(self, locals())

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.name = name
    self.hidden_sizes = hidden_sizes
    self.learn_std = learn_std
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity

    self._dist = None
    self.policy_params = None
    self._assign_ops = None
    self._assign_phs = None
    self.policy_params_keys = None
    self.policy_params_ph = None
def __init__( self, policy, name="ppo", learning_rate=1e-3, clip_eps=0.2, max_epochs=5, entropy_bonus=0., **kwargs ): Serializable.quick_init(self, locals()) super(PPO, self).__init__(policy) self.recurrent = getattr(self.policy, 'recurrent', False) if self.recurrent: backprop_steps = kwargs.get('backprop_steps', 32) self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs, backprop_steps=backprop_steps) else: self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs) self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos'] self.name = name self._clip_eps = clip_eps self.entropy_bonus = entropy_bonus self.build_graph()
def __init__(self, *args, init_std=1., min_std=1e-6, cell_type='lstm', **kwargs):
    # store the init args for serialization and call the super constructors
    Serializable.quick_init(self, locals())
    Policy.__init__(self, *args, **kwargs)

    self.min_log_std = np.log(min_std)
    self.init_log_std = np.log(init_std)

    self.init_policy = None
    self.policy_params = None
    self.obs_var = None
    self.mean_var = None
    self.log_std_var = None
    self.action_var = None
    self._dist = None
    self._hidden_state = None
    self.recurrent = True
    self._cell_type = cell_type

    self.build_graph()
    self._zero_hidden = self.cell.zero_state(1, tf.float32)
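# Hedged numeric aside (not from this file): the std bounds above are stored
# in log space, so the defaults init_std=1. and min_std=1e-6 translate to an
# initial log_std of 0.0 and a lower clip at log(1e-6) ~= -13.8.
import numpy as np
assert np.isclose(np.log(1.), 0.)
assert np.isclose(np.log(1e-6), -13.8155, atol=1e-3)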
def __init__(self,
             tf_optimizer_cls=tf.train.AdamOptimizer,
             tf_optimizer_args=None,
             learning_rate=1e-3,
             max_epochs=1,
             tolerance=1e-6,
             num_minibatches=1,
             verbose=False):
    Serializable.quick_init(self, locals())
    self._target = None
    if tf_optimizer_args is None:
        tf_optimizer_args = dict()
    tf_optimizer_args['learning_rate'] = learning_rate

    self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args)
    self._max_epochs = max_epochs
    self._tolerance = tolerance
    self._verbose = verbose
    self._num_minibatches = num_minibatches
    self._all_inputs = None
    self._train_op = None
    self._loss = None
    self._input_ph_dict = None
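# Hedged sketch of the argument assembly above: the explicit learning_rate
# kwarg always overwrites any 'learning_rate' key already present in
# tf_optimizer_args before the optimizer class is instantiated. The helper
# below is hypothetical and exists only to illustrate that behavior.
def _build_optimizer_args(learning_rate, tf_optimizer_args=None):
    if tf_optimizer_args is None:
        tf_optimizer_args = dict()
    tf_optimizer_args['learning_rate'] = learning_rate
    return tf_optimizer_args

assert _build_optimizer_args(1e-3) == {'learning_rate': 1e-3}
assert _build_optimizer_args(1e-3, {'learning_rate': 5., 'beta1': 0.9}) == \
       {'learning_rate': 1e-3, 'beta1': 0.9}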
def __getstate__(self):
    state = {
        'init_args': Serializable.__getstate__(self),
        'network_params': self.get_params(),
        'filter': [obs_filter.get_params() for obs_filter in self.obs_filters],
    }
    return state
def __getstate__(self):
    state = dict()
    state['init_args'] = Serializable.__getstate__(self)
    state['policy'] = self.policy.__getstate__()
    state['optimizer'] = self.optimizer.__getstate__()
    return state
def __init__(self, *args, init_std=1., min_std=1e-6, **kwargs):
    # store the init args for serialization and call the super constructors
    Serializable.quick_init(self, locals())
    Policy.__init__(self, *args, **kwargs)

    self.min_log_std = np.log(min_std)
    self.init_log_std = np.log(init_std)

    self.init_policy = None
    self.policy_params = None
    self.obs_var = None
    self.mean_var = None
    self.log_std_var = None
    self.action_var = None
    self._dist = None

    self.build_graph()
def __init__(self,
             obs_dim,
             action_dim,
             name='np_policy',
             **kwargs):
    Serializable.quick_init(self, locals())

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.name = name

    self._dist = None
    self.policy_params = None
    self.policy_params_batch = None
    self._num_deltas = None
    self.obs_filters = [Filter((self.obs_dim,))]
def __init__(self,
             obs_dim,
             action_dim,
             name='np_policy',
             hidden_sizes=(64, 64),
             hidden_nonlinearity='tanh',
             output_nonlinearity=None,
             normalization='first',
             **kwargs):
    Serializable.quick_init(self, locals())
    NpPolicy.__init__(self, obs_dim, action_dim, name, **kwargs)
    assert normalization in ['all', 'first', None, 'none']
    self.obs_filter = MeanStdFilter(shape=(obs_dim,))
    self.hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = self._activations[output_nonlinearity]
    self.hidden_sizes = hidden_sizes
    self.policy_params = OrderedDict()
    self.obs_filters = []
    prev_size = obs_dim
    for i, hidden_size in enumerate(hidden_sizes):
        # zero-initialized weights, stored as (out_dim, in_dim) matrices
        W = np.zeros((hidden_size, prev_size), dtype=np.float64)
        b = np.zeros((hidden_size,))
        self.policy_params['W_%d' % i] = W
        self.policy_params['b_%d' % i] = b
        # normalize the input of the first layer (default) or of every layer
        if normalization == 'all' or (normalization == 'first' and i == 0):
            self.obs_filters.append(MeanStdFilter(shape=(prev_size,)))
        else:
            self.obs_filters.append(Filter(shape=(prev_size,)))
        prev_size = hidden_size
    if normalization == 'all' or (normalization == 'first' and len(hidden_sizes) == 0):
        self.obs_filters.append(MeanStdFilter(shape=(prev_size,)))
    else:
        self.obs_filters.append(Filter(shape=(prev_size,)))
    W = np.zeros((action_dim, prev_size), dtype=np.float64)
    b = np.zeros((action_dim,))
    self.policy_params['W_out'] = W
    self.policy_params['b_out'] = b
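# Hedged numpy-only sketch of the forward pass implied by the parameter
# shapes built above (the real forward pass lives elsewhere in NpPolicy):
# each layer stores W with shape (out_dim, in_dim), so a layer computes
# nonlinearity(W @ h + b), and the zero-initialized policy maps every
# observation to the zero action.
import numpy as np

obs_dim, action_dim, hidden_sizes = 3, 2, (64, 64)
params, prev_size = {}, obs_dim
for i, hidden_size in enumerate(hidden_sizes):
    params['W_%d' % i] = np.zeros((hidden_size, prev_size))
    params['b_%d' % i] = np.zeros(hidden_size)
    prev_size = hidden_size
params['W_out'] = np.zeros((action_dim, prev_size))
params['b_out'] = np.zeros(action_dim)

h = np.ones(obs_dim)  # a dummy observation
for i in range(len(hidden_sizes)):
    h = np.tanh(params['W_%d' % i] @ h + params['b_%d' % i])
action = params['W_out'] @ h + params['b_out']
assert np.allclose(action, 0.)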
def __setstate__(self, state):
    Serializable.__setstate__(self, state['init_args'])
    self.policy.__setstate__(state['policy'])
    self.optimizer.__setstate__(state['optimizer'])
def __setstate__(self, state):
    Serializable.__setstate__(self, state['init_args'])
    # tf.get_default_session().run(tf.global_variables_initializer())
    self.set_params(state['network_params'])
def __getstate__(self):
    state = {
        'init_args': Serializable.__getstate__(self),
        'network_params': self.get_param_values()
    }
    return state
from meta_mb.logger import logger
def __setstate__(self, state):
    Serializable.__setstate__(self, state['init_args'])
    self.set_params(state['network_params'])
    for obs_filter, params in zip(self.obs_filters, state['filter']):
        obs_filter.set_params(params)
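# Hedged round-trip sketch of the filter save/restore pattern used by the
# __getstate__/__setstate__ pair above, with stand-in filter objects (the
# real ones are Filter / MeanStdFilter from this codebase):
class _StubFilter:
    def __init__(self, params=None):
        self._params = params

    def get_params(self):
        return self._params

    def set_params(self, params):
        self._params = params

saved = [f.get_params() for f in [_StubFilter('a'), _StubFilter('b')]]  # as in __getstate__
fresh = [_StubFilter(), _StubFilter()]
for obs_filter, params in zip(fresh, saved):                            # as in __setstate__
    obs_filter.set_params(params)
assert [f.get_params() for f in fresh] == ['a', 'b']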