def _eval(vector):
    """The evaluation function.

    Args:
        vector (torch.Tensor): The vector to be multiplied with the
            Hessian.

    Returns:
        torch.Tensor: The product of the Hessian of function f and the
            vector, plus the regularization term.

    """
    # `f_grads`, `params`, `param_shapes` and `reg_coeff` come from the
    # enclosing scope of this closure.
    unflatten_vector = unflatten_tensors(vector, param_shapes)

    assert len(f_grads) == len(unflatten_vector)
    grad_vector_product = torch.sum(
        torch.stack(
            [torch.sum(g * x) for g, x in zip(f_grads, unflatten_vector)]))

    hvp = list(
        torch.autograd.grad(grad_vector_product, params,
                            retain_graph=True))
    for i, (hx, p) in enumerate(zip(hvp, params)):
        if hx is None:
            hvp[i] = torch.zeros_like(p)

    flat_output = torch.cat([h.reshape(-1) for h in hvp])
    return flat_output + reg_coeff * vector
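# Context (not part of the snippet above): a Hessian-vector-product
# closure like `_eval` is typically handed to a conjugate-gradient
# solver, which only needs matrix-vector products and never
# materializes the Hessian. A minimal sketch, assuming PyTorch;
# `_conjugate_gradient` and its defaults are illustrative, not the
# original implementation.
import torch


def _conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only the product function f_Ax(v) = A v."""
    x = torch.zeros_like(b)
    r = b.clone()  # residual b - A x, with x = 0
    p = r.clone()  # search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Usage sketch: descent direction for a TRPO-style step.
# step_dir = _conjugate_gradient(_eval, flat_loss_grad)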
def init_policy_np(policy, np_random=np.random):
    """Initialize policy weights: Glorot-uniform for 'W', zeros for 'b'."""
    params = policy.get_params(trainable=True)
    shapes = policy.get_param_shapes(trainable=True)
    param_values = policy.get_param_values(trainable=True)

    flattened_params = np_random.rand(*param_values.shape)
    param_values = unflatten_tensors(flattened_params, shapes)

    for i, param in enumerate(params):
        # assert param.name[-3] == "W" or param.name[-3] == "b"
        if param.name[-3] == "W":
            shape = shapes[i]
            if len(shape) == 2:
                n_inputs, n_outputs = shape
            else:
                # Convolutional kernel: fan counts include the
                # receptive field size.
                receptive_field_size = np.prod(shape[:2])
                n_inputs = shape[-2] * receptive_field_size
                n_outputs = shape[-1] * receptive_field_size
            init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
            # Map uniform [0, 1) samples to [-init_range, init_range).
            param_values[i] = (param_values[i] * 2 - 1) * init_range
        elif param.name[-3] == "b":
            param_values[i] = np.zeros_like(param_values[i])

    param_values = flatten_tensors(param_values)
    return param_values
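# The 'W' branch above is Xavier/Glorot uniform initialization: weights
# are drawn from U(-limit, limit) with limit = sqrt(6 / (fan_in + fan_out)),
# and mapping rand() samples via (x * 2 - 1) * limit yields the same
# distribution. A minimal sketch for the dense (2-D) case; the helper
# name `glorot_uniform` is illustrative, not from the codebase.
import numpy as np


def glorot_uniform(shape, np_random=np.random):
    """Draw a dense weight matrix from the Glorot uniform distribution."""
    n_inputs, n_outputs = shape
    limit = np.sqrt(6.0 / (n_inputs + n_outputs))
    return np_random.uniform(-limit, limit, size=shape)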
def integrate_new_skill(self, new_skill_id, new_skill_subpath):
    skill_integration_method = \
        CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

    ## Hierarchized environment
    hrl_env = HierarchizedEnv(
        # base env that was wrapped in HierarchizedEnv
        # (not fully unwrapped - may be normalized!)
        env=self.env.env.env,
        num_orig_skills=self._hrl_policy.num_skills
    )
    tf_hrl_env = TfEnv(hrl_env)

    ## Top policy
    # 1) Get old policy from saved data
    old_top_policy = self._hrl_policy.get_top_policy()

    # 2) Get weights of old top policy
    otp_weights = unflatten_tensors(
        old_top_policy.get_param_values(),
        old_top_policy.get_param_shapes()
    )

    # 3) Create weights for new top policy
    skill_integrator = CategoricalMLPSkillIntegrator()
    ntp_weight_values = skill_integrator.integrate_skill(
        old_policy_weights=otp_weights,
        method=skill_integration_method,
        # Specific parameters for START_OBSS_SKILLS_AVG
        subpath_start_obss=new_skill_subpath['start_observations'],
        top_policy=old_top_policy,
        # Specific parameters for SUBPATH_SKILLS_AVG,
        # SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
        subpath_actions=new_skill_subpath['actions']
    )

    # 4) Create new policy and randomly initialize its weights
    new_top_policy = CategoricalMLPPolicy(
        env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
        hidden_sizes=(32, 32),     # As was in asa_test.py
        name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id)
    )
    ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
    ntp_init_op.run()

    # 5) Fill new policy with adjusted weights
    new_top_policy.set_param_values(
        flattened_params=flatten_tensors(ntp_weight_values)
    )

    ## Adjust HRL policy and training algorithms
    self._hrl_policy.top_policy = new_top_policy
    hrl_env.set_hrl_policy(self._hrl_policy)
    self.env = tf_hrl_env
    self.policy = self._hrl_policy.get_top_policy()
    self._top_algo = self._top_algo_cls(
        env=tf_hrl_env,
        policy=self._hrl_policy.get_top_policy(),
        baseline=self.baseline,
        **self._top_algo_kwargs
    )
    self.sampler = self._top_algo.sampler
    self.start_worker(self._tf_sess)
def set_param_values(self, flattened_params, name=None, **tags):
    """Set the values for the parameters."""
    with tf.name_scope(name, 'set_param_values', [flattened_params]):
        param_values = unflatten_tensors(flattened_params,
                                         self.get_param_shapes(**tags))
        for param, value in zip(self.get_params(**tags), param_values):
            param.load(value)
def set_param_values(self, param_values):
    """Set param values.

    Args:
        param_values (np.ndarray): A numpy array of parameter values.

    """
    param_values = unflatten_tensors(param_values,
                                     self.get_param_shapes())
    for param, value in zip(self.get_params(), param_values):
        param.load(value)
def set_param_values(self, flattened_params, **tags):
    """Set param values, casting each value to its parameter's dtype."""
    debug = tags.pop('debug', False)
    param_values = unflatten_tensors(flattened_params,
                                     self.get_param_shapes(**tags))
    for param, dtype, value in zip(self.get_params(**tags),
                                   self.get_param_dtypes(**tags),
                                   param_values):
        param.set_value(value.astype(dtype))
        if debug:
            print('setting value of %s' % param.name)
def flat_to_params(self, flattened_params):
    """Unflatten tensors according to their respective shapes.

    Args:
        flattened_params (np.ndarray): A numpy array of flattened params.

    Returns:
        List[np.ndarray]: A list of parameters reshaped to the shapes
            specified.

    """
    return unflatten_tensors(flattened_params, self.get_param_shapes())
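# All of the methods in this section delegate to the same two helpers.
# A minimal numpy sketch of their contract, assuming garage/rllab-style
# semantics (the real implementations may differ in details):
# `flatten_tensors` concatenates raveled arrays, and `unflatten_tensors`
# splits a flat vector by element counts and reshapes each chunk.
import numpy as np


def flatten_tensors(tensors):
    """Concatenate a list of arrays into one flat 1-D vector."""
    return np.concatenate([np.asarray(t).reshape(-1) for t in tensors])


def unflatten_tensors(flattened, tensor_shapes):
    """Split a flat vector back into arrays with the given shapes."""
    sizes = [int(np.prod(shape)) for shape in tensor_shapes]
    split_points = np.cumsum(sizes)[:-1]
    return [chunk.reshape(shape)
            for chunk, shape in zip(np.split(flattened, split_points),
                                    tensor_shapes)]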
def set_param_values(self, param_values, name=None, **tags):
    """Set param values.

    Args:
        param_values (np.ndarray): A numpy array of parameter values.
        tags (dict): A map of parameters for which the values should be
            loaded.

    """
    param_values = unflatten_tensors(param_values,
                                     self.get_param_shapes(**tags))
    for param, value in zip(self.get_params(**tags), param_values):
        param.load(value)
def set_param_values(self, flattened_params, name=None, **tags):
    """Set the values for the parameters.

    Args:
        flattened_params (np.ndarray): A numpy array of flattened
            parameter values.
        tags (dict): Some common tags include 'regularizable' and
            'trainable'

    """
    with tf.name_scope(name, 'set_param_values', [flattened_params]):
        param_values = unflatten_tensors(flattened_params,
                                         self.get_param_shapes(**tags))
        for param, value in zip(self.get_params(**tags), param_values):
            param.load(value)
def flat_to_params(self, flattened_params, **tags):
    """Unflatten tensors according to their respective shapes.

    Args:
        flattened_params (np.ndarray): A numpy array of flattened params.
        tags (dict): A map specifying the parameters and their shapes.

    Returns:
        tensors (List[np.ndarray]): A list of parameters reshaped to the
            shapes specified.

    """
    return unflatten_tensors(flattened_params,
                             self.get_param_shapes(**tags))
def flat_to_params(self, flattened_params, **tags):
    """Unflatten tensors according to their respective shapes.

    Args:
        flattened_params (np.ndarray): A numpy array of flattened params.
        tags (dict): Some common tags include 'regularizable' and
            'trainable'

    Returns:
        tensors (List[np.ndarray]): A list of parameters reshaped to the
            shapes specified.

    """
    return unflatten_tensors(flattened_params,
                             self.get_param_shapes(**tags))
def _backtracking_line_search(self, params, descent_step, f_loss,
                              f_constraint):
    """Shrink the step geometrically until the loss improves and the
    constraint is satisfied, restoring the old params on failure."""
    prev_params = [p.clone() for p in params]
    ratio_list = self._backtrack_ratio**np.arange(self._max_backtracks)
    loss_before = f_loss()

    param_shapes = [p.shape or torch.Size([1]) for p in params]
    descent_step = unflatten_tensors(descent_step, param_shapes)
    assert len(descent_step) == len(params)

    for ratio in ratio_list:
        for step, prev_param, param in zip(descent_step, prev_params,
                                           params):
            step = ratio * step
            new_param = prev_param.data - step
            param.data = new_param.data

        loss = f_loss()
        constraint_val = f_constraint()
        if (loss < loss_before
                and constraint_val <= self._max_constraint_value):
            break

    if ((torch.isnan(loss) or torch.isnan(constraint_val)
         or loss >= loss_before
         or constraint_val >= self._max_constraint_value)
            and not self._accept_violation):
        logger.log('Line search condition violated. Rejecting the step!')
        if torch.isnan(loss):
            logger.log('Violated because loss is NaN')
        if torch.isnan(constraint_val):
            logger.log('Violated because constraint is NaN')
        if loss >= loss_before:
            logger.log('Violated because loss not improving')
        if constraint_val >= self._max_constraint_value:
            logger.log('Violated because constraint is violated')
        for prev, cur in zip(prev_params, params):
            cur.data = prev.data
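# The trial step scales above form a geometric schedule:
# ratio_list[k] = backtrack_ratio ** k. A small runnable illustration,
# assuming common TRPO-style defaults (0.8 ratio, 15 backtracks); the
# actual values come from the optimizer's configuration.
import numpy as np

backtrack_ratio, max_backtracks = 0.8, 15  # assumed defaults
ratio_list = backtrack_ratio ** np.arange(max_backtracks)
# First scales tried: 1.0, 0.8, 0.64, 0.512, 0.4096, ...
print(ratio_list[:5])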
def set_param_values(self, flattened_params, name=None, **tags):
    with tf.name_scope(name, "set_param_values", [flattened_params]):
        debug = tags.pop("debug", False)
        param_values = unflatten_tensors(flattened_params,
                                         self.get_param_shapes(**tags))
        ops = []
        feed_dict = dict()
        for param, dtype, value in zip(self.get_params(**tags),
                                       self.get_param_dtypes(**tags),
                                       param_values):
            if param not in self._cached_assign_ops:
                assign_placeholder = tf.placeholder(
                    dtype=param.dtype.base_dtype)
                assign_op = tf.assign(param, assign_placeholder)
                self._cached_assign_ops[param] = assign_op
                self._cached_assign_placeholders[
                    param] = assign_placeholder
            ops.append(self._cached_assign_ops[param])
            feed_dict[self._cached_assign_placeholders[
                param]] = value.astype(dtype)
            if debug:
                print("setting value of %s" % param.name)
        tf.get_default_session().run(ops, feed_dict=feed_dict)
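# Design note on the snippet above: caching one placeholder + assign op
# per variable keeps the TF1 graph from growing across repeated calls;
# creating a fresh tf.assign on every call would keep adding nodes to
# the graph and slow it down over time.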
def flat_to_params(self, flattened_params, **tags):
    """Unflatten a flat parameter vector into per-parameter arrays."""
    return unflatten_tensors(flattened_params,
                             self.get_param_shapes(**tags))
def run_task(*_):
    # Configure TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config).as_default() as tf_session:
        ## Load data from itr_N.pkl
        with open(snapshot_file, 'rb') as file:
            saved_data = dill.load(file)

        ## Load data of new skill
        global new_skill_subpath
        if new_skill_policy_file:
            with open(new_skill_policy_file, 'rb') as file:
                new_skill_data = dill.load(file)
            new_skill_policy = new_skill_data['policy']
            new_skill_subpath = new_skill_data['subpath']
            unique_end_obss = np.unique(
                new_skill_subpath['end_observations'], axis=0)
            new_skill_stop_func = lambda path: (
                path['observations'][-1] == unique_end_obss
            ).all(axis=1).any()

        ## Lower level environment & policies
        # Base (original) environment.
        base_env = saved_data['env'].env.env  # <NormalizedEnv<MinibotEnv instance>>

        # Skill policies, operating in base environment
        skill_targets = [
            # 13 basic room regions
            ( 6,  5), ( 6, 18), ( 6, 33), ( 6, 47), ( 6, 61),
            (21,  5), (21, 18), (21, 33), (21, 47), (21, 61),
            (37,  5), (37, 18), (37, 33),
        ]
        trained_skill_policies = \
            [GridworldTargetPolicy(env_spec=base_env.spec, target=t)
             for t in skill_targets] + \
            [GridworldStepPolicy(env_spec=base_env.spec, direction=d, n=7)
             for d in range(4)] + \
            [
                new_skill_policy
                # GridworldTargetPolicy(env_spec=base_env.spec, target=(43, 54))  # DEBUG use GridworldTargetPolicy as new skill
                # GridworldRandomPolicy(env_spec=base_env.spec, n=25)  # DEBUG use GridworldRandomPolicy as new skill
                # GridworldStayPolicy(env_spec=base_env.spec, n=25)  # DEBUG use GridworldStayPolicy as new skill
            ]
        trained_skill_policies_stop_funcs = \
            [pol.skill_stopping_func
             for pol in trained_skill_policies[:-1]] + \
            [
                new_skill_stop_func
                # trained_skill_policies[-1].skill_stopping_func  # DEBUG use Gridworld*Policy as new skill
            ]
        skill_policy_prototype = \
            saved_data['hrl_policy'].skill_policy_prototype

        ## Upper level environment & policies
        # Hierarchized environment
        hrl_env = HierarchizedEnv(
            env=base_env,
            num_orig_skills=len(trained_skill_policies)
        )
        tf_hrl_env = TfEnv(hrl_env)

        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = saved_data['policy']

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
            old_top_policy.get_param_values(),
            old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
            old_policy_weights=otp_weights,
            method=skill_integration_method,
            # Specific parameters for START_OBSS_SKILLS_AVG
            subpath_start_obss=new_skill_subpath['start_observations'],
            top_policy=old_top_policy,
            # Specific parameters for SUBPATH_SKILLS_AVG,
            # SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
            subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
            env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
            hidden_sizes=(32, 32),     # As was in asa_basic_run.py
            name="TopCategoricalMLPPolicy2"
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
            flattened_params=flatten_tensors(ntp_weight_values)
        )

        ## Hierarchy of policies
        hrl_policy = HierarchicalPolicy(
            top_policy=new_top_policy,
            skill_policy_prototype=skill_policy_prototype,
            skill_policies=trained_skill_policies,
            skill_stop_functions=trained_skill_policies_stop_funcs,
            skill_max_timesteps=150
        )
        # Link hrl_policy and hrl_env, so that hrl_env can use skills
        hrl_env.set_hrl_policy(hrl_policy)

        ## Other
        # Baseline
        baseline = saved_data['baseline']  # Take trained baseline

        # Main ASA algorithm
        asa_algo = AdaptiveSkillAcquisition(
            env=tf_hrl_env,
            hrl_policy=hrl_policy,
            baseline=baseline,
            top_algo_cls=TRPO,
            low_algo_cls=TRPO,
            # Top algo kwargs
            batch_size=5000,
            max_path_length=50,
            n_itr=300,
            start_itr=saved_data['itr'] + 1,  # Continue from previous iteration number
            discount=0.99,
            force_batch_sampler=True,
            low_algo_kwargs={
                'batch_size': 20000,
                'max_path_length': 800,
                'n_itr': 300,
                'discount': 0.99,
            }
        )

        ## Launch training
        train_info = asa_algo.train(
            sess=tf_session,
            snapshot_mode='none'
        )

        ## Save last iteration
        out_file = os.path.join(train_info['snapshot_dir'], 'final.pkl')
        empty_samples_data = {'paths': None}
        with open(out_file, 'wb') as file:
            out_data = asa_algo.get_itr_snapshot(
                itr=asa_algo.n_itr - 1,
                samples_data=empty_samples_data
            )
            dill.dump(out_data, file)
def set_param_values(self, param_values, name=None, **tags):
    """Set param values from a flattened array of parameter values."""
    param_values = unflatten_tensors(param_values,
                                     self.get_param_shapes(**tags))
    for param, value in zip(self.get_params(**tags), param_values):
        param.load(value)