Example 1
    def _eval(vector):
        """The evaluation function.

        Args:
            vector (torch.Tensor): The vector to be multiplied with the
                Hessian.

        Returns:
            torch.Tensor: The product of the Hessian of the function and
                the given vector.

        """
        unflatten_vector = unflatten_tensors(vector, param_shapes)

        assert len(f_grads) == len(unflatten_vector)
        grad_vector_product = torch.sum(
            torch.stack(
                [torch.sum(g * x) for g, x in zip(f_grads, unflatten_vector)]))

        hvp = list(
            torch.autograd.grad(grad_vector_product, params,
                                retain_graph=True))
        for i, (hx, p) in enumerate(zip(hvp, params)):
            if hx is None:
                hvp[i] = torch.zeros_like(p)

        flat_output = torch.cat([h.reshape(-1) for h in hvp])
        return flat_output + reg_coeff * vector
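A closure like _eval above is normally handed to a conjugate gradient solver, which only ever needs Hessian-vector products, never the full Hessian. The sketch below is a minimal, generic CG loop under that assumption; conjugate_gradient, f_Ax, b and n_iters are illustrative names, not part of the code above.

import torch

def conjugate_gradient(f_Ax, b, n_iters=10, residual_tol=1e-10):
    """Solve A x = b given only a function computing the product A @ v."""
    x = torch.zeros_like(b)
    r = b.clone()              # residual b - A x, with x initially zero
    p = r.clone()              # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Illustrative usage: step_dir = conjugate_gradient(_eval, flat_loss_grad)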
Example 2
def init_policy_np(policy, np_random=np.random):
    """Re-initialize the policy with Glorot-uniform weights and zero biases,
    returning the new parameters as a flat vector."""
    params = policy.get_params(trainable=True)
    shapes = policy.get_param_shapes(trainable=True)
    param_values = policy.get_param_values(trainable=True)

    flattened_params = np_random.rand(*param_values.shape)
    param_values = unflatten_tensors(flattened_params, shapes)

    for i, param in enumerate(params):
        # assert param.name[-3] == "W" or param.name[-3] == "b"
        if param.name[-3] == "W":
            shape = shapes[i]
            if len(shape) == 2:
                n_inputs, n_outputs = shape
            else:
                receptive_field_size = np.prod(shape[:2])
                n_inputs = shape[-2] * receptive_field_size
                n_outputs = shape[-1] * receptive_field_size
            init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
            param_values[i] = (param_values[i] * 2 - 1) * init_range
        elif param.name[-3] == "b":
            param_values[i] = np.zeros_like(param_values[i])

    param_values = flatten_tensors(param_values)
    return param_values
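Every snippet in this collection leans on flatten_tensors / unflatten_tensors to convert between one flat parameter vector and a list of per-parameter arrays. A minimal NumPy sketch of what these helpers are assumed to do (the actual implementations ship with the library these examples come from):

import numpy as np

def flatten_tensors(tensors):
    # Concatenate a list of arrays into a single 1-D vector.
    return np.concatenate([np.asarray(t).reshape(-1) for t in tensors])

def unflatten_tensors(flattened, shapes):
    # Split a flat vector back into arrays of the given shapes.
    sizes = [int(np.prod(shape)) for shape in shapes]
    chunks = np.split(flattened, np.cumsum(sizes)[:-1])
    return [chunk.reshape(shape) for chunk, shape in zip(chunks, shapes)]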
Example 3
    def integrate_new_skill(self, new_skill_id, new_skill_subpath):
        skill_integration_method = CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

        ## Hierarchized environment
        hrl_env = HierarchizedEnv(
                # base env that was wrapped in HierarchizedEnv (not fully unwrapped - may be normalized!)
                env=self.env.env.env,
                num_orig_skills=self._hrl_policy.num_skills
        )
        tf_hrl_env = TfEnv(hrl_env)

        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = self._hrl_policy.get_top_policy()

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # As was in asa_test.py,
                name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id)
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )

        ## Adjust HRL policy and training algorithms
        self._hrl_policy.top_policy = new_top_policy
        hrl_env.set_hrl_policy(self._hrl_policy)
        self.env = tf_hrl_env
        self.policy = self._hrl_policy.get_top_policy()
        self._top_algo = self._top_algo_cls(
                env=tf_hrl_env,
                policy=self._hrl_policy.get_top_policy(),
                baseline=self.baseline,
                **self._top_algo_kwargs
        )
        self.sampler = self._top_algo.sampler
        self.start_worker(self._tf_sess)
Example 4
    def set_param_values(self, flattened_params, name=None, **tags):
        """Set the values for the parameters."""
        with tf.name_scope(name, 'set_param_values', [flattened_params]):
            param_values = unflatten_tensors(flattened_params,
                                             self.get_param_shapes(**tags))
            for param, value in zip(self.get_params(**tags), param_values):
                param.load(value)
Example 5
    def set_param_values(self, param_values):
        """Set param values.

        Args:
            param_values (np.ndarray): A numpy array of parameter values.

        """
        param_values = unflatten_tensors(param_values, self.get_param_shapes())
        for param, value in zip(self.get_params(), param_values):
            param.load(value)
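Implementations like the one above pair with get_param_values / get_param_shapes to move weights between policies or to edit them offline, which is the same round trip Example 3 performs. A short hedged usage sketch; trained_policy and fresh_policy are illustrative names:

# Copy weights between two policies with identical architectures.
flat_weights = trained_policy.get_param_values()
fresh_policy.set_param_values(flat_weights)

# Or edit individual weight arrays before writing them back.
weights = unflatten_tensors(flat_weights, trained_policy.get_param_shapes())
weights[-1][:] = 0.0  # e.g. zero out the last parameter tensor
fresh_policy.set_param_values(flatten_tensors(weights))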
Example 6
    def set_param_values(self, flattened_params, **tags):
        """Set parameter values from a flat vector."""
        debug = tags.pop('debug', False)
        param_values = unflatten_tensors(flattened_params,
                                         self.get_param_shapes(**tags))
        for param, dtype, value in zip(self.get_params(**tags),
                                       self.get_param_dtypes(**tags),
                                       param_values):
            param.set_value(value.astype(dtype))
            if debug:
                print('setting value of %s' % param.name)
Example 7
    def flat_to_params(self, flattened_params):
        """Unflatten tensors according to their respective shapes.

        Args:
            flattened_params (np.ndarray): A numpy array of flattened params.

        Returns:
            List[np.ndarray]: A list of parameters reshaped to the
                shapes specified.

        """
        return unflatten_tensors(flattened_params, self.get_param_shapes())
Example 8
    def set_param_values(self, param_values, name=None, **tags):
        """Set param values.

        Args:
            param_values (np.ndarray): A numpy array of parameter values.
            tags (dict): A map of parameters for which the values should be
                loaded.

        """
        param_values = unflatten_tensors(param_values,
                                         self.get_param_shapes(**tags))
        for param, value in zip(self.get_params(**tags), param_values):
            param.load(value)
Example 9
    def set_param_values(self, flattened_params, name=None, **tags):
        """Set the values for the parameters.

        Args:
            flattened_params (np.ndarray): A numpy array of flattened params.
            tags (dict): Some common tags include 'regularizable' and
                'trainable'.

        """
        with tf.name_scope(name, 'set_param_values', [flattened_params]):
            param_values = unflatten_tensors(flattened_params,
                                             self.get_param_shapes(**tags))
            for param, value in zip(self.get_params(**tags), param_values):
                param.load(value)
Example 10
    def flat_to_params(self, flattened_params, **tags):
        """Unflatten tensors according to their respective shapes.

        Args:
            flattened_params (np.ndarray): A numpy array of flattened params.
            tags (dict): A map specifying the parameters and their shapes.

        Returns:
            tensors (List[np.ndarray]): A list of parameters reshaped to the
                shapes specified.

        """
        return unflatten_tensors(flattened_params,
                                 self.get_param_shapes(**tags))
Example 11
    def flat_to_params(self, flattened_params, **tags):
        """Unflatten tensors according to their respective shapes.

        Args:
            flattened_params (np.ndarray): A numpy array of flattened params.
            tags (dict): Some common tags include 'regularizable' and
                'trainable'.

        Returns:
            tensors (List[np.ndarray]): A list of parameters reshaped to the
                shapes specified.

        """
        return unflatten_tensors(flattened_params,
                                 self.get_param_shapes(**tags))
Example 12
    def _backtracking_line_search(self, params, descent_step, f_loss,
                                  f_constraint):
        """Shrink the step geometrically until the loss improves and the
        constraint is satisfied; otherwise restore the previous parameters."""
        prev_params = [p.clone() for p in params]
        ratio_list = self._backtrack_ratio**np.arange(self._max_backtracks)
        loss_before = f_loss()

        param_shapes = [p.shape or torch.Size([1]) for p in params]
        descent_step = unflatten_tensors(descent_step, param_shapes)
        assert len(descent_step) == len(params)

        for ratio in ratio_list:
            for step, prev_param, param in zip(descent_step, prev_params,
                                               params):
                step = ratio * step
                new_param = prev_param.data - step
                param.data = new_param.data

            loss = f_loss()
            constraint_val = f_constraint()
            if (loss < loss_before
                    and constraint_val <= self._max_constraint_value):
                break

        if ((torch.isnan(loss) or torch.isnan(constraint_val)
             or loss >= loss_before
             or constraint_val >= self._max_constraint_value)
                and not self._accept_violation):
            logger.log('Line search condition violated. Rejecting the step!')
            if torch.isnan(loss):
                logger.log('Violated because loss is NaN')
            if torch.isnan(constraint_val):
                logger.log('Violated because constraint is NaN')
            if loss >= loss_before:
                logger.log('Violated because loss not improving')
            if constraint_val >= self._max_constraint_value:
                logger.log('Violated because constraint is violated')
            for prev, cur in zip(prev_params, params):
                cur.data = prev.data
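The flat descent_step consumed above is typically the conjugate gradient solution of H s = g (see the sketch after Example 1), rescaled so the quadratic model of the constraint hits its bound. A hedged sketch of that scaling; scale_descent_step, f_Ax and max_constraint_value mirror, but are not taken from, the code above:

import torch

def scale_descent_step(step_dir, f_Ax, max_constraint_value):
    # Pick beta so that 0.5 * (beta * s)^T H (beta * s) == max_constraint_value,
    # i.e. beta = sqrt(2 * delta / s^T H s); backtracking then shrinks from there.
    sHs = torch.dot(step_dir, f_Ax(step_dir))
    beta = torch.sqrt(2.0 * max_constraint_value / (sHs + 1e-8))
    return beta * step_dir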
Example 13
    def set_param_values(self, flattened_params, name=None, **tags):
        """Set parameter values from a flat vector, caching tf.assign ops."""
        with tf.name_scope(name, "set_param_values", [flattened_params]):
            debug = tags.pop("debug", False)
            param_values = unflatten_tensors(flattened_params,
                                             self.get_param_shapes(**tags))
            ops = []
            feed_dict = dict()
            for param, dtype, value in zip(self.get_params(**tags),
                                           self.get_param_dtypes(**tags),
                                           param_values):
                if param not in self._cached_assign_ops:
                    assign_placeholder = tf.placeholder(
                        dtype=param.dtype.base_dtype)
                    assign_op = tf.assign(param, assign_placeholder)
                    self._cached_assign_ops[param] = assign_op
                    self._cached_assign_placeholders[
                        param] = assign_placeholder
                ops.append(self._cached_assign_ops[param])
                feed_dict[self._cached_assign_placeholders[
                    param]] = value.astype(dtype)
                if debug:
                    print("setting value of %s" % param.name)
            tf.get_default_session().run(ops, feed_dict=feed_dict)
Example 14
    def flat_to_params(self, flattened_params, **tags):
        """Unflatten a flat parameter vector into per-parameter arrays."""
        return unflatten_tensors(flattened_params,
                                 self.get_param_shapes(**tags))
Example 15
def run_task(*_):
    # Configure TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config).as_default() as tf_session:
        ## Load data from itr_N.pkl
        with open(snapshot_file, 'rb') as file:
            saved_data = dill.load(file)

        ## Load data of new skill
        global new_skill_subpath
        if new_skill_policy_file:
            with open(new_skill_policy_file, 'rb') as file:
                new_skill_data = dill.load(file)
            new_skill_policy = new_skill_data['policy']
            new_skill_subpath = new_skill_data['subpath']
            unique_end_obss = np.unique(new_skill_subpath['end_observations'], axis=0)
            new_skill_stop_func = lambda path: (path['observations'][-1] == unique_end_obss).all(axis=1).any()

        ## Lower level environment & policies
        # Base (original) environment.
        base_env = saved_data['env'].env.env  # <NormalizedEnv<MinibotEnv instance>>

        # Skill policies, operating in base environment
        skill_targets = [  # 13 basic room regions
            ( 6,  5), ( 6, 18), ( 6, 33), ( 6, 47), ( 6, 61),
            (21,  5), (21, 18), (21, 33), (21, 47), (21, 61),
            (37,  5), (37, 18), (37, 33),
        ]
        trained_skill_policies = \
            [GridworldTargetPolicy(env_spec=base_env.spec, target=t) for t in skill_targets] + \
            [GridworldStepPolicy(env_spec=base_env.spec, direction=d, n=7) for d in range(4)] + \
            [
             new_skill_policy
             # GridworldTargetPolicy(env_spec=base_env.spec, target=(43, 54))  # DEBUG use GridworldTargetPolicy as new skill
             # GridworldRandomPolicy(env_spec=base_env.spec, n=25)             # DEBUG use GridworldRandomPolicy as new skill
             # GridworldStayPolicy(env_spec=base_env.spec, n=25)               # DEBUG use GridworldStayPolicy as new skill
            ]
        trained_skill_policies_stop_funcs = \
                [pol.skill_stopping_func for pol in trained_skill_policies[:-1]] + \
                [
                 new_skill_stop_func
                 # trained_skill_policies[-1].skill_stopping_func                  # DEBUG use Gridworld*Policy as new skill
                ]
        skill_policy_prototype = saved_data['hrl_policy'].skill_policy_prototype

        ## Upper level environment & policies
        # Hierarchized environment
        hrl_env = HierarchizedEnv(
                env=base_env,
                num_orig_skills=len(trained_skill_policies)
        )
        tf_hrl_env = TfEnv(hrl_env)


        ## Top policy
        # 1) Get old policy from saved data
        old_top_policy = saved_data['policy']

        # 2) Get weights of old top policy
        otp_weights = unflatten_tensors(
                old_top_policy.get_param_values(),
                old_top_policy.get_param_shapes()
        )

        # 3) Create weights for new top policy
        skill_integrator = CategoricalMLPSkillIntegrator()
        ntp_weight_values = skill_integrator.integrate_skill(
                old_policy_weights=otp_weights,
                method=skill_integration_method,
                # Specific parameters for START_OBSS_SKILLS_AVG
                subpath_start_obss=new_skill_subpath['start_observations'],
                top_policy=old_top_policy,
                # Specific parameters for SUBPATH_SKILLS_AVG, SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
                subpath_actions=new_skill_subpath['actions']
        )

        # 4) Create new policy and randomly initialize its weights
        new_top_policy = CategoricalMLPPolicy(
                env_spec=tf_hrl_env.spec,  # This env counts with new skill (action space = n + 1)
                hidden_sizes=(32, 32),     # As was in asa_basic_run.py,
                name="TopCategoricalMLPPolicy2"
        )
        ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
        ntp_init_op.run()

        # 5) Fill new policy with adjusted weights
        new_top_policy.set_param_values(
                flattened_params=flatten_tensors(ntp_weight_values)
        )


        ## Hierarchy of policies
        hrl_policy = HierarchicalPolicy(
                top_policy=new_top_policy,
                skill_policy_prototype=skill_policy_prototype,
                skill_policies=trained_skill_policies,
                skill_stop_functions=trained_skill_policies_stop_funcs,
                skill_max_timesteps=150
        )
        # Link hrl_policy and hrl_env, so that hrl_env can use skills
        hrl_env.set_hrl_policy(hrl_policy)

        ## Other
        # Baseline
        baseline = saved_data['baseline']  # Take trained baseline

        # Main ASA algorithm
        asa_algo = AdaptiveSkillAcquisition(
                env=tf_hrl_env,
                hrl_policy=hrl_policy,
                baseline=baseline,
                top_algo_cls=TRPO,
                low_algo_cls=TRPO,
                # Top algo kwargs
                    batch_size=5000,
                    max_path_length=50,
                    n_itr=300,
                    start_itr=saved_data['itr'] + 1,  # Continue from previous iteration number
                    discount=0.99,
                    force_batch_sampler=True,
                low_algo_kwargs={
                    'batch_size': 20000,
                    'max_path_length': 800,
                    'n_itr': 300,
                    'discount': 0.99,
                }
        )

        ## Launch training
        train_info = asa_algo.train(
                sess=tf_session,
                snapshot_mode='none'
        )

        ## Save last iteration
        out_file = os.path.join(train_info['snapshot_dir'], 'final.pkl')
        empty_samples_data = {'paths': None}
        with open(out_file, 'wb') as file:
            out_data = asa_algo.get_itr_snapshot(
                itr=asa_algo.n_itr - 1,
                samples_data=empty_samples_data
            )
            dill.dump(out_data, file)
Example 16
    def set_param_values(self, param_values, name=None, **tags):
        """Set parameter values from a flat vector."""
        param_values = unflatten_tensors(param_values,
                                         self.get_param_shapes(**tags))
        for param, value in zip(self.get_params(**tags), param_values):
            param.load(value)