Example #1
    def __init__(
            self,
            env,
            policy,
            baseline,
            difference_params=False,
            quantize=False,
            quantization_tuning=4,
            optimizer=None,
            optimizer_args=None,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(
                batch_size=None,
                max_epochs=1,
            )
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(**optimizer_args)
        self.optimizer = optimizer
        self.quantize = quantize
        self.quantization_tuning = quantization_tuning
        self.opt_info = None
        self.policy_params_last_update = 0
        self.difference_params = difference_params
        super(Agent, self).__init__(env=env,
                                    policy=policy,
                                    baseline=baseline,
                                    quantize=quantize,
                                    quantization_tuning=quantization_tuning, **kwargs)
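
A pattern shared by most of these constructors is merging caller-supplied optimizer arguments over a set of defaults with `dict(default_args, **optimizer_args)`. A minimal, standalone sketch of that merge (plain Python; `build_optimizer_args` is just an illustrative name):

def build_optimizer_args(optimizer_args=None):
    # defaults used when the caller does not specify anything
    default_args = dict(batch_size=None, max_epochs=1)
    if optimizer_args is None:
        return default_args
    # caller-supplied keys override the defaults
    return dict(default_args, **optimizer_args)

print(build_optimizer_args())                    # {'batch_size': None, 'max_epochs': 1}
print(build_optimizer_args({'max_epochs': 5}))   # {'batch_size': None, 'max_epochs': 5}
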
Example #2
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=10,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 freeze_manager=False,
                 freeze_skills=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # forward the remaining arguments to the base algorithm class
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)
        # diagonal Gaussian distribution over the low-level policy's action space
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
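
The skill-dependent baseline above works on an enlarged observation: the raw observation plus one extra entry for `t_remaining`, replicated `num_latents + 1` times and concatenated with a `num_latents`-dimensional latent indicator (the replication reading is an interpretation; the formula itself is copied from the constructor). Checking the arithmetic with made-up sizes:

obs_dim = 27                 # hypothetical raw observation size
num_latents = 6
new_obs_space_no_bi = obs_dim + 1                                      # +1 for t_remaining
skill_obs_dim = new_obs_space_no_bi * (num_latents + 1) + num_latents
print(skill_obs_dim)         # 28 * 7 + 6 = 202
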
Example #3
    def __init__(
            self,
            manager_optimizer=None,
            optimizer=None,
            snn_optimizer=None,
            optimizer_args=None,
            step_size=1e-6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(
                learning_rate=step_size,
                **optimizer_args)
        else:
            self.optimizer = optimizer
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent,
              self).__init__(**kwargs)  # forward the remaining arguments to the base algorithm class
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # diagonal Gaussian distribution over the low-level policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []
Example #4
    def __init__(self,
                 qf,
                 policy,
                 min_pool_size=10000,
                 replay_pool_size=1000000,
                 replacement_prob=1.0,
                 qf_batch_size=32,
                 qf_weight_decay=0.,
                 qf_update_method='adam',
                 qf_learning_rate=1e-3,
                 qf_use_target=True,
                 soft_target_tau=0.001,
                 ):

        self.soft_target_tau = soft_target_tau
        self.min_pool_size = min_pool_size
        self.replay_pool_size = replay_pool_size
        self.replacement_prob = replacement_prob
        self.qf_batch_size = qf_batch_size
        self.qf_weight_decay = qf_weight_decay
        self.qf_update_method = FirstOrderOptimizer(update_method=qf_update_method,
                                                    learning_rate=qf_learning_rate)
        self.qf_use_target = qf_use_target
        self.discount = 0.99
        self.qf = qf
        self.policy = policy

        self.qf_loss_averages = []
        self.q_averages = []
        self.y_averages = []
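
`soft_target_tau` is the usual Polyak-averaging coefficient for a target Q-network; the constructor only stores it, so the following is a generic sketch of how such a coefficient is typically applied, not code taken from this class:

def soft_update(target_params, source_params, tau=0.001):
    # target <- (1 - tau) * target + tau * source, applied element-wise
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

target = [0.0, 10.0]
source = [1.0, 0.0]
print(soft_update(target, source, tau=0.5))   # [0.5, 5.0]
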
Example #5
    def __init__(self, wrapped_constraint, 
                       env_spec, 
                       yield_zeros_until=1,
                       optimizer=None, 
                       hidden_sizes=(32,), 
                       hidden_nonlinearity=NL.sigmoid, 
                       lag_time=10, 
                       coeff=1.,
                       filter_bonuses=False,
                       max_epochs=25,
                       *args, **kwargs):

        Serializable.quick_init(self, locals())

        self._wrapped_constraint = wrapped_constraint
        self._env_spec = env_spec
        self._filter_bonuses = filter_bonuses
        self._yield_zeros_until = yield_zeros_until
        self._hidden_sizes = hidden_sizes
        self._lag_time = lag_time
        self._coeff = coeff
        self._max_epochs = max_epochs
        self.use_bonus = True

        if optimizer is None:
            #optimizer = LbfgsOptimizer()
            optimizer = FirstOrderOptimizer(max_epochs=max_epochs, batch_size=None)

        self._optimizer = optimizer

        obs_dim = env_spec.observation_space.flat_dim

        predictor_network = MLP(1, hidden_sizes, hidden_nonlinearity, NL.sigmoid,
                                input_shape=(obs_dim,))

        LasagnePowered.__init__(self, [predictor_network.output_layer])

        x_var = predictor_network.input_layer.input_var
        y_var = TT.matrix("ys")
        out_var = L.get_output(predictor_network.output_layer, 
                               {predictor_network.input_layer: x_var})

        regression_loss = TT.mean(TT.square(y_var - out_var))

        optimizer_args = dict(
            loss=regression_loss,
            target=self,
            inputs=[x_var, y_var],
        )

        self._optimizer.update_opt(**optimizer_args)
        self._f_predict = compile_function([x_var], out_var)

        self._fit_steps = 0

        self.has_baseline = self._wrapped_constraint.has_baseline
        if self.has_baseline:
            self.baseline = self._wrapped_constraint.baseline
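
With `batch_size=None`, the `FirstOrderOptimizer` above amounts to full-batch gradient descent on the mean-squared regression loss for `max_epochs` passes. A dependency-free sketch of that loop (NumPy, with a single scalar weight standing in for the MLP; the names here are illustrative only):

import numpy as np

def fit_full_batch(xs, ys, lr=0.1, max_epochs=25):
    # full-batch gradient descent on mean((x * w - y)^2)
    w = 0.0
    for _ in range(max_epochs):
        preds = xs * w
        grad = np.mean(2.0 * (preds - ys) * xs)   # d/dw of the mean squared error
        w -= lr * grad
    return w

xs = np.array([1.0, 2.0, 3.0])
ys = 2.0 * xs                      # true slope is 2
print(fit_full_batch(xs, ys))      # converges towards 2.0
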
Example #6
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=1e-2,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            use_skill_dependent_baseline=False,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(batch_size=None, max_epochs=1)
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent_approx,
              self).__init__(**kwargs)  # forward the remaining arguments to the base algorithm class
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # diagonal Gaussian distribution over the low-level policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period

        self.trainable_manager = self.policy.trainable_manager

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = (
                (curr_env.observation_space.shape[0] + 1) * self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)
Example #7
    def __init__(self,
                 agents_number,
                 average_period,
                 participation_rate,
                 env,
                 policy,
                 baseline,
                 difference_params=False,
                 quantize=False,
                 quantization_tuning=4,
                 optimizer=None,
                 optimizer_args=None,
                 **kwargs):

        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(
                batch_size=None,
                max_epochs=1,
            )
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = [FirstOrderOptimizer(**optimizer_args)
                         for _ in range(agents_number)]
        self.agents = [
            Agent(env=env,
                  policy=policy,
                  optimizer=agent_optimizer,
                  baseline=baseline,
                  difference_params=difference_params,
                  quantize=quantize,
                  quantization_tuning=quantization_tuning,
                  **kwargs) for agent_optimizer in optimizer
        ]
        self.baseline = baseline
        self.average_period = average_period
        self.participation_rate = participation_rate
        self.transferred_bits = 0
        super(Server, self).__init__(agents_number=agents_number,
                                     average_period=average_period,
                                     participation_rate=participation_rate,
                                     env=env,
                                     policy=policy,
                                     baseline=baseline,
                                     difference_params=difference_params,
                                     quantize=quantize,
                                     quantization_tuning=quantization_tuning,
                                     optimizer=optimizer,
                                     optimizer_args=optimizer_args,
                                     **kwargs)
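
A detail worth flagging: `[FirstOrderOptimizer(**optimizer_args)] * agents_number` would put the same optimizer instance in every slot, whereas a comprehension (or the pickle round-trip used in Example #11) yields one independent optimizer per agent. A quick demonstration with a plain stand-in class:

class Opt:
    def __init__(self):
        self.steps = 0

shared = [Opt()] * 3                     # three references to one object
independent = [Opt() for _ in range(3)]  # three distinct objects

shared[0].steps += 1
print(shared[1].steps)        # 1 -- "another" optimizer changed too
independent[0].steps += 1
print(independent[1].steps)   # 0 -- unaffected
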
Example #8
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.01,
                 truncate_local_is_ratio=None,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = FirstOrderOptimizer(**optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(VPG, self).__init__(**kwargs)
Example #9
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.0003,
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=80,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters,
                                                 **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(PPO_flat, self).__init__(**kwargs)  # forward the remaining arguments to the base algorithm class

        self.debug_fns = []
Example #10
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.0003,
            latents=None,  # some sort of iterable of the actual latent vectors
            average_period=10,  # average over all the periods
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=80,
            use_skill_dependent_baseline=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                                 max_epochs=train_pi_iters,
                                                 **optimizer_args)
        else:
            self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo,
              self).__init__(**kwargs)  # forward the remaining arguments to the base algorithm class
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        self.sampler = BatchSampler(self)

        # diagonal Gaussian distribution over the low-level policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []
        self.use_skill_dependent_baseline = use_skill_dependent_baseline

        assert isinstance(self.policy, HierarchicalPolicy)
        self.old_policy = copy.deepcopy(self.policy)
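
The `epsilon` argument and the frozen `self.old_policy` copy are the two ingredients of PPO's clipped surrogate objective. The snippet below is the standard clipped-objective formula evaluated on made-up numbers, not code from the Hippo class:

import numpy as np

def clipped_surrogate(ratio, adv, epsilon=0.1):
    # mean of min(r * A, clip(r, 1 - eps, 1 + eps) * A)
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return np.mean(np.minimum(ratio * adv, clipped * adv))

ratio = np.array([0.8, 1.0, 1.3])   # pi_new / pi_old per sample
adv = np.array([1.0, -2.0, 0.5])    # advantages
print(clipped_surrogate(ratio, adv, epsilon=0.1))
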
Example #11
    def __init__(self,
                 num_of_agents,
                 env,
                 policy,
                 policy_list,
                 baseline,
                 baseline_list,
                 optimizer=None,
                 optimizer_args=None,
                 with_critic=True,
                 **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(
                batch_size=None,
                max_epochs=1,
            )
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(**optimizer_args)
            # optimizer = MyFirstOrderOptimizer(**optimizer_args)
        self.optimizer = optimizer
        self.opt_info = None
        self.num_of_agents = num_of_agents
        self.sampler_list = [
            BatchSampler_Multi(self, i, with_critic)
            for i in range(self.num_of_agents)
        ]
        self.optimizer_list = [
            pickle.loads(pickle.dumps(self.optimizer))
            for _ in range(self.num_of_agents)
        ]
        super(VPG_multi, self).__init__(env=env,
                                        policy=policy,
                                        baseline=baseline,
                                        **kwargs)
        self.policy_list = policy_list
        self.baseline_list = baseline_list
Example #12
    def __init__(
            self,
            env,
            policy,
            baseline,
            optimizer=None,
            optimizer_args=None,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(
                batch_size=None,
                max_epochs=1,
            )
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(**optimizer_args)
        self.optimizer = optimizer
        self.opt_info = None
        super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs)
Example #13
    def __init__(self,
                 agents_number,
                 average_period,
                 server_env,
                 policy,
                 baseline,
                 optimizer=None,
                 optimizer_args=None,
                 **kwargs):

        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(
                batch_size=None,
                max_epochs=1,
            )
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = [FirstOrderOptimizer(**optimizer_args)
                         for _ in range(agents_number)]
        self.agents = [
            Agent(env=server_env.agents_envs[k],
                  policy=policy,
                  optimizer=agent_optimizer,
                  baseline=baseline,
                  **kwargs) for k, agent_optimizer in enumerate(optimizer)
        ]
        self.baseline = baseline
        self.average_period = average_period
        super(Server, self).__init__(agents_number=agents_number,
                                     average_period=average_period,
                                     env=server_env,
                                     policy=policy,
                                     baseline=baseline,
                                     optimizer=optimizer,
                                     optimizer_args=optimizer_args,
                                     **kwargs)
Example #14
    def __init__(
        self,
        env_spec,
        policy,
        recurrent=False,
        predict_all=True,
        obs_regressed='all',
        act_regressed='all',
        use_only_sign=False,
        noisify_traj_coef=0,
        optimizer=None,  # this defaults to LBFGS
        regressor_args=None,  # remaining args passed straight to the regressor: hidden_sizes, TR, step_size, ...
    ):
        """
        :param predict_all: this is only for the recurrent case, to use all hidden states as predictions
        :param obs_regressed: list of index of the obs variables used to fit the regressor. default string 'all'
        :param act_regressed: list of index of the act variables used to fit the regressor. default string 'all'
        :param regressor_args:
        """
        self.env_spec = env_spec
        self.policy = policy
        self.latent_dim = policy.latent_dim
        self.recurrent = recurrent
        self.predict_all = predict_all
        self.use_only_sign = use_only_sign
        self.noisify_traj_coef = noisify_traj_coef
        self.regressor_args = regressor_args
        # decide what obs variables will be regressed upon
        if obs_regressed == 'all':
            self.obs_regressed = list(
                range(env_spec.observation_space.flat_dim))
        else:
            self.obs_regressed = obs_regressed
        # decide what action variables will be regressed upon
        if act_regressed == 'all':
            self.act_regressed = list(range(env_spec.action_space.flat_dim))
        else:
            self.act_regressed = act_regressed
        # shape the input dimension of the NN for the above decisions.
        self.obs_act_dim = len(self.obs_regressed) + len(self.act_regressed)

        Serializable.quick_init(self, locals())

        if regressor_args is None:
            regressor_args = dict()

        if optimizer == 'first_order':
            self.optimizer = FirstOrderOptimizer(
                max_epochs=10,  # both of these are to match Rocky's 10
                batch_size=128,
            )
        elif optimizer is None:
            self.optimizer = None
        else:
            raise NotImplementedError

        if policy.latent_name == 'bernoulli':
            if self.recurrent:
                self._regressor = BernoulliRecurrentRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    predict_all=self.predict_all,
                    **regressor_args)
            else:
                self._regressor = BernoulliMLPRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    **regressor_args)
        elif policy.latent_name == 'categorical':
            if self.recurrent:
                self._regressor = CategoricalRecurrentRegressor(  # not implemented
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    # predict_all=self.predict_all,
                    **regressor_args)
            else:
                self._regressor = CategoricalMLPRegressor(
                    input_shape=(self.obs_act_dim, ),
                    output_dim=policy.latent_dim,
                    optimizer=self.optimizer,
                    **regressor_args)
        elif policy.latent_name == 'normal':
            self._regressor = GaussianMLPRegressor(
                input_shape=(self.obs_act_dim, ),
                output_dim=policy.latent_dim,
                optimizer=self.optimizer,
                **regressor_args)
        else:
            raise NotImplementedError
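
`obs_regressed` and `act_regressed` pick out which observation and action components feed the latent regressor. A small sketch of that index selection (NumPy, made-up data; the actual concatenation happens elsewhere in the class):

import numpy as np

obs = np.array([0.1, 0.2, 0.3, 0.4])   # one observation
act = np.array([1.0, -1.0])            # one action

obs_regressed = [0, 2]                 # keep only observation components 0 and 2
act_regressed = 'all'

obs_part = obs[obs_regressed]
act_part = act if act_regressed == 'all' else act[np.asarray(act_regressed)]
regressor_input = np.concatenate([obs_part, act_part])
print(regressor_input)                 # [ 0.1  0.3  1.  -1. ]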