Example #1
    def build_lr(self,
                 net,
                 param_init_net,
                 base_learning_rate,
                 learning_rate_blob=None,
                 policy="fixed",
                 iter_val=0,
                 **kwargs):
        if learning_rate_blob is None:
            learning_rate_blob = self.make_unique_blob_name('lr')

        iteration = utils.BuildUniqueMutexIter(param_init_net,
                                               net,
                                               iter_val=iter_val)

        if not net.BlobIsDefined(learning_rate_blob):
            # Note: since we are minimizing (i.e. doing gradient "descent"),
            # the base learning rate is set to be negative.
            lr = net.LearningRate([iteration],
                                  learning_rate_blob,
                                  base_lr=-base_learning_rate,
                                  policy=policy,
                                  **kwargs)
        else:
            lr = net.GetBlobRef(learning_rate_blob)

        if self._lr_multiplier is not None:
            lr_multiplier = net.CopyFromCPUInput(
                self._lr_multiplier,
                self.make_unique_blob_name('lr_multiplier'))

            lr = net.Mul(
                [lr, lr_multiplier],
                self.make_unique_blob_name('scaled_lr'),
                broadcast=1,
            )

        if self._local_lr_multiplier is not None:
            current_scope = scope.CurrentDeviceScope()
            if (current_scope is not None
                    and current_scope.device_type == caffe2_pb2.CUDA
                    and not self._local_lr_multiplier_on_gpu):
                local_lr_multiplier = net.CopyFromCPUInput(
                    self._local_lr_multiplier,
                    self.make_unique_blob_name('local_lr_multiplier'))
            else:
                local_lr_multiplier = self._local_lr_multiplier

            lr = net.Mul(
                [lr, local_lr_multiplier],
                self.make_unique_blob_name('local_scaled_lr'),
                broadcast=1,
            )

        return lr, iteration
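Not part of the original listing: a minimal sketch of how an SGD-style optimizer's _run might consume the blobs returned by build_lr, loosely modeled on Caffe2's SgdOptimizer. The attributes self.base_learning_rate and self.policy, and the param_info.blob / param_info.grad convention, are assumptions here; values are illustrative.

    def _run(self, net, param_init_net, param_info):
        # Hypothetical layout: param_info.blob / param_info.grad follow the
        # convention visible in Example #4 below.
        param = param_info.blob
        grad = param_info.grad
        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,  # assumed attribute
            policy=self.policy,                          # assumed attribute
        )
        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        # base_lr is negated inside build_lr, so a single WeightedSum performs
        # the descent update: param <- param + (-lr) * grad
        net.WeightedSum([param, ONE, grad, lr], param)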
Example #2
    def testBuildUniqueMutexIter(self):
        init_net = core.Net("init_net")
        net = core.Net("net")
        utils.BuildUniqueMutexIter(init_net, net)

        for op in init_net.Proto().op:
            self.assertEqual(op.device_option.extra_info[0],
                             "device_type_override:cpu")

        for op in net.Proto().op:
            self.assertEqual(op.device_option.extra_info[0],
                             "device_type_override:cpu")
Example #3
    def _run_on_loss(self, net, param_init_net, param, grad=None):
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # Since we are most likely minimizing, the discount uses a negative
        # base learning rate.
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate([iteration], [discount],
                         base_lr=-self.reg_lambda,
                         policy=self.discount_policy,
                         **self.discount_options)
        # TODO(xlwang): param might still be negative at initialization time, or
        # slightly negative due to distributed training. Enforce its
        # non-negativity for now (at least above machine epsilon).
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob
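As a reading aid (not part of the original snippet): the ops above evaluate a log-barrier term of the form discount * sum(log(clip(param, kEpsilon))), where discount decays according to discount_policy and is negative because base_lr is negated. A NumPy analogue, with assumed names in the comments:

import numpy as np

def log_barrier(param, discount, eps=1e-9):
    # eps plays the role of self.kEpsilon; discount stands in for the
    # (negative) LearningRate output at the current iteration.
    return discount * np.sum(np.log(np.clip(param, eps, None)))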
Example #4
    def _run(self, net, param_init_net, param_info):

        # Note: This is the number of persistent scalars in the YellowFin
        #       optimizer. It must match the number of scalars actually used,
        #       and the same number must be used by the YellowFin operator.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill([param],
                                             param + "_moment",
                                             value=0.0)
        curv_win = param_init_net.ConstantFill([],
                                               param + "_curv_win",
                                               shape=[self.curv_win_width],
                                               value=0.0)
        g_avg = param_init_net.ConstantFill([param],
                                            param + "_g_avg",
                                            value=0.0)
        g2_avg = param_init_net.ConstantFill([param],
                                             param + "_g2_avg",
                                             value=0.0)
        lr_avg = param_init_net.ConstantFill([],
                                             param + "_lr_avg",
                                             shape=[1],
                                             value=self.alpha)
        mu_avg = param_init_net.ConstantFill([],
                                             param + "_mu_avg",
                                             shape=[1],
                                             value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [],
            param + "_scalars_memory",
            shape=[SCALARS_MEMORY_SIZE],
            value=0.0)

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=0)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [
            param, moment, lr_avg, mu_avg, curv_win, g_avg, g2_avg,
            scalars_memory
        ]

        net.YellowFin(yf_in_out_args + [grad, iteration],
                      yf_in_out_args,
                      beta=self.beta,
                      epsilon=self.epsilon,
                      curv_win_width=self.curv_win_width,
                      zero_debias=self.zero_debias)
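A hypothetical end-to-end sketch of wiring this optimizer into a model, assuming Caffe2's model_helper and a build_yellowfin helper in the same optimizer module; the hyperparameter value is illustrative:

from caffe2.python import model_helper, optimizer

model = model_helper.ModelHelper(name="train")
# ... forward ops, loss, and gradient operators would be added here ...
optimizer.build_yellowfin(
    model,
    base_learning_rate=0.1,  # used as alpha; other knobs referenced above
                             # (mu, beta, curv_win_width, epsilon, zero_debias)
                             # would be forwarded to the optimizer constructor
)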