Example #1
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 d_model,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 grad_norm=1.0,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8):
    """Optimization function"""
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            # scheduled_lr = fluid.layers.learning_rate_scheduler \
            #     .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
            #                 warmup_steps)
            with fluid.default_main_program()._lr_schedule_guard():
                scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                    d_model, warmup_steps) * learning_rate
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr,
                                         beta1=beta1,
                                         beta2=beta2,
                                         epsilon=epsilon)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[
            fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=grad_norm))

    def exclude_from_weight_decay(name):
        """params exclude from weight decay"""
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(loss_scaling, master_param_grads,
                                       incr_every_n_steps,
                                       decr_every_n_nan_or_inf, incr_ratio,
                                       decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                # Strip the fp16 ".master" suffix before matching the name
                # (str.rstrip removes a character set, not a suffix).
                if exclude_from_weight_decay(param.name.replace(".master", "")):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param.name):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
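
Both branches above apply decoupled weight decay after the Adam update, and both rely on repo-local helpers (`linear_warmup_decay`, `create_master_params_grads`, `master_param_to_train_param`, `apply_dynamic_loss_scaling`) that are not part of this listing. As a reference point only, a minimal sketch of a `linear_warmup_decay` compatible with the call above, assuming the PaddlePaddle 1.x static-graph (`fluid`) API, could look like this: linear warmup from 0 to `learning_rate` over `warmup_steps`, then linear decay to 0 over `num_train_steps`.

import paddle.fluid as fluid


def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """Linear warmup from 0, then linear (polynomial, power=1) decay to 0."""
    with fluid.default_main_program()._lr_schedule_guard():
        lr = fluid.layers.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")

        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()

        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                # Ramp the learning rate linearly from 0 to `learning_rate`.
                warmup_lr = learning_rate * (global_step / warmup_steps)
                fluid.layers.assign(warmup_lr, lr)
            with switch.default():
                # Decay linearly to 0 over the full training run.
                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr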
Example #2
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8):

    scheduled_lr, loss_scaling = None, None
    if scheduler == 'noam_decay':
        if warmup_steps > 0:
            scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                1 / (warmup_steps * (learning_rate ** 2)), warmup_steps)
        else:
            print(
                "WARNING: noam decay of learning rate should have positive "
                "warmup steps but given {}, using constant learning rate "
                "instead!".format(warmup_steps))
            scheduled_lr = fluid.layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=learning_rate,
                dtype='float32',
                persistable=True)
    elif scheduler == 'linear_warmup_decay':
        if warmup_steps > 0:
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            print(
                "WARNING: linear warmup decay of learning rate should have "
                "positive warmup steps but given {}, using constant learning "
                "rate instead!".format(warmup_steps))
            scheduled_lr = fluid.layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=learning_rate,
                dtype='float32',
                persistable=True)
    else:
        raise ValueError("Unkown learning rate scheduler, should be "
                         "'noam_decay' or 'linear_warmup_decay'")

    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=1.0))

    def exclude_from_weight_decay(param):
        name = param.name.rstrip(".master")
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    if use_fp16:
        loss_scaling = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("loss_scaling"),
            shape=[1],
            value=init_loss_scaling,
            dtype='float32',
            persistable=True)
        loss *= loss_scaling

        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        if weight_decay > 0:
            for param, _ in master_param_grads:
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(loss_scaling, master_param_grads,
                                       incr_every_n_steps,
                                       decr_every_n_nan_or_inf, incr_ratio,
                                       decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        if weight_decay > 0:
            for param in train_program.global_block().all_parameters():
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
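
One difference from Example #1 is how `noam_decay` is parameterized: `fluid.layers.noam_decay(d_model, warmup_steps)` evaluates roughly `d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)`, so passing `1 / (warmup_steps * learning_rate ** 2)` in place of the real model width makes the schedule peak at exactly `learning_rate` when `step == warmup_steps` (Example #1 instead passes the real `d_model` and scales the result by `learning_rate`). A small numeric check of that identity, assuming the formula above and using illustrative values:

def noam_lr(step, d_model, warmup_steps):
    # lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)


learning_rate, warmup_steps = 5e-5, 4000      # illustrative values
d_model = 1.0 / (warmup_steps * learning_rate ** 2)
peak = noam_lr(warmup_steps, d_model, warmup_steps)
assert abs(peak - learning_rate) < 1e-12      # schedule peaks at learning_rate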
Example #3
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_lamb=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 layer_decay_rate=0.0,
                 n_layers=12):
    def exclude_from_weight_decay(param):
        """Skip weight decay for LayerNorm and bias parameters."""
        # Strip the fp16 ".master" suffix before matching the name
        # (str.rstrip removes a character set, not a suffix).
        name = param.name.replace(".master", "")
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                1 / (warmup_steps * (learning_rate ** 2)), warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if not use_lamb:
            log.debug('using Adam')
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        else:
            log.debug('using Lamb')
            optimizer = fluid.optimizer.Lamb(
                learning_rate=scheduled_lr,
                lamb_weight_decay=weight_decay,
                exclude_from_weight_decay_fn=exclude_from_weight_decay)
        optimizer._learning_rate_map[
            fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        from utils.fp16 import (create_master_params_grads,
                                master_param_to_train_param,
                                apply_dynamic_loss_scaling)
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(
                loss_scaling, master_param_grads, incr_every_n_steps,
                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if not use_lamb and weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)
        if layer_decay_rate > 0:
            for param, grad in param_grads:
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("layer_decay"):
                    param_decay = layer_decay(param, param_list[param.name],
                                              scheduled_lr, layer_decay_rate,
                                              n_layers)
                    if param_decay:
                        fluid.layers.assign(output=param, input=param_decay)

        if not use_lamb and weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
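
In all three examples the weight decay is decoupled from the gradient: a `stop_gradient` snapshot of each parameter (`param * 1.0`) is taken before the optimizer step, and after `apply_gradients`/`minimize` each non-excluded parameter is shrunk by `weight_decay * scheduled_lr` times that snapshot (with LAMB, the decay is handled inside the optimizer via `lamb_weight_decay` instead). A minimal NumPy sketch of one such step, with `adam_update` as a hypothetical stand-in for whatever update the optimizer applies (not a Paddle API):

import numpy as np


def decoupled_weight_decay_step(param, grad, lr, weight_decay, adam_update):
    """One training step with AdamW-style decoupled weight decay."""
    snapshot = param.copy()                        # param_list[name] = param * 1.0
    param = adam_update(param, grad, lr)           # optimizer.apply_gradients(...)
    param = param - snapshot * weight_decay * lr   # the "weight_decay" name_scope
    return param


# Plain SGD stand-in for the optimizer update, just to make the sketch runnable.
sgd = lambda p, g, lr: p - lr * g
w = decoupled_weight_decay_step(np.ones(3), np.full(3, 0.1), 1e-3, 0.01, sgd)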