def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
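    # Read the number of labels from the saved model config.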
    config_path = os.path.join(args.model_name_or_path, 'model_config.json')
    with open(config_path, 'r', encoding='utf-8') as f:
        cfg_dict = json.load(f)
    num_labels = cfg_dict['num_classes']

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    origin_model = model_class.from_pretrained(args.model_name_or_path,
                                               num_classes=num_labels)

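    # Convert the model into a width-adaptive supernet and wrap it with OFA.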
    sp_config = supernet(expand_ratio=[1.0, args.width_mult])
    model = Convert(sp_config).convert(model)

    ofa_model = OFA(model)

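    # Load the trained supernet weights and export the sub-model for the requested width.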
    sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    ofa_model.model.set_state_dict(sd)
    best_config = utils.dynabert_config(ofa_model, args.width_mult)
    ofa_model.export(best_config,
                     input_shapes=[[1, args.max_seq_length],
                                   [1, args.max_seq_length]],
                     input_dtypes=['int64', 'int64'],
                     origin_model=origin_model)
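    # Shrink the attention head count of the exported model to match width_mult.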
    for name, sublayer in origin_model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    output_dir = os.path.join(args.sub_model_output_dir,
                              "model_width_%.5f" % args.width_mult)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = origin_model
    model_to_save.save_pretrained(output_dir)

    if args.static_sub_model is not None:
        export_static_model(origin_model, args.static_sub_model,
                            args.max_seq_length)
Example #2
def _dynabert_export(task_name, ofa_model, dynabert_config, output_dir):
    from paddleslim.nas.ofa import OFA, DistillConfig, utils
    ofa_model.model.base_model_class.forward = auto_model_forward
    ofa_model._add_teacher = False
    _recover_transormer_func()

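    # Rebuild and export a pruned sub-model for every width multiplier.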
    for width_mult in dynabert_config.width_mult_list:
        model_dir = os.path.join(output_dir, str(width_mult))
        state_dict = paddle.load(
            os.path.join(model_dir, "model_state.pdparams"))
        if "cmrc2018" in task_name:
            origin_model = AutoModelForQuestionAnswering.from_pretrained(
                model_dir)
        elif task_name == "msra_ner":
            origin_model = AutoModelForTokenClassification.from_pretrained(
                model_dir)
        else:
            origin_model = AutoModelForSequenceClassification.from_pretrained(
                model_dir)
        ofa_model.model.set_state_dict(state_dict)
        best_config = utils.dynabert_config(ofa_model, width_mult)
        origin_model_new = ofa_model.export(
            best_config,
            input_shapes=[[1, 1], [1, 1]],
            input_dtypes=['int64', 'int64'],
            origin_model=origin_model)

        for name, sublayer in origin_model_new.named_sublayers():
            if isinstance(sublayer, paddle.nn.MultiHeadAttention):
                sublayer.num_heads = int(width_mult * sublayer.num_heads)

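        # Convert the pruned sub-model to a static graph and save it for inference.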
        input_shape = [
            paddle.static.InputSpec(
                shape=[None, None], dtype='int64'), paddle.static.InputSpec(
                    shape=[None, None], dtype='int64')
        ]
        pruned_infer_model_dir = os.path.join(
            model_dir, dynabert_config.output_filename_prefix)
        net = paddle.jit.to_static(origin_model_new, input_spec=input_shape)
        paddle.jit.save(net, pruned_infer_model_dir)
Example #3
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_ds_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_ds_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.batch_size, shuffle=False)
        dev_data_loader = DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Step1: Initialize a dictionary to save the weights from the origin BERT model.
    origin_weights = {}
    for name, param in model.named_parameters():
        origin_weights[name] = param

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['bert.embeddings']
    for idx in range(model.bert.config['num_hidden_layers']):
        mapping_layers.append('bert.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and heads,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in ofa_model.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if args.task_name == 'sts-b':
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            ofa_model.model.clear_grad()

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if args.task_name == "mnli":
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader_matched,
                        width_mult=100)
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader_mismatched,
                        width_mult=100)
                else:
                    evaluate(
                        teacher_model,
                        criterion,
                        metric,
                        dev_data_loader,
                        width_mult=100)
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))

                    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
Example #4
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config_path = os.path.join(args.model_name_or_path, 'model_config.json')
    with open(config_path, 'r', encoding='utf-8') as f:
        cfg_dict = json.load(f)

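    # If depth_mult < 1.0, shrink the layer count and record which original layers are kept.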
    kept_layers_index = {}
    if args.depth_mult < 1.0:
        depth = round(cfg_dict["init_args"][0]['num_hidden_layers'] *
                      args.depth_mult)
        cfg_dict["init_args"][0]['num_hidden_layers'] = depth
        for idx, i in enumerate(range(1, depth + 1)):
            kept_layers_index[idx] = math.floor(i / args.depth_mult) - 1

    os.rename(config_path, config_path + '_bak')
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(cfg_dict, ensure_ascii=False))

    num_labels = cfg_dict['num_classes']

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    origin_model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    os.rename(config_path + '_bak', config_path)

    sp_config = supernet(expand_ratio=[1.0, args.width_mult])
    model = Convert(sp_config).convert(model)

    ofa_model = OFA(model)

    sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))

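    # Load the supernet weights, remapping encoder layer indices when layers were dropped.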
    if len(kept_layers_index) == 0:
        ofa_model.model.set_state_dict(sd)
    else:
        for name, params in ofa_model.model.named_parameters():
            if 'encoder' not in name:
                params.set_value(sd[name])
            else:
                idx = int(name.strip().split('.')[3])
                mapping_name = name.replace(
                    '.' + str(idx) + '.',
                    '.' + str(kept_layers_index[idx]) + '.')
                params.set_value(sd[mapping_name])

    best_config = utils.dynabert_config(ofa_model, args.width_mult)
    for name, sublayer in ofa_model.model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    ofa_model.export(
        best_config,
        input_shapes=[[1, args.max_seq_length], [1, args.max_seq_length]],
        input_dtypes=['int64', 'int64'],
        origin_model=origin_model)
    for name, sublayer in origin_model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    output_dir = os.path.join(args.sub_model_output_dir,
                              "model_width_%.5f" % args.width_mult)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = origin_model
    model_to_save.save_pretrained(output_dir)

    if args.static_sub_model is not None:
        export_static_model(origin_model, args.static_sub_model,
                            args.max_seq_length)
Example #5
def _dynabert_training(self, task_name, ofa_model, model, teacher_model,
                       train_dataloader, eval_dataloader, width_mult_list,
                       criterion, num_train_epochs, output_dir):
    metric = Accuracy()
    if task_name == "msra_ner":
        metric = ChunkEvaluator(label_list=self.train_dataset.label_list)

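    # Evaluation helper covering QA (cmrc2018), NER (msra_ner) and sequence classification.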
    @paddle.no_grad()
    def evaluate(model, criterion, data_loader, width_mult=1.0):
        model.eval()
        all_start_logits = []
        all_end_logits = []
        metric.reset()
        for batch in data_loader:
            if "cmrc2018" in task_name:
                input_ids, token_type_ids = batch['input_ids'], batch[
                    'token_type_ids']
                logits = model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                if width_mult == 100:
                    start_logits_tensor, end_logits_tensor = logits
                else:
                    start_logits_tensor, end_logits_tensor = logits[0]
                for idx in range(start_logits_tensor.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        logger.info("Processing example: %d" %
                                    len(all_start_logits))
                    all_start_logits.append(start_logits_tensor.numpy()[idx])
                    all_end_logits.append(end_logits_tensor.numpy()[idx])

            else:
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
                logits = model(
                    input_ids, segment_ids, attention_mask=[None, None])
                if isinstance(logits, tuple):
                    logits = logits[0]
                loss = criterion(logits, labels)
                if task_name == "msra_ner":
                    preds = logits.argmax(axis=2)
                    num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                        batch['seq_len'], preds, batch['labels'])
                    metric.update(num_infer_chunks.numpy(),
                                  num_label_chunks.numpy(),
                                  num_correct_chunks.numpy())
                else:
                    correct = metric.compute(logits, labels)
                    metric.update(correct)
        if "cmrc2018" in task_name:
            n_best_size = 20
            max_answer_length = 50
            all_predictions, _, _ = compute_prediction(
                self.eval_examples, self.eval_dataset,
                (all_start_logits, all_end_logits), False, n_best_size,
                max_answer_length)
            res = squad_evaluate(
                examples=[raw_data for raw_data in self.eval_examples],
                preds=all_predictions,
                is_whitespace_splited=False)
            if width_mult == 100:
                logger.info("teacher model, EM: %f, F1: %f" %
                            (res['exact'], res['f1']))
            else:
                logger.info("width_mult: %s, EM: %f, F1: %f, " %
                            (str(width_mult), res['exact'], res['f1']))
            res = res['exact']
        else:
            res = metric.accumulate()
            # Teacher model's evaluation
            if task_name == "msra_ner":
                if width_mult == 100:
                    logger.info(
                        "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (paddle.mean(loss).numpy(), res[0], res[1], res[2]))
                else:
                    logger.info(
                        "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (str(width_mult), paddle.mean(loss).numpy(), res[0],
                           res[1], res[2]))
                res = res[2]
            else:
                if width_mult == 100:
                    logger.info("teacher model, eval loss: %f, acc: %s, " %
                                (loss.numpy(), res))
                else:
                    logger.info("width_mult: %s, eval loss: %f, acc: %s, " %
                                (str(width_mult), loss.numpy(), res))
        model.train()
        return res

    from paddleslim.nas.ofa import OFA, DistillConfig, utils
    global_step = 0
    lambda_logit = 1.0
    tic_train = time.time()
    best_acc = 0.0
    acc = 0.0
    logger.info("DynaBERT training starts. This period will cost some time.")
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_dataloader):
            global_step += 1
            if "cmrc2018" in task_name:
                input_ids, token_type_ids, start_positions, end_positions = batch[
                    'input_ids'], batch['token_type_ids'], batch[
                        'start_positions'], batch['end_positions']
            else:
                input_ids, token_type_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            for width_mult in width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if "cmrc2018" in task_name:
                    logit_loss = (
                        soft_cross_entropy(logits[0], teacher_logits[0].detach()) +
                        soft_cross_entropy(logits[1], teacher_logits[1].detach())) / 2
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + lambda_logit * logit_loss
                loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.clear_grad()

            if global_step % self.args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           self.args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0:
                tic_eval = time.time()

                evaluate(
                    teacher_model, criterion, eval_dataloader, width_mult=100)
                logger.info("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    acc = evaluate(ofa_model, criterion, eval_dataloader,
                                   width_mult)
                    if acc > best_acc:
                        best_acc = acc
                        if paddle.distributed.get_rank() == 0:
                            output_dir_width = os.path.join(output_dir,
                                                            str(width_mult))
                            if not os.path.exists(output_dir_width):
                                os.makedirs(output_dir_width)
                            # need better way to get inner model of DataParallel
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir_width)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
            if global_step > self.args.num_training_steps:
                if best_acc == 0.0:
                    output_dir_width = os.path.join(output_dir, str(width_mult))
                    if not os.path.exists(output_dir_width):
                        os.makedirs(output_dir_width)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir_width)
                logger.info("Best acc: %.4f" % (best_acc))
                return ofa_model

        if "cmrc2018" in task_name:
            tic_eval = time.time()
            evaluate(teacher_model, criterion, eval_dataloader, width_mult=100)
            logger.info("eval done total : %s s" % (time.time() - tic_eval))
            for idx, width_mult in enumerate(width_mult_list):
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                tic_eval = time.time()
                acc = evaluate(ofa_model, criterion, eval_dataloader,
                               width_mult)
                if acc > best_acc:
                    best_acc = acc
                    if paddle.distributed.get_rank() == 0:
                        output_dir_width = os.path.join(output_dir,
                                                        str(width_mult))
                        if not os.path.exists(output_dir_width):
                            os.makedirs(output_dir_width)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir_width)
                logger.info("eval done total : %s s" % (time.time() - tic_eval))

    logger.info("Best acc: %.4f" % (best_acc))
    return ofa_model
Example #6
def test_dynabert(self):
    self.model = TestModel()
    sp_net_config = supernet(expand_ratio=[0.5, 1.0])
    self.model = Convert(sp_net_config).convert(self.model)
    ofa_model = OFA(self.model)
    config = dynabert_config(ofa_model, 0.5)
Example #7
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    train_ds = load_dataset('clue', args.task_name, splits='train')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    super_sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    model.set_state_dict(super_sd)

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['ppminilm.embeddings']
    for idx in range(model.ppminilm.config['num_hidden_layers']):
        mapping_layers.append('ppminilm.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    #### Step6: Calculate the importance of neurons and heads,
    #### and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.ppminilm.config['num_hidden_layers'],
        num_heads=model.ppminilm.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    global_step = 0
    tic_train = time.time()
    best_res = 0.0
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                logit_loss = soft_cross_entropy(logits,
                                                teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(teacher_model,
                         metric,
                         dev_data_loader,
                         width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    res = evaluate(ofa_model, metric, dev_data_loader,
                                   width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    if best_res < res:
                        output_dir = args.output_dir
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        best_res = res
            if global_step >= num_training_steps:
                print("best_res: ", best_res)
                return
    print("best_res: ", best_res)
Example #8
                depth_mult_list = [1.0]
            else:
                ofa_model.set_task('depth')
                depth_mult_list = run_config.elastic_depth
            for step, d in enumerate(
                    tqdm(
                        train_ds.start(place), desc='training')):
                ids, sids, label = d

                accumulate_gradients = dict()
                for param in opt._parameter_list:
                    accumulate_gradients[param.name] = 0.0

                for depth_mult in depth_mult_list:
                    for width_mult in args.width_mult_list:
                        net_config = utils.dynabert_config(
                            ofa_model, width_mult, depth_mult=depth_mult)
                        ofa_model.set_net_config(net_config)

                        student_output, teacher_output = ofa_model(
                            ids,
                            sids,
                            labels=label,
                            num_layers=model_cfg['num_hidden_layers'])
                        loss, student_logit, student_reps = student_output[
                            0], student_output[1], student_output[2]['hiddens']
                        teacher_logit, teacher_reps = teacher_output[
                            1], teacher_output[2]['hiddens']

                        if ofa_model.task == 'depth':
                            depth_mult = ofa_model.current_config['depth']
                            depth = round(model_cfg['num_hidden_layers'] *