Example #1
 def test_elementwise(self):
     self.ofa_model = OFA(self.model)
     self.ofa_model.set_epoch(0)
     self.ofa_model.set_task('expand_ratio')
     out, _ = self.ofa_model(self.images)
     assert list(
         self.ofa_model._ofa_layers.keys()) == ['conv2.0', 'conv3.0']
Example #2
    def test_ofa(self):
        ofa_model = OFA(self.model,
                        self.run_config,
                        distill_config=self.distill_config)

        start_epoch = 0
        for idx in range(len(self.run_config.n_epochs)):
            cur_idx = self.run_config.n_epochs[idx]
            for ph_idx in range(len(cur_idx)):
                cur_lr = self.run_config.init_learning_rate[idx][ph_idx]
                adam = fluid.optimizer.Adam(
                    learning_rate=cur_lr,
                    parameter_list=(
                        ofa_model.parameters() + ofa_model.netAs_param))
                for epoch_id in range(start_epoch,
                                      self.run_config.n_epochs[idx][ph_idx]):
                    for model_no in range(self.run_config.dynamic_batch_size[
                            idx]):
                        output, _ = ofa_model(self.data)
                        loss = fluid.layers.reduce_mean(output)
                        if self.distill_config.mapping_layers is not None:
                            dis_loss = ofa_model.calc_distill_loss()
                            loss += dis_loss
                            dis_loss = dis_loss.numpy()[0]
                        else:
                            dis_loss = 0
                        print('epoch: {}, loss: {}, distill loss: {}'.format(
                            epoch_id, loss.numpy()[0], dis_loss))
                        loss.backward()
                        adam.minimize(loss)
                        adam.clear_gradients()
                start_epoch = self.run_config.n_epochs[idx][ph_idx]
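The loop above uses the legacy `fluid` API. For reference, a minimal sketch of the equivalent inner training step with the current `paddle` API (an assumption-based rewrite, not part of the original test):

import paddle

# Sketch only: `ofa_model`, `cur_lr`, `self.data` and `self.distill_config`
# are taken from the test above.
adam = paddle.optimizer.Adam(
    learning_rate=cur_lr,
    parameters=ofa_model.parameters() + ofa_model.netAs_param)

output, _ = ofa_model(self.data)
loss = paddle.mean(output)
if self.distill_config.mapping_layers is not None:
    loss += ofa_model.calc_distill_loss()
loss.backward()
adam.step()
adam.clear_grad()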
Example #3
class TestExport(unittest.TestCase):
    def setUp(self):
        self._init_model()

    def _init_model(self):
        self.origin_model = ModelOriginLinear()
        model = ModelLinear()
        self.ofa_model = OFA(model)

    def test_ofa(self):
        config = {
            'embedding_1': {
                'expand_ratio': (2.0)
            },
            'linear_3': {
                'expand_ratio': (2.0)
            },
            'linear_4': {},
            'linear_5': {}
        }
        origin_dict = {}
        for name, param in self.origin_model.named_parameters():
            origin_dict[name] = param.shape
        self.ofa_model.export(self.origin_model,
                              config,
                              input_shapes=[[1, 64]],
                              input_dtypes=['int64'])
        for name, param in self.origin_model.named_parameters():
            if name in config.keys():
                if 'expand_ratio' in config[name]:
                    assert origin_dict[name][
                        -1] == param.shape[-1] * config[name]['expand_ratio']
Example #4
class TestInputDict(unittest.TestCase):
    def setUp(self):
        model = ModelInputDict()

        sp_net_config = supernet(expand_ratio=[0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')
        self.images2 = {
            'data': paddle.randn(shape=[2, 12, 32, 32], dtype='float32')
        }
        default_run_config = {'skip_layers': ['conv1.0', 'conv2.0']}
        self.run_config = RunConfig(**default_run_config)

        self.ofa_model = OFA(self.model, run_config=self.run_config)
        self.ofa_model._clear_search_space(self.images, data=self.images2)

    def test_export(self):

        config = self.ofa_model._sample_config(task="expand_ratio",
                                               sample_type="smallest")
        self.ofa_model.export(config,
                              input_shapes=[[1, 3, 32, 32], {
                                  'data': [1, 12, 32, 32]
                              }],
                              input_dtypes=['float32', 'float32'])
Example #5
 def setUp(self):
     model = ModelShortcut()
     sp_net_config = supernet(expand_ratio=[0.5, 1.0])
     self.model = Convert(sp_net_config).convert(model)
     self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')
     self.init_config()
     self.ofa_model = OFA(self.model, run_config=self.run_config)
     self.ofa_model._clear_search_space(self.images)
Example #6
 def test_multiexit(self):
     self.ofa_model = OFA(self.model)
     self.ofa_model.set_epoch(0)
     self.ofa_model.set_task('expand_ratio')
     out, _ = self.ofa_model(self.images)
     assert list(self.ofa_model._ofa_layers.keys()) == [
         'conv1.0', 'block1.0', 'block1.4', 'block2.0', 'out2.0'
     ]
Example #7
 def setUp(self):
     model = ModelLinear()
     data_np = np.random.random((3, 64)).astype(np.int64)
     self.data = paddle.to_tensor(data_np)
     self.ofa_model = OFA(model)
     self.ofa_model.set_epoch(0)
     outs, _ = self.ofa_model(self.data)
     self.config = self.ofa_model.current_config
Example #8
def _dynabert_init(task_name, model, eval_dataloader, criterion,
                   width_mult_list):
    from paddleslim.nas.ofa.convert_super import Convert, supernet
    from paddleslim.nas.ofa import OFA, DistillConfig, utils

    # Step1: Initialize a dictionary to save the weights from the origin model.
    origin_weights = model.state_dict()

    # Step2: Define teacher model.
    teacher_model = copy.deepcopy(model)

    # Step3: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)

    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step4: Config about distillation.
    mapping_layers = [model.base_model_prefix + '.embeddings']
    for idx in range(model.base_model.config['num_hidden_layers']):
        mapping_layers.append(model.base_model_prefix +
                              '.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    # Step6: Calculate the importance of neurons and heads,
    # and then reorder them according to that importance.
    # NOTE: Importing `nlp_utils` rewrites the `forward` functions of
    # TransformerEncoder, TransformerEncoderLayer and MultiHeadAttention,
    # as well as the `_prepare_qkv` function of MultiHeadAttention.
    from paddleslim.nas.ofa.utils import nlp_utils

    head_importance, neuron_importance = compute_neuron_head_importance(
        task_name=task_name,
        model=ofa_model.model,
        data_loader=eval_dataloader,
        loss_fct=criterion,
        num_layers=model.base_model.config['num_hidden_layers'],
        num_heads=model.base_model.config['num_attention_heads'])

    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    return ofa_model, teacher_model
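A minimal sketch of how this helper might be called; `model`, `eval_dataloader` and the loss below are placeholders, not part of the original function:

import paddle

# Hypothetical call site: `model` is a pretrained PaddleNLP classification
# model and `eval_dataloader` is a paddle.io.DataLoader over the dev set.
ofa_model, teacher_model = _dynabert_init(
    task_name='sst-2',
    model=model,
    eval_dataloader=eval_dataloader,
    criterion=paddle.nn.CrossEntropyLoss(),
    width_mult_list=[1.0, 0.75, 0.5])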
Example #9
class TestOFAV2(unittest.TestCase):
    def setUp(self):
        model = ModelV1()
        sp_net_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')

    def test_ofa(self):
        self.ofa_model = OFA(self.model)
        self.ofa_model.set_epoch(0)
        self.ofa_model.set_task('expand_ratio')
        out, _ = self.ofa_model(self.images)
Example #10
class Testelementwise(unittest.TestCase):
    def setUp(self):
        model = ModelElementwise()
        sp_net_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')

    def test_elementwise(self):
        self.ofa_model = OFA(self.model)
        self.ofa_model.set_epoch(0)
        self.ofa_model.set_task('expand_ratio')
        out, _ = self.ofa_model(self.images)
        assert list(
            self.ofa_model._ofa_layers.keys()) == ['conv2.0', 'conv3.0']
Example #11
class TestExportCase1(unittest.TestCase):
    def setUp(self):
        model = ModelLinear1()
        data_np = np.random.random((3, 64)).astype(np.int64)
        self.data = paddle.to_tensor(data_np)
        self.ofa_model = OFA(model)
        self.ofa_model.set_epoch(0)
        outs, _ = self.ofa_model(self.data)
        self.config = self.ofa_model.current_config

    def test_export_model(self):
        self.ofa_model.export(
            self.config, input_shapes=[[3, 64]], input_dtypes=['int64'])
        assert len(self.ofa_model.ofa_layers) == 4
Example #12
    def setUp(self):
        model = ModelInputDict()

        sp_net_config = supernet(expand_ratio=[0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')
        self.images2 = {
            'data': paddle.randn(shape=[2, 12, 32, 32], dtype='float32')
        }
        default_run_config = {'skip_layers': ['conv1.0', 'conv2.0']}
        self.run_config = RunConfig(**default_run_config)

        self.ofa_model = OFA(self.model, run_config=self.run_config)
        self.ofa_model._clear_search_space(self.images, data=self.images2)
Example #13
class TestOFAV2Export(unittest.TestCase):
    def setUp(self):
        model = ModelV1(name='export')
        sp_net_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')
        self.ofa_model = OFA(self.model)

    def test_export(self):
        origin_model = ModelV1(name='origin')
        net_config = {'model.0': {}}
        self.ofa_model.export(net_config,
                              input_shapes=[1, 3, 32, 32],
                              input_dtypes=['float32'],
                              origin_model=origin_model)
Example #14
class TestMultiExit(unittest.TestCase):
    def setUp(self):
        self.images = paddle.randn(shape=[1, 3, 224, 224], dtype='float32')
        model = ModelMultiExit()
        sp_net_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)

    def test_multiexit(self):
        self.ofa_model = OFA(self.model)
        self.ofa_model.set_epoch(0)
        self.ofa_model.set_task('expand_ratio')
        out, _ = self.ofa_model(self.images)
        assert list(self.ofa_model._ofa_layers.keys()) == [
            'conv1.0', 'block1.0', 'block1.4', 'block2.0', 'out2.0'
        ]
Example #15
class TestExportCase2(unittest.TestCase):
    def setUp(self):
        model = ModelLinear()
        data_np = np.random.random((3, 64)).astype(np.int64)
        self.data = paddle.to_tensor(data_np)
        self.ofa_model = OFA(model)
        self.ofa_model.set_epoch(0)
        outs, _ = self.ofa_model(self.data)
        self.config = self.ofa_model.current_config

    def test_export_model_linear2(self):
        config = self.ofa_model._sample_config(
            task='expand_ratio', phase=None, sample_type='smallest')
        ex_model = self.ofa_model.export(
            config, input_shapes=[[3, 64]], input_dtypes=['int64'])
        ex_model(self.data)
        assert len(self.ofa_model.ofa_layers) == 3
Example #16
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config_path = os.path.join(args.model_name_or_path, 'model_config.json')
    cfg_dict = dict(json.loads(open(config_path).read()))
    num_labels = cfg_dict['num_classes']

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    origin_model = model_class.from_pretrained(args.model_name_or_path,
                                               num_classes=num_labels)

    sp_config = supernet(expand_ratio=[1.0, args.width_mult])
    model = Convert(sp_config).convert(model)

    ofa_model = OFA(model)

    sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    ofa_model.model.set_state_dict(sd)
    best_config = utils.dynabert_config(ofa_model, args.width_mult)
    ofa_model.export(best_config,
                     input_shapes=[[1, args.max_seq_length],
                                   [1, args.max_seq_length]],
                     input_dtypes=['int64', 'int64'],
                     origin_model=origin_model)
    for name, sublayer in origin_model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    output_dir = os.path.join(args.sub_model_output_dir,
                              "model_width_%.5f" % args.width_mult)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = origin_model
    model_to_save.save_pretrained(output_dir)

    if args.static_sub_model is not None:
        export_static_model(origin_model, args.static_sub_model,
                            args.max_seq_length)
Example #17
class TestShortCut(unittest.TestCase):
    def setUp(self):
        model = resnet50()
        sp_net_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 224, 224], dtype='float32')
        self._test_clear_search_space()

    def _test_clear_search_space(self):
        self.ofa_model = OFA(self.model)
        self.ofa_model.set_epoch(0)
        outs, _ = self.ofa_model(self.images)
        self.config = self.ofa_model.current_config

    def test_export_model(self):
        self.ofa_model.export(
            self.config,
            input_shapes=[[2, 3, 224, 224]],
            input_dtypes=['float32'])
        assert len(self.ofa_model.ofa_layers) == 37
Example #18
class TestExport(unittest.TestCase):
    def setUp(self):
        self._init_model()

    def _init_model(self):
        self.origin_model = ModelOriginLinear()
        model = ModelLinear()
        self.ofa_model = OFA(model)

    def test_ofa(self):
        config = self.ofa_model._sample_config(task='expand_ratio', phase=None)
        origin_dict = {}
        for name, param in self.origin_model.named_parameters():
            origin_dict[name] = param.shape
        self.ofa_model.export(
            config,
            input_shapes=[[1, 64]],
            input_dtypes=['int64'],
            origin_model=self.origin_model)
        for name, param in self.origin_model.named_parameters():
            if name in config.keys():
                if 'expand_ratio' in config[name]:
                    assert origin_dict[name][-1] == param.shape[-1] * config[
                        name]['expand_ratio']
Example #19
class TestShortcutSkiplayers(unittest.TestCase):
    def setUp(self):
        model = ModelShortcut()
        sp_net_config = supernet(expand_ratio=[0.5, 1.0])
        self.model = Convert(sp_net_config).convert(model)
        self.images = paddle.randn(shape=[2, 3, 32, 32], dtype='float32')
        self.init_config()
        self.ofa_model = OFA(self.model, run_config=self.run_config)
        self.ofa_model._clear_search_space(self.images)

    def init_config(self):
        default_run_config = {'skip_layers': ['branch1.6']}
        self.run_config = RunConfig(**default_run_config)

    def test_shortcut(self):
        self.ofa_model.set_epoch(0)
        self.ofa_model.set_task('expand_ratio')
        for i in range(5):
            self.ofa_model(self.images)
        assert list(self.ofa_model._ofa_layers.keys()) == ['branch2.0']
Example #20
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = dataset_class.get_datasets(['train'])

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64" if train_ds.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_labels = 1 if train_ds.get_labels() is None else len(
        train_ds.get_labels())

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Step1: Initialize a dictionary to save the weights from the origin BERT model.
    origin_weights = {}
    for name, param in model.named_parameters():
        origin_weights[name] = param

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['bert.embeddings']
    for idx in range(model.bert.config['num_hidden_layers']):
        mapping_layers.append('bert.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss() if train_ds.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and heads,
    # and then reorder them according to that importance.
    head_importance, neuron_importance = utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in ofa_model.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = apply_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if args.task_name == 'sts-b':
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            ofa_model.model.clear_gradients()

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss, args.logging_steps /
                           (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if args.task_name == "mnli":
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_matched,
                             width_mult=100)
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_mismatched,
                             width_mult=100)
                else:
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader,
                             width_mult=100)
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = apply_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))

                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
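`apply_config` is not defined in this snippet; in the other DynaBERT examples the same step is performed with `utils.dynabert_config`, roughly:

# Assumed equivalent of apply_config(ofa_model, width_mult) above.
net_config = utils.dynabert_config(ofa_model, width_mult)
ofa_model.set_net_config(net_config)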
Example #21
 def test_ofa(self):
     ofa_model = OFA(self.model)
     out = self.model(self.data)
Example #22
        default_run_config = {
            'n_epochs': [[4 * args.epoch], [6 * args.epoch]],
            'init_learning_rate': [[args.lr], [args.lr]],
            'elastic_depth': args.depth_mult_list,
            'dynamic_batch_size': [[1, 1], [1, 1]]
        }
        run_config = RunConfig(**default_run_config)

        model_cfg = get_config(args.from_pretrained)

        default_distill_config = {'teacher_model': teacher_model}
        distill_config = DistillConfig(**default_distill_config)

        ofa_model = OFA(model,
                        run_config,
                        distill_config=distill_config,
                        elastic_order=['width', 'depth'])

        ### suppose elastic width first
        if args.reorder_weight:
            head_importance, neuron_importance = compute_neuron_head_importance(
                args, ofa_model.model, tokenizer, dev_ds, place, model_cfg)
            reorder_neuron_head(ofa_model.model, head_importance,
                                neuron_importance)
        #################

        if args.init_checkpoint is not None:
            log.info('loading checkpoint from %s' % args.init_checkpoint)
            sd, _ = FD.load_dygraph(args.init_checkpoint)
            ofa_model.model.set_dict(sd)
Example #23
def test_ofa():

    model = Model()
    teacher_model = Model()

    default_run_config = {
        'train_batch_size': 256,
        'n_epochs': [[1], [2, 3], [4, 5]],
        'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
        'dynamic_batch_size': [1, 1, 1],
        'total_images': 50000,  #1281167,
        'elastic_depth': (2, 5, 8)
    }
    run_config = RunConfig(**default_run_config)

    default_distill_config = {
        'lambda_distill': 0.01,
        'teacher_model': teacher_model,
        'mapping_layers': ['models.0.fn']
    }
    distill_config = DistillConfig(**default_distill_config)

    ofa_model = OFA(model, run_config, distill_config=distill_config)

    train_dataset = paddle.vision.datasets.MNIST(mode='train',
                                                 backend='cv2',
                                                 transform=transform)
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=place,
                                        feed_list=[image, label],
                                        drop_last=True,
                                        batch_size=64)

    start_epoch = 0
    for idx in range(len(run_config.n_epochs)):
        cur_idx = run_config.n_epochs[idx]
        for ph_idx in range(len(cur_idx)):
            cur_lr = run_config.init_learning_rate[idx][ph_idx]
            adam = paddle.optimizer.Adam(
                learning_rate=cur_lr,
                parameter_list=(ofa_model.parameters() +
                                ofa_model.netAs_param))
            for epoch_id in range(start_epoch,
                                  run_config.n_epochs[idx][ph_idx]):
                for batch_id, data in enumerate(train_loader()):
                    dy_x_data = np.array([
                        x[0].reshape(1, 28, 28) for x in data
                    ]).astype('float32')
                    y_data = np.array([x[1] for x in data
                                       ]).astype('int64').reshape(-1, 1)

                    img = paddle.dygraph.to_variable(dy_x_data)
                    label = paddle.dygraph.to_variable(y_data)
                    label.stop_gradient = True

                    for model_no in range(run_config.dynamic_batch_size[idx]):
                        output, _ = ofa_model(img, label)
                        loss = F.mean(output)
                        dis_loss = ofa_model.calc_distill_loss()
                        loss += dis_loss
                        loss.backward()

                        if batch_id % 10 == 0:
                            print(
                                'epoch: {}, batch: {}, loss: {}, distill loss: {}'
                                .format(epoch_id, batch_id,
                                        loss.numpy()[0],
                                        dis_loss.numpy()[0]))
                    ### accumulate gradients of dynamic_batch_size sub-networks for the same batch of data
                    ### NOTE: gradient accumulation still needs to be fixed in PaddlePaddle
                    adam.minimize(loss)
                    adam.clear_gradients()
            start_epoch = run_config.n_epochs[idx][ph_idx]
Example #24
    elif 7 < i <= 14:
        channel_optional.append([4, 8, 12, 16, 20, 24, 28, 32])
        # channel_optional.append([20, 24, 28, 32])
    elif 14 < i <= 21:
        channel_optional.append(
            [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64])
        # channel_optional.append([36, 40, 44, 48, 52, 56,60, 64])
    else:
        channel_optional.append(
            [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64])
        # channel_optional.append([36, 40, 44, 48, 52, 56,60, 64])

distill_config = DistillConfig(teacher_model=net2)
sp_net_config = supernet(channel=channel_optional)
sp_model = Convert(sp_net_config).convert(net)
ofa_net = OFA(sp_model, distill_config=distill_config)
ofa_net.set_task('channel')

model = paddle.Model(ofa_net)

MAX_EPOCH = 300
LR = 0.1
WEIGHT_DECAY = 5e-4
MOMENTUM = 0.9
BATCH_SIZE = 128
CIFAR_MEAN = [0.5071, 0.4865, 0.4409]
CIFAR_STD = [0.1942, 0.1918, 0.1958]
DATA_FILE = './data/data76994/cifar-100-python.tar.gz'

model.prepare(
    paddle.optimizer.Momentum(learning_rate=LinearWarmup(
Example #25
 def _init_model(self):
     self.origin_model = ModelOriginLinear()
     model = ModelLinear()
     self.ofa_model = OFA(model)
Example #26
 def test_ofa(self):
     self.model = ModelLinear2()
     ofa_model = OFA(self.model)
     ofa_model.set_net_config({'expand_ratio': None})
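Here the whole supernet is configured at once. Elsewhere in these examples `set_net_config` receives a per-layer dictionary; a minimal sketch of that form, with layer names assumed from Example #3:

# Per-layer net config (layer names are illustrative only).
ofa_model.set_net_config({
    'linear_3': {'expand_ratio': 0.5},
    'linear_4': {}
})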
Example #27
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config_path = os.path.join(args.model_name_or_path, 'model_config.json')
    cfg_dict = dict(json.loads(open(config_path).read()))

    kept_layers_index = {}
    if args.depth_mult < 1.0:
        depth = round(cfg_dict["init_args"][0]['num_hidden_layers'] *
                      args.depth_mult)
        cfg_dict["init_args"][0]['num_hidden_layers'] = depth
        for idx, i in enumerate(range(1, depth + 1)):
            kept_layers_index[idx] = math.floor(i / args.depth_mult) - 1

    os.rename(config_path, config_path + '_bak')
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(cfg_dict, ensure_ascii=False))

    num_labels = cfg_dict['num_classes']

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    origin_model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_labels)

    os.rename(config_path + '_bak', config_path)

    sp_config = supernet(expand_ratio=[1.0, args.width_mult])
    model = Convert(sp_config).convert(model)

    ofa_model = OFA(model)

    sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))

    if len(kept_layers_index) == 0:
        ofa_model.model.set_state_dict(sd)
    else:
        for name, params in ofa_model.model.named_parameters():
            if 'encoder' not in name:
                params.set_value(sd[name])
            else:
                idx = int(name.strip().split('.')[3])
                mapping_name = name.replace(
                    '.' + str(idx) + '.',
                    '.' + str(kept_layers_index[idx]) + '.')
                params.set_value(sd[mapping_name])

    best_config = utils.dynabert_config(ofa_model, args.width_mult)
    for name, sublayer in ofa_model.model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    ofa_model.export(
        best_config,
        input_shapes=[[1, args.max_seq_length], [1, args.max_seq_length]],
        input_dtypes=['int64', 'int64'],
        origin_model=origin_model)
    for name, sublayer in origin_model.named_sublayers():
        if isinstance(sublayer, paddle.nn.MultiHeadAttention):
            sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

    output_dir = os.path.join(args.sub_model_output_dir,
                              "model_width_%.5f" % args.width_mult)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = origin_model
    model_to_save.save_pretrained(output_dir)

    if args.static_sub_model is not None:
        export_static_model(origin_model, args.static_sub_model,
                            args.max_seq_length)
Example #28
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    train_ds = load_dataset('clue', args.task_name, splits='train')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    num_labels = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin PPMiniLM model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=[1.0])
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    super_sd = paddle.load(
        os.path.join(args.model_name_or_path, 'model_state.pdparams'))
    model.set_state_dict(super_sd)

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path,
                                                num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['ppminilm.embeddings']
    for idx in range(model.ppminilm.config['num_hidden_layers']):
        mapping_layers.append('ppminilm.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    #### Step6: Calculate the importance of neurons and heads,
    #### and then reorder them according to that importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.ppminilm.config['num_hidden_layers'],
        num_heads=model.ppminilm.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    global_step = 0
    tic_train = time.time()
    best_res = 0.0
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids,
                                                   segment_ids,
                                                   attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                logit_loss = soft_cross_entropy(logits,
                                                teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(teacher_model,
                         metric,
                         dev_data_loader,
                         width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    res = evaluate(ofa_model, metric, dev_data_loader,
                                   width_mult)
                    print("eval done total : %s s" % (time.time() - tic_eval))

                    if best_res < res:
                        output_dir = args.output_dir
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        best_res = res
            if global_step >= num_training_steps:
                print("best_res: ", best_res)
                return
    print("best_res: ", best_res)
Example #29
 def test_dynabert(self):
     self.model = TestModel()
     sp_net_config = supernet(expand_ratio=[0.5, 1.0])
     self.model = Convert(sp_net_config).convert(self.model)
     ofa_model = OFA(self.model)
     config = dynabert_config(ofa_model, 0.5)
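A minimal follow-up sketch, mirroring the DynaBERT training loops in Examples #20 and #28, showing how the sampled config would typically be applied:

# Apply the sampled config to the supernet before the next forward pass.
ofa_model.set_net_config(config)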
Example #30
 def _test_clear_search_space(self):
     self.ofa_model = OFA(self.model)
     self.ofa_model.set_epoch(0)
     outs, _ = self.ofa_model(self.images)
     self.config = self.ofa_model.current_config