Example #1
    def __init__(self,
                 topk=(1, ),
                 pos_label=1,
                 name='acc_and_f1',
                 *args,
                 **kwargs):
        super(AccuracyAndF1, self).__init__(*args, **kwargs)
        self.topk = topk
        self.pos_label = pos_label
        self._name = name
        self.acc = Accuracy(self.topk, *args, **kwargs)
        self.precision = Precision(*args, **kwargs)
        self.recall = Recall(*args, **kwargs)
        self.reset()
Example #2
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions

        preds = paddle.to_tensor(preds)
        label = paddle.to_tensor(p.label_ids)

        probs = F.softmax(preds, axis=1)
        metric = Accuracy()
        metric.reset()
        result = metric.compute(preds, label)
        metric.update(result)
        accu = metric.accumulate()
        metric.reset()
        return {"accuracy": accu}
Example #3
def main(cfg):
    paddle.seed(cfg.COMMON.seed)
    np.random.seed(cfg.COMMON.seed)

    net = build_classifier(cfg.CLASSIFIER)
    model = paddle.Model(net)
    FLOPs = paddle.flops(net, [1, 3, 32, 32], print_detail=False)

    lrs = build_lrscheduler(cfg.SCHEDULER)
    optim = build_optim(cfg.OPTIMIZER,
                        parameters=net.parameters(),
                        learning_rate=lrs)
    train_transforms, val_transforms = build_transform()
    train_set = Cifar100(cfg.COMMON.data_path,
                         mode='train',
                         transform=train_transforms)
    test_set = Cifar100(cfg.COMMON.data_path,
                        mode='test',
                        transform=val_transforms)
    vis_name = '/{}-{}'.format(
        cfg.CLASSIFIER.name,
        time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()))
    callbacks = [LRSchedulerC(), VisualDLC(cfg.COMMON.logdir + vis_name)]

    model.prepare(optim, CrossEntropyLoss(), Accuracy(topk=(1, 5)))
    model.fit(
        train_set,
        test_set,
        batch_size=cfg.COMMON.batch_size,
        epochs=cfg.COMMON.epochs,
        num_workers=cfg.COMMON.workers,
        verbose=cfg.COMMON.verbose,
        callbacks=callbacks,
    )
Example #4
def main(_):
    transform = T.Compose([T.ToTensor(), T.Normalize(mean=0.5, std=0.5)])
    train_img_path = []
    train_label = []
    train_dataset = MyDataset(image=train_img_path,
                              lable=train_label,
                              transform=transform)
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=paddle.CPUPlace(),
                                        batch_size=2,
                                        shuffle=True)
    model = resnet18(pretrained=True, num_classes=102, with_pool=True)
    model = paddle.Model(model)
    optim = paddle.optimizer.Adam(learning_rate=0.001,
                                  parameters=model.parameters())
    """Train or evaluates the model."""
    if FLAGS.mode == 'train':
        model.prepare(
            optimizer=optim,
            loss=paddle.nn.MSELoss(),
            metric=Accuracy()  # topk: number of top predictions used to compute accuracy, defaults to 1
        )
        model.fit(
            train_loader,
            epochs=2,
            verbose=1,
        )
        model.evaluate(train_dataset, batch_size=2, verbose=1)
        model.save('inference_model', training=False)

    elif FLAGS.mode == 'eval_rollout':
        metadata = _read_metadata(FLAGS.data_path)
Example #5
    def test_static_multiple_gpus(self):
        device = set_device('gpu')

        im_shape = (-1, 1, 28, 28)
        batch_size = 128

        inputs = [Input(im_shape, 'float32', 'image')]
        labels = [Input([None, 1], 'int64', 'label')]

        model = Model(LeNet(), inputs, labels)
        optim = fluid.optimizer.Momentum(
            learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
        model.prepare(optim, CrossEntropyLoss(), Accuracy())

        train_dataset = MnistDataset(mode='train')
        val_dataset = MnistDataset(mode='test')
        test_dataset = MnistDataset(mode='test', return_label=False)

        cbk = paddle.callbacks.ProgBarLogger(50)
        model.fit(train_dataset,
                  val_dataset,
                  epochs=2,
                  batch_size=batch_size,
                  callbacks=cbk)

        eval_result = model.evaluate(val_dataset, batch_size=batch_size)

        output = model.predict(
            test_dataset, batch_size=batch_size, stack_outputs=True)

        np.testing.assert_equal(output[0].shape[0], len(test_dataset))

        acc = compute_accuracy(output[0], val_dataset.labels)

        np.testing.assert_allclose(acc, eval_result['acc'])
Example #6
    def func_warn_or_error(self):
        with self.assertRaises(ValueError):
            paddle.callbacks.ReduceLROnPlateau(factor=2.0)
        # warning
        paddle.callbacks.ReduceLROnPlateau(mode='1', patience=3, verbose=1)

        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = CustomMnist(mode='train', transform=transform)
        val_dataset = CustomMnist(mode='test', transform=transform)
        net = LeNet()
        optim = paddle.optimizer.Adam(learning_rate=0.001,
                                      parameters=net.parameters())
        inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
        labels = [InputSpec([None, 1], 'int64', 'label')]
        model = Model(net, inputs=inputs, labels=labels)
        model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
        callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='miou',
                                                       patience=3,
                                                       verbose=1)
        model.fit(train_dataset,
                  val_dataset,
                  batch_size=8,
                  log_freq=1,
                  save_freq=10,
                  epochs=1,
                  callbacks=[callbacks])

        optim = paddle.optimizer.Adam(
            learning_rate=paddle.optimizer.lr.PiecewiseDecay([0.001, 0.0001],
                                                             [5, 10]),
            parameters=net.parameters())

        model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
        callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='acc',
                                                       mode='max',
                                                       patience=3,
                                                       verbose=1,
                                                       cooldown=1)
        model.fit(train_dataset,
                  val_dataset,
                  batch_size=8,
                  log_freq=1,
                  save_freq=10,
                  epochs=3,
                  callbacks=[callbacks])
Example #7
    def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None):
        fluid.enable_dygraph(self.device) if dynamic else None
        seed = 333
        paddle.seed(seed)
        paddle.framework.random._manual_program_seed(seed)

        net = LeNet()
        optim_new = fluid.optimizer.Adam(
            learning_rate=0.001, parameter_list=net.parameters())
        model = Model(net, inputs=self.inputs, labels=self.labels)
        model.prepare(
            optim_new,
            loss=CrossEntropyLoss(reduction="sum"),
            metrics=Accuracy())
        model.fit(self.train_dataset, batch_size=64, shuffle=False)

        result = model.evaluate(self.val_dataset, batch_size=64)
        np.testing.assert_allclose(result['acc'], self.acc1)

        model.fit(self.train_dataset,
                  batch_size=64,
                  shuffle=False,
                  num_iters=num_iters)

        result = model.evaluate(
            self.val_dataset, batch_size=64, num_iters=num_iters)

        train_sampler = DistributedBatchSampler(
            self.train_dataset,
            batch_size=64,
            shuffle=False,
            num_replicas=num_replicas,
            rank=rank)
        val_sampler = DistributedBatchSampler(
            self.val_dataset,
            batch_size=64,
            shuffle=False,
            num_replicas=num_replicas,
            rank=rank)

        train_loader = fluid.io.DataLoader(
            self.train_dataset,
            batch_sampler=train_sampler,
            places=self.device,
            return_list=True)

        val_loader = fluid.io.DataLoader(
            self.val_dataset,
            batch_sampler=val_sampler,
            places=self.device,
            return_list=True)

        model.fit(train_loader, val_loader)
        fluid.disable_dygraph() if dynamic else None
Example #8
class AccuracyAndF1(Metric):
    """
    Encapsulates Accuracy, Precision, Recall and F1 metric logic.
    """
    def __init__(self,
                 topk=(1, ),
                 pos_label=1,
                 name='acc_and_f1',
                 *args,
                 **kwargs):
        super(AccuracyAndF1, self).__init__(*args, **kwargs)
        self.topk = topk
        self.pos_label = pos_label
        self._name = name
        self.acc = Accuracy(self.topk, *args, **kwargs)
        self.precision = Precision(*args, **kwargs)
        self.recall = Recall(*args, **kwargs)
        self.reset()

    def compute(self, pred, label, *args):
        self.label = label
        self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label]
        return self.acc.compute(pred, label)

    def update(self, correct, *args):
        self.acc.update(correct)
        self.precision.update(self.preds_pos, self.label)
        self.recall.update(self.preds_pos, self.label)

    def accumulate(self):
        acc = self.acc.accumulate()
        precision = self.precision.accumulate()
        recall = self.recall.accumulate()
        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            # 1/f1 = 1/2 * (1/precision + 1/recall)
            f1 = (2 * precision * recall) / (precision + recall)
        return (
            acc,
            precision,
            recall,
            f1,
            (acc + f1) / 2,
        )

    def reset(self):
        self.acc.reset()
        self.precision.reset()
        self.recall.reset()
        self.label = None
        self.preds_pos = None

    def name(self):
        """
        Return the name of the metric instance.
        """
        return self._name
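
The class above is exercised with the same compute → update → accumulate sequence as the plain Accuracy metric. A minimal usage sketch, assuming the class is importable as paddlenlp.metrics.AccuracyAndF1 (the same call pattern appears in the docstring of Example #20 below):

import paddle
from paddlenlp.metrics import AccuracyAndF1

x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]])
y = paddle.to_tensor([[1], [0], [1], [1]])

m = AccuracyAndF1()
correct = m.compute(x, y)   # also caches softmax(x)[:, pos_label] and the labels
m.update(correct)           # updates accuracy, precision and recall together
acc, precision, recall, f1, avg = m.accumulate()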
Example #9
def main(args):
    print('download training data and load training data')
    train_dataset = MnistDataset(mode='train', )
    val_dataset = MnistDataset(mode='test', )
    test_dataset = MnistDataset(mode='test', return_label=False)

    im_shape = (-1, 1, 28, 28)
    batch_size = 64

    inputs = [Input(im_shape, 'float32', 'image')]
    labels = [Input([None, 1], 'int64', 'label')]

    model = Model(LeNet(), inputs, labels)
    optim = paddle.optimizer.SGD(learning_rate=0.001)
    if args.bf16:
        optim = amp.bf16.decorate_bf16(
            optim,
            amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(
                custom_bf16_list={
                    'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add',
                    'reshape2', 'slice', 'reduce_mean', 'conv2d'
                }, ))

    # Configure the model
    model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy())
    # Train the model
    if args.bf16:
        print('Training BF16')
    else:
        print('Training FP32')
    model.fit(train_dataset, epochs=2, batch_size=batch_size, verbose=1)
    eval_result = model.evaluate(val_dataset, batch_size=batch_size, verbose=1)

    output = model.predict(
        test_dataset, batch_size=batch_size, stack_outputs=True)

    np.testing.assert_equal(output[0].shape[0], len(test_dataset))

    acc = compute_accuracy(output[0], val_dataset.labels)

    print("acc", acc)
    print("eval_result['acc']", eval_result['acc'])

    np.testing.assert_allclose(acc, eval_result['acc'])
Example #10
    def evaluate(self, dynamic):
        fluid.enable_dygraph(self.device) if dynamic else None
        model = Model(LeNet(), self.inputs, self.labels)
        model.prepare(metrics=Accuracy())
        model.load(self.weight_path)
        result = model.evaluate(self.val_dataset, batch_size=64)
        np.testing.assert_allclose(result['acc'], self.acc1)

        sampler = DistributedBatchSampler(
            self.val_dataset, batch_size=64, shuffle=False)

        val_loader = fluid.io.DataLoader(
            self.val_dataset,
            batch_sampler=sampler,
            places=self.device,
            return_list=True)

        model.evaluate(val_loader)

        fluid.disable_dygraph() if dynamic else None
Example #11
    def func_reduce_lr_on_plateau(self):
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = CustomMnist(mode='train', transform=transform)
        val_dataset = CustomMnist(mode='test', transform=transform)
        net = LeNet()
        optim = paddle.optimizer.Adam(learning_rate=0.001,
                                      parameters=net.parameters())
        inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
        labels = [InputSpec([None, 1], 'int64', 'label')]
        model = Model(net, inputs=inputs, labels=labels)
        model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
        callbacks = paddle.callbacks.ReduceLROnPlateau(patience=1,
                                                       verbose=1,
                                                       cooldown=1)
        model.fit(train_dataset,
                  val_dataset,
                  batch_size=8,
                  log_freq=1,
                  save_freq=10,
                  epochs=10,
                  callbacks=[callbacks])
Example #12
def print_logs(args, step, logits, labels, loss, total_time, metric):
    if args.task_name in ['udc', 'atis_intent', 'mrda', 'swda']:
        if args.task_name == 'udc':
            metric = Accuracy()
        metric.reset()
        correct = metric.compute(logits, labels)
        metric.update(correct)
        acc = metric.accumulate()
        print('step %d - loss: %.4f - acc: %.4f - %.3fs/step' %
              (step, loss, acc, total_time / args.logging_steps))
    elif args.task_name == 'dstc2':
        metric.reset()
        metric.update(logits, labels)
        joint_acc = metric.accumulate()
        print('step %d - loss: %.4f - joint_acc: %.4f - %.3fs/step' %
              (step, loss, joint_acc, total_time / args.logging_steps))
    elif args.task_name == 'atis_slot':
        metric.reset()
        metric.update(logits, labels)
        f1_micro = metric.accumulate()
        print('step %d - loss: %.4f - f1_micro: %.4f - %.3fs/step' %
              (step, loss, f1_micro, total_time / args.logging_steps))
Example #13
def evaluate(model, args, mode='test'):
    """evaluate the model"""
    model.eval()
    metric = Accuracy()
    eval_dataloader, processor = load_example(args, mode)
    for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
        logits = model(input_ids=batch[0], token_type_ids=batch[1])
        labels = batch[2].reshape((-1, 1))
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print('Accuracy:', res)
    model.train()
    return res
Example #14
parser.add_argument("--memory_length", type=int, default=128, help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Linear warmup proption over the training process.")
parser.add_argument("--dataset", default="imdb", choices=["imdb", "iflytek", "thucnews", "hyp"], type=str, help="The training dataset")
parser.add_argument("--layerwise_decay", default=1.0, type=float, help="Layerwise decay ratio")
parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.",)
# yapf: enable
args = parser.parse_args()

# tokenizer, eval_dataset, test_dataset, preprocess_text_fn, metric
# BPETokenizer for English Tasks
# ErnieDocTokenizer for Chinese Tasks

DATASET_INFO = {
    "imdb":
    (ErnieDocBPETokenizer, "test", "test", ImdbTextPreprocessor(), Accuracy()),
    "hyp": (ErnieDocBPETokenizer, "dev", "test", HYPTextPreprocessor(), F1()),
    "iflytek": (ErnieDocTokenizer, "dev", "dev", None, Accuracy()),
    "thucnews": (ErnieDocTokenizer, "dev", "test", None, Accuracy())
}
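
The comment above DATASET_INFO describes the tuple layout. A hedged sketch of how such an entry might be unpacked downstream; args.dataset and args.model_name_or_path are assumptions for illustration, not part of the snippet above:

# Hypothetical unpacking of a DATASET_INFO entry; variable names are illustrative.
tokenizer_class, eval_split, test_split, preprocess_text_fn, metric = \
    DATASET_INFO[args.dataset]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)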


def set_seed(args):
    # Use the same data seed (for data shuffling) for all procs to guarantee
    # data consistency after sharding.
    random.seed(args.seed)
    np.random.seed(args.seed)
    # Different op seeds (for dropout) for different procs may be better, e.g.
    # `paddle.seed(args.seed + paddle.distributed.get_rank())`.
    paddle.seed(args.seed)
Example #15
from paddle.vision.models import resnet50, vgg16, LeNet
from paddle.vision.datasets import Cifar10
from paddle.optimizer import Momentum
from paddle.regularizer import L2Decay
from paddle.nn import CrossEntropyLoss
from paddle.metric import Accuracy
from paddle.vision.transforms import Transpose

# Make sure images loaded from paddle.vision.datasets.Cifar10 are of type np.ndarray
paddle.vision.set_image_backend('cv2')
# Build the resnet50 model
model = paddle.Model(resnet50(pretrained=False, num_classes=10))

# Use the Cifar10 dataset
train_dataset = Cifar10(mode='train', transform=Transpose())
val_dataset = Cifar10(mode='test', transform=Transpose())
# Define the optimizer
optimizer = Momentum(learning_rate=0.01,
                     momentum=0.9,
                     weight_decay=L2Decay(1e-4),
                     parameters=model.parameters())
# Prepare the model for training
model.prepare(optimizer, CrossEntropyLoss(), Accuracy(topk=(1, 5)))
# Start training
model.fit(train_dataset,
          val_dataset,
          epochs=50,
          batch_size=64,
          save_dir="./output",
          num_workers=8)
Example #16
    def test_earlystopping(self):
        paddle.seed(2020)
        for dynamic in [True, False]:
            paddle.enable_static() if not dynamic else None
            device = paddle.set_device('cpu')
            sample_num = 100
            train_dataset = MnistDataset(mode='train', sample_num=sample_num)
            val_dataset = MnistDataset(mode='test', sample_num=sample_num)

            net = LeNet()
            optim = paddle.optimizer.Adam(learning_rate=0.001,
                                          parameters=net.parameters())

            inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
            labels = [InputSpec([None, 1], 'int64', 'label')]

            model = Model(net, inputs=inputs, labels=labels)
            model.prepare(optim,
                          loss=CrossEntropyLoss(reduction="sum"),
                          metrics=[Accuracy()])
            callbacks_0 = paddle.callbacks.EarlyStopping('loss',
                                                         mode='min',
                                                         patience=1,
                                                         verbose=1,
                                                         min_delta=0,
                                                         baseline=None,
                                                         save_best_model=True)
            callbacks_1 = paddle.callbacks.EarlyStopping('acc',
                                                         mode='auto',
                                                         patience=1,
                                                         verbose=1,
                                                         min_delta=0,
                                                         baseline=0,
                                                         save_best_model=True)
            callbacks_2 = paddle.callbacks.EarlyStopping('loss',
                                                         mode='auto_',
                                                         patience=1,
                                                         verbose=1,
                                                         min_delta=0,
                                                         baseline=None,
                                                         save_best_model=True)
            callbacks_3 = paddle.callbacks.EarlyStopping('acc_',
                                                         mode='max',
                                                         patience=1,
                                                         verbose=1,
                                                         min_delta=0,
                                                         baseline=0,
                                                         save_best_model=True)
            model.fit(
                train_dataset,
                val_dataset,
                batch_size=64,
                save_freq=10,
                save_dir=self.save_dir,
                epochs=10,
                verbose=0,
                callbacks=[callbacks_0, callbacks_1, callbacks_2, callbacks_3])
            # Test for no val_loader
            model.fit(train_dataset,
                      batch_size=64,
                      save_freq=10,
                      save_dir=self.save_dir,
                      epochs=10,
                      verbose=0,
                      callbacks=[callbacks_0])
Example #17
def run(args):
    paddle.set_device(args.device)
    set_seed(args)

    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        SPIECE_UNDERLINE = '▁'

        def _is_chinese_char(cp):
            if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
                return True

            return False

        def is_fuhao(c):
            if c == '。' or c == ',' or c == '!' or c == '?' or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \
                    or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \
                    or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \
                    or c == '‘' or c == '’':
                return True
            return False

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    output.append(char)
                    if context[index - 12:index + 1].startswith("#idiom"):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        is_blank = True
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or c == SPIECE_UNDERLINE:
                return True
            return False

        def add_tokens_for_around(tokens, pos, num_tokens):
            num_l = num_tokens // 2
            num_r = num_tokens - num_l

            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')

            return tokens_l, tokens_r

        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]

            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). \
                    replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                all_doc_tokens = []
                for (i, token) in enumerate(doc_tokens):
                    if '#idiom' in token:
                        sub_tokens = [str(token)]
                    else:
                        sub_tokens = tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        all_doc_tokens.append(sub_token)
                tags = [blank for blank in doc_tokens if '#idiom' in blank]

                # Each sentence may have several tags
                for tag_index, tag in enumerate(tags):
                    pos = all_doc_tokens.index(tag)

                    tmp_l, tmp_r = add_tokens_for_around(all_doc_tokens, pos,
                                                         num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)
                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tag which is not considered in this new sample.
                            # Each idiom has four words, so 4 mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l

                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r

                    tokens_list = []
                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len]
                    for i, elem in enumerate(options):
                        option = tokenizer.tokenize(elem)
                        tokens = option + ['[SEP]'] + tokens_l + ['[unused1]'
                                                                  ] + tokens_r
                        tokens_list.append(tokens)
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(new_data["token_type_ids"])
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

        dev_ds = dev_ds.map(partial(preprocess_function),
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)

        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)

        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)

        loss_fct = nn.CrossEntropyLoss()
        metric = Accuracy()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch, step + 1,
                               loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, metric, dev_data_loader)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(
            preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)

        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)

        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)

        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        result = {}
        idx = 623377
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result["#idiom" + str(idx)] = pred
                idx += 1

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(
                os.path.join(args.output_dir, 'chid11_predict.json'),
                "w",
                encoding='utf-8') as writer:
            writer.write(
                json.dumps(
                    result, ensure_ascii=False, indent=4) + "\n")
Example #18
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    num_classes = 3
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Construct dict
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n
    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)

    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(
                            args.output_dir,
                            "ernie_m_ft_model_%d.pdparams" % (global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
Example #19
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('thucnews', splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_ds = load_dataset('thucnews', splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = Accuracy()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1

            input_ids, segment_ids, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, segment_ids)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "ft_model_%d.pdparams" % (global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                return
Example #20
class AccuracyAndF1(Metric):
    """
    This class encapsulates Accuracy, Precision, Recall and F1 metric logic;
    its `accumulate` method returns accuracy, precision, recall, f1, and the
    average of accuracy and f1. For an overview of all metrics, see the
    documentation of `paddle.metric
    <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/metric/Overview_cn.html>`_.

    Args:
        topk (int or tuple(int), optional):
            Number of top elements to look at for computing accuracy.
            Defaults to (1,).
        pos_label (int, optional): The positive label for calculating precision
            and recall.
            Defaults to 1.
        name (str, optional):
            String name of the metric instance. Defaults to 'acc_and_f1'.

    Example:

        .. code-block::

            import paddle
            from paddlenlp.metrics import AccuracyAndF1

            x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]])
            y = paddle.to_tensor([[1], [0], [1], [1]])

            m = AccuracyAndF1()
            correct = m.compute(x, y)
            m.update(correct)
            res = m.accumulate()
            print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45)

    """
    def __init__(self,
                 topk=(1, ),
                 pos_label=1,
                 name='acc_and_f1',
                 *args,
                 **kwargs):
        super(AccuracyAndF1, self).__init__(*args, **kwargs)
        self.topk = topk
        self.pos_label = pos_label
        self._name = name
        self.acc = Accuracy(self.topk, *args, **kwargs)
        self.precision = Precision(*args, **kwargs)
        self.recall = Recall(*args, **kwargs)
        self.reset()

    def compute(self, pred, label, *args):
        """
        Accepts the network's output and the labels, and calculates the top-k
        (maximum value in topk) indices for accuracy.

        Args:
            pred (Tensor):
                The predicted tensor. Its dtype is float32 or float64, and it
                has a shape of [batch_size, num_classes].
            label (Tensor):
                The ground truth tensor. Its dtype is int64, and it has a
                shape of [batch_size, 1], or [batch_size, num_classes] in
                one-hot representation.

        Returns:
            Tensor: Correct mask; each element indicates whether the prediction
            equals the label. It is a tensor with a data type of float32 and
            a shape of [batch_size, topk].

        """
        self.label = label
        self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label]
        return self.acc.compute(pred, label)

    def update(self, correct, *args):
        """
        Updates the metric states (accuracy, precision and recall) in order to
        calculate the accumulated accuracy, precision and recall over all
        instances.

        Args:
            correct (Tensor):
                Correct mask for calculating accuracy. It is a tensor with
                shape [batch_size, topk] and a dtype of float32.

        """
        self.acc.update(correct)
        self.precision.update(self.preds_pos, self.label)
        self.recall.update(self.preds_pos, self.label)

    def accumulate(self):
        """
        Calculates and returns the accumulated metric.

        Returns:
            tuple: The accumulated metrics, as a tuple (acc, precision,
            recall, f1, average_of_acc_and_f1).

            With the fields:

            - acc (numpy.float64):
                The accumulated accuracy.
            - precision (numpy.float64):
                The accumulated precision.
            - recall (numpy.float64):
                The accumulated recall.
            - f1 (numpy.float64):
                The accumulated f1.
            - average_of_acc_and_f1 (numpy.float64):
                The average of accumulated accuracy and f1.

        """
        acc = self.acc.accumulate()
        precision = self.precision.accumulate()
        recall = self.recall.accumulate()
        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            # 1/f1 = 1/2 * (1/precision + 1/recall)
            f1 = (2 * precision * recall) / (precision + recall)
        return (
            acc,
            precision,
            recall,
            f1,
            (acc + f1) / 2,
        )

    def reset(self):
        """
        Resets all metric states.
        """
        self.acc.reset()
        self.precision.reset()
        self.recall.reset()
        self.label = None
        self.preds_pos = None

    def name(self):
        """
        Returns the name of the metric instance.

        Returns:
           str: The name of the metric instance.

        """
        return self._name
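
As a quick check of the F1 formula in accumulate: the docstring example above reports precision = 0.5 and recall = 1/3, so f1 = 2 * 0.5 * (1/3) / (0.5 + 1/3) = 0.4, and the last tuple entry is (acc + f1) / 2 = (0.5 + 0.4) / 2 = 0.45, which matches the printed result (0.5, 0.5, 0.3333..., 0.4, 0.45).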
Example #21
def train(args):

    # Load the data
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)

    # Wrap the datasets as MapDataset
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])

    # Build the XLNet tokenizer
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)

    # Build train_data_loader and dev_data_loader
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # Training configuration
    # Fix the random seed
    set_seed(args)

    # Set up the execution device
    use_gpu = paddle.get_device().startswith("gpu")
    if use_gpu:
        paddle.set_device('gpu:0')

    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    #paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # Set up the lr_scheduler
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Build the optimizer
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Model training
    metric = Accuracy()

    # Define the loss function
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))

                if (paddle.distributed.get_world_size() <= 1
                        or paddle.distributed.get_rank() == 0):
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                if global_step == num_training_steps:
                    exit(0)
                tic_train += time.time() - tic_eval
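
# The evaluate() helper called above is defined elsewhere in the script; a
# minimal sketch of what it is assumed to do for this GLUE-style setup
# (the names and the printout format are assumptions, not the original code):
@paddle.no_grad()
def evaluate(model, loss_fct, metric, data_loader):
    model.eval()
    metric.reset()
    for batch in data_loader:
        input_ids, token_type_ids, attention_mask, labels = batch
        logits = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fct(logits, labels)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("eval loss: %f, acc: %s" % (loss.numpy(), res))
    model.train()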
Ejemplo n.º 22
0
        r, t = self.rnn0(x)

        x = paddle.concat([y1, y2, y3, r], axis=-1)

        x = self.rnn1(x)
        x = self.rnn2(x)
        x = paddle.flatten(x, start_axis=1)

        x = self.cls(x)
        return x


if __name__ == '__main__':
    model = FallNet()
    model = paddle.Model(model)
    # model.summary((1,3,151))

    # x = np.random.rand(1,3,151)
    # y = model.predict(x)
    # print(y.shape)

    from paddle.metric import Accuracy

    model.prepare(paddle.optimizer.Adam(0.0001, parameters=model.parameters()),
                  paddle.nn.CrossEntropyLoss(), Accuracy())

    # model.fit(train_dataset,
    #         epochs=20,
    #         batch_size=32,
    #         verbose=1,
    #         )
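
    # A hypothetical smoke test (not part of the original script): push one
    # random batch through the compiled model, assuming the full FallNet
    # definition accepts inputs of shape (N, 3, 151) as suggested by the
    # commented-out summary call above.
    import numpy as np
    dummy = paddle.to_tensor(np.random.rand(1, 3, 151).astype('float32'))
    out = model.predict_batch(dummy)
    print(out[0].shape)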
Ejemplo n.º 23
0
def _dynabert_training(self, task_name, ofa_model, model, teacher_model,
                       train_dataloader, eval_dataloader, width_mult_list,
                       criterion, num_train_epochs, output_dir):
    metric = Accuracy()
    if task_name == "msra_ner":
        metric = ChunkEvaluator(label_list=self.train_dataset.label_list)

    @paddle.no_grad()
    def evaluate(model, criterion, data_loader, width_mult=1.0):
        model.eval()
        all_start_logits = []
        all_end_logits = []
        metric.reset()
        for batch in data_loader:
            if "cmrc2018" in task_name:
                input_ids, token_type_ids = batch['input_ids'], batch[
                    'token_type_ids']
                logits = model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                if width_mult == 100:
                    start_logits_tensor, end_logits_tensor = logits
                else:
                    start_logits_tensor, end_logits_tensor = logits[0]
                for idx in range(start_logits_tensor.shape[0]):
                    if len(all_start_logits) % 1000 == 0 and len(
                            all_start_logits):
                        logger.info("Processing example: %d" %
                                    len(all_start_logits))
                    all_start_logits.append(start_logits_tensor.numpy()[idx])
                    all_end_logits.append(end_logits_tensor.numpy()[idx])

            else:
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
                logits = model(
                    input_ids, segment_ids, attention_mask=[None, None])
                if isinstance(logits, tuple):
                    logits = logits[0]
                loss = criterion(logits, labels)
                if task_name == "msra_ner":
                    preds = logits.argmax(axis=2)
                    num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                        batch['seq_len'], preds, batch['labels'])
                    metric.update(num_infer_chunks.numpy(),
                                  num_label_chunks.numpy(),
                                  num_correct_chunks.numpy())
                else:
                    correct = metric.compute(logits, labels)
                    metric.update(correct)
        if "cmrc2018" in task_name:
            n_best_size = 20
            max_answer_length = 50
            all_predictions, _, _ = compute_prediction(
                self.eval_examples, self.eval_dataset,
                (all_start_logits, all_end_logits), False, n_best_size,
                max_answer_length)
            res = squad_evaluate(
                examples=[raw_data for raw_data in self.eval_examples],
                preds=all_predictions,
                is_whitespace_splited=False)
            if width_mult == 100:
                logger.info("teacher model, EM: %f, F1: %f" %
                            (res['exact'], res['f1']))
            else:
                logger.info("width_mult: %s, EM: %f, F1: %f, " %
                            (str(width_mult), res['exact'], res['f1']))
            res = res['exact']
        else:
            res = metric.accumulate()
            # Teacher model's evaluation
            if task_name == "msra_ner":
                if width_mult == 100:
                    logger.info(
                        "teacher model, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (paddle.mean(loss).numpy(), res[0], res[1], res[2]))
                else:
                    logger.info(
                        "width_mult: %s, eval loss: %f, precision: %f, recall: %f, f1_score: %f"
                        % (str(width_mult), paddle.mean(loss).numpy(), res[0],
                           res[1], res[2]))
                res = res[2]
            else:
                if width_mult == 100:
                    logger.info("teacher model, eval loss: %f, acc: %s, " %
                                (loss.numpy(), res))
                else:
                    logger.info("width_mult: %s, eval loss: %f, acc: %s, " %
                                (str(width_mult), loss.numpy(), res))
        model.train()
        return res

    from paddleslim.nas.ofa import OFA, DistillConfig, utils
    global_step = 0
    lambda_logit = 1.0
    tic_train = time.time()
    best_acc = 0.0
    acc = 0.0
    logger.info("DynaBERT training starts. This period will cost some time.")
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_dataloader):
            global_step += 1
            if "cmrc2018" in task_name:
                input_ids, token_type_ids, start_positions, end_positions = batch[
                    'input_ids'], batch['token_type_ids'], batch[
                        'start_positions'], batch['end_positions']
            else:
                input_ids, token_type_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            for width_mult in width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, token_type_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if "cmrc2018" in task_name:
                    logit_loss = (soft_cross_entropy(logits[0], teacher_logits[0].detach()) \
                                + \
                                soft_cross_entropy(logits[1], teacher_logits[1].detach()))/2
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + lambda_logit * logit_loss
                loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.clear_grad()

            if global_step % self.args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           self.args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if "cmrc2018" not in task_name and global_step % self.args.save_steps == 0:
                tic_eval = time.time()

                evaluate(
                    teacher_model, criterion, eval_dataloader, width_mult=100)
                logger.info("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    acc = evaluate(ofa_model, criterion, eval_dataloader,
                                   width_mult)
                    if acc > best_acc:
                        best_acc = acc
                        if paddle.distributed.get_rank() == 0:
                            output_dir_width = os.path.join(output_dir,
                                                            str(width_mult))
                            if not os.path.exists(output_dir_width):
                                os.makedirs(output_dir_width)
                            # need better way to get inner model of DataParallel
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir_width)
                    logger.info("eval done total : %s s" %
                                (time.time() - tic_eval))
            if global_step > self.args.num_training_steps:
                if best_acc == 0.0:
                    output_dir_width = os.path.join(output_dir, str(width_mult))
                    if not os.path.exists(output_dir_width):
                        os.makedirs(output_dir_width)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir_width)
                logger.info("Best acc: %.4f" % (best_acc))
                return ofa_model

        if "cmrc2018" in task_name:
            tic_eval = time.time()
            evaluate(teacher_model, criterion, eval_dataloader, width_mult=100)
            logger.info("eval done total : %s s" % (time.time() - tic_eval))
            for idx, width_mult in enumerate(width_mult_list):
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                tic_eval = time.time()
                acc = evaluate(ofa_model, criterion, eval_dataloader,
                               width_mult)
                if acc > best_acc:
                    best_acc = acc
                    if paddle.distributed.get_rank() == 0:
                        output_dir_width = os.path.join(output_dir,
                                                        str(width_mult))
                        if not os.path.exists(output_dir_width):
                            os.makedirs(output_dir_width)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir_width)
                logger.info("eval done total : %s s" % (time.time() - tic_eval))

    logger.info("Best acc: %.4f" % (best_acc))
    return ofa_model
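
# soft_cross_entropy() used in the distillation loss above is defined
# elsewhere in the DynaBERT utilities; a minimal sketch of what it is
# assumed to compute (cross entropy against the teacher's soft labels):
import paddle
import paddle.nn.functional as F

def soft_cross_entropy(inp, target):
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1.0 * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))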
Ejemplo n.º 24
0
# # Initialize a loss_log instance, then pass it to fit as an argument
# loss_log = LossCallback()

# Use the VisualDL module of the PaddlePaddle framework to save training information to a directory.
callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir')

model = paddle.Model(model)
optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())


# model.prepare()
# Configure the model
model.prepare(
    optim,
    paddle.nn.CrossEntropyLoss(),
    Accuracy(topk=(1, 2))
    )

model.fit(train_loader,
        epochs=500,
        verbose=1,
        callbacks=callback
        )

# print(loss_log.losses)

model.evaluate(train_dataset, batch_size=8, verbose=1)

# model.save()

model.save('inference_model', training=False)
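
# The LossCallback referenced in the commented-out lines above is not shown
# in this snippet; a hypothetical minimal version that only records the
# per-batch training loss could look like this:
class LossCallback(paddle.callbacks.Callback):
    def __init__(self):
        self.losses = []

    def on_train_batch_end(self, step, logs=None):
        # paddle.Model reports the current loss under the "loss" key.
        self.losses.append(logs.get("loss"))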
Ejemplo n.º 25
0
parser.add_argument("--epochs", type=int, default=15, help="Number of epoches for training.")
parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"], help="Select cpu, gpu devices to train model.")
parser.add_argument("--seed", type=int, default=1, help="Random seed for initialization.")
parser.add_argument("--memory_length", type=int, default=128, help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Linear warmup proption over the training process.")
parser.add_argument("--dataset", default="cail2019_scm", choices=["cail2019_scm"], type=str, help="The training dataset")
parser.add_argument("--dropout", default=0.1, type=float, help="Dropout ratio of ernie_doc")
parser.add_argument("--layerwise_decay", default=1.0, type=float, help="Layerwise decay ratio")
parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.",)

# yapf: enable
args = parser.parse_args()

DATASET_INFO = {
    "cail2019_scm": (ErnieDocTokenizer, "dev", "test", Accuracy()),
}


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)


def init_memory(batch_size, memory_length, d_model, n_layers):
    return [
        paddle.zeros([batch_size, memory_length, d_model], dtype="float32")
        for _ in range(n_layers)
    ]
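
# Hypothetical usage sketch for init_memory() (not in the original script;
# the hidden size of 768 and 12 layers are assumptions for an ERNIE-Doc base
# model, which the real script reads from the model config):
example_memories = init_memory(batch_size=16,
                               memory_length=args.memory_length,
                               d_model=768,
                               n_layers=12)
# Each entry is a [batch_size, memory_length, d_model] zero tensor that the
# model consumes and returns at every step, carrying segment-level recurrence
# across a long document.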
Ejemplo n.º 26
0
        self.linear2 = paddle.nn.Linear(in_features=120, out_features=84)
        self.linear3 = paddle.nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        # x = x.reshape((-1, 1, 28, 28))
        x = self.conv1(x)
        x = F.relu(x)
        x = self.max_pool1(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.max_pool2(x)
        x = paddle.flatten(x, start_axis=1, stop_axis=-1)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        return x


from paddle.metric import Accuracy
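
# train_dataset / test_dataset are assumed to be prepared elsewhere; a
# minimal sketch using the built-in MNIST dataset (the normalization values
# follow the common Paddle quickstart, not necessarily this script):
from paddle.vision.transforms import Normalize
from paddle.vision.datasets import MNIST

transform = Normalize(mean=[127.5], std=[127.5], data_format='CHW')
train_dataset = MNIST(mode='train', transform=transform)
test_dataset = MNIST(mode='test', transform=transform)
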
model = paddle.Model(LeNet())  # Wrap the network with paddle.Model
optim = paddle.optimizer.Adam(learning_rate=0.001,
                              parameters=model.parameters())

# Configure the model
model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy())
# Train the model
model.fit(train_dataset, epochs=2, batch_size=64, verbose=1)
model.evaluate(test_dataset, batch_size=64, verbose=1)
Ejemplo n.º 27
0
parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"], help="Select cpu, gpu devices to train model.")
parser.add_argument("--seed", type=int, default=1, help="Random seed for initialization.")
parser.add_argument("--memory_length", type=int, default=128, help="Length of the retained previous heads.")
parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Linear warmup proption over the training process.")
parser.add_argument("--dataset", default="c3", choices=["c3"], type=str, help="The training dataset")
parser.add_argument("--layerwise_decay", default=0.8, type=float, help="Layerwise decay ratio")
parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--gradient_accumulation_steps", default=4, type=int, help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.",)

# yapf: enable
args = parser.parse_args()

DATASET_INFO = {
    "c3": (ErnieDocTokenizer, "dev", "test", Accuracy()),
}


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)


def init_memory(batch_size, memory_length, d_model, n_layers):
    return [
        paddle.zeros([batch_size, memory_length, d_model], dtype="float32")
        for _ in range(n_layers)
    ]
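
# A self-contained sketch of how --gradient_accumulation_steps is typically
# applied (toy model and random data; this is not the original training
# loop): the loss is scaled down and the optimizer only steps every N
# micro-batches.
def _grad_accumulation_sketch(accum_steps=4):
    toy_model = paddle.nn.Linear(8, 2)
    toy_optimizer = paddle.optimizer.AdamW(learning_rate=1e-3,
                                           parameters=toy_model.parameters())
    toy_criterion = paddle.nn.CrossEntropyLoss()
    for step in range(1, 4 * accum_steps + 1):
        x = paddle.randn([2, 8])
        y = paddle.randint(0, 2, [2])
        loss = toy_criterion(toy_model(x), y) / accum_steps  # average over micro-batches
        loss.backward()  # gradients accumulate across calls
        if step % accum_steps == 0:
            toy_optimizer.step()
            toy_optimizer.clear_grad()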