Example #1
def from_local_dir():
    print('From_local_dir:--------------------------')
    # From local dir path
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bert')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bart')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bigbird')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
Example #2
    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.
        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh",
                                                       use_faster=True)
        # model_config arrives as a JSON string and must be parsed explicitly
        self.model_config = json.loads(args['model_config'])
        print("model_config:", self.model_config)

        self.input_names = []
        for input_config in self.model_config["input"]:
            self.input_names.append(input_config["name"])
        print("input:", self.input_names)

        self.output_names = []
        self.output_dtype = []
        for output_config in self.model_config["output"]:
            self.output_names.append(output_config["name"])
            dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
            self.output_dtype.append(dtype)
        print("output:", self.output_names)
Example #3
    def __init__(self, args):
        if not isinstance(args.device, six.string_types):
            print(
                ">>> [InferBackend] The type of device must be string, but the type you set is: ",
                type(args.device))
            exit(0)
        args.device = args.device.lower()
        if args.device not in ['cpu', 'gpu']:
            print(
                ">>> [InferBackend] The device must be cpu or gpu, but your device is set to:",
                args.device)
            exit(0)

        self.task_name = args.task_name
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                       use_faster=True)
        if args.task_name == 'seq_cls':
            self.label_names = []
            self.preprocess = self.seq_cls_preprocess
            self.postprocess = self.seq_cls_postprocess
            self.printer = seq_cls_print_ret
        elif args.task_name == 'token_cls':
            self.label_names = [
                'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
            ]
            self.preprocess = self.token_cls_preprocess
            self.postprocess = self.token_cls_postprocess
            self.printer = token_cls_print_ret
        else:
            print(
                "[ErniePredictor]: task_name only support seq_cls and token_cls now."
            )
            exit(0)

        self.max_seq_length = args.max_seq_length

        if args.device == 'cpu':
            args.use_fp16 = False
            args.set_dynamic_shape = False
            args.batch_size = 32
            args.shape_info_file = None
        if args.device == 'gpu':
            args.num_threads = cpu_count()
            args.use_quantize = False
        self.inference_backend = InferBackend(
            args.model_path,
            batch_size=args.batch_size,
            device=args.device,
            use_fp16=args.use_fp16,
            use_quantize=args.use_quantize,
            set_dynamic_shape=args.set_dynamic_shape,
            shape_info_file=args.shape_info_file,
            num_threads=args.num_threads)
        if args.set_dynamic_shape:
            # If set_dynamic_shape is turned on, all required dynamic shapes will be automatically set according to the batch_size and max_seq_length.
            self.set_dynamic_shape(args.max_seq_length, args.batch_size)
            exit(0)
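The same device validation can be written as a small standalone helper; a sketch (raising exceptions instead of exit(0) is a deliberate substitution):

import six

def check_device(device):
    # Mirrors the checks in __init__ above, but raises instead of exiting.
    if not isinstance(device, six.string_types):
        raise TypeError("device must be a string, got %s" % type(device))
    device = device.lower()
    if device not in ('cpu', 'gpu'):
        raise ValueError("device must be 'cpu' or 'gpu', got %r" % device)
    return device

print(check_device('GPU'))  # -> 'gpu'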
Example #4
def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(
        convert_example,
        label_list=dev_ds.label_list,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
    ): fn(samples)

    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    num_classes = 1 if dev_ds.label_list is None else len(dev_ds.label_list)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s\n, " % (res), end='')
Example #5
def from_community_models():
    print('From_community_models:-------------------')
    # From community-contributed pretrained models
    tokenizer = AutoTokenizer.from_pretrained(
        'yingyibiao/bert-base-uncased-sst-2-finetuned')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer.save_pretrained('saved_tokenizer/community_bert_auto')

    tokenizer = BertTokenizer.from_pretrained(
        'yingyibiao/bert-base-uncased-sst-2-finetuned')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer.save_pretrained('saved_tokenizer/community_bert')

    # Community model whose config does not specify an init_class
    tokenizer = AutoTokenizer.from_pretrained(
        'junnyu/ckiplab-bert-base-chinese-ner')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = BertTokenizer.from_pretrained(
        'junnyu/ckiplab-bert-base-chinese-ner')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
Example #6
def do_predict(args):
    paddle.set_device(args.device)
    args.task_name = args.task_name.lower()

    train_ds, test_ds = load_dataset(
        'clue', args.task_name, splits=('train', 'test'))
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length,
        is_test=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    ): fn(samples)

    test_ds = test_ds.map(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(
        test_ds, batch_size=args.batch_size, shuffle=False)
    test_data_loader = DataLoader(
        dataset=test_ds,
        batch_sampler=test_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.task_name == 'ocnli':
        args.task_name = 'ocnli_50k'
    f = open(
        os.path.join(args.output_dir, args.task_name + "_predict.json"), 'w')

    for step, batch in enumerate(test_data_loader):
        input_ids, segment_ids = batch

        with paddle.no_grad():
            logits = model(input_ids, segment_ids)

        preds = paddle.argmax(logits, axis=1)
        for idx, pred in enumerate(preds):
            j = json.dumps({"id": idx, "label": train_ds.label_list[pred]})
            f.write(j + "\n")
Example #7
    def init_op(self):
        from paddlenlp.transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh",
                                                       use_faster=True)
        # The label names of NER models trained on different datasets may differ
        self.label_names = [
            'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
        ]
        # Output nodes may differ from model to model.
        # You can see the output node names in the conf.prototxt file of serving_server
        self.fetch_names = [
            "linear_113.tmp_1",
        ]
Example #8
def from_built_in_models():
    print('From_built_in_models:------------------')
    # From built-in pretrained models
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('ernie-ctm')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('plato-mini')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('bigbird-base-uncased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
Example #9
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=True)
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    )
Example #10
    def __init__(self, args):
        if not isinstance(args.device, six.string_types):
            print(
                ">>> [InferBackend] The type of device must be string, but the type you set is: ",
                type(args.device))
            exit(0)
        if args.device not in ['cpu', 'gpu']:
            print(
                ">>> [InferBackend] The device must be cpu or gpu, but your device is set to:",
                args.device)
            exit(0)

        self._tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh",
                                                        use_faster=True)
        self._position_prob = args.position_prob
        self._max_seq_len = args.max_seq_len
        self._schema_tree = None
        self.set_schema(args.schema)
        if args.device == 'cpu':
            args.use_fp16 = False
        self.inference_backend = InferBackend(args.model_path_prefix,
                                              device=args.device,
                                              use_fp16=args.use_fp16)
Example #11
    def __init__(self, args):
        self.task_name = args.task_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_faster=True)
        if args.task_name == 'seq_cls':
            self.label_names = []
            self.preprocess = self.seq_cls_preprocess
            self.postprocess = self.seq_cls_postprocess
            self.printer = seq_cls_print_ret
        elif args.task_name == 'token_cls':
            self.label_names = [
                'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
            ]
            self.preprocess = self.token_cls_preprocess
            self.postprocess = self.token_cls_postprocess
            self.printer = token_cls_print_ret
        else:
            print(
                "[ErniePredictor]: task_name only support seq_cls and token_cls now."
            )
            exit(0)

        self.max_seq_length = args.max_seq_length
        self.inference_backend = InferBackend(args.model_path, args.use_fp16)
Example #12
def do_eval():
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    model = UIE.from_pretrained(args.model_path)

    test_ds = load_dataset(reader,
                           data_path=args.test_path,
                           max_seq_len=args.max_seq_len,
                           lazy=False)
    test_ds = test_ds.map(
        partial(convert_example,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_len))

    test_batch_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_data_loader = paddle.io.DataLoader(dataset=test_ds,
                                            batch_sampler=test_batch_sampler,
                                            return_list=True)

    metric = SpanEvaluator()
    precision, recall, f1 = evaluate(model, metric, test_data_loader)
    logger.info("Evaluation precision: %.5f, recall: %.5f, F1: %.5f" %
                (precision, recall, f1))
Example #13
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bit training: {training_args.fp16}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Pass --overwrite_output_dir to overwrite it.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])

    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    all_ds = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    label_list = getattr(all_ds['train'], "label_list", None)
    data_args.label_list = label_list
    data_args.ignore_label = -100
    data_args.no_entity_id = len(data_args.label_list) - 1

    num_classes = 1 if all_ds["train"].label_list == None else len(
        all_ds['train'].label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    class criterion(nn.Layer):
        def __init__(self):
            super(criterion, self).__init__()
            self.loss_fn = paddle.nn.loss.CrossEntropyLoss(
                ignore_index=data_args.ignore_label)

        def forward(self, *args, **kwargs):
            return paddle.mean(self.loss_fn(*args, **kwargs))

    loss_fct = criterion()

    # Define dataset pre-process function
    trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    batchify_fn = ner_collator(tokenizer, data_args)

    # Dataset pre-process
    train_dataset = all_ds["train"].map(trans_fn)
    eval_dataset = all_ds["dev"].map(trans_fn)
    test_dataset = all_ds["test"].map(trans_fn)

    # Define the metric for the task.
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and tests model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # export inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64")  # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
Example #14
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)

    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    if data_args.dataset in ALL_DATASETS:
        # If you customize hyper-parameters in the yaml config, they overwrite all args.
        config = ALL_DATASETS[data_args.dataset]
        for args in (model_args, data_args, training_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])

        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    label_list = raw_datasets['train'].features['ner_tags'].feature.names
    data_args.label_list = label_list
    data_args.ignore_label = -100

    data_args.no_entity_id = 0
    num_classes = 1 if label_list is None else len(label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    class criterion(nn.Layer):
        def __init__(self):
            super(criterion, self).__init__()
            self.loss_fn = paddle.nn.loss.CrossEntropyLoss(
                ignore_index=data_args.ignore_label)

        def forward(self, *args, **kwargs):
            return paddle.mean(self.loss_fn(*args, **kwargs))

    loss_fct = criterion()

    # Define dataset pre-process function
    trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    data_collator = DataCollatorForTokenClassification(
        tokenizer, label_pad_token_id=data_args.ignore_label)

    column_names = raw_datasets["train"].column_names

    # Dataset pre-process
    train_dataset = raw_datasets["train"].map(trans_fn,
                                              remove_columns=column_names)
    train_dataset.label_list = label_list

    eval_dataset = raw_datasets["test"].map(trans_fn,
                                            remove_columns=column_names)

    trainer = Trainer(model=model,
                      criterion=loss_fct,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      tokenizer=tokenizer)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))

    trainer.compress(data_args.dataset,
                     output_dir,
                     pruning=True,
                     quantization=True,
                     compress_config=compress_config)
Example #15
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    raw_datasets = load_dataset(args.task_name)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    train_ds = raw_datasets['train']
    column_names = train_ds.column_names

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)

    batchify_fn = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Define the model network and its loss
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def tokenize_and_align_labels(examples, no_entity_id=0):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []

        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    test_ds = raw_datasets['test']
    test_ds = test_ds.select(range(len(test_ds) - 1))
    test_ds = test_ds.map(tokenize_and_align_labels,
                          batched=True,
                          remove_columns=column_names)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    if args.do_train:
        train_ds = train_ds.select(range(len(train_ds) - 1))

        train_ds = train_ds.map(tokenize_and_align_labels,
                                batched=True,
                                remove_columns=column_names)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

        train_data_loader = DataLoader(dataset=train_ds,
                                       collate_fn=batchify_fn,
                                       num_workers=0,
                                       batch_sampler=train_batch_sampler,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_steps)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)

        loss_fct = paddle.nn.loss.CrossEntropyLoss()

        metric = ChunkEvaluator(label_list=label_list)

        global_step = 0
        best_f1 = 0.0
        last_step = args.num_train_epochs * len(train_data_loader)
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                logits = model(batch['input_ids'], batch['token_type_ids'])
                loss = loss_fct(logits, batch['labels'])
                avg_loss = paddle.mean(loss)
                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, avg_loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                avg_loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        f1 = evaluate(model, loss_fct, metric,
                                      test_data_loader, label_num, "test")
                        if f1 > best_f1:
                            best_f1 = f1
                            output_dir = args.output_dir
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # Need better way to get inner model of DataParallel
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                if global_step >= num_training_steps:
                    print("best_f1: ", best_f1)
                    return
        print("best_f1: ", best_f1)

    if args.do_eval:
        eval_data_loader = DataLoader(dataset=test_ds,
                                      collate_fn=batchify_fn,
                                      num_workers=0,
                                      batch_size=args.batch_size,
                                      return_list=True)

        # Define the model network and its loss
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path, num_classes=label_num)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()

        metric = ChunkEvaluator(label_list=label_list)

        model.eval()
        metric.reset()
        for step, batch in enumerate(eval_data_loader):
            logits = model(batch["input_ids"], batch["token_type_ids"])
            loss = loss_fct(logits, batch["labels"])
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                batch["length"], preds, batch["labels"])
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                          num_correct_chunks.numpy())
            precision, recall, f1_score = metric.accumulate()
        print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
              (avg_loss, precision, recall, f1_score))
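tokenize_and_align_labels above pads word-level labels to the subword sequence, reserving two slots for the [CLS]/[SEP] special tokens; a standalone sketch with hypothetical ids:

no_entity_id = 0
input_ids = [101, 8, 9, 10, 102]  # hypothetical [CLS] w1 w2 w3 [SEP]
label_ids = [1, 2, 1]             # one label per word

label_ids = label_ids[:len(input_ids) - 2]                       # truncate to fit
label_ids = [no_entity_id] + label_ids + [no_entity_id]          # [CLS]/[SEP] slots
label_ids += [no_entity_id] * (len(input_ids) - len(label_ids))  # pad the rest
print(label_ids)  # [0, 1, 2, 1, 0]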
Example #16
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)

    data_args.dataset = data_args.dataset.strip()

    if data_args.dataset in ALL_DATASETS:
        # If you customize hyper-parameters in the yaml config, they overwrite all args.
        config = ALL_DATASETS[data_args.dataset]
        logger.info("Over-writing training config by yaml config!")
        for args in (model_args, data_args, training_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])

        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
        splits=("train", "dev", "test"))

    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
    num_classes = 1 if raw_datasets["train"].label_list == None else len(
        raw_datasets['train'].label_list)

    criterion = paddle.nn.CrossEntropyLoss()
    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    # Define dataset pre-process function
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    data_collator = DataCollatorWithPadding(tokenizer)

    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      tokenizer=tokenizer,
                      criterion=criterion)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))

    trainer.compress(data_args.dataset,
                     output_dir,
                     pruning=True,
                     quantization=True,
                     compress_config=compress_config)
Example #17
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    train_ds, dev_ds = load_dataset(
        'clue', args.task_name, splits=('train', 'dev'))

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        label_list=train_ds.label_list,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    dev_data_loader = DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()
    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                acc = evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if acc > best_acc:
                    best_acc = acc
                    output_dir = args.output_dir
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                print("best_acc: ", best_acc)
                return
    print("best_acc: ", best_acc)
Example #18
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    train_ds = load_dataset(
        read_text_pair, data_path=args.train_set_file, is_test=False, lazy=False)
    model_name_or_path = 'rocketqa-zh-dureader-query-encoder'
    pretrained_model = AutoModel.from_pretrained(
        model_name_or_path,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # tilte_segment
    ): [data for data in fn(samples)]


    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)


    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from:{}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
            if random.random() < 0.2:
                title_input_ids, title_token_type_ids = query_input_ids, query_token_type_ids
                query_input_ids, query_token_type_ids = word_repetition(
                    query_input_ids, query_token_type_ids, args.dup_rate)
                title_input_ids, title_token_type_ids = word_repetition(
                    title_input_ids, title_token_type_ids, args.dup_rate)

            loss, kl_loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            loss = loss + kl_loss * args.rdrop_coef

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

            if args.max_steps > 0 and global_step >= args.max_steps:
                return

    save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_param_path = os.path.join(save_dir, 'model_state.pdparams')
    paddle.save(model.state_dict(), save_param_path)
    tokenizer.save_pretrained(save_dir)
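word_repetition is defined elsewhere in this example's repository; it duplicates a random subset of tokens as a SimCSE-style augmentation. A toy standalone sketch of the idea (not the repository's implementation):

import random

def toy_word_repetition(input_ids, dup_rate=0.32):
    # Duplicate a random subset of positions; illustrative only.
    dup_len = int(len(input_ids) * dup_rate * random.random())
    dup_pos = set(random.sample(range(len(input_ids)), dup_len))
    out = []
    for i, tok in enumerate(input_ids):
        out.append(tok)
        if i in dup_pos:
            out.append(tok)
    return out

print(toy_word_repetition([1, 2, 3, 4, 5]))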
Example #19
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
                    default='./checkpoint/model_50/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
                    help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
    # If you want to use the ernie-1.0 model, please use the following code
    output_emb_size = 256

    pretrained_model = AutoModel.from_pretrained("ernie-1.0")

    tokenizer = AutoTokenizer.from_pretrained('ernie-1.0')
    model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    model.eval()
    # Convert to static graph with specific input description
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(shape=[None, None],
                                    dtype="int64"),  # input_ids
            paddle.static.InputSpec(shape=[None, None],
                                    dtype="int64")  # segment_ids
        ])
    # (Truncated in the original; the script presumably ends by saving the
    # static graph, e.g. with paddle.jit.save(model, args.output_path).)
Example #20
def run(args):
    max_seq_length = args.max_seq_length
    max_num_choices = 4

    def preprocess_function(examples, do_predict=False):
        def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
            """Truncates a sequence tuple in place to the maximum length."""
            # This is a simple heuristic which will always truncate the longer
            # sequence one token at a time. This makes more sense than
            # truncating an equal percent of tokens from each, since if one
            # sequence is very short then each token that's truncated likely
            # contains more information than a longer sequence.
            while True:
                total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
                if total_length <= max_length:
                    break
                if len(tokens_a) >= len(tokens_b) and len(tokens_a) >= len(
                        tokens_c):
                    tokens_a.pop()
                elif len(tokens_b) >= len(tokens_a) and len(tokens_b) >= len(
                        tokens_c):
                    tokens_b.pop()
                else:
                    tokens_c.pop()

        num_examples = len(examples.data["question"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            text = '\n'.join(examples.data["context"][idx]).lower()
            question = examples.data["question"][idx].lower()
            choice_list = examples.data["choice"][idx]
            choice_list = [choice.lower() for choice in choice_list]
            if not do_predict:
                answer = examples.data["answer"][idx].lower()
                label = choice_list.index(answer)

            tokens_t = tokenizer.tokenize(text)
            tokens_q = tokenizer.tokenize(question)

            tokens_t_list = []
            tokens_c_list = []

            # Pad each new example for axis=1, [batch_size, num_choices, seq_len]
            while len(choice_list) < max_num_choices:
                choice_list.append('无效答案')  # pad with a dummy "invalid answer" choice

            for choice in choice_list:
                tokens_c = tokenizer.tokenize(choice.lower())
                _truncate_seq_tuple(tokens_t, tokens_q, tokens_c,
                                    max_seq_length - 4)

                tokens_c = tokens_q + ["[SEP]"] + tokens_c
                tokens_t_list.append(tokens_t)
                tokens_c_list.append(tokens_c)

            new_data = tokenizer(tokens_t_list,
                                 text_pair=tokens_c_list,
                                 is_split_into_words=True)

            # Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
            # because length of each choice could be different.
            input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(
                new_data["input_ids"])
            token_type_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(
                new_data["token_type_ids"])

            # Final shape of input_ids: [batch_size, num_choices, seq_len]
            result["input_ids"].append(input_ids)
            result["token_type_ids"].append(token_type_ids)
            if not do_predict:
                result["labels"].append([label])
            if (idx + 1) % 1000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    paddle.set_device(args.device)
    set_seed(args)

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "c3", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(preprocess_function,
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels':
            Stack(dtype="int64")  # label
        }): fn(samples)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_ds = dev_ds.map(preprocess_function,
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)
        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()
        metric = paddle.metric.Accuracy()
        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, label = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, label)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, paddle.distributed.get_rank(), loss,
                               optimizer.get_lr(), args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, dev_data_loader, metric)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)
        # Several samples have more than four choices.
        test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                    batch_size=1,
                                                    shuffle=False)

        batchify_fn = lambda samples, fn=Dict({
            'input_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)

        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        f = open(os.path.join(args.output_dir, "c311_predict.json"), 'w')
        result = {}
        idx = 0
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result[str(idx)] = pred
                idx += 1
                j = json.dumps({"id": idx, "label": pred})
                f.write(j + "\n")
Example #21
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    resource_file_urls = MODEL_MAP[args.model]['resource_file_urls']

    logger.info("Downloading resource files...")
    for key, val in resource_file_urls.items():
        file_path = os.path.join(args.model, key)
        if not os.path.exists(file_path):
            get_path_from_url(val, args.model)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = UIE.from_pretrained(args.model)

    train_ds = load_dataset(reader,
                            data_path=args.train_path,
                            max_seq_len=args.max_seq_len,
                            lazy=False)
    dev_ds = load_dataset(reader,
                          data_path=args.dev_path,
                          max_seq_len=args.max_seq_len,
                          lazy=False)

    train_ds = train_ds.map(
        partial(convert_example,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_len))
    dev_ds = dev_ds.map(
        partial(convert_example,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_len))

    train_batch_sampler = paddle.io.BatchSampler(dataset=train_ds,
                                                 batch_size=args.batch_size,
                                                 shuffle=True)
    train_data_loader = paddle.io.DataLoader(dataset=train_ds,
                                             batch_sampler=train_batch_sampler,
                                             return_list=True)

    dev_batch_sampler = paddle.io.BatchSampler(dataset=dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                           batch_sampler=dev_batch_sampler,
                                           return_list=True)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate,
                                       parameters=model.parameters())

    criterion = paddle.nn.BCELoss()
    metric = SpanEvaluator()

    loss_list = []
    global_step = 0
    best_step = 0
    best_f1 = 0
    tic_train = time.time()
    for epoch in range(1, args.num_epochs + 1):
        for batch in train_data_loader:
            input_ids, token_type_ids, att_mask, pos_ids, start_ids, end_ids = batch
            start_prob, end_prob = model(input_ids, token_type_ids, att_mask,
                                         pos_ids)
            start_ids = paddle.cast(start_ids, 'float32')
            end_ids = paddle.cast(end_ids, 'float32')
            loss_start = criterion(start_prob, start_ids)
            loss_end = criterion(end_prob, end_ids)
            loss = (loss_start + loss_end) / 2.0
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_list.append(float(loss))

            global_step += 1
            if global_step % args.logging_steps == 0 and rank == 0:
                time_diff = time.time() - tic_train
                loss_avg = sum(loss_list) / len(loss_list)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss_avg,
                       args.logging_steps / time_diff))
                tic_train = time.time()

            if global_step % args.valid_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

                precision, recall, f1 = evaluate(model, metric,
                                                 dev_data_loader)
                logger.info(
                    "Evaluation precision: %.5f, recall: %.5f, F1: %.5f" %
                    (precision, recall, f1))
                if f1 > best_f1:
                    logger.info(
                        f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}"
                    )
                    best_f1 = f1
                    save_dir = os.path.join(args.save_dir, "model_best")
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(save_dir)
                    tokenizer.save_pretrained(save_dir)
                tic_train = time.time()
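
The loop above treats UIE extraction as two pointer tasks: binary cross-entropy on the start and end probabilities, averaged. A minimal sketch of that loss with made-up tensors:

import paddle

criterion = paddle.nn.BCELoss()
start_prob = paddle.to_tensor([[0.9, 0.1, 0.2]])  # predicted start-pointer probs
end_prob = paddle.to_tensor([[0.1, 0.2, 0.8]])    # predicted end-pointer probs
start_ids = paddle.to_tensor([[1.0, 0.0, 0.0]])   # gold start labels
end_ids = paddle.to_tensor([[0.0, 0.0, 1.0]])     # gold end labels
loss = (criterion(start_prob, start_ids) + criterion(end_prob, end_ids)) / 2.0
print(float(loss))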
Example #22
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Dataset {} not found".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])

    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
        cache_dir=model_args.cache_dir)

    label_list = getattr(raw_datasets['train'], "label_list", None)
    data_args.label_list = label_list

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path)

    loss_fct = CrossEntropyLossForSQuAD()

    train_dataset = raw_datasets["train"]
    eval_examples = raw_datasets["validation"]
    predict_examples = raw_datasets["test"]

    column_names = raw_datasets["train"].column_names
    # Dataset pre-process
    train_dataset = train_dataset.map(
        partial(prepare_train_features, tokenizer=tokenizer, args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )

    eval_dataset = eval_examples.map(
        partial(prepare_validation_features,
                tokenizer=tokenizer,
                args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on validation dataset",
    )

    predict_dataset = predict_examples.map(
        partial(prepare_validation_features,
                tokenizer=tokenizer,
                args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on prediction dataset",
    )

    # Define data collector
    data_collator = qa_collator(tokenizer, data_args)

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, all_nbest_json, scores_diff_json = compute_prediction(
            examples=examples,
            features=features,
            predictions=predictions,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
        )
        # Format the result to the format the metric expects.
        formatted_predictions = [{
            "id": k,
            "prediction_text": v
        } for k, v in predictions.items()]
        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    # Define the metrics of tasks.
    # Metrics
    metric = load_metric("squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions,
                              references=p.label_ids)

    trainer = QuestionAnsweringTrainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and tests model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(predict_dataset, predict_examples)
    trainer.log_metrics("predict", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # export inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64")  # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
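
The PdArgumentParser pattern above maps CLI flags onto dataclass fields and returns one populated instance per dataclass. A self-contained sketch (DemoArguments and its field are hypothetical stand-ins for the real ModelArguments/DataTrainingArguments/TrainingArguments):

from dataclasses import dataclass, field

from paddlenlp.trainer import PdArgumentParser

@dataclass
class DemoArguments:
    model_name_or_path: str = field(
        default="ernie-3.0-medium-zh",
        metadata={"help": "Pretrained model name or local path."})

(demo_args,) = PdArgumentParser((DemoArguments,)).parse_args_into_dataclasses()
print(demo_args.model_name_or_path)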
Example #23
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Dataset {} not found".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])

    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
    num_classes = 1 if raw_datasets["train"].label_list is None else len(
        raw_datasets['train'].label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)
    loss_fct = nn.loss.CrossEntropyLoss(
    ) if data_args.label_list else nn.loss.MSELoss()

    # Define dataset pre-process function
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    batchify_fn = defaut_collator(tokenizer, data_args)

    # Dataset pre-process
    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)
    test_dataset = raw_datasets["test"].map(trans_fn)

    # Define the metrics of tasks.
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions

        preds = paddle.to_tensor(preds)
        label = paddle.to_tensor(p.label_ids)

        probs = F.softmax(preds, axis=1)
        metric = Accuracy()
        metric.reset()
        result = metric.compute(preds, label)
        metric.update(result)
        accu = metric.accumulate()
        metric.reset()
        return {"accuracy": accu}

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and tests model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # export inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64")  # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
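
The nested loops above implement a simple "config wins" override: any attribute of the parsed args that also appears in the dataset's yaml config is replaced by the config value. A toy illustration with a hypothetical config dict:

from types import SimpleNamespace

config = {"learning_rate": 1e-5, "batch_size": 32}  # hypothetical yaml config
args = SimpleNamespace(learning_rate=5e-5, num_train_epochs=3)
for arg in vars(args):
    if arg in config:
        setattr(args, arg, config[arg])
print(args.learning_rate)  # 1e-05: the config overrides the CLI default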
Example #24
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")

parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    model_name_or_path = 'rocketqa-zh-dureader-query-encoder'
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(convert_example_test,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    pretrained_model = AutoModel.from_pretrained(model_name_or_path)

    model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
    model = paddle.DataParallel(model)
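
The hnsw_* flags above describe an HNSW approximate-nearest-neighbour index, though the index-building code is not part of this excerpt. A hedged sketch of how those flags typically map onto hnswlib (hnswlib itself and the embedding dimension are assumptions):

import hnswlib
import numpy as np

index = hnswlib.Index(space="ip", dim=256)  # dim is hypothetical
index.init_index(max_elements=1000000,      # --hnsw_max_elements
                 ef_construction=100,       # --hnsw_ef
                 M=100)                     # --hnsw_m
index.set_ef(100)
index.add_items(np.random.rand(8, 256).astype("float32"))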
Example #25
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    paddle.set_device(training_args.device)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    data_args.dataset = data_args.dataset.strip()

    dataset_config = data_args.dataset.split(" ")
    print(dataset_config)
    raw_datasets = load_dataset(
        dataset_config[0],
        name=None if len(dataset_config) <= 1 else dataset_config[1],
        splits=('train', 'dev'))

    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
    num_classes = 1 if raw_datasets["train"].label_list is None else len(
        raw_datasets['train'].label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)
    criterion = nn.loss.CrossEntropyLoss(
    ) if data_args.label_list else nn.loss.MSELoss()

    # Define dataset pre-process function
    trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    data_collator = DataCollatorWithPadding(tokenizer)

    # Dataset pre-process
    if training_args.do_train:
        train_dataset = raw_datasets["train"].map(trans_fn)
    if training_args.do_eval:
        eval_dataset = raw_datasets["dev"].map(trans_fn)
    if training_args.do_predict:
        test_dataset = raw_datasets["test"].map(trans_fn)

    # Define the metrics of tasks.
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions

        preds = paddle.to_tensor(preds)
        label = paddle.to_tensor(p.label_ids)

        probs = F.softmax(preds, axis=1)
        metric = Accuracy()
        metric.reset()
        result = metric.compute(preds, label)
        metric.update(result)
        accu = metric.accumulate()
        metric.reset()
        return {"accuracy": accu}

    trainer = Trainer(
        model=model,
        criterion=criterion,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    if training_args.do_train:
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluate and tests model
    if training_args.do_eval:
        eval_metrics = trainer.evaluate()
        trainer.log_metrics("eval", eval_metrics)

    if training_args.do_predict:
        test_ret = trainer.predict(test_dataset)
        trainer.log_metrics("test", test_ret.metrics)
        if test_ret.label_ids is None:
            paddle.save(
                test_ret.predictions,
                os.path.join(training_args.output_dir,
                             "test_results.pdtensor"),
            )

    # export inference model
    if training_args.do_export:
        # You can also load from certain checkpoint
        # trainer.load_state_dict_from_checkpoint("/path/to/checkpoint/")
        input_spec = [
            paddle.static.InputSpec(shape=[None, None],
                                    dtype="int64"),  # input_ids
            paddle.static.InputSpec(shape=[None, None],
                                    dtype="int64")  # segment_ids
        ]
        if model_args.export_model_dir is None:
            model_args.export_model_dir = os.path.join(
                training_args.output_dir, "export")
        paddlenlp.transformers.export_model(model=trainer.model,
                                            input_spec=input_spec,
                                            path=model_args.export_model_dir)
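
Once export_model has written the static graph, it can be served without any of the training code. A hedged sketch of loading it back (the path prefix is an assumption about where export_model writes its files):

import paddle

static_model = paddle.jit.load("output/export/model")  # hypothetical prefix
static_model.eval()
input_ids = paddle.to_tensor([[1, 2, 3, 4]], dtype="int64")
segment_ids = paddle.zeros_like(input_ids)  # matches the second InputSpec
logits = static_model(input_ids, segment_ids)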
Example #26
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    num_classes = 3
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Construct dict
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n
    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)

    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(
                            args.output_dir,
                            "ernie_m_ft_model_%d.pdparams" % (global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
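
AdamWDL above applies layer-wise learning-rate decay: layers closer to the output keep more of the base learning rate, while lower layers are updated more conservatively. A toy sketch of that schedule (the exact per-parameter scaling inside AdamWDL may differ):

base_lr, layerwise_decay, n_layers = 5e-5, 0.8, 12
for depth in range(n_layers + 1):
    scale = layerwise_decay ** (n_layers - depth)
    print(f"layer {depth:2d}: lr = {base_lr * scale:.2e}")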
Example #27
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    data_args.dataset = data_args.dataset.strip()

    if data_args.dataset in ALL_DATASETS:
        # if you custom you hyper-parameters in yaml config, it will overwrite all args.
        config = ALL_DATASETS[data_args.dataset]
        for args in (model_args, data_args, training_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])

        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
        cache_dir=model_args.cache_dir)

    label_list = getattr(raw_datasets['train'], "label_list", None)
    data_args.label_list = label_list

    # Define tokenizer, model, loss function. 
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path)

    loss_fct = CrossEntropyLossForSQuAD()

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    # The train and validation splits share the same raw column names here.
    column_names = raw_datasets["train"].column_names

    train_dataset = raw_datasets["train"]
    # Create train feature from dataset
    with training_args.main_process_first(
            desc="train dataset map pre-processing"):
        # Dataset pre-process
        train_dataset = train_dataset.map(
            partial(
                prepare_train_features, tokenizer=tokenizer, args=data_args),
            batched=True,
            num_proc=4,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset", )
    eval_examples = raw_datasets["validation"]
    with training_args.main_process_first(
            desc="evaluate dataset map pre-processing"):
        eval_dataset = eval_examples.map(
            partial(
                prepare_validation_features,
                tokenizer=tokenizer,
                args=data_args),
            batched=True,
            num_proc=4,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset", )

    # Define data collector
    data_collator = DataCollatorWithPadding(tokenizer)

    # Post-processing:
    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, all_nbest_json, scores_diff_json = compute_prediction(
            examples=examples,
            features=features,
            predictions=predictions,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold, )

        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in examples]
        return EvalPrediction(predictions=predictions, label_ids=references)

    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        tokenizer=tokenizer)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    prune = True
    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))
    trainer.compress(
        data_args.dataset,
        output_dir,
        pruning=prune,
        quantization=True,
        compress_config=compress_config)
Example #28
def run(args):
    if args.do_train:
        assert args.batch_size % args.gradient_accumulation_steps == 0, \
            "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`."
    paddle.set_device(args.device)
    set_seed(args)

    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        SPIECE_UNDERLINE = '▁'

        def _is_chinese_char(cp):
            if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
                return True

            return False

        def is_fuhao(c):
            if c == '。' or c == ',' or c == '!' or c == '?' or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \
                    or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \
                    or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \
                    or c == '‘' or c == '’':
                return True
            return False

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    output.append(char)
                    if text[index - 12:index + 1].startswith("#idiom"):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        is_blank = True
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or c == SPIECE_UNDERLINE:
                return True
            return False

        def add_tokens_for_around(tokens, pos, num_tokens):
            num_l = num_tokens // 2
            num_r = num_tokens - num_l

            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')

            return tokens_l, tokens_r

        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": [], "example_ids": []}
        else:
            result = {
                "input_ids": [],
                "token_type_ids": [],
                "labels": [],
                "example_ids": []
            }
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]

            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). \
                    replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                all_doc_tokens = []
                for (i, token) in enumerate(doc_tokens):
                    if '#idiom' in token:
                        sub_tokens = [str(token)]
                    else:
                        sub_tokens = tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        all_doc_tokens.append(sub_token)
                tags = [blank for blank in doc_tokens if '#idiom' in blank]

                # Each sentence may have several tags
                for tag_index, tag in enumerate(tags):
                    pos = all_doc_tokens.index(tag)

                    tmp_l, tmp_r = add_tokens_for_around(
                        all_doc_tokens, pos, num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)
                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tag which is not considered in this new sample.
                            # Each idiom has four words, so 4 mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l

                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r

                    tokens_list = []
                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len]
                    for i, elem in enumerate(options):
                        option = tokenizer.tokenize(elem)
                        tokens = option + ['[SEP]'] + tokens_l + \
                            ['[unused1]'] + tokens_r
                        tokens_list.append(tokens)
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(new_data["token_type_ids"])
                    result["example_ids"].append(idx)
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                logger.info("%d samples have been processed." % (idx + 1))
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        with main_process_first(desc="train dataset map pre-processing"):
            train_ds = train_ds.map(
                partial(preprocess_function),
                batched=True,
                batch_size=len(train_ds),
                num_proc=args.num_proc,
                remove_columns=column_names,
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on train dataset")
        batchify_fn = lambda samples, fn=Dict(
            {
                'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id
                                 ),  # input
                'token_type_ids': Pad(
                    axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
                'labels': Stack(dtype="int64"),  # label
                'example_ids': Stack(dtype="int64"),  # example id
            }): fn(samples)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        with main_process_first(desc="evaluate dataset map pre-processing"):
            dev_ds = dev_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(dev_ds),
                                remove_columns=column_names,
                                num_proc=args.num_proc,
                                load_from_cache_file=not args.overwrite_cache,
                                desc="Running tokenizer on validation dataset")

        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)

        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)

        num_training_steps = int(
            args.max_steps /
            args.gradient_accumulation_steps) if args.max_steps >= 0 else int(
                len(train_data_loader) * args.num_train_epochs /
                args.gradient_accumulation_steps)

        warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, warmup)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)

        loss_fct = nn.CrossEntropyLoss()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels, example_ids = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        logger.info(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss, args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()
                if global_step >= num_training_steps:
                    logger.info("best_result: %.2f" % (best_acc * 100))
                    return
            tic_eval = time.time()
            acc = evaluate(model, dev_data_loader)
            logger.info("eval acc: %.5f, eval done total : %s s" %
                        (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                if args.save_best_model:
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    if not os.path.exists(args.output_dir):
                        os.makedirs(args.output_dir)
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

        logger.info("best_result: %.2f" % (best_acc * 100))

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=args.num_proc)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)

        batchify_fn = lambda samples, fn=Dict({
            'input_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids':
            Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'example_ids':
            Stack(dtype="int64"),  # example id
        }): fn(samples)

        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        result = {}
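        # Prediction keys mirror the ChID test ids, which start at 623377.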
        idx = 623377
        preds = evaluate(model, test_data_loader, do_predict=True)
        for pred in preds:
            result["#idiom" + str(idx) + "#"] = pred
            idx += 1
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(os.path.join(args.output_dir, 'chid11_predict.json'),
                  "w") as writer:
            json.dump(result, writer, indent=2)
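
The _tokenize_chinese_chars helper above surrounds each CJK character with sentinel underlines so that every character survives tokenization as its own unit. A standalone re-implementation of just that idea, simplified to the main CJK block:

SPIECE_UNDERLINE = '▁'

def is_cjk(ch):
    # Simplified: only the main CJK Unified Ideographs block.
    return 0x4E00 <= ord(ch) <= 0x9FFF

def spread_cjk(text):
    out = []
    for ch in text:
        if is_cjk(ch):
            if out and out[-1] != SPIECE_UNDERLINE:
                out.append(SPIECE_UNDERLINE)
            out.extend([ch, SPIECE_UNDERLINE])
        else:
            out.append(ch)
    return "".join(out)

print(spread_cjk("abc中文"))  # abc▁中▁文▁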
Example #29
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_examples, dev_examples, test_examples = load_dataset(
        'cmrc2018', split=["train", "validation", "test"])

    column_names = train_examples.column_names
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as its basic data structure, while we use a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples['answers'][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_validation_features function. The main difference
        # is that HuggingFace uses an ArrowTable as the basic data structure, while we use a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length,
                                       return_attention_mask=True)

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know which part is the context and which is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]
            context_index = 1
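            # In a BERT-style pair encoding, token_type_ids are 0 for the
            # question and 1 for the context, hence context_index = 1.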

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set the offset_mapping entries that are not part of the context to
            # None, so it is easy to tell whether a token position lies inside the context.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)

        train_ds = train_examples.map(prepare_train_features,
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=1)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
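        # Dict maps each named field of a sample to a batchify op: Pad right-pads
        # the variable-length id sequences, Stack gathers the scalar positions.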
        train_batchify_fn = lambda samples, fn=Dict(
            {
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0,
                                      pad_val=tokenizer.pad_token_type_id),
                "start_positions": Stack(dtype="int64"),
                "end_positions": Stack(dtype="int64")
            }): fn(samples)
        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        dev_ds = dev_examples.map(prepare_validation_features,
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        if args.max_steps > 0:
            num_training_steps = int(args.max_steps /
                                     args.gradient_accumulation_steps)
        else:
            num_training_steps = int(
                len(train_data_loader) * args.num_train_epochs /
                args.gradient_accumulation_steps)

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))
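                # Scale the loss down so the gradients accumulated over
                # gradient_accumulation_steps micro-batches average to one step.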
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()

                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss, args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()

                    if global_step % args.save_steps == 0 or global_step == num_training_steps:
                        if rank == 0:
                            output_dir = os.path.join(args.output_dir,
                                                      "model_%d" % global_step)
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # Need a better way to get the inner model out of DataParallel.
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                            print('Saving checkpoint to:', output_dir)
                        if global_step == num_training_steps:
                            break
            evaluate(model, dev_examples, dev_data_loader, args)
            # The inner break only exits the step loop; stop the epoch loop too
            # once the step budget is exhausted.
            if global_step == num_training_steps:
                break

    if args.do_predict and rank == 0:
        test_ds = test_examples.map(prepare_validation_features,
                                    batched=True,
                                    remove_columns=column_names,
                                    num_proc=1)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        test_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        test_data_loader = DataLoader(dataset=test_ds,
                                      batch_sampler=test_batch_sampler,
                                      collate_fn=test_batchify_fn,
                                      return_list=True)

        evaluate(model, test_examples, test_data_loader, args, do_eval=False)
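
The character-to-token alignment in prepare_train_features above is easiest to see on hand-made data. Here is a minimal, self-contained sketch of the two while-loops that pin the answer span; the offsets and the answer span are toy values, not real tokenizer output:

# Toy offsets for [CLS] "Hello" "the" "answer" "word" [SEP]; special tokens get (0, 0).
offsets = [(0, 0), (0, 5), (6, 9), (10, 17), (18, 22), (0, 0)]
start_char, end_char = 10, 17  # character span of the hypothetical answer "answer"

token_start_index = 1                 # first context token, after [CLS]
token_end_index = len(offsets) - 2    # last context token, before [SEP]

# Walk right until a token starts after the answer, then step back one.
while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
    token_start_index += 1
# Walk left until a token ends before the answer, then step forward one.
while offsets[token_end_index][1] >= end_char:
    token_end_index -= 1
print(token_start_index - 1, token_end_index + 1)  # -> 3 3: the "answer" token

The keyed Dict collate can likewise be exercised on its own. A short sketch, assuming paddlenlp is installed and using pad value 0 in place of the tokenizer's pad_token_id / pad_token_type_id:

from paddlenlp.data import Dict, Pad, Stack

batchify_fn = lambda samples, fn=Dict({
    "input_ids": Pad(axis=0, pad_val=0),
    "token_type_ids": Pad(axis=0, pad_val=0),
    "start_positions": Stack(dtype="int64"),
    "end_positions": Stack(dtype="int64")
}): fn(samples)

samples = [
    {"input_ids": [1, 5, 9, 2], "token_type_ids": [0, 0, 1, 1],
     "start_positions": 2, "end_positions": 2},
    {"input_ids": [1, 7, 2], "token_type_ids": [0, 0, 1],
     "start_positions": 0, "end_positions": 0},
]
input_ids, token_type_ids, starts, ends = batchify_fn(samples)
print(input_ids.shape)  # (2, 4); the shorter sample is right-padded with 0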
Exemple #30
0
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds, test_ds = load_dataset(args.dataset,
                                             splits=["train", "dev", "test"])

    model = AutoModelForSequenceClassification.from_pretrained(
        'ernie-1.0', num_classes=len(train_ds.label_list))
    tokenizer = AutoTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         is_pair=args.dataset == "xnli_cn")
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
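    # Unlike the keyed Dict collate, Tuple is positional: the i-th op is applied
    # to the i-th field of every sample, so each sample must be an
    # (input_ids, token_type_ids, label) tuple in that order.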
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
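            # Whitelisted ops (layer_norm/softmax/gelu) run in float16 inside
            # auto_cast when use_amp is on; other ops keep float32.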
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"],
            ):
                logits = model(input_ids, token_type_ids)
                loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
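                # minimize() unscales the gradients, applies optimizer.step(),
                # and updates the loss scale (skipping the step on inf/nan grads).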
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            global_step += 1
            if global_step % args.logging_steps == 0 and rank == 0:
                time_diff = time.time() - tic_train
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       args.logging_steps / time_diff))
                tic_train = time.time()

            if global_step % args.valid_steps == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)
                tic_train = time.time()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
                tic_train = time.time()