Example #1
def load_ds(datafiles):
    '''
    input:
        datafiles -- str or list[str] -- the path(s) of the train or dev set files

    output:
        MapDataset
    '''

    datas = []

    def read(ds_file):
        with open(ds_file, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp.readlines():
                data = line[:-1].split('\t')
                if len(data) == 2:
                    yield ({'text': data[1], 'label': int(data[0])})
                elif len(data) == 3:
                    yield ({'text': data[2], 'label': int(data[1])})

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
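
A minimal usage sketch of the reader above, with hypothetical file content; it assumes the example's own imports (MapDataset from paddlenlp.datasets) are already in place:

import tempfile

# Hypothetical two-column format: "label<TAB>text", with a header row that read() skips.
content = "label\ttext\n1\t这家酒店位置很好\n0\t房间隔音比较差\n"
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False, encoding="utf-8") as f:
    f.write(content)

ds = load_ds(f.name)
print(len(ds), ds[0])   # 2 {'text': '这家酒店位置很好', 'label': 1}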
Example #2
def main():
    """主函数"""
    paddle.set_device("gpu:1")
    set_seed(2021)  # 设置随机数种子
    logger.info("构建数据集")
    data_file = os.path.join(work_root, "data/NewsTrain.txt")
    datasets, labels = load_data(data_file)
    save_dict_obj(labels, os.path.join(work_root,
                                       "data/news_labels_info.json"))
    for i in range(3):
        random.shuffle(datasets)
    train_data_num = int(len(datasets) * 0.8)
    train_dataset, valid_dataset = datasets[:train_data_num], datasets[
        train_data_num:]
    train_dataset, valid_dataset = MapDataset(train_dataset), MapDataset(
        valid_dataset)
    logger.info("数据转换word2id")
    tokenizer = paddlenlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=64)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): fn(samples)  # A bit hard to follow and less intuitive than Torch's collate_fn; pad_val is handy because it pads dynamically to the longest sequence in each batch

    train_loader = create_dataloader(train_dataset,
                                     mode="train",
                                     batch_size=512,
                                     batchify_fn=batchify_fn,
                                     trans_fn=trans_func)
    valid_loader = create_dataloader(valid_dataset,
                                     mode="valid",
                                     batch_size=256,
                                     batchify_fn=batchify_fn,
                                     trans_fn=trans_func)
    epochs = 5  # number of training epochs
    num_training_steps = len(train_loader) * epochs
    num_classes = len(labels)
    model, optimizer, criterion, lr_scheduler = create_classification_model(
        num_classes, num_training_steps)

    logger.info("训练模型")
    metric = paddle.metric.Accuracy()
    train(model,
          optimizer,
          criterion,
          lr_scheduler,
          metric,
          tokenizer,
          train_loader,
          valid_loader,
          epochs=epochs)
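
The lambda-with-default-argument batchify_fn above is just a Tuple(Pad, Pad, Stack) collator; a toy batch with made-up token ids shows the dynamic padding the inline comment refers to:

from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([1, 2, 3], [0, 0, 0], 0),   # (input_ids, token_type_ids, label)
    ([4, 5], [0, 0], 1),
]
collate = Tuple(
    Pad(axis=0, pad_val=0),      # pad input_ids to the longest sequence in this batch
    Pad(axis=0, pad_val=0),      # pad token_type_ids the same way
    Stack(dtype="int64"),        # stack the labels into one array
)
input_ids, token_type_ids, labels = collate(samples)
print(input_ids.shape, labels)   # (2, 3) [0 1]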
Example #3
    def _load_dataset(self, datafiles):
        def read(data_path):
            with open(data_path, 'r', encoding='utf-8') as fp:
                for line in fp.readlines():
                    order, words, labels = line.strip('\n').split('\t')
                    yield {'tokens': words, 'labels': labels}

        if isinstance(datafiles, str):
            return MapDataset(list(read(datafiles)))
        elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
            return [MapDataset(list(read(datafile))) for datafile in datafiles]
Example #4
def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
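
The '\002' delimiter (the ASCII STX control character) used above is easiest to see on a single hypothetical line; the B-LOC/I-LOC tags are made up:

line = "北\002京\tB-LOC\002I-LOC"
words, labels = line.strip('\n').split('\t')
print(words.split('\002'))    # ['北', '京']
print(labels.split('\002'))   # ['B-LOC', 'I-LOC']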
Example #5
def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            for i, line in enumerate(fp):
                example = json.loads(line)
                words = example["tokens"]
                tags = example["tags"]
                cls_label = example["cls_label"]
                yield words, tags, cls_label

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
Example #6
def news_title_classification():
    """文本分类---新闻标题分类"""
    data = json.loads(request.data)
    if isinstance(data["texts"], str):
        data["texts"] = [data["texts"]]

    if isinstance(data["texts"], list):
        datasets = []
        for text in data["texts"]:
            datasets.append({"text": text})
        datasets = MapDataset(datasets)
        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=64)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): fn(samples)
        data_loader = create_dataloader(datasets,
                                        mode="test",
                                        batch_size=32,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

        labels = inference(model, data_loader)
        labels_text = []
        for label in labels:
            labels_text.append(labels_info[str(label)])

        return jsonify(status="Success", results=labels_text)
    else:
        return jsonify(status="Failure",
                       message="Incorrect parameter data type.")
Example #7
    def _pre_process_text(self, input_texts, max_seq_len=128, batch_size=1):
        infer_data = []
        max_predict_len = max_seq_len - self.summary_num - 1
        short_input_texts = self._split_long_text2short_text_list(
            input_texts, max_predict_len)
        for text in short_input_texts:
            tokens = ["[CLS%i]" % i
                      for i in range(1, self.summary_num)] + list(text)
            tokenized_input = self._tokenizer(tokens,
                                              return_length=True,
                                              is_split_into_words=True,
                                              max_seq_len=max_seq_len)
            infer_data.append([
                tokenized_input['input_ids'],
                tokenized_input['token_type_ids'], tokenized_input['seq_len']
            ])
        infer_ds = MapDataset(infer_data)

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype='int64'
                ),  # input_ids
            Pad(axis=0,
                pad_val=self._tokenizer.pad_token_type_id,
                dtype='int64'),  # token_type_ids
            Stack(dtype='int64'),  # seq_len
        ): fn(samples)

        infer_data_loader = paddle.io.DataLoader(infer_ds,
                                                 collate_fn=batchify_fn,
                                                 num_workers=0,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 return_list=True)

        return infer_data_loader, short_input_texts
Example #8
    def predict_cls(self, args, ext_results):
        test_ds = MapDataset(ext_results)
        trans_func = partial(convert_example_to_feature_cls,
                             tokenizer=self.tokenizer,
                             label2id=self.cls_label2id,
                             max_seq_len=args.cls_max_seq_len,
                             is_test=True)
        test_ds = test_ds.map(trans_func, lazy=False)
        batch_list = [
            test_ds[idx:idx + args.batch_size]
            for idx in range(0, len(test_ds), args.batch_size)
        ]

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
            Pad(axis=0,
                pad_val=self.tokenizer.pad_token_type_id,
                dtype="int64"), Stack(dtype="int64")): fn(samples)

        results = []
        for batch_data in batch_list:
            input_ids, token_type_ids, _ = batchify_fn(batch_data)
            self.cls_input_handles[0].copy_from_cpu(input_ids)
            self.cls_input_handles[1].copy_from_cpu(token_type_ids)
            self.cls_predictor.run()
            logits = self.cls_output_hanle.copy_to_cpu()

            predictions = logits.argmax(axis=1).tolist()
            results.extend(predictions)

        return results
Example #9
def build_data_loader(args, tokenizer):
    """ build corpus_data_loader and text_data_loader
    """

    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example function's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # build text data_loader
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

    text_ds = MapDataset(text_list)

    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    d = {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }

    return d
Example #10
def load_ds_xnli(datafiles):
    datas = []

    def read(ds_file):
        with open(ds_file, 'r', encoding='utf-8') as fp:
            # next(fp)  # Skip header
            for line in fp.readlines():
                data = line.strip().split('\t', 2)
                first, second, third = data
                yield ({
                    "sentence1": first,
                    "sentence2": second,
                    "label": third
                })

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
Example #11
    def _create_examples(self, datasets, phase, task_label_description):
        """Creates examples for the training and dev sets."""

        examples = []

        if phase == "train":
            for example in datasets:
                true_label = example["label"]
                neg_examples = []
                for label, label_description in task_label_description.items():
                    new_example = dict()
                    new_example["sentence1"] = example["text"]
                    new_example["sentence2"] = example["target"][
                        "span1_text"] + label_description + example["target"][
                            "span2_text"]

                    # TODO: handle imbalanced examples; may hurt model performance
                    if true_label == label:
                        new_example["label"] = 1
                        examples.append(new_example)
                    else:
                        new_example["label"] = 0
                        neg_examples.append(new_example)
                neg_examples = random.sample(neg_examples, self.neg_num)
                examples.extend(neg_examples)

        elif phase == "dev":
            for example in datasets:
                true_label = str(example["label"])
                for label, label_description in task_label_description.items():
                    new_example = dict()
                    new_example["sentence1"] = example["text"]
                    new_example["sentence2"] = example["target"][
                        "span1_text"] + label_description + example["target"][
                            "span2_text"]

                    # Get true_label's index in task_label_description for evaluation
                    true_label_index = list(
                        task_label_description.keys()).index(true_label)
                    new_example["label"] = true_label_index
                    examples.append(new_example)

        elif phase == "test":
            for example in datasets:
                for label, label_description in task_label_description.items():
                    new_example = dict()
                    new_example["sentence1"] = example["text"]
                    new_example["sentence2"] = example["target"][
                        "span1_text"] + label_description + example["target"][
                            "span2_text"]
                    examples.append(new_example)

        return MapDataset(examples)
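
To make the pair construction above concrete, here is a toy walk-through of one coreference-style example; the sentence, spans and label descriptions are made up, and the loop only mirrors the method's logic outside its class:

task_label_description = {"true": "就是指", "false": "不是指"}   # hypothetical descriptions
example = {"text": "小明看着小红，他笑了。",
           "target": {"span1_text": "他", "span2_text": "小明"},
           "label": "true"}

for label, description in task_label_description.items():
    sentence1 = example["text"]
    sentence2 = example["target"]["span1_text"] + description + example["target"]["span2_text"]
    pair_label = 1 if example["label"] == label else 0   # train phase: binary label per pairing
    print(sentence1, "|", sentence2, "|", pair_label)
# In the train branch the positive pairing is kept and self.neg_num negative
# pairings are sampled; in the dev branch every pairing is kept and the label is
# the index of the true label in task_label_description.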
Example #12
def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            if "infer" in data_path:
                next(fp)
            for line in fp:
                line = line.strip()
                if "infer" in data_path:
                    words = list(line)
                    yield [words]
                else:
                    words, labels = line.split("\t")
                    words = words.split(CHAR_DELIMITER)
                    labels = labels.split(CHAR_DELIMITER)
                    assert len(words) == len(
                        labels), "The words %s do not match the labels %s" % (
                            words, labels)
                    yield [words, labels]

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
Example #13
def create_data_loader(args, dataset_class, trans_func, batchify_fn, mode):
    dataset = dataset_class(args.data_dir, mode)
    dataset = MapDataset(dataset).map(trans_func, lazy=True)
    if mode == 'train':
        batch_sampler = DistributedBatchSampler(
            dataset, batch_size=args.batch_size, shuffle=True)
    else:
        batch_sampler = BatchSampler(
            dataset, batch_size=args.test_batch_size, shuffle=False)
    data_loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)
    return data_loader
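
A minimal sketch (toy data) of the lazy flag on MapDataset.map used above: with lazy=True the transform runs when an item is read, with lazy=False it runs immediately over the whole dataset:

from paddlenlp.datasets import MapDataset

ds = MapDataset([{"text": "hello"}, {"text": "world"}])
ds.map(lambda ex: {"num_chars": len(ex["text"])}, lazy=True)    # applied on access
print(ds[0])        # {'num_chars': 5}

ds2 = MapDataset([{"text": "hi"}])
ds2.map(lambda ex: {"num_chars": len(ex["text"])}, lazy=False)  # applied right away to all items
print(ds2[0])       # {'num_chars': 2}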
Example #14
    def _create_examples(self, datasets, phase, task_label_description):
        """Creates examples for the training and dev sets."""

        examples = []

        if phase == "train":
            for example in datasets:
                neg_examples = []
                true_label_index = int(example["answer"])
                candidates = example["candidates"]
                for idx, candidate in enumerate(candidates):
                    new_example = dict()
                    new_example["sentence1"] = example["content"]
                    new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate

                    if idx == true_label_index:
                        new_example["label"] = 1
                        examples.append(new_example)
                    else:
                        new_example["label"] = 0
                        neg_examples.append(new_example)
                examples.extend(neg_examples)

        elif phase == "dev":
            for example in datasets:
                true_label = str(example["answer"])
                candidates = example["candidates"]
                for idx, candidate in enumerate(candidates):
                    new_example = dict()
                    new_example["sentence1"] = example["content"]
                    new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate

                    # Get true_label's index in task_label_description for evaluation
                    true_label_index = int(true_label)
                    new_example["label"] = true_label_index
                    examples.append(new_example)

        elif phase == "test":
            for example in datasets:
                candidates = example["candidates"]
                for idx, candidate in enumerate(candidates):
                    new_example = dict()
                    new_example["sentence1"] = example["content"]
                    new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate
                    examples.append(new_example)

        return MapDataset(examples)
Example #15
def predict_cls(args, ext_results):
    # load dict
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id), Stack(dtype="int64")
    ): fn(samples)

    # set shuffle to False
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # load cls model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()

    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
Example #16
def convert_features_to_dataset(features):
    """
    Converts a list of feature dictionaries (one for each sample) into a Paddle Dataset.

    :param features: A list of dictionaries. Each dictionary corresponds to one sample. Its keys are the
                     names of the feature types and the values are the features themselves.
    :Return: a Paddle dataset and a list of tensor names.
    """
    # features can be an empty list in cases where down sampling occurs
    if len(features) == 0:
        return None, None
    tensor_names = list(features[0].keys())
    all_tensors = []
    for t_name in tensor_names:
        try:
            # Checking whether a non-integer will be silently converted to Paddle.long
            check = features[0][t_name]
            if isinstance(check, numbers.Number):
                base = check
            # extract a base variable from nested lists or tuples
            elif isinstance(check, list):
                base = list(flatten_list(check))[0]
            # extract a base variable from numpy arrays
            else:
                base = check.ravel()[0]
            if not np.issubdtype(type(base), np.integer):
                logger.warning(
                    f"Problem during conversion to Paddle tensors:\n"
                    f"A non-integer value for feature '{t_name}' with a value of: "
                    f"'{base}' will be converted to a Paddle tensor of dtype long."
                )
        except Exception:
            logger.debug(f"Could not determine type for feature '{t_name}'. "
                         "Converting now to a tensor of default type long.")

        # Convert all remaining python objects to Paddle long tensors
        cur_tensor = [sample[t_name] for sample in features]
        all_tensors.append(cur_tensor)

    # TODO(tianxin): when set to IterDataset, paddle.io.BatchSampler throws an exception
    # all_tensors: List[List[all_token_ids], List[all_segment_ids]]
    # list(zip(*all_tensors)): List[([token_ids], [segment_ids]), ([token_ids], [segment_ids])]
    # For Question Answering: tensor_names: ['input_ids', 'padding_mask', 'segment_ids', 'passage_start_t', 'start_of_word', 'labels', 'id', 'seq_2_start_t', 'span_mask']
    dataset = MapDataset(list(zip(*all_tensors)))
    return dataset, tensor_names
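
A small usage sketch with toy, integer-valued features; it assumes the module context this function lives in (numpy, MapDataset and the logging helpers imported there):

import numpy as np

features = [
    {"input_ids": np.array([1, 2, 3]), "segment_ids": np.array([0, 0, 0]), "labels": 1},
    {"input_ids": np.array([4, 5, 6]), "segment_ids": np.array([0, 0, 1]), "labels": 0},
]
dataset, tensor_names = convert_features_to_dataset(features)
print(tensor_names)   # ['input_ids', 'segment_ids', 'labels']
print(dataset[0])     # (array([1, 2, 3]), array([0, 0, 0]), 1) -- one tuple per sample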
Example #17
def gen_pair(dataset, pool_size=100):
    """ 
    Generate triplet randomly based on dataset
 
    Args:
        dataset: A `MapDataset` or `IterDataset` or a tuple of those. 
            Each example is composed of 2 texts: exampe["query"], example["title"]
        pool_size: the number of example to sample negative example randomly

    Return:
        dataset: A `MapDataset` or `IterDataset` or a tuple of those.
        Each example is composed of 2 texts: exampe["query"], example["pos_title"]、example["neg_title"]
    """

    if len(dataset) < pool_size:
        pool_size = len(dataset)

    new_examples = []
    pool = []
    tmp_examples = []

    for example in dataset:
        label = example["label"]

        # Filter negative example
        if label == 0:
            continue

        tmp_examples.append(example)
        pool.append(example["title"])

        if len(pool) >= pool_size:
            np.random.shuffle(pool)
            for idx, example in enumerate(tmp_examples):
                example["neg_title"] = pool[idx]
                new_examples.append(example)
            tmp_examples = []
            pool = []
        else:
            continue
    return MapDataset(new_examples)
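
A toy sketch of the triplet generation above; the queries and titles are made up, and the field names follow the docstring:

from paddlenlp.datasets import MapDataset

raw = [
    {"query": "怎么退货", "title": "退货流程说明", "label": 1},
    {"query": "运费多少", "title": "运费收取标准", "label": 1},
    {"query": "怎么退货", "title": "开具发票指南", "label": 0},   # negative pair, filtered out
]
triplets = gen_pair(MapDataset(raw), pool_size=2)
for ex in triplets:
    print(ex["query"], "|", ex["title"], "|", ex["neg_title"])
# With such a tiny pool the sampled neg_title can coincide with the positive
# title; the default pool_size of 100 makes that unlikely in practice.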
Example #18
    model = paddle.DataParallel(model)

    # Load pretrained semantic model
    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        logger.info("Loaded parameters from %s" % args.params_path)
    else:
        raise ValueError(
            "Please set --params_path with correct pretrained model file")

    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example function's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # Need better way to get inner model of DataParallel
    inner_model = model._layers

    final_index = build_index(args, corpus_data_loader, inner_model)

    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

    query_ds = MapDataset(text_list)
Example #19
def train(args):

    # Load the data
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)

    # Wrap the datasets into MapDataset
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])

    # Define the XLNet tokenizer
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)

    # Build train_data_loader and dev_data_loader
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # Training configuration
    # Fix the random seed
    set_seed(args)

    # Set the runtime device
    use_gpu = paddle.get_device().startswith("gpu")
    if use_gpu:
        paddle.set_device('gpu:0')

    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    #paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # Set up the lr_scheduler
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Configure the optimizer
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Model training
    metric = Accuracy()

    # Define the loss function
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))

                if (not paddle.distributed.get_world_size() > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                if global_step == num_training_steps:
                    exit(0)
                tic_train += time.time() - tic_eval
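
The decay_params / apply_decay_param_fun pattern used above (and again in the ErnieForCSC example below) can be isolated in a small sketch; TinyNet is a made-up layer that only serves to show which parameters keep weight decay:

import paddle

class TinyNet(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.layer_norm = paddle.nn.LayerNorm(4)

    def forward(self, x):
        return self.layer_norm(self.linear(x))

model = TinyNet()
# Keep only parameters whose structural name contains neither "bias" nor "layer_norm".
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "layer_norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-3,
    parameters=model.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=lambda x: x in decay_params)  # decay applies only to linear.weight here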
Example #20
    else:
        paddle.set_device("gpu:2")
        is_server = True
        model_dir = os.path.join(work_root, "models/epoch_{}".format(3))
        labels_info = load_json_obj(
            os.path.join(work_root, "data/news_labels_info.json"))
        model = paddlenlp.transformers.ErnieForSequenceClassification.from_pretrained(
            model_dir)
        tokenizer = paddlenlp.transformers.ErnieTinyTokenizer.from_pretrained(
            model_dir)
        if is_server:
            app.run(host="0.0.0.0", port=8280)  # start the service
        else:
            test_data_file = os.path.join(work_root, "data/NewsTest.txt")
            test_datasets, _ = load_data(test_data_file)
            test_datasets = MapDataset(test_datasets[:1000])
            trans_func = partial(convert_example,
                                 tokenizer=tokenizer,
                                 max_seq_length=64)
            batchify_fn = lambda samples, fn=Tuple(
                Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
                Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
                Stack(dtype="int64")  # label
            ): fn(samples)
            test_loader = create_dataloader(test_datasets,
                                            mode="test",
                                            batch_size=32,
                                            batchify_fn=batchify_fn,
                                            trans_fn=trans_func)

            labels = inference(model, test_loader)
Example #21
    # pretrained_model=ErnieModel.from_pretrained("ernie-1.0")

    model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

    # Load pretrained semantic model
    if params_path and os.path.isfile(params_path):
        state_dict = paddle.load(params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % params_path)
    else:
        raise ValueError(
            "Please set --params_path with correct pretrained model file")

    # convert_example function's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    corpus_data_loader = create_dataloader(
        corpus_ds,
        mode='predict',
        batch_size=batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    all_embeddings = []
    model.eval()
    with paddle.no_grad():
        for batch_data in corpus_data_loader:
            input_ids, token_type_ids = batch_data
            input_ids = paddle.to_tensor(input_ids)
            token_type_ids = paddle.to_tensor(token_type_ids)
Example #22
def do_train(args):
    set_seed(args)
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    pinyin_vocab = Vocab.load_vocabulary(
        args.pinyin_vocab_file_path, unk_token='[UNK]', pad_token='[PAD]')

    tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    ernie = ErnieModel.from_pretrained(args.model_name_or_path)

    model = ErnieForCSC(
        ernie,
        pinyin_vocab_size=len(pinyin_vocab),
        pad_pinyin_id=pinyin_vocab[pinyin_vocab.pad_token])

    train_ds, eval_ds = load_dataset('sighan-cn', splits=['train', 'dev'])

    # Extend the current training dataset by providing an extra training
    # dataset directory. The file names in the extra dataset directory must
    # end with ".txt". Each line of such a dataset must contain a pair of
    # sentences, such as:
    # "城府宫员表示,这是过去三十六小时内第三期强烈的余震。\t政府官员表示,这是过去三十六小时内第三起强烈的余震。\n"
    if args.extra_train_ds_dir is not None and os.path.exists(
            args.extra_train_ds_dir):
        data = train_ds.data
        data_files = [
            os.path.join(args.extra_train_ds_dir, data_file)
            for data_file in os.listdir(args.extra_train_ds_dir)
            if data_file.endswith(".txt")
        ]
        for data_file in data_files:
            ds = load_dataset(
                read_train_ds,
                data_path=data_file,
                splits=["train"],
                lazy=False)
            data += ds.data
        train_ds = MapDataset(data)

    det_loss_act = paddle.nn.CrossEntropyLoss(
        ignore_index=args.ignore_label, use_softmax=False)
    corr_loss_act = paddle.nn.CrossEntropyLoss(
        ignore_index=args.ignore_label, reduction='none')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        pinyin_vocab=pinyin_vocab,
        max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Pad(axis=0, pad_val=pinyin_vocab.token_to_idx[pinyin_vocab.pad_token]),  # pinyin
        Pad(axis=0, dtype="int64"),  # detection label
        Pad(axis=0, dtype="int64"),  # correction label
        Stack(axis=0, dtype="int64")  # length
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    eval_data_loader = create_dataloader(
        eval_ds,
        mode='eval',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    logger.info("Total training step: {}".format(num_training_steps))
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_steps = 1
    best_f1 = -1
    tic_train = time.time()
    for epoch in range(args.epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, pinyin_ids, det_labels, corr_labels, length = batch
            det_error_probs, corr_logits = model(input_ids, pinyin_ids,
                                                 token_type_ids)
            # Chinese Spelling Correction has 2 tasks: detection task and correction task.
            # Detection task aims to detect whether each Chinese character has a spelling error.
            # Correction task aims to correct each potentially wrong character to the right character.
            # So we need to minimize detection loss and correction loss simultaneously.
            # See more loss design details on https://aclanthology.org/2021.findings-acl.198.pdf
            det_loss = det_loss_act(det_error_probs, det_labels)
            corr_loss = corr_loss_act(
                corr_logits, corr_labels) * det_error_probs.max(axis=-1)
            loss = (det_loss + corr_loss).mean()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_steps, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info("Eval:")
                    det_f1, corr_f1 = evaluate(model, eval_data_loader)
                    f1 = (det_f1 + corr_f1) / 2
                    model_file = "model_%d" % global_steps
                    if f1 > best_f1:
                        # save best model
                        paddle.save(model.state_dict(),
                                    os.path.join(args.output_dir,
                                                 "best_model.pdparams"))
                        logger.info("Save best model at {} step.".format(
                            global_steps))
                        best_f1 = f1
                        model_file = model_file + "_best"
                    model_file = model_file + ".pdparams"
                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir, model_file))
                    logger.info("Save model at {} step.".format(global_steps))
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
            global_steps += 1
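
read_train_ds is referenced above but not shown in this snippet; a hypothetical reader consistent with the commented file format (one wrong/correct sentence pair per line, tab-separated) could look like the following, where the dict keys are an assumption about what the surrounding convert_example expects:

def read_train_ds(data_path):
    # Hypothetical reader: one "wrong_sentence\tcorrect_sentence" pair per line.
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            source, target = line.rstrip('\n').split('\t')
            yield {'source': source, 'target': target}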