def generate(self, texts):
    # Add 0x02 between characters to match the format of the training data;
    # otherwise the length of the prediction results will not match the input
    # string if the input string contains non-Chinese characters.
    formatted_text_a = list(map("\002".join, texts))

    # Use the appropriate tokenizer to preprocess the data.
    # For ernie_tiny, it uses BertTokenizer too.
    tokenizer = hub.BertTokenizer(vocab_file=self.module.get_vocab_path())
    encoded_data = [
        tokenizer.encode(text=text, max_seq_len=128)
        for text in formatted_text_a
    ]
    results = self.gen_task.generate(
        data=encoded_data, label_list=self.label_list, accelerate_mode=False)
    results = [["".join(sample_result) for sample_result in sample_results]
               for sample_results in results]
    return results
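# A standalone sketch of what the "\002".join preprocessing above does to an
# input string (the sample text is made up, not from the source):
texts = ["Hello, 世界"]
formatted_text_a = list(map("\002".join, texts))
# Each character is now separated by the 0x02 control byte, so splitting on it
# recovers the original characters one by one:
assert formatted_text_a[0].split("\002") == list(texts[0])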
# yapf: enable.

if __name__ == '__main__':
    # Load the PaddleHub ERNIE pretrained model.
    module = hub.Module(name="ernie")

    # A pairwise task needs three slots: query, title_left and title_right.
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len, num_slots=3)

    # The tokenizer tokenizes the text data and encodes it as the model expects.
    # If you use a transformer module (ernie, bert, roberta and so on), the
    # tokenizer should be hub.BertTokenizer; otherwise use hub.CustomTokenizer.
    tokenizer = hub.BertTokenizer(
        vocab_file=module.get_vocab_path(), tokenize_chinese_chars=True)

    # Load the dataset.
    dataset = hub.dataset.DuEL(tokenizer=tokenizer,
                               max_seq_len=args.max_seq_len)

    # Construct the transfer learning network from sequence-level outputs.
    query = outputs["sequence_output"]
    left = outputs['sequence_output_2']
    right = outputs['sequence_output_3']

    # Select the fine-tune strategy.
    strategy = hub.AdamWeightDecayStrategy()

    # Setup RunConfig for the PaddleHub Fine-tune API.
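    # The snippet above stops at the RunConfig comment. A plausible
    # continuation, modeled on the pointwise matching demo later in this
    # section; the hub.PairwiseTextMatchingTask constructor and its
    # left_feature/right_feature parameter names are assumptions, not
    # confirmed by the source.
    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    pairwise_matching_task = hub.PairwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        left_feature=left,
        right_feature=right,
        tokenizer=tokenizer,
        config=config)

    pairwise_matching_task.finetune_and_eval()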
if __name__ == '__main__':
    # Load the PaddleHub ERNIE Tiny pretrained model.
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    # For ernie_tiny, it performs word segmentation to obtain subwords.
    # More details: https://www.jiqizhixin.com/articles/2019-11-06-9
    if module.name == "ernie_tiny":
        tokenizer = hub.ErnieTinyTokenizer(
            vocab_file=module.get_vocab_path(),
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
    else:
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

    dataset = hub.dataset.ChnSentiCorp(
        tokenizer=tokenizer, max_seq_len=args.max_seq_len)

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Select the fine-tune strategy, then set up the config and fine-tune.
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)
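    # The demo ends after the strategy is chosen. A sketch of the remaining
    # setup, assuming the standard hub.RunConfig / hub.TextClassifierTask flow
    # used elsewhere in these examples:
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    cls_task = hub.TextClassifierTask(
        dataset=dataset,
        feature=pooled_output,
        num_classes=dataset.num_labels,
        config=config)

    cls_task.finetune_and_eval()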
def get_task(args, schema_labels, id):
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    if args.do_model == 'mcls' or args.do_model == 'mcls_onlysentence':
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
        # Load the data and read it with MultiLabelClassifyReader.
        dataset = CCksDataset(args.data_dir,
                              schema_labels,
                              model=args.do_model,
                              tokenizer=tokenizer,
                              max_seq_len=args.max_seq_len)
        reader = MultiLabelClassifyReader(
            dataset=dataset,
            vocab_path=module.get_vocab_path(),
            max_seq_len=args.max_seq_len,
            sp_model_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
        # Build the multi-label classification transfer network on top of the
        # sentence-level pooled_output of the model.
        output = outputs["pooled_output"]
    elif args.do_model == 'mrc_relation':
        print(schema_labels)
        dataset = MRCrelationDataset(args.data_dir,
                                     schema_labels,
                                     model=args.do_model)
        reader = ClassifyReader(dataset=dataset,
                                vocab_path=module.get_vocab_path(),
                                max_seq_len=args.max_seq_len,
                                sp_model_path=module.get_spm_path(),
                                word_dict_path=module.get_word_dict_path())
        # Build the classification transfer network on top of the
        # sentence-level pooled_output of the model.
        output = outputs["pooled_output"]
    else:
        # Load the data and read it with SequenceLabelReader.
        dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
        reader = SequenceLabelReader(
            dataset=dataset,
            vocab_path=module.get_vocab_path(),
            max_seq_len=args.max_seq_len,
            sp_model_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
        # Build the sequence labeling transfer network on top of the
        # character-level sequence_output of the model.
        output = outputs["sequence_output"]

    # A disabled variant that applied dropout to sequence_output:
    # else:
    #     sequence_output = outputs["sequence_output"]
    #     sequence_output = fluid.layers.dropout(
    #         x=sequence_output,
    #         dropout_prob=args.dropout,
    #         dropout_implementation="upscale_in_train")

    # Set the feed_list of input variables for the model program.
    # They must be given in exactly this order.
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select the optimization strategy.
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings.
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the transfer task.
    if args.do_model == 'mcls' or args.do_model == 'mcls_onlysentence':
        task = MultiLabelClassifierTask(data_reader=reader,
                                        feature=output,
                                        feed_list=feed_list,
                                        num_classes=dataset.num_labels,
                                        config=config)
    elif args.do_model == 'mrc_relation':
        print(dataset.num_labels)
        task = TextClassifierTask(data_reader=reader,
                                  feature=output,
                                  feed_list=feed_list,
                                  num_classes=dataset.num_labels,
                                  config=config,
                                  metrics_choices=['acc'])
    else:
        task = SequenceLabelTask(data_reader=reader,
                                 feature=output,
                                 feed_list=feed_list,
                                 max_seq_len=args.max_seq_len,
                                 num_classes=dataset.num_labels,
                                 config=config,
                                 add_crf=args.add_crf)
    task.main_program.random_seed = args.random_seed
    add_hook(args, task, id)
    return task, reader
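# A hypothetical invocation of get_task (parse_args and load_schema_labels are
# made-up helpers standing in for the script's own setup; the id value is
# illustrative):
if __name__ == '__main__':
    args = parse_args()  # hypothetical helper building the argparse namespace
    schema_labels = load_schema_labels(args.data_dir)  # hypothetical helper
    task, reader = get_task(args, schema_labels, id=0)
    task.finetune_and_eval()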
def finetune(args):
    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    # For ernie_tiny, it performs word segmentation to obtain subwords.
    # More details: https://www.jiqizhixin.com/articles/2019-11-06-9
    if module.name == "ernie_tiny":
        tokenizer = hub.ErnieTinyTokenizer(
            vocab_file=module.get_vocab_path(),
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path(),
        )
    else:
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

    dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer,
                                       max_seq_len=args.max_seq_len)

    with fluid.dygraph.guard():
        tc = TransformerClassifier(num_classes=dataset.num_labels,
                                   transformer=module)
        adam = AdamOptimizer(learning_rate=1e-5,
                             parameter_list=tc.parameters())
        state_dict_path = os.path.join(args.checkpoint_dir,
                                       'dygraph_state_dict')
        if os.path.exists(state_dict_path + '.pdparams'):
            state_dict, _ = fluid.load_dygraph(state_dict_path)
            tc.load_dict(state_dict)

        loss_sum = acc_sum = cnt = 0
        for epoch in range(args.num_epoch):
            for batch_id, data in enumerate(
                    dataset.batch_records_generator(
                        phase="train",
                        batch_size=args.batch_size,
                        shuffle=True,
                        pad_to_batch_max_seq_len=False)):
                batch_size = len(data["input_ids"])
                input_ids = np.array(data["input_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                position_ids = np.array(data["position_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                segment_ids = np.array(data["segment_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                input_mask = np.array(data["input_mask"]).astype(
                    np.float32).reshape([batch_size, -1, 1])
                labels = np.array(data["label"]).astype(np.int64).reshape(
                    [batch_size, 1])

                pred = tc(input_ids, position_ids, segment_ids, input_mask)

                acc = fluid.layers.accuracy(pred, to_variable(labels))
                loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                adam.minimize(avg_loss)

                loss_sum += avg_loss.numpy() * labels.shape[0]
                acc_sum += acc.numpy() * labels.shape[0]
                cnt += labels.shape[0]
                if batch_id % args.log_interval == 0:
                    print('epoch {}: loss {}, acc {}'.format(
                        epoch, loss_sum / cnt, acc_sum / cnt))
                    loss_sum = acc_sum = cnt = 0

                if batch_id % args.save_interval == 0:
                    state_dict = tc.state_dict()
                    fluid.save_dygraph(state_dict, state_dict_path)
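# A minimal entry point for the classifier fine-tuning loop above; every flag
# name is inferred from the args.* accesses in finetune, and the defaults are
# illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_epoch", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--max_seq_len", type=int, default=128)
parser.add_argument("--checkpoint_dir", type=str, default="./ckpt")
parser.add_argument("--log_interval", type=int, default=10)
parser.add_argument("--save_interval", type=int, default=100)

if __name__ == '__main__':
    finetune(parser.parse_args())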
def finetune(args):
    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
    dataset = hub.dataset.MSRA_NER(tokenizer=tokenizer,
                                   max_seq_len=args.max_seq_len)

    with fluid.dygraph.guard():
        ts = TransformerSeqLabeling(num_classes=dataset.num_labels,
                                    transformer=module)
        adam = AdamOptimizer(learning_rate=1e-5,
                             parameter_list=ts.parameters())
        state_dict_path = os.path.join(args.checkpoint_dir,
                                       'dygraph_state_dict')
        if os.path.exists(state_dict_path + '.pdparams'):
            state_dict, _ = fluid.load_dygraph(state_dict_path)
            ts.load_dict(state_dict)

        loss_sum = total_infer = total_label = total_correct = cnt = 0
        for epoch in range(args.num_epoch):
            for batch_id, data in enumerate(
                    dataset.batch_records_generator(
                        phase="train",
                        batch_size=args.batch_size,
                        shuffle=True,
                        pad_to_batch_max_seq_len=False)):
                batch_size = len(data["input_ids"])
                input_ids = np.array(data["input_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                position_ids = np.array(data["position_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                segment_ids = np.array(data["segment_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                input_mask = np.array(data["input_mask"]).astype(
                    np.float32).reshape([batch_size, -1, 1])
                labels = np.array(data["label"]).astype(np.int64).reshape(
                    -1, 1)
                seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
                    -1, 1)

                pred, ret_infers = ts(input_ids, position_ids, segment_ids,
                                      input_mask)
                loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                adam.minimize(avg_loss)

                loss_sum += avg_loss.numpy() * labels.shape[0]
                label_num, infer_num, correct_num = chunk_eval(
                    labels, ret_infers.numpy(), seq_len, dataset.num_labels,
                    1)
                cnt += labels.shape[0]
                total_infer += infer_num
                total_label += label_num
                total_correct += correct_num

                if batch_id % args.log_interval == 0:
                    precision, recall, f1 = calculate_f1(
                        total_label, total_infer, total_correct)
                    print('epoch {}: loss {}, f1 {} recall {} precision {}'.
                          format(epoch, loss_sum / cnt, f1, recall,
                                 precision))
                    loss_sum = total_infer = total_label = total_correct = cnt = 0

                if batch_id % args.save_interval == 0:
                    state_dict = ts.state_dict()
                    fluid.save_dygraph(state_dict, state_dict_path)
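# The loop above relies on two helpers, chunk_eval and calculate_f1, defined
# alongside the demo. For reference, a minimal sketch of the F1 computation
# consistent with how it is called (an assumed implementation, not the
# source's):
def calculate_f1_sketch(num_label, num_infer, num_correct):
    # Precision over predicted chunks, recall over gold chunks.
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = 2 * precision * recall / (precision + recall) if num_correct else 0.0
    return precision, recall, f1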
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # matching type: whether it is pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")
    # A pointwise task needs two slots: query and title.
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=cnf.max_seq_len, num_slots=2)

    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer,
                               max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                           warmup_proportion=0.1,
                                           learning_rate=1e-5)

    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)

    # Build the transfer network from ERNIE's token-level outputs.
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']

    # Create the pointwise text matching task.
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)

    run_states = pointwise_matching_task.finetune_and_eval()

    # # Sample prediction data:
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]

    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']

    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({'id': ids[i], 'label': results[i]},
                                   index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []

    # Predict any remaining pairs that did not fill a full batch.
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True,
                   sep=',',
                   index=False)
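# A hypothetical driver for the six Souhu task variants; cnf is assumed to be
# a simple config object, and the sequence length and batch sizes are
# illustrative only.
from types import SimpleNamespace

if __name__ == '__main__':
    cnf = SimpleNamespace(max_seq_len=128,
                          train_and_eval_batch=32,
                          test_batch=32)
    for task_type in ['ssA', 'slA', 'llA', 'ssB', 'slB', 'llB']:
        main(task_type, cnf)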