Example #1
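    # Truncated above this point: this is the file reader of a PaddleHub
    # dataset class (TNews, per the demo below). The signature is an
    # assumption based on PaddleHub's BaseNLPDataset convention; `io` and
    # `InputExample` must be imported in the full module.
    def _read_file(self, input_file, phase=None):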
        """Reads a tab separated value file."""
        with io.open(input_file, "r", encoding="UTF-8") as file:
            examples = []
            for line in file:
                data = line.strip().split("_!_")
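                # Fields: data[0] = guid, data[1] = label, data[3] = text.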
                example = InputExample(guid=data[0],
                                       label=data[1],
                                       text_a=data[3])
                examples.append(example)

            return examples


if __name__ == "__main__":
    from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
    tokenizer = BertTokenizer(vocab_file='vocab.txt')
    ds = TNews(tokenizer=tokenizer, max_seq_len=10)
    print("first 10 dev")
    for e in ds.get_dev_examples()[:10]:
        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
    print("first 10 train")
    for e in ds.get_train_examples()[:10]:
        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
    print("first 10 test")
    for e in ds.get_test_examples()[:10]:
        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
    print(ds)
    print("first 10 dev records")
    for e in ds.get_dev_records()[:10]:
        print(e)
Example #2
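        # Truncated above this point: the keyword arguments below continue a
        # call inside the DuEL dataset's __init__, which forwards the split
        # files to the BaseNLPDataset constructor. The opening sketched here
        # is an assumption (base_path and train_file="train.txt" are not
        # confirmed by the source):
        super(DuEL, self).__init__(base_path=base_path,
                                   train_file="train.txt",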
                                   dev_file="dev.txt",
                                   test_file=None,
                                   predict_file="test.txt",
                                   train_file_with_header=True,
                                   dev_file_with_header=True,
                                   predict_file_with_header=True,
                                   label_list=["0", "1"],
                                   tokenizer=tokenizer,
                                   max_seq_len=max_seq_len)


if __name__ == "__main__":
    from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
    tokenizer = BertTokenizer(
        vocab_file='/mnt/zhangxuefei/.paddlehub/modules/ernie/assets/vocab.txt',
        tokenize_chinese_chars=False)
    ds = DuEL(tokenizer=tokenizer, max_seq_len=60)
    print("first 10 train examples")
    for e in ds.get_train_examples()[:10]:
        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.text_c,
                                      e.label))
    print("first 10 train records")
    for e in ds.get_train_records()[:10]:
        print(e)
    print("first 10 test records")
    for e in ds.get_test_records()[:10]:
        print(e)
    print("first 10 predict records")
    for e in ds.get_predict_records()[:10]:
        print(e)