def get_tokenizer(FLAGS, vocab_path, **kargs):
    """Builds a BERT FullTokenizer or a char-level Jieba tokenizer from FLAGS."""
    if FLAGS.tokenizer == "bert":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=FLAGS.do_lower_case)
    elif FLAGS.tokenizer == "jieba_char":
        tokenizer = tokenization.Jieba_CHAR(config=kargs.get("config", {}))

        # Jieba_CHAR expects the vocabulary as a list of lines.
        with tf.gfile.Open(vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
            print(len(vocab_lst))

        tokenizer.load_vocab(vocab_lst)
    else:
        raise ValueError("unsupported tokenizer: %s" % FLAGS.tokenizer)

    return tokenizer
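

# A minimal usage sketch (not part of the original script): the helper name
# _example_build_tokenizer and the config={} argument are hypothetical; the
# flags are the ones used elsewhere in this file.
def _example_build_tokenizer(FLAGS):
    # Resolve the vocab path the same way the main() functions below do.
    vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    # For FLAGS.tokenizer == "bert" this returns a standard BERT FullTokenizer;
    # for "jieba_char" the vocabulary file is loaded inside get_tokenizer.
    return get_tokenizer(FLAGS, vocab_path, config={})
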
def main(_):

    tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)

    # with tf.gfile.Open(FLAGS.vocab_file, "r") as f:
    # 	vocab_lst = []
    # 	for line in f:
    # 		vocab_lst.append(line.strip())

    vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)

    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)

    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)

    print(FLAGS.with_char)
    with tf.gfile.Open(vocab_path, "r") as f:
        vocab_lst = f.read().splitlines()
        print(len(vocab_lst))

    tokenizer.load_vocab(vocab_lst)

    print("==not apply rule==")
    if FLAGS.data_type == "lcqmc":
        classifier_data_api = classifier_processor.LCQMCProcessor()
    else:
        raise ValueError("unsupported data_type: %s" % FLAGS.data_type)
    classifier_data_api.get_labels(FLAGS.label_id)

    train_examples = classifier_data_api.get_train_examples(train_file,
                                                            is_shuffle=True)

    vocab_filter.vocab_filter(train_examples, vocab_lst, tokenizer,
                              FLAGS.predefined_vocab_size, corpus_vocab_path)

    tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)

    with tf.gfile.Open(corpus_vocab_path, "r") as f:
        vocab_lst = f.read().splitlines()
        print(len(vocab_lst))

    tokenizer_corpus.load_vocab(vocab_lst)

    write_to_tfrecords.convert_distillation_classifier_examples_to_features(
        train_examples, classifier_data_api.label2id, FLAGS.max_length,
        tokenizer_corpus, train_result_file, FLAGS.with_char, FLAGS.char_len)

    test_examples = classifier_data_api.get_train_examples(test_file,
                                                           is_shuffle=False)
    write_to_tfrecords.convert_distillation_classifier_examples_to_features(
        test_examples, classifier_data_api.label2id, FLAGS.max_length,
        tokenizer_corpus, test_result_file, FLAGS.with_char, FLAGS.char_len)

    dev_examples = classifier_data_api.get_train_examples(dev_file,
                                                          is_shuffle=False)
    write_to_tfrecords.convert_distillation_classifier_examples_to_features(
        dev_examples, classifier_data_api.label2id, FLAGS.max_length,
        tokenizer_corpus, dev_result_file, FLAGS.with_char, FLAGS.char_len)
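

# Hypothetical helper (not in the original script): the vocabulary-loading block
# used repeatedly above and below could be consolidated into one function like this.
def _load_vocab_lines(path):
    # Read a vocabulary file (one token per line) from local disk or a bucket.
    with tf.gfile.Open(path, "r") as f:
        vocab_lst = f.read().splitlines()
    print(len(vocab_lst))
    return vocab_lst
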
def main(_):

    # with tf.gfile.Open(FLAGS.vocab_file, "r") as f:
    # 	vocab_lst = []
    # 	for line in f:
    # 		vocab_lst.append(line.strip())

    vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)

    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)

    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)
    unsupervised_distillation_file = os.path.join(
        FLAGS.buckets, FLAGS.unsupervised_distillation_file)
    supervised_distillation_file = os.path.join(
        FLAGS.buckets, FLAGS.supervised_distillation_file)

    if FLAGS.tokenizer_type == "jieba":
        tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)
    elif FLAGS.tokenizer_type == "full_bpe":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path,
            do_lower_case=(FLAGS.lower_case == "true"))
    else:
        raise ValueError("unsupported tokenizer_type: %s" %
                         FLAGS.tokenizer_type)

    if FLAGS.tokenizer_type == "jieba":
        print(FLAGS.with_char)
        with tf.gfile.Open(vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
            print(len(vocab_lst))

        tokenizer.load_vocab(vocab_lst)

    print("==not apply rule==")

    if FLAGS.distillation_type == "prob":
        classifier_data_api = classifier_processor.FasttextDistillationProcessor()
    elif FLAGS.distillation_type == "structure":
        classifier_data_api = classifier_processor.FasttextStructureDistillationProcessor()
    else:
        raise ValueError("unsupported distillation_type: %s" %
                         FLAGS.distillation_type)
    classifier_data_api.get_labels(FLAGS.label_id)

    train_examples = classifier_data_api.get_supervised_distillation_examples(
        train_file, supervised_distillation_file, is_shuffle=True)

    if FLAGS.tokenizer_type == "jieba":
        vocab_filter.vocab_filter(train_examples, vocab_lst, tokenizer,
                                  FLAGS.predefined_vocab_size,
                                  corpus_vocab_path)

        tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)

        with tf.gfile.Open(corpus_vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
            print(len(vocab_lst))

        tokenizer_corpus.load_vocab(vocab_lst)
    elif FLAGS.tokenizer_type == "full_bpe":
        tokenizer_corpus = tokenizer

    dev_examples = classifier_data_api.get_unsupervised_distillation_examples(
        dev_file, unsupervised_distillation_file, is_shuffle=False)

    import random
    if FLAGS.if_add_unlabeled_distillation == "yes":
        total_train_examples = train_examples + dev_examples
    else:
        total_train_examples = train_examples
    random.shuffle(total_train_examples)

    if FLAGS.tokenizer_type == "jieba":

        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            total_train_examples, classifier_data_api.label2id,
            FLAGS.max_length, tokenizer_corpus, train_result_file,
            FLAGS.with_char, FLAGS.char_len)

        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char, FLAGS.char_len)

        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)
    elif FLAGS.tokenizer_type == "full_bpe":
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            total_train_examples, classifier_data_api.label2id,
            FLAGS.max_length, tokenizer_corpus, train_result_file,
            FLAGS.with_char, FLAGS.char_len)

        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char, FLAGS.char_len)

        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)
def main(_):

    # tokenizer = tokenization.Jieba_CHAR(
    # 	config=FLAGS.config)

    # with tf.gfile.Open(FLAGS.vocab_file, "r") as f:
    # 	vocab_lst = []
    # 	for line in f:
    # 		vocab_lst.append(line.strip())

    # vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    vocab_path = FLAGS.vocab_file
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)

    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)

    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)

    if FLAGS.tokenizer_type == "jieba":
        tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)
    elif FLAGS.tokenizer_type == "full_bpe":
        tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                               do_lower_case=FLAGS.lower_case)
    else:
        raise ValueError("unsupported tokenizer_type: %s" %
                         FLAGS.tokenizer_type)

    if FLAGS.tokenizer_type == "jieba":
        print(FLAGS.with_char)
        with tf.gfile.Open(vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
            print(len(vocab_lst))

        tokenizer.load_vocab(vocab_lst)

    print("==not apply rule==")
    if FLAGS.data_type == "fasttext":
        classifier_data_api = classifier_processor.FasttextClassifierProcessor()
    else:
        raise ValueError("unsupported data_type: %s" % FLAGS.data_type)

    classifier_data_api.get_labels(FLAGS.label_id)

    train_examples = classifier_data_api.get_train_examples(train_file,
                                                            is_shuffle=True)
    print("==total train examples==", len(train_examples))

    test_examples = classifier_data_api.get_train_examples(test_file,
                                                           is_shuffle=False)
    print("==total test examples==", len(test_examples))

    dev_examples = classifier_data_api.get_train_examples(dev_file,
                                                          is_shuffle=False)
    print("==total dev examples==", len(dev_examples))

    if FLAGS.tokenizer_type == "jieba":
        vocab_filter.vocab_filter(
            train_examples + test_examples + dev_examples, vocab_lst,
            tokenizer, FLAGS.predefined_vocab_size, corpus_vocab_path)

        tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)

        with tf.gfile.Open(corpus_vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
            print(len(vocab_lst))
            # print(vocab_lst)

        tokenizer_corpus.load_vocab(vocab_lst)
    elif FLAGS.tokenizer_type == "full_bpe":
        tokenizer_corpus = tokenizer

    if FLAGS.tokenizer_type == "jieba":
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            train_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, train_result_file, FLAGS.with_char,
            FLAGS.char_len)

        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)

        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char, FLAGS.char_len)
    elif FLAGS.tokenizer_type == "full_bpe":
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            train_examples,
            classifier_data_api.label2id,
            FLAGS.max_length,
            tokenizer_corpus,
            train_result_file,
            FLAGS.with_char,
            FLAGS.char_len,
            label_type=FLAGS.label_type)

        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            dev_examples,
            classifier_data_api.label2id,
            FLAGS.max_length,
            tokenizer_corpus,
            dev_result_file,
            FLAGS.with_char,
            FLAGS.char_len,
            label_type=FLAGS.label_type)

        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            test_examples,
            classifier_data_api.label2id,
            FLAGS.max_length,
            tokenizer_corpus,
            test_result_file,
            FLAGS.with_char,
            FLAGS.char_len,
            label_type=FLAGS.label_type)
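

# Standard TF1-style entry point, assumed rather than shown in the excerpts above:
# each of these data-prep scripts would typically end like this, with the FLAGS
# used throughout (buckets, train_file, tokenizer_type, etc.) declared via
# tf.app.flags beforehand.
if __name__ == "__main__":
    tf.app.run(main=main)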