def main(unused_argv):
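    """Builds a subword vocab from a corpus or an existing token-count
    vocab file, with optional lowercasing, and writes it out."""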
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            FLAGS.do_lower,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.do_lower,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                    FLAGS.num_iterations)
    encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
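# All of these main variants read a module-level FLAGS object that the
# snippets never define. A minimal sketch of the definitions they assume,
# modeled on tensor2tensor's build_subword_text_encoder.py (defaults and
# help strings here are assumptions; fork-specific flags such as do_lower,
# additional_chars, or raw_vocab would be defined the same way):
import tensorflow as tf
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
                       'Where to write the resulting vocabulary.')
tf.flags.DEFINE_string('corpus_filepattern', '',
                       'Corpus of one or more text files.')
tf.flags.DEFINE_string('vocab_filepattern', '',
                       'One or more vocab files ("<token>,<count>" per line).')
tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus.')
tf.flags.DEFINE_integer('corpus_max_lines', 10000,
                        'How many lines of corpus to read.')
tf.flags.DEFINE_integer('num_iterations', 4, 'Number of refinement iterations.')
tf.flags.DEFINE_bool('split_on_newlines', True,
                     'Break corpus into lines before tokenizing.')
FLAGS = tf.flags.FLAGS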
def main(unused_argv):
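    """Variant that also sets logging verbosity and passes additional_chars,
    max_subtoken_length, and backward through to the build."""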
    if FLAGS.log_level not in ['DEBUG', 'INFO', 'ERROR']:
        raise ValueError('--log_level must be one of "DEBUG", "INFO", "ERROR"')
    tf.logging.set_verbosity(FLAGS.log_level)
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(
        token_counts,
        FLAGS.min_count,
        FLAGS.num_iterations,
        max_subtoken_length=FLAGS.max_subtoken_length,
        backward=FLAGS.backward)
    encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
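# A TF1-era entry point (assumed here; the snippets omit it) wires main up
# to flag parsing. A typical invocation, with hypothetical paths:
#
#   python build_subword_text_encoder.py \
#     --corpus_filepattern='/data/corpus/*.txt' \
#     --corpus_max_lines=100000 \
#     --output_filename=/tmp/my.subwords
if __name__ == '__main__':
    tf.app.run(main)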
def main(unused_argv):
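    """Variant that builds to a fixed target vocab size of 32000 via
    SubwordTextEncoder.build_to_target_size."""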
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    # Alternative builds kept for reference:
    # encoder = text_encoder.SubwordTextEncoder()
    # encoder.build_from_token_counts(token_counts, FLAGS.min_count,
    #                                 FLAGS.num_iterations)
    # encoder.build_to_target_size(32000, token_counts, FLAGS.min_count,
    #                              5 * FLAGS.min_count)
    # encoder.build_to_target_size(32000, token_counts, 2, 10000)

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        32000, token_counts, 1, 1000)

    encoder.store_to_file(FLAGS.output_filename)
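# build_to_target_size binary-searches min_count between min_val (1) and
# max_val (1000) until the learned vocab is close to the 32000 target.
# A small, hypothetical helper to sanity-check the stored vocab: the
# subword encoder guarantees that decode(encode(s)) == s.
def check_roundtrip(vocab_path, sample_text='Hello world'):
    enc = text_encoder.SubwordTextEncoder(vocab_path)
    assert enc.decode(enc.encode(sample_text)) == sample_text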
def main(unused_argv):
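    """Variant that reserves an existing (e.g. BERT) vocab, builds subwords
    to --vocab_size, and merges the two vocab files into the output."""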
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars,
            do_lower_case=FLAGS.do_lower_case)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines,
                                                    FLAGS.do_lower_case)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')
    reserved_tokens = None
    if FLAGS.raw_vocab:
        # Keep every non-empty line of the existing vocab as a reserved
        # token so that it survives the subword build unchanged.
        with open(FLAGS.raw_vocab, 'r', encoding='utf-8') as f:
            reserved_tokens = [s.strip() for s in f if s.strip()]

    print(len(token_counts))
    print(len(reserved_tokens) if reserved_tokens else 0)
    target_size = FLAGS.vocab_size
    if reserved_tokens and target_size <= len(reserved_tokens):
        raise ValueError(
            "vocab_size must be larger than the size of the original vocab.")
    if target_size >= len(token_counts):
        raise ValueError(
            "vocab_size is too large. Set it to a smaller value or prepare "
            "more corpus.")
    min_val = 1
    # Integer bounds for the min_count binary search in build_to_target_size.
    max_val = int(len(token_counts) // (target_size**0.5))
    # mkstemp returns an open OS-level handle; close it, since store_to_file
    # reopens the path itself.
    fd, temp_path = tempfile.mkstemp()
    os.close(fd)
    encoder = SubwordTextEncoder.build_to_target_size(
        target_size,
        token_counts,
        min_val,
        max_val,
        num_iterations=FLAGS.num_iterations,
        reserved_tokens=reserved_tokens,
        max_subtoken_length=FLAGS.max_subtoken_length)
    # Alternative: build with a fixed min_count instead of a target size.
    # encoder = SubwordTextEncoder()
    # encoder.build_from_token_counts(
    #     token_counts, FLAGS.min_count, FLAGS.num_iterations,
    #     reserved_tokens=reserved_tokens,
    #     max_subtoken_length=FLAGS.max_subtoken_length)
    encoder.store_to_file(temp_path, add_single_quotes=False)
    merge_output_file_with_bert_vocab(FLAGS.output_filename, FLAGS.raw_vocab,
                                      temp_path)
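# merge_output_file_with_bert_vocab is not defined in the snippet. A
# plausible sketch, assuming it writes the original BERT vocab first and
# then appends newly learned subtokens not already present (the name,
# signature, and behavior are all assumptions):
def merge_output_file_with_bert_vocab(output_filename, bert_vocab_path,
                                      subword_vocab_path):
    seen = set()
    with open(output_filename, 'w', encoding='utf-8') as out:
        for path in (bert_vocab_path, subword_vocab_path):
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    token = line.strip()
                    if token and token not in seen:
                        seen.add(token)
                        out.write(token + '\n')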