def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        FLAGS.do_lower,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.do_lower,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
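
# A minimal usage sketch for the vocab file written above, assuming the
# tensor2tensor-style SubwordTextEncoder API; the vocab path is illustrative.
def _roundtrip_example(vocab_filename='my_vocab.subwords'):
  encoder = text_encoder.SubwordTextEncoder(vocab_filename)  # load from file
  ids = encoder.encode('Hello world')  # list of subtoken ids
  # SubwordTextEncoder is invertible, so decoding recovers the input exactly.
  assert encoder.decode(ids) == 'Hello world'
  return ids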
def main(unused_argv):
  if FLAGS.log_level not in ['DEBUG', 'INFO', 'ERROR']:
    raise ValueError('Set --log_level to one of "DEBUG", "INFO", "ERROR"')
  tf.logging.set_verbosity(FLAGS.log_level)

  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines,
        additional_chars=FLAGS.additional_chars)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(
      token_counts,
      FLAGS.min_count,
      FLAGS.num_iterations,
      max_subtoken_length=FLAGS.max_subtoken_length,
      backward=FLAGS.backward)
  encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
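
# Example invocation for the variant above (a sketch: the script name and
# file paths are assumptions; the flags are the ones consumed by main()):
#
#   python subword_builder.py \
#     --corpus_filepattern='data/*.txt' \
#     --corpus_max_lines=100000 \
#     --min_count=5 \
#     --num_iterations=4 \
#     --max_subtoken_length=20 \
#     --log_level=INFO \
#     --output_filename=vocab.subwords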
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  # Earlier experiments, kept for reference:
  #   encoder = text_encoder.SubwordTextEncoder()
  #   encoder.build_from_token_counts(token_counts, FLAGS.min_count,
  #                                   FLAGS.num_iterations)
  #   encoder.build_to_target_size(32000, token_counts, FLAGS.min_count,
  #                                5 * FLAGS.min_count)
  #   encoder.build_to_target_size(32000, token_counts, 2, 10000)
  # Build to a fixed 32k vocabulary by bisecting on min_count in [1, 1000].
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      32000, token_counts, 1, 1000)
  encoder.store_to_file(FLAGS.output_filename)
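
# Conceptual sketch of the search that SubwordTextEncoder.build_to_target_size
# performs above: bisect on min_count between min_val and max_val, rebuilding
# the vocab at each step and keeping the candidate whose size lands closest to
# the target. This illustrates the idea; it is not the library's exact code.
def _bisect_min_count_sketch(target_size, token_counts, min_val, max_val):
  best = None
  lo, hi = min_val, max_val
  while lo <= hi:
    mid = (lo + hi) // 2
    candidate = text_encoder.SubwordTextEncoder()
    candidate.build_from_token_counts(token_counts, mid)
    if best is None or (abs(candidate.vocab_size - target_size) <
                        abs(best.vocab_size - target_size)):
      best = candidate
    if candidate.vocab_size > target_size:
      lo = mid + 1  # vocab too big: prune rarer subtokens harder
    else:
      hi = mid - 1  # vocab too small: keep rarer subtokens
  return best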
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines,
        additional_chars=FLAGS.additional_chars,
        do_lower_case=FLAGS.do_lower_case)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines,
                                                FLAGS.do_lower_case)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  # Seed the subword vocab with an existing (e.g. BERT) vocab so its tokens
  # are preserved verbatim in the output.
  reserved_tokens = None
  if FLAGS.raw_vocab:
    with open(FLAGS.raw_vocab, 'r', encoding='utf-8') as f:
      reserved_tokens = [s.strip() for s in f if s.strip()]

  num_reserved = len(reserved_tokens) if reserved_tokens else 0
  print('token types: %d' % len(token_counts))
  print('reserved tokens: %d' % num_reserved)

  target_size = FLAGS.vocab_size
  if target_size <= num_reserved:
    raise ValueError(
        "The vocab_size must be larger than the original vocab's size.")
  if target_size >= len(token_counts):
    raise ValueError(
        'The vocab_size is too large. Please set it smaller or prepare more '
        'corpus.')

  # Heuristic bounds for the bisection over min_count.
  min_val = 1
  max_val = int(len(token_counts) // target_size**0.5)

  fd, temp_path = tempfile.mkstemp()
  os.close(fd)  # store_to_file reopens the path; close the raw descriptor.
  encoder = SubwordTextEncoder.build_to_target_size(
      target_size,
      token_counts,
      min_val,
      max_val,
      num_iterations=FLAGS.num_iterations,
      reserved_tokens=reserved_tokens,
      max_subtoken_length=FLAGS.max_subtoken_length)
  # Alternative: build once with a fixed --min_count instead of targeting a
  # size:
  #   encoder = SubwordTextEncoder()
  #   encoder.build_from_token_counts(
  #       token_counts, FLAGS.min_count, FLAGS.num_iterations,
  #       reserved_tokens=reserved_tokens,
  #       max_subtoken_length=FLAGS.max_subtoken_length)
  encoder.store_to_file(temp_path, add_single_quotes=False)
  merge_output_file_with_bert_vocab(FLAGS.output_filename, FLAGS.raw_vocab,
                                    temp_path)
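
# merge_output_file_with_bert_vocab is defined elsewhere in this repo. The
# sketch below is a hypothetical stand-in, assuming the helper keeps the
# original BERT vocab in order and appends newly learned subtokens that are
# not already present; it is an assumption, not the actual implementation.
def _merge_with_bert_vocab_sketch(output_filename, bert_vocab_path,
                                  subword_vocab_path):
  """Hypothetical stand-in for merge_output_file_with_bert_vocab."""
  with open(bert_vocab_path, 'r', encoding='utf-8') as f:
    tokens = [line.strip() for line in f if line.strip()]
  seen = set(tokens)
  with open(subword_vocab_path, 'r', encoding='utf-8') as f:
    for line in f:
      token = line.strip()
      if token and token not in seen:
        seen.add(token)
        tokens.append(token)
  with open(output_filename, 'w', encoding='utf-8') as f:
    f.write('\n'.join(tokens) + '\n')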