def main(_):
  jieba_tokenization_file = FLAGS.train_file + ".tmp"
  fwobj = open(jieba_tokenization_file, "w")
  char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
  with open(FLAGS.train_file, "r") as frobj:
    for line in frobj:
      out = []
      result = list(jieba.cut(line.strip()))
      for word in result:
        char_cn = char_pattern.findall(word)
        # NOTE: both branches append the word unchanged; the Chinese-character
        # check is kept as a placeholder for any future filtering.
        if len(char_cn) >= 1:
          out.append(word)
        else:
          out.append(word)
      fwobj.write(" ".join(out) + "\n")
  # close so the pre-tokenized corpus is flushed before SentencePiece reads it
  fwobj.close()

  train_config = {
      "corpus": jieba_tokenization_file,
      "model_prefix": os.path.join(FLAGS.output_folder, FLAGS.model_prefix),
      "vocab_size": FLAGS.vocab_size,
      "model_type": FLAGS.model_type,
      "character_coverage": FLAGS.character_coverage,
      "mining_sentence_size": FLAGS.mining_sentence_size,
      "input_sentence_size": FLAGS.input_sentence_size
  }
  my_spm = tokenization.SPM({})
  my_spm.train_model(train_config)
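# The train_config above is presumably forwarded by SPM.train_model to the raw
# SentencePiece trainer. A minimal sketch of that call, assuming the standard
# sentencepiece Python API; the paths and values below are illustrative only.
import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    "--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size} "
    "--model_type={model_type} --character_coverage={coverage}".format(
        corpus="train.txt.tmp",      # hypothetical jieba-pretokenized corpus
        prefix="chinese_spm",        # hypothetical model prefix
        vocab_size=32000,
        model_type="unigram",
        coverage=0.9995))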
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  print(FLAGS.do_whole_word_mask, FLAGS.do_lower_case)

  if FLAGS.tokenizer_type == "spm":
    word_piece_model = os.path.join(FLAGS.buckets, FLAGS.word_piece_model)
    tokenizer = tokenization.SPM(config={
        "word_dict": FLAGS.vocab_file,
        "word_piece_model": word_piece_model
    })
    tokenizer.load_dict()
    tokenizer.load_model()
    tokenizer.add_extra_word()
    tokenizer.build_word_id()
  elif FLAGS.tokenizer_type == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        do_whole_word_mask=FLAGS.do_whole_word_mask)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  start = time.time()
  multi_process(
      input_files=input_files,
      tokenizer=tokenizer,
      max_seq_length=FLAGS.max_seq_length,
      masked_lm_prob=FLAGS.masked_lm_prob,
      max_predictions_per_seq=FLAGS.max_predictions_per_seq,
      short_seq_prob=FLAGS.short_seq_prob,
      output_file=output_file,
      process_num=1,
      dupe_factor=FLAGS.dupe_factor,
      random_seed=1234567
  )
  print(time.time() - start, "==total time==")
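# A minimal sketch of the tf.flags definitions this entry point relies on.
# The flag names come from the FLAGS references above; the defaults are the
# usual BERT pretraining-data defaults and are assumptions, not the repo's values.
import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None, "Comma-separated input file glob patterns.")
flags.DEFINE_string("output_file", None, "Comma-separated output TFRecord files.")
flags.DEFINE_string("vocab_file", None, "Vocabulary file for the tokenizer.")
flags.DEFINE_string("word_piece_model", None, "SentencePiece model path (spm mode).")
flags.DEFINE_string("buckets", "", "Base directory to join with relative paths.")
flags.DEFINE_string("tokenizer_type", "word_piece", "Either 'spm' or 'word_piece'.")
flags.DEFINE_bool("do_lower_case", True, "Whether to lower-case the input text.")
flags.DEFINE_bool("do_whole_word_mask", False, "Whether to use whole-word masking.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20, "Maximum masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
flags.DEFINE_integer("dupe_factor", 10, "Times to duplicate the corpus with different masks.")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float("short_seq_prob", 0.1, "Probability of creating shorter-than-maximum sequences.")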
def main(_):
  input_config = {"word_piece_model": FLAGS.word_piece_model}
  if FLAGS.tokenizer_type == 'spm':
    tokenizer = tokenization.SPM(input_config)
    tokenizer.load_model()
  elif FLAGS.tokenizer_type == 'jieba':
    tokenizer = jieba

  fwobj = open(FLAGS.output_file, "w")
  with open(FLAGS.train_file, "r") as frobj:
    for line in frobj:
      content = line.strip()
      if len(content) >= 1:
        if FLAGS.tokenizer_type == 'spm':
          token_lst = tokenizer.tokenize(content)
        else:
          # jieba has no tokenize() in this sense; use its cut() segmenter
          token_lst = list(jieba.cut(content))
        fwobj.write(" ".join(token_lst) + "\n")
      else:
        fwobj.write("\n")
  fwobj.close()
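# A short usage sketch for the spm path above, assuming tokenization.SPM
# exposes the load_model()/tokenize() methods used in main(); the import path
# and model path are hypothetical.
from tokenization import SPM

spm_tokenizer = SPM({"word_piece_model": "chinese_spm.model"})
spm_tokenizer.load_model()
pieces = spm_tokenizer.tokenize("这是一个测试句子")
print(" ".join(pieces))  # one whitespace-joined line per non-empty input line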
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  print(FLAGS.do_whole_word_mask, FLAGS.do_lower_case)

  if FLAGS.tokenizer_type == "spm":
    word_piece_model = os.path.join(FLAGS.buckets, FLAGS.word_piece_model)
    tokenizer = tokenization.SPM(config={
        "word_dict": FLAGS.vocab_file,
        "word_piece_model": word_piece_model
    })
    tokenizer.load_dict()
    tokenizer.load_model()
    tokenizer.add_extra_word()
    tokenizer.build_word_id()
  elif FLAGS.tokenizer_type == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        do_whole_word_mask=FLAGS.do_whole_word_mask)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, rng)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)
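# A minimal sketch of reading the written TFRecords back, assuming
# write_instance_to_example_files emits the standard BERT pretraining feature
# names; the output path and sizes below are illustrative.
import tensorflow as tf

def _parse_record(record, max_seq_length=128, max_predictions_per_seq=20):
  name_to_features = {
      "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
      "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
      "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
  }
  return tf.parse_single_example(record, name_to_features)

dataset = tf.data.TFRecordDataset("pretrain.tfrecord").map(_parse_record)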