Example 1
def main(_):
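    # Segment the raw training corpus with jieba, write the space-joined tokens to a
    # temporary file, then train a SentencePiece model on that file. This snippet
    # assumes module-level imports of os, re and jieba, a `tokenization` module that
    # exposes SPM, and FLAGS defined elsewhere in the script.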

    jieba_tokenization_file = FLAGS.train_file + ".tmp"
    fwobj = open(jieba_tokenization_file, "w")
    char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
    with open(FLAGS.train_file, "r") as frobj:
        for line in frobj:
            out = []
            result = list(jieba.cut(line.strip()))
            for word in result:
                char_cn = char_pattern.findall(word)
                # Both branches of the original if/else appended the word unchanged,
                # so the word is kept whether or not it contains Chinese characters.
                out.append(word)
            fwobj.write(" ".join(out) + "\n")
    fwobj.close()

    train_config = {
        "corpus": jieba_tokenization_file,
        "model_prefix": os.path.join(FLAGS.output_folder, FLAGS.model_prefix),
        "vocab_size": FLAGS.vocab_size,
        "model_type": FLAGS.model_type,
        "character_coverage": FLAGS.character_coverage,
        "mining_sentence_size": FLAGS.mining_sentence_size,
        "input_sentence_size": FLAGS.input_sentence_size
    }

    my_spm = tokenization.SPM({})
    my_spm.train_model(train_config)
Example 2
def main(_):
	tf.logging.set_verbosity(tf.logging.INFO)
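	# Build the tokenizer selected by FLAGS.tokenizer_type and turn the input corpus
	# into masked-LM pretraining data with multi_process.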

	print(FLAGS.do_whole_word_mask, FLAGS.do_lower_case)

	if FLAGS.tokenizer_type == "spm":
		word_piece_model = os.path.join(FLAGS.buckets, FLAGS.word_piece_model)
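		# The SPM wrapper needs the vocab file, the trained sentencepiece model,
		# any extra user-defined words, and the word-to-id mapping before it can tokenize.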
		tokenizer = tokenization.SPM(config={
			"word_dict":FLAGS.vocab_file,
			"word_piece_model":word_piece_model
			})
		tokenizer.load_dict()
		tokenizer.load_model()
		tokenizer.add_extra_word()
		tokenizer.build_word_id()
	elif FLAGS.tokenizer_type == "word_piece":
		tokenizer = tokenization.FullTokenizer(
			vocab_file=FLAGS.vocab_file, 
			do_lower_case=FLAGS.do_lower_case,
			do_whole_word_mask=FLAGS.do_whole_word_mask)

	input_files = []
	for input_pattern in FLAGS.input_file.split(","):
		input_files.extend(tf.gfile.Glob(input_pattern))

	tf.logging.info("*** Reading from input files ***")
	for input_file in input_files:
		tf.logging.info("  %s", input_file)

	rng = random.Random(FLAGS.random_seed)

	output_files = FLAGS.output_file.split(",")
	tf.logging.info("*** Writing to output files ***")
	for output_file in output_files:
		tf.logging.info("  %s", output_file)

	start = time.time()
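	# Generate masked-LM pretraining instances via multi_process; note that `output_file`
	# below still holds the last value assigned by the logging loop above.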

	multi_process(
			input_files=input_files, 
			tokenizer=tokenizer,
			max_seq_length=FLAGS.max_seq_length,
			masked_lm_prob=FLAGS.masked_lm_prob, 
			max_predictions_per_seq=FLAGS.max_predictions_per_seq, 
			short_seq_prob=FLAGS.short_seq_prob,
			output_file=output_file,
			process_num=1,
			dupe_factor=FLAGS.dupe_factor,
			random_seed=1234567
		)
	print(time.time()-start, "==total time==")
Example 3
def main(_):
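    # Tokenize each non-empty line of the training file with the selected tokenizer
    # and write the space-joined tokens to the output file; blank lines are preserved.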

    input_config = {"word_piece_model": FLAGS.word_piece_model}
    if FLAGS.tokenizer_type == 'spm':
        tokenizer = tokenization.SPM(input_config)
        tokenizer.load_model()
    elif FLAGS.tokenizer_type == 'jieba':
        tokenizer = jieba
    fwobj = open(FLAGS.output_file, "w")
    with open(FLAGS.train_file, "r") as frobj:
        for line in frobj:
            content = line.strip()
            if len(content) >= 1:
                if FLAGS.tokenizer_type == 'spm':
                    token_lst = tokenizer.tokenize(content)
                else:
                    # jieba segments with cut() rather than tokenize().
                    token_lst = list(tokenizer.cut(content))
                fwobj.write(" ".join(token_lst) + "\n")
            else:
                fwobj.write("\n")
    fwobj.close()
Example 4
def main(_):
	tf.logging.set_verbosity(tf.logging.INFO)
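	# Same tokenizer setup as the previous example, but the masked-LM instances are
	# created in a single pass and then written out with write_instance_to_example_files.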

	print(FLAGS.do_whole_word_mask, FLAGS.do_lower_case)

	if FLAGS.tokenizer_type == "spm":
		word_piece_model = os.path.join(FLAGS.buckets, FLAGS.word_piece_model)
		tokenizer = tokenization.SPM(config={
			"word_dict":FLAGS.vocab_file,
			"word_piece_model":word_piece_model
			})
		tokenizer.load_dict()
		tokenizer.load_model()
		tokenizer.add_extra_word()
		tokenizer.build_word_id()
	elif FLAGS.tokenizer_type == "word_piece":
		tokenizer = tokenization.FullTokenizer(
			vocab_file=FLAGS.vocab_file, 
			do_lower_case=FLAGS.do_lower_case,
			do_whole_word_mask=FLAGS.do_whole_word_mask)

	input_files = []
	for input_pattern in FLAGS.input_file.split(","):
		input_files.extend(tf.gfile.Glob(input_pattern))

	tf.logging.info("*** Reading from input files ***")
	for input_file in input_files:
		tf.logging.info("  %s", input_file)

	rng = random.Random(FLAGS.random_seed)
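	# Duplicate, mask and pack the documents into training instances using the seeded RNG.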
	instances = create_training_instances(
			input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
			FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
			rng)

	output_files = FLAGS.output_file.split(",")
	tf.logging.info("*** Writing to output files ***")
	for output_file in output_files:
		tf.logging.info("  %s", output_file)

	write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
	                                FLAGS.max_predictions_per_seq, output_files)