Example #1
    def setUp(self):
        super(PreprocessingSmithTest, self).setUp()
        doc_one_text = (
            "I am in Dominick's for my dinner. OK, no problem. I am "
            "in Dominick's for my dinner which is the best dinner I have "
            "in my whole life.")
        doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip()
        # A small WordPiece vocabulary used to build the test tokenizer.
        vocab_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in",
            "for", "my", "dinner", "ok", "no", "problem", "which", "is", "the",
            "be", "##s", "##t", ","
        ]
        # Write the vocabulary to a temporary file so FullTokenizer can load it.
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                        ]).encode("utf-8"))
            self.vocab_file = vocab_writer.name
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_file,
                                                    do_lower_case=True)
        self.vocab_words = list(self.tokenizer.vocab.keys())
        self.rng = random.Random(12345)
        # Tokenize the example document into SMITH model tokens.
        self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens(
            doc_one_text, self.tokenizer, [0, 0])
        # Preprocessing settings used by the tests; masked LM is disabled here.
        self.max_sent_length_by_word = 20
        self.max_doc_length_by_sentence = 3
        self.greedy_sentence_filling = True
        self.max_predictions_per_seq = 0
        self.masked_lm_prob = 0
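
The fixture above is only the setUp half of the test. The sketch below shows how such a fixture typically plugs into a runnable test module, assuming (as in most TensorFlow test files) that PreprocessingSmithTest subclasses tf.test.TestCase; the test bodies are omitted and this skeleton is not copied from the source file.

import tensorflow.compat.v1 as tf


class PreprocessingSmithTest(tf.test.TestCase):  # Assumed base class.

    def setUp(self):
        # The fixture shown above goes here.
        super(PreprocessingSmithTest, self).setUp()


if __name__ == "__main__":
    tf.test.main()
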
Example #2

    def tokenize_with_full_tokenizer(self):
        """Returns tokens and ids processed with FullTokenizer."""
        text = u"UNwant\u00E9d,running [unused0] [CLS] [unused55]"
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ",", "[unused0]"
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                            ]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)  # Safe: the vocabulary was loaded into memory above.
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        return tokens, ids
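
This helper demonstrates the full round trip: write a throwaway WordPiece vocabulary file, build a FullTokenizer from it, and map raw text to tokens and then to vocabulary ids. Below is a minimal standalone sketch of the same flow outside the test class; the tiny vocabulary is illustrative, and the tokenization import may need adjusting to match your checkout.

import os
import tempfile

import tokenization  # the same module used in the snippets above

# Illustrative vocabulary; written to a temp file so FullTokenizer can load it.
vocab_tokens = [
    "[UNK]", "[CLS]", "[SEP]", "un", "##want", "##ed", ",", "runn", "##ing"
]
with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
    vocab_writer.write("".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
    vocab_file = vocab_writer.name

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
os.unlink(vocab_file)  # Safe: the vocabulary was loaded into memory above.

tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)
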
Example #3
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)
    rng = random.Random(FLAGS.random_seed)
    # Creates training instances. When add_masks_lm is disabled, both the
    # masked LM probability and the per-sequence prediction budget are set to
    # zero so that no masked LM targets are generated.
    max_predictions_per_seq = (
        FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0)
    masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0
    instances, sent_token_counter = create_training_instances_wiki_doc_pair(
        input_file=FLAGS.input_file,
        tokenizer=tokenizer,
        max_sent_length_by_word=FLAGS.max_sent_length_by_word,
        max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq,
        rng=rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    # Converts training instances into TensorFlow examples and writes the results.
    write_instance_to_example_files(instances, tokenizer, output_files)

    # Finally outputs some data statistics.
    tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d",
                    sent_token_counter[0], sent_token_counter[1],
                    len(instances))
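
The files produced by write_instance_to_example_files can be spot-checked afterwards. Below is a small, generic inspection sketch; it assumes the outputs are TFRecord files of tf.train.Example protos, the usual format for BERT-style pretraining data, and the path is a placeholder.

import tensorflow.compat.v1 as tf

# Print the feature keys of the first serialized example in an output file.
for record in tf.python_io.tf_record_iterator("/tmp/smith_output.tfrecord"):  # placeholder path
    example = tf.train.Example()
    example.ParseFromString(record)
    print(sorted(example.features.feature.keys()))
    break
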