Example #1
def process_text(self, text):
    """Normalize raw text before tokenization (method of a processor class)."""
    if self.use_spm:
        # SentencePiece path: unicode/whitespace normalization plus optional lowercasing
        return tokenization.preprocess_text(text, lower=self.do_lower_case)
    else:
        # WordPiece path: only coerce bytes/str to unicode
        return tokenization.convert_to_unicode(text)
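Since process_text is an instance method, it needs an object carrying use_spm and do_lower_case. Below is a minimal sketch of such a holder class; the class name and constructor are illustrative assumptions, and only the method body comes from the example above:

from albert import tokenization  # assumed import path

class TextProcessor(object):
    """Illustrative holder; only process_text() mirrors the snippet above."""

    def __init__(self, use_spm, do_lower_case):
        self.use_spm = use_spm              # True when a SentencePiece model is in use
        self.do_lower_case = do_lower_case

    def process_text(self, text):
        if self.use_spm:
            return tokenization.preprocess_text(text, lower=self.do_lower_case)
        return tokenization.convert_to_unicode(text)

# Hypothetical usage:
#   processor = TextProcessor(use_spm=True, do_lower_case=True)
#   clean = processor.process_text("Some raw input line")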
Example #2
# Dependencies for this snippet (assumed: it matches the ALBERT/BERT
# pretraining-data script, where flags such as `input_file_mode` and
# `spm_model_file` are defined at module level via tf.flags).
import tensorflow.compat.v1 as tf  # TF1-style API: provides tf.gfile.GFile

from albert import tokenization  # assumed import path

FLAGS = tf.flags.FLAGS


def create_training_instances(
    input_files,
    tokenizer,
    max_seq_length,
    dupe_factor,
    short_seq_prob,
    masked_lm_prob,
    max_predictions_per_seq,
    rng,
):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, FLAGS.input_file_mode) as reader:
            while True:
                line = reader.readline()
                if not FLAGS.spm_model_file:
                    line = tokenization.convert_to_unicode(line)
                if not line:
                    break  # EOF: readline() returns ""; blank lines are "\n" and stay truthy
                if FLAGS.spm_model_file:
                    # SentencePiece path: whitespace/unicode normalization and
                    # optional lowercasing before tokenization
                    line = tokenization.preprocess_text(
                        line, lower=FLAGS.do_lower_case)
                else:
                    line = line.strip()

                # Empty lines are used as document delimiters: start a new
                # document. Tokenizing an empty line yields no tokens, so
                # nothing is appended to the new document here.
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    # Each pass over the corpus yields a differently-masked copy of every
    # document, so `dupe_factor` controls how many instances each document produces.
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents,
                    document_index,
                    max_seq_length,
                    short_seq_prob,
                    masked_lm_prob,
                    max_predictions_per_seq,
                    vocab_words,
                    rng,
                ))

    rng.shuffle(instances)
    return instances
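For reference, a hedged driver sketch showing how the function is typically invoked. The paths and hyperparameter values are placeholders, and the spm_model_file argument to FullTokenizer assumes the ALBERT variant of the tokenization module (plain BERT's FullTokenizer takes only vocab_file and do_lower_case):

import random

# Placeholder setup; real scripts read these values from command-line flags.
tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt",        # placeholder path
    do_lower_case=True,
    spm_model_file="")             # empty => WordPiece branch in the loop above

rng = random.Random(12345)         # fixed seed for reproducible shuffling/masking
instances = create_training_instances(
    input_files=["corpus.txt"],    # placeholder; one sentence per line, blank line between docs
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=10,                # each document is masked 10 different ways
    short_seq_prob=0.1,            # chance of deliberately short sequences
    masked_lm_prob=0.15,           # fraction of tokens to mask
    max_predictions_per_seq=20,
    rng=rng)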