"The input data dir. Should contain the TFRecord files. " "Can be TF Dataset with prefix tfds://") flags.DEFINE_string( "output_dir", "/tmp/bigb", "The output directory where the model checkpoints will be written.") ## Other parameters flags.DEFINE_string( "init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BigBird model).") flags.DEFINE_integer( "max_encoder_length", 512, "The maximum total input sequence length after SentencePiece tokenization. " "Sequences longer than this will be truncated, and sequences shorter " "than this will be padded.") flags.DEFINE_string( "substitute_newline", None, "Replace newline charachter from text with supplied string.") flags.DEFINE_bool( "do_train", True, "Whether to run training.") flags.DEFINE_bool( "do_eval", False, "Whether to run eval on the dev set.")
"data_dir", "tfds://wiki40b/en", "The input data dir. Should contain the TFRecord files. " "Can be TF Dataset with prefix tfds://") flags.DEFINE_string( "output_dir", "/tmp/bigb", "The output directory where the model checkpoints will be written.") ## Other parameters flags.DEFINE_string( "init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BigBird model).") flags.DEFINE_integer( "max_encoder_length", 512, "The maximum total input sequence length after SentencePiece tokenization. " "Sequences longer than this will be truncated, and sequences shorter " "than this will be padded. Must match data generation.") flags.DEFINE_integer( "max_predictions_per_seq", 75, "Maximum number of masked LM predictions per sequence. " "Must match data generation.") flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") flags.DEFINE_string( "substitute_newline", " ", "Replace newline charachter from text with supplied string.") flags.DEFINE_bool("do_train", True, "Whether to run training.")
import tokenization

import numpy as np

FLAGS = flags.FLAGS

# Command-line flags for pretraining-data generation.
flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_integer("max_seq_length", 2048, "Maximum sequence length.")

flags.DEFINE_integer("max_predictions_per_seq", 75,
                     "Maximum number of masked LM predictions per sequence.")

flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")

flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")

flags.DEFINE_integer("split_output_data_len", 6, "split output data len")

# Help text is Korean — roughly "whether to generate training data".
# Kept byte-for-byte since it is a runtime (user-facing) string.
flags.DEFINE_bool("is_train", True, "training data를 생성할 것인지")


class TrainingInstance(object):
  """A single training instance (sentence pair)."""