from bigbird.core import flags
from bigbird.core import modeling
from bigbird.core import optimization
from bigbird.core import utils
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import tensorflow_text as tft
import sentencepiece as spm

FLAGS = flags.FLAGS

## Required parameters

flags.DEFINE_string(
    "data_dir", "tfds://wiki40b/en",
    "The input data dir. Should contain the TFRecord files. "
    "Can be TF Dataset with prefix tfds://")

flags.DEFINE_string(
    "output_dir", "/tmp/bigb",
    "The output directory where the model checkpoints will be written.")

## Other parameters

flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BigBird model).")

flags.DEFINE_integer(
    "max_encoder_length", 512,
    "The maximum total input sequence length after SentencePiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")
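# --- Illustrative sketch (not part of the original script) ---
# The "data_dir" flag above accepts either a directory of TFRecord shards or a
# TFDS dataset name marked with the "tfds://" prefix. The helper below is a
# minimal sketch of how that prefix could be resolved; the function name
# `load_pretraining_dataset`, the split choice, and the "*.tfrecord" glob
# pattern are assumptions, not the repository's actual loader.
def load_pretraining_dataset(data_dir, split="train"):
  if data_dir.startswith("tfds://"):
    # Strip the prefix and load the named TFDS dataset, e.g. "wiki40b/en".
    return tfds.load(data_dir[len("tfds://"):], split=split)
  # Otherwise treat data_dir as a directory of serialized TFRecord shards.
  files = tf.io.gfile.glob(data_dir + "/*.tfrecord")
  return tf.data.TFRecordDataset(files)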
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random

from absl import app
from absl import logging
from bigbird.core import flags
import numpy as np
import tensorflow as tf
import tokenization

FLAGS = flags.FLAGS

flags.DEFINE_string(
    "input_file", None,
    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string(
    "vocab_file", None,
    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_integer("max_seq_length", 2048, "Maximum sequence length.")

flags.DEFINE_integer("max_predictions_per_seq", 75,
                     "Maximum number of masked LM predictions per sequence.")

flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
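# --- Illustrative sketch (not part of the original script) ---
# "max_predictions_per_seq" caps how many masked LM positions each example
# carries. A common BERT-style rule of thumb ties the number of masked tokens
# to the sequence length and a masking probability; `masked_lm_prob` below is
# an assumption and is not one of the flags defined above.
def num_masked_positions(seq_length, masked_lm_prob=0.15,
                         max_predictions_per_seq=75):
  # Mask roughly masked_lm_prob of the tokens, never exceeding the cap the
  # TF examples are padded for.
  return min(max_predictions_per_seq,
             max(1, int(round(seq_length * masked_lm_prob))))

# With max_seq_length=2048 and 15% masking this gives min(75, 307) = 75,
# i.e. the cap dominates for long sequences.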