Example #1
from bigbird.core import flags
from bigbird.core import modeling
from bigbird.core import optimization
from bigbird.core import utils
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import tensorflow_text as tft

import sentencepiece as spm

FLAGS = flags.FLAGS

## Required parameters

flags.DEFINE_string(
    "data_dir", "tfds://wiki40b/en",
    "The input data dir. Should contain the TFRecord files. "
    "Can be TF Dataset with prefix tfds://")

flags.DEFINE_string(
    "output_dir", "/tmp/bigb",
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BigBird model).")

flags.DEFINE_integer(
    "max_encoder_length", 512,
    "The maximum total input sequence length after SentencePiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
Example #2
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import logging
from bigbird.core import flags

import collections
import random

import numpy as np
import tensorflow as tf
import tokenization

FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_integer("max_seq_length", 2048, "Maximum sequence length.")

flags.DEFINE_integer("max_predictions_per_seq", 75,
                     "Maximum number of masked LM predictions per sequence.")

flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")