Example #1
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )
    elif model_args.model_name_or_path:
        config = T5Config.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
Example #2
    def run_trt(
        self,
        metadata: NetworkMetadata,
        onnx_fpaths: Tuple[NetworkModel],
        network_input: List[str],
        working_directory: str,
        keep_trt_engine: bool,
        keep_onnx_model: bool,
        keep_torch_model: bool,
        timing_profile: TimingProfile,
    ) -> List[NetworkResult]:
        workspace = NNFolderWorkspace(
            self.frameworks_cmd.config.network_name, metadata, working_directory
        )

        results = []
        try:
            # no fpath provided for onnx files, download them
            if len(onnx_fpaths) == 0:
                onnx_fpaths = self.frameworks_cmd.generate_and_download_framework(
                    metadata, workspace
                ).onnx
            else:
                keep_onnx_model = True
                keep_torch_model = True

            # Output networks shall not exceed the number of network segments explicitly defined by the configuration file.
            assert len(onnx_fpaths) == len(
                T5ModelTRTConfig.NETWORK_SEGMENTS
            ), "There should only be {} exported ONNX segments in the T5 model.".format(
                len(T5ModelTRTConfig.NETWORK_SEGMENTS)
            )

            hash_onnx_fpath = {v.name: v for v in onnx_fpaths}

            decoder_onnx_fpath = hash_onnx_fpath[
                T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME
            ].fpath
            encoder_onnx_fpath = hash_onnx_fpath[
                T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME
            ].fpath

            self.t5_trt_encoder_engine = T5EncoderONNXFile(
                encoder_onnx_fpath, metadata
            ).as_trt_engine(encoder_onnx_fpath + ".engine")
            self.t5_trt_decoder_engine = T5DecoderONNXFile(
                decoder_onnx_fpath, metadata
            ).as_trt_engine(decoder_onnx_fpath + ".engine")
            tfm_config = T5Config(
                use_cache=metadata.other.kv_cache,
                num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
            )
            self.t5_trt_encoder = T5TRTEncoder(
                self.t5_trt_encoder_engine, metadata, tfm_config
            )
            self.t5_trt_decoder = T5TRTDecoder(
                self.t5_trt_decoder_engine, metadata, tfm_config
            )

            for ninput in network_input:
                results.append(
                    self.execute_inference(
                        metadata, hash_onnx_fpath, ninput, timing_profile
                    )
                )

        finally:
            self.cleanup(workspace, keep_trt_engine, keep_onnx_model, keep_torch_model)

        return results
Example #3
    columns = [
        'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'
    ]
    encoded.set_format(type='torch', columns=columns)

    train_dataloader = torch.utils.data.DataLoader(encoded["train"],
                                                   collate_fn=collate_fn,
                                                   batch_size=args.batch_size)
    val_dataloader = torch.utils.data.DataLoader(encoded["validation"],
                                                 collate_fn=collate_fn,
                                                 batch_size=args.batch_size *
                                                 4)

    if args.from_pretrained:
        model = T5ForConditionalGeneration.from_pretrained(args.model_select)
    else:
        config = T5Config.from_pretrained(args.model_select)
        model = T5ForConditionalGeneration(config)

    no_decay = ["bias", "LayerNorm.weight"]
    params_decay = [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    params_nodecay = [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    optim_groups = [
        {
            "params": params_decay,
            "weight_decay": 0.1
Example #4
MODEL_PATH = os.environ.get("MODEL_PATH", "/data/model.pth")
BASE_MODEL = os.environ.get("BASE_MODEL", "t5-base")
DECODING = os.environ.get("DECODING", "greedy")  # greedy, topk-N (e.g., topk-10)
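
# A minimal sketch (not in the original script) of how the DECODING string could be
# turned into generate() keyword arguments; the helper name and the topk-N parsing
# below are assumptions for illustration only.
def decoding_kwargs(decoding: str) -> dict:
    if decoding.startswith("topk-"):
        # e.g. "topk-10": sample from the 10 most likely tokens at each step
        return {"do_sample": True, "top_k": int(decoding.split("-", 1)[1])}
    return {"do_sample": False}  # greedy decoding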

cuda = torch.cuda.is_available()
if cuda:
    torch.cuda.set_device(0)  # single GPU
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

logger.info(f"question generation is set to run on {device}")

# init model
logger.info("question generation model is preparing...")
config = T5Config.from_pretrained(BASE_MODEL)
model = T5ForConditionalGeneration(config=config)
t = QGTokenizer(tokenizer=BASE_MODEL)
checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()
if cuda:
    model.cuda()

logger.info(f"question generation model is ready")

app = Flask(__name__)


@app.route("/question", methods=["POST"])
def respond():
Example #5
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_t5 import load_tf_weights_in_t5
from flask import Flask, request, jsonify

app = Flask(__name__)

base_model = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

load_tf_weights_in_t5(model, None, "/data/")
model.eval()

ret_dict = {
    'low air quality': 'LowAirQuality',
    'low humidity': 'LowHumidity',
    'low brightness': 'LowBrightness',
    'low noise level': 'LowNoise',
    'low security': 'LowSecurity',
    'low temperature': 'LowTemperature',
    'high air quality': 'HighAirQuality',
    'high humidity': 'HighHumidity',
    'high brightness': 'HighBrightness',
    'high noise level': 'HighNoise',
    'high security': 'HighSecurity',
    'high temperature': 'HighTemperature'
}


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
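    # Hedged sketch of how this truncated function likely continues: generate text,
    # decode it, and map it through ret_dict. The generation settings and the use of
    # ret_dict.get() are assumptions for illustration, not the original author's code.
    res = model.generate(input_ids, **generator_args)
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    return ret_dict.get(output, output)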
Example #6
class Funnel_T5_VAE_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of :class:`~transformer_vae.T5_VAE_Model`.
    It is used to instantiate a Funnel-T5-VAE model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the `funnel-t5-vae-base` architecture.

    To be able to use `transformers.trainer.Trainer` we need some specific training logic & config in the model.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        latent_size (:obj:`int`, `optional`, defaults to 1,000):
            Number of dimensions to use for the sequence's latent code.
        funnel_name (:obj:`str`, `optional`, defaults to funnel-transformer/intermediate):
            Name of the Funnel transformer model to use as the encoder.
        vae_encoder_model (:obj:`str`, `optional`, defaults to None):
            Name of the model to encode T5 hidden states into latent codes.
        vae_decoder_model (:obj:`str`, `optional`, defaults to None):
            Name of the model to decode latent codes into T5 hidden states.
        set_seq_size (:obj:`int`, `optional`, defaults to 60):
            NOTE: Every input sequence must be padded to be equal to this length.
        t5_name (:obj:`str`, `optional`, defaults to t5-base):
            Name of the Transformer model to use as a decoder.
        critic_name (:obj:`str`, `optional`, defaults to None):
            Name of the Transformer model to use as an adviser on interpolations.
        *** Training Args ***
        reg_schedule_k (:obj:`float`, `optional`, defaults to 0.0025):
            Multiplied by global_step in a sigmoid to more gradually increase the regulariser loss weight.
        reg_schedule_b (:obj:`float`, `optional`, defaults to 6.25):
            Added to global_step in the sigmoid to further delay the increase in regulariser loss weight.
        use_extra_logs (:obj:`bool`, `optional`, defaults to False):
            Store extra logs during each training inference.
        gradient_checkpoint (:obj:`bool`, `optional`, defaults to False):
            Checkpoint gradients in the model.
            Currently just checkpoints after the encoder + VAE
        funnel_block_sizes (:obj:`str`, `optional`, defaults to ''):
            Size of each Funnel encoder block; the sequence length is halved between blocks.
            Example specification: 1_1_1
        *** End ***

        TODO: Add extra models to condition on the latent
    """
    model_type = "transformer_vae"
    is_composition = True

    def __init__(
        self,
        latent_size=1_000,
        funnel_name="funnel-transformer/intermediate",
        t5_name="t5-base",
        vae_encoder_model='',
        vae_decoder_model='',
        critic_type='',
        critic_name='',
        set_seq_size=60,
        decoder_start_token_id=0,
        dont_use_reg_loss=False,
        reg_schedule_k=0.0025,
        reg_schedule_b=6.25,
        use_extra_logs=False,
        cache_dir=None,
        n_latent_tokens=5,  # set to -1 for full sequence
        funnel_block_sizes='',
        num_decoder_layers=0,
        num_decoder_heads=0,
        attention_window_size=0,
        attention_window_overlap=0,
        gradient_checkpoint_encoder=False,
        decoder_grad_chk_pnt_rate=0,
        skip_upsample=False,
        **kwargs,
    ):
        assertIn(vae_encoder_model, VAE_ENCODER_MODELS.keys(),
                 "Unexpected VAE encoder.")
        assertIn(vae_decoder_model, VAE_DECODER_MODELS.keys(),
                 "Unexpected VAE decoder.")

        super().__init__(**kwargs)

        self.set_seq_size = set_seq_size

        # VAE
        self.vae_encoder_model = vae_encoder_model
        self.vae_decoder_model = vae_decoder_model
        if set_seq_size < n_latent_tokens:
            logger.warning(
                f'set_seq_size is smaller than n_latent_tokens; reducing n_latent_tokens from {n_latent_tokens} to {set_seq_size}'
            )
            n_latent_tokens = set_seq_size
        self.latent_size = latent_size
        self.n_latent_tokens = n_latent_tokens
        self.skip_upsample = skip_upsample

        # funnel encoder model
        if 'funnel' not in kwargs:
            self.funnel = AutoConfig.from_pretrained(funnel_name,
                                                     cache_dir=cache_dir)
            if funnel_block_sizes:
                self.funnel.block_sizes = [
                    int(i) for i in funnel_block_sizes.split('_')
                ]
            self.funnel.decoder_start_token_id = decoder_start_token_id
            self.funnel.n_positions = set_seq_size
        else:
            self.funnel = FunnelConfig(**kwargs.pop('funnel'))
        pooling_division = 2**(len(self.funnel.block_sizes) - 1)
        self.encoded_seq_size = math.ceil(self.funnel.n_positions /
                                          pooling_division)
        self.gradient_checkpoint_encoder = gradient_checkpoint_encoder

        # T5 decoder model
        if 't5' not in kwargs:
            self.t5 = AutoConfig.from_pretrained(t5_name, cache_dir=cache_dir)
            if num_decoder_layers:
                self.t5.num_layers = num_decoder_layers
            if num_decoder_heads:
                self.t5.num_heads = num_decoder_heads
            self.t5.decoder_start_token_id = decoder_start_token_id
            self.t5.n_positions = self.funnel.n_positions
            assertEqual(self.t5.model_type, "t5",
                        "Need t5 model type for transformer_decoder.")
        else:
            self.t5 = T5Config(**kwargs.pop('t5'))
        assertEqual(self.funnel.d_model, self.t5.d_model,
                    "Funnel & T5 transformers have different dimensions.")
        self.decoder_grad_chk_pnt_rate = decoder_grad_chk_pnt_rate
        assert (attention_window_size < set_seq_size
                ), 'Attention window must be smaller than the set sequence size.'
        self.attention_window_size = attention_window_size
        self.attention_window_overlap = attention_window_overlap
        if attention_window_size:
            assert (
                set_seq_size % attention_window_size != 0
            ), 'When using an alternating attention pattern the sequence size cannot be divisible by the window size, as no alternations would be possible.'
            self.attention_window_overlap = set_seq_size % attention_window_size

        # extra training losses
        self.use_reg_loss = not dont_use_reg_loss
        if dont_use_reg_loss:
            logger.warning(
                "Regularisation loss is turned off, you are training an Autoencoder (not a VAE)."
            )
        self.reg_schedule_k = reg_schedule_k
        self.reg_schedule_b = reg_schedule_b
        self.use_extra_logs = use_extra_logs

        # critic model
        self.critic = None
        if critic_name:
            self.critic_type = critic_type
            if 'critic' not in kwargs:
                self.critic = AutoConfig.from_pretrained(critic_name,
                                                         cache_dir=cache_dir)
            else:
                self.critic = FunnelConfig(**kwargs.pop('critic'))
            assertEqual(self.t5.d_model, self.critic.d_model,
                        "Funnel & T5 transformers have different dimensions.")

        # misc
        self.use_cache = getattr(self.funnel, "use_cache", False)
Example #7
            "WARNING: e2e is meant to generate questions by context. The ouput of the script will be a csv instead of a json."
        )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    model_created = False
    print("Loading model and tokenizer...", end="", flush=True)
    if args.checkpoint is not None:
        model_created = True
        if args.bart:
            config = BartConfig.from_json_file(args.checkpoint +
                                               "/config.json")
            model = BartForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        elif args.t5:
            config = T5Config.from_json_file(args.checkpoint + "/config.json")
            model = T5ForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        elif not args.bart and not args.t5:
            config = EncoderDecoderConfig.from_json_file(args.checkpoint +
                                                         "/config.json")
            model = EncoderDecoderModel.from_pretrained(args.checkpoint +
                                                        "/pytorch_model.bin",
                                                        config=config)
        model_name = args.checkpoint

    if args.bart:
        if args.checkpoint is None:
            model_name = "WikinewsSum/bart-large-multi-fr-wiki-news" if args.model_name == "" else args.model_name
        tokenizer = BartTokenizer.from_pretrained(
            args.tokenizer
Example #8
# This is a very small notebook showing how to grab a pre-trained T5 model, fine-tune it, and export it to ONNX.
# A lot of this is inspired by huggingface.

from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, AdamW
import torch
from onnxt5 import generate_onnx_representation, GenerativeT5
from onnxt5.api import get_sess
import tempfile

temp_dir = tempfile.gettempdir()

base_model = "t5-base"

# Setting up the model and tokenizer
config = T5Config.from_pretrained(base_model)
config.n_positions = 256  # You can change the properties of your model here
model = T5ForConditionalGeneration(config=config)

# Download vocab file
tokenizer = T5Tokenizer(config=config, vocab_file="test_sentencepiece.model")
model.train()

# Let's setup our optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay':
Example #9
def convert_model(args):
    if os.path.exists(args.decoder_onnx):
        print(f"skip convert_to_onnx since path existed: {args.decoder_onnx}")
    else:
        assert args.model_type == "gpt2", "please provide an ONNX model for model types other than gpt2"
        gpt2_to_onnx(args)

    # TODO: fix shape inference for T5. Currently symbolic shape inference on T5 is broken.
    enable_shape_inference = args.model_type == "gpt2"

    if enable_shape_inference:
        print(f"Run symbolic shape inference on {args.decoder_onnx}. The file will be overwritten.")
        shape_inference(args.decoder_onnx)

    global config
    if args.model_type == "gpt2":
        config = GPT2Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        config = T5Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    print(config)

    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    # if vocab_size is given in parameters use that.
    if args.vocab_size != -1:
        vocab_size = args.vocab_size

    model = onnx.load(args.decoder_onnx)
    model.graph.name = f"{args.model_type} decoder subgraph"

    if args.model_type == "gpt2":
        verify_gpt2_subgraph(model.graph, args.precision)
    else:
        verify_t5_decoder_subgraph(model.graph, args.precision)

    inputs = [
        "input_ids",
        "max_length",
        "min_length",
        "num_beams",
        "num_return_sequences",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "vocab_mask",
    ]
    if args.prefix_vocab_mask:
        inputs.append("prefix_vocab_mask")

    outputs = ["sequences"]
    if args.output_sequences_scores:
        outputs.append("sequences_scores")

    if args.output_token_scores:
        assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
        outputs.append("scores")

    node = helper.make_node(
        "BeamSearch",
        inputs=inputs,
        outputs=outputs,
        name=f"BeamSearch_{args.model_type}",
    )
    node.domain = "com.microsoft"
    node.attribute.extend(
        [
            helper.make_attribute("eos_token_id", eos_token_id),
            helper.make_attribute("pad_token_id", pad_token_id),
            helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
            helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
            helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
            helper.make_attribute("decoder", model.graph),
        ]
    )

    if args.model_type == "t5":
        if enable_shape_inference:
            print(f"Run symbolic shape inference on {args.encoder_decoder_init_onnx}. The file will be overwritten.")
            shape_inference(args.encoder_decoder_init_onnx)
        init_model = onnx.load(args.encoder_decoder_init_onnx)
        init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
        verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
        node.attribute.extend(
            [
                helper.make_attribute("encoder_decoder_init", init_model.graph),
            ]
        )

    from onnx import TensorProto

    # graph inputs
    input_ids = helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"])
    max_length = helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
    min_length = helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
    num_beams = helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
    num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])
    temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1])
    length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])
    repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
    vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size])

    graph_inputs = [
        input_ids,
        max_length,
        min_length,
        num_beams,
        num_return_sequences,
        temperature,
        length_penalty,
        repetition_penalty,
        vocab_mask,
    ]

    if args.prefix_vocab_mask:
        prefix_vocab_mask = helper.make_tensor_value_info(
            "prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size]
        )
        graph_inputs.append(prefix_vocab_mask)

    # graph outputs
    sequences = helper.make_tensor_value_info(
        "sequences",
        TensorProto.INT32,
        ["batch_size", "num_return_sequences", "max_length"],
    )

    sequences_scores = helper.make_tensor_value_info(
        "sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
    )

    scores = helper.make_tensor_value_info(
        "scores",
        TensorProto.FLOAT,
        ["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
    )

    initializers = []

    graph_outputs = [sequences]

    if args.output_sequences_scores:
        graph_outputs.append(sequences_scores)

    if args.output_token_scores:
        graph_outputs.append(scores)

    new_graph = helper.make_graph(
        [node],
        f"{args.model_type}-beam-search",
        graph_inputs,
        graph_outputs,
        initializers,
    )

    # Create the model
    new_model = helper.make_model(
        new_graph,
        producer_name="onnxruntime.transformers",
        opset_imports=model.opset_import,
    )
    onnx.save(new_model, args.output)
Example #10
from tqdm.notebook import tqdm
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration

print("Downloading and unzipping model.", file=sys.stderr)
os.system(
    "wget -nc https://storage.googleapis.com/doctttttquery_git/t5-base.zip")
os.system("unzip -o t5-base.zip")

nltk.download('punkt')

# Define the target device. Use GPU if available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate and load the QG model to the GPU.
qg_tokenizer = T5Tokenizer.from_pretrained('t5-base')
qg_config = T5Config.from_pretrained('t5-base')
qg_model = T5ForConditionalGeneration.from_pretrained('model.ckpt-1004000',
                                                      from_tf=True,
                                                      config=qg_config)

qg_model.to(device)


def preprocess(document: str, span=10, stride=5) -> List[str]:
    """
    Define your preprocessing function.

    This function should take a corpus document and output a list of generation
    spans. This is required so we can match the expected sequence size of the
    generation model.
    """
Example #11
        print("starting to train")
        # train
        word_tokens_train, pos_tokens_train = tasks.pos('UD_English-EWT/en_ewt-ud-train.conllu')
        tokenizer = T5Tokenizer.from_pretrained("t5-small")

        ## i want to append pos: - do I include the pos token associated with it?
        if args.control:
            word_tokens_train, pos_tokens_train = tasks.make_control(tokenizer, word_tokens_train, pos_tokens_train, args.embsize)

        torch_ids_train, torch_masks_train, torch_token_starts, torch_labels_train = r.prepare_data(tokenizer, word_tokens_train, pos_tokens_train)

        # data for training
        split = int(0.75 * len(torch_ids_train))
        #dataset_train = Dataset(torch_ids_train[:split], torch_masks_train[:split], torch_labels_train[:split])
        #dataset_dev = Dataset(torch_ids_train[split:], torch_masks_train[split:], torch_labels_train[split:])
        config = T5Config.from_pretrained("t5-small", output_hidden_states=True, output_attentions=True)
        model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
        model.to(device)
        #train(model, dataset_train, dataset_dev, torch_token_starts[split:], tokenizer)

        # small-subset test: 200 training / 200 dev examples
        dataset_train = Dataset(torch_ids_train[:200], torch_masks_train[:200], torch_labels_train[:200])
        dataset_dev = Dataset(torch_ids_train[200:400], torch_masks_train[200:400], torch_labels_train[200:400])

        train(model, dataset_train, dataset_dev, torch_token_starts[200:400], tokenizer)

        print("done!")

    else:
        print("starting to evaluate")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
Example #12
    def __init__(
        self,
        model_name,
        args=None,
        tokenizer=None,
        use_cuda=False,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, T5Args):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        self.device = "cpu"

        self.results = {}

        if model_name is None:
            self.config = self.args.config
            self.model = T5ForConditionalGeneration(config=self.config)
        else:
            self.config = T5Config.from_pretrained(model_name,
                                                   **self.args.config)
            self.model = T5ForConditionalGeneration.from_pretrained(
                model_name, config=self.config)

        if isinstance(tokenizer, T5Tokenizer):
            self.tokenizer = tokenizer
        else:
            self.tokenizer = T5Tokenizer.from_pretrained(model_name,
                                                         truncate=True)
        self.model.resize_token_embeddings(len(self.tokenizer))

        if self.args.dynamic_quantize:
            self.model = torch.quantization.quantize_dynamic(self.model,
                                                             {torch.nn.Linear},
                                                             dtype=torch.qint8)

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_type = "T5"
        if model_name is None:
            self.args.model_name = "T5_from_scratch"
        else:
            self.args.model_name = model_name

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None
Example #13
    def __init__(self, tokenizer):
        super(T5Model, self).__init__()
        self.tokenizer = tokenizer
        config = T5Config.from_pretrained('t5-small')
        self.model = T5ForConditionalGeneration(config=config)
Example #14
    def _build_vocab(self, max_vocab_cnt):
        # build vocab
        if self.tokenizer_type.startswith('word'):
            self._build_vocab_manual(max_vocab_cnt)
        elif self.tokenizer_type.startswith('bert-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 30522  # fixed for pretrained BERT vocab (old version)
            config_pretrained = BertConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

        elif self.tokenizer_type.startswith('xlnet-'):
            # self.vocab = self.tokenizer.vocab
            # self.rev_vocab = self.tokenizer.ids_to_tokens
            # self.pad_id = self.vocab["[PAD]"]
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000  # fixed for pretrained BERT vocab
            config_pretrained = XLNetConfig.from_pretrained(
                self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('t5-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000
            config_pretrained = T5Config.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}
            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('bart-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000  # fixed for pretrained BERT vocab
            config_pretrained = BartConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

        return
Example #15
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_t5_mlm",
                           model_args,
                           data_args,
                           framework="flax")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        level=logging.INFO,
        datefmt="[%X]",
    )

    # Log on each process the small summary:
    logger = logging.getLogger(__name__)

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Handle the repository creation
    if training_args.push_to_hub:
        if training_args.hub_model_id is None:
            repo_name = get_full_repo_name(Path(
                training_args.output_dir).absolute().name,
                                           token=training_args.hub_token)
        else:
            repo_name = training_args.hub_model_id
        repo = Repository(training_args.output_dir, clone_from=repo_name)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(
            model_args.config_name,
            cache_dir=model_args.cache_dir,
            vocab_size=len(tokenizer),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        config = T5Config.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name],
                         return_attention_mask=False)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
        inputs_length=max_seq_length,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
    )

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: list(chain(*examples[k]))
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= expanded_inputs_length:
            total_length = (total_length //
                            expanded_inputs_length) * expanded_inputs_length
        # Split by chunks of max_len.
        result = {
            k: [
                t[i:i + expanded_inputs_length]
                for i in range(0, total_length, expanded_inputs_length)
            ]
            for k, t in concatenated_examples.items()
        }
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Enable tensorboard only on the master node
    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
        try:
            from flax.metrics.tensorboard import SummaryWriter

            summary_writer = SummaryWriter(
                log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
            logger.warning(
                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
            )
    else:
        logger.warning(
            "Unable to display metrics through TensorBoard because the package is not installed: "
            "Please run pip install tensorboard to enable.")

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())

    if model_args.model_name_or_path:
        model = FlaxT5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config.vocab_size = len(tokenizer)
        model = FlaxT5ForConditionalGeneration(
            config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
            use_auth_token=True if model_args.use_auth_token else None,
        )

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = FlaxDataCollatorForT5MLM(
        tokenizer=tokenizer,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
        input_length=max_seq_length,
        target_length=targets_length,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(
        training_args.per_device_train_batch_size) * jax.device_count()
    eval_batch_size = int(
        training_args.per_device_eval_batch_size) * jax.device_count()

    num_train_steps = len(
        tokenized_datasets["train"]) // train_batch_size * num_epochs

    num_of_hosts = jax.process_count()
    current_host_idx = jax.process_index()

    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
        init_value=0.0,
        end_value=training_args.learning_rate,
        transition_steps=training_args.warmup_steps)
    decay_fn = optax.linear_schedule(
        init_value=training_args.learning_rate,
        end_value=0,
        transition_steps=num_train_steps - training_args.warmup_steps,
    )
    linear_decay_lr_schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn],
        boundaries=[training_args.warmup_steps])

    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        flat_mask = {
            path: (path[-1] != "bias"
                   and path[-2:] not in [("layer_norm", "scale"),
                                         ("final_layer_norm", "scale")])
            for path in flat_params
        }
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
    if training_args.adafactor:
        # We use the default parameters here to initialize adafactor,
        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
        optimizer = optax.adafactor(
            learning_rate=linear_decay_lr_schedule_fn, )
    else:
        optimizer = optax.adamw(
            learning_rate=linear_decay_lr_schedule_fn,
            b1=training_args.adam_beta1,
            b2=training_args.adam_beta2,
            weight_decay=training_args.weight_decay,
            mask=decay_mask_fn,
        )

    # Setup train state
    state = train_state.TrainState.create(apply_fn=model.__call__,
                                          params=model.params,
                                          tx=optimizer)

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)

        def loss_fn(params):
            labels = batch.pop("labels")

            logits = state.apply_fn(**batch,
                                    params=params,
                                    dropout_rng=dropout_rng,
                                    train=True)[0]

            # compute loss
            loss = optax.softmax_cross_entropy(
                logits, onehot(labels, logits.shape[-1])).mean()

            return loss

        grad_fn = jax.value_and_grad(loss_fn)
        loss, grad = grad_fn(state.params)
        grad = jax.lax.pmean(grad, "batch")
        new_state = state.apply_gradients(grads=grad)

        metrics = jax.lax.pmean(
            {
                "loss": loss,
                "learning_rate": linear_decay_lr_schedule_fn(state.step)
            },
            axis_name="batch")

        return new_state, metrics, new_dropout_rng

    # Create parallel version of the train step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, ))

    # Define eval fn
    def eval_step(params, batch):
        labels = batch.pop("labels")

        logits = model(**batch, params=params, train=False)[0]

        # compute loss
        loss = optax.softmax_cross_entropy(logits,
                                           onehot(labels, logits.shape[-1]))

        # compute accuracy
        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels)

        # summarize metrics
        metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()}
        metrics = jax.lax.pmean(metrics, axis_name="batch")

        return metrics

    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0, ))

    # Replicate the train state on each device
    state = jax_utils.replicate(state)

    train_time = 0
    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
        train_metrics = []

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)

        # Generate an epoch by shuffling sampling indices from the train dataset
        num_train_samples = len(tokenized_datasets["train"])
        train_samples_idx = np.random.permutation(np.arange(num_train_samples))
        train_batch_idx = generate_batch_splits(train_samples_idx,
                                                train_batch_size)

        # Gather the indexes for creating the batch and do a training step
        for step, batch_idx in enumerate(
                tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [
                tokenized_datasets["train"][int(idx)] for idx in batch_idx
            ]
            model_inputs = data_collator(samples)

            local_host_model_inputs = {
                key: np.split(model_inputs.data[key], num_of_hosts,
                              axis=0)[current_host_idx]
                for key, value in model_inputs.data.items()
            }

            # Model forward
            model_inputs = shard(local_host_model_inputs)
            state, train_metric, dropout_rngs = p_train_step(
                state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

            cur_step = epoch * (num_train_samples // train_batch_size) + step

            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics,
                                       train_time, cur_step)

                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate:"
                    f" {train_metric['learning_rate'].mean()})")

                train_metrics = []

            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
                # ======================== Evaluating ==============================
                num_eval_samples = len(tokenized_datasets["validation"])
                eval_samples_idx = jnp.arange(num_eval_samples)
                eval_batch_idx = generate_batch_splits(eval_samples_idx,
                                                       eval_batch_size)

                eval_metrics = []
                for i, batch_idx in enumerate(
                        tqdm(eval_batch_idx, desc="Evaluating ...",
                             position=2)):
                    samples = [
                        tokenized_datasets["validation"][int(idx)]
                        for idx in batch_idx
                    ]
                    model_inputs = data_collator(samples)

                    # Model forward
                    model_inputs = shard(model_inputs.data)
                    metrics = p_eval_step(state.params, model_inputs)
                    eval_metrics.append(metrics)

                # get eval metrics
                eval_metrics = get_metrics(eval_metrics)
                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

                # Update progress bar
                epochs.write(
                    f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
                )

                # Save metrics
                if has_tensorboard and jax.process_index() == 0:
                    write_eval_metric(summary_writer, eval_metrics, cur_step)

            if cur_step % training_args.save_steps == 0 and cur_step > 0:
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(
                        jax.tree_map(lambda x: x[0], state.params))
                    model.save_pretrained(training_args.output_dir,
                                          params=params)
                    tokenizer.save_pretrained(training_args.output_dir)
                    if training_args.push_to_hub:
                        repo.push_to_hub(
                            commit_message=
                            f"Saving weights and logs of step {cur_step}",
                            blocking=False)

    # Eval after training
    if training_args.do_eval:
        num_eval_samples = len(tokenized_datasets["validation"])
        eval_samples_idx = jnp.arange(num_eval_samples)
        eval_batch_idx = generate_batch_splits(eval_samples_idx,
                                               eval_batch_size)

        eval_metrics = []
        for i, batch_idx in enumerate(
                tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
            samples = [
                tokenized_datasets["validation"][int(idx)] for idx in batch_idx
            ]
            model_inputs = data_collator(samples)

            # Model forward
            model_inputs = shard(model_inputs.data)
            metrics = p_eval_step(state.params, model_inputs)
            eval_metrics.append(metrics)

        # get eval metrics
        eval_metrics = get_metrics(eval_metrics)
        eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(),
                                    eval_metrics)

        if jax.process_index() == 0:
            eval_metrics = {
                f"eval_{metric_name}": value
                for metric_name, value in eval_metrics.items()
            }
            path = os.path.join(training_args.output_dir, "eval_results.json")
            with open(path, "w") as f:
                json.dump(eval_metrics, f, indent=4, sort_keys=True)
Example #16
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        if args and "manual_seed" in args:
            random.seed(args["manual_seed"])
            np.random.seed(args["manual_seed"])
            torch.manual_seed(args["manual_seed"])
            if "n_gpu" in args and args["n_gpu"] > 0:
                torch.cuda.manual_seed_all(args["manual_seed"])

        self.args = {
            "dataset_class": None,
            "do_sample": False,
            "early_stopping": True,
            "evaluate_generated_text": False,
            "length_penalty": 2.0,
            "max_length": 20,
            "max_steps": -1,
            "num_beams": 1,
            "num_return_sequences": 1,
            "preprocess_inputs": True,
            "repetition_penalty": 1.0,
            "top_k": None,
            "top_p": None,
            "use_multiprocessed_decoding": True,
        }

        self.args.update(global_args)

        saved_model_args = self._load_model_args(model_name)
        if saved_model_args:
            self.args.update(saved_model_args)

        if args:
            self.args.update(args)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name,
                                               **self.args["config"])

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        if not use_cuda:
            self.args["fp16"] = False

        self.args["model_name"] = model_name

        if self.args["wandb_project"] and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args["wandb_project"] = None
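
For context, a minimal usage sketch of the wrapper above; the model name, args values, and the predict call are illustrative and assume the class exposes a simpletransformers-style predict method.

# Hypothetical usage of the T5Model wrapper defined above.
model = T5Model(
    "t5-base",
    args={"max_length": 64, "num_beams": 4},
    use_cuda=False,  # force CPU so the sketch runs without a GPU
)
# predict() is assumed to accept task-prefixed input strings.
preds = model.predict(["translate English to German: The table is round."])
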
Example #17
0
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, T5Args):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = {
                key: value["value"]
                for key, value in sweep_config.as_dict().items()
                if key != "_wandb"
            }
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name, **self.args.config)

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_type = "T5"
        self.args.model_name = model_name

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None
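
The key difference from the previous example is that args may now be a T5Args dataclass rather than a plain dict. A hedged sketch follows; the T5Args field names are assumed to mirror the dict keys used above.

# Hypothetical: configure the wrapper via a T5Args instance instead of a dict.
model_args = T5Args()
model_args.max_length = 64
model_args.num_beams = 4
model_args.fp16 = False

model = T5Model("t5-base", args=model_args, use_cuda=False)
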
Example #18
0
def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name,
                                   flax_dump_folder_path):
    config = T5Config.from_pretrained(config_name)
    flax_model = FlaxT5ForConditionalGeneration(config=config)
    t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)

    split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["layers_0"]["mlp"]

    # Encoder
    for layer_index in range(config.num_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["encoder"][layer_name][
            "attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["encoder"][layer_name][
            "attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["encoder"][layer_name][
            "attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["encoder"][layer_name][
            "attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_attention_layer_norm = t5x_model["target"]["encoder"][layer_name][
            "pre_attention_layer_norm"]["scale"]

        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["encoder"][layer_name]["mlp"]["wo"][
            "kernel"]

        # Layer Normalization
        t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name][
            "pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "layer_norm"]["weight"] = t5x_attention_layer_norm

        if split_mlp_wi:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"][
            "DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"][
            "layer_norm"]["weight"] = t5x_mlp_layer_norm

    # Only for layer 0:
    t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"][
        "rel_embedding"].T
    flax_model.params["encoder"]["block"]["0"]["layer"]["0"]["SelfAttention"][
        "relative_attention_bias"]["embedding"] = t5x_encoder_rel_embedding

    # Assigning
    t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"]
    flax_model.params["encoder"]["final_layer_norm"][
        "weight"] = t5x_encoder_norm

    # Decoder
    for layer_index in range(config.num_decoder_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"][
            layer_name]["pre_self_attention_layer_norm"]["scale"]

        # Encoder-Decoder-Attention
        t5x_enc_dec_attention_key = t5x_model["target"]["decoder"][layer_name][
            "encoder_decoder_attention"]["key"]["kernel"]
        t5x_enc_dec_attention_out = t5x_model["target"]["decoder"][layer_name][
            "encoder_decoder_attention"]["out"]["kernel"]
        t5x_enc_dec_attention_query = t5x_model["target"]["decoder"][
            layer_name]["encoder_decoder_attention"]["query"]["kernel"]
        t5x_enc_dec_attention_value = t5x_model["target"]["decoder"][
            layer_name]["encoder_decoder_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_cross_layer_norm = t5x_model["target"]["decoder"][layer_name][
            "pre_cross_attention_layer_norm"]["scale"]

        # MLP
        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["decoder"][layer_name]["mlp"]["wo"][
            "kernel"]

        # Layer Normalization
        t5x_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name][
            "pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "layer_norm"]["weight"] = t5x_pre_attention_layer_norm

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["k"]["kernel"] = t5x_enc_dec_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "layer_norm"]["weight"] = t5x_cross_layer_norm

        if split_mlp_wi:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"][
            "DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"][
            "layer_norm"]["weight"] = t5x_mlp_layer_norm

    # Decoder Normalization
    t5x_decoder_norm = t5x_model["target"]["decoder"]["decoder_norm"]["scale"]
    flax_model.params["decoder"]["final_layer_norm"][
        "weight"] = t5x_decoder_norm

    # Only for layer 0:
    t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["relpos_bias"][
        "rel_embedding"].T
    flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"][
        "relative_attention_bias"]["embedding"] = t5x_decoder_rel_embedding

    # Token Embeddings
    t5x_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"]
    flax_model.params["shared"]["embedding"] = t5x_token_embeddings

    # LM Head (only in v1.1 checkpoints)
    if "logits_dense" in t5x_model["target"]["decoder"]:
        flax_model.params["lm_head"]["kernel"] = t5x_model["target"][
            "decoder"]["logits_dense"]["kernel"]

    flax_model.save_pretrained(flax_dump_folder_path)
    print("T5X Model was successfully converted!")
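
The conversion function above is typically driven from the command line. A minimal argparse wrapper sketch is shown below; the flag names are assumptions chosen to mirror the function's parameters.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Flag names are illustrative; they simply mirror the function arguments.
    parser.add_argument("--t5x_checkpoint_path", required=True,
                        help="Path to the T5X checkpoint directory.")
    parser.add_argument("--config_name", required=True,
                        help="Model config name or path used to build T5Config.")
    parser.add_argument("--flax_dump_folder_path", required=True,
                        help="Output folder for the converted Flax model.")
    args = parser.parse_args()
    convert_t5x_checkpoint_to_flax(args.t5x_checkpoint_path, args.config_name,
                                   args.flax_dump_folder_path)
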
Example #19
0
])

tokenizer.save_model(tokenizer_dir)

tokenizer = ByteLevelBPETokenizer(
    f"{tokenizer_dir}/vocab.json",
    f"{tokenizer_dir}/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

config = T5Config(
    vocab_size=52_000,
    num_layers=6,
    num_heads=12,
    # Note: T5Config has no max_position_embeddings, num_attention_heads,
    # num_hidden_layers, or type_vocab_size options; T5 uses relative position
    # embeddings, and model size is set via num_layers/num_heads/d_model.
)

tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

model = T5ForConditionalGeneration(config=config)
model.num_parameters()

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/train_texts.txt",
    block_size=128,
Example #20
0
    return torch.tensor(LA.norm(mat, n).item())


for dirname in [
        './models/11b/heads'
]:  # for your case, replace dirname with the path to your model file
    seeds = [0, 1, 2, 3, 4]
    for seed in seeds:
        gc.collect()
        results_encoder = defaultdict(list)
        results_decoder = defaultdict(list)
        table_file_decoder = open(
            f'l1_decoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        table_file_encoder = open(
            f'l1_encoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        config = T5Config.from_pretrained(f'{dirname}-{seed}')
        model = T5ForConditionalGeneration.from_pretrained(f'{dirname}-{seed}',
                                                           config=config)

        org_config = T5Config.from_pretrained('./models/11b')
        org_model = T5ForConditionalGeneration.from_pretrained(
            './models/11b', config=org_config)

        org_dict = org_model.state_dict()
        trained_dict = model.state_dict()

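        # Walk the encoder blocks and pull matching self-attention weight
        # matrices from the fine-tuned and original state dicts for comparison.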
        for encoder_n in range(24):
            print("seed", seed, encoder_n)
            q_org = org_dict[
                f'encoder.block.{encoder_n}.layer.0.SelfAttention.q.weight']
            q_new = trained_dict[