tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.config_name: config = T5Config.from_pretrained( model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer) ) elif model_args.model_name_or_path: config = T5Config.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer) ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names
def run_trt(
    self,
    metadata: NetworkMetadata,
    onnx_fpaths: Tuple[NetworkModel],
    network_input: List[str],
    working_directory: str,
    keep_trt_engine: bool,
    keep_onnx_model: bool,
    keep_torch_model: bool,
    timing_profile: TimingProfile,
) -> List[NetworkResult]:
    """Run every entry of ``network_input`` through the T5 TensorRT path.

    The ONNX segments are either taken from ``onnx_fpaths`` or, when that
    collection is empty, produced through
    ``self.frameworks_cmd.generate_and_download_framework``. Each segment is
    turned into an engine with ``as_trt_engine`` and wrapped in the
    encoder/decoder helpers stored on ``self`` before inference is executed.

    Args:
        metadata: Network description forwarded to the workspace, the ONNX
            file wrappers, the TRT wrappers and ``execute_inference``.
        onnx_fpaths: Pre-exported ONNX segments; empty to have them generated.
        network_input: Inputs to run, one inference call per element.
        working_directory: Root directory handed to ``NNFolderWorkspace``.
        keep_trt_engine: Forwarded to ``self.cleanup``.
        keep_onnx_model: Forwarded to ``self.cleanup``; forced to ``True``
            when the caller supplied the ONNX files.
        keep_torch_model: Forwarded to ``self.cleanup``; forced to ``True``
            when the caller supplied the ONNX files.
        timing_profile: Forwarded to ``execute_inference``.

    Returns:
        One result per element of ``network_input``, in input order.

    Raises:
        AssertionError: If the number of ONNX segments differs from
            ``T5ModelTRTConfig.NETWORK_SEGMENTS``.
    """
    workspace = NNFolderWorkspace(
        self.frameworks_cmd.config.network_name, metadata, working_directory
    )

    results = []
    try:
        if len(onnx_fpaths) != 0:
            # Caller-owned files must survive the cleanup step below.
            keep_onnx_model = True
            keep_torch_model = True
        else:
            # No ONNX paths were supplied: export/download them first.
            generated = self.frameworks_cmd.generate_and_download_framework(
                metadata, workspace
            )
            onnx_fpaths = generated.onnx

        # Output networks shall not exceed the number of network segments
        # explicitly defined by the configuration file.
        assert len(onnx_fpaths) == len(
            T5ModelTRTConfig.NETWORK_SEGMENTS
        ), "There should only be {} exported ONNX segments in T5 model.".format(
            len(T5ModelTRTConfig.NETWORK_SEGMENTS)
        )

        # Index the segments by name so encoder/decoder can be picked out.
        hash_onnx_fpath = {segment.name: segment for segment in onnx_fpaths}
        decoder_onnx_fpath = hash_onnx_fpath[
            T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME
        ].fpath
        encoder_onnx_fpath = hash_onnx_fpath[
            T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME
        ].fpath

        encoder_onnx = T5EncoderONNXFile(encoder_onnx_fpath, metadata)
        self.t5_trt_encoder_engine = encoder_onnx.as_trt_engine(
            encoder_onnx_fpath + ".engine"
        )
        decoder_onnx = T5DecoderONNXFile(decoder_onnx_fpath, metadata)
        self.t5_trt_decoder_engine = decoder_onnx.as_trt_engine(
            decoder_onnx_fpath + ".engine"
        )

        tfm_config = T5Config(
            use_cache=metadata.other.kv_cache,
            num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
        )
        self.t5_trt_encoder = T5TRTEncoder(
            self.t5_trt_encoder_engine, metadata, tfm_config
        )
        self.t5_trt_decoder = T5TRTDecoder(
            self.t5_trt_decoder_engine, metadata, tfm_config
        )

        results.extend(
            self.execute_inference(metadata, hash_onnx_fpath, ninput, timing_profile)
            for ninput in network_input
        )
    finally:
        # Always release workspace artefacts, even when inference failed.
        self.cleanup(workspace, keep_trt_engine, keep_onnx_model, keep_torch_model)

    return results
'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask' ] encoded.set_format(type='torch', columns=columns) train_dataloader = torch.utils.data.DataLoader(encoded["train"], collate_fn=collate_fn, batch_size=args.batch_size) val_dataloader = torch.utils.data.DataLoader(encoded["validation"], collate_fn=collate_fn, batch_size=args.batch_size * 4) if args.from_pretrained: model = T5ForConditionalGeneration.from_pretrained(args.model_select) else: config = T5Config.from_pretrained(args.model_select) model = T5ForConditionalGeneration(config) no_decay = ["bias", "LayerNorm.weight"] params_decay = [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ] params_nodecay = [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ] optim_groups = [ { "params": params_decay, "weight_decay": 0.1
MODEL_PATH = os.environ.get("MODEL_PATH", "/data/model.pth") BASE_MODEL = os.environ.get("BASE_MODEL", "t5-base") DECODING = os.environ.get("DECODING", "greedy") # greedy, topk-N (e.g., topk-10) cuda = torch.cuda.is_available() if cuda: torch.cuda.set_device(0) # singe gpu device = torch.device("cuda") else: device = torch.device("cpu") logger.info(f"question generation is set to run on {device}") # init model logger.info("question generation model is preparing...") config = T5Config.from_pretrained(BASE_MODEL) model = T5ForConditionalGeneration(config=config) t = QGTokenizer(tokenizer=BASE_MODEL) checkpoint = torch.load(MODEL_PATH, map_location=device) model.load_state_dict(checkpoint["model_state_dict"]) model.eval() if cuda: model.cuda() logger.info(f"question generation model is ready") app = Flask(__name__) @app.route("/question", methods=["POST"]) def respond():
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration from transformers.modeling_t5 import load_tf_weights_in_t5 from flask import Flask, request, jsonify app = Flask(__name__) base_model = "t5-large" tokenizer = T5Tokenizer.from_pretrained(base_model) model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model)) load_tf_weights_in_t5(model, None, "/data/") model.eval() ret_dict = { 'low air quality': 'LowAirQuality', 'low humidity': 'LowHumidity', 'low brightness': 'LowBrightness', 'low noise level': 'LowNoise', 'low security': 'LowSecurity', 'low temperature': 'LowTemperature', 'high air quality': 'HighAirQuality', 'high humidity': 'HighHumidity', 'high brightness': 'HighBrightness', 'high noise level': 'HighNoise', 'high security': 'HighSecurity', 'high temperature': 'HighTemperature' } def run_model(input_string, **generator_args): input_ids = tokenizer.encode(input_string, return_tensors="pt")
class Funnel_T5_VAE_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of :class:`~transformer_vae.T5_VAE_Model`.
    It is used to instantiate a Funnel-T5-VAE model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `funnel-t5-vae-base` architecture.

    To be able to use `transformer.trainer.Trainer` we need some specific training logic & config in the model.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the
    model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        latent_size (:obj:`int`, `optional`, defaults to 1,000):
            Number of dimensions to use for the sequences latent code.
        funnel_name (:obj:`str`, `optional`, defaults to funnel-transformer/intermediate):
            Name of the Funnel transformer config to load for the encoder.
            Ignored when a ``funnel`` dict is supplied through ``kwargs``.
        t5_name (:obj:`str`, `optional`, defaults to t5-base):
            Name of the T5 config to load for the decoder.
            Ignored when a ``t5`` dict is supplied through ``kwargs``.
        vae_encoder_model (:obj:`str`, `optional`, defaults to ''):
            Name of the model to encode hidden states into latent codes.
            Must be a key of ``VAE_ENCODER_MODELS``.
        vae_decoder_model (:obj:`str`, `optional`, defaults to ''):
            Name of the model to decode latent codes into hidden states.
            Must be a key of ``VAE_DECODER_MODELS``.
        critic_type (:obj:`str`, `optional`, defaults to ''):
            Stored as ``critic_type``; not interpreted by this class.
        critic_name (:obj:`str`, `optional`, defaults to ''):
            Name of the Transformer config to load as a critic. When empty no critic config is created
            and ``critic`` is ``None``.
        set_seq_size (:obj:`int`, `optional`, defaults to 60):
            NOTE: Every input sequence must be padded to be equal to this length.
        decoder_start_token_id (:obj:`int`, `optional`, defaults to 0):
            Written onto freshly loaded Funnel and T5 configs.
        dont_use_reg_loss (:obj:`bool`, `optional`, defaults to False):
            When True, ``use_reg_loss`` is False and a warning is logged.
        cache_dir (:obj:`str`, `optional`, defaults to None):
            Passed to ``AutoConfig.from_pretrained`` for every sub-config that is loaded by name.
        n_latent_tokens (:obj:`int`, `optional`, defaults to 5):
            Number of latent tokens; clamped down to ``set_seq_size`` when larger.
            The signature comment says -1 selects the full sequence.
        num_decoder_layers (:obj:`int`, `optional`, defaults to 0):
            When non-zero, overrides ``num_layers`` on a freshly loaded T5 config.
        num_decoder_heads (:obj:`int`, `optional`, defaults to 0):
            When non-zero, overrides ``num_heads`` on a freshly loaded T5 config.
        attention_window_size (:obj:`int`, `optional`, defaults to 0):
            Must be smaller than ``set_seq_size`` and, when non-zero, must not divide it evenly.
        attention_window_overlap (:obj:`int`, `optional`, defaults to 0):
            Stored as given, but replaced by ``set_seq_size % attention_window_size`` whenever
            ``attention_window_size`` is non-zero.
        skip_upsample (:obj:`bool`, `optional`, defaults to False):
            Stored as ``skip_upsample``; not interpreted by this class.

        *** Training Args ***

        reg_schedule_k (:obj:`float`, `optional`, defaults to 0.0025):
            Multiplied by global_step in a sigmoid, more gradually increase regulariser loss weight.
        reg_schedule_b (:obj:`float`, `optional`, defaults to 6.25):
            Added to global step in sigmoid, further delays increase in regulariser loss weight.
        use_extra_logs (:obj:`bool`, `optional`, defaults to False):
            Store extra logs during each training inference.
        gradient_checkpoint_encoder (:obj:`bool`, `optional`, defaults to False):
            Stored as ``gradient_checkpoint_encoder``; not interpreted by this class.
        decoder_grad_chk_pnt_rate (:obj:`int`, `optional`, defaults to 0):
            Stored as ``decoder_grad_chk_pnt_rate``; not interpreted by this class.
        funnel_block_sizes (:obj:`str`, `optional`, defaults to ''):
            Size of each Funnel Encoder block, sequence is halved between each block.
            Example specification: 1_1_1

        *** End ***

    TODO: Add extra models to condition on the latent
    """
    model_type = "transformer_vae"
    is_composition = True

    def __init__(
        self,
        latent_size=1_000,
        funnel_name="funnel-transformer/intermediate",
        t5_name="t5-base",
        vae_encoder_model='',
        vae_decoder_model='',
        critic_type='',
        critic_name='',
        set_seq_size=60,
        decoder_start_token_id=0,
        dont_use_reg_loss=False,
        reg_schedule_k=0.0025,
        reg_schedule_b=6.25,
        use_extra_logs=False,
        cache_dir=None,
        n_latent_tokens=5,  # set to -1 for full sequence
        funnel_block_sizes='',
        num_decoder_layers=0,
        num_decoder_heads=0,
        attention_window_size=0,
        attention_window_overlap=0,
        gradient_checkpoint_encoder=False,
        decoder_grad_chk_pnt_rate=0,
        skip_upsample=False,
        **kwargs,
    ):
        assertIn(vae_encoder_model, VAE_ENCODER_MODELS.keys(), "Unexpected VAE encoder.")
        assertIn(vae_decoder_model, VAE_DECODER_MODELS.keys(), "Unexpected VAE decoder.")
        super().__init__(**kwargs)
        self.set_seq_size = set_seq_size

        # VAE
        self.vae_encoder_model = vae_encoder_model
        self.vae_decoder_model = vae_decoder_model
        if set_seq_size < n_latent_tokens:
            logger.warning(
                f'set_seq_size size is smaller than n_latent_tokens, now using n_latent_tokens={set_seq_size} from {n_latent_tokens}'
            )
            n_latent_tokens = set_seq_size
        self.latent_size = latent_size
        self.n_latent_tokens = n_latent_tokens
        self.skip_upsample = skip_upsample

        # funnel encoder model
        if 'funnel' not in kwargs:
            self.funnel = AutoConfig.from_pretrained(funnel_name, cache_dir=cache_dir)
            if funnel_block_sizes:
                # "1_1_1" -> [1, 1, 1]
                self.funnel.block_sizes = [int(i) for i in funnel_block_sizes.split('_')]
            self.funnel.decoder_start_token_id = decoder_start_token_id
            self.funnel.n_positions = set_seq_size
        else:
            # A nested config dict was supplied in kwargs; rebuild the sub-config from it.
            self.funnel = FunnelConfig(**kwargs.pop('funnel'))
        # The docstring states the sequence is halved between blocks, hence
        # one halving fewer than there are blocks.
        pooling_division = 2**(len(self.funnel.block_sizes) - 1)
        self.encoded_seq_size = math.ceil(self.funnel.n_positions / pooling_division)
        self.gradient_checkpoint_encoder = gradient_checkpoint_encoder

        # T5 decoder model
        if 't5' not in kwargs:
            self.t5 = AutoConfig.from_pretrained(t5_name, cache_dir=cache_dir)
            if num_decoder_layers:
                self.t5.num_layers = num_decoder_layers
            if num_decoder_heads:
                self.t5.num_heads = num_decoder_heads
            self.t5.decoder_start_token_id = decoder_start_token_id
            self.t5.n_positions = self.funnel.n_positions
            assertEqual(self.t5.model_type, "t5", "Need t5 model type for transformer_decoder.")
        else:
            self.t5 = T5Config(**kwargs.pop('t5'))
        assertEqual(self.funnel.d_model, self.t5.d_model, "Funnel & T5 transformers have different dimensions.")
        self.decoder_grad_chk_pnt_rate = decoder_grad_chk_pnt_rate
        assert (attention_window_size < set_seq_size
                ), 'Attention window must be smallar than set sequence size.'
        self.attention_window_size = attention_window_size
        self.attention_window_overlap = attention_window_overlap
        if attention_window_size:
            assert (
                set_seq_size % attention_window_size != 0
            ), 'When doing an alternating attention pattern the sequence size cannot be divisable by the window size as no alternations will be possible.'
            # With a window configured the overlap is derived, not user-supplied.
            self.attention_window_overlap = set_seq_size % attention_window_size

        # extra training losses
        self.use_reg_loss = not dont_use_reg_loss
        if dont_use_reg_loss:
            logger.warning(
                "Regularisation loss is turned off, you are training an Autoencoder (not a VAE)."
            )
        self.reg_schedule_k = reg_schedule_k
        self.reg_schedule_b = reg_schedule_b
        self.use_extra_logs = use_extra_logs

        # critic model
        self.critic = None
        # Always define the attribute so reading `config.critic_type` cannot
        # raise AttributeError when no critic is configured.
        self.critic_type = critic_type
        if critic_name:
            if 'critic' not in kwargs:
                self.critic = AutoConfig.from_pretrained(critic_name, cache_dir=cache_dir)
            else:
                self.critic = FunnelConfig(**kwargs.pop('critic'))
            # This check compares T5 against the critic, so the message names them.
            assertEqual(self.t5.d_model, self.critic.d_model, "T5 & critic transformers have different dimensions.")

        # misc
        self.use_cache = getattr(self.funnel, "use_cache", False)
"WARNING: e2e is meant to generate questions by context. The ouput of the script will be a csv instead of a json." ) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("Device:", device) model_created = False print("Loading model and tokenizer...", end="", flush=True) if args.checkpoint != None: model_created = True if args.bart: config = BartConfig.from_json_file(args.checkpoint + "/config.json") model = BartForConditionalGeneration.from_pretrained( args.checkpoint + "/pytorch_model.bin", config=config) if args.t5: config = T5Config.from_json_file(args.checkpoint + "/config.json") model = T5ForConditionalGeneration.from_pretrained( args.checkpoint + "/pytorch_model.bin", config=config) elif not args.bart and not args.t5: config = EncoderDecoderConfig.from_json_file(args.checkpoint + "/config.json") model = EncoderDecoderModel.from_pretrained(args.checkpoint + "/pytorch_model.bin", config=config) model_name = args.checkpoint if args.bart: if args.checkpoint == None: model_name = "WikinewsSum/bart-large-multi-fr-wiki-news" if args.model_name == "" else args.model_name tokenizer = BartTokenizer.from_pretrained( args.tokenizer
# This is a very small notebook showing how to grab a pre-trained T5 model, fine-tune it, and export it to onnx.] # A lot of this is inspired by huggingface. from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, AdamW import torch from onnxt5 import generate_onnx_representation, GenerativeT5 from onnxt5.api import get_sess import tempfile temp_dir = tempfile.gettempdir() base_model = "t5-base" # Setting up the model and tokenizer config = T5Config.from_pretrained(base_model) config.n_positions = 256 # You can change the properties of your model here model = T5ForConditionalGeneration(config=config) # Download vocab file tokenizer = T5Tokenizer(config=config, vocab_file="test_sentencepiece.model") model.train() # Let's setup our optimizer optimizer = AdamW(model.parameters(), lr=1e-5) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay':
def convert_model(args):
    """Wrap exported decoder (and T5 encoder/init) graphs in a BeamSearch ONNX model.

    Steps, in order:

    1. Ensure ``args.decoder_onnx`` exists; for GPT-2 it is exported with
       ``gpt2_to_onnx`` when missing, other model types must supply it.
    2. Run symbolic shape inference on the decoder (GPT-2 only for now).
    3. Load the Hugging Face config into the module-level ``config`` global
       and read the EOS/pad token ids and vocabulary size from it.
    4. Build one ``BeamSearch`` node (domain ``com.microsoft``) carrying the
       decoder graph, and for T5 the encoder-decoder-init graph, as attributes.
    5. Declare the graph inputs/outputs and save the model to ``args.output``.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``decoder_onnx``, ``model_type``, ``model_name_or_path``,
            ``cache_dir``, ``vocab_size``, ``precision``, ``prefix_vocab_mask``,
            ``output_sequences_scores``, ``output_token_scores``,
            ``no_repeat_ngram_size``, ``early_stopping``,
            ``encoder_decoder_init_onnx`` (T5 only) and ``output``.

    Raises:
        AssertionError: If the decoder ONNX file is missing for a non-GPT-2
            model, or if ``output_token_scores`` is requested without
            ``output_sequences_scores``.
    """
    # Must precede every use of the name inside this function.
    global config

    is_gpt2 = args.model_type == "gpt2"

    if os.path.exists(args.decoder_onnx):
        print(f"skip convert_to_onnx since path existed: {args.decoder_onnx}")
    else:
        assert is_gpt2, "please have onnx model ready for model type that is not gpt2"
        gpt2_to_onnx(args)

    # TODO: fix shape inference for T5. Currently symbolic shape inference on T5 is broken.
    enable_shape_inference = is_gpt2

    if enable_shape_inference:
        print(f"Run symbolic shape inference on {args.decoder_onnx}. The file will be overwritten.")
        shape_inference(args.decoder_onnx)

    if is_gpt2:
        config = GPT2Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        config = T5Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    print(config)

    eos_token_id = config.eos_token_id
    # GPT-2 reuses EOS for padding. Other model types (T5) take the pad id
    # from their own config, so padding is not emitted as EOS; EOS remains
    # the fallback when the config does not define a pad token.
    config_pad_token_id = getattr(config, "pad_token_id", None)
    if is_gpt2 or config_pad_token_id is None:
        pad_token_id = eos_token_id
    else:
        pad_token_id = config_pad_token_id

    # if vocab_size is given in parameters use that.
    vocab_size = config.vocab_size if args.vocab_size == -1 else args.vocab_size

    model = onnx.load(args.decoder_onnx)
    model.graph.name = f"{args.model_type} decoder subgraph"
    if is_gpt2:
        verify_gpt2_subgraph(model.graph, args.precision)
    else:
        verify_t5_decoder_subgraph(model.graph, args.precision)

    inputs = [
        "input_ids",
        "max_length",
        "min_length",
        "num_beams",
        "num_return_sequences",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "vocab_mask",
    ]
    if args.prefix_vocab_mask:
        inputs.append("prefix_vocab_mask")

    outputs = ["sequences"]
    if args.output_sequences_scores:
        outputs.append("sequences_scores")

    if args.output_token_scores:
        assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
        outputs.append("scores")

    node = helper.make_node(
        "BeamSearch",
        inputs=inputs,
        outputs=outputs,
        name=f"BeamSearch_{args.model_type}",
    )
    node.domain = "com.microsoft"
    node.attribute.extend(
        [
            helper.make_attribute("eos_token_id", eos_token_id),
            helper.make_attribute("pad_token_id", pad_token_id),
            helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
            helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
            helper.make_attribute("model_type", 0 if is_gpt2 else 1),
            helper.make_attribute("decoder", model.graph),
        ]
    )

    if args.model_type == "t5":
        # Never true today (shape inference is GPT-2 only); kept so the T5
        # path lights up once the TODO above is resolved.
        if enable_shape_inference:
            print(f"Run symbolic shape inference on {args.encoder_decoder_init_onnx}. The file will be overwritten.")
            shape_inference(args.encoder_decoder_init_onnx)
        init_model = onnx.load(args.encoder_decoder_init_onnx)
        init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
        verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
        node.attribute.extend(
            [
                helper.make_attribute("encoder_decoder_init", init_model.graph),
            ]
        )

    from onnx import TensorProto

    # graph inputs, in the same order as the node's `inputs` list above
    graph_inputs = [
        helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"]),
        helper.make_tensor_value_info("max_length", TensorProto.INT32, [1]),
        helper.make_tensor_value_info("min_length", TensorProto.INT32, [1]),
        helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1]),
        helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1]),
        helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size]),
    ]

    if args.prefix_vocab_mask:
        graph_inputs.append(
            helper.make_tensor_value_info("prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size])
        )

    # graph outputs
    sequences = helper.make_tensor_value_info(
        "sequences",
        TensorProto.INT32,
        ["batch_size", "num_return_sequences", "max_length"],
    )
    graph_outputs = [sequences]

    if args.output_sequences_scores:
        graph_outputs.append(
            helper.make_tensor_value_info(
                "sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
            )
        )

    if args.output_token_scores:
        graph_outputs.append(
            helper.make_tensor_value_info(
                "scores",
                TensorProto.FLOAT,
                ["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
            )
        )

    initializers = []

    new_graph = helper.make_graph(
        [node],
        f"{args.model_type}-beam-search",
        graph_inputs,
        graph_outputs,
        initializers,
    )

    # Create the model
    new_model = helper.make_model(
        new_graph,
        producer_name="onnxruntime.transformers",
        opset_imports=model.opset_import,
    )
    onnx.save(new_model, args.output)
from tqdm.notebook import tqdm from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration print("Downloading and unzipping model.", file=sys.stderr) os.system( "wget -nc https://storage.googleapis.com/doctttttquery_git/t5-base.zip") os.system("unzip -o t5-base.zip") nltk.download('punkt') # Define the target device. Use GPU if available. device = 'cuda' if torch.cuda.is_available() else 'cpu' # Instantiate and load the QG model to the GPU. qg_tokenizer = T5Tokenizer.from_pretrained('t5-base') qg_config = T5Config.from_pretrained('t5-base') qg_model = T5ForConditionalGeneration.from_pretrained('model.ckpt-1004000', from_tf=True, config=qg_config) qg_model.to(device) def preprocess(document: str, span=10, stride=5) -> List[str]: """ Define your preprocessing function. This function should take the a corpus document and output a list of generation spans. This is required so we can match the expected sequence size of the generation model. """
print("starting to train") # train word_tokens_train, pos_tokens_train = tasks.pos('UD_English-EWT/en_ewt-ud-train.conllu') tokenizer = T5Tokenizer.from_pretrained("t5-small") ## i want to append pos: - do I include the pos token associated with it? if args.control: word_tokens_train, pos_tokens_train = tasks.make_control(tokenizer, word_tokens_train, pos_tokens_train, args.embsize) torch_ids_train, torch_masks_train, torch_token_starts, torch_labels_train = r.prepare_data(tokenizer, word_tokens_train, pos_tokens_train) # data for training split = int(0.75 * len(torch_ids_train)) #dataset_train = Dataset(torch_ids_train[:split], torch_masks_train[:split], torch_labels_train[:split]) #dataset_dev = Dataset(torch_ids_train[split:], torch_masks_train[split:], torch_labels_train[split:]) config = T5Config.from_pretrained("t5-small", output_hidden_states=True, output_attentions=True) model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config) model.to(device) #train(model, dataset_train, dataset_dev, torch_token_starts[split:], tokenizer) # 100 values test dataset_train = Dataset(torch_ids_train[:200], torch_masks_train[:200], torch_labels_train[:200]) dataset_dev = Dataset(torch_ids_train[200:400], torch_masks_train[200:400], torch_labels_train[200:400]) train(model, dataset_train, dataset_dev, torch_token_starts[200:400], tokenizer) print("done!") else: print("starting to evaluate") tokenizer = T5Tokenizer.from_pretrained("t5-small")
def __init__(
    self,
    model_name,
    args=None,
    tokenizer=None,
    use_cuda=False,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files. When None, a model is built from `self.args.config` instead of being loaded.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args, or a T5Args instance that replaces them.
        tokenizer (optional): A T5Tokenizer instance to use. Anything else causes the tokenizer to be loaded from `model_name`.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. If True but CUDA is unavailable, a warning is issued and the CPU is used.
        cuda_device (optional): Specific GPU that should be used. Will use the default CUDA device when left at -1.
        **kwargs (optional): Only `sweep_config` is consumed by this constructor; other keys are accepted but not used here.
    """  # noqa: ignore flake8"

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, T5Args):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = sweep_config_to_sweep_values(sweep_config)
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    # Honour the documented `use_cuda` / `cuda_device` arguments instead of
    # unconditionally pinning the model to the CPU.
    if use_cuda and torch.cuda.is_available():
        if cuda_device == -1:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device(f"cuda:{cuda_device}")
    else:
        if use_cuda:
            warnings.warn(
                "use_cuda is True but CUDA is not available. Falling back to CPU."
            )
        self.device = "cpu"

    self.results = {}

    if model_name is None:
        self.config = self.args.config
        self.model = T5ForConditionalGeneration(config=self.config)
    else:
        self.config = T5Config.from_pretrained(model_name, **self.args.config)
        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

    if isinstance(tokenizer, T5Tokenizer):
        self.tokenizer = tokenizer
    else:
        self.tokenizer = T5Tokenizer.from_pretrained(model_name, truncate=True)

    # Keep the embedding matrix in step with the tokenizer's vocabulary size.
    self.model.resize_token_embeddings(len(self.tokenizer))

    if self.args.dynamic_quantize:
        self.model = torch.quantization.quantize_dynamic(self.model, {torch.nn.Linear}, dtype=torch.qint8)

    # fp16 is disabled whenever the model ends up on the CPU, including the
    # fallback case where CUDA was requested but is unavailable.
    if self.device == "cpu":
        self.args.fp16 = False

    self.args.model_type = "T5"
    if model_name is None:
        self.args.model_name = "T5_from_scratch"
    else:
        self.args.model_name = model_name

    if self.args.wandb_project and not wandb_available:
        warnings.warn(
            "wandb_project specified but wandb is not available. Wandb disabled."
        )
        self.args.wandb_project = None
def __init__(self, tokenizer):
    """Keep the given tokenizer and build the wrapped T5 network.

    The network is constructed from the ``t5-small`` configuration only;
    this constructor makes no ``from_pretrained`` call on the model class.

    Args:
        tokenizer: Tokenizer object stored unchanged on ``self.tokenizer``.
    """
    super().__init__()
    self.tokenizer = tokenizer
    self.model = T5ForConditionalGeneration(
        config=T5Config.from_pretrained('t5-small')
    )
def _build_vocab(self, max_vocab_cnt):
    """Populate the vocabulary attributes according to ``self.tokenizer_type``.

    * ``word*`` types delegate to ``self._build_vocab_manual``.
    * ``bert-``, ``xlnet-``, ``x5-`` and ``bart-`` types read the pad id and
      the pieces from ``self.tokenizer.sp_model`` and take the vocabulary
      size from the matching pretrained config. They set ``self.pad_id``,
      ``self.vocab_count``, ``self.vocab`` (id -> piece) and
      ``self.rev_vocab`` (piece -> id).
    * Any other type leaves the instance untouched.

    The ``bert-`` and ``bart-`` branches used to build both mappings and then
    drop them; they now store them like the other pretrained branches.

    Args:
        max_vocab_cnt: Vocabulary size limit, only forwarded to
            ``_build_vocab_manual`` for ``word*`` tokenizers.

    Returns:
        None.
    """
    tokenizer_type = self.tokenizer_type

    if tokenizer_type.startswith('word'):
        self._build_vocab_manual(max_vocab_cnt)
        return

    # Pick the config class lazily so only the class actually needed is
    # looked up.
    if tokenizer_type.startswith('bert-'):
        config_cls = BertConfig
    elif tokenizer_type.startswith('xlnet-'):
        config_cls = XLNetConfig
    elif tokenizer_type.startswith('x5-'):
        # NOTE(review): the prefix is 'x5-' while the config is T5Config;
        # possibly meant 't5-'. Left as-is, confirm against callers.
        config_cls = T5Config
    elif tokenizer_type.startswith('bart-'):
        config_cls = BartConfig
    else:
        return

    sp_model = self.tokenizer.sp_model
    self.pad_id = sp_model.piece_to_id("<pad>")

    # Vocabulary size comes from the pretrained config rather than a
    # hard-coded constant.
    self.vocab_count = config_cls.from_pretrained(tokenizer_type).vocab_size

    id_to_piece = {
        index: sp_model.id_to_piece(index) for index in range(self.vocab_count)
    }
    self.vocab = id_to_piece
    self.rev_vocab = {piece: index for index, piece in id_to_piece.items()}
    return
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_t5_mlm", model_args, data_args, framework="flax") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO, datefmt="[%X]", ) # Log on each process the small summary: logger = logging.getLogger(__name__) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. 
set_seed(training_args.seed) # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name(Path( training_args.output_dir).absolute().name, token=training_args.hub_token) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, 
use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
) if model_args.config_name: config = T5Config.from_pretrained( model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer), use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: config = T5Config.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # Since we make sure that all sequences are of the same length, no attention_mask is needed. def tokenize_function(examples): return tokenizer(examples[text_column_name], return_attention_mask=False) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. # To ensure that the input length is `max_seq_length`, we need to increase the maximum length # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly. 
expanded_inputs_length, targets_length = compute_input_and_target_lengths( inputs_length=max_seq_length, noise_density=data_args.mlm_probability, mean_noise_span_length=data_args.mean_noise_span_length, ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: list(chain(*examples[k])) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= expanded_inputs_length: total_length = (total_length // expanded_inputs_length) * expanded_inputs_length # Split by chunks of max_len. result = { k: [ t[i:i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length) ] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Enable tensorboard only on the master node has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter( log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable.") # Initialize our training rng = jax.random.PRNGKey(training_args.seed) dropout_rngs = jax.random.split(rng, jax.local_device_count()) if model_args.model_name_or_path: model = FlaxT5ForConditionalGeneration.from_pretrained( model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), use_auth_token=True if model_args.use_auth_token else None, ) else: config.vocab_size = len(tokenizer) model = FlaxT5ForConditionalGeneration( config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), use_auth_token=True if model_args.use_auth_token else None, ) # Data collator # This one will take care of randomly masking the tokens. 
data_collator = FlaxDataCollatorForT5MLM( tokenizer=tokenizer, noise_density=data_args.mlm_probability, mean_noise_span_length=data_args.mean_noise_span_length, input_length=max_seq_length, target_length=targets_length, pad_token_id=model.config.pad_token_id, decoder_start_token_id=model.config.decoder_start_token_id, ) # Store some constant num_epochs = int(training_args.num_train_epochs) train_batch_size = int( training_args.per_device_train_batch_size) * jax.device_count() eval_batch_size = int( training_args.per_device_eval_batch_size) * jax.device_count() num_train_steps = len( tokenized_datasets["train"]) // train_batch_size * num_epochs num_of_hosts = jax.process_count() current_host_idx = jax.process_index() # Create learning rate schedule warmup_fn = optax.linear_schedule( init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps) decay_fn = optax.linear_schedule( init_value=training_args.learning_rate, end_value=0, transition_steps=num_train_steps - training_args.warmup_steps, ) linear_decay_lr_schedule_fn = optax.join_schedules( schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]) # We use Optax's "masking" functionality to not apply weight decay # to bias and LayerNorm scale parameters. decay_mask_fn returns a # mask boolean with the same structure as the parameters. # The mask is True for parameters that should be decayed. 
def decay_mask_fn(params): flat_params = traverse_util.flatten_dict(params) flat_mask = { path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")]) for path in flat_params } return traverse_util.unflatten_dict(flat_mask) # create adam optimizer if training_args.adafactor: # We use the default parameters here to initialize adafactor, # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74 optimizer = optax.adafactor( learning_rate=linear_decay_lr_schedule_fn, ) else: optimizer = optax.adamw( learning_rate=linear_decay_lr_schedule_fn, b1=training_args.adam_beta1, b2=training_args.adam_beta2, weight_decay=training_args.weight_decay, mask=decay_mask_fn, ) # Setup train state state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer) # Define gradient update step fn def train_step(state, batch, dropout_rng): dropout_rng, new_dropout_rng = jax.random.split(dropout_rng) def loss_fn(params): labels = batch.pop("labels") logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] # compute loss loss = optax.softmax_cross_entropy( logits, onehot(labels, logits.shape[-1])).mean() return loss grad_fn = jax.value_and_grad(loss_fn) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad) metrics = jax.lax.pmean( { "loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step) }, axis_name="batch") return new_state, metrics, new_dropout_rng # Create parallel version of the train step p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, )) # Define eval fn def eval_step(params, batch): labels = batch.pop("labels") logits = model(**batch, params=params, train=False)[0] # compute loss loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) # compute accuracy accuracy = 
jnp.equal(jnp.argmax(logits, axis=-1), labels) # summarize metrics metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()} metrics = jax.lax.pmean(metrics, axis_name="batch") return metrics p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0, )) # Replicate the train state on each device state = jax_utils.replicate(state) train_time = 0 epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0) for epoch in epochs: # ======================== Training ================================ train_start = time.time() train_metrics = [] # Create sampling rng rng, input_rng = jax.random.split(rng) # Generate an epoch by shuffling sampling indices from the train dataset num_train_samples = len(tokenized_datasets["train"]) train_samples_idx = np.random.permutation(np.arange(num_train_samples)) train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) # Gather the indexes for creating the batch and do a training step for step, batch_idx in enumerate( tqdm(train_batch_idx, desc="Training...", position=1)): samples = [ tokenized_datasets["train"][int(idx)] for idx in batch_idx ] model_inputs = data_collator(samples) local_host_model_inputs = { key: np.split(model_inputs.data[key], num_of_hosts, axis=0)[current_host_idx] for key, value in model_inputs.data.items() } # Model forward model_inputs = shard(local_host_model_inputs) state, train_metric, dropout_rngs = p_train_step( state, model_inputs, dropout_rngs) train_metrics.append(train_metric) cur_step = epoch * (num_train_samples // train_batch_size) + step if cur_step % training_args.logging_steps == 0 and cur_step > 0: # Save metrics train_metric = jax_utils.unreplicate(train_metric) train_time += time.time() - train_start if has_tensorboard and jax.process_index() == 0: write_train_metric(summary_writer, train_metrics, train_time, cur_step) epochs.write( f"Step... 
({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate:" f" {train_metric['learning_rate'].mean()})") train_metrics = [] if cur_step % training_args.eval_steps == 0 and cur_step > 0: # ======================== Evaluating ============================== num_eval_samples = len(tokenized_datasets["validation"]) eval_samples_idx = jnp.arange(num_eval_samples) eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size) eval_metrics = [] for i, batch_idx in enumerate( tqdm(eval_batch_idx, desc="Evaluating ...", position=2)): samples = [ tokenized_datasets["validation"][int(idx)] for idx in batch_idx ] model_inputs = data_collator(samples) # Model forward model_inputs = shard(model_inputs.data) metrics = p_eval_step(state.params, model_inputs) eval_metrics.append(metrics) # get eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(jnp.mean, eval_metrics) # Update progress bar epochs.write( f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})" ) # Save metrics if has_tensorboard and jax.process_index() == 0: write_eval_metric(summary_writer, eval_metrics, cur_step) if cur_step % training_args.save_steps == 0 and cur_step > 0: # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get( jax.tree_map(lambda x: x[0], state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub( commit_message= f"Saving weights and logs of step {cur_step}", blocking=False) # Eval after training if training_args.do_eval: num_eval_samples = len(tokenized_datasets["validation"]) eval_samples_idx = jnp.arange(num_eval_samples) eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size) eval_metrics = [] for i, batch_idx in enumerate( tqdm(eval_batch_idx, desc="Evaluating ...", position=2)): samples = [ 
tokenized_datasets["validation"][int(idx)] for idx in batch_idx ] model_inputs = data_collator(samples) # Model forward model_inputs = shard(model_inputs.data) metrics = p_eval_step(state.params, model_inputs) eval_metrics.append(metrics) # get eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(), eval_metrics) if jax.process_index() == 0: eval_metrics = { f"eval_{metric_name}": value for metric_name, value in eval_metrics.items() } path = os.path.join(training_args.output_dir, "eval_results.json") with open(path, "w") as f: json.dump(eval_metrics, f, indent=4, sort_keys=True)
def __init__(
    self,
    model_name,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers
            compatible pre-trained model, a community model, or the path to a directory containing model files.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be
            a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): Accepted for interface compatibility (proxies, force_download, resume_download,
            cache_dir, ...). NOTE(review): this initializer does not forward them to any `from_pretrained` call.

    Raises:
        ValueError: If `use_cuda` is True but `torch.cuda.is_available()` is False.
    """  # noqa: ignore flake8"

    # Seed the Python, NumPy and torch RNGs before anything else runs.
    if args and "manual_seed" in args:
        manual_seed = args["manual_seed"]
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if "n_gpu" in args and args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(manual_seed)

    # Defaults defined by this class. Keys read further down that are not
    # listed here ("config", "fp16", "wandb_project") must come from one of
    # the later layers -- presumably `global_args`; verify against its
    # definition.
    self.args = {
        "dataset_class": None,
        "do_sample": False,
        "early_stopping": True,
        "evaluate_generated_text": False,
        "length_penalty": 2.0,
        "max_length": 20,
        "max_steps": -1,
        "num_beams": 1,
        "num_return_sequences": 1,
        "preprocess_inputs": True,
        "repetition_penalty": 1.0,
        "top_k": None,
        "top_p": None,
        "use_multiprocessed_decoding": True,
    }

    # Layering order (later wins): class defaults < global_args
    # < args saved alongside the model < caller-supplied args.
    self.args.update(global_args)

    saved_model_args = self._load_model_args(model_name)
    if saved_model_args:
        self.args.update(saved_model_args)

    # Applied once; the original repeated this identical update twice.
    if args:
        self.args.update(args)

    if not use_cuda:
        self.device = "cpu"
    elif not torch.cuda.is_available():
        raise ValueError(
            "'use_cuda' set to True when cuda is unavailable. "
            "Make sure CUDA is available or set `use_cuda=False`."
        )
    elif cuda_device == -1:
        # -1 means "no explicit index": use the default CUDA device.
        self.device = torch.device("cuda")
    else:
        self.device = torch.device(f"cuda:{cuda_device}")

    self.results = {}

    self.config = T5Config.from_pretrained(model_name, **self.args["config"])
    self.model = T5ForConditionalGeneration.from_pretrained(
        model_name, config=self.config
    )
    self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    # fp16 is switched off whenever the caller asked for CPU-only.
    if not use_cuda:
        self.args["fp16"] = False

    self.args["model_name"] = model_name

    if self.args["wandb_project"] and not wandb_available:
        warnings.warn(
            "wandb_project specified but wandb is not available. Wandb disabled."
        )
        self.args["wandb_project"] = None
def __init__(
    self,
    model_name,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers
            compatible pre-trained model, a community model, or the path to a directory containing model files.
        args (optional): Default args will be used if this parameter is not provided. Either a dict of the args
            that should be changed (merged into the loaded args), or a T5Args instance (replaces the loaded
            args entirely). Any other type is ignored.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): If it contains `sweep_config`, that entry is popped and the `"value"` of every item
            except `"_wandb"` is merged into the args. NOTE(review): the remaining kwargs are not forwarded to
            any `from_pretrained` call in this initializer.

    Raises:
        ValueError: If `use_cuda` is True but `torch.cuda.is_available()` is False.
    """  # noqa: ignore flake8"

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, T5Args):
        self.args = args

    # Sweep values are applied last so they override both loaded and
    # caller-supplied args.
    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = {
            key: value["value"]
            for key, value in sweep_config.as_dict().items()
            if key != "_wandb"
        }
        self.args.update_from_dict(sweep_values)

    # Compare against None rather than testing truthiness: a seed of 0 is a
    # valid seed and was previously skipped.
    # NOTE(review): assumes an unset seed is represented by None in T5Args.
    manual_seed = self.args.manual_seed
    if manual_seed is not None:
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(manual_seed)

    if not use_cuda:
        self.device = "cpu"
    elif not torch.cuda.is_available():
        raise ValueError(
            "'use_cuda' set to True when cuda is unavailable. "
            "Make sure CUDA is available or set `use_cuda=False`."
        )
    elif cuda_device == -1:
        # -1 means "no explicit index": use the default CUDA device.
        self.device = torch.device("cuda")
    else:
        self.device = torch.device(f"cuda:{cuda_device}")

    self.results = {}

    self.config = T5Config.from_pretrained(model_name, **self.args.config)
    self.model = T5ForConditionalGeneration.from_pretrained(
        model_name, config=self.config
    )
    self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    # fp16 is switched off whenever the caller asked for CPU-only.
    if not use_cuda:
        self.args.fp16 = False

    self.args.model_type = "T5"
    self.args.model_name = model_name

    if self.args.wandb_project and not wandb_available:
        warnings.warn(
            "wandb_project specified but wandb is not available. Wandb disabled."
        )
        self.args.wandb_project = None
def _copy_attention_kernels(t5x_attention, flax_attention):
    """Copy the key/out/query/value kernels of one T5X attention module into its Flax counterpart."""
    for t5x_name, flax_name in (("key", "k"), ("out", "o"), ("query", "q"), ("value", "v")):
        flax_attention[flax_name]["kernel"] = t5x_attention[t5x_name]["kernel"]


def _copy_mlp_kernels(t5x_mlp, flax_mlp, split_mlp_wi):
    """Copy the feed-forward kernels (`wi_0`/`wi_1` or `wi`, plus `wo`) of one T5X MLP into its Flax counterpart."""
    wi_names = ("wi_0", "wi_1") if split_mlp_wi else ("wi",)
    for name in (*wi_names, "wo"):
        flax_mlp[name]["kernel"] = t5x_mlp[name]["kernel"]


def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_folder_path):
    """Load a T5X checkpoint, copy its weights into a Flax T5 model and save that model.

    A `FlaxT5ForConditionalGeneration` is built from the config loaded via
    `T5Config.from_pretrained(config_name)`; its parameter tree is then
    overwritten in place, entry by entry, with the arrays found under the
    checkpoint's `"target"` tree, and the result is written with
    `save_pretrained`.

    Args:
        t5x_checkpoint_path: Passed to `checkpoints.load_t5x_checkpoint`.
        config_name: Passed to `T5Config.from_pretrained`. Its `num_layers`
            and `num_decoder_layers` decide how many encoder / decoder
            layers are copied.
        flax_dump_folder_path: Passed to `flax_model.save_pretrained`.

    Raises:
        KeyError: If an expected entry is missing from the checkpoint or from
            the Flax parameter tree (plain subscript lookups are used).
    """
    config = T5Config.from_pretrained(config_name)
    flax_model = FlaxT5ForConditionalGeneration(config=config)
    t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)

    t5x_target = t5x_model["target"]
    t5x_encoder = t5x_target["encoder"]
    t5x_decoder = t5x_target["decoder"]
    # Every write below mutates this tree in place, which is what
    # `save_pretrained` serializes at the end.
    flax_params = flax_model.params
    flax_encoder = flax_params["encoder"]
    flax_decoder = flax_params["decoder"]

    # The first encoder layer tells us whether the checkpoint stores the MLP
    # input projection as two kernels (wi_0 / wi_1) or a single one (wi).
    split_mlp_wi = "wi_0" in t5x_encoder["layers_0"]["mlp"]

    # Encoder
    for layer_index in range(config.num_layers):
        t5x_layer = t5x_encoder[f"layers_{layer_index}"]
        flax_sublayers = flax_encoder["block"][str(layer_index)]["layer"]

        # Sub-layer "0": self-attention and its layer normalization.
        _copy_attention_kernels(t5x_layer["attention"], flax_sublayers["0"]["SelfAttention"])
        flax_sublayers["0"]["layer_norm"]["weight"] = t5x_layer["pre_attention_layer_norm"]["scale"]

        # Sub-layer "1": feed-forward and its layer normalization.
        _copy_mlp_kernels(t5x_layer["mlp"], flax_sublayers["1"]["DenseReluDense"], split_mlp_wi)
        flax_sublayers["1"]["layer_norm"]["weight"] = t5x_layer["pre_mlp_layer_norm"]["scale"]

    # Only for layer 0: the relative position embedding, stored transposed in T5X.
    flax_encoder["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"][
        "embedding"
    ] = t5x_encoder["relpos_bias"]["rel_embedding"].T

    # Encoder Normalization
    flax_encoder["final_layer_norm"]["weight"] = t5x_encoder["encoder_norm"]["scale"]

    # Decoder
    for layer_index in range(config.num_decoder_layers):
        t5x_layer = t5x_decoder[f"layers_{layer_index}"]
        flax_sublayers = flax_decoder["block"][str(layer_index)]["layer"]

        # Sub-layer "0": self-attention and its layer normalization.
        _copy_attention_kernels(t5x_layer["self_attention"], flax_sublayers["0"]["SelfAttention"])
        flax_sublayers["0"]["layer_norm"]["weight"] = t5x_layer["pre_self_attention_layer_norm"]["scale"]

        # Sub-layer "1": encoder-decoder attention and its layer normalization.
        _copy_attention_kernels(
            t5x_layer["encoder_decoder_attention"], flax_sublayers["1"]["EncDecAttention"]
        )
        flax_sublayers["1"]["layer_norm"]["weight"] = t5x_layer["pre_cross_attention_layer_norm"]["scale"]

        # Sub-layer "2": feed-forward and its layer normalization.
        _copy_mlp_kernels(t5x_layer["mlp"], flax_sublayers["2"]["DenseReluDense"], split_mlp_wi)
        flax_sublayers["2"]["layer_norm"]["weight"] = t5x_layer["pre_mlp_layer_norm"]["scale"]

    # Decoder Normalization
    flax_decoder["final_layer_norm"]["weight"] = t5x_decoder["decoder_norm"]["scale"]

    # Only for layer 0: the relative position embedding, stored transposed in T5X.
    flax_decoder["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"][
        "embedding"
    ] = t5x_decoder["relpos_bias"]["rel_embedding"].T

    # Token Embeddings
    flax_params["shared"]["embedding"] = t5x_target["token_embedder"]["embedding"]

    # LM Head (only in v1.1 checkpoints)
    if "logits_dense" in t5x_decoder:
        flax_params["lm_head"]["kernel"] = t5x_decoder["logits_dense"]["kernel"]

    flax_model.save_pretrained(flax_dump_folder_path)
    print("T5X Model was successfully converted!")
]) tokenizer.save_model(tokenizer_dir) tokenizer = ByteLevelBPETokenizer( "tokenizer/vocab.json", "tokenizer/merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) config = T5Config( vocab_size=52_000, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=6, type_vocab_size=1, ) tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512) model = T5ForConditionalGeneration(config=config) model.num_parameters() train_dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path=f"{data_dir}/train_texts.txt", block_size=128,
return torch.tensor(LA.norm(mat, n).item()) for dirname in [ './models/11b/heads' ]: # for your case, replace dirname with the path to your model file seeds = [0, 1, 2, 3, 4] for seed in seeds: gc.collect() results_encoder = defaultdict(list) results_decoder = defaultdict(list) table_file_decoder = open( f'l1_decoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w') table_file_encoder = open( f'l1_encoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w') config = T5Config.from_pretrained(f'{dirname}-{seed}') model = T5ForConditionalGeneration.from_pretrained(f'{dirname}-{seed}', config=config) org_config = T5Config.from_pretrained(f'./models/11b') org_model = T5ForConditionalGeneration.from_pretrained( f'./models/11b', config=org_config) org_dict = org_model.state_dict() trained_dict = model.state_dict() for encoder_n in range(24): print("seed", seed, encoder_n) q_org = org_dict[ f'encoder.block.{encoder_n}.layer.0.SelfAttention.q.weight'] q_new = trained_dict[