def __init__(self, config, dataset):
    super(T5, self).__init__(config, dataset)

    self.max_source_length = dataset.max_source_length
    self.max_target_length = dataset.max_target_length
    self.pretrained_model_path = config['pretrained_model_path']

    self.tokenizer = T5Tokenizer.from_pretrained(self.pretrained_model_path, add_prefix_space=True)
    self.configuration = T5Config.from_pretrained(self.pretrained_model_path)

    self.decoder = T5ForConditionalGeneration.from_pretrained(
        self.pretrained_model_path, config=self.configuration)

    self.padding_token_idx = self.tokenizer.pad_token_id
    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx, reduction='none')

    if config['task_type'] == "summarization":
        self.t5_task_text = "summarize: "
    elif config['task_type'] == "translation":
        self.t5_task_text = "translate German to English: "
    else:
        raise NotImplementedError("Only summarization and translation are supported.")
def convert_model(base_model, path, new_path):
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    print("loading weights...")
    load_tf_weights_in_t5(model, None, path)
    model.eval()
    print("saving HF weights...")
    model.save_pretrained(new_path)
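A minimal usage sketch for the converter above, assuming a TensorFlow-native T5 checkpoint already exists on disk; both paths are hypothetical placeholders:

# Convert a TF T5 checkpoint into Hugging Face format (paths are placeholders).
convert_model(
    base_model="t5-base",
    path="/checkpoints/t5_tf",      # hypothetical TF checkpoint directory
    new_path="/checkpoints/t5_hf",  # hypothetical output directory
)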
def __init__(self, class_count: int, label_str: str, model_name_str: str = 't5-base'):
    config = T5Config.from_pretrained(model_name_str)
    tokenizer = T5Tokenizer.from_pretrained(model_name_str)
    super().__init__(class_count, label_str, config, tokenizer, model_name_str)
def load_model(self):
    file = pathlib.Path('{}/pytorch_model.bin'.format(self.working_folder))
    if file.exists():
        self.model = T5ForConditionalGeneration.from_pretrained(self.working_folder)
    else:
        config = T5Config.from_pretrained(self.model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name, config=config)
        self.model.save_pretrained(self.working_folder)
def __init__(self, model_or_model_path, onnx_model_sessions):
    config = T5Config.from_pretrained(model_or_model_path)
    super().__init__(config)

    assert len(onnx_model_sessions) == 3, "all three models should be given"
    encoder_sess, decoder_sess, decoder_sess_init = onnx_model_sessions

    self.encoder = T5Encoder(encoder_sess)
    self.decoder = T5Decoder(decoder_sess)
    self.decoder_init = T5DecoderInit(decoder_sess_init)
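A usage sketch for the constructor above, assuming the three decomposed graphs (encoder, incremental decoder, first-step decoder) were exported beforehand and onnxruntime is the session backend; the file names and the wrapper class name `OnnxT5` are illustrative, not taken from the snippet:

import onnxruntime as ort

# Hypothetical exported graphs matching the three expected sessions.
onnx_model_sessions = (
    ort.InferenceSession("t5_encoder.onnx"),
    ort.InferenceSession("t5_decoder.onnx"),
    ort.InferenceSession("t5_decoder_init.onnx"),
)
model = OnnxT5("t5-base", onnx_model_sessions)  # class name assumed for illustration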
def __init__(self, checkpoint='model.ckpt-1004000', base_model='t5-base', num_samples=3,
             batch_size=4, doc_attr="text", append=False, out_attr="querygen", verbose=True):
    self.num_samples = num_samples
    self.doc_attr = doc_attr
    self.append = append
    self.out_attr = out_attr
    if append:
        assert out_attr == 'querygen', "append=True cannot be combined with a custom out_attr"
    self.verbose = verbose
    self.batch_size = batch_size
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.pattern = re.compile("^\\s*http\\S+")
    self.tokenizer = T5Tokenizer.from_pretrained(base_model)
    config = T5Config.from_pretrained(base_model)
    self.model = T5ForConditionalGeneration.from_pretrained(checkpoint, from_tf=True, config=config)
    self.model.to(self.device)
    self.model.eval()

    def _add_attr(df):
        it = chunked(df.itertuples(), self.batch_size)
        if self.verbose:
            it = pt.tqdm(it, total=len(df) / self.batch_size, unit='d')
        output = []
        for batch_rows in it:
            docs = [getattr(row, self.doc_attr) for row in batch_rows]
            gens = self._doc2query(docs)
            if self.append:
                gens = [
                    f'{getattr(row, self.doc_attr)} {gen}'
                    for row, gen in zip(batch_rows, gens)
                ]
            output.extend(gens)
        if self.append:
            df[self.doc_attr] = output  # replace doc content
        else:
            df[self.out_attr] = output  # add new column
        return df

    super().__init__(_add_attr)
    print("Doc2query using %s" % str(self.device))
def main(model_path: str, corpus: Corpus = "kaggle", split_name: str = "valid",
         max_len: int = 128, batch_size: int = 32):
    if "mt5" in Path(model_path).stem:
        tokenizer = MT5Tokenizer.from_pretrained(model_path)
        model = MT5ForConditionalGeneration(MT5Config.from_pretrained(model_path)).eval()
    else:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration(T5Config.from_pretrained(model_path)).eval()
        shrink_vocab(model_path, model)
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    model.load_state_dict(torch.load(Path(model_path) / "pytorch_model.bin"))
    model = model.cuda()
    context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
    context_tokens_2 = tokenizer.encode("premise:")[:-1]
    collate_fn = partial(
        collate_batch,
        pad=model.config.decoder_start_token_id,
        decode_start_token=model.config.pad_token_id,
        max_len=max_len,
        is_classifier=True,
    )
    dataset = XNLIDataset(corpus, split_name + ".jbl", context_tokens_1, context_tokens_2)
    data_loader = DataLoader(
        dataset, num_workers=1, shuffle=False, drop_last=False,
        batch_size=batch_size, collate_fn=collate_fn)
    preds, labels = [], []
    for input_batch, label_batch in tqdm(data_loader, ncols=100):
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = torch.argmax(outputs["logits"][:, 0, :].cpu(), dim=-1)
        preds.append(preds_local.numpy())
        labels.append(np.asarray([x[0] for x in label_batch["ids"].cpu().numpy()]))
    full_labels = np.concatenate(labels)
    full_preds = np.concatenate(preds)
    print("Labels:")
    print(pd.Series(full_labels).value_counts())
    print("Predictions:")
    print(pd.Series(full_preds).value_counts())
    print("Acc: %.2f%%" % (np.mean(full_labels == full_preds) * 100))
def from_pretrained(clz, config, do_not_download_weights=False, **kwargs):
    cfg = T5Config.from_pretrained(config['reader_transformer_type'],
                                   cache_dir=config["transformers_cache"])
    cfg.attention_probs_dropout_prob = config["attention_dropout"]
    cfg.hidden_dropout_prob = config["hidden_dropout"]
    cfg.fusion_strategy = config["fusion_strategy"]
    cfg.custom_config = config
    if do_not_download_weights:
        return T5FusionInDecoder(config=cfg)
    return super(T5FusionInDecoder, clz).from_pretrained(
        config['reader_transformer_type'],
        config=cfg,
        cache_dir=config["transformers_cache"],
        **kwargs)
def init_from_base_t5_model(model_name_or_path='t5-base', output_root='./'):
    os.makedirs(output_root, exist_ok=True)
    tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
    model_config = T5Config.from_pretrained(model_name_or_path)
    tokenizer.save_pretrained(output_root)

    model = T5Siamese(config=model_config)
    model.encoder_left = T5EncoderModel.from_pretrained(model_name_or_path)
    model.encoder_right = T5EncoderModel.from_pretrained(model_name_or_path)

    model.save_pretrained(output_root)
    model_config.save_pretrained(output_root)
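A short usage sketch: one call writes tokenizer, config, and both encoder towers into a single directory that later runs can load directly; the output path is a placeholder:

# Initialize both towers of the siamese model from the same public checkpoint
# (output path is hypothetical).
init_from_base_t5_model(model_name_or_path='t5-base', output_root='./t5_siamese_init')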
def __init__(self, hparams: argparse.Namespace, num_labels=None, **config_kwargs) -> 'T5QaModel':
    super().__init__()
    self.hparams = hparams
    cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    self.config = T5Config.from_pretrained(
        self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
        **({"num_labels": num_labels} if num_labels is not None else {}),
        cache_dir=cache_dir,
        **config_kwargs,
    )
    self.tokenizer = T5Tokenizer.from_pretrained(
        self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
        cache_dir=cache_dir,
    )
    self.model = T5ForConditionalGeneration.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=self.config,
        cache_dir=cache_dir,
    )
    # fix for eos token id problem
    # see https://github.com/huggingface/transformers/issues/5142 for more info on the problem and workaround
    if self.tokenizer.eos_token_id == 1:
        self.tokenizer.add_special_tokens({'eos_token': '[EOS]'})
        self.model.resize_token_embeddings(len(self.tokenizer))
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.input_dir,
        max_source_length=1024,
        max_target_length=56,
    )
    self.loss_names = ["loss"]
    self.metric_names = ROUGE_KEYS
    self.val_metric = "rouge2"
def load(self) -> T5ForConditionalGeneration:
    try:
        if not self.flush_cache:
            return self._fix_t5_model(
                T5ForConditionalGeneration.from_pretrained(
                    str(self.model_cache_dir), from_tf=True, force_download=False))
    except (RuntimeError, OSError):
        logging.info('T5 model weights not in cache.')
    m = re.search(r'model_checkpoint_path: "(.+?)"', self.ckpt_prefix)
    assert m is not None, 'checkpoint file malformed'

    # Copy over checkpoint data
    ckpt_patt = re.compile(rf'^{m.group(1)}\.(data-\d+-of-\d+|index|meta)$')
    for name in file_io.list_directory(self.url):
        if not ckpt_patt.match(name):
            continue
        url = os.path.join(self.url, name)
        url_stat = file_io.stat(url)
        cache_file_path = self.model_cache_dir / ckpt_patt.sub(rf'{TRANSFO_PREFIX}.\1', name)
        try:
            cs = os.stat(str(cache_file_path))
            if (cs.st_size == url_stat.length and cs.st_mtime_ns > url_stat.mtime_nsec
                    and not self.flush_cache):
                logging.info(f'Skipping {name}...')
                continue
        except FileNotFoundError:
            pass
        logging.info(f'Caching {name}...')
        file_io.copy(url, str(cache_file_path), overwrite=True)

    # Transformers expects a model config.json
    config = T5Config.from_pretrained(self.model_type)
    with open(str(self.model_cache_dir / 'config.json'), 'w') as f:
        json.dump(config.__dict__, f, indent=4)
    return self._fix_t5_model(
        T5ForConditionalGeneration.from_pretrained(
            str(self.model_cache_dir), from_tf=True, force_download=False))
def load_pretrained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a pretrained T5 model fine-tuned on UnifiedQA.

    base_model: base model name for T5
    model_dict_path: trained model checkpoint for UnifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)
    return tokenizer, model
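A usage sketch for the loader above, assuming a UnifiedQA TF checkpoint directory; the path and device string are placeholders:

# Build a fresh T5 and load UnifiedQA TF weights into it (path is a placeholder).
tokenizer, model = load_pretrained_model_and_tokenizer(
    base_model="t5-base",
    model_dict_path="/checkpoints/unifiedqa",
    gpu_device="cuda:0",
)
model.eval()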
def create_t2t_model(model_name_or_path, args, tokenizer=None, from_pretrained=True):
    ## transformer encoder
    if from_pretrained:
        encoder = TFT5ForConditionalGeneration.from_pretrained(model_name_or_path)
        encoder_config = encoder.config
    else:
        encoder_config = T5Config.from_pretrained(args.model_select)
        if tokenizer is not None:
            assert encoder_config.vocab_size == len(tokenizer)
            assert encoder_config.pad_token_id == tokenizer.pad_token_id
            assert encoder_config.eos_token_id == tokenizer.eos_token_id
            assert encoder_config.decoder_start_token_id == tokenizer.pad_token_id
        encoder = TFT5ForConditionalGeneration(encoder_config)
        # build the model with dummy_inputs
        encoder(encoder.dummy_inputs, training=False)
    if not os.path.isfile(os.path.join(args.output_path, "config.json")):
        encoder_config.save_pretrained(args.output_path)
    return encoder
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    config = T5Config.from_pretrained(
        self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
    )
    self.tokenizer = T5Tokenizer.from_pretrained(
        self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
        do_lower_case=self.hparams.do_lower_case,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
    )
    self.model = T5ForConditionalGeneration.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=config,
        cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
    )
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_t5 import load_tf_weights_in_t5
from flask import Flask, request, jsonify

app = Flask(__name__)

base_model = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
load_tf_weights_in_t5(model, None, "/data/")
model.eval()

ret_dict = {
    'low air quality': 'LowAirQuality',
    'low humidity': 'LowHumidity',
    'low brightness': 'LowBrightness',
    'low noise level': 'LowNoise',
    'low security': 'LowSecurity',
    'low temperature': 'LowTemperature',
    'high air quality': 'HighAirQuality',
    'high humidity': 'HighHumidity',
    'high brightness': 'HighBrightness',
    'high noise level': 'HighNoise',
    'high security': 'HighSecurity',
    'high temperature': 'HighTemperature',
}

def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
MODEL_PATH = os.environ.get("MODEL_PATH", "/data/model.pth")
BASE_MODEL = os.environ.get("BASE_MODEL", "t5-base")
DECODING = os.environ.get("DECODING", "greedy")  # greedy, topk-N (e.g., topk-10)

cuda = torch.cuda.is_available()
if cuda:
    torch.cuda.set_device(0)  # single gpu
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info(f"question generation is set to run on {device}")

# init model
logger.info("question generation model is preparing...")
config = T5Config.from_pretrained(BASE_MODEL)
model = T5ForConditionalGeneration(config=config)
t = QGTokenizer(tokenizer=BASE_MODEL)
checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()
if cuda:
    model.cuda()
logger.info("question generation model is ready")

app = Flask(__name__)

@app.route("/question", methods=["POST"])
def respond():
def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_folder_path):
    config = T5Config.from_pretrained(config_name)
    flax_model = FlaxT5ForConditionalGeneration(config=config)
    t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)

    split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["layers_0"]["mlp"]

    # Encoder
    for layer_index in range(config.num_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["encoder"][layer_name]["attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["encoder"][layer_name]["attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["encoder"][layer_name]["attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["encoder"][layer_name]["attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_attention_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_attention_layer_norm"]["scale"]

        # MLP
        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["encoder"][layer_name]["mlp"]["wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["encoder"][layer_name]["mlp"]["wo"]["kernel"]

        # Layer Normalization
        t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_attention_layer_norm

        if split_mlp_wi:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_mlp_layer_norm

    # Only for layer 0:
    t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"]["rel_embedding"].T
    flax_model.params["encoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_encoder_rel_embedding

    # Assigning
    t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"]
    flax_model.params["encoder"]["final_layer_norm"]["weight"] = t5x_encoder_norm

    # Decoder
    for layer_index in range(config.num_decoder_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["decoder"][layer_name]["self_attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["decoder"][layer_name]["self_attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["decoder"][layer_name]["self_attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["decoder"][layer_name]["self_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_self_attention_layer_norm"]["scale"]

        # Encoder-Decoder-Attention
        t5x_enc_dec_attention_key = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["key"]["kernel"]
        t5x_enc_dec_attention_out = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["out"]["kernel"]
        t5x_enc_dec_attention_query = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["query"]["kernel"]
        t5x_enc_dec_attention_value = t5x_model["target"]["decoder"][layer_name]["encoder_decoder_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_cross_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_cross_attention_layer_norm"]["scale"]

        # MLP
        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["decoder"][layer_name]["mlp"]["wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["decoder"][layer_name]["mlp"]["wo"]["kernel"]

        # Layer Normalization
        t5x_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = t5x_pre_attention_layer_norm

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["k"]["kernel"] = t5x_enc_dec_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = t5x_cross_layer_norm

        if split_mlp_wi:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["layer_norm"]["weight"] = t5x_mlp_layer_norm

    # Decoder Normalization
    t5x_decoder_norm = t5x_model["target"]["decoder"]["decoder_norm"]["scale"]
    flax_model.params["decoder"]["final_layer_norm"]["weight"] = t5x_decoder_norm

    # Only for layer 0:
    t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["relpos_bias"]["rel_embedding"].T
    flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"]["relative_attention_bias"]["embedding"] = t5x_decoder_rel_embedding

    # Token Embeddings
    t5x_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"]
    flax_model.params["shared"]["embedding"] = t5x_token_embeddings

    # LM Head (only in v1.1 checkpoints)
    if "logits_dense" in t5x_model["target"]["decoder"]:
        flax_model.params["lm_head"]["kernel"] = t5x_model["target"]["decoder"]["logits_dense"]["kernel"]

    flax_model.save_pretrained(flax_dump_folder_path)
    print("T5X model was successfully converted!")
def convert_model(args):
    if os.path.exists(args.decoder_onnx):
        print(f"skip convert_to_onnx since path existed: {args.decoder_onnx}")
    else:
        assert args.model_type == "gpt2", "please provide a ready ONNX model for model types other than gpt2"
        gpt2_to_onnx(args)

    # TODO: fix shape inference for T5. Currently symbolic shape inference on T5 is broken.
    enable_shape_inference = args.model_type == "gpt2"
    if enable_shape_inference:
        print(f"Run symbolic shape inference on {args.decoder_onnx}. The file will be overwritten.")
        shape_inference(args.decoder_onnx)

    global config
    if args.model_type == "gpt2":
        config = GPT2Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        config = T5Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    print(config)

    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    # if vocab_size is given in parameters use that.
    if args.vocab_size != -1:
        vocab_size = args.vocab_size

    model = onnx.load(args.decoder_onnx)
    model.graph.name = f"{args.model_type} decoder subgraph"

    if args.model_type == "gpt2":
        verify_gpt2_subgraph(model.graph, args.precision)
    else:
        verify_t5_decoder_subgraph(model.graph, args.precision)

    inputs = [
        "input_ids",
        "max_length",
        "min_length",
        "num_beams",
        "num_return_sequences",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "vocab_mask",
    ]
    if args.prefix_vocab_mask:
        inputs.append("prefix_vocab_mask")

    outputs = ["sequences"]
    if args.output_sequences_scores:
        outputs.append("sequences_scores")
    if args.output_token_scores:
        assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
        outputs.append("scores")

    node = helper.make_node(
        "BeamSearch",
        inputs=inputs,
        outputs=outputs,
        name=f"BeamSearch_{args.model_type}",
    )
    node.domain = "com.microsoft"
    node.attribute.extend(
        [
            helper.make_attribute("eos_token_id", eos_token_id),
            helper.make_attribute("pad_token_id", pad_token_id),
            helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
            helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
            helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
            helper.make_attribute("decoder", model.graph),
        ]
    )

    if args.model_type == "t5":
        if enable_shape_inference:
            print(f"Run symbolic shape inference on {args.encoder_decoder_init_onnx}. The file will be overwritten.")
            shape_inference(args.encoder_decoder_init_onnx)
        init_model = onnx.load(args.encoder_decoder_init_onnx)
        init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
        verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
        node.attribute.extend(
            [
                helper.make_attribute("encoder_decoder_init", init_model.graph),
            ]
        )

    from onnx import TensorProto

    # graph inputs
    input_ids = helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"])
    max_length = helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
    min_length = helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
    num_beams = helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
    num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])
    temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1])
    length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])
    repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
    vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size])

    graph_inputs = [
        input_ids,
        max_length,
        min_length,
        num_beams,
        num_return_sequences,
        temperature,
        length_penalty,
        repetition_penalty,
        vocab_mask,
    ]
    if args.prefix_vocab_mask:
        prefix_vocab_mask = helper.make_tensor_value_info(
            "prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size]
        )
        graph_inputs.append(prefix_vocab_mask)

    # graph outputs
    sequences = helper.make_tensor_value_info(
        "sequences",
        TensorProto.INT32,
        ["batch_size", "num_return_sequences", "max_length"],
    )
    sequences_scores = helper.make_tensor_value_info(
        "sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
    )
    scores = helper.make_tensor_value_info(
        "scores",
        TensorProto.FLOAT,
        ["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
    )

    initializers = []
    graph_outputs = [sequences]
    if args.output_sequences_scores:
        graph_outputs.append(sequences_scores)
    if args.output_token_scores:
        graph_outputs.append(scores)

    new_graph = helper.make_graph(
        [node],
        f"{args.model_type}-beam-search",
        graph_inputs,
        graph_outputs,
        initializers,
    )

    # Create the model
    new_model = helper.make_model(
        new_graph,
        producer_name="onnxruntime.transformers",
        opset_imports=model.opset_import,
    )
    onnx.save(new_model, args.output)
from tqdm.notebook import tqdm
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration

print("Downloading and unzipping model.", file=sys.stderr)
os.system("wget -nc https://storage.googleapis.com/doctttttquery_git/t5-base.zip")
os.system("unzip -o t5-base.zip")
nltk.download('punkt')

# Define the target device. Use GPU if available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate and load the QG model to the GPU.
qg_tokenizer = T5Tokenizer.from_pretrained('t5-base')
qg_config = T5Config.from_pretrained('t5-base')
qg_model = T5ForConditionalGeneration.from_pretrained('model.ckpt-1004000', from_tf=True, config=qg_config)
qg_model.to(device)

def preprocess(document: str, span=10, stride=5) -> List[str]:
    """
    Define your preprocessing function. This function should take a corpus
    document and output a list of generation spans. This is required so we
    can match the expected sequence size of the generation model.
    """
def __init__(
    self,
    model_name,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face
            Transformers compatible pre-trained model, a community model, or the path to a directory
            containing model files.
        args (optional): Default args will be used if this parameter is not provided. If provided,
            it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other
            options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa
    if args and "manual_seed" in args:
        random.seed(args["manual_seed"])
        np.random.seed(args["manual_seed"])
        torch.manual_seed(args["manual_seed"])
        if "n_gpu" in args and args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(args["manual_seed"])

    self.args = {
        "dataset_class": None,
        "do_sample": False,
        "max_steps": -1,
        "evaluate_generated_text": False,
        "num_beams": 1,
        "max_length": 20,
        "repetition_penalty": 1.0,
        "length_penalty": 2.0,
        "early_stopping": True,
        "preprocess_inputs": True,
    }
    self.args.update(global_args)
    if args:
        self.args.update(args)

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable. "
                "Make sure CUDA is available or set `use_cuda=False`.")
    else:
        self.device = "cpu"

    self.results = {}

    self.config = T5Config.from_pretrained(model_name, **self.args["config"])
    self.model = T5ForConditionalGeneration.from_pretrained(model_name, config=self.config)
    self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    if not use_cuda:
        self.args["fp16"] = False

    self.args["model_name"] = model_name

    if self.args["wandb_project"] and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args["wandb_project"] = None
def _build_vocab(self, max_vocab_cnt):
    # build vocab
    if self.tokenizer_type.startswith('word'):
        self._build_vocab_manual(max_vocab_cnt)

    elif self.tokenizer_type.startswith('bert-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        config_pretrained = BertConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}

    elif self.tokenizer_type.startswith('xlnet-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        config_pretrained = XLNetConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    elif self.tokenizer_type.startswith('t5-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        config_pretrained = T5Config.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    elif self.tokenizer_type.startswith('bart-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        config_pretrained = BartConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
    return
print("starting to train") # train word_tokens_train, pos_tokens_train = tasks.pos('UD_English-EWT/en_ewt-ud-train.conllu') tokenizer = T5Tokenizer.from_pretrained("t5-small") ## i want to append pos: - do I include the pos token associated with it? if args.control: word_tokens_train, pos_tokens_train = tasks.make_control(tokenizer, word_tokens_train, pos_tokens_train, args.embsize) torch_ids_train, torch_masks_train, torch_token_starts, torch_labels_train = r.prepare_data(tokenizer, word_tokens_train, pos_tokens_train) # data for training split = int(0.75 * len(torch_ids_train)) #dataset_train = Dataset(torch_ids_train[:split], torch_masks_train[:split], torch_labels_train[:split]) #dataset_dev = Dataset(torch_ids_train[split:], torch_masks_train[split:], torch_labels_train[split:]) config = T5Config.from_pretrained("t5-small", output_hidden_states=True, output_attentions=True) model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config) model.to(device) #train(model, dataset_train, dataset_dev, torch_token_starts[split:], tokenizer) # 100 values test dataset_train = Dataset(torch_ids_train[:200], torch_masks_train[:200], torch_labels_train[:200]) dataset_dev = Dataset(torch_ids_train[200:400], torch_masks_train[200:400], torch_labels_train[200:400]) train(model, dataset_train, dataset_dev, torch_token_starts[200:400], tokenizer) print("done!") else: print("starting to evaluate") tokenizer = T5Tokenizer.from_pretrained("t5-small")
columns = [
    'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'
]
encoded.set_format(type='torch', columns=columns)

train_dataloader = torch.utils.data.DataLoader(
    encoded["train"], collate_fn=collate_fn, batch_size=args.batch_size)
val_dataloader = torch.utils.data.DataLoader(
    encoded["validation"], collate_fn=collate_fn, batch_size=args.batch_size * 4)

if args.from_pretrained:
    model = T5ForConditionalGeneration.from_pretrained(args.model_select)
else:
    config = T5Config.from_pretrained(args.model_select)
    model = T5ForConditionalGeneration(config)

no_decay = ["bias", "LayerNorm.weight"]
params_decay = [
    p for n, p in model.named_parameters()
    if not any(nd in n for nd in no_decay)
]
params_nodecay = [
    p for n, p in model.named_parameters()
    if any(nd in n for nd in no_decay)
]
optim_groups = [
    {
        "params": params_decay,
        "weight_decay": 0.1
if model_args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)
elif model_args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
        "You can do it from another script, save it, and load it from here, using --tokenizer_name.")

if model_args.config_name:
    config = T5Config.from_pretrained(
        model_args.config_name,
        cache_dir=model_args.cache_dir,
        vocab_size=len(tokenizer))
elif model_args.model_name_or_path:
    config = T5Config.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        vocab_size=len(tokenizer))
else:
    config = CONFIG_MAPPING[model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")

# Preprocessing the datasets.
# First we tokenize all the texts.
if training_args.do_train:
    column_names = datasets["train"].column_names
else:
    column_names = datasets["validation"].column_names
def __init__(self, tokenizer):
    super(T5Model, self).__init__()
    self.tokenizer = tokenizer
    config = T5Config.from_pretrained('t5-small')
    self.model = T5ForConditionalGeneration(config=config)
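A minimal instantiation sketch for the wrapper above; it assumes the surrounding module already imports T5Tokenizer:

tokenizer = T5Tokenizer.from_pretrained('t5-small')
wrapper = T5Model(tokenizer)  # weights are randomly initialized, since only the config is loaded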
def __init__(
    self,
    model_name,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face
            Transformers compatible pre-trained model, a community model, or the path to a directory
            containing model files.
        args (optional): Default args will be used if this parameter is not provided. If provided,
            it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other
            options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa
    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, T5Args):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = {
            key: value["value"]
            for key, value in sweep_config.as_dict().items()
            if key != "_wandb"
        }
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable. "
                "Make sure CUDA is available or set `use_cuda=False`.")
    else:
        self.device = "cpu"

    self.results = {}

    self.config = T5Config.from_pretrained(model_name, **self.args.config)
    self.model = T5ForConditionalGeneration.from_pretrained(model_name, config=self.config)
    self.tokenizer = T5Tokenizer.from_pretrained(model_name, truncate=True)

    if self.args.dynamic_quantize:
        self.model = torch.quantization.quantize_dynamic(
            self.model, {torch.nn.Linear}, dtype=torch.qint8)

    if not use_cuda:
        self.args.fp16 = False

    self.args.model_type = "T5"
    self.args.model_name = model_name

    if self.args.wandb_project and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None
# This is a very small notebook showing how to grab a pre-trained T5 model, fine-tune it, and export it to onnx.
# A lot of this is inspired by huggingface.
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, AdamW
import torch
from onnxt5 import generate_onnx_representation, GenerativeT5
from onnxt5.api import get_sess
import tempfile

temp_dir = tempfile.gettempdir()
base_model = "t5-base"

# Setting up the model and tokenizer
config = T5Config.from_pretrained(base_model)
config.n_positions = 256  # You can change the properties of your model here
model = T5ForConditionalGeneration(config=config)
# Download vocab file
tokenizer = T5Tokenizer(config=config, vocab_file="test_sentencepiece.model")
model.train()

# Let's set up our optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay':
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to
    # maintain them. The information sent is the one passed as arguments along with your
    # Python/PyTorch versions.
    send_example_telemetry("run_t5_mlm", model_args, data_args, framework="flax")

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        level=logging.INFO,
        datefmt="[%X]",
    )

    # Log on each process the small summary:
    logger = logging.getLogger(__name__)

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Handle the repository creation
    if training_args.push_to_hub:
        if training_args.hub_model_id is None:
            repo_name = get_full_repo_name(
                Path(training_args.output_dir).absolute().name,
                token=training_args.hub_token)
        else:
            repo_name = training_args.hub_model_id
        repo = Repository(training_args.output_dir, clone_from=repo_name)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation
    # files (see below) or just provide the name of one of the public datasets available on
    # the hub at https://huggingface.co/datasets/ (the dataset will be downloaded
    # automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column
    # if no column called 'text' is found. You can easily tweak this behavior (see below).
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict,
    # pandas DataFrame, etc) at https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name.")

    if model_args.config_name:
        config = T5Config.from_pretrained(
            model_args.config_name,
            cache_dir=model_args.cache_dir,
            vocab_size=len(tokenizer),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        config = T5Config.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Otherwise, we tokenize every text, then concatenate them together before splitting
    # them in smaller parts.
    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name], return_attention_mask=False)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # T5-like span masked language modeling will fuse consecutively masked tokens to a single
    # sentinel token. To ensure that the input length is `max_seq_length`, we need to increase
    # the maximum length according to `mlm_probability` and `mean_noise_span_length`.
    # We can also define the label length accordingly.
    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
        inputs_length=max_seq_length,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
    )

    # Main data processing function that will concatenate all texts from our dataset and
    # generate chunks of expanded_inputs_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could add padding if the model supported it instead
        # of this drop. You can customize this part to your needs.
        if total_length >= expanded_inputs_length:
            total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
        # Split by chunks of max_len.
        result = {
            k: [t[i:i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts
    # throws away a remainder for each of those groups of 1,000 texts. You can adjust that
    # batch_size here, but a higher value might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method
    # for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Enable tensorboard only on the master node
    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
        try:
            from flax.metrics.tensorboard import SummaryWriter

            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
            logger.warning(
                f"Unable to display metrics through TensorBoard because some packages are not installed: {ie}")
    else:
        logger.warning(
            "Unable to display metrics through TensorBoard because the package is not installed: "
            "Please run pip install tensorboard to enable.")

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())

    if model_args.model_name_or_path:
        model = FlaxT5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config.vocab_size = len(tokenizer)
        model = FlaxT5ForConditionalGeneration(
            config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
        )

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = FlaxDataCollatorForT5MLM(
        tokenizer=tokenizer,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
        input_length=max_seq_length,
        target_length=targets_length,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # Store some constants
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
    eval_batch_size = per_device_eval_batch_size * jax.device_count()

    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs

    num_of_hosts = jax.process_count()
    current_host_idx = jax.process_index()

    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
        init_value=0.0,
        end_value=training_args.learning_rate,
        transition_steps=training_args.warmup_steps)
    decay_fn = optax.linear_schedule(
        init_value=training_args.learning_rate,
        end_value=0,
        transition_steps=num_train_steps - training_args.warmup_steps,
    )
    linear_decay_lr_schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn],
        boundaries=[training_args.warmup_steps])

    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        # find out all LayerNorm parameters
        layer_norm_candidates = ["layernorm", "layer_norm", "ln"]
        layer_norm_named_params = set([
            layer[-2:]
            for layer_norm_name in layer_norm_candidates
            for layer in flat_params.keys()
            if layer_norm_name in "".join(layer).lower()
        ])
        flat_mask = {
            path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params)
            for path in flat_params
        }
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
    if training_args.adafactor:
        # We use the default parameters here to initialize adafactor.
        # For more details about the parameters please check
        # https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
        optimizer = optax.adafactor(
            learning_rate=linear_decay_lr_schedule_fn,
        )
    else:
        optimizer = optax.adamw(
            learning_rate=linear_decay_lr_schedule_fn,
            b1=training_args.adam_beta1,
            b2=training_args.adam_beta2,
            weight_decay=training_args.weight_decay,
            mask=decay_mask_fn,
        )

    # Setup train state
    state = train_state.TrainState.create(
        apply_fn=model.__call__, params=model.params, tx=optimizer)

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)

        def loss_fn(params):
            labels = batch.pop("labels")
            logits = state.apply_fn(
                **batch, params=params, dropout_rng=dropout_rng, train=True)[0]
            # compute loss
            loss = optax.softmax_cross_entropy(
                logits, onehot(labels, logits.shape[-1])).mean()
            return loss

        grad_fn = jax.value_and_grad(loss_fn)
        loss, grad = grad_fn(state.params)
        grad = jax.lax.pmean(grad, "batch")
        new_state = state.apply_gradients(grads=grad)

        metrics = jax.lax.pmean(
            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)},
            axis_name="batch")

        return new_state, metrics, new_dropout_rng

    # Create parallel version of the train step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))

    # Define eval fn
    def eval_step(params, batch):
        labels = batch.pop("labels")

        logits = model(**batch, params=params, train=False)[0]

        # compute loss
        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))

        # compute accuracy
        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels)

        # summarize metrics
        metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()}
        metrics = jax.lax.pmean(metrics, axis_name="batch")

        return metrics

    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))

    # Replicate the train state on each device
    state = jax_utils.replicate(state)

    train_time = 0
    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
        train_metrics = []

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)

        # Generate an epoch by shuffling sampling indices from the train dataset
        num_train_samples = len(tokenized_datasets["train"])
        # Avoid using jax.numpy here in case of TPU training
        train_samples_idx = np.random.permutation(np.arange(num_train_samples))
        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)

        # Gather the indexes for creating the batch and do a training step
        for step, batch_idx in enumerate(
                tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples)

            local_host_model_inputs = {
                key: np.split(model_inputs.data[key], num_of_hosts, axis=0)[current_host_idx]
                for key, value in model_inputs.data.items()
            }

            # Model forward
            model_inputs = shard(local_host_model_inputs)
            state, train_metric, dropout_rngs = p_train_step(
                state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

            cur_step = epoch * (num_train_samples // train_batch_size) + step

            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate:"
                    f" {train_metric['learning_rate'].mean()})")

                train_metrics = []

            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
                # ======================== Evaluating ==============================
                num_eval_samples = len(tokenized_datasets["validation"])
                # Avoid using jax.numpy here in case of TPU training
                eval_samples_idx = np.arange(num_eval_samples)
                eval_batch_idx = generate_batch_splits(
                    eval_samples_idx, eval_batch_size, drop_last=False)

                eval_metrics = []
                for i, batch_idx in enumerate(
                        tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
                    model_inputs = data_collator(samples)

                    # Model forward
                    metrics = pad_shard_unpad(p_eval_step, static_return=True)(
                        state.params,
                        model_inputs.data,
                        min_device_batch=per_device_eval_batch_size)
                    eval_metrics.append(metrics)

                # get eval metrics
                eval_metrics = get_metrics(eval_metrics)
                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

                # Update progress bar
                epochs.write(
                    f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})")

                # Save metrics
                if has_tensorboard and jax.process_index() == 0:
                    write_eval_metric(summary_writer, eval_metrics, cur_step)

            if cur_step % training_args.save_steps == 0 and cur_step > 0:
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
                    model.save_pretrained(training_args.output_dir, params=params)
                    tokenizer.save_pretrained(training_args.output_dir)
                    if training_args.push_to_hub:
                        repo.push_to_hub(
                            commit_message=f"Saving weights and logs of step {cur_step}",
                            blocking=False)

    # Eval after training
    if training_args.do_eval:
        num_eval_samples = len(tokenized_datasets["validation"])
        # Avoid using jax.numpy here in case of TPU training
        eval_samples_idx = np.arange(num_eval_samples)
        eval_batch_idx = generate_batch_splits(
            eval_samples_idx, eval_batch_size, drop_last=False)

        eval_metrics = []
        for i, batch_idx in enumerate(
                tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples)

            # Model forward
            metrics = pad_shard_unpad(p_eval_step, static_return=True)(
                state.params,
                model_inputs.data,
                min_device_batch=per_device_eval_batch_size)
            eval_metrics.append(metrics)

        # get eval metrics
        eval_metrics = get_metrics(eval_metrics)
        eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(), eval_metrics)

        if jax.process_index() == 0:
            eval_metrics = {
                f"eval_{metric_name}": value
                for metric_name, value in eval_metrics.items()
            }
            path = os.path.join(training_args.output_dir, "eval_results.json")
            with open(path, "w") as f:
                json.dump(eval_metrics, f, indent=4, sort_keys=True)
    return torch.tensor(LA.norm(mat, n).item())

for dirname in ['./models/11b/heads']:
    # for your case, replace dirname with the path to your model file
    seeds = [0, 1, 2, 3, 4]
    for seed in seeds:
        gc.collect()
        results_encoder = defaultdict(list)
        results_decoder = defaultdict(list)
        table_file_decoder = open(f'l1_decoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        table_file_encoder = open(f'l1_encoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        config = T5Config.from_pretrained(f'{dirname}-{seed}')
        model = T5ForConditionalGeneration.from_pretrained(f'{dirname}-{seed}', config=config)
        org_config = T5Config.from_pretrained('./models/11b')
        org_model = T5ForConditionalGeneration.from_pretrained('./models/11b', config=org_config)
        org_dict = org_model.state_dict()
        trained_dict = model.state_dict()
        for encoder_n in range(24):
            print("seed", seed, encoder_n)
            q_org = org_dict[f'encoder.block.{encoder_n}.layer.0.SelfAttention.q.weight']
            q_new = trained_dict[