import argparse
import logging
import os
import sys

import sentencepiece as spm
from transformers import MBartTokenizer, T5Tokenizer


def main():
    parser = argparse.ArgumentParser(
        description="Reduce a multilingual transformer tokenizer's vocabulary size")
    parser.add_argument("--source_model", type=str, required=True,
                        help="The multilingual transformer to start from")
    parser.add_argument("--custom_corpus", type=str, required=True,
                        help="A custom corpus, similar to the target corpus, with a limited token set")
    parser.add_argument("--vocab_size", type=int, required=True,
                        help="Target vocabulary size")
    parser.add_argument("--output_model", type=str, required=True,
                        help="The name of the final reduced model")
    args = parser.parse_args()

    # Load the original tokenizer
    logging.info('starting from model: %s', args.source_model)
    if "mt5" in args.source_model:
        tokenizer = T5Tokenizer.from_pretrained(args.source_model)
        model_type = "mt5"
    elif "mbart" in args.source_model:
        tokenizer = MBartTokenizer.from_pretrained(args.source_model)
        model_type = "mbart"
    else:
        logging.info("model type not supported...")
        sys.exit(1)

    # Train a smaller BPE sentencepiece model on the custom corpus
    os.makedirs(args.output_model, exist_ok=True)  # SentencePiece needs the output dir to exist
    spm.SentencePieceTrainer.train(
        input=args.custom_corpus,
        model_prefix=os.path.join(args.output_model, "reduce_sentencepiece.bpe"),
        vocab_size=args.vocab_size,
        model_type="bpe",
        vocabulary_output_piece_score=False)
    bpe_model_path = os.path.join(args.output_model, "reduce_sentencepiece.bpe.model")

    # Wrap the new sentencepiece model in the matching tokenizer class and save it
    if model_type == "mt5":
        new_tokenizer = T5Tokenizer(vocab_file=bpe_model_path)
    else:  # model_type == "mbart"
        new_tokenizer = MBartTokenizer(vocab_file=bpe_model_path)
    new_tokenizer.save_pretrained(args.output_model)
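# Example invocation (the script name is hypothetical; the corpus path, vocab
# size, and output name mirror the defaults listed in the argparse setup above):
#
#   python reduce_tokenizer.py \
#       --source_model google/mt5-base \
#       --custom_corpus custom_corpus.txt \
#       --vocab_size 8000 \
#       --output_model output_model
#
# The reduced tokenizer can then be reloaded with
# T5Tokenizer.from_pretrained("output_model").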
def __init__(self, model_dir, **kwargs):
    # Load a fine-tuned mBART model and its tokenizer for GPU inference
    from models.tokenization_mbart import MBartTokenizer
    from models.modeling_mbart import MBartForConditionalGeneration
    self.model = MBartForConditionalGeneration.from_pretrained(model_dir)
    self.tokenizer = MBartTokenizer.from_pretrained(model_dir)
    self.model.cuda()
    self.model.eval()
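# A minimal companion inference method, sketched for illustration; the method
# name, greedy decoding, and max_length value are assumptions, not part of the
# original class, and it assumes the vendored tokenizer supports the
# `tokenizer(...)` call API.
def predict(self, text, max_length=128):
    import torch
    # Tokenize, move tensors to the model's device, and decode greedily
    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
    with torch.no_grad():
        output_ids = self.model.generate(**inputs, max_length=max_length)
    return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]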
def get_tokenizer(cfg):
    if cfg["train"]["model_name"] == "nezha":
        from models.tokenization_bert import BertTokenizer
        if not cfg["train"]["pretrained_model"]:
            logger.error("BERT vocab file not set, please check your bert_model_dir or trained_model_dir")
            return None
        tokenizer = BertTokenizer.from_pretrained(cfg["train"]["pretrained_model"])
        logger.info('vocab size is %d', len(tokenizer.vocab))
        return tokenizer
    elif cfg["train"]["model_name"] in ("bart", "mbart"):
        from models.tokenization_mbart import MBartTokenizer
        return MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
    elif cfg["train"]["model_name"] in ("t5", "mt5"):
        from models.tokenization_t5 import T5Tokenizer
        pretrained_tag = "/mnt/dl/public/pretrained_models/mt5-large"
        return T5Tokenizer.from_pretrained(pretrained_tag)
    elif cfg["train"]["model_name"] in ("bert", "hfl"):
        from models.tokenization_bert import BertTokenizer
        pretrained_tag = "/mnt/bigfiles/Models/pretrained_models/chinese_roberta_wwm_ext_pytorch"
        return BertTokenizer.from_pretrained(pretrained_tag)
    else:
        logger.error("cannot find the proper tokenizer type...")
        return None
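# Usage sketch: only the cfg keys this function actually reads are shown; real
# config files in this repo will carry more fields.
#
#   cfg = {"train": {"model_name": "mbart"}}
#   tokenizer = get_tokenizer(cfg)   # loads facebook/mbart-large-cc25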
import argparse
import logging
import os
import sys

from transformers import (MBartForConditionalGeneration, MBartTokenizer,
                          MT5ForConditionalGeneration, T5Tokenizer,
                          TFAutoModel)


def main():
    parser = argparse.ArgumentParser(description="Reduce a multilingual transformer's size")
    parser.add_argument("--source_model", type=str, required=True,
                        help="The multilingual transformer to start from")
    parser.add_argument("--vocab_file", type=str, required=True,
                        help="Path to the intended vocabulary file")
    parser.add_argument("--output_model", type=str, required=True,
                        help="The name of the final reduced model")
    parser.add_argument("--convert_to_tf", type=str2bool, required=False, default=False,
                        help="Whether to also generate a TensorFlow version")
    args = parser.parse_args()

    # Load original tokenizer, model and vocab
    logging.info('starting from model: %s', args.source_model)
    if "mt5" in args.source_model:
        tokenizer = T5Tokenizer.from_pretrained(args.source_model)
        model = MT5ForConditionalGeneration.from_pretrained(args.source_model)
        model_type = "mt5"
    elif "mbart" in args.source_model:
        tokenizer = MBartTokenizer.from_pretrained(args.source_model)
        model = MBartForConditionalGeneration.from_pretrained(args.source_model)
        model_type = "mbart"
    else:
        logging.info("model type not supported...")
        sys.exit(1)
    vocab = tokenizer.get_vocab()
    logging.info("%s - num_parameters : %d", args.source_model, model.num_parameters())
    logging.info("%s - num_tokens : %d", args.source_model, len(vocab))

    # Load new vocab (one token per line)
    new_vocab = open(args.vocab_file).read().splitlines()
    # TODO retrain tokenizer from corpus...
    # ...

    # Rebuild pytorch model with embeddings restricted to the new vocab
    new_embs = select_embeddings(model, vocab, new_vocab, model_type, args.output_model)

    # Optionally convert to tensorflow
    if args.convert_to_tf:
        if os.path.isfile(f"{args.output_model}/tf_model.h5"):
            logging.info(f"{args.output_model}/tf_model.h5 already exists")
        else:
            tf_model = TFAutoModel.from_pretrained(args.output_model, from_pt=True)
            tf_model.save_pretrained(args.output_model)
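# `select_embeddings` is used above but not defined in this snippet. The sketch
# below is a plausible stand-in, not the repo's actual implementation: it keeps
# the embedding rows of tokens that survive the vocabulary reduction, shrinks
# the model, and saves it. It assumes tied input/output embeddings; an untied
# lm_head would need the same row selection applied separately.
import torch

def select_embeddings(model, vocab, new_vocab, model_type, output_dir):
    old_embs = model.get_input_embeddings().weight.detach()
    kept = [tok for tok in new_vocab if tok in vocab]   # tokens present in both vocabs
    new_embs = torch.stack([old_embs[vocab[tok]] for tok in kept])
    model.resize_token_embeddings(len(kept))            # shrink embedding (and tied head) size
    model.get_input_embeddings().weight.data = new_embs
    model.save_pretrained(output_dir)
    return new_embs

# `str2bool` (the --convert_to_tf type above) is likewise defined elsewhere; a
# common argparse-compatible version:
def str2bool(v):
    return str(v).lower() in ("yes", "true", "t", "1")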
def get_tokenizer_and_model(cfg, label_map=None, num_labels=None):
    if num_labels is None:
        num_labels = cfg["data"]["num_labels"]
    tokenizer = None
    model = None
    ptd = get_pretrained_model_path(cfg)

    # Huawei nezha
    if cfg["train"]["model_name"] == "nezha":
        from models.tokenization_bert import BertTokenizer
        from models.modeling_nezha import (
            NeZhaForSequenceClassification,
            NeZhaForTokenClassification,
            NeZhaBiLSTMForTokenClassification,
            NeZhaForDocumentClassification,
            NeZhaForDocumentTagClassification,
            NeZhaForTagClassification
        )
        tokenizer = BertTokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] == "ner":
            if cfg["train"]["use_bilstm"]:
                _label_map = dict(label_map)
                model = NeZhaBiLSTMForTokenClassification.from_pretrained(
                    ptd, label_map=_label_map, num_labels=num_labels)
            else:
                model = NeZhaForTokenClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "textclf":
            if cfg["train"]["encode_document"]:
                model = NeZhaForDocumentClassification.from_pretrained(
                    ptd,
                    doc_inner_batch_size=cfg["train"]["doc_inner_batch_size"],
                    num_labels=num_labels)
            else:
                model = NeZhaForSequenceClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "tag":
            if cfg["train"]["encode_document"]:
                model = NeZhaForDocumentTagClassification.from_pretrained(
                    ptd,
                    doc_inner_batch_size=cfg["train"]["doc_inner_batch_size"],
                    num_labels=num_labels)
            else:
                model = NeZhaForTagClassification.from_pretrained(ptd, num_labels=num_labels)

    # google bert and XunFei (iFLYTEK) hfl
    elif cfg["train"]["model_name"] == "bert" or cfg["train"]["model_name"] == "hfl":
        from models.tokenization_bert import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] == "ner":
            from models.modeling_bert import BertForTokenClassification
            if cfg["train"]["use_bilstm"]:
                # FIXME Process BiLSTM
                # model = NeZhaBiLSTMForTokenClassification(bert_config, label_map, num_labels=num_labels)
                pass
            else:
                model = BertForTokenClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "textclf":
            from models.modeling_bert import BertForSequenceClassification
            if cfg["train"]["encode_document"]:
                # FIXME Process NeZhaForDocumentClassification
                # model = NeZhaForDocumentClassification(bert_config, cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
                pass
            else:
                model = BertForSequenceClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "tag":
            from models.modeling_bert import BertForTagClassification
            if cfg["train"]["encode_document"]:
                # FIXME Process NeZhaForDocumentTagClassification
                # model = NeZhaForDocumentTagClassification(bert_config, cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
                pass
            else:
                model = BertForTagClassification.from_pretrained(ptd, num_labels=num_labels)

    # facebook bart/mbart
    elif cfg["train"]["model_name"] == "bart" or cfg["train"]["model_name"] == "mbart":
        from models.tokenization_mbart import MBartTokenizer
        tokenizer = MBartTokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] in nlg_tasks:
            from models.modeling_mbart import MBartForConditionalGeneration
            gradient_checkpointing_flag = bool(cfg["train"]["gradient_checkpointing"])
            if gradient_checkpointing_flag:
                logger.info("gradient checkpointing enabled")
            model = MBartForConditionalGeneration.from_pretrained(
                ptd, gradient_checkpointing=gradient_checkpointing_flag)

    # google t5/mt5
    elif cfg["train"]["model_name"] == "t5" or cfg["train"]["model_name"] == "mt5":
        from models.tokenization_t5 import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] in nlg_tasks:
            from models.modeling_mt5 import MT5ForConditionalGeneration
            model = MT5ForConditionalGeneration.from_pretrained(ptd)

    else:
        logger.error("model type not supported!")

    assert tokenizer and model, "get tokenizer or model error"

    # Optionally freeze the encoder, keeping only its last layers trainable
    if cfg["train"]["freeze_encoder"] and "freeze_encoder" in dir(model):
        model.freeze_encoder()
        if "unfreeze_encoder_last_layers" in dir(model):
            model.unfreeze_encoder_last_layers()

    return tokenizer, model
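# `get_pretrained_model_path` and `nlg_tasks` are referenced above but defined
# elsewhere in this repo; the stand-ins below are assumptions made so the
# snippet can be read in isolation, not the repo's real definitions.
nlg_tasks = {"summarization", "translation"}  # hypothetical set of NLG task names

def get_pretrained_model_path(cfg):
    # Assumed behavior: return the configured pretrained model directory
    return cfg["train"]["pretrained_model"]

# Example call (hypothetical cfg, showing only the keys read above):
#
#   cfg = {"train": {"model_name": "mt5", "task_name": "summarization",
#                    "pretrained_model": "/path/to/mt5", "freeze_encoder": False},
#          "data": {"num_labels": 2}}
#   tokenizer, model = get_tokenizer_and_model(cfg)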