import argparse
import logging
import os
import sys

import sentencepiece as spm
# Assumes the Hugging Face tokenizer classes; adjust the import if a local module is used.
from transformers import MBartTokenizer, T5Tokenizer


def main():
    parser = argparse.ArgumentParser(
        description="reducing transformer tokenizer size")
    parser.add_argument("--source_model",
                        type=str,
                        required=True,
                        default='',
                        help="The multilingual transformer to start from")
    parser.add_argument(
        "--custom_corpus",
        type=str,
        required=True,
        default='custom_corpus.txt',
        help="the custom corpus similar to target corpus with limited tokens")
    parser.add_argument("--vocab_size",
                        type=int,
                        required=True,
                        default=8000,
                        help="vocabulary size")
    parser.add_argument("--output_model",
                        type=str,
                        required=True,
                        default='output_model',
                        help="The name of the final reduced model")
    args = parser.parse_args()

    # Load original tokenizer, model and vocab
    logging.info('starting from model: ' + args.source_model)
    if "mt5" in args.source_model:
        tokenizer = T5Tokenizer.from_pretrained(args.source_model)
        model_type = "mt5"
    elif "mbart" in args.source_model:
        tokenizer = MBartTokenizer.from_pretrained(args.source_model)
        model_type = "mbart"
    else:
        logging.error("model type not supported...")
        sys.exit(1)
    vocab = tokenizer.get_vocab()

    # Make sure the output directory exists before SentencePiece writes its files.
    os.makedirs(args.output_model, exist_ok=True)
    spm.SentencePieceTrainer.train(input=args.custom_corpus,
                                   model_prefix=os.path.join(
                                       args.output_model,
                                       "reduce_sentencepiece.bpe"),
                                   vocab_size=args.vocab_size,
                                   model_type="bpe",
                                   vocabulary_output_piece_score=False)

    bpe_model_path = os.path.join(args.output_model,
                                  "reduce_sentencepiece.bpe.model")
    if model_type == "mt5":
        new_tokenizer = T5Tokenizer(vocab_file=bpe_model_path)
        new_tokenizer.save_pretrained(args.output_model)
    elif model_type == "mbart":
        new_tokenizer = MBartTokenizer(vocab_file=bpe_model_path)
        new_tokenizer.save_pretrained(args.output_model)
    else:
        logging.error("model type not supported...")
        sys.exit(1)
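
# Hedged usage sketch (script name and paths are illustrative, not from the original
# source): the tokenizer reducer above would typically be run as
#   python reduce_tokenizer.py --source_model google/mt5-small \
#       --custom_corpus custom_corpus.txt --vocab_size 8000 --output_model reduced_model
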
# Example 2
def __init__(self, model_dir, **kwargs):
    # Constructor of a model-wrapper class: load an mBART checkpoint and its
    # tokenizer from a local directory, then move the model to the GPU in eval mode.
    from models.tokenization_mbart import MBartTokenizer
    from models.modeling_mbart import MBartForConditionalGeneration
    self.model = MBartForConditionalGeneration.from_pretrained(model_dir)
    self.tokenizer = MBartTokenizer.from_pretrained(model_dir)
    self.model.cuda()
    self.model.eval()
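
# Hedged usage sketch (the enclosing wrapper class is not shown; names below are
# illustrative only):
#   wrapper = MBartWrapper("path/to/mbart_checkpoint")
#   batch = wrapper.tokenizer(["some source text"], return_tensors="pt").to("cuda")
#   outputs = wrapper.model.generate(**batch)
#   print(wrapper.tokenizer.batch_decode(outputs, skip_special_tokens=True))
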
def get_tokenizer(cfg):
    if cfg["train"]["model_name"] == "nezha":
        from models.tokenization_bert import BertTokenizer
        if cfg["train"]["pretrained_model"]:
            tokenizer = BertTokenizer.from_pretrained(cfg["train"]["pretrained_model"])
        else:
            logger.error("BERT vocab file not set, please check your ber_model_dir or trained_model_dir")
        logger.info('vocab size is %d' % (len(tokenizer.vocab)))
        return tokenizer
    elif cfg["train"]["model_name"] == "bart" or cfg["train"]["model_name"] == "mbart":
        from models.tokenization_mbart import MBartTokenizer
        tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
        return tokenizer
    elif cfg["train"]["model_name"] == "t5" or cfg["train"]["model_name"] == "mt5":
        from models.tokenization_t5 import T5Tokenizer
        pretrained_tag = "/mnt/dl/public/pretrained_models/mt5-large"
        tokenizer = T5Tokenizer.from_pretrained(pretrained_tag)
        return tokenizer
    elif cfg["train"]["model_name"] == "bert" or cfg["train"]["model_name"] == "hfl":
        from models.tokenization_bert import BertTokenizer
        pretrained_tag = "/mnt/bigfiles/Models/pretrained_models/chinese_roberta_wwm_ext_pytorch"
        tokenizer = BertTokenizer.from_pretrained(pretrained_tag)
        return tokenizer
    else:
        logger.error("can not find the proper tokenizer type...")
        return None
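
# Hedged usage sketch (cfg layout inferred from the lookups above; values are
# illustrative):
#   cfg = {"train": {"model_name": "nezha", "pretrained_model": "path/to/bert_dir"}}
#   tokenizer = get_tokenizer(cfg)
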
# Example 4
import argparse
import logging
import os
import sys

# Assumes the Hugging Face classes; str2bool and select_embeddings are helper
# functions defined elsewhere in the same project.
from transformers import (MBartForConditionalGeneration, MBartTokenizer,
                          MT5ForConditionalGeneration, T5Tokenizer, TFAutoModel)


def main():
    parser = argparse.ArgumentParser(description="reducing transformer model size")
    parser.add_argument("--source_model",
                        type=str,
                        required=True,
                        default='',
                        help="The multilingual transformer to start from")
    parser.add_argument("--vocab_file",
                        type=str,
                        required=True,
                        default='vocab_5langs.txt',
                        help="The intended vocabulary file path")
    parser.add_argument("--output_model",
                        type=str,
                        required=True,
                        default='output_model',
                        help="The name of the final reduced model")
    parser.add_argument("--convert_to_tf",
                        type=str2bool,
                        required=False,
                        default=False,
                        help="Whether to generate a tenserflow version or not")

    args = parser.parse_args()

    # Load original tokenizer, model and vocab
    logging.info('starting from model: ' + args.source_model)
    if "mt5" in args.source_model:
        tokenizer = T5Tokenizer.from_pretrained(args.source_model)
        model = MT5ForConditionalGeneration.from_pretrained(args.source_model)
        model_type = "mt5"
    elif "mbart" in args.source_model:
        tokenizer = MBartTokenizer.from_pretrained(args.source_model)
        model = MBartForConditionalGeneration.from_pretrained(
            args.source_model)
        model_type = "mbart"
    else:
        logging.error("model type not supported...")
        sys.exit(1)
    vocab = tokenizer.get_vocab()

    logging.info(args.source_model + " - num_parameters : " +
                 str(model.num_parameters()))
    logging.info(args.source_model + " - num_tokens : " + str(len(vocab)))

    # Load new vocab
    with open(args.vocab_file) as f:
        new_vocab = f.read().splitlines()

    # TODO retrain tokenizer from corpus...
    # ...

    # Rebuild pytorch model
    new_embs = select_embeddings(model, vocab, new_vocab, model_type,
                                 args.output_model)
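
    # Illustrative sketch only (NOT the project's select_embeddings implementation):
    # the core idea is to keep just the embedding rows whose tokens survive in
    # new_vocab, roughly like
    #   keep_ids = [vocab[tok] for tok in new_vocab if tok in vocab]
    #   old_emb = model.get_input_embeddings().weight.data
    #   new_emb = torch.nn.Embedding(len(keep_ids), old_emb.shape[1])
    #   new_emb.weight.data = old_emb[keep_ids]
    #   model.set_input_embeddings(new_emb)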

    # convert to tensorflow
    if args.convert_to_tf:
        if os.path.isfile(f"{args.output_model}/tf_model.h5"):
            logging.info(f"{args.output_model}/tf_model.h5 already exists")
        else:
            tf_model = TFAutoModel.from_pretrained(args.output_model,
                                                   from_pt=True)
            tf_model.save_pretrained(args.output_model)
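
# Hedged usage sketch (script name and paths are illustrative): a typical invocation
# of the model reducer above would be
#   python reduce_model.py --source_model facebook/mbart-large-cc25 \
#       --vocab_file vocab_5langs.txt --output_model reduced_mbart --convert_to_tf true
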
def get_tokenizer_and_model(cfg, label_map=None, num_labels=None):
    if num_labels is None:
        num_labels = cfg["data"]["num_labels"]

    tokenizer = None
    model = None
    ptd = get_pretrained_model_path(cfg)

    # Huawei nezha
    if cfg["train"]["model_name"] == "nezha":
        from models.tokenization_bert import BertTokenizer
        from models.modeling_nezha import (
                            NeZhaForSequenceClassification, NeZhaForTokenClassification,
                            NeZhaBiLSTMForTokenClassification, NeZhaForDocumentClassification,
                            NeZhaForDocumentTagClassification, NeZhaForTagClassification
                        )
        tokenizer = BertTokenizer.from_pretrained(ptd)

        if cfg["train"]["task_name"] == "ner":
            if cfg["train"]["use_bilstm"]:
                _label_map = {k: v for k, v in label_map.items()}
                model = NeZhaBiLSTMForTokenClassification.from_pretrained(ptd, label_map=_label_map, num_labels=num_labels)
            else:
                model = NeZhaForTokenClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "textclf":
            if cfg["train"]["encode_document"]:
                model = NeZhaForDocumentClassification.from_pretrained(ptd, doc_inner_batch_size=cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
            else:
                model = NeZhaForSequenceClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "tag":
            if cfg["train"]["encode_document"]:
                model = NeZhaForDocumentTagClassification.from_pretrained(ptd, doc_inner_batch_size=cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
            else:
                model = NeZhaForTagClassification.from_pretrained(ptd, num_labels=num_labels)

    # Google BERT and HFL (HIT & iFLYTEK joint lab) pretrained models
    elif cfg["train"]["model_name"] == "bert" or cfg["train"]["model_name"] == "hfl":
        from models.tokenization_bert import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] == "ner":
            from models.modeling_bert import BertForTokenClassification
            if cfg["train"]["use_bilstm"]:
                # FIXME Process BiLSTM
                # model = NeZhaBiLSTMForTokenClassification(bert_config, label_map, num_labels=num_labels)
                pass
            else:
                model = BertForTokenClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "textclf":
            from models.modeling_bert import BertForSequenceClassification
            if cfg["train"]["encode_document"]:
                # FIXME Process NeZhaForDocumentClassification
                # model = NeZhaForDocumentClassification(bert_config, cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
                pass
            else:
                model = BertForSequenceClassification.from_pretrained(ptd, num_labels=num_labels)
        if cfg["train"]["task_name"] == "tag":
            from models.modeling_bert import BertForTagClassification
            if cfg["train"]["encode_document"]:
                # FIXME Process NeZhaForDocumentTagClassification
                # model = NeZhaForDocumentTagClassification(bert_config, cfg["train"]["doc_inner_batch_size"], num_labels=num_labels)
                pass
            else:
                model = BertForTagClassification.from_pretrained(ptd, num_labels=num_labels)

    # facebook bart/mbart
    elif cfg["train"]["model_name"] == "bart" or cfg["train"]["model_name"] == "mbart":
        from models.tokenization_mbart import MBartTokenizer
        tokenizer = MBartTokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] in nlg_tasks:
            from models.modeling_mbart import MBartForConditionalGeneration
            gradient_checkpointing_flag = bool(cfg["train"]["gradient_checkpointing"])
            if gradient_checkpointing_flag:
                logger.info("gradient checkpointing enabled")
            model = MBartForConditionalGeneration.from_pretrained(ptd, gradient_checkpointing=gradient_checkpointing_flag)

    # google t5/mt5
    elif cfg["train"]["model_name"] == "t5" or cfg["train"]["model_name"] == "mt5":
        from models.tokenization_t5 import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained(ptd)
        if cfg["train"]["task_name"] in nlg_tasks:
            from models.modeling_mt5 import MT5ForConditionalGeneration
            model = MT5ForConditionalGeneration.from_pretrained(ptd)
    else:
        logger.error("model type not supported!")

    assert tokenizer and model, "failed to build tokenizer or model, check model_name/task_name in the config"
    if cfg["train"]["freeze_encoder"] and "freeze_encoder" in dir(model):
        model.freeze_encoder()
        if "unfreeze_encoder_last_layers" in dir(model):
            model.unfreeze_encoder_last_layers()
    return tokenizer, model
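
# Hedged usage sketch (cfg keys inferred from the lookups above; values are illustrative,
# and nlg_tasks/get_pretrained_model_path come from the surrounding module):
#   cfg = {"train": {"model_name": "mbart", "task_name": "summarization",
#                    "gradient_checkpointing": False, "freeze_encoder": False},
#          "data": {"num_labels": 2}}
#   tokenizer, model = get_tokenizer_and_model(cfg)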