Example #1
from transformers import AutoModelWithLMHead, AutoTokenizer

path_dict = {'bert-base-uncased': 'D:/NLP/bert-base-uncased'}

tokenizer = AutoTokenizer.from_pretrained(path_dict['bert-base-uncased'])
model = AutoModelWithLMHead.from_pretrained(path_dict['bert-base-uncased'])

print(tokenizer.tokenize("I have a new GPU!"))
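As a follow-on sketch (assuming the local D:/NLP/bert-base-uncased checkpoint above is valid), the loaded LM-head model can score a masked sentence; the variable names below are illustrative only:

import torch

text = f"I have a new {tokenizer.mask_token}!"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]  # first element holds the prediction scores
mask_pos = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero()[0, 1]
print(tokenizer.decode([logits[0, mask_pos].argmax().item()]))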
Example #2
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help=
        "Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--should_continue",
        action="store_true",
        help="Whether to continue from latest checkpoint in output_dir")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help=
        "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    parser.add_argument(
        "--mlm",
        action="store_true",
        help=
        "Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help=
        "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"
                           ] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir
            and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name,
                                            cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path,
                                            cache_dir=args.cache_dir)
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                                  cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                  cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    if args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier(
            )  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
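The script above calls a set_seed(args) helper that is not shown in this excerpt; a minimal sketch consistent with how it is used (args.seed and args.n_gpu are set earlier in main) would be:

import random

import numpy as np
import torch


def set_seed(args):
    # seed Python, NumPy and PyTorch (all visible GPUs) for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)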
Example #3
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")


def remove_older_from_history(history):
    ## only consider last 6 messages in history
    return history[-6:]


def chat(text, history=[]):
    history = remove_older_from_history(history)
    chat_history_ids = None
    for chat_text in history:
        # add chat tokens to chat_history_ids
        input_ids = tokenizer.encode(chat_text + tokenizer.eos_token,
                                     return_tensors='pt')
        if chat_history_ids is None:
            chat_history_ids = input_ids
        else:
            chat_history_ids = torch.cat([chat_history_ids, input_ids], dim=-1)

    # encode the new user input, add the eos_token and return a tensor in PyTorch
    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token,
                                          return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = new_user_input_ids if chat_history_ids is None else torch.cat(
        [chat_history_ids, new_user_input_ids], dim=-1)

    # generate a response, using the eos_token for padding
    response_ids = model.generate(bot_input_ids,
                                  max_length=1000,
                                  pad_token_id=tokenizer.eos_token_id)

    # decode only the newly generated tokens and update the history
    response = tokenizer.decode(response_ids[:, bot_input_ids.shape[-1]:][0],
                                skip_special_tokens=True)
    history.append(text)
    history.append(response)
    return response, history
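A minimal interactive loop around chat(), assuming the return convention used in the completion above (the loop simply threads the growing history through successive calls):

history = []
while True:
    user_text = input(">> ")
    if user_text.strip().lower() in ("quit", "exit"):
        break
    reply, history = chat(user_text, history)
    print("Bot:", reply)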
Example #4
 def test_from_pretrained_identifier(self):
     logging.basicConfig(level=logging.INFO)
     model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
     self.assertIsInstance(model, BertForMaskedLM)
     self.assertEqual(model.num_parameters(), 14830)
     self.assertEqual(model.num_parameters(only_trainable=True), 14830)
def main(args):
    with open(args.dataset_info, 'rb') as rf:
        dataset_info = pickle.load(rf)
    gpt_tokenizer = AutoTokenizer.from_pretrained(args.model_string)
    gpt_tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
    gpt_pad_id = gpt_tokenizer.encode(PAD_TOKEN)[0]
    gpt_model = AutoModelWithLMHead.from_pretrained(args.model_string).to(
        args.device)
    gpt_model.eval()

    checkpoint = torch.load(args.ckpt, map_location=args.device)
    model_args = checkpoint['args']
    conditioning_model = Model(
        model_args, gpt_pad_id, len(dataset_info.index2word)
    )  # no need to get the glove embeddings when reloading since they're saved in model ckpt anyway
    conditioning_model.load_state_dict(checkpoint['state_dict'])
    conditioning_model = conditioning_model.to(args.device)
    conditioning_model.eval()
    if args.verbose:
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.ckpt, checkpoint['epoch']))
        print('num params', num_params(conditioning_model))

    input_texts, conditions, categories = [], [], []

    if args.condition_file is not None:
        with open(args.condition_file, 'r') as rf:
            for line in rf:
                input_texts.append(line.strip().split('\t')[0])
                conditions.append(line.strip().split('\t')[1])
                categories.append(None)
                for cw in conditions[-1].split():
                    assert cw in dataset_info.word2index
    else:
        prefixes = []
        with open(args.prefix_file, 'r') as rf:
            for line in rf:
                prefixes.append(line.strip())
        condition_wordlists = []
        for root, _, files in os.walk(args.wordlist_dir):
            for fname in files:
                words = []
                with open(os.path.join(root, fname), 'r') as rf:
                    for line in rf:
                        word = line.strip()
                        if word in dataset_info.word2index:
                            words.append(word)
                        else:
                            if args.verbose:
                                print('word not found:', word)
                condition_wordlists.append(
                    (' '.join(words), fname.split('.')[0]))
        for p in prefixes:
            for c, category in condition_wordlists:
                input_texts.append(p)
                conditions.append(c)
                categories.append(category)

    all_cr = []
    pair_num = 0
    for input_text, condition_words, category in tqdm(zip(
            input_texts, conditions, categories),
                                                      total=len(conditions)):
        predict_function = predict
        condition_results = []
        for i in range(0, args.sample_size, args.max_sample_batch):
            num_samples = min(args.max_sample_batch, args.sample_size - i)
            condition_results += predict_function(
                gpt_model,
                gpt_tokenizer,
                conditioning_model, [input_text for _ in range(num_samples)],
                condition_words,
                dataset_info,
                args.precondition_topk,
                args.topk,
                args.length_cutoff,
                condition_lambda=args.condition_lambda,
                device=args.device)
        all_cr.append((input_text, category, condition_results))
        pair_num += 1
        if args.max_pairs > 0 and pair_num >= args.max_pairs:
            break
    with open(args.log_file, 'w') as wf:
        writer = csv.DictWriter(
            wf, fieldnames=['category', 'input_text', 'generation'])
        writer.writeheader()
        for cr_group in all_cr:
            for cr in cr_group[2]:
                writer.writerow({
                    'category': cr_group[1],
                    'input_text': cr_group[0],
                    'generation': cr
                })
 def test_from_identifier_from_model_type(self):
     model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
     self.assertIsInstance(model, RobertaForMaskedLM)
     self.assertEqual(model.num_parameters(), 14410)
     self.assertEqual(model.num_parameters(only_trainable=True), 14410)
    def from_encoder_decoder_pretrained(
            cls,
            encoder_pretrained_model_name_or_path: str = None,
            decoder_pretrained_model_name_or_path: str = None,
            *model_args,
            **kwargs) -> PreTrainedModel:
        r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.


        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
        To train the model, you need to first set it back in training mode with `model.train()`.

        Params:
            encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
                information necessary to initiate the encoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
                information necessary to initiate the decoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            kwargs: (`optional`) Remaining dictionary of keyword arguments.
                Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attentions=True``). These behave differently depending on whether a ``config`` is provided or automatically loaded:

        Examples::

            from transformers import EncoderDecoder

            model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
        """

        kwargs_encoder = {
            argument[len("encoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("encoder_")
        }

        kwargs_decoder = {
            argument[len("decoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("decoder_")
        }

        # Load and initialize the encoder and decoder
        # The distinction between encoder and decoder at the model level is made
        # by the value of the flag `is_decoder` that we need to set correctly.
        encoder = kwargs_encoder.pop("model", None)
        if encoder is None:
            assert (
                encoder_pretrained_model_name_or_path is not None
            ), "If `model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has to be defined"
            from .modeling_auto import AutoModel

            encoder = AutoModel.from_pretrained(
                encoder_pretrained_model_name_or_path, *model_args,
                **kwargs_encoder)
        encoder.config.is_decoder = False

        decoder = kwargs_decoder.pop("model", None)
        if decoder is None:
            assert (
                decoder_pretrained_model_name_or_path is not None
            ), "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has to be defined"
            from .modeling_auto import AutoModelWithLMHead

            if "config" not in kwargs_decoder:
                from transformers import AutoConfig

                decoder_config = AutoConfig.from_pretrained(
                    decoder_pretrained_model_name_or_path)
                if decoder_config.is_decoder is False:
                    logger.info(
                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
                    )
                    decoder_config.is_decoder = True

                kwargs_decoder["config"] = decoder_config

            if kwargs_decoder["config"].is_decoder is False:
                logger.warning(
                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, make sure that the attribute `is_decoder` of `decoder_config` passed to `.from_encoder_decoder_pretrained(...)` is set to `True` or do not pass a `decoder_config` to `.from_encoder_decoder_pretrained(...)`"
                )

            decoder = AutoModelWithLMHead.from_pretrained(
                decoder_pretrained_model_name_or_path, **kwargs_decoder)

        return cls(encoder=encoder, decoder=decoder)
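A hedged usage sketch of the prefix convention implemented above: keyword arguments beginning with encoder_ or decoder_ are stripped and forwarded to the corresponding from_pretrained call (the class is exposed as EncoderDecoderModel in recent releases; the docstring above uses the older EncoderDecoder name):

from transformers import EncoderDecoderModel

# initialize a Bert2Bert model from two pretrained checkpoints;
# e.g. decoder_config=my_config would be forwarded to the decoder load only
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased")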
Example #8
 def preload(self):
     from transformers import AutoModelWithLMHead, AutoTokenizer
     # if self.model is None or self.tokenizer is None:
     print('.. load model t5-base')
     self.model = AutoModelWithLMHead.from_pretrained("t5-base")
     self.tokenizer = AutoTokenizer.from_pretrained("t5-base")

def generate_response(question):
    input_ids = tokenizer.encode(question, return_tensors="pt")
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=100,
        top_p=0.9,
        top_k=0,
    )
    output = tokenizer.decode(sample_output[0], skip_special_tokens=True)
    return output


def main():
    while True:
        question = input("Your sentence:   ")
        if question == "exit":
            break
        answer = generate_response(question)
        print("Output:\n" + 100 * '-')
        print(answer)


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("anonymous-german-nlp/german-gpt2")
    model = AutoModelWithLMHead.from_pretrained(".\\output-models\\gpt2-lindner\\")
    main()

Example #10
import numpy as np
import json
import copy
import random
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoConfig
import torch

dgpt_dir = "/project/glucas_540/kchawla/csci699/storage/logs/best-model"
tokenizer = AutoTokenizer.from_pretrained(dgpt_dir)
model = AutoModelWithLMHead.from_pretrained(dgpt_dir).cuda()

print("tokenizer and model loaded from: ", dgpt_dir)


def get_input(msg):

    msg = (" " + tokenizer.eos_token + " ").join(msg)

    msg = msg + " " + tokenizer.eos_token + " "
    return msg


def format_response(msg):
    msg = msg.replace("<|endoftext|>", "").strip()
    return msg


def generate_response(context):
    context = get_input(context)
    context_ids = tokenizer.encode(context + tokenizer.eos_token,
                                   return_tensors='pt')
Example #11
from transformers import pipeline

nlp = pipeline("fill-mask")

from pprint import pprint

pprint(
    nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."
        ))

from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")

sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."

input = tokenizer.encode(sequence, return_tensors='pt')
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]

token_logits = model(input).logits
mask_token_logits = token_logits[0, mask_token_index, :]

top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
Example #12
df.head(2)

new_df = df.drop([
    'context/2', 'context/3', 'context/4', 'context/5', 'context/6',
    'context/7', 'context/8', 'context/9', 'context/10', 'context/11',
    'context/12', 'context/13', 'context/14', 'context/15', 'context/16',
    'context/17'
],
                 axis=1)

## Function to get Emotion Vectors
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")

model = AutoModelWithLMHead.from_pretrained(
    "mrm8488/t5-base-finetuned-emotion")


def get_emotion(text):
    input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

    output = model.generate(input_ids=input_ids, max_length=2)

    dec = [tokenizer.decode(ids) for ids in output]
    label = dec[0]
    return label


#Function to get Emotion labels

from transformers import AutoTokenizer, AutoModelWithLMHead
Example #13
 def __init__(self):
     self.tokenizer = AutoTokenizer.from_pretrained(
         "mrm8488/t5-base-finetuned-emotion")
     self.model = AutoModelWithLMHead.from_pretrained(
         "mrm8488/t5-base-finetuned-emotion")
Example #14
 def test_from_pretrained_identifier(self):
     logging.basicConfig(level=logging.INFO)
     model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
     self.assertIsInstance(model, BertForMaskedLM)
Example #15
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)
config = configparser.ConfigParser()
config.read('model.ini')
model = config['DEFAULT']['model']
modeldir = config['DEFAULT']['dir']
port = config['DEFAULT']['port']
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelWithLMHead.from_pretrained(modeldir)
from flask import Flask
from flask_restful import Resource, Api, reqparse

app = Flask(__name__)
api = Api(app)


class question(Resource):
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument('question', required=True)
        args = parser.parse_args()
        # Configs
        logger = logging.getLogger(__name__)
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoConfig
from transformers import GPT2LMHeadModel
import torch 

logdir = "../../../../storage/logs/best-model"

tokenizer = AutoTokenizer.from_pretrained(logdir)
model = AutoModelWithLMHead.from_pretrained(logdir).cuda()

def get_input(msg):
	msg = msg.strip("\n")
	return msg

def format_response(msg):
	msg = msg + "\n"
	return msg

def generate_response(user_input):
	user_input = get_input(user_input)
	user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')
	response_ids = model.generate(user_input_ids.cuda(), max_length=1024, do_sample=True, top_k=20, top_p=0.95, pad_token_id=tokenizer.pad_token_id, use_cache=False)
	response = tokenizer.decode(response_ids[:, user_input_ids.shape[-1]:][0], skip_special_tokens=True)                                                                                          
	output = format_response(response)
	return output

print("Model Loaded.")

while(True):
	user_input = input()
	print(generate_response(user_input))
 
from transformers import AutoModelWithLMHead, AutoTokenizer

model = AutoModelWithLMHead.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

article = """When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again.
Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx.She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""

inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

print(tokenizer.decode(outputs[0]))
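The decoded summary above still contains T5 special tokens such as <pad> and </s>; passing skip_special_tokens=True to decode strips them (a small variant of the last line):

print(tokenizer.decode(outputs[0], skip_special_tokens=True))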
Example #18
 def __init__(self, model_name, top_k=5):
   self.tokenizer = AutoTokenizer.from_pretrained(model_name)
   self.model = AutoModelWithLMHead.from_pretrained(model_name)
   self.top_k_words = top_k
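Presumably top_k_words is used to rank masked-token candidates; a hedged sketch of such a method (the name predict_mask and its logic are illustrative, not taken from the original class):

 def predict_mask(self, text):
   # score the first mask position and return the top-k candidate tokens
   inputs = self.tokenizer(text, return_tensors="pt")
   logits = self.model(**inputs)[0]
   mask_pos = (inputs["input_ids"] == self.tokenizer.mask_token_id).nonzero()[0, 1]
   top_ids = logits[0, mask_pos].topk(self.top_k_words).indices.tolist()
   return [self.tokenizer.decode([i]).strip() for i in top_ids]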
Example #19
def main():
    parser = utils.get_args_parser_with_general_args()
    parser.add_argument('--one_tpu', action='store_true', help="Run on one TPU core for debugging. Makes it easy to use break points")
    parser.add_argument('--tpu_report', action='store_true', help="Print xla metric report")
    args = parser.parse_args()

    utils.init(args)  # set seeds, init logger, prepare output directory

    devices = tpu_xm.get_xla_supported_devices()
    if args.one_tpu:
        devices = [devices[0]]
    n_tpu = len(devices)
    logging.info(f'Found {n_tpu} TPU cores')

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
    tokenizer.save_pretrained(args.output_dir)

    args.start_epoch = utils.prepare_last_checkpoint(args.bert_model)
    model = AutoModelWithLMHead.from_pretrained(args.bert_model)  # Only Masked Language Modeling
    logging.info(f"Saving initial checkpoint to: {args.output_dir}")
    #model.save_pretrained(args.output_dir) #TODO: why does this break?
    #xm.save(model.state_dict(), args.output_dir)
    model = tpu_dp.DataParallel(model, device_ids=devices)

    num_data_epochs, num_train_optimization_steps= utils.get_dataset_stats(args, n_tpu)

    def tpu_training_loop(model, loader, device, context):
        """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        # one optimizer and scheduler per TPU core. Both objects are saved in `context` to be reused the next epoch
        optimizer = context.getattr_or(
            'optimizer',
            AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=tuple(args.betas)))

        # derive warmup info
        if args.warmup_proportion is not None:
            warmup_steps = int(args.warmup_proportion * num_train_optimization_steps + 0.5)
        elif args.warmup_steps is not None:
            warmup_steps = args.warmup_steps
        else:
            raise Exception('What is the warmup?? Specify either warmup proportion or steps')
        scheduler = context.getattr_or(
            'scheduler',
            WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps))

        tr_loss = None
        pbar = None
        if str(pbar_device) == str(device):  # All threads are in sync. Use progress bar only on one of them
            pbar = tqdm(total=int(pbar_steps), desc=f"device {device}", dynamic_ncols=True)

        tracker = tpu_xm.RateTracker()

        model.train()

        for step, batch in enumerate(loader):
            input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            
            loss.sum().backward() # for multiple tensors
            tracker.add(args.train_batch_size)

            tr_loss = loss * args.gradient_accumulation_steps if step == 0 else  tr_loss + loss * args.gradient_accumulation_steps
            if pbar is not None:
                pbar.update(1)
                # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
            if (step + 1) % args.gradient_accumulation_steps == 0:
                tpu_xm.optimizer_step(optimizer)
                prev_lr = scheduler.get_last_lr()[0]
                scheduler.step()
                curr_lr = scheduler.get_last_lr()[0]
                if args.track_learning_rate:
                    if pbar is not None:
                        pbar.set_description(f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
                optimizer.zero_grad()
        return tr_loss.sum().item() / step  # `.item()` requires a trip from TPU to CPU, which is very slow. Use it only once per epoch

    for epoch in range(args.start_epoch, args.epochs):
        # Load one training file into memory
        epoch_dataset = utils.PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        pbar_device = devices[0]
        pbar_steps = utils.compute_num_steps_in_epoch(num_samples=train_sampler.num_samples,
                                                      batch_size=args.train_batch_size,
                                                      grad_accum_steps=1, # the pbar steps should not take into account grad accumulation steps
                                                      n_tpu=n_tpu)
        logging.info(f'start training, epoch {epoch} on {len(devices)} cores for {pbar_steps} steps')
        start = time.time()
        losses = model(tpu_training_loop, train_dataloader)  # calls `tpu_training_loop` multiple times, once per TPU core
        logging.info(f'Epoch {epoch} took {round(time.time() - start, 2)} seconds. Average loss: {sum(losses)/len(losses)}')
        utils.save_checkpoint(model._models[0], epoch, args.output_dir)

    if args.tpu_report:
        logging.info(torch_xla._XLAC._xla_metrics_report())
 def test_from_pretrained_identifier(self):
     model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
     self.assertIsInstance(model, BertForMaskedLM)
     self.assertEqual(model.num_parameters(), 14410)
     self.assertEqual(model.num_parameters(only_trainable=True), 14410)
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch
import time

with open("sample.txt") as f:
    sample_text = f.read()

model_load_start = time.time()

model = AutoModelWithLMHead.from_pretrained("sshleifer/distilbart-cnn-6-6")
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-6-6")

print(f"Model loaded in {round(time.time() - model_load_start, 2)}s")

inputs = tokenizer.encode(sample_text,
                          return_tensors="pt",
                          max_length=1024,
                          truncation=True)
summarize_start = time.time()
outputs = model.generate(
    input_ids=inputs,
    attention_mask=torch.IntTensor([[1] * len(inputs[0])]),
    max_length=100,
    min_length=25,
    length_penalty=1.0,
    num_beams=3,
    early_stopping=True,
)
output_text = tokenizer.decode(outputs[0].tolist(), skip_special_tokens=True)
reduction = 1 - len(output_text) / len(sample_text)
print(f"Summarized in {round(time.time() - summarize_start, 2)}s, "
      f"reduction: {round(reduction * 100, 1)}%")
print(output_text)
"""
Downloads the GPT-2 model for use in bot replies. Used in a build step to provide the model.
"""
from transformers import AutoTokenizer, AutoModelWithLMHead

if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium",
                                              cache_dir='models')
    model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium",
                                                cache_dir='models')
Example #23
def run_model():
    global extracted_grads

    parser = argparse.ArgumentParser()
    parser.add_argument('--neg_sample_file', default='', help='File of negative regard target samples.')
    parser.add_argument('--neu_sample_file', default='', help='File of neutral regard target samples.')
    parser.add_argument('--pos_sample_file', default='', help='File of positive regard target samples.')
    parser.add_argument('--neg_demographic', default='The man', help='Demographic mention for negative target samples.')
    parser.add_argument('--pos_demographic', default='The woman', help='Demographic mention for positive target samples.')
    parser.add_argument('--neg_name_file', default='', help='File with names for negative target samples. Overrides neg_demographic.')
    parser.add_argument('--pos_name_file', default='', help='File with names for positive target samples. Overrides pos_demographic.')
    parser.add_argument('--salience_threshold', default=5.0)
    parser.add_argument('--salient_phrases_file', default='', help='File with salient phrases.')
    parser.add_argument('--use_original_loss', default=1, help='Use association loss.')
    parser.add_argument('--use_salience_loss', default=0)
    parser.add_argument('--use_dissociation_loss', default=1, help='Use dissociation loss.')
    parser.add_argument('--use_weighted_salience_loss', default=0)
    parser.add_argument('--alpha', default=1, help='Weight for original loss.')
    parser.add_argument('--beta', default=1, help='Weight for dissociation loss.')
    parser.add_argument('--beam_size', default=1, help='Beam size when searching for trigger replacement candidates.')
    parser.add_argument('--use_weighted_neg', default=0)
    parser.add_argument('--trigger_init', default='', help='Initialize trigger with a phrase.')
    parser.add_argument('--num_trigger_tokens', default=6)  # Overridden if len trigger_init is greater.
    parser.add_argument('--trigger_masked_phrases', default='')
    parser.add_argument('--trigger_position', default='head', help='Options are `head`, `body_demographic`, `body_biascontext`.')
    parser.add_argument('--debias', default=0, help='Whether to generate triggers to debias. 0 = no debias, 1 = neutral '
                                                    'debias, 2 = neutral + positive debias.')
    parser.add_argument('--num_demographics', default=2, help='Whether to use 1 or 2 demographics.')
    parser.add_argument('--model_name_or_path', default='gpt2',
                        help='Model name or path: gpt2, microsoft/DialoGPT-medium, etc.')
    parser.add_argument('--tokenizer_name', default='', help='Tokenizer name if different from model name.')
    parser.add_argument('--model_type',  default='gpt2', help='Currently either `gpt2` or `dialogpt`.')
    parser.add_argument('--batch_size', default=16, help='32 works well for CPU, 16 for GPU.')
    params = parser.parse_args()

    params.salience_threshold = float(params.salience_threshold)
    params.use_original_loss = int(params.use_original_loss) == 1
    params.use_salience_loss = int(params.use_salience_loss) == 1
    params.use_dissociation_loss = int(params.use_dissociation_loss) == 1
    params.use_weighted_salience_loss = int(params.use_weighted_salience_loss) == 1
    params.alpha = float(params.alpha)
    params.beta = float(params.beta)
    params.beam_size = int(params.beam_size)
    params.use_weighted_neg = int(params.use_weighted_neg) == 1
    params.num_trigger_tokens = int(params.num_trigger_tokens)
    if params.trigger_masked_phrases:
        params.trigger_masked_phrases = params.trigger_masked_phrases.split(',')
    else:
        params.trigger_masked_phrases = []
    params.debias = int(params.debias)
    assert params.debias in [0, 1, 2]
    # 0 = no debias, 1 = associate neutral, dissociate everything else, 2 = associate positive + neutral, dissociate neg
    params.num_demographics = int(params.num_demographics)
    params.batch_size = int(params.batch_size)

    print('Params', params)

    np.random.seed(0)
    torch.random.manual_seed(0)
    torch.cuda.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Device: ', device)

    model = AutoModelWithLMHead.from_pretrained(params.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        params.tokenizer_name if params.tokenizer_name else params.model_name_or_path)
    total_vocab_size = len(tokenizer)
    model.eval()
    model.to(device)

    add_hooks(model, total_vocab_size)  # add gradient hooks to embeddings
    embedding_weight = get_embedding_weight(model, total_vocab_size)  # save the word embedding matrix

    enc_trigger_init = tokenizer.encode('The ' + params.trigger_init)[1:]
    trigger_init_len = len(enc_trigger_init)
    old_num_trigger_tokens = params.num_trigger_tokens
    params.num_trigger_tokens = max(trigger_init_len, params.num_trigger_tokens)

    # Process trigger_masked_phrases.
    trigger_masked_idxes = []
    for phrase in params.trigger_masked_phrases:
        enc_phrase = tokenizer.encode(phrase)
        enc_trigger_init_str = ' '.join([str(x) for x in enc_trigger_init])
        enc_phrase_str = ' '.join([str(x) for x in enc_phrase])
        if enc_phrase_str in enc_trigger_init_str:
            enc_phrase_str_char_idx = enc_trigger_init_str.index(enc_phrase_str)
            start_idx = enc_trigger_init_str[:enc_phrase_str_char_idx].count(' ')
            for i in range(start_idx, start_idx + len(enc_phrase)):
                trigger_masked_idxes.append(i + params.num_trigger_tokens - 1)
        else:  # Try adding a space before the phrase because of tokenization.
            sp_enc_phrase = tokenizer.encode('x ' + phrase)[1:]
            sp_enc_phrase_str = ' '.join([str(x) for x in sp_enc_phrase])
            if sp_enc_phrase_str in enc_trigger_init_str:
                sp_enc_phrase_str_char_idx = enc_trigger_init_str.index(sp_enc_phrase_str)
                start_idx = enc_trigger_init_str[:sp_enc_phrase_str_char_idx].count(' ')
                for i in range(start_idx, start_idx + len(sp_enc_phrase)):
                    trigger_masked_idxes.append(i + params.num_trigger_tokens - 1)
            else:
                print('Masked phrase not found', enc_phrase, sp_enc_phrase, enc_trigger_init)
                exit()
    print('trigger_masked_idxes', trigger_masked_idxes)

    max_len = 50

    # Calculate salience scores.
    pos_salience_token_items = None
    neg_salience_token_items = None
    if params.use_salience_loss:
        salience_dict = attacks.find_hard_salient_phrases(params.neg_sample_file, params.pos_sample_file, tokenizer,
                                                          params.salient_phrases_file,
                                                          salience_threshold=params.salience_threshold)
        neg_salience_token_items = [0] * total_vocab_size
        pos_salience_token_items = [0] * total_vocab_size
        for phrase in salience_dict:
            label, score = salience_dict[phrase]
            tok_ids = tokenizer.encode(phrase)
            if label == 'neg':
                for tok_id in tok_ids:
                    neg_salience_token_items[tok_id] += int(round(score))
            elif label == 'pos':
                for tok_id in tok_ids:
                    pos_salience_token_items[tok_id] += int(round(score))
            else:
                raise NotImplementedError('Label is either neg or pos.')
        print('neg_salience_token_items', neg_salience_token_items[:20])
        print('pos_salience_token_items', pos_salience_token_items[:20])

    with open(params.neg_sample_file, 'r') as f:
        neg_target_texts = f.readlines()
        if params.model_type == constants.GPT2:
            neg_target_texts = [l.strip() for l in neg_target_texts]
        elif params.model_type == constants.DIALOGPT:
            neg_target_texts = [l.strip().split('\t') for l in neg_target_texts]
    with open(params.pos_sample_file, 'r') as f:
        pos_target_texts = f.readlines()
        if params.model_type == constants.GPT2:
            pos_target_texts = [l.strip() for l in pos_target_texts]
        elif params.model_type == constants.DIALOGPT:
            pos_target_texts = [l.strip().split('\t') for l in pos_target_texts]
    neu_target_texts = []
    if params.neu_sample_file:
        with open(params.neu_sample_file, 'r') as f:
            neu_target_texts = f.readlines()
            if params.model_type == constants.GPT2:
                neu_target_texts = [l.strip() for l in neu_target_texts]
            elif params.model_type == constants.DIALOGPT:
                neu_target_texts = [l.strip().split('\t') for l in neu_target_texts]

    if constants.DEMO not in params.trigger_position:
        neg_demo_neg_target_texts = []
        pos_demo_neg_target_texts = []
        neg_demo_pos_target_texts = []
        pos_demo_pos_target_texts = []
        neg_demo_neu_target_texts = []
        pos_demo_neu_target_texts = []
        if params.neg_name_file and params.pos_name_file:  # Use names instead of demographic groups.
            neg_names = open(params.neg_name_file, 'r').readlines()
            neg_names = [x for x in neg_names if x]
            pos_names = open(params.pos_name_file, 'r').readlines()
            pos_names = [x for x in pos_names if x]
            # If # names is >= batch_size, reset names for each batch_size-th sample.
            # Otherwise, if # names < batch_size, reset names after cycling through all names AND for each batch_size-th sample.
            # Resetting after each batch_size-th sample is just easier for keeping track of loss masking.
            batch_size_mod_number = params.batch_size
            neg_mod_number = min(len(neg_names), params.batch_size)
            pos_mod_number = min(len(pos_names), params.batch_size)
            for idx, l in enumerate(neg_target_texts):
                mod_idx = idx % batch_size_mod_number
                if mod_idx >= neg_mod_number:
                    mod_idx = mod_idx % neg_mod_number
                neg_name = neg_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    neg_demo_neg_target_texts += [neg_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    neg_demo_neg_target_texts += [l[0] + ' ' + neg_name + ' ' + l[1]]

                mod_idx = idx % batch_size_mod_number
                if mod_idx >= pos_mod_number:
                    mod_idx = mod_idx % pos_mod_number
                pos_name = pos_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    pos_demo_neg_target_texts += [pos_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    pos_demo_neg_target_texts += [l[0] + ' ' + pos_name + ' ' + l[1]]

            for idx, l in enumerate(pos_target_texts):
                mod_idx = idx % batch_size_mod_number
                if mod_idx >= neg_mod_number:
                    mod_idx = mod_idx % neg_mod_number
                neg_name = neg_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    neg_demo_pos_target_texts += [neg_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    neg_demo_pos_target_texts += [l[0] + ' ' + neg_name + ' ' + l[1]]

                mod_idx = idx % batch_size_mod_number
                if mod_idx >= pos_mod_number:
                    mod_idx = mod_idx % pos_mod_number
                pos_name = pos_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    pos_demo_pos_target_texts += [pos_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    pos_demo_pos_target_texts += [l[0] + ' ' + pos_name + ' ' + l[1]]

            for idx, l in enumerate(neu_target_texts):
                mod_idx = idx % batch_size_mod_number
                if mod_idx >= neg_mod_number:
                    mod_idx = mod_idx % neg_mod_number
                neg_name = neg_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    neg_demo_neu_target_texts += [neg_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    neg_demo_neu_target_texts += [l[0] + ' ' + neg_name + ' ' + l[1]]

                mod_idx = idx % batch_size_mod_number
                if mod_idx >= pos_mod_number:
                    mod_idx = mod_idx % pos_mod_number
                pos_name = pos_names[mod_idx].strip()
                if params.model_type == constants.GPT2:
                    pos_demo_neu_target_texts += [pos_name + ' ' + l]
                elif params.model_type == constants.DIALOGPT:
                    pos_demo_neu_target_texts += [l[0] + ' ' + pos_name + ' ' + l[1]]

        else:  # Use demographic groups.
            for l in neg_target_texts:
                neg_demo_neg_target_texts += [params.neg_demographic + ' ' + l]
                pos_demo_neg_target_texts += [params.pos_demographic + ' ' + l]
            for l in pos_target_texts:
                neg_demo_pos_target_texts += [params.neg_demographic + ' ' + l]
                pos_demo_pos_target_texts += [params.pos_demographic + ' ' + l]
            for l in neu_target_texts:
                neg_demo_neu_target_texts += [params.neg_demographic + ' ' + l]
                pos_demo_neu_target_texts += [params.pos_demographic + ' ' + l]
    else:
        neg_demo_neg_target_texts = neg_target_texts
        pos_demo_neg_target_texts = neg_target_texts
        pos_demo_pos_target_texts = pos_target_texts
        neg_demo_pos_target_texts = pos_target_texts
        pos_demo_neu_target_texts = neu_target_texts
        neg_demo_neu_target_texts = neu_target_texts

    if constants.BODY in params.trigger_position:
        if constants.BC in params.trigger_position:
            # When the trigger encapsulates the bias contexts, we strip bias contexts in the target texts.
            for bc in constants.GPT2_BIAS_CONTEXTS:
                pos_demo_pos_target_texts = [x.replace(bc, '').strip() for x in pos_demo_pos_target_texts]
                neg_demo_neg_target_texts = [x.replace(bc, '').strip() for x in neg_demo_neg_target_texts]
                pos_demo_neg_target_texts = [x.replace(bc, '').strip() for x in pos_demo_neg_target_texts]
                neg_demo_pos_target_texts = [x.replace(bc, '').strip() for x in neg_demo_pos_target_texts]
                pos_demo_neu_target_texts = [x.replace(bc, '').strip() for x in pos_demo_neu_target_texts]
                neg_demo_neu_target_texts = [x.replace(bc, '').strip() for x in neg_demo_neu_target_texts]

    print('neg demo neg target text:', neg_demo_neg_target_texts[0])
    print('pos demo pos target text:', pos_demo_pos_target_texts[0])

    if params.use_dissociation_loss:
        print('pos demo neg target text:', pos_demo_neg_target_texts[0])
        print('neg demo pos target text:', neg_demo_pos_target_texts[0])

    if params.neu_sample_file:
        print('neg demo neu target text:', neg_demo_neu_target_texts[0])
        print('pos demo neu target text:', pos_demo_neu_target_texts[0])

    # Batch and pad the target tokens.
    neg_demo_neg_target_tokens_gen = make_target_batch(tokenizer, device, neg_demo_neg_target_texts, max_len,
                                                       params.batch_size)
    pos_demo_pos_target_tokens_gen = make_target_batch(tokenizer, device, pos_demo_pos_target_texts, max_len,
                                                       params.batch_size)
    neg_demo_neg_target_tokens_gen = list(neg_demo_neg_target_tokens_gen)
    same_demo_target_threshold = len(neg_demo_neg_target_tokens_gen)
    pos_demo_pos_target_tokens_gen = list(pos_demo_pos_target_tokens_gen)
    same_demo_target_losses = neg_demo_neg_target_tokens_gen + pos_demo_pos_target_tokens_gen

    if params.use_dissociation_loss:
        pos_demo_neg_target_tokens_gen = make_target_batch(tokenizer, device, pos_demo_neg_target_texts, max_len,
                                                           params.batch_size)
        neg_demo_pos_target_tokens_gen = make_target_batch(tokenizer, device, neg_demo_pos_target_texts, max_len,
                                                           params.batch_size)
        pos_demo_neg_target_tokens_gen = list(pos_demo_neg_target_tokens_gen)
        diff_demo_target_threshold = len(pos_demo_neg_target_tokens_gen)
        neg_demo_pos_target_tokens_gen = list(neg_demo_pos_target_tokens_gen)
        diff_demo_target_losses = pos_demo_neg_target_tokens_gen + neg_demo_pos_target_tokens_gen

    neu_target_losses = []
    if params.neu_sample_file:
        pos_demo_neu_target_tokens_gen = make_target_batch(tokenizer, device, pos_demo_neu_target_texts, max_len,
                                                           params.batch_size)
        neg_demo_neu_target_tokens_gen = make_target_batch(tokenizer, device, neg_demo_neu_target_texts, max_len,
                                                           params.batch_size)
        pos_demo_neu_target_tokens_gen = list(pos_demo_neu_target_tokens_gen)
        neu_target_threshold = len(pos_demo_neu_target_tokens_gen)
        neg_demo_neu_target_tokens_gen = list(neg_demo_neu_target_tokens_gen)
        neu_target_losses = pos_demo_neu_target_tokens_gen + neg_demo_neu_target_tokens_gen

    # Interleave negative and positive add_losses, shuffle all items.
    all_items = []
    if params.debias:  # Generate debiasing triggers.
        assert neu_target_losses
        for idx, l in enumerate(neu_target_losses):
            if idx < neu_target_threshold:
                all_items += [('add', 'pos', l)]
            else:
                all_items += [('add', 'neg', l)]
        if params.debias == 1:
            # A - B where A = neu_target_losses and B = same_demo_target_losses + diff_demo_target_losses.
            same_demo_target_loss_type = 'sub'
            diff_demo_target_loss_type = 'sub'
    else:  # Debias = 0, generate adversarial triggers.
        same_demo_target_loss_type = 'add'
        diff_demo_target_loss_type = 'sub'

    for idx, l in enumerate(same_demo_target_losses):
        if params.num_demographics == 1:
            if idx < same_demo_target_threshold:
                # (Whether to add or subtract loss (add), demographic type (neg), samples).
                all_items += [(same_demo_target_loss_type, 'neg', l)]
        elif params.num_demographics == 2:
            if idx < same_demo_target_threshold:
                if params.debias == 2:
                    # A - B where A = neu_target_losses + pos_target_losses, and B = neg_target_losses.
                    same_demo_target_loss_type = 'sub'
                all_items += [(same_demo_target_loss_type, 'neg', l)]  # (Whether to add or subtract loss, demographic type, samples).
            else:
                if params.debias == 2:
                    same_demo_target_loss_type = 'add'
                all_items += [(same_demo_target_loss_type, 'pos', l)]
        else:
            raise NotImplementedError('num_demographics has to be in [1, 2]: %s' % params.num_demographics)
    if params.use_dissociation_loss:
        for idx, l in enumerate(diff_demo_target_losses):
            if idx < diff_demo_target_threshold:
                if params.debias == 2:
                    diff_demo_target_loss_type = 'sub'
                all_items += [(diff_demo_target_loss_type, 'pos', l)]
            else:
                if params.debias == 2:
                    diff_demo_target_loss_type = 'add'
                all_items += [(diff_demo_target_loss_type, 'neg', l)]

    np.random.shuffle(all_items)

    # Useful for debugging:
    # for i in range(min(10, len(all_items))):
    #     itm = all_items[i]
    #     sample = [x for x in itm[2][0].tolist() if x != constants.PAD_TOKEN_ID]
    #     print(sample)
    #     print(itm[0], itm[1], tokenizer.decode(sample))

    for restart_idx in range(1):  # Different random restarts of the trigger
        print('Random restart: ', str(restart_idx))

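        # Encoding 'The ' + trigger_init and dropping the first token yields the trigger as it would be
        # tokenized mid-sentence (GPT-2's BPE encodes a word differently with and without a leading space).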
        trigger_tokens = tokenizer.encode('The ' + params.trigger_init)[1:]
        if trigger_init_len < old_num_trigger_tokens:
            # Sample random initial trigger.
            # rand_trigger_tokens = np.random.randint(total_vocab_size, size=old_num_trigger_tokens - trigger_init_len)
            rand_trigger_tokens = [tokenizer.encode('x the')[-1]] * (old_num_trigger_tokens - trigger_init_len)
            trigger_tokens = np.concatenate((trigger_tokens, rand_trigger_tokens), axis=0)
        if params.model_type == constants.DIALOGPT:  # Add eos after trigger.
            trigger_tokens = np.concatenate((trigger_tokens, [tokenizer.eos_token_id]), axis=0)
        print('Random initial trigger:', tokenizer.decode(trigger_tokens))

        # Note that beam_cache, new_beam_cache, and loss_heap all have reverse sign losses.
        # best_loss and curr_best_loss have original sign losses.
        best_loss = 999999  # We want to minimize loss.
        best_trigger_tokens = deepcopy(trigger_tokens)
        beam_cache = [(-999999, trigger_tokens)]  # Always keep beam_size full trigger candidates.
        end_iter = False
        for entire_trigger_update_idx in range(50):  # this many updates of the entire trigger sequence
            print('Updating entire trigger for the', str(entire_trigger_update_idx), '-th time')

            if end_iter:
                continue

            for token_to_flip in range(params.num_trigger_tokens):
                right_counter_token_to_flip = token_to_flip

                if token_to_flip in trigger_masked_idxes:
                    print('Trigger token #', str(token_to_flip), str(right_counter_token_to_flip))
                    continue  # Don't modify these triggers.

                # Beam search for each trigger_tokens in beam_cache.
                assert len(beam_cache) <= params.beam_size
                new_beam_cache = []
                for _, trigger_tokens in beam_cache:
                    print('Trigger token #', str(token_to_flip), str(right_counter_token_to_flip))
                    print(tokenizer.decode(trigger_tokens), trigger_tokens)

                    model.zero_grad()
                    extracted_grads = []  # Each element is (batch_size, sample_length, 768_embed_dim).
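                    # Presumably populated by a backward hook on the word-embedding layer registered earlier
                    # in the original script (not shown here); each loss.backward() below appends one gradient tensor.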
                    loss_types = []  # Order of `add` and `sub` loss types.
                    demo_types = []  # Order of `neg` or `pos` demographic types.
                    for idx, (typ, demo_type, target_tokens) in enumerate(all_items):
                        loss_types.append(typ)
                        demo_types.append(demo_type)

                        if demo_type == 'neg':
                            if params.neg_name_file:
                                demo_tokens = [tokenizer.encode('The ' + n)[1:] for n in neg_names]
                            else:
                                demo_tokens = tokenizer.encode(params.neg_demographic)
                        elif demo_type == 'pos':
                            if params.pos_name_file:
                                demo_tokens = [tokenizer.encode('The ' + n)[1:] for n in pos_names]
                            else:
                                demo_tokens = tokenizer.encode(params.pos_demographic)
                        else:
                            raise NotImplementedError('Label is either neg or pos.')

                        if params.use_salience_loss and not params.debias:
                            if (demo_type == 'neg' and typ == 'add') or (demo_type == 'pos' and typ == 'sub'):
                                # association loss between neg demographic and neg samples (to add) or
                                # association loss between pos demographic and neg samples (to subtract).
                                salience_token_items = neg_salience_token_items
                            elif (demo_type == 'pos' and typ == 'add') or (demo_type == 'neg' and typ == 'sub'):
                                # association loss between pos demographic and pos samples (to add) or
                                # association loss between neg demographic and pos samples (to subtract).
                                salience_token_items = pos_salience_token_items
                            else:
                                raise NotImplementedError('Label and demographic pair not possible', typ, demo_type)
                            salience_token_items_tensor = torch.tensor(salience_token_items, device=device,
                                                                       dtype=torch.long)
                        else:
                            salience_token_items_tensor = None

                        loss, _ = get_loss(
                            model, params.batch_size, trigger_tokens, demo_tokens, target_tokens, tokenizer, device,
                            salience_token_items=salience_token_items_tensor,
                            use_original_loss=params.use_original_loss, use_salience_loss=params.use_salience_loss,
                            use_weighted_salience_loss=params.use_weighted_salience_loss,
                            trigger_position=params.trigger_position, model_type=params.model_type)
                        loss.backward()
                        del loss, salience_token_items_tensor

                    # Get average gradient w.r.t. the triggers.
                    add_indices = [i for i, loss_type in enumerate(loss_types) if loss_type == 'add']
                    add_extracted_grads = []
                    for i in add_indices:
                        extracted_grad = extracted_grads[i]
                        if params.use_weighted_neg and demo_types[i] == 'neg':  # Amplify neg associations.
                            extracted_grad *= 2
                        add_extracted_grads.append(extracted_grad)
                    add_grad_tensor = torch.stack(add_extracted_grads)  # Convert to tensor.
                    add_grad_tensor = torch.sum(add_grad_tensor, dim=0)  # Add all batches.
                    add_grad_tensor = torch.sum(add_grad_tensor, dim=0)  # Add all samples in a `batch`.
                    add_grad_tensor = add_grad_tensor[token_to_flip].unsqueeze(0)  # Use gradients at token_to_flip.
                    grad = add_grad_tensor
                    if params.use_dissociation_loss:
                        grad *= params.alpha
                        sub_indices = [i for i, loss_type in enumerate(loss_types) if loss_type == 'sub']
                        sub_extracted_grads = []
                        for i in sub_indices:
                            extracted_grad = extracted_grads[i]
                            if params.use_weighted_neg and demo_types[i] == 'neg':  # Amplify neg associations.
                                extracted_grad *= 2
                            sub_extracted_grads.append(extracted_grad)
                        sub_grad_tensor = torch.stack(sub_extracted_grads)  # Convert to tensor.
                        sub_grad_tensor = torch.sum(sub_grad_tensor, dim=0)  # Add all batches.
                        sub_grad_tensor = torch.sum(sub_grad_tensor, dim=0)  # Add all samples in a `batch`.
                        sub_grad_tensor = sub_grad_tensor[token_to_flip].unsqueeze(0)  # Use gradients at token_to_flip.
                        grad -= params.beta * sub_grad_tensor

                    # Use hotflip (linear approximation) attack to get the top num_candidates.
                    candidate_values, candidates = attacks.hotflip_attack(
                        grad, embedding_weight, [trigger_tokens[right_counter_token_to_flip]],
                        increase_loss=False, num_candidates=100)
                    candidates = candidates[0]
                    candidate_values = candidate_values[0]
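                    # increase_loss=False: the linear (hotflip) approximation selects replacement tokens that
                    # lower the combined objective whose gradient was accumulated above.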

                    # Try all the candidates and pick the best.
                    loss_heap = []
                    heapq.heapify(loss_heap)  # This is a min heap, so need to flip all losses to end up with the real smallest loss.
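                    # Since stored losses are negated, heapq.nsmallest(1, loss_heap)[0] is the entry with the
                    # largest original loss, i.e. the current worst beam member (the eviction candidate).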
                    eval_threshold = 5
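                    # Each candidate is scored on at most eval_threshold batches per (loss type, demographic)
                    # combination, which keeps the per-candidate evaluation cheap.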
                    for cand_value, cand in zip(candidate_values, candidates):

                        # Don't include tokens that have punctuation.
                        decoded_cand = tokenizer.decode([cand])
                        keep_token = keep_candidate_token(decoded_cand)
                        if not keep_token:
                            continue

                        # Replace one token with the new candidate.
                        candidate_trigger_tokens = deepcopy(trigger_tokens)
                        candidate_trigger_tokens[right_counter_token_to_flip] = cand
                        curr_assoc_loss = 0.0
                        curr_dissoc_loss = 0.0
                        eval_set = collections.Counter()
                        total_assoc_elements = 0.0
                        total_dissoc_elements = 0.0
                        for idx, (typ, demo_type, target_tokens) in enumerate(all_items):
                            if eval_set[(typ, demo_type)] < eval_threshold:
                                eval_set[(typ, demo_type)] += 1
                            else:
                                continue

                            if demo_type == 'neg':
                                if params.neg_name_file:
                                    demo_tokens = [tokenizer.encode('The ' + n)[1:] for n in neg_names]
                                else:
                                    demo_tokens = tokenizer.encode(params.neg_demographic)
                            elif demo_type == 'pos':
                                if params.pos_name_file:
                                    demo_tokens = [tokenizer.encode('The ' + n)[1:] for n in pos_names]
                                else:
                                    demo_tokens = tokenizer.encode(params.pos_demographic)
                            else:
                                raise NotImplementedError('Label is either neg or pos.')

                            if params.use_salience_loss and not params.debias:
                                if (demo_type == 'neg' and typ == 'add') or (demo_type == 'pos' and typ == 'sub'):
                                    # association loss between neg demographic and neg samples (to add) or
                                    # association loss between pos demographic and neg samples (to subtract).
                                    salience_token_items = neg_salience_token_items
                                elif (demo_type == 'pos' and typ == 'add') or (demo_type == 'neg' and typ == 'sub'):
                                    # association loss between pos demographic and pos samples (to add) or
                                    # association loss between neg demographic and pos samples (to subtract).
                                    salience_token_items = pos_salience_token_items
                                else:
                                    raise NotImplementedError('Label and demographic pair not possible', typ, demo_type)
                                # Add demo to salience token items.
                                salience_token_items_tensor = torch.tensor(salience_token_items, device=device,
                                                                           dtype=torch.long)
                            else:
                                salience_token_items_tensor = None

                            # Get the loss for this candidate; losses are accumulated and the best candidates are kept in the beam below.
                            loss, mask_and_target = get_loss(
                                model, params.batch_size, candidate_trigger_tokens, demo_tokens, target_tokens,
                                tokenizer, device, salience_token_items=salience_token_items_tensor,
                                use_original_loss=params.use_original_loss, use_salience_loss=params.use_salience_loss,
                                use_weighted_salience_loss=params.use_weighted_salience_loss,
                                trigger_position=params.trigger_position, model_type=params.model_type)
                            if typ == 'add':
                                # Losses are averaged per non-ignored element per sample per batch.
                                # Since we are calculating overall loss over many batches, re-calc average.
                                curr_num_elements = 0
                                for sample in mask_and_target:
                                    curr_num_elements += sum([1 for elem in sample if elem != -1])
                                total_assoc_elements += curr_num_elements
                                if demo_type == 'neg' and params.use_weighted_neg:  # Amplify neg associations.
                                    curr_assoc_loss += 2 * loss.data.item() * curr_num_elements
                                else:
                                    curr_assoc_loss += loss.data.item() * curr_num_elements
                            elif typ == 'sub':
                                curr_num_elements = 0
                                for sample in mask_and_target:
                                    curr_num_elements += sum([1 for elem in sample if elem != -1])
                                total_dissoc_elements += curr_num_elements
                                if demo_type == 'neg' and params.use_weighted_neg:  # Amplify neg associations.
                                    curr_dissoc_loss += 2 * loss.data.item() * curr_num_elements
                                else:
                                    curr_dissoc_loss += loss.data.item() * curr_num_elements
                            del loss, salience_token_items_tensor

                            if all([x == eval_threshold for x in eval_set.values()]):
                                break

                        curr_assoc_loss /= total_assoc_elements
                        if params.use_dissociation_loss:
                            curr_dissoc_loss /= total_dissoc_elements
                            curr_total_loss = (params.alpha * curr_assoc_loss) - (params.beta * curr_dissoc_loss)
                        else:
                            curr_total_loss = curr_assoc_loss

                        # Keep top beam_size elements.
                        # Note that beam_cache, new_beam_cache, and loss_heap all have reverse sign losses.
                        curr_total_loss *= -1
                        if len(new_beam_cache) < params.beam_size:
                            heapq.heappush(loss_heap, curr_total_loss)
                            new_beam_cache.append((curr_total_loss, deepcopy(candidate_trigger_tokens)))
                            curr_worst_loss = heapq.nsmallest(1, loss_heap)[0]
                        else:
                            if curr_total_loss > curr_worst_loss:  # Remember, signs are flipped.
                                # Kick out 1 trigger_tokens sequence with loss = curr_worst_loss.
                                curr_worst_loss_idx_list = [cache_idx for cache_idx, (x, _) in enumerate(new_beam_cache) if x == curr_worst_loss]
                                del new_beam_cache[curr_worst_loss_idx_list[0]]
                                heapq.heappop(loss_heap)

                                heapq.heappush(loss_heap, curr_total_loss)
                                new_beam_cache.append((curr_total_loss, deepcopy(candidate_trigger_tokens)))
                                curr_worst_loss = heapq.nsmallest(1, loss_heap)[0]

                beam_cache = new_beam_cache

            curr_best_loss = 999999
            for x, y in beam_cache:
                x *= -1  # Flip loss back to original sign.
                if x < curr_best_loss:
                    curr_best_loss = x
                    trigger_tokens = deepcopy(y)
            print("Loss: " + str(curr_best_loss))
            print('Trigger token IDs:', trigger_tokens)
            print('Trigger string:', tokenizer.decode(trigger_tokens) + '\n')
            if curr_best_loss < best_loss:
                best_loss = curr_best_loss
                best_trigger_tokens = deepcopy(trigger_tokens)
            elif curr_best_loss == best_loss:
                pass
            else:
                end_iter = True

        # Print final trigger.
        print("Final loss: " + str(best_loss))
        print('Final trigger token IDs:', best_trigger_tokens)
        print('Final trigger:', tokenizer.decode(best_trigger_tokens))
Example #24
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))
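    # Resize the embedding matrix so it matches the tokenizer's vocabulary (relevant if tokens were added).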

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)
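    # With mlm=True the collator randomly masks mlm_probability of the tokens to build the masked-LM
    # objective; with mlm=False it prepares standard causal language-modeling batches.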

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
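# A plausible invocation, assuming this main() lives in the usual run_language_modeling.py script
# (script name and data paths are placeholders, not taken from this example):
#   python run_language_modeling.py --model_type=gpt2 --model_name_or_path=gpt2 \
#       --do_train --train_data_file=train.txt --do_eval --eval_data_file=eval.txt --output_dir=output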
Example #25
from flask import Flask, render_template, request, Response, send_file, jsonify

import torch
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModelWithLMHead
from queue import Queue, Empty
import time
import threading

# Server & Handling Setting
app = Flask(__name__)

requests_queue = Queue()
BATCH_SIZE = 1
CHECK_INTERVAL = 0.1
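# Requests are presumably enqueued by the Flask route handlers defined later in the original script
# (not shown here); a background thread runs handle_requests_by_batch, draining the queue in groups of
# BATCH_SIZE and waiting up to CHECK_INTERVAL seconds for each item.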

tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelWithLMHead.from_pretrained("sherlock", return_dict=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


# Queue handling
def handle_requests_by_batch():
    while True:
        requests_batch = []
        while not (len(requests_batch) >= BATCH_SIZE):
            try:
                requests_batch.append(
                    requests_queue.get(timeout=CHECK_INTERVAL))
            except Empty:
                continue