Example #1
def load_model(target_folder, config):
    # Parse parameters
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info("Loading the model...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    # Tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))
    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))
    # Weights
    state_dict_path = glob(os.path.join(target_folder, '*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)
    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, tokenizer
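For reference, a minimal usage sketch (not part of the original snippet), assuming the function above lives in a module with the usual imports and that ./dialogpt-small is a hypothetical folder containing vocab.json, merges.txt, config.json and a single *.pkl state dict:

import configparser
import torch

cfg = configparser.ConfigParser()
cfg.read_dict({'model': {'model_size': 'small', 'no_cuda': 'false'}})

model, tokenizer = load_model('./dialogpt-small', cfg)  # hypothetical folder
input_ids = torch.tensor([tokenizer.encode("Hello there!")])
logits = model(input_ids.to(next(model.parameters()).device))[0]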
Example #2
def main():
    parser = argparse.ArgumentParser()

    ## Parameters (all optional; defaults point at local GPT-2 files)
    parser.add_argument("--config_path",
                        default="../../models/gpt2/gpt2-config.json",
                        type=str,
                        required=False)
    parser.add_argument("--model_path",
                        default="../../models/gpt2/gpt2-pytorch_model.bin",
                        type=str,
                        required=False)
    parser.add_argument("--vocab_path",
                        default="../../models/gpt2/gpt2-vocab.json",
                        type=str,
                        required=False)
    parser.add_argument("--merges_path",
                        default="../../models/gpt2/gpt2-merges.txt",
                        type=str,
                        required=False)
    parser.add_argument(
        "--sentence",
        default="In this article, I am excited to take you through",
        type=str,
        required=False)
    args = parser.parse_args()

    config = GPT2Config.from_pretrained(args.config_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config)
    tokenizer = GPT2Tokenizer(args.vocab_path, args.merges_path)
    # logging.basicConfig(filename="default.txt", level=logging.DEBUG, filemode='w')
    # gpt2_generate_greedy(model, tokenizer, sentence=sys.argv[1])
    gpt2_generate_beam_search(model, tokenizer, sentence=args.sentence)
Example #3
    def __init__(self, vocab_file, merge_file):
        name = "GPT2 BPE"
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.eod_id = self.tokenizer.encoder["<|endoftext|>"]
Example #4
def _load_mscoco(invalid_vocab_times=0):
    from transformers import GPT2Tokenizer
    toker = PretrainedTokenizer(
        GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json',
                      './tests/dataloader/dummy_gpt2vocab/merges.txt'))
    return MSCOCO("./tests/dataloader/dummy_mscoco#MSCOCO",
                  tokenizer=toker,
                  pretrained='gpt2',
                  min_rare_vocab_times=invalid_vocab_times)
Example #5
def _load_opensubtitles(invalid_vocab_times=0):
    from transformers import GPT2Tokenizer
    toker = PretrainedTokenizer(
        GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json',
                      './tests/dataloader/dummy_gpt2vocab/merges.txt'))
    return OpenSubtitles(
        "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
        tokenizer=toker,
        pretrained='gpt2',
        min_rare_vocab_times=invalid_vocab_times)
Example #6
    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text, add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
Example #7
def load_gpt_tokenizer(path):

    tokenizer = GPT2Tokenizer(
        vocab_file=f'./{path}/vocab.json',
        merges_file=f'./{path}/merges.txt',
    )
    # unk_token='<unk>',
    # bos_token='<bos>',
    # eos_token='<eos>')
    return tokenizer
Example #8
def main():
    # Config
    config = InferenceConfig()
    gpt_config = GPT2Config.from_json_file(config.model_config_path)

    # torch related
    torch.set_grad_enabled(False)
    torch.manual_seed(config.random_seed)

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))
    logger.addHandler(handler)

    # Text Utils
    logging.info(f"loading Tokenizer...")
    tokenizer = GPT2Tokenizer(config.tokenizer_vocab_path,
                              config.tokenizer_merge_path)

    # Forward Model
    logging.info(f"loading Forward Model...")
    forward_model = GPT2LMHeadModel(gpt_config)
    forward_model.load_state_dict(
        load_model_weight(gpt_config, config.forward_model_path))

    # Backward Model
    logging.info(f"loading Backward Model...")
    backward_model = GPT2LMHeadModel(gpt_config)
    backward_model.load_state_dict(
        load_model_weight(gpt_config, config.backward_model_path))

    # Example
    example_contexts = [
        "<|endoftext|>".join(["How are you doing?"]),
        "<|endoftext|>".join(["Does money buy happiness?"]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
        ]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
            "What is the best way to buy happiness ?",
        ]),
    ]
    inferencer = Inferencer(config, tokenizer, forward_model, backward_model)
    results = inferencer.run(example_contexts)

    for context, result in zip(example_contexts, results):
        logging.info(f"Example Context:{context}")
        for i, reply in enumerate(result):
            logging.info(f"Output Utterance Top-{i+1}: {reply}")
Example #9
def model_fn(model_dir):
    """
    Load the model for inference
    """

    # Load GPT2 tokenizer from disk.
    vocab_path = os.path.join(model_dir, 'model/vocab.json')
    merges_path = os.path.join(model_dir, 'model/merges.txt')

    tokenizer = GPT2Tokenizer(vocab_file=vocab_path, merges_file=merges_path)

    # Load GPT2 model from disk.
    model_path = os.path.join(model_dir, 'model/')
    model = GPT2LMHeadModel.from_pretrained(model_path)

    return TextGenerationPipeline(model=model, tokenizer=tokenizer)
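The snippet only defines model_fn; a matching predict_fn is not shown. Under the usual SageMaker PyTorch inference conventions it might look roughly like this (the request shape is an assumption):

def predict_fn(data, pipeline):
    # `pipeline` is the TextGenerationPipeline returned by model_fn above
    if isinstance(data, dict):
        text = data.get("inputs", "")          # assumed request format
        parameters = data.get("parameters", {})
    else:
        text, parameters = data, {}
    return pipeline(text, **parameters)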
Example #10
def load_tokenizer(vocab='./tokenizer/vocab.json', merges='./tokenizer/merges.txt', gpt=False, load_from=None):
    if gpt:
        if load_from:
            tokenizer = GPT2Tokenizer.from_pretrained(load_from)
        else:
            tokenizer = GPT2Tokenizer(
                vocab, merges, 
                bos_token=CARD_BEGIN, eos_token=CARD_END, sep_token=CARD_END,
                unk_token=UNK, pad_token=CARD_PAD, mask_token=CARD_MASK, padding_side="left"
            )
    else:
        tokenizer = ByteLevelBPETokenizer(vocab, merges)
        tokenizer.add_special_tokens(SPECIAL_TOKENS + OTHER_TOKENS)
        tokenizer.mask_token = CARD_MASK
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer
Example #11
    def build_input_features(dials, vocab_file, bpe_merges, end_text="<|endoftext|>"):
        tokenizer = GPT2Tokenizer(vocab_file, bpe_merges)
        feature = []
        for dial in dials:
            inputs = sum([tokenizer.encode(u) for u in dial[:-1]], [])
            lm_labels = [-1] * len(inputs) + tokenizer.encode(dial[-1] + end_text)
            token_type_ids = [0] * len(inputs) + [1] * (len(tokenizer.encode(dial[-1] + end_text)))
            weights = [0.0] * len(inputs) + [1.0] * (len(tokenizer.encode(dial[-1] + end_text)))
            input_ids = inputs + tokenizer.encode(end_text + dial[-1])
            input_len = len(input_ids)
            position_ids = list(range(len(input_ids)))

            feat_dict = {"input_ids": input_ids,
                         "position_ids": position_ids,
                         "token_type_ids": token_type_ids,
                         "lm_labels": lm_labels,
                         "weights": weights,
                         "input_len": input_len}
            feature.append(feat_dict)
        return feature
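A small, hypothetical call of the feature builder above, assuming the vocab/merges files exist and the function is reachable at module level; each dialogue is a list of utterance strings whose last element is the response to be learned:

dials = [["How are you?", "I am fine, thanks."]]
features = build_input_features(dials, "vocab.json", "merges.txt")
print(features[0]["input_len"], features[0]["token_type_ids"][:5])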
Example #12
def load_model(target_folder_name, config):
    # Parse parameters
    data_folder = config.get('model', 'data_folder')
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info(f"Loading model from {target_folder_name}...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    # Tokenizer
    target_folder = os.path.join(data_folder, target_folder_name)
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))
    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))
    # Weights
    if device.type == 'cuda':
        torch.cuda.set_device(0)
    state_dict_path = glob(os.path.join(target_folder, '*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)
    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, tokenizer
Example #13
def run_test(model_dir, data_dir, mode, config_path='345M/', beam_width=10):
    # build the vocab/merges paths before config_path is overwritten with the config file name
    vocab_path = config_path + 'vocab.json'
    merge_path = config_path + 'merges.txt'
    config_path = config_path + 'config.json'
    checkpoint_path = model_dir + '/GPT_model.pkl'
    log_filename = model_dir + '/test_data.log'

    config = GPT2Config.from_json_file(os.path.join('./configs/', config_path))

    create_log(log_filename)
    print("Building model")
    model = load_model(GPT2LMHeadModel(config), checkpoint_path,
                       test=True).cuda()
    model.eval()
    tokenizer = GPT2Tokenizer(vocab_path, merge_path)
    if mode == 'test':
        print('Loading test dataset...')
        test_data_loader = GPT2DataLoader(data_path=data_dir,
                                          vocab_file=vocab_path,
                                          bpe_merges=merge_path,
                                          bucket=2,
                                          batch_size=1,
                                          max_seq_len=512)
Example #14
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from config import device_f, device_r, num_samples, MMI_temperature, top_k

torch.set_grad_enabled(False)

tokenizer = GPT2Tokenizer('medium/vocab.json', 'medium/merges.txt')

weights = torch.load('medium/medium_ft.pkl')
# fix misused key value
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

cfg = GPT2Config.from_json_file('medium/config.json')
model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
model.load_state_dict(weights)
if device_f == 'cuda':
    model.half()
model.to(device_f)
model.eval()

weights = torch.load('medium/small_reverse.pkl')
# fix misused key value
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

reverse_model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
reverse_model.load_state_dict(weights)
if device_r == 'cuda':
    reverse_model.half()
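The excerpt ends after loading; purely as an illustration (not the original script's code), the reverse model is typically used for MMI reranking by scoring how well a candidate reply predicts the context, along these lines:

def mmi_score(context_ids, reply_ids):
    # Hypothetical helper: feed the reply followed by the context to the reverse model
    # and use the LM loss over the context tokens as a score (with a recent transformers
    # version, -100 is the ignored label index, so the reply positions are masked out).
    input_ids = torch.tensor([reply_ids + context_ids], device=device_r)
    labels = torch.tensor([[-100] * len(reply_ids) + context_ids], device=device_r)
    loss = reverse_model(input_ids, labels=labels)[0]
    return -loss.item()  # higher is better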
Example #15
    #                                   batch_size=1,
    #                                   max_seq_len=512)
    hparams = {'learning_rate': 1e-5,
               'accumulate_step': 2,
               'lr_schedule': 'noam',
               'warmup_steps': 16000,
               'warmup_proportion': 0.1,
               'n_embd': 768,
               'num_optim_steps': 100000,
               'train_batch_size': 1,
               'valid_step': 10000,
               'device': device,
               'vocab_file': vocab_file,
               'bpe_merge': bpe_merges,
               'beam_width': 1,
               'max_len': 1024}
    tokenizer = GPT2Tokenizer(hparams['vocab_file'], hparams['bpe_merge'])
    # test(hparams, model, valid_data_loader)
    chat(model, tokenizer, device)
Example #16
import logging
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from mmi_config import device_f, device_r, num_samples, MMI_temperature, top_k, focus_last_message
import time

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Load and properly initialize models...")

torch.set_grad_enabled(False)

tokenizer = GPT2Tokenizer('vocab.json', 'merges.txt')

weights = torch.load('GP2-pretrain-step-615.pkl')
# weights = torch.load('models/medium/medium_ft.pkl')

# distributed training will prepend weights with 'module.'
for k in list(weights.keys()):
    # strip the prefix if present; otherwise the key is re-inserted unchanged
    weights[k.replace('module.', '', 1)] = weights.pop(k)

# fix misused key value
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)
Example #17
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    json_file_parser = ArgumentParser()
    json_file_parser.add_argument("--config_file", type=str, default=None)
    json_file_parser.add_argument("--tpu_num_cores", type=int, default=None)
    json_parser_args = json_file_parser.parse_args()

    parser = HfArgumentParser([TrainingArguments, ExtraArgs])

    if json_parser_args.config_file is None:
        training_args, extra_args = parser.parse_args_into_dataclasses()
    else:
        training_args, extra_args = parser.parse_json_file(
            json_parser_args.config_file)

    with h5pickle.File("data/train.hdf5",
                       "r",
                       libver="latest",
                       swmr=True,
                       skip_cache=False) as f:
        train_dataset = f["train"]
        val_dataset = f["val"]

        if extra_args.max_n_train is not None:
            train_dataset = train_dataset[:extra_args.max_n_train]

        if extra_args.max_n_val is not None:
            val_dataset = val_dataset[:extra_args.max_n_val]

        model = get_model(extra_args)

        tokenizer = GPT2Tokenizer(
            "data/german_tokenizer_cc/vocab.json",
            "data/german_tokenizer_cc/merges.txt",
        )
        tokenizer.pad_token = tokenizer.eos_token

        name = generate_slug(2)

        if json_parser_args.tpu_num_cores is not None:
            training_args.tpu_num_cores = json_parser_args.tpu_num_cores

        training_args.remove_unused_columns = False
        steps_per_epoch = int(
            len(train_dataset) / training_args.per_device_train_batch_size /
            training_args.gradient_accumulation_steps /
            training_args.tpu_num_cores)
        training_args.steps_per_epoch = steps_per_epoch
        training_args.eval_steps = steps_per_epoch
        training_args.save_steps = (
            steps_per_epoch * training_args.num_train_epochs
        )  # only save once at the end to save space
        training_args.run_name = name
        training_args.output_dir = os.path.join("checkpoints", name)

        trainer = GPT2Trainer(
            model,
            training_args,
            extra_args=extra_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            callbacks=[GPT2WandbCallback],
        )
        trainer.remove_callback(WandbCallback)

        trainer.train()
        print("Done!")
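ExtraArgs is not shown in this excerpt; based on the attributes accessed above, a minimal placeholder compatible with HfArgumentParser might look like this (the real dataclass likely has more fields, e.g. whatever get_model consumes):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ExtraArgs:
    # only the two fields referenced in the excerpt
    max_n_train: Optional[int] = field(default=None)
    max_n_val: Optional[int] = field(default=None)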
Example #18
def main():
    '''
    python -m ipdb run_gpt2.py      \
        --data-path /path/to/americanlit/     \
        --output-dir path/to/checkpoint/     \
        --eval-split valid     \
        --train-n-steps 20000     \
        --validate-every 1000     \
        --sequence-tune-rate 0.0     \
        --mode train \
        --model-name from_scratch \
        --batch-size 32 --seqlen 80 --gradient-accumulation-steps 4

    '''#with this bsz, seqlen, fits to bm gpus

    parser = argparse.ArgumentParser(description='openGPT-2 analysis')

    #debug menu
    parser.add_argument('--debug',
                        action='store_true',
                        help='use dbg1000.jsonl for faster programming')

    #training options
    #--> consider redefining FT...
    parser.add_argument('--mode',
                        choices=[
                            'train', 'FT', 'eval-singletoken',
                            'eval-completion', 'eval-both'
                        ],
                        default='eval-singletoken')
    parser.add_argument(
        '--input-mode',
        choices=['CLM', 'relFT'],
        default='CLM',
        help=
        'determine whether or not to put specials amongst sentences (CLM => do not  /  relFT => do)'
    )
    parser.add_argument('--data-path',
                        default='../jsonlpath/DBG',
                        help='path/to/jsonl/files')

    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument(
        '--model-name',
        choices=['from_scratch', 'gpt2', 'gpt2-medium', 'gpt2-large'],
        default='gpt2')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=777)
    #parser.add_argument('--data-base', type=str)

    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument("--max-steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                            steps to perform. Override num_train_epochs.")
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--gradient-accumulation-steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                            performing a backward/update pass.")
    parser.add_argument('--seqlen', type=int, default=120)
    parser.add_argument(
        '--tolerate_offset',
        type=int,
        default=20,
        help=
        'when training with TPLoss, length to be additionally tolerated to args.seqlen.'
    )
    #training is done upto this step. regardless of args.max_steps or args.num_train_epochs
    parser.add_argument('--train-n-steps', type=int, default=-1)  #10000)

    parser.add_argument('--seqlen-singletoken', type=int, default=1024)
    parser.add_argument('--seqlen-completion', type=int,
                        default=300)  # need to unify both and use only one
    parser.add_argument('--seqlen-train', type=int, default=300)

    parser.add_argument(
        "--output-dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)

    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)

    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max-grad-norm', type=int, default=1)

    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)
    parser.add_argument('--num-workers', type=int, default=0)

    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    ## file below prep'd by flatten.py using amerlit jsonl splits (which are all post processed)
    ## root / 'flattened_amerlit.txt'
    d_root = Path(args.data_path)
    if args.mode == 'FT':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    elif args.mode == 'train':  # train tokenizer based on corpus
        vocab_path = d_root / 'vocab.json'
        rawtxt_path = d_root / 'flattened_amerlit.txt'  # this is obtained by running "python 4_flatten4vocab.py @ dataroot"
        merge_path = d_root / 'merges.txt'

        if not (vocab_path.exists()
                and merge_path.exists()):  #check if vocab file exists
            vocabgenerator = ByteLevelBPETokenizer()
            vocabgenerator.train(str(rawtxt_path),
                                 vocab_size=50_000,
                                 min_frequency=2)
            vocabgenerator.save(
                str(d_root)
            )  # vocabgenerator is also tokenizer but not from transformers
            del vocabgenerator
        tokenizer = GPT2Tokenizer(vocab_path, merge_path, errors='replace')

    # add CLS to the vocab
    # see example here: https://huggingface.co/transformers/model_doc/gpt2.html#transformers.GPT2DoubleHeadsModel.forward
    tokenizer = init_special_tokens(tokenizer)

    dataset_paths = {
        'train': d_root / 'train.jsonl',
        'valid': d_root / 'val.jsonl',
        'test': d_root / 'test.jsonl',
    }  # keep this for later code compatibility albeit it looks crappy

    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    elif args.model_name == 'from_scratch':
        config = GPT2Config()
        config.architectures = ["GPT2LMHeadModel"]
        model = GPT2LMHeadModel(config)

        #mp = GPT2LMHeadModel.from_pretrained('gpt2')
        #pretrained config vs GPT2Config has only difference
        # "architectures": ['GPT2LMHeadModel']
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)

    model.resize_token_embeddings(len(tokenizer))
    model.config.output_hidden_states = True  # make them return output hidden
    model.to(device)
    '''if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)
    '''
    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths, max_len=args.seqlen_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split],
                                     sampler=eval_sampler,
                                     batch_size=1)

        model.eval()

        with torch.no_grad():
            all_text_completions = []

            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader),
                                 desc="Evaluating",
                                 total=len(eval_dataloader)):
                input_sequence = batch[0].cuda()
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(
                    input_sequence, args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch,
                                                     args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(
                        bpe_completion[args.prefix_length:])
                    text_continuations.append(
                        get_text_continuation(bpe_completion, tokenizer, args))
                    all_text_completions.append(
                        tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram
                # (A short continuation may occur due to predicted whitespace, then tokenizing, despite being
                #  normal length in BPE tokens).
                text_continuations = [
                    c for c in text_continuations if len(c) > 3
                ]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

                # Save the (possibly intermediate) metrics.
                save_completion_metrics(bpe_metrics=bpe_ngram_metrics.report(
                    'bpe_%s' % args.eval_split),
                                        word_metrics=word_ngram_metrics.report(
                                            'word_%s' % args.eval_split),
                                        text_completions=all_text_completions,
                                        config=model.config.to_dict(),
                                        args=args)

    if args.mode == 'train':
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss
        if args.debug:
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='dbg1000')
            #for batch in train_seq_dataloader:
            #print(batch.pre_tru.shape)
            #print(batch.pre_fals) # None
            #set_trace()
        else:  # normal (non-debug) training data
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='train')

        # Setup optimizer

        # one of both need to be specified for training
        # args.num_train_epochs  /   args.max_steps
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (args.batch_size * len(
                train_seq_dataloader) // args.gradient_accumulation_steps) + 1

            #if performing gradient accumulation, steps won't update.
            #this means actual epochs training multiplied directly by "gradient_accumulation_steps"

        else:
            t_total = len(
                train_seq_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

            #if not specified,

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    args.warmup_steps, t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader,
                            desc="Training",
                            total=t_total
                            if args.train_n_steps <= 1 else args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                optimizer.zero_grad()

                # Sequence loss
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)

                # Token loss
                else:
                    loss, batch_metrics = token_loss(
                        model, batch, args)  # == mleloss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = f"Training loss: {(epoch_loss/epoch_steps):.2f} lr: {scheduler.get_lr()[0]:.2e}"  # get_last_lr in pytorch 1.4.0
                #tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(epoch_loss/epoch_steps, scheduler.get_lr()[0]) # scheduler.get_last_lr() is for 1.4.0

                logging_outputs.append(batch_metrics)

                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
                        logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(
                        logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    logging_average['ppl'] = 2**logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                if step == args.train_n_steps:
                    break  # here train_n_steps

                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(
                        model, args, dataset_paths, train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(
                            args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(
                            os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs,
                                                 model.config.to_dict(),
                                                 args,
                                                 train_iter=total_steps,
                                                 best=True)
Example #19
def _load_switchboardcorpus(min_rare_vocab_times=0):
    from transformers import GPT2Tokenizer
    toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json',
                                              './tests/dataloader/dummy_gpt2vocab/merges.txt'))
    return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
                             min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="gpt2")
Example #20
def main():
    args = setup_train_args()
    # Log to both a file and the console
    global logger
    logger = create_logger(args)
    # Use the GPU when the user requests it and it is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the CPU RNG so that results are deterministic
    # Seed the current GPU; with multiple GPUs, use torch.cuda.manual_seed_all() to seed all of them
    # When we get a good result we usually want it to be reproducible
    if args.seed:
        set_random_seed(args)

    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Initialize the tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(args.vocab_path, 'vocab.json'),
                              os.path.join(args.vocab_path, 'merges.txt'),
                              pad_token=PAD,
                              cls_token=CLS,
                              sep_token=SEP)
    tokenizer.add_special_tokens({
        "pad_token": PAD,
        "cls_token": CLS,
        "sep_token": SEP
    })
    # Vocabulary size of the tokenizer
    vocab_size = len(tokenizer)

    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the output directory for the dialogue model
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)
    # Create the output directory for the MMI model
    if not os.path.exists(args.mmi_model_output_path):
        os.mkdir(args.mmi_model_output_path)
    # Load the GPT2 model
    # ----------- Model
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw data, converting the raw corpus into the corresponding token ids
    # ----------- Data
    if args.raw and args.train_mmi:  # if we are training the MMI model
        preprocess_mmi_raw_data(args, tokenizer, n_ctx)
    elif args.raw and not args.train_mmi:  # if we are training the dialogue generation model
        preprocess_raw_data(args, tokenizer, n_ctx)  # data preprocessing -- the key step to read
    # Whether to use multiple GPUs for parallel training
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Count the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Load the data
    logger.info("loading training data")
    if args.train_mmi:  # if training the MMI model
        with open(args.train_mmi_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    else:  # if training the dialogue generation model
        with open(args.train_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list,
                                             test_size=0.01,
                                             random_state=1)
    # Start training
    train(model, device, train_list, multi_gpu, args)
    # Evaluate the model
    evaluate(model, device, test_list, multi_gpu, args)
Example #21
    def __init__(
        self,
        model: str = None,
        config: Union[str, GPT2Config] = None,
        vocab_file: str = None,
        merges_file: str = None,
        cache_dir: str = "aitextgen",
        tf_gpt2: str = None,
        to_gpu: bool = False,
        to_fp16: bool = False,
        verbose: bool = False,
        torchscript: bool = False,
        ts_to_trace: bool = False,
        bos_token: str = None,
        eos_token: str = None,
        unk_token: str = None,
        **kwargs,
    ) -> None:

        if not verbose:
            for module in [
                    "transformers.file_utils",
                    "transformers.configuration_utils",
                    "transformers.tokenization_utils",
                    "filelock",
                    "transformers.modeling_gpt2",
            ]:
                logging.getLogger(module).setLevel(logging.WARN)
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.ERROR)

        if torchscript:
            assert model
            logger.info(f"Loading traced GPT-2 model from provided {model}.")
            if config is None:
                config = GPT2Config()
            self.torchscript = True
            self.model = GPT2LMHeadModel(config)

            # Transpose the traced model attributes to a GPT2LMHeadModel class
            # so it can inherit its functions
            pt_model = torch.jit.load(model)
            self.model.transformer = pt_model.transformer
            self.model.lm_head = pt_model.lm_head

        elif tf_gpt2:
            # Download + convert the TF weights if a PyTorch model has not been created
            if not os.path.isfile(
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
                assert tf_gpt2 in [
                    "124M",
                    "355M",
                    "774M",
                    "1558M",
                ], "Invalid TensorFlow GPT-2 model size."

                logger.info(
                    f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                    + "from Google's servers")

                download_gpt2(cache_dir, tf_gpt2)

                logger.info(
                    f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch."
                )

                config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

                convert_gpt2_checkpoint_to_pytorch(
                    os.path.join(cache_dir, tf_gpt2),
                    config_path,
                    cache_dir,
                )

                os.rename(
                    os.path.join(cache_dir, f"pytorch_model.bin"),
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
                )

                os.rename(
                    os.path.join(cache_dir, f"config.json"),
                    os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
                )

            logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
            model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
            config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")

            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

        elif model and os.path.exists(model):
            # A pytorch_model.bin (+ optional config/config.json) is provided
            logger.info(f"Loading GPT-2 model from provided {model}.")
            if config is None:
                config = GPT2Config()
            if ts_to_trace:
                config.torchscript = True
            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)
        elif config:
            if ts_to_trace:
                config.torchscript = True
            # Manually construct a GPT-2 model from scratch
            logger.info("Constructing GPT-2 model from provided config.")
            self.model = AutoModelWithLMHead.from_config(config=config)
        else:
            # Download and cache model from Huggingface
            if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
                logger.info(
                    f"Loading {model or 'gpt2'} model from /{cache_dir}.")
            else:
                logger.info(
                    f"Downloading {model or 'gpt2'} model to /{cache_dir}.")
            self.model = GPT2LMHeadModel.from_pretrained(
                model or "gpt2", cache_dir=cache_dir, torchscript=ts_to_trace)
            if model and "gpt2" not in model:
                logger.info(f"Using the tokenizer for {model}.")
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    model,
                    cache_dir=cache_dir,
                )

        if self.tokenizer is None:
            # Update tokenizer settings (if not set already)
            args = locals()
            custom_tokenizer = False
            for attr in [
                    "vocab_file",
                    "merges_file",
                    "bos_token",
                    "eos_token",
                    "unk_token",
            ]:
                if args[attr] is not None:
                    custom_tokenizer = True
                    setattr(self, attr, args[attr])

            if custom_tokenizer:
                logger.info("Using a custom tokenizer.")
            else:
                logger.info("Using the default GPT-2 Tokenizer.")

            self.tokenizer = GPT2Tokenizer(
                vocab_file=self.vocab_file,
                merges_file=self.merges_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
            )

        if to_gpu:
            if to_fp16:
                self.to_fp16()
            self.to_gpu()
Example #22
    def __init__(self, token_json, merges):
        self.tokenizer = GPT2Tokenizer(token_json, merges)
        # not sure which pad token should be used; '!' maps to token id 0
        self.tokenizer.pad_token = '!'
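Since the snippet is unsure about the pad token, a quick sanity check (hypothetical file paths) confirms what id the choice maps to:

tok = GPT2Tokenizer('vocab.json', 'merges.txt')
tok.pad_token = '!'
print(tok.convert_tokens_to_ids(tok.pad_token))  # 0 with the standard GPT-2 vocab.json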