def getModel(path_config, gpu='0', fp16=False):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    if fp16:
        optimizer = transformers.AdamW(model.parameters(), lr=0.1, correct_bias=True)
        from apex import amp
        fp16_opt_level = 'O1'
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    return model, tokenizer, config, device
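# Usage sketch for getModel above. The JSON config is assumed to contain
# "tokenizer_path" and "model_path" keys; the config path below is hypothetical:
#
#     model, tokenizer, config, device = getModel('config/generate_config.json', gpu='0')
#     ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('你好'))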
def load_model(self, path='model/mini/'):
    # force CPU inference regardless of CUDA availability
    device = "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=path + 'vocab.txt')
    model = GPT2LMHeadModel.from_pretrained(path)
    model.to(device)
    model.eval()
    return model, tokenizer
def main(path_source, path_target, path_vocab, nb_piece, n_ctx):
    # tokenizer_path = '../data/vocab/vocab_god_userdata.txt'
    # tokenized_data_path = '../data/userdata_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=path_vocab)
    build_files(full_tokenizer, path_source, path_target, nb_piece=nb_piece, n_ctx=n_ctx)
def __init__(self, args):
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.batch_size = args.batch_size
    self.tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    self.model = GPT2KWModel.from_pretrained(args.model_path)
    self.model.to(self.device)
    self.model.eval()
    self.keywords_max_length = 64
def main(length, prefix, lucky_mode, rhyme_pattern=''):
    os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'
    batch_size = 1
    nsamples = 1
    temperature = 1
    topk = 8
    topp = 0
    repetition_penalty = 1.0
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file="./cache/vocab_small.txt")
    model = GPT2LMHeadModel.from_pretrained("./model/final_model")
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    while True:
        raw_text = prefix
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=int(length),
                           lucky_mode=lucky_mode,
                           rhyme_pattern=rhyme_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,  # keyword spelling matches generate()'s signature
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    elif item == '[CLS]':
                        text[i] = '\n\n'
                    elif item == '[SEP]':
                        text[i] = '\n'
                    elif item == '@':
                        text[i] = ' '
                if text:
                    text = ''.join(text).replace('##', '').strip()
                if generated == nsamples:
                    break
        return text
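# The token post-processing above recurs throughout these scripts; a minimal
# self-contained sketch of the same rules ([MASK] dropped, [CLS]/[SEP] mapped
# to newlines, wordpiece '##' markers stripped):
def detokenize(tokens):
    out = []
    for tok in tokens:
        if tok == '[MASK]':
            continue  # drop the article-start marker
        elif tok == '[CLS]':
            out.append('\n\n')  # article boundary
        elif tok == '[SEP]':
            out.append('\n')  # sentence boundary
        else:
            out.append(tok)
    return ''.join(out).replace('##', '').strip()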
def main(data_path, idx, dataname, tokenized_data_path, path_vocab, padding):
    # tokenizer_path = '../data/vocab/vocab_god_userdata.txt'
    # tokenized_data_path = '../data/userdata_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=path_vocab)
    build_files(data_path, dataname, tokenized_data_path, full_tokenizer, idx, padding=padding)
def generate(titles, k):
    length = 1024
    temperature = 1
    topk = 8
    topp = 0
    repetition_penalty = 1.5
    save_path = 'generated_template/'
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer('../cache/vocab_small.txt')
    model = GPT2LMHeadModel.from_pretrained('model_template/final_model')
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    with open(save_path + str(k) + '.txt', 'w', encoding='utf-8') as f:
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(titles))
        generated = 0
        out = sample_sequence(n_ctx=n_ctx,
                              model=model,
                              length=length,
                              context=context_tokens,
                              tokenizer=tokenizer,
                              temperature=temperature,
                              top_k=topk,
                              top_p=topp,
                              repitition_penalty=repetition_penalty,  # keyword spelling matches sample_sequence()'s signature
                              device=device)
        out = out.tolist()[0]
        generated += 1
        text = tokenizer.convert_ids_to_tokens(out)
        for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
            if is_word(item) and is_word(text[i + 1]):
                text[i] = item + ' '
        for i, item in enumerate(text):
            if item == '[MASK]':
                text[i] = ''
            if item == '[CLS]' or item == '[SEP]':
                text[i] = '\n'
        print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
        text = ''.join(text).replace('##', '').strip()
        # text = ''.join(text.split('\n')[:-1])
        generated_text = text[-length:]
        text = titles + generated_text
        print(generated_text)
        f.write(text)
        print("=" * 80)
    k = k + 1
    return text, k
def getModel(path_config):
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = 'cpu'
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return model, tokenizer, config
def getModel(path_config, gpu='0'):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return model, tokenizer, config, device
def __init__(self, model_path=MODEL7_PATH, tokenizer_path=TOKEN7_PATH, verbose=0):
    """Init model with given path."""
    super(genModel, self).__init__()
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
    self.n_ctx = self.model.config.n_ctx
    self.past = None  # pre-computed hidden-states
    self.verbose = verbose
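# Usage sketch for the genModel class above (MODEL7_PATH/TOKEN7_PATH are
# module-level constants pointing at a saved GPT2LMHeadModel directory and
# its vocab file; not shown here):
#
#     gm = genModel(verbose=1)
#     ids = gm.tokenizer.convert_tokens_to_ids(gm.tokenizer.tokenize('你好'))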
def main():
    log('start')
    filename = '全唐诗.txt'
    novel = filename.split('.')[0]
    novel_name = pinyin(novel)
    # create the directory layout
    createfilepath(novel_name)
    createfilepath(novel_name + '/data_' + novel_name)
    createfilepath(novel_name + '/data_' + novel_name + '/tokenized')
    createfilepath(novel_name + '/generated_' + novel_name)
    createfilepath(novel_name + '/model_' + novel_name)
    createfilepath(novel_name + '/model_' + novel_name + '/final_model')
    log('directories built')
    # create the required .py files and .bat scripts from templates
    createpyfromtemplate('start-parse-template.bat', novel_name)
    createpyfromtemplate('start-train-template.bat', novel_name)
    createpyfromtemplate('generate_template.py', novel_name)
    createpyfromtemplate('train-template.py', novel_name)
    log('py and bat built')
    # return
    # build the training dataset from the specified novel
    tokenizer_path = 'cache/vocab_small.txt'
    raw_data_path = novel_name + '/data_' + novel_name
    tokenized_data_path = raw_data_path + '/tokenized/'
    from tokenizations import tokenization_bert
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    full_tokenizer.max_len = 999999
    log('building data')
    build_files(filename=filename,
                data_path=raw_data_path,
                tokenized_data_path=tokenized_data_path,
                num_pieces=1,
                full_tokenizer=full_tokenizer,
                min_length=128)
    log('files built')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer_path', type=str, required=True, help='vocabulary file')
    parser.add_argument('--raw_data_path', type=str, required=True, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', type=str, required=True, help='where to store the tokenized corpus')
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=10, type=int, required=False, help='minimum article length to include')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    from tokenizations import tokenization_bert
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)
    print('building files...')
    build_files(data_path=args.raw_data_path,
                tokenized_data_path=args.tokenized_data_path,
                num_pieces=args.num_pieces,
                full_tokenizer=full_tokenizer,
                min_length=args.min_length)
    print('files built')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", type=str, default="cache/vocab.txt", required=False,
                        help="vocabulary path")
    parser.add_argument("--input_file", type=str, required=True,
                        help="input text file; format: one sentence per line, docs separated by a blank line")
    parser.add_argument("--output_file", type=str, required=True,
                        help="output file; format: token ids")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # parameter checking
    if not os.path.exists(args.input_file) or not os.path.isfile(args.input_file) \
            or not os.access(args.input_file, os.R_OK):
        logger.error("Input file [%s] does not exist or is not readable!" % args.input_file)
        sys.exit(1)
    if not os.path.exists(args.vocab_file) or not os.path.isfile(args.vocab_file) \
            or not os.access(args.vocab_file, os.R_OK):
        logger.error("Vocab file [%s] does not exist or is not readable!" % args.vocab_file)
        sys.exit(1)
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.vocab_file)
    convert(args.input_file, args.output_file, full_tokenizer)
    logger.info("Data conversion finished!")
from tokenizations import tokenization_bert

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # choose which GPUs the program may use
tokenizer_path = "./model/gpt2_prose/vocab.txt"
model_config = "./model/model_godText/final_model/config.json"
model_path = "./model/model_godText/final_model/"
save_samples_path = "./test_godText/finetuned/"
length = 50
batch_size = 4
nsamples = 10
temperature = 1.0
topk = 8
topp = 0
repetition_penalty = 1.0
device = 'cpu'
tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to(device)
model.eval()

params = list(model.parameters())
k = 0
for i in params:
    l = 1
    # print("layer shape: " + str(list(i.size())))
    for j in i.size():
        l *= j
    # print("layer parameter count: " + str(l))
    k = k + l
print("total number of parameters: %dM" % int(k / 1024 / 1024))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--nsamples', default=8, type=int, required=False, help='number of generated samples')
    parser.add_argument('--temperature', default=1, type=float, required=False)
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top p sampling')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--model_path', default='model/model_epoch24', type=str, required=False, help='pre-trained model dir')
    parser.add_argument('--prefix', default='仁义礼智信', type=str, required=False, help='prefix of the couplet')
    parser.add_argument('--save_samples', action='store_true', help='save samples')
    parser.add_argument('--save_samples_path', default='data', type=str, required=False, help='save the samples to this dir')
    parser.add_argument('--bow', action='store_true', help='use PPLM-BOW')
    parser.add_argument('--bow_path', default='data/bow_newyear.txt', type=str, required=False, help='path of the bag of considered characters')
    parser.add_argument('--bow_stepsize', default=0.3, type=float, required=False, help='stepsize of the PPLM')
    parser.add_argument('--bow_num_iterations', default=3, type=int, required=False, help='num_iterations of the PPLM')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, output_hidden_states=True)
    model.to(device)
    model.eval()

    if args.prefix.find("|") < 0:
        length = model.config.n_ctx - len(args.prefix)
    else:
        length = 2 * args.prefix.index("|") - len(args.prefix) + 1

    bow_vectors = None
    if args.bow:
        bow_vectors = build_bow_vectors(args.bow_path, tokenizer, device)

    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/generated.txt', 'w', encoding='utf8')

    context_tokens = [tokenizer.convert_tokens_to_ids('[MASK]')] + \
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(args.prefix))
    for t in range(args.nsamples):
        out = sample_sequence(model, context_tokens, length,
                              temperature=args.temperature,
                              top_k=args.topk,
                              top_p=args.topp,
                              device=device,
                              bow=args.bow,
                              bow_vectors=bow_vectors,
                              bow_stepsize=args.bow_stepsize,
                              bow_num_iterations=args.bow_num_iterations,
                              tokenizer=tokenizer)
        text = tokenizer.convert_ids_to_tokens(out)
        for i, item in enumerate(text):
            if item == '[MASK]':
                text[i] = '上联:'
            elif item == '[PAD]':
                text[i] = ' '
            elif item == '|':
                text[i] = ' 下联:'
        print("=" * 40 + " SAMPLE " + str(t + 1) + " " + "=" * 40 + "\n")
        text = ''.join(text)
        print(text)
        if args.save_samples:
            samples_file.write(text + '\n')
    if args.save_samples:
        samples_file.close()
def main_seg(data_path, dataname):
    tokenizer_path = '../model/model_dabaigou_seg/vocab.txt'
    tokenized_data_path = '../data/dabaigou_tokenized_seg/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    build_files_seg(data_path, dataname, tokenized_data_path, full_tokenizer)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='raw corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False, help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the sliding data window')
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the starting model')
    parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='output dir for the result')
    parser.add_argument('--output_name', default='result.txt', type=str, required=False, help='output file name')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # if args.no_wordpiece:
    #     from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    # else:
    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    # model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    # print('config:\n' + model_config.to_json_string())
    # n_ctx = model_config.n_ctx
    n_ctx = 1024
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the dataset from scratch
    batch_size = args.batch_size
    log_step = args.log_step
    stride = args.stride
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    assert args.pretrained_model != '', 'you need to specify a trained model.'
    model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.eval()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    # full_len = 0
    # print('calculating total steps')
    # for i in tqdm(range(num_pieces)):
    #     with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
    #         full_len += len([int(item) for item in f.read().strip().split()])
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    total_loss = 0
    total_steps = 0

    # eval
    now = datetime.now()
    print('time: {}'.format(now))
    piece_num = 0
    for i in range(num_pieces):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            line = f.read().strip()
        tokens = line.split()
        tokens = [int(token) for token in tokens]
        start_point = 0
        samples = []
        while start_point < len(tokens) - n_ctx:
            samples.append(tokens[start_point:start_point + n_ctx])
            start_point += stride
        start_point -= stride
        # pad the tail window to n_ctx and evaluate it as well
        # (the original built `last` but never used it, and wrapped the pad ids
        # in an extra list; both fixed here)
        last = tokens[start_point + n_ctx:]
        last.extend([full_tokenizer.convert_tokens_to_ids('[PAD]')] * (n_ctx - len(last)))
        samples.append(last)
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):  # drop last
            # prepare data
            batch = samples[step * batch_size:(step + 1) * batch_size]
            batch_labels = []
            batch_inputs = []
            for ids in batch:
                int_ids_for_labels = [int(x) for x in ids]
                int_ids_for_inputs = [int(x) for x in ids]
                batch_labels.append(int_ids_for_labels)
                batch_inputs.append(int_ids_for_inputs)
            batch_labels = torch.tensor(batch_labels).long().to(device)
            batch_inputs = torch.tensor(batch_inputs).long().to(device)

            # forward pass
            outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
            loss, logits = outputs[:2]

            # get loss
            if multi_gpu:
                loss = loss.mean()
            total_loss += loss.item()
            total_steps += 1
            overall_step += 1
            if (overall_step + 1) % log_step == 0:
                print('now time: {}:{}. Step {} of piece {}, ppl {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, (step + 1), piece_num,
                    torch.exp(loss)))
        piece_num += 1

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    with open(args.output_dir + args.output_name, 'w') as f:
        f.write(str(total_loss / total_steps))
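# The evaluation above prints torch.exp(loss) per step and writes the mean
# cross-entropy at the end; a minimal sketch of turning that mean loss into
# corpus-level perplexity (names illustrative):
import math

def mean_loss_to_perplexity(total_loss, total_steps):
    # perplexity = exp(mean token-level cross-entropy)
    return math.exp(total_loss / total_steps)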
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--length', default=-1, type=int, required=False, help='length to generate')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature; higher is more random')
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top-k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top-p (nucleus) sampling')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='where to save the generated files')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='how many articles to generate per title')
    parser.add_argument('--titles', default='萧炎', type=str, required=False, help='list of titles as one space-separated string')
    parser.add_argument('--titles_file', default='', type=str, required=False, help='file with one title per line; if set, --titles is ignored')
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    titles = args.titles.split()  # list of titles to generate from (was args.title, a typo)
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where to save the output

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = pytorch_transformers.GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            # one file per (title, article) pair; the original str(i * j) collided
            with open(save_path + '{}-{}'.format(i, j), 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(model=model,
                                      length=length,
                                      context=context_tokens,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      device=device)
                out = out.tolist()
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for idx, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[idx + 1]):
                        text[idx] = item + ' '
                for idx, item in enumerate(text):
                    if item == '[MASK]':
                        text[idx] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[idx] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text)
                print("=" * 80)
def main(data_path, dataname):
    tokenizer_path = '../model/gpt2_prose/vocab.txt'
    tokenized_data_path = '../data/dabaigou_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    build_files(data_path, dataname, tokenized_data_path, full_tokenizer)
    shutil.rmtree(data_path)
# tail of gen(): finish the [CLS]/[SEP] replacement, print each sample, and
# optionally append it to samples_file
                    text[i] = '\n\n'
                elif item == '[SEP]':
                    text[i] = '\n'
            info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
            print(info)
            text = ''.join(text).replace('##', '').strip()
            print(text)
            res.append(text[len(raw_text):])
            if save_samples:
                # samples_file.write(info)
                samples_file.write(raw_text + text)
                samples_file.write('\n')
                # samples_file.write('=' * 90)
                # samples_file.write('\n' * 2)
            print("=" * 80)
            if generated == nsamples:
                # close file when finish writing.
                if save_samples:
                    samples_file.close()
                break
    return res


if __name__ == '__main__':
    from tokenizations import tokenization_bert
    device = "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file='cache/vocab.txt')
    model = GPT2LMHeadModel.from_pretrained('model/covi_final_model/')
    model.to(device)
    model.eval()
    print(gen("", model, tokenizer, 3, 50, 1.0))
app = Flask(__name__, static_url_path='')

config_file = os.environ.get("config_file")
if config_file is None:
    config_file = "songci.json"
print("using config_file: %s" % config_file)

args = {}
try:
    args = json.loads(open(config_file).read())
except Exception as e:
    print(e)
    sys.exit(1)

unk_idx = open(args['vocab_path']).read().split('\n').index('[UNK]')
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = tokenization_bert.BertTokenizer(vocab_file=args['vocab_path'])
model = GPT2LMHeadModel.from_pretrained(args['model_path'])
model.to(device)
model.eval()


@app.route("/api")
def get():
    prefix = request.args.get('text', '')
    length = args['length']  # 101
    temperature = 1
    topk = 8
    topp = 0
    context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prefix))
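# Hypothetical client for the /api route above; host and port are assumptions
# (Flask's defaults), not given by the source:
import requests

resp = requests.get('http://127.0.0.1:5000/api', params={'text': '春'})
print(resp.text)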
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model configuration file')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/samples.json', type=str, required=False, help='path of the samples file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='save the tokenized samples file to this dir')
    parser.add_argument('--raw', action='store_true',
                        help='do tokenize before training, no need if already tokenized with same configuration')
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps', default=4000, type=int, required=False)
    parser.add_argument('--log_step', default=4000, type=int, required=False, help='period of reporting loss')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='save the model to this dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pre-trained model dir')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
    print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / n_ctx * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except Exception:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []
    # non-overlapping n_ctx windows, plus one tail window aligned to the end
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)
        for step in range(len(samples2) // batch_size):  # drop last
            # prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)

            # forward pass (language modeling: labels are the inputs themselves)
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            # loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                # tb_writer is assumed to be a tensorboard SummaryWriter created at module level
                tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, step + 1, epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # alternate between two checkpoint dirs to save disk space
        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(temp_epoch))
        # torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        # torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
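# A minimal sketch of the gradient-accumulation pattern used in the training
# loop above: each micro-batch loss is scaled by 1/gradient_accumulation so
# the summed gradients match one effective large batch, and optimizer.step()
# fires once per `gradient_accumulation` micro-batches (model and optimizer
# are assumed given; clipping and scheduling omitted for brevity):
def train_with_accumulation(model, optimizer, batches, gradient_accumulation):
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(batches):
        loss = model(input_ids=inputs, labels=labels)[0]
        (loss / gradient_accumulation).backward()  # scale so grads sum to a full batch
        if (step + 1) % gradient_accumulation == 0:
            optimizer.step()
            optimizer.zero_grad()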
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='generation device')
    parser.add_argument('--length', default=-1, type=int, required=False, help='length to generate')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='generation batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='number of samples to generate')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature')
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top-k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top-p (nucleus) sampling')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
    parser.add_argument('--prefix', default='美国', type=str, required=False, help='beginning of the generated article')
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sample_sequence(model=model,
                                  length=length,
                                  context=context_tokens,
                                  temperature=temperature,
                                  top_k=topk,
                                  top_p=topp,
                                  device=device)
            out = out.tolist()
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
                print("=" * 80)
temperature = 1
topk = 8
topp = 0
model_config = 'config/model_config_small.json'
tokenizer_path = 'cache/vocab_small.txt'
model_path = 'model/final_model'
no_wordpiece = False
segment = False
fast_pattern = False
# self.save_samples = False
# self.save_samples_path = 'mnt/'
repetition_penalty = 1.0
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = tokenization_bert.BertTokenizer(vocab_file="cache/vocab_small.txt")


class Generate:
    def is_word(self, word):
        for item in list(word):
            if item not in 'qwertyuiopasdfghjklzxcvbnm':
                return False
        return True

    def _is_chinese_char(self, char):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        cp = ord(char)
        # standard BERT CJK codepoint ranges
        if ((0x4E00 <= cp <= 0x9FFF) or
                (0x3400 <= cp <= 0x4DBF) or
                (0x20000 <= cp <= 0x2A6DF) or
                (0x2A700 <= cp <= 0x2B73F) or
                (0x2B740 <= cp <= 0x2B81F) or
                (0x2B820 <= cp <= 0x2CEAF) or
                (0xF900 <= cp <= 0xFAFF) or
                (0x2F800 <= cp <= 0x2FA1F)):
            return True
        return False
# tail of build_files(): filter and encode each piece, then write its token ids
        sublines = [full_tokenizer.tokenize(line)
                    for line in sublines if len(line) > min_length]  # only keep sentences longer than min_length
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]
        full_line = []
        for subline in sublines:
            full_line.append(full_tokenizer.convert_tokens_to_ids('[MASK]'))  # [MASK] marks the start of an article
            full_line.extend(subline)
            full_line.append(full_tokenizer.convert_tokens_to_ids('[CLS]'))  # [CLS] marks the end of an article
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for id in full_line:
                f.write(str(id) + ' ')
    print('finish')


data_path = "data/train.txt"
tokenized_data_path = "data/tokenized/"
num_pieces = 100
full_tokenizer = tokenization_bert.BertTokenizer(vocab_file="vocab/vocab.txt")
full_tokenizer.max_len = 999999
min_length = 2

if __name__ == '__main__':
    build_files(data_path=data_path,
                tokenized_data_path=tokenized_data_path,
                num_pieces=num_pieces,
                full_tokenizer=full_tokenizer,
                min_length=min_length)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='where the tokenized corpus is stored')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--save_per_step', default=10000, type=int, required=False, help='save a checkpoint every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard dir')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer,
                                                          warmup_steps=warmup_steps,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    running_step = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                running_loss += loss.item()
                running_step += 1
                mean_loss = running_loss * gradient_accumulation / running_step
                if (step + 1) % gradient_accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    # how many steps to print loss log
                    if overall_step % log_step == 0:
                        now_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        print('now time: {}: Step {} of piece {} of epoch {}. Global Step: {}, Mean Loss: {}'.format(
                            now_time, (step + 1) // gradient_accumulation,
                            piece_num, epoch + 1, overall_step, mean_loss))
                    # how many steps to save a checkpoint
                    if overall_step % args.save_per_step == 0:
                        if not os.path.exists(os.path.join(output_dir, "model_step_%d" % (overall_step + 1))):
                            os.mkdir(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
            piece_num += 1

        # save model per epoch
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))):
            os.mkdir(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    # save the final model
    print('training finished')
    if not os.path.exists(os.path.join(output_dir, 'final_model')):
        os.mkdir(os.path.join(output_dir, 'final_model'))
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(os.path.join(output_dir, 'final_model'))
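# A self-contained sketch of the stride-based windowing used in the training
# loop above: overlapping n_ctx-long samples are cut from the token stream
# every `stride` tokens, so stride < n_ctx yields overlapping windows:
def cut_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    return samples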
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='trained model')
    parser.add_argument('--tokenizer_path', default='model/final_model/vocab.txt', type=str, required=False, help='tokenizer')
    parser.add_argument('--inputs', default='[CLS][MASK]', type=str, required=False, help='beginning of generated text')
    parser.add_argument('--length', default=100, type=int, required=False, help='generated length')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='sampling temperature; higher is more random')
    parser.add_argument('--topk', default=8, type=int, required=False, help='top-k filtering')
    parser.add_argument('--topp', default=0, type=float, required=False, help='top-p filtering')
    # parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    length = args.length if args.length > 0 else n_ctx
    past = None  # pre-computed hidden-states

    while True:
        para_tokens = []  # generated tokens
        # inputs = args.inputs
        inputs = input("In: ")
        if inputs == "":
            inputs = "[CLS][MASK]"
        context = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(inputs))
        context_tensor = torch.LongTensor(context).view(1, -1).to(device)
        _, past = model(context_tensor[:, :-1], past)[:2]  # prepare past
        prev = context_tensor[:, -1].view(1, -1)  # minimize context to speed up
        para_tokens += context
        generate, past = gen_paragraph(model, prev, past,
                                       length=length,
                                       temperature=args.temperature,
                                       topk=args.topk,
                                       topp=args.topp,
                                       device=device)
        para_tokens += generate
        para_word = tokenizer.convert_ids_to_tokens(para_tokens)
        # for i, item in enumerate(para_word):
        #     if item == '[MASK]' or item == '[UNK]':
        #         para_word[i] = ''
        #     elif item == '[CLS]':
        #         para_word[i] = '\n\n'
        #     elif item == '[SEP]':
        #         para_word[i] = '\n'
        para_text = ''.join(para_word).strip()
        print(para_text)
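# Sketch of why `past` is threaded through the loop above: caching the
# attention key/value states lets each step feed only the newest token
# instead of re-encoding the whole context. Assumes the older transformers
# API used throughout these snippets, where forward(input_ids, past)
# returns (logits, presents):
def step_with_cache(model, prev, past):
    logits, past = model(prev, past)[:2]
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy, for illustration
    return next_token, past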