def getModel(path_config, gpu='0', fp16=False):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    if fp16:
        optimizer = transformers.AdamW(model.parameters(), lr=0.1, correct_bias=True)
        from apex import amp
        fp16_opt_level = 'O1'
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    return model, tokenizer, config, device
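# Usage sketch for getModel above. The JSON config is assumed to contain
# "tokenizer_path" and "model_path" keys; the config path below is hypothetical:
#
#     model, tokenizer, config, device = getModel('config/generate_config.json', gpu='0')
#     ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('你好'))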
def load_model(self, path='model/mini/'):
    # force CPU inference regardless of CUDA availability
    device = "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=path + 'vocab.txt')
    model = GPT2LMHeadModel.from_pretrained(path)
    model.to(device)
    model.eval()
    return model, tokenizer
def main(path_source, path_target, path_vocab, nb_piece, n_ctx):
    # tokenizer_path = '../data/vocab/vocab_god_userdata.txt'
    # tokenized_data_path = '../data/userdata_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=path_vocab)
    build_files(full_tokenizer, path_source, path_target, nb_piece=nb_piece, n_ctx=n_ctx)
def __init__(self, args):
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.batch_size = args.batch_size
    self.tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    self.model = GPT2KWModel.from_pretrained(args.model_path)
    self.model.to(self.device)
    self.model.eval()
    self.keywords_max_length = 64
def main(length, prefix, lucky_mode, rhyme_pattern=''):
    os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'
    batch_size = 1
    nsamples = 1
    temperature = 1
    topk = 8
    topp = 0
    repetition_penalty = 1.0
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file="./cache/vocab_small.txt")
    model = GPT2LMHeadModel.from_pretrained("./model/final_model")
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    while True:
        raw_text = prefix
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=int(length),
                           lucky_mode=lucky_mode,
                           rhyme_pattern=rhyme_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,  # keyword spelling matches generate()'s signature
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    elif item == '[CLS]':
                        text[i] = '\n\n'
                    elif item == '[SEP]':
                        text[i] = '\n'
                    elif item == '@':
                        text[i] = ' '
                if text:
                    text = ''.join(text).replace('##', '').strip()
                if generated == nsamples:
                    break
        return text
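# The token post-processing above recurs throughout these scripts; a minimal
# self-contained sketch of the same rules ([MASK] dropped, [CLS]/[SEP] mapped
# to newlines, wordpiece '##' markers stripped):
def detokenize(tokens):
    out = []
    for tok in tokens:
        if tok == '[MASK]':
            continue  # drop the article-start marker
        elif tok == '[CLS]':
            out.append('\n\n')  # article boundary
        elif tok == '[SEP]':
            out.append('\n')  # sentence boundary
        else:
            out.append(tok)
    return ''.join(out).replace('##', '').strip()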
def main(data_path, idx, dataname, tokenized_data_path, path_vocab, padding):
    # tokenizer_path = '../data/vocab/vocab_god_userdata.txt'
    # tokenized_data_path = '../data/userdata_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=path_vocab)
    build_files(data_path, dataname, tokenized_data_path, full_tokenizer, idx, padding=padding)
def generate(titles, k):
    length = 1024
    temperature = 1
    topk = 8
    topp = 0
    repetition_penalty = 1.5
    save_path = 'generated_template/'
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer('../cache/vocab_small.txt')
    model = GPT2LMHeadModel.from_pretrained('model_template/final_model')
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    with open(save_path + str(k) + '.txt', 'w', encoding='utf-8') as f:
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(titles))
        generated = 0
        out = sample_sequence(n_ctx=n_ctx,
                              model=model,
                              length=length,
                              context=context_tokens,
                              tokenizer=tokenizer,
                              temperature=temperature,
                              top_k=topk,
                              top_p=topp,
                              repitition_penalty=repetition_penalty,  # keyword spelling matches sample_sequence()'s signature
                              device=device)
        out = out.tolist()[0]
        generated += 1
        text = tokenizer.convert_ids_to_tokens(out)
        for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
            if is_word(item) and is_word(text[i + 1]):
                text[i] = item + ' '
        for i, item in enumerate(text):
            if item == '[MASK]':
                text[i] = ''
            if item == '[CLS]' or item == '[SEP]':
                text[i] = '\n'
        print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
        text = ''.join(text).replace('##', '').strip()
        # text = ''.join(text.split('\n')[:-1])
        generated_text = text[-length:]
        text = titles + generated_text
        print(generated_text)
        f.write(text)
        print("=" * 80)
    k = k + 1
    return text, k
def getModel(path_config):
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = 'cpu'
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return model, tokenizer, config
def getModel(path_config, gpu='0'):
    print("load model......")
    torch.cuda.set_device(int(gpu))
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    with open(path_config, 'r') as f:
        config = json.load(f)
    from tokenizations import tokenization_bert
    tokenizer_path = config['tokenizer_path']
    model_path = config['model_path']
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("use device:%s" % device)
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return model, tokenizer, config, device
def __init__(self, model_path=MODEL7_PATH, tokenizer_path=TOKEN7_PATH, verbose=0):
    """Init model with given path."""
    super(genModel, self).__init__()
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
    self.n_ctx = self.model.config.n_ctx
    self.past = None  # pre-computed hidden-states
    self.verbose = verbose
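# Usage sketch for the genModel class above (MODEL7_PATH/TOKEN7_PATH are
# module-level constants pointing at a saved GPT2LMHeadModel directory and
# its vocab file; not shown here):
#
#     gm = genModel(verbose=1)
#     ids = gm.tokenizer.convert_tokens_to_ids(gm.tokenizer.tokenize('你好'))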
def main():
    log('start')
    filename = '全唐诗.txt'
    novel = filename.split('.')[0]
    novel_name = pinyin(novel)
    # create the directory layout
    createfilepath(novel_name)
    createfilepath(novel_name + '/data_' + novel_name)
    createfilepath(novel_name + '/data_' + novel_name + '/tokenized')
    createfilepath(novel_name + '/generated_' + novel_name)
    createfilepath(novel_name + '/model_' + novel_name)
    createfilepath(novel_name + '/model_' + novel_name + '/final_model')
    log('directories built')
    # create the required .py files and .bat scripts from templates
    createpyfromtemplate('start-parse-template.bat', novel_name)
    createpyfromtemplate('start-train-template.bat', novel_name)
    createpyfromtemplate('generate_template.py', novel_name)
    createpyfromtemplate('train-template.py', novel_name)
    log('py and bat built')
    # return
    # build the training dataset from the specified novel
    tokenizer_path = 'cache/vocab_small.txt'
    raw_data_path = novel_name + '/data_' + novel_name
    tokenized_data_path = raw_data_path + '/tokenized/'
    from tokenizations import tokenization_bert
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    full_tokenizer.max_len = 999999
    log('building data')
    build_files(filename=filename,
                data_path=raw_data_path,
                tokenized_data_path=tokenized_data_path,
                num_pieces=1,
                full_tokenizer=full_tokenizer,
                min_length=128)
    log('files built')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer_path', type=str, required=True, help='vocabulary file')
    parser.add_argument('--raw_data_path', type=str, required=True, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', type=str, required=True, help='where to store the tokenized corpus')
    parser.add_argument('--num_pieces', default=1, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=10, type=int, required=False, help='minimum article length to include')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    from tokenizations import tokenization_bert
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)
    print('building files...')
    build_files(data_path=args.raw_data_path,
                tokenized_data_path=args.tokenized_data_path,
                num_pieces=args.num_pieces,
                full_tokenizer=full_tokenizer,
                min_length=args.min_length)
    print('files built')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", type=str, default="cache/vocab.txt", required=False,
                        help="vocabulary path")
    parser.add_argument("--input_file", type=str, required=True,
                        help="input text file; format: one sentence per line, docs separated by a blank line")
    parser.add_argument("--output_file", type=str, required=True,
                        help="output file; format: token ids")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # parameter checking
    if not os.path.exists(args.input_file) or not os.path.isfile(args.input_file) \
            or not os.access(args.input_file, os.R_OK):
        logger.error("Input file [%s] does not exist or is not readable!" % args.input_file)
        sys.exit(1)
    if not os.path.exists(args.vocab_file) or not os.path.isfile(args.vocab_file) \
            or not os.access(args.vocab_file, os.R_OK):
        logger.error("Vocab file [%s] does not exist or is not readable!" % args.vocab_file)
        sys.exit(1)
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.vocab_file)
    convert(args.input_file, args.output_file, full_tokenizer)
    logger.info("Data conversion finished!")
from tokenizations import tokenization_bert

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # choose which GPUs the program may use
tokenizer_path = "./model/gpt2_prose/vocab.txt"
model_config = "./model/model_godText/final_model/config.json"
model_path = "./model/model_godText/final_model/"
save_samples_path = "./test_godText/finetuned/"
length = 50
batch_size = 4
nsamples = 10
temperature = 1.0
topk = 8
topp = 0
repetition_penalty = 1.0
device = 'cpu'
tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to(device)
model.eval()

params = list(model.parameters())
k = 0
for i in params:
    l = 1
    # print("layer shape: " + str(list(i.size())))
    for j in i.size():
        l *= j
    # print("layer parameter count: " + str(l))
    k = k + l
print("total number of parameters: %dM" % int(k / 1024 / 1024))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--nsamples', default=8, type=int, required=False, help='number of generated samples')
    parser.add_argument('--temperature', default=1, type=float, required=False)
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top p sampling')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--model_path', default='model/model_epoch24', type=str, required=False, help='pre-trained model dir')
    parser.add_argument('--prefix', default='仁义礼智信', type=str, required=False, help='prefix of the couplet')
    parser.add_argument('--save_samples', action='store_true', help='save samples')
    parser.add_argument('--save_samples_path', default='data', type=str, required=False, help='save the samples to this dir')
    parser.add_argument('--bow', action='store_true', help='use PPLM-BOW')
    parser.add_argument('--bow_path', default='data/bow_newyear.txt', type=str, required=False, help='path of the bag of considered characters')
    parser.add_argument('--bow_stepsize', default=0.3, type=float, required=False, help='stepsize of the PPLM')
    parser.add_argument('--bow_num_iterations', default=3, type=int, required=False, help='num_iterations of the PPLM')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, output_hidden_states=True)
    model.to(device)
    model.eval()

    if args.prefix.find("|") < 0:
        length = model.config.n_ctx - len(args.prefix)
    else:
        length = 2 * args.prefix.index("|") - len(args.prefix) + 1

    bow_vectors = None
    if args.bow:
        bow_vectors = build_bow_vectors(args.bow_path, tokenizer, device)

    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/generated.txt', 'w', encoding='utf8')

    context_tokens = [tokenizer.convert_tokens_to_ids('[MASK]')] + \
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(args.prefix))
    for t in range(args.nsamples):
        out = sample_sequence(model, context_tokens, length,
                              temperature=args.temperature,
                              top_k=args.topk,
                              top_p=args.topp,
                              device=device,
                              bow=args.bow,
                              bow_vectors=bow_vectors,
                              bow_stepsize=args.bow_stepsize,
                              bow_num_iterations=args.bow_num_iterations,
                              tokenizer=tokenizer)
        text = tokenizer.convert_ids_to_tokens(out)
        for i, item in enumerate(text):
            if item == '[MASK]':
                text[i] = '上联:'
            elif item == '[PAD]':
                text[i] = ' '
            elif item == '|':
                text[i] = ' 下联:'
        print("=" * 40 + " SAMPLE " + str(t + 1) + " " + "=" * 40 + "\n")
        text = ''.join(text)
        print(text)
        if args.save_samples:
            samples_file.write(text + '\n')
    if args.save_samples:
        samples_file.close()
def main_seg(data_path, dataname):
    tokenizer_path = '../model/model_dabaigou_seg/vocab.txt'
    tokenized_data_path = '../data/dabaigou_tokenized_seg/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    build_files_seg(data_path, dataname, tokenized_data_path, full_tokenizer)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='raw corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False, help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the sliding data window')
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the starting model')
    parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='output dir for the result')
    parser.add_argument('--output_name', default='result.txt', type=str, required=False, help='output file name')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # if args.no_wordpiece:
    #     from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    # else:
    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    # model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    # print('config:\n' + model_config.to_json_string())
    # n_ctx = model_config.n_ctx
    n_ctx = 1024
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the dataset from scratch
    batch_size = args.batch_size
    log_step = args.log_step
    stride = args.stride
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    assert args.pretrained_model != '', 'you need to specify a trained model.'
    model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.eval()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    # full_len = 0
    # print('calculating total steps')
    # for i in tqdm(range(num_pieces)):
    #     with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
    #         full_len += len([int(item) for item in f.read().strip().split()])
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    total_loss = 0
    total_steps = 0

    # eval
    now = datetime.now()
    print('time: {}'.format(now))
    piece_num = 0
    for i in range(num_pieces):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            line = f.read().strip()
        tokens = line.split()
        tokens = [int(token) for token in tokens]
        start_point = 0
        samples = []
        while start_point < len(tokens) - n_ctx:
            samples.append(tokens[start_point:start_point + n_ctx])
            start_point += stride
        start_point -= stride
        # pad the tail window to n_ctx and evaluate it as well
        # (the original built `last` but never used it, and wrapped the pad ids
        # in an extra list; both fixed here)
        last = tokens[start_point + n_ctx:]
        last.extend([full_tokenizer.convert_tokens_to_ids('[PAD]')] * (n_ctx - len(last)))
        samples.append(last)
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):  # drop last
            # prepare data
            batch = samples[step * batch_size:(step + 1) * batch_size]
            batch_labels = []
            batch_inputs = []
            for ids in batch:
                int_ids_for_labels = [int(x) for x in ids]
                int_ids_for_inputs = [int(x) for x in ids]
                batch_labels.append(int_ids_for_labels)
                batch_inputs.append(int_ids_for_inputs)
            batch_labels = torch.tensor(batch_labels).long().to(device)
            batch_inputs = torch.tensor(batch_inputs).long().to(device)

            # forward pass
            outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
            loss, logits = outputs[:2]

            # get loss
            if multi_gpu:
                loss = loss.mean()
            total_loss += loss.item()
            total_steps += 1
            overall_step += 1
            if (overall_step + 1) % log_step == 0:
                print('now time: {}:{}. Step {} of piece {}, ppl {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, (step + 1), piece_num,
                    torch.exp(loss)))
        piece_num += 1

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    with open(args.output_dir + args.output_name, 'w') as f:
        f.write(str(total_loss / total_steps))
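# The evaluation above prints torch.exp(loss) per step and writes the mean
# cross-entropy at the end; a minimal sketch of turning that mean loss into
# corpus-level perplexity (names illustrative):
import math

def mean_loss_to_perplexity(total_loss, total_steps):
    # perplexity = exp(mean token-level cross-entropy)
    return math.exp(total_loss / total_steps)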
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--length', default=-1, type=int, required=False, help='length to generate')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature; higher is more random')
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top-k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top-p (nucleus) sampling')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='path of the model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='where to save the generated files')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='how many articles to generate per title')
    parser.add_argument('--titles', default='萧炎', type=str, required=False, help='list of titles as one space-separated string')
    parser.add_argument('--titles_file', default='', type=str, required=False, help='file with one title per line; if set, --titles is ignored')
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    titles = args.titles.split()  # list of titles to generate from (was args.title, a typo)
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where to save the output

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = pytorch_transformers.GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            # one file per (title, article) pair; the original str(i * j) collided
            with open(save_path + '{}-{}'.format(i, j), 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(model=model,
                                      length=length,
                                      context=context_tokens,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      device=device)
                out = out.tolist()
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for idx, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[idx + 1]):
                        text[idx] = item + ' '
                for idx, item in enumerate(text):
                    if item == '[MASK]':
                        text[idx] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[idx] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text)
                print("=" * 80)
def main(data_path, dataname):
    tokenizer_path = '../model/gpt2_prose/vocab.txt'
    tokenized_data_path = '../data/dabaigou_tokenized_new/'
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=tokenizer_path)
    build_files(data_path, dataname, tokenized_data_path, full_tokenizer)
    shutil.rmtree(data_path)
# tail of gen(): finish the [CLS]/[SEP] replacement, print each sample, and
# optionally append it to samples_file
                    text[i] = '\n\n'
                elif item == '[SEP]':
                    text[i] = '\n'
            info = "=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40 + "\n"
            print(info)
            text = ''.join(text).replace('##', '').strip()
            print(text)
            res.append(text[len(raw_text):])
            if save_samples:
                # samples_file.write(info)
                samples_file.write(raw_text + text)
                samples_file.write('\n')
                # samples_file.write('=' * 90)
                # samples_file.write('\n' * 2)
            print("=" * 80)
            if generated == nsamples:
                # close file when finish writing.
                if save_samples:
                    samples_file.close()
                break
    return res


if __name__ == '__main__':
    from tokenizations import tokenization_bert
    device = "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file='cache/vocab.txt')
    model = GPT2LMHeadModel.from_pretrained('model/covi_final_model/')
    model.to(device)
    model.eval()
    print(gen("", model, tokenizer, 3, 50, 1.0))
app = Flask(__name__, static_url_path='')

config_file = os.environ.get("config_file")
if config_file is None:
    config_file = "songci.json"
print("using config_file: %s" % config_file)

args = {}
try:
    args = json.loads(open(config_file).read())
except Exception as e:
    print(e)
    sys.exit(1)

unk_idx = open(args['vocab_path']).read().split('\n').index('[UNK]')
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = tokenization_bert.BertTokenizer(vocab_file=args['vocab_path'])
model = GPT2LMHeadModel.from_pretrained(args['model_path'])
model.to(device)
model.eval()


@app.route("/api")
def get():
    prefix = request.args.get('text', '')
    length = args['length']  # 101
    temperature = 1
    topk = 8
    topp = 0
    context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prefix))
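# Hypothetical client for the /api route above; host and port are assumptions
# (Flask's defaults), not given by the source:
import requests

resp = requests.get('http://127.0.0.1:5000/api', params={'text': '春'})
print(resp.text)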
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='cuda visible devices')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='path of the model configuration file')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str, required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/samples.json', type=str, required=False, help='path of the samples file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='save the tokenized samples file to this dir')
    parser.add_argument('--raw', action='store_true',
                        help='do tokenize before training, no need if already tokenized with same configuration')
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps', default=4000, type=int, required=False)
    parser.add_argument('--log_step', default=4000, type=int, required=False, help='period of reporting loss')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='save the model to this dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pre-trained model dir')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
    print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / n_ctx * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except Exception:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []
    # non-overlapping n_ctx windows, plus one tail window aligned to the end
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)
        for step in range(len(samples2) // batch_size):  # drop last
            # prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)

            # forward pass (language modeling: labels are the inputs themselves)
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            # loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                # tb_writer is assumed to be a tensorboard SummaryWriter created at module level
                tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, step + 1, epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # alternate between two checkpoint dirs to save disk space
        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(temp_epoch))
        # torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        # torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
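# A minimal sketch of the gradient-accumulation pattern used in the training
# loop above: each micro-batch loss is scaled by 1/gradient_accumulation so
# the summed gradients match one effective large batch, and optimizer.step()
# fires once per `gradient_accumulation` micro-batches (model and optimizer
# are assumed given; clipping and scheduling omitted for brevity):
def train_with_accumulation(model, optimizer, batches, gradient_accumulation):
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(batches):
        loss = model(input_ids=inputs, labels=labels)[0]
        (loss / gradient_accumulation).backward()  # scale so grads sum to a full batch
        if (step + 1) % gradient_accumulation == 0:
            optimizer.step()
            optimizer.zero_grad()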
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='generation device')
    parser.add_argument('--length', default=-1, type=int, required=False, help='length to generate')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='generation batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='number of samples to generate')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature')
    parser.add_argument('--topk', default=8, type=int, required=False, help='k for top-k sampling')
    parser.add_argument('--topp', default=0, type=float, required=False, help='p for top-p (nucleus) sampling')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary path')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
    parser.add_argument('--prefix', default='美国', type=str, required=False, help='beginning of the generated article')
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sample_sequence(model=model,
                                  length=length,
                                  context=context_tokens,
                                  temperature=temperature,
                                  top_k=topk,
                                  top_p=topp,
                                  device=device)
            out = out.tolist()
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                for i, item in enumerate(text[:-1]):  # ensure a space between adjacent English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
                print("=" * 80)
temperature = 1
topk = 8
topp = 0
model_config = 'config/model_config_small.json'
tokenizer_path = 'cache/vocab_small.txt'
model_path = 'model/final_model'
no_wordpiece = False
segment = False
fast_pattern = False
# self.save_samples = False
# self.save_samples_path = 'mnt/'
repetition_penalty = 1.0
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = tokenization_bert.BertTokenizer(vocab_file="cache/vocab_small.txt")


class Generate:
    def is_word(self, word):
        for item in list(word):
            if item not in 'qwertyuiopasdfghjklzxcvbnm':
                return False
        return True

    def _is_chinese_char(self, char):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        cp = ord(char)
        # standard BERT CJK codepoint ranges
        if ((0x4E00 <= cp <= 0x9FFF) or
                (0x3400 <= cp <= 0x4DBF) or
                (0x20000 <= cp <= 0x2A6DF) or
                (0x2A700 <= cp <= 0x2B73F) or
                (0x2B740 <= cp <= 0x2B81F) or
                (0x2B820 <= cp <= 0x2CEAF) or
                (0xF900 <= cp <= 0xFAFF) or
                (0x2F800 <= cp <= 0x2FA1F)):
            return True
        return False
# tail of build_files(): filter and encode each piece, then write its token ids
        sublines = [full_tokenizer.tokenize(line)
                    for line in sublines if len(line) > min_length]  # only keep sentences longer than min_length
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]
        full_line = []
        for subline in sublines:
            full_line.append(full_tokenizer.convert_tokens_to_ids('[MASK]'))  # [MASK] marks the start of an article
            full_line.extend(subline)
            full_line.append(full_tokenizer.convert_tokens_to_ids('[CLS]'))  # [CLS] marks the end of an article
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for id in full_line:
                f.write(str(id) + ' ')
    print('finish')


data_path = "data/train.txt"
tokenized_data_path = "data/tokenized/"
num_pieces = 100
full_tokenizer = tokenization_bert.BertTokenizer(vocab_file="vocab/vocab.txt")
full_tokenizer.max_len = 999999
min_length = 2

if __name__ == '__main__':
    build_files(data_path=data_path,
                tokenized_data_path=tokenized_data_path,
                num_pieces=num_pieces,
                full_tokenizer=full_tokenizer,
                min_length=min_length)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='where the tokenized corpus is stored')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps')
    parser.add_argument('--save_per_step', default=10000, type=int, required=False, help='save a checkpoint every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard dir')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # choose which GPUs the program may use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer,
                                                          warmup_steps=warmup_steps,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    running_step = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i)), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                running_loss += loss.item()
                running_step += 1
                mean_loss = running_loss * gradient_accumulation / running_step
                if (step + 1) % gradient_accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    # how many steps to print loss log
                    if overall_step % log_step == 0:
                        now_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        print('now time: {}: Step {} of piece {} of epoch {}. Global Step: {}, Mean Loss: {}'.format(
                            now_time, (step + 1) // gradient_accumulation,
                            piece_num, epoch + 1, overall_step, mean_loss))
                    # how many steps to save a checkpoint
                    if overall_step % args.save_per_step == 0:
                        if not os.path.exists(os.path.join(output_dir, "model_step_%d" % (overall_step + 1))):
                            os.mkdir(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(os.path.join(output_dir, "model_step_%d" % (overall_step + 1)))
            piece_num += 1

        # save model per epoch
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))):
            os.mkdir(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1)))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    # save the final model
    print('training finished')
    if not os.path.exists(os.path.join(output_dir, 'final_model')):
        os.mkdir(os.path.join(output_dir, 'final_model'))
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(os.path.join(output_dir, 'final_model'))
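# A self-contained sketch of the stride-based windowing used in the training
# loop above: overlapping n_ctx-long samples are cut from the token stream
# every `stride` tokens, so stride < n_ctx yields overlapping windows:
def cut_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    return samples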
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='trained model')
    parser.add_argument('--tokenizer_path', default='model/final_model/vocab.txt', type=str, required=False, help='tokenizer')
    parser.add_argument('--inputs', default='[CLS][MASK]', type=str, required=False, help='beginning of generated text')
    parser.add_argument('--length', default=100, type=int, required=False, help='generated length')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='sampling temperature; higher is more random')
    parser.add_argument('--topk', default=8, type=int, required=False, help='top-k filtering')
    parser.add_argument('--topp', default=0, type=float, required=False, help='top-p filtering')
    # parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    length = args.length if args.length > 0 else n_ctx
    past = None  # pre-computed hidden-states

    while True:
        para_tokens = []  # generated tokens
        # inputs = args.inputs
        inputs = input("In: ")
        if inputs == "":
            inputs = "[CLS][MASK]"
        context = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(inputs))
        context_tensor = torch.LongTensor(context).view(1, -1).to(device)
        _, past = model(context_tensor[:, :-1], past)[:2]  # prepare past
        prev = context_tensor[:, -1].view(1, -1)  # minimize context to speed up
        para_tokens += context
        generate, past = gen_paragraph(model, prev, past,
                                       length=length,
                                       temperature=args.temperature,
                                       topk=args.topk,
                                       topp=args.topp,
                                       device=device)
        para_tokens += generate
        para_word = tokenizer.convert_ids_to_tokens(para_tokens)
        # for i, item in enumerate(para_word):
        #     if item == '[MASK]' or item == '[UNK]':
        #         para_word[i] = ''
        #     elif item == '[CLS]':
        #         para_word[i] = '\n\n'
        #     elif item == '[SEP]':
        #         para_word[i] = '\n'
        para_text = ''.join(para_word).strip()
        print(para_text)
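# Sketch of why `past` is threaded through the loop above: caching the
# attention key/value states lets each step feed only the newest token
# instead of re-encoding the whole context. Assumes the older transformers
# API used throughout these snippets, where forward(input_ids, past)
# returns (logits, presents):
def step_with_cache(model, prev, past):
    logits, past = model(prev, past)[:2]
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy, for illustration
    return next_token, past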