def origin():
    tokenizer = BPEEncoder_ja(bpe_origin, emoji)
    model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    device = 'cpu'
    seq_num = 0
    with torch.no_grad():
        for idx in range(3):
            print('=' * 5 + 'start' + '=' * 5)
            finished = False
            cur_ids = torch.tensor(tokenizer.encode('AIとは')).unsqueeze(0).to(device)
            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                # Distribution over the vocabulary at the last position.
                softmax_logits = torch.softmax(logits[0, -1], dim=0)
                # Sample from a wider top-n for the first few tokens, then narrow.
                n = 20 if i < 3 else 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
                cur_ids = torch.cat(
                    [cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id],
                    dim=1)
                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    finished = True
                    print('generate finished')
                    break
            seq_num = seq_num + 1
            output_list = list(cur_ids.squeeze().to('cpu').numpy())
            output_text = tokenizer.decode(output_list)
            output_text = output_text.replace(' ', '')
            # Keep only the text before the end-of-text marker.
            output_text = output_text.split('<|endoftext|>')[0]
            print(output_text)
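# choose_from_top is referenced above but not defined in this excerpt.
# A minimal sketch of the usual implementation, assuming it samples one
# token id from the top-n probabilities (hypothetical helper, shown only
# to make the generation loop above self-contained):
import numpy as np

def choose_from_top(probs, n=5):
    # Indices of the n highest-probability tokens (unsorted).
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # renormalize over the top-n
    choice = np.random.choice(n, 1, p=top_prob)
    return int(ind[choice][0])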
parser.add_argument("--dataset_type", help="dataset type", default='split')
parser.add_argument("--split_tag", help="text split tag", default='<|SP_QA|>')
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0)
parser.add_argument('--temperature', type=float, default=1)
parser.add_argument("--gpu", help="use gpu number", default='0')
parser.add_argument("--verbose", action='store_true')
args = parser.parse_args()

with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)
eot_token = enc.encode('<|endoftext|>')[0]
sep_token = enc.encode('<|byte0|>')[0]
temperature = args.temperature
top_k = args.top_k
top_p = args.top_p
min_answer_len = args.min_answer_len

if os.path.isfile(args.model + '/hparams.json'):
    with open(args.model + '/hparams.json') as f:
        params = json.loads(f.read())
        hparams = HParams(**params)
        n_prediction = params['n_prediction']
elif 'small' in args.model:
    hparams = HParams(
def main():
    args = parser.parse_args()
    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')
    if not (len(args.input_file) > 0 or len(args.context) > 0):
        raise ValueError('--input_file or --context required.')
    if (not os.path.isfile(args.input_file)) and len(args.context) == 0:
        raise ValueError('invalid input file name.')
    if len(args.input_file) > 0 and os.path.isfile(args.input_file):
        with open(args.input_file) as f:
            args.context = f.read()

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = 1
    # Special token ids live at the top of the vocabulary.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu
    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)
        output = model.get_sequence_output()
        # Build the pretraining heads so all checkpoint variables exist.
        (_, _, _) = get_masked_lm_output(bert_config, model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)
        saver = tf.train.Saver()

        masked_lm_values = tf.placeholder(tf.float32, [None, None])
        with tf.variable_scope("loss"):
            (_, outputs) = get_masked_regression_output(
                bert_config, model.get_sequence_output(),
                masked_lm_positions, masked_lm_values, masked_lm_weights)

        saver = tf.train.Saver(var_list=tf.trainable_variables())
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        # One [CLS] per sentence; the regression head reads the value at each [CLS].
        _input_ids = []
        _lm_positions = []
        tokens = [enc.encode(p.strip()) for p in sep_txt(args.context)]
        tokens = [t for t in tokens if len(t) > 0]
        for t in tokens:
            _lm_positions.append(len(_input_ids))
            _input_ids.extend([CLS_TOKEN] + t)
        _input_ids.append(EOT_TOKEN)
        _input_masks = [1] * len(_input_ids)
        _segments = [1] * len(_input_ids)
        _input_ids = _input_ids[:max_seq_length]
        _input_masks = _input_masks[:max_seq_length]
        _segments = _segments[:max_seq_length]
        while len(_segments) < max_seq_length:
            _input_ids.append(0)
            _input_masks.append(0)
            _segments.append(0)
        _lm_positions = [p for p in _lm_positions if p < max_seq_length]
        _lm_positions = _lm_positions[:max_seq_length]
        _lm_lm_weights = [1] * len(_lm_positions)
        while len(_lm_positions) < max_seq_length:
            _lm_positions.append(0)
            _lm_lm_weights.append(0)
        _lm_ids = [0] * len(_lm_positions)
        _lm_vals = [0] * len(_lm_positions)

        regress = sess.run(outputs,
                           feed_dict={
                               input_ids: [_input_ids],
                               input_mask: [_input_masks],
                               segment_ids: [_segments],
                               masked_lm_positions: [_lm_positions],
                               masked_lm_ids: [_lm_ids],
                               masked_lm_weights: [_lm_lm_weights],
                               next_sentence_labels: [0],
                               masked_lm_values: [_lm_vals]
                           })
        regress = regress.reshape((-1, ))
        if args.output_file == '':
            # Print the predicted importance next to each decoded sentence.
            for tok, value in zip(tokens, regress):
                print(f'{value}\t{enc.decode(tok)}')
        else:
            sent = []
            impt = []
            for tok, value in zip(tokens, regress):
                sent.append(enc.decode(tok))
                impt.append(value)
            df = pd.DataFrame({'sentence': sent, 'importance': impt})
            df.to_csv(args.output_file, index=False)
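# One possible way to consume the CSV written above: rank sentences by the
# predicted importance and keep the top few as an extractive summary.
# 'importance.csv' stands in for whatever --output_file was given.
import pandas as pd

df = pd.read_csv('importance.csv')
summary = df.sort_values('importance', ascending=False).head(3)
print('\n'.join(summary['sentence']))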
if __name__ == '__main__':
    tokenizer = BPEEncoder_ja(bpe, emoji)
    ids = [0, 0, 0, 0, 1, 2]
    print(tokenizer.decode(ids))
    # model_path = './trained_models/best.pt'
    # model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
    # model.resize_token_embeddings(len(tokenizer))
    # device = torch.device('cuda:2')
    # model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
    # article_generate(model, tokenizer, 1, 5, 0, device)
    # try:
    #     article_generate(model, tokenizer, 1, 5, 0, device)
    # except:
    #     print('=' * 30)
    # origin()
    return ln_probs_next


parser = argparse.ArgumentParser()
parser.add_argument('input_file')
parser.add_argument('--model', default='gpt2ja-medium')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

with open('ja-bpe.txt') as f:
    bpe = f.read().split('\n')
with open('emoji.json') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if 'small' in args.model:
    hparams = HParams(**{
        "n_vocab": n_vocab,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12
    })
elif 'medium' in args.model:
    hparams = HParams(**{
        "n_vocab": n_vocab,
        bert_config, model.get_sequence_output(), model.get_embedding_table(),
        masked_lm_positions, masked_lm_ids, masked_lm_weights)
    (_, _, _) = get_next_sentence_output(bert_config, model.get_pooled_output(),
                                         next_sentence_labels)
    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(args.model)
    saver.restore(sess, ckpt)

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    if args.split_tag != '':
        contexts = args.context.split(args.split_tag)
    else:
        contexts = [args.context]

    _input_ids = []
    _input_masks = []
    _segments = []
    _mask_positions = []
    for context in contexts:
        context_tokens = enc.encode(context)
        context_tokens = context_tokens[:max_seq_length - 3]
        inputs = []
        inputs.append(CLS_TOKEN)
        inputs.extend(context_tokens)
def main():
    global EOT_TOKEN, MASK_TOKEN, CLS_TOKEN, SEP_TOKEN, enc
    args = parser.parse_args()
    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    save_every = args.save_every
    num_epochs = args.num_epochs
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    # Tokenize all input files in parallel.
    fl = [f'{args.input_dir}/{f}' for f in os.listdir(args.input_dir)]
    with Pool(args.num_encode_process) as pool:
        imap = pool.imap(encode_one, fl)
        input_contexts = list(tqdm(imap, total=len(fl)))
    input_indexs = np.random.permutation(len(input_contexts))
    if args.do_eval:
        # Hold out the first eval_rate fraction of the shuffled indices.
        eval_num = int(args.eval_rate * len(input_indexs))
        eval_input_indexs = input_indexs[:eval_num]
        input_indexs = input_indexs[eval_num:]

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu
    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])
        model = BertModel(config=bert_config,
                          is_training=True,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)
        output = model.get_sequence_output()
        (_, _, _) = get_masked_lm_output(bert_config, model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)
        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        # Snapshot the pretrained weights so they can be re-applied after
        # global_variables_initializer() below.
        train_vars = tf.trainable_variables()
        restored_weights = {}
        for i in range(len(train_vars)):
            restored_weights[train_vars[i].name] = sess.run(train_vars[i])

        labels = tf.placeholder(tf.float32, [None])
        output_layer = model.get_pooled_output()
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value

        masked_lm_values = tf.placeholder(tf.float32, [None, None])
        with tf.variable_scope("loss"):
            (loss, _) = get_masked_regression_output(
                bert_config, model.get_sequence_output(),
                masked_lm_positions, masked_lm_values, masked_lm_weights)

        opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
        train_vars = tf.trainable_variables()
        opt_grads = tf.gradients(loss, train_vars)
        opt_grads = list(zip(opt_grads, train_vars))
        opt_apply = opt.apply_gradients(opt_grads)
        summaries = tf.summary.scalar('loss', loss)
        summary_log = tf.summary.FileWriter(
            os.path.join(CHECKPOINT_DIR, args.run_name))

        counter = 1
        counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter')
        if os.path.exists(counter_path):
            # Load the step number if we're resuming a run.
            # Add 1 so we don't immediately try to save again.
            with open(counter_path, 'r') as fp:
                counter = int(fp.read()) + 1
        hparams_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'hparams.json')
        maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
        with open(hparams_path, 'w') as fp:
            fp.write(json.dumps(bert_config_params))

        sess.run(tf.global_variables_initializer())

        # Re-apply the pretrained weights captured before initialization.
        restored = 0
        for k, v in restored_weights.items():
            for i in range(len(train_vars)):
                if train_vars[i].name == k:
                    assign_op = train_vars[i].assign(v)
                    sess.run(assign_op)
                    restored += 1
        assert restored == len(restored_weights), 'fail to restore model.'
        saver = tf.train.Saver(var_list=tf.trainable_variables())

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, args.run_name,
                             'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(CHECKPOINT_DIR, args.run_name, 'model'),
                       global_step=counter)
            with open(counter_path, 'w') as fp:
                fp.write(str(counter) + '\n')

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        def sample_feature(i, eval=False):
            indexs = eval_input_indexs if eval else input_indexs
            last = min((i + 1) * batch_size, len(indexs))
            _input_ids = []
            _input_masks = []
            _segments = []
            _lm_positions = []
            _lm_vals = []
            _lm_lm_weights = []
            _lm_ids = []
            for j in range(i * batch_size, last, 1):
                (lm_tokens, lm_positions,
                 lm_importances) = input_contexts[indexs[j]]
                ids = copy(lm_tokens)[:max_seq_length]
                seg = [1] * len(ids)
                while len(ids) < max_seq_length:
                    ids.append(0)
                    seg.append(0)
                _input_ids.append(ids)
                _input_masks.append(seg)
                _segments.append(seg)
                pos = copy(lm_positions)[:max_seq_length]
                val = copy(lm_importances)[:max_seq_length]
                wei = [1] * len(pos)
                # Pad positions, values and weights to max_seq_length.
                while len(pos) < max_seq_length:
                    pos.append(0)
                    val.append(0)
                    wei.append(0)
                _lm_positions.append(pos)
                _lm_ids.append([0] * max_seq_length)
                _lm_lm_weights.append(wei)
                _lm_vals.append(val)
            return {
                input_ids: _input_ids,
                input_mask: _input_masks,
                segment_ids: _segments,
                masked_lm_positions: _lm_positions,
                masked_lm_ids: _lm_ids,
                masked_lm_weights: _lm_lm_weights,
                next_sentence_labels: [0] * len(_input_ids),
                masked_lm_values: _lm_vals
            }

        try:
            for ep in range(num_epochs):
                if ep % args.save_every == 0:
                    save()
                prog = tqdm(range(0, len(input_indexs) // batch_size, 1))
                for i in prog:
                    (_, v_loss, v_summary) = sess.run(
                        (opt_apply, loss, summaries),
                        feed_dict=sample_feature(i))
                    summary_log.add_summary(v_summary, counter)
                    avg_loss = (avg_loss[0] * 0.99 + v_loss,
                                avg_loss[1] * 0.99 + 1.0)
                    prog.set_description(
                        '[{ep} | {time:2.0f}] loss={loss:.4f} avg={avg:.4f}'
                        .format(ep=ep,
                                time=time.time() - start_time,
                                loss=v_loss,
                                avg=avg_loss[0] / avg_loss[1]))
                    counter += 1
                if args.do_eval:
                    eval_losses = []
                    for i in tqdm(
                            range(0, len(eval_input_indexs) // batch_size, 1)):
                        eval_losses.append(
                            sess.run(loss, feed_dict=sample_feature(i, True)))
                    print("eval loss:", np.mean(eval_losses))
        except KeyboardInterrupt:
            print('interrupted')
            save()
        save()
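# The avg_loss pair above implements a bias-corrected exponential moving
# average: numerator and denominator both decay by 0.99, so the displayed
# ratio is well-defined even in the first steps. A standalone sketch of the
# same arithmetic:
def ema_update(state, value, decay=0.99):
    num, den = state
    return (num * decay + value, den * decay + 1.0)

state = (0.0, 0.0)
for v in [2.0, 1.5, 1.2]:
    state = ema_update(state, v)
    print(state[0] / state[1])  # the smoothed loss shown in the progress bar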
def main():
    args = parser.parse_args()
    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')
    if os.path.isfile(args.model + '/idmaps.json'):
        with open(args.model + '/idmaps.json') as f:
            idmapping_dict = json.load(f)
    else:
        raise ValueError('invalid model name.')

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    num_labels = len(idmapping_dict)
    input_contexts = []
    input_keys = []
    input_names = []
    for f, i in idmapping_dict.items():
        n = 0
        for t in os.listdir(f'{args.input_dir}/{f}'):
            if os.path.isfile(f'{args.input_dir}/{f}/{t}'):
                with open(f'{args.input_dir}/{f}/{t}', encoding='utf-8') as fn:
                    if args.train_by_line:
                        for ln, p in enumerate(fn.readlines()):
                            tokens = enc.encode(p.strip())[:max_seq_length - 3]
                            tokens = [CLS_TOKEN] + tokens + [EOT_TOKEN, SEP_TOKEN]
                            if len(tokens) < max_seq_length:
                                tokens.extend([0] * (max_seq_length - len(tokens)))
                            input_contexts.append(tokens)
                            input_keys.append(i)
                            input_names.append(f'{f}/{t}#{ln}')
                            n += 1
                    else:
                        p = fn.read()
                        tokens = enc.encode(p.strip())[:max_seq_length - 2]
                        tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
                        if len(tokens) < max_seq_length:
                            tokens.extend([0] * (max_seq_length - len(tokens)))
                        input_contexts.append(tokens)
                        input_keys.append(i)
                        input_names.append(f'{f}/{t}')
                        n += 1
        print(f'{args.input_dir}/{f} mapped for id_{i}, read {n} contexts.')
    input_indexs = np.arange(len(input_contexts))

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu
    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)
        output = model.get_sequence_output()
        (_, _, _) = get_masked_lm_output(bert_config, model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)
        saver = tf.train.Saver()

        labels = tf.placeholder(tf.int32, [batch_size])
        output_layer = model.get_pooled_output()
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value
        # Classification head over the pooled [CLS] representation.
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        saver = tf.train.Saver(var_list=tf.trainable_variables())
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        def sample_feature(i):
            last = min((i + 1) * batch_size, len(input_indexs))
            _input_ids = [
                input_contexts[idx] for idx in input_indexs[i * batch_size:last]
            ]
            _input_masks = [[1] * len(input_contexts[idx]) +
                            [0] * (max_seq_length - len(input_contexts[idx]))
                            for idx in input_indexs[i * batch_size:last]]
            _segments = [[1] * len(input_contexts[idx]) +
                         [0] * (max_seq_length - len(input_contexts[idx]))
                         for idx in input_indexs[i * batch_size:last]]
            _labels = [
                input_keys[idx] for idx in input_indexs[i * batch_size:last]
            ]
            return {
                input_ids: _input_ids,
                input_mask: _input_masks,
                segment_ids: _segments,
                masked_lm_positions: np.zeros((len(_input_ids), 0), dtype=np.int32),
                masked_lm_ids: np.zeros((len(_input_ids), 0), dtype=np.int32),
                masked_lm_weights: np.ones((len(_input_ids), 0), dtype=np.float32),
                next_sentence_labels: np.zeros((len(_input_ids), ), dtype=np.int32),
                labels: _labels
            }

        preds = []
        prog = tqdm.tqdm(range(0, len(input_contexts) // batch_size, 1))
        for i in prog:
            prob = sess.run(probabilities, feed_dict=sample_feature(i))
            for p in prob:
                pred = np.argmax(p)
                preds.append(pred)

        pd.DataFrame({
            'id': input_names,
            'y_true': input_keys,
            'y_pred': preds
        }).to_csv(args.output_file, index=False)

        # Confusion matrix: rows are true labels, columns are predictions.
        r = np.zeros((num_labels, num_labels), dtype=int)
        for t, p in zip(input_keys, preds):
            r[t, p] += 1
        fig = plt.figure(figsize=(12, 6), dpi=72)
        ax = plt.matshow(r, interpolation='nearest', aspect=.5, cmap='cool')
        for (i, j), z in np.ndenumerate(r):
            if z >= 1000:
                plt.text(j - .33, i, '{:0.1f}K'.format(z / 1000),
                         ha='left', va='center', size=9, color='black')
            else:
                plt.text(j - .33, i, f'{z}',
                         ha='left', va='center', size=9, color='black')
        pfile = args.output_file
        if args.output_file.lower().endswith('.csv'):
            pfile = args.output_file[:-4]
        plt.savefig(pfile + '_map.png')
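# A quick way to check the predictions written above: the CSV has columns
# id, y_true and y_pred, so plain accuracy is one comparison away.
# 'results.csv' stands in for the --output_file path.
import pandas as pd

df = pd.read_csv('results.csv')
print('accuracy:', (df['y_true'] == df['y_pred']).mean())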
parser = argparse.ArgumentParser()
parser.add_argument('input_file')
parser.add_argument('--model', default='gpt2ja-medium')
parser.add_argument('--tokens', action='store_true')
parser.add_argument('--exclude-end', action='store_true')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if os.path.isfile(args.model + '/hparams.json'):
    with open(args.model + '/hparams.json', encoding='utf-8') as f:
        params = json.loads(f.read())
        hparams = HParams(**params)
elif 'small' in args.model:
    hparams = HParams(**{
        "n_vocab": n_vocab,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12
    })
    def __init__(self):
        self.temperature = 1
        self.top_k = 40
        self.top_p = 0
        self.tokenizer = BPEEncoder_ja(bpe, emoji)
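# temperature / top_k / top_p above parameterize the usual sampling filters.
# A minimal NumPy sketch of how such a filter is typically applied; this is
# an illustration, not this repository's own sampling code:
import numpy as np

def filter_probs(logits, temperature=1.0, top_k=40, top_p=0.0):
    scaled = logits / temperature
    probs = np.exp(scaled - np.max(scaled))
    probs /= probs.sum()
    if top_k > 0:
        # Zero out everything outside the k most likely tokens.
        keep = np.argsort(probs)[-top_k:]
        mask = np.zeros_like(probs, dtype=bool)
        mask[keep] = True
        probs = np.where(mask, probs, 0.0)
    if top_p > 0.0:
        # Keep the smallest prefix of tokens whose cumulative mass reaches top_p.
        order = np.argsort(probs)[::-1]
        cum = np.cumsum(probs[order])
        cut = np.searchsorted(cum, top_p) + 1  # always keep at least one token
        mask = np.zeros_like(probs, dtype=bool)
        mask[order[:cut]] = True
        probs = np.where(mask, probs, 0.0)
    return probs / probs.sum()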
from multiprocessing import Pool

parser = argparse.ArgumentParser()
parser.add_argument("--src_dir", help="source dir", required=True)
parser.add_argument("--dst_file", help="destination file", required=True)
parser.add_argument("--num_process", help="process num", type=int, default=8)
parser.add_argument("--split_tag", help="text split tag", default='')
args = parser.parse_args()

with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
split_types = enc.encode('。.。、;:.,「『')
eot_token = enc.encode('<|endoftext|>')
array_file = []


def _proc(i):
    token_chunks = []
    for j, input in enumerate(tqdm(array_file)):
        # Each worker handles every num_process-th file.
        if not ((j % args.num_process) == i):
            continue
        with open(input, 'r', encoding='utf-8') as fp:
            raw_text = fp.read()
        if args.split_tag == '':
            tokens = make_rouge(enc.encode(raw_text))
        else:
parser.add_argument('--num_generate', type=int, default=1)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0)
parser.add_argument('--temperature', type=float, default=1)
parser.add_argument('--allow_duplicate_line', action='store_true')
parser.add_argument("--full_sentences", action='store_true')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if os.path.isfile(args.model + '/hparams.json'):
    with open(args.model + '/hparams.json') as f:
        params = json.loads(f.read())
        hparams = HParams(**params)
        max_length = params['n_prediction']
else:
    raise ValueError('invalid model name.')

# Reserve room in the context window for the prediction and the separator.
length = hparams.n_ctx - max_length - 1
temperature = args.temperature
top_k = args.top_k
top_p = args.top_p
SEP_TOKEN = enc.encode('<|byte0|>')[0]
def main():
    args = parser.parse_args()
    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    save_every = args.save_every
    num_epochs = args.num_epochs
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    # Each subdirectory of input_dir is one class label.
    keys = [
        f for f in os.listdir(args.input_dir)
        if os.path.isdir(args.input_dir + '/' + f)
    ]
    keys = sorted(keys)
    num_labels = len(keys)
    input_contexts = []
    input_keys = []
    idmapping_dict = {}
    for i, f in enumerate(keys):
        n = 0
        for t in os.listdir(f'{args.input_dir}/{f}'):
            if os.path.isfile(f'{args.input_dir}/{f}/{t}'):
                with open(f'{args.input_dir}/{f}/{t}', encoding='utf-8') as fn:
                    if args.train_by_line:
                        for p in fn.readlines():
                            tokens = enc.encode(p.strip())[:max_seq_length - 2]
                            tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
                            if len(tokens) < max_seq_length:
                                tokens.extend([0] * (max_seq_length - len(tokens)))
                            input_contexts.append(tokens)
                            input_keys.append(i)
                            n += 1
                    else:
                        p = fn.read()
                        tokens = enc.encode(p.strip())[:max_seq_length - 3]
                        tokens = [CLS_TOKEN] + tokens + [EOT_TOKEN, SEP_TOKEN]
                        if len(tokens) < max_seq_length:
                            tokens.extend([0] * (max_seq_length - len(tokens)))
                        input_contexts.append(tokens)
                        input_keys.append(i)
                        n += 1
        print(f'{args.input_dir}/{f} mapped for id_{i}, read {n} contexts.')
        idmapping_dict[f] = i
    input_indexs = np.random.permutation(len(input_contexts))

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu
    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])
        model = BertModel(config=bert_config,
                          is_training=True,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)
        output = model.get_sequence_output()
        (_, _, _) = get_masked_lm_output(bert_config, model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)
        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        # Snapshot the pretrained weights so they can be re-applied after
        # global_variables_initializer() below.
        train_vars = tf.trainable_variables()
        restored_weights = {}
        for i in range(len(train_vars)):
            restored_weights[train_vars[i].name] = sess.run(train_vars[i])

        labels = tf.placeholder(tf.int32, [None])
        output_layer = model.get_pooled_output()
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())
        with tf.variable_scope("loss"):
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probabilities = tf.nn.softmax(logits, axis=-1)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_mean(per_example_loss)

        opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
        train_vars = tf.trainable_variables()
        opt_grads = tf.gradients(loss, train_vars)
        opt_grads = list(zip(opt_grads, train_vars))
        opt_apply = opt.apply_gradients(opt_grads)
        summaries = tf.summary.scalar('loss', loss)
        summary_log = tf.summary.FileWriter(
            os.path.join(CHECKPOINT_DIR, args.run_name))

        counter = 1
        counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter')
        if os.path.exists(counter_path):
            # Load the step number if we're resuming a run.
            # Add 1 so we don't immediately try to save again.
            with open(counter_path, 'r') as fp:
                counter = int(fp.read()) + 1
        hparams_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'hparams.json')
        maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
        with open(hparams_path, 'w') as fp:
            fp.write(json.dumps(bert_config_params))
        idmaps_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'idmaps.json')
        with open(idmaps_path, 'w') as fp:
            fp.write(json.dumps(idmapping_dict))

        sess.run(tf.global_variables_initializer())

        # Re-apply the pretrained weights captured before initialization.
        restored = 0
        for k, v in restored_weights.items():
            for i in range(len(train_vars)):
                if train_vars[i].name == k:
                    assign_op = train_vars[i].assign(v)
                    sess.run(assign_op)
                    restored += 1
        assert restored == len(restored_weights), 'fail to restore model.'
        saver = tf.train.Saver(var_list=tf.trainable_variables())

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, args.run_name,
                             'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(CHECKPOINT_DIR, args.run_name, 'model'),
                       global_step=counter)
            with open(counter_path, 'w') as fp:
                fp.write(str(counter) + '\n')

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        def sample_feature(i):
            last = min((i + 1) * batch_size, len(input_indexs))
            _input_ids = [
                input_contexts[idx] for idx in input_indexs[i * batch_size:last]
            ]
            _input_masks = [[1] * len(input_contexts[idx]) +
                            [0] * (max_seq_length - len(input_contexts[idx]))
                            for idx in input_indexs[i * batch_size:last]]
            _segments = [[1] * len(input_contexts[idx]) +
                         [0] * (max_seq_length - len(input_contexts[idx]))
                         for idx in input_indexs[i * batch_size:last]]
            _labels = [
                input_keys[idx] for idx in input_indexs[i * batch_size:last]
            ]
            return {
                input_ids: _input_ids,
                input_mask: _input_masks,
                segment_ids: _segments,
                masked_lm_positions: np.zeros((len(_input_ids), 0), dtype=np.int32),
                masked_lm_ids: np.zeros((len(_input_ids), 0), dtype=np.int32),
                masked_lm_weights: np.ones((len(_input_ids), 0), dtype=np.float32),
                next_sentence_labels: np.zeros((len(_input_ids), ), dtype=np.int32),
                labels: _labels
            }

        try:
            for ep in range(num_epochs):
                if ep % args.save_every == 0:
                    save()
                prog = tqdm.tqdm(range(0, len(input_contexts) // batch_size, 1))
                for i in prog:
                    (_, v_loss, v_summary) = sess.run(
                        (opt_apply, loss, summaries),
                        feed_dict=sample_feature(i))
                    summary_log.add_summary(v_summary, counter)
                    avg_loss = (avg_loss[0] * 0.99 + v_loss,
                                avg_loss[1] * 0.99 + 1.0)
                    prog.set_description(
                        '[{ep} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                        .format(ep=ep,
                                time=time.time() - start_time,
                                loss=v_loss,
                                avg=avg_loss[0] / avg_loss[1]))
                    counter += 1
        except KeyboardInterrupt:
            print('interrupted')
            save()
        save()
import model
from encode_bpe import BPEEncoder_ja

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='gpt2ja-medium')
parser.add_argument('--context', type=str, required=True)
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

with open('ja-bpe.txt') as f:
    bpe = f.read().split('\n')
with open('emoji.json') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if 'small' in args.model:
    hparams = HParams(**{
        "n_vocab": n_vocab,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12
    })
elif 'medium' in args.model:
    hparams = HParams(**{
        "n_vocab": n_vocab,
        bert_config, model.get_sequence_output(), model.get_embedding_table(),
        masked_lm_positions, masked_lm_ids, masked_lm_weights)
    (_, _, _) = get_next_sentence_output(bert_config, model.get_pooled_output(),
                                         next_sentence_labels)
    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(args.model)
    saver.restore(sess, ckpt)

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)

    if args.split_tag != '':
        contexts = args.context.split(args.split_tag)
    else:
        contexts = [args.context]

    _input_ids = []
    _input_masks = []
    _segments = []
    _mask_positions = []
    for context in contexts:
        # Replace each literal '[MASK]' marker with MASK_TOKEN by splitting
        # on the marker and re-joining with the token interleaved.
        context_tokens = [
            enc.encode(c) + [MASK_TOKEN] for c in context.split('[MASK]')
        ]
        context_tokens = sum(context_tokens, [])
        if len(context_tokens) > 1:
            context_tokens = context_tokens[:-1]
        context_tokens = context_tokens[:max_seq_length - 3]
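# The split-and-flatten idiom above is worth unpacking: splitting on
# '[MASK]' and appending MASK_TOKEN to every piece interleaves the mask id
# at each marker, leaving one extra trailing token to drop. A toy run with
# a fake one-id-per-character encoder (hypothetical, for illustration only):
MASK = 999

def toy_encode(text):
    return [ord(c) for c in text]

context = 'ab[MASK]cd'
tokens = [toy_encode(c) + [MASK] for c in context.split('[MASK]')]
tokens = sum(tokens, [])   # [97, 98, 999, 99, 100, 999]
tokens = tokens[:-1]       # drop the trailing mask: [97, 98, 999, 99, 100]
print(tokens)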
parser.add_argument('--gpu', default='0', help='visible gpu number.')


def maketree(path):
    try:
        os.makedirs(path)
    except:
        pass


with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)


def main():
    args = parser.parse_args()
    if os.path.isfile(args.base_model + '/hparams.json'):
        with open(args.base_model + '/hparams.json', encoding='utf-8') as f:
            params = json.loads(f.read())
            hparams = HParams(**params)
    elif 'small' in args.base_model:
        hparams = HParams(**{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
def maketree(path):
    try:
        os.makedirs(path)
    except:
        pass


with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)
eot_token = enc.encode('<|endoftext|>')[0]
sep_token = enc.encode('<|byte0|>')[0]


def get_masked_lm_output(hparams, logits, positions, label_ids, label_weights):
    # Gather the hidden vectors at the masked positions, then score them
    # against the full vocabulary.
    logits = gather_indexes(logits, positions)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])
    one_hot_labels = tf.one_hot(label_ids, depth=n_vocab, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
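# gather_indexes is referenced above but not shown in this excerpt. In the
# original BERT codebase it flattens the batch and gathers the hidden vector
# at each masked position; a minimal self-contained sketch (assumed, not
# copied from this repository):
import tensorflow as tf

def gather_indexes(sequence_tensor, positions):
    # sequence_tensor: [batch, seq_len, width]; positions: [batch, n_pos]
    shape = tf.shape(sequence_tensor)
    batch_size, seq_length = shape[0], shape[1]
    # Offset each row's positions so they index into the flattened batch.
    flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, -1])
    return tf.gather(flat_sequence, flat_positions)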