# Imports for this fragment; the tokenizer import path assumes the
# gpt2-japanese repo layout (encode_bpe.py defines BPEEncoder_ja).
# `bpe_origin` and `emoji` are assumed to be loaded at module level,
# e.g. from ja-bpe.txt / emoji.json as in main() below.
import torch
from transformers import GPT2LMHeadModel
from encode_bpe import BPEEncoder_ja


def origin():
    tokenizer = BPEEncoder_ja(bpe_origin, emoji)
    model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
    model.resize_token_embeddings(len(tokenizer))
    model.eval()
    device = 'cpu'
    seq_num = 0
    with torch.no_grad():
        for idx in range(3):
            print('=' * 5 + 'start' + '=' * 5)
            finished = False
            # Prompt: 'AIとは' ("What is AI").
            cur_ids = torch.tensor(tokenizer.encode('AIとは')).unsqueeze(0).to(device)
            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                # Distribution over the vocabulary at the last position.
                softmax_logits = torch.softmax(logits[0, -1], dim=0)
                # Sample from a wider pool for the first few tokens, then
                # narrow the choice to keep the continuation coherent.
                n = 20 if i < 3 else 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
                cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)
                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    finished = True
                    print('generate finished')
                    break
            seq_num = seq_num + 1
            output_list = list(cur_ids.squeeze().to('cpu').numpy())
            output_text = tokenizer.decode(output_list)
            output_text = output_text.replace(' ', '')
            # Keep only the text before the end-of-text marker.
            output_text = output_text.split('<|endoftext|>')[0]
            print(output_text)
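# `choose_from_top` is used above but not defined in this fragment. A
# minimal top-n sampling sketch, assuming it draws one id from the n most
# probable tokens after re-normalizing their probabilities (hypothetical;
# the original helper may differ):
import numpy as np

def choose_from_top(probs, n=5):
    # Indices of the n highest-probability tokens.
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # re-normalize over the top n
    choice = np.random.choice(n, 1, p=top_prob)
    return int(ind[choice[0]])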
# Assumed imports, following the google-research/bert file layout used by
# the Japanese BERT repos; get_masked_regression_output is defined in this
# project (module not shown here).
import json
import os

import pandas as pd
import tensorflow as tf

from encode_bpe import BPEEncoder_ja
from modeling import BertConfig, BertModel
from run_pretraining import get_masked_lm_output, get_next_sentence_output


def main():
    args = parser.parse_args()
    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')
    if not (len(args.input_file) > 0 or len(args.context) > 0):
        raise ValueError('--input_file or --context required.')
    if (not os.path.isfile(args.input_file)) and len(args.context) == 0:
        raise ValueError('invalid input file name.')
    if len(args.input_file) > 0 and os.path.isfile(args.input_file):
        with open(args.input_file) as f:
            args.context = f.read()

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = 1
    # The special tokens occupy the last four ids of the vocabulary.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)
    bert_config = BertConfig(**bert_config_params)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu
    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)
        output = model.get_sequence_output()
        # Build the pretraining heads so their variables exist in the graph
        # and can be restored from the checkpoint; the outputs are unused.
        (_, _, _) = get_masked_lm_output(
            bert_config, model.get_sequence_output(), model.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        (_, _, _) = get_next_sentence_output(
            bert_config, model.get_pooled_output(), next_sentence_labels)

        masked_lm_values = tf.placeholder(tf.float32, [None, None])
        with tf.variable_scope("loss"):
            (_, outputs) = get_masked_regression_output(
                bert_config, model.get_sequence_output(),
                masked_lm_positions, masked_lm_values, masked_lm_weights)

        saver = tf.train.Saver(var_list=tf.trainable_variables())
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        # One CLS token per sentence; the regression head reads the CLS
        # positions, so record each of them.
        _input_ids = []
        _lm_positions = []
        tokens = [enc.encode(p.strip()) for p in sep_txt(args.context)]
        tokens = [t for t in tokens if len(t) > 0]
        for t in tokens:
            _lm_positions.append(len(_input_ids))
            _input_ids.extend([CLS_TOKEN] + t)
        _input_ids.append(EOT_TOKEN)
        _input_masks = [1] * len(_input_ids)
        _segments = [1] * len(_input_ids)
        # Truncate, then pad everything to max_seq_length.
        _input_ids = _input_ids[:max_seq_length]
        _input_masks = _input_masks[:max_seq_length]
        _segments = _segments[:max_seq_length]
        while len(_segments) < max_seq_length:
            _input_ids.append(0)
            _input_masks.append(0)
            _segments.append(0)
        _lm_positions = [p for p in _lm_positions if p < max_seq_length]
        _lm_positions = _lm_positions[:max_seq_length]
        _lm_weights = [1] * len(_lm_positions)
        while len(_lm_positions) < max_seq_length:
            _lm_positions.append(0)
            _lm_weights.append(0)
        _lm_ids = [0] * len(_lm_positions)
        _lm_vals = [0] * len(_lm_positions)

        regress = sess.run(outputs, feed_dict={
            input_ids: [_input_ids],
            input_mask: [_input_masks],
            segment_ids: [_segments],
            masked_lm_positions: [_lm_positions],
            masked_lm_ids: [_lm_ids],
            masked_lm_weights: [_lm_weights],
            next_sentence_labels: [0],
            masked_lm_values: [_lm_vals],
        })
        regress = regress.reshape((-1,))
        # Print one importance value per sentence, or save them as CSV.
        if args.output_file == '':
            for tok, value in zip(tokens, regress):
                print(f'{value}\t{enc.decode(tok)}')
        else:
            sent = []
            impt = []
            for tok, value in zip(tokens, regress):
                sent.append(enc.decode(tok))
                impt.append(value)
            df = pd.DataFrame({'sentence': sent, 'importance': impt})
            df.to_csv(args.output_file, index=False)
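# `sep_txt` above is not defined in this fragment. A minimal sketch,
# assuming it splits the context into sentences on the Japanese full stop
# (hypothetical; the original splitter may be more careful):
def sep_txt(text):
    # Keep the trailing '。' on each sentence; drop empty fragments.
    return [s + '。' for s in text.split('。') if len(s.strip()) > 0]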
if __name__ == '__main__':
    tokenizer = BPEEncoder_ja(bpe, emoji)
    ids = [0, 0, 0, 0, 1, 2]
    print(tokenizer.decode(ids))
    # model_path = './trained_models/best.pt'
    # model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
    # model.resize_token_embeddings(len(tokenizer))
    # device = torch.device('cuda:2')
    # model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
    # article_generate(model, tokenizer, 1, 5, 0, device)
    # origin()
import numpy as np  # assumed imported at the top of the original script

# Pad every per-row mask-position list out to max_mask_count.
for p in range(len(_mask_positions)):
    q = len(_mask_positions[p])
    if q < max_mask_count:
        _mask_positions[p].extend([0] * (max_mask_count - q))

prob = sess.run(log_prob, feed_dict={
    input_ids: _input_ids,
    input_mask: _input_masks,
    segment_ids: _segments,
    masked_lm_positions: _mask_positions,
    masked_lm_ids: np.zeros((len(_input_ids), max_mask_count), dtype=np.int32),
    masked_lm_weights: np.ones((len(_input_ids), max_mask_count), dtype=np.float32),
    next_sentence_labels: np.zeros((len(_input_ids),), dtype=np.int32),
})

results = []
for i in range(len(_input_ids)):
    result_lines = []
    # Skip position 0 (CLS); stop at the first special token.
    for j in range(1, len(_input_ids[i])):
        if _input_ids[i][j] < EOT_TOKEN:
            score = prob[j][_input_ids[i][j]]
            word = enc.decode([_input_ids[i][j]])
            maxword = enc.decode([np.argmax(prob[j])])
            if args.output_max:
                result_lines.append(f'{score}\t{word}\t{maxword}')
            else:
                result_lines.append(f'{score}\t{word}')
        else:
            break
    results.append('\n'.join(result_lines))
print('=================\n'.join(results))
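# The values printed above come from a tensor named `log_prob`, i.e.
# per-token log-probabilities. Assuming that, a sentence-level
# pseudo-perplexity can be derived from one sentence's scores (a sketch;
# `scores` is the list of per-token values printed for that sentence):
import math

def pseudo_perplexity(scores):
    # exp of the negative mean token log-probability.
    return math.exp(-sum(scores) / len(scores))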
if args.output_file != "": outf = open(args.output_file, "w") prob = sess.run(log_prob, feed_dict={ input_ids:_input_ids, input_mask:_input_masks, segment_ids:_segments, masked_lm_positions:_mask_positions, masked_lm_ids:np.zeros((len(_input_ids),max_mask_count), dtype=np.int32), masked_lm_weights:np.ones((len(_input_ids),max_mask_count), dtype=np.float32), next_sentence_labels:np.zeros((len(_input_ids),), dtype=np.int32), }) mask_count = 0 for i in range(len(_input_ids)): result_token = [] for j in range(len(_input_ids[i])): if CLS_TOKEN == _input_ids[i][j]: pass elif MASK_TOKEN == _input_ids[i][j]: result_token.append(np.argmax(prob[mask_count])) mask_count += 1 elif EOT_TOKEN <= _input_ids[i][j]: break else: result_token.append(_input_ids[i][j]) oo = enc.decode(result_token) print(oo) if outf: outf.write(oo+'\n') outf.flush()
output = score_tokens(hparams=hparams, tokens=tokens_tensor)
saver = tf.train.Saver()
ckpt = tf.train.latest_checkpoint(args.model)
saver.restore(sess, ckpt)

end_token = enc.encode('<|endoftext|>')[0]
start_token = end_token  # the same id does double duty as a start token

for text in texts:
    # Prepend the start token so that the first "real" token also gets a
    # probability.
    tokens = enc.encode(text)
    if not args.exclude_end:
        tokens += [end_token]
    tokens_with_start = [start_token] + tokens
    logprobs = sess.run(output, feed_dict={
        tokens_tensor: tokens_with_start,
    })
    logprobs_list = logprobs.tolist()
    assert len(logprobs_list) == len(tokens)  # sanity check
    # Total log-probability of the text, then optionally per-token detail.
    print('%s\t%.5g' % (text, sum(logprobs_list)))
    if args.tokens:
        for t, lp in zip(tokens, logprobs_list):
            print('%s\t%.5g' % (enc.decode([t]), lp))
        print()
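# `score_tokens` is not shown in this fragment. A minimal sketch, assuming
# the usual TF1 GPT-2 graph where model.model() returns a dict with
# 'logits' (hypothetical; the real function may differ):
import model  # TF1 GPT-2 graph definition, assumed from the repo

def score_tokens(*, hparams, tokens):
    lm_output = model.model(hparams=hparams, X=tokens[tf.newaxis, :])
    logprobs = tf.nn.log_softmax(lm_output['logits'][0], axis=-1)
    # Log-probability of each actual next token given its prefix.
    indices = tf.stack(
        [tf.range(tf.shape(tokens)[0] - 1), tokens[1:]], axis=1)
    return tf.gather_nd(logprobs[:-1], indices)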