import torch
from transformers import GPT2LMHeadModel
# BPEEncoder_ja (and the bpe_origin/emoji tables) are assumed to come from
# gpt2-japanese's encoder module.


def origin():
  tokenizer = BPEEncoder_ja(bpe_origin, emoji)
  model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
  model.resize_token_embeddings(len(tokenizer))
  model.eval()
  device = 'cpu'
  seq_num = 0

  with torch.no_grad():
    for idx in range(3):
      print('=' * 5 + 'start' + '=' * 5)
      finished = False
      cur_ids = torch.tensor(tokenizer.encode('AIとは')).unsqueeze(0).to(device)

      for i in range(100):
        outputs = model(cur_ids, labels=cur_ids)
        loss, logits = outputs[:2]

        # Distribution over the vocabulary at the last position.
        softmax_logits = torch.softmax(logits[0, -1], dim=0)

        # Sample from the top 20 candidates for the first few tokens, then
        # narrow to the top 3 to keep the continuation coherent.
        n = 20 if i < 3 else 3
        next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
        cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)

        if next_token_id in tokenizer.encode('<|endoftext|>'):
          finished = True
          print('generate finished')
          break

      seq_num = seq_num + 1
      output_list = list(cur_ids.squeeze().to('cpu').numpy())
      output_text = tokenizer.decode(output_list)
      output_text = output_text.replace(' ', '')
      # Keep only the text before the end-of-text marker.
      output_text = output_text.split('<|endoftext|>')[0]
      print(output_text)
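

import numpy as np

# choose_from_top() is referenced above but not defined in this listing.
# A minimal sketch of a top-n sampler, as an assumption about its behavior:
# keep the n most probable token ids, renormalize, and sample one of them.
def choose_from_top(probs, n=5):
  ind = np.argpartition(probs, -n)[-n:]        # indices of the n largest probabilities
  top_probs = probs[ind] / np.sum(probs[ind])  # renormalize over those candidates
  return int(np.random.choice(ind, p=top_probs))
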
def main():
    # TF1-style graph code: relies on tf.placeholder / tf.Session and on BERT's
    # modeling and pretraining helpers (BertConfig, BertModel,
    # get_masked_lm_output, get_next_sentence_output, get_masked_regression_output).
    args = parser.parse_args()

    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')

    if not (len(args.input_file) > 0 or len(args.context) > 0):
        raise ValueError('--input_file or --context required.')
    if (not os.path.isfile(args.input_file)) and len(args.context) == 0:
        raise ValueError('invalid input file name.')
    if len(args.input_file) > 0 and os.path.isfile(args.input_file):
        with open(args.input_file) as f:
            args.context = f.read()

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = 1
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu

    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)

        # Build the pretraining heads so that their variables exist in the
        # graph for checkpoint restoring; the outputs themselves are unused.
        output = model.get_sequence_output()
        (_, _, _) = get_masked_lm_output(bert_config,
                                         model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)

        masked_lm_values = tf.placeholder(tf.float32, [None, None])

        with tf.variable_scope("loss"):
            (_, outputs) = get_masked_regression_output(
                bert_config, model.get_sequence_output(), masked_lm_positions,
                masked_lm_values, masked_lm_weights)

            saver = tf.train.Saver(var_list=tf.trainable_variables())
            ckpt = tf.train.latest_checkpoint(args.model)
            saver.restore(sess, ckpt)

            # Encode each sentence, prefix it with CLS_TOKEN, and record where
            # each sentence begins; those positions feed the regression head.
            _input_ids = []
            _lm_positions = []
            tokens = [enc.encode(p.strip()) for p in sep_txt(args.context)]
            tokens = [t for t in tokens if len(t) > 0]
            for t in tokens:
                _lm_positions.append(len(_input_ids))
                _input_ids.extend([CLS_TOKEN] + t)
            _input_ids.append(EOT_TOKEN)
            _input_masks = [1] * len(_input_ids)
            _segments = [1] * len(_input_ids)
            # Truncate to max_seq_length, then zero-pad up to it.
            _input_ids = _input_ids[:max_seq_length]
            _input_masks = _input_masks[:max_seq_length]
            _segments = _segments[:max_seq_length]
            while len(_segments) < max_seq_length:
                _input_ids.append(0)
                _input_masks.append(0)
                _segments.append(0)
            # Keep only positions that survived truncation, then pad the rest.
            _lm_positions = [p for p in _lm_positions if p < max_seq_length]
            _lm_positions = _lm_positions[:max_seq_length]
            _lm_lm_weights = [1] * len(_lm_positions)
            while len(_lm_positions) < max_seq_length:
                _lm_positions.append(0)
                _lm_lm_weights.append(0)
            _lm_ids = [0] * len(_lm_positions)
            _lm_vals = [0] * len(_lm_positions)

            regress = sess.run(outputs,
                               feed_dict={
                                   input_ids: [_input_ids],
                                   input_mask: [_input_masks],
                                   segment_ids: [_segments],
                                   masked_lm_positions: [_lm_positions],
                                   masked_lm_ids: [_lm_ids],
                                   masked_lm_weights: [_lm_lm_weights],
                                   next_sentence_labels: [0],
                                   masked_lm_values: [_lm_vals]
                               })
            regress = regress.reshape((-1, ))  # one importance value per sentence
            if args.output_file == '':
                for tok, value in zip(tokens, regress):
                    print(f'{value}\t{enc.decode(tok)}')
            else:
                sent = []
                impt = []
                for tok, value in zip(tokens, regress):
                    sent.append(enc.decode(tok))
                    impt.append(value)
                df = pd.DataFrame({'sentence': sent, 'importance': impt})
                df.to_csv(args.output_file, index=False)
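
# sep_txt() is used above but not defined in this listing. A minimal sketch,
# assuming it splits Japanese text into sentences on the 。 full stop:
def sep_txt(text):
    return [s + '。' for s in text.split('。') if s.strip()]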
  

if __name__ == '__main__':
  # Quick smoke test: load the BPE tables and decode a few token ids.
  with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')
  with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())
  tokenizer = BPEEncoder_ja(bpe, emoji)
  ids = [0, 0, 0, 0, 1, 2]
  print(tokenizer.decode(ids))
  # model_path = './trained_models/best.pt'
  # model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
  # model.resize_token_embeddings(len(tokenizer))

  # device = torch.device('cuda:2')
  # model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
  # article_generate(model, tokenizer, 1, 5, 0, device)
  
  # try:
  #   article_generate(model, tokenizer, 1, 5, 0, device)
  # except:
  #   print('=' * 30)
  # origin()
      
Example #4
    # Pad every example's mask-position list to max_mask_count entries.
    for p in range(len(_mask_positions)):
        q = len(_mask_positions[p])
        if q < max_mask_count:
            _mask_positions[p].extend([0]*(max_mask_count-q))

    prob = sess.run(log_prob, feed_dict={
        input_ids:_input_ids,
        input_mask:_input_masks,
        segment_ids:_segments,
        masked_lm_positions:_mask_positions,
        masked_lm_ids:np.zeros((len(_input_ids),max_mask_count), dtype=np.int32),
        masked_lm_weights:np.ones((len(_input_ids),max_mask_count), dtype=np.float32),
        next_sentence_labels:np.zeros((len(_input_ids),), dtype=np.int32),
    })
    # Report each input token's log-probability and, when args.output_max is
    # set, the most likely token at that position.
    results = []
    for i in range(len(_input_ids)):
        result_lines = []
        for j in range(1,len(_input_ids[i])):
            if _input_ids[i][j] < EOT_TOKEN:
                score = prob[j][_input_ids[i][j]]
                word = enc.decode([_input_ids[i][j]])
                maxword = enc.decode([np.argmax(prob[j])])
                if args.output_max:
                    result_lines.append(f'{score}\t{word}\t{maxword}')
                else:
                    result_lines.append(f'{score}\t{word}')
            else:
                break
        results.append('\n'.join(result_lines))
    print('=================\n'.join(results))
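
    # log_prob is not defined in this excerpt; a sketch of one way it may be
    # built (assumed): the third return value of BERT's get_masked_lm_output,
    # created the same way as in the main() example above.
    (_, _, log_prob) = get_masked_lm_output(bert_config,
                                            model.get_sequence_output(),
                                            model.get_embedding_table(),
                                            masked_lm_positions, masked_lm_ids,
                                            masked_lm_weights)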
Example #5
    if args.output_file != "":
        outf = open(args.output_file, "w")

    prob = sess.run(log_prob, feed_dict={
        input_ids:_input_ids,
        input_mask:_input_masks,
        segment_ids:_segments,
        masked_lm_positions:_mask_positions,
        masked_lm_ids:np.zeros((len(_input_ids),max_mask_count), dtype=np.int32),
        masked_lm_weights:np.ones((len(_input_ids),max_mask_count), dtype=np.float32),
        next_sentence_labels:np.zeros((len(_input_ids),), dtype=np.int32),
    })
    # Rebuild each sentence, substituting the argmax prediction for every
    # MASK_TOKEN; mask_count walks through prob in mask order.
    mask_count = 0
    for i in range(len(_input_ids)):
        result_token = []
        for j in range(len(_input_ids[i])):
            if CLS_TOKEN == _input_ids[i][j]:
                pass
            elif MASK_TOKEN == _input_ids[i][j]:
                result_token.append(np.argmax(prob[mask_count]))
                mask_count += 1
            elif EOT_TOKEN <= _input_ids[i][j]:
                break
            else:
                result_token.append(_input_ids[i][j])
        oo = enc.decode(result_token)
        print(oo)
        if outf:
            outf.write(oo+'\n')
            outf.flush()
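
    # The construction of _input_ids and _mask_positions is not shown in this
    # listing. A minimal sketch of one way to build them, assuming '[MASK]'
    # markers in the raw text (an illustration, not the original pipeline):
    def build_masked_input(enc, line):
        ids = [CLS_TOKEN]
        positions = []
        for part in line.split('[MASK]'):
            ids.extend(enc.encode(part))
            positions.append(len(ids))  # index where the next MASK_TOKEN lands
            ids.append(MASK_TOKEN)
        ids.pop()        # drop the extra mask appended after the final piece
        positions.pop()
        ids.append(EOT_TOKEN)
        return ids, positions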

Example #6

    output = score_tokens(hparams=hparams, tokens=tokens_tensor)

    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(args.model)
    saver.restore(sess, ckpt)

    end_token = enc.encode('<|endoftext|>')[0]
    start_token = end_token  # it does double duty

    for text in texts:
        # prepend the start token so that we get a probability for the first "real" token
        tokens = enc.encode(text)
        if not args.exclude_end:
            tokens += [end_token]
        tokens_with_start = [start_token] + tokens

        logprobs = sess.run(output,
                            feed_dict={
                                tokens_tensor: tokens_with_start,
                            })

        logprobs_list = logprobs.tolist()
        assert len(logprobs_list) == len(tokens)  # sanity check

        print('%s\t%.5g' % (text, sum(logprobs_list)))
        if args.tokens:
            for t, lp in zip(tokens, logprobs_list):
                print('%s\t%.5g' % (enc.decode([t]), lp))
            print()
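
    # score_tokens() is not defined in this excerpt. A minimal sketch, assuming
    # the model module from openai/gpt-2 (model.model returns a dict with
    # 'logits'); it returns the log-probability of each realized next token.
    def score_tokens(hparams, tokens):
        lm_output = model.model(hparams=hparams, X=tokens[tf.newaxis, :])
        logprobs = tf.nn.log_softmax(lm_output['logits'][0], axis=-1)  # [seq, vocab]
        # position i predicts tokens[i + 1]; gather those realized next tokens
        idx = tf.stack([tf.range(tf.shape(tokens)[0] - 1), tokens[1:]], axis=1)
        return tf.gather_nd(logprobs[:-1], idx)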