Example #1
0
def origin():
  """Generate and print three continuations of the prompt 'AIとは'.

  Loads the 'gpt2-pytorch-model-medium' checkpoint, resizes its embeddings
  to the tokenizer size and runs on CPU. Each sample is grown one token at
  a time for at most 100 steps; generation stops early when the chosen
  token is one of the ids produced by encoding '<|endoftext|>'.

  The decoded text has spaces removed and is cut at the first
  '<|endoftext|>' marker before being printed.
  """
  tokenizer = BPEEncoder_ja(bpe_origin, emoji)
  model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
  model.resize_token_embeddings(len(tokenizer))
  model.eval()
  device = 'cpu'
  # The end-of-text ids never change, so encode them once instead of on
  # every generation step.
  eot_ids = tokenizer.encode('<|endoftext|>')

  with torch.no_grad():
    for _ in range(3):
      print('=' * 5 + 'start' + '=' * 5)
      cur_ids = torch.tensor(tokenizer.encode('AIとは')).unsqueeze(0).to(device)

      for i in range(100):
        outputs = model(cur_ids, labels=cur_ids)
        # With labels supplied, the first two outputs are (loss, logits);
        # only the logits are needed for sampling.
        _, logits = outputs[:2]

        # Distribution over the vocabulary for the last position.
        softmax_logits = torch.softmax(logits[0, -1], dim=0)

        # Wider candidate pool for the first three tokens, narrower after.
        n = 20 if i < 3 else 3
        next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
        cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1)

        if next_token_id in eot_ids:
          print('generate finished')
          break

      output_list = list(cur_ids.squeeze().to('cpu').numpy())
      output_text = tokenizer.decode(output_list)
      output_text = output_text.replace(' ','')
      # Keep only the text before the end-of-text marker. The original
      # computed this split but discarded the result.
      output_text = output_text.split('<|endoftext|>')[0]
      print(output_text)
    parser.add_argument("--dataset_type", help="dataset type", default='split')
    parser.add_argument("--split_tag",
                        help="text split tag",
                        default='<|SP_QA|>')
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0)
    parser.add_argument('--temperature', type=float, default=1)
    parser.add_argument("--gpu", help="use gpu number", default='0')
    parser.add_argument("--verbose", action='store_true')
    args = parser.parse_args()

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)
    n_vocab = len(enc)
    eot_token = enc.encode('<|endoftext|>')[0]
    sep_token = enc.encode('<|byte0|>')[0]
    temperature = args.temperature
    top_k = args.top_k
    top_p = args.top_p
    min_answer_len = args.min_answer_len

    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            params = json.loads(f.read())
            hparams = HParams(**params)
            n_prediction = params['n_prediction']
    elif 'small' in args.model:
        hparams = HParams(
def main():
    """Predict a regression value for every segment of a context with BERT.

    The model configuration is read from ``<args.model>/hparams.json`` and
    the weights from the latest checkpoint in ``args.model``. The context is
    taken from ``args.input_file`` when that names an existing file,
    otherwise from ``args.context``. It is split by ``sep_txt`` and each
    non-empty encoded segment is prefixed with ``CLS_TOKEN``; the regression
    output is read at those CLS positions.

    Results are printed as ``value<TAB>text`` lines when ``args.output_file``
    is empty, otherwise written to that path as a CSV with the columns
    ``sentence`` and ``importance``.

    Raises:
        ValueError: if ``hparams.json`` is missing, if neither an input file
            nor a context is given, or if the input file does not exist and
            no context is given.
    """
    args = parser.parse_args()

    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')

    if not (len(args.input_file) > 0 or len(args.context) > 0):
        raise ValueError('--input_file or --context required.')
    if (not os.path.isfile(args.input_file)) and len(args.context) == 0:
        raise ValueError('invalid input file name.')
    if len(args.input_file) > 0 and os.path.isfile(args.input_file):
        # Read as UTF-8 explicitly, like the vocabulary files below, so the
        # result does not depend on the platform's locale encoding.
        with open(args.input_file, encoding='utf-8') as f:
            args.context = f.read()

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = 1
    # The special tokens occupy the last four ids of the vocabulary.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu

    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)

        output = model.get_sequence_output()
        # The pre-training heads are built but their outputs are discarded.
        # NOTE(review): presumably kept so the graph matches the one used
        # during training; confirm before removing.
        (_, _, _) = get_masked_lm_output(bert_config,
                                         model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)

        # This saver is replaced below before it is ever used.
        saver = tf.train.Saver()

        masked_lm_values = tf.placeholder(tf.float32, [None, None])

        with tf.variable_scope("loss"):
            (_, outputs) = get_masked_regression_output(
                bert_config, model.get_sequence_output(), masked_lm_positions,
                masked_lm_values, masked_lm_weights)

            saver = tf.train.Saver(var_list=tf.trainable_variables())
            ckpt = tf.train.latest_checkpoint(args.model)
            saver.restore(sess, ckpt)

            _input_ids = []
            _lm_positions = []
            tokens = [enc.encode(p.strip()) for p in sep_txt(args.context)]
            tokens = [t for t in tokens if len(t) > 0]
            for t in tokens:
                # Remember where this segment's CLS token lands; the
                # regression value is read at that position.
                _lm_positions.append(len(_input_ids))
                _input_ids.extend([CLS_TOKEN] + t)
            _input_ids.append(EOT_TOKEN)
            _input_masks = [1] * len(_input_ids)
            _segments = [1] * len(_input_ids)
            # Truncate, then zero-pad, every per-token list to max_seq_length.
            _input_ids = _input_ids[:max_seq_length]
            _input_masks = _input_masks[:max_seq_length]
            _segments = _segments[:max_seq_length]
            while len(_segments) < max_seq_length:
                _input_ids.append(0)
                _input_masks.append(0)
                _segments.append(0)
            # Drop positions that fell off the end after truncation.
            _lm_positions = [p for p in _lm_positions if p < max_seq_length]
            _lm_positions = _lm_positions[:max_seq_length]
            _lm_lm_weights = [1] * len(_lm_positions)
            while len(_lm_positions) < max_seq_length:
                _lm_positions.append(0)
                _lm_lm_weights.append(0)
            _lm_ids = [0] * len(_lm_positions)
            _lm_vals = [0] * len(_lm_positions)

            regress = sess.run(outputs,
                               feed_dict={
                                   input_ids: [_input_ids],
                                   input_mask: [_input_masks],
                                   segment_ids: [_segments],
                                   masked_lm_positions: [_lm_positions],
                                   masked_lm_ids: [_lm_ids],
                                   masked_lm_weights: [_lm_lm_weights],
                                   next_sentence_labels: [0],
                                   masked_lm_values: [_lm_vals]
                               })
            regress = regress.reshape((-1, ))
            # zip() stops at the shorter sequence, so padded positions in
            # `regress` beyond the real segments are ignored.
            if args.output_file == '':
                for tok, value in zip(tokens, regress):
                    print(f'{value}\t{enc.decode(tok)}')
            else:
                sent = []
                impt = []
                for tok, value in zip(tokens, regress):
                    sent.append(enc.decode(tok))
                    impt.append(value)
                df = pd.DataFrame({'sentence': sent, 'importance': impt})
                df.to_csv(args.output_file, index=False)
Example #4
0
          print('generate finished')
          break
  
      # print(cur_ids.shape)
      # if finished:
      # print(cur_ids.tolist()[0])
      seq_num = seq_num + 1
      output_list = list(cur_ids.squeeze().to('cpu').numpy())
      # output_list = cur_ids.tolist()[0]
      output_text = tokenizer.decode(output_list)
      output_text = output_text.replace(' ','')
      output_text.split('<|endoftext|>')[0]
      print(output_text)

if __name__ == '__main__':
  # Smoke test: build the tokenizer and print the decoding of a fixed list
  # of token ids.
  tokenizer = BPEEncoder_ja(bpe, emoji)
  ids = [0,0,0,0,1,2]
  print(tokenizer.decode(ids))
  # Disabled: load the base model and resize it to the tokenizer size.
  # model_path = './trained_models/best.pt'
  # model = GPT2LMHeadModel.from_pretrained('gpt2-pytorch-model-medium')
  # model.resize_token_embeddings(len(tokenizer))

  # Disabled: restore fine-tuned weights and run article generation.
  # device = torch.device('cuda:2')
  # model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
  # article_generate(model, tokenizer, 1, 5, 0, device)
  
  # Disabled: same generation call with a catch-all handler, followed by
  # the baseline sampler origin().
  # try:
  #   article_generate(model, tokenizer, 1, 5, 0, device)
  # except:
  #   print('=' * 30)
  # origin()
Example #5
0
    return ln_probs_next


# Command-line interface: a positional input file plus the model directory
# name and the GPU device list.
parser = argparse.ArgumentParser()
parser.add_argument('input_file')
parser.add_argument('--model', default='gpt2ja-medium')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

# Open the vocabulary files as UTF-8 explicitly; without `encoding` Python
# falls back to the locale's preferred encoding, which is not UTF-8 on
# every platform.
with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')

with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if 'small' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12
        })
elif 'medium' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
Example #6
0
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)
    (_,_,_) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(args.model)
    saver.restore(sess, ckpt)

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    if args.split_tag != '':
        contexts = args.context.split(args.split_tag)
    else:
        contexts = [args.context]
    _input_ids = []
    _input_masks = []
    _segments = []
    _mask_positions = []
    for context in contexts:
        context_tokens = enc.encode(context)
        context_tokens = context_tokens[:max_seq_length-3]
        inputs = []
        inputs.append(CLS_TOKEN)
        inputs.extend(context_tokens)
def main():
    """Fine-tune BERT with a per-position regression head.

    Reads the model configuration from ``<args.model>/hparams.json``,
    encodes every file in ``args.input_dir`` with ``encode_one`` in a process
    pool, restores the pre-trained weights from the latest checkpoint in
    ``args.model`` and trains the loss returned by
    ``get_masked_regression_output`` with Adam.

    Checkpoints, a step counter file and a copy of the hyper-parameters are
    written under ``CHECKPOINT_DIR/<args.run_name>``. When ``args.do_eval``
    is set, the first ``args.eval_rate`` fraction of the shuffled inputs is
    held out and its mean loss is printed after every epoch.

    Raises:
        ValueError: if ``<args.model>/hparams.json`` does not exist.
    """
    # NOTE(review): these are made global presumably so that `encode_one`,
    # which runs in pool workers, can read them; confirm against its body.
    global EOT_TOKEN, MASK_TOKEN, CLS_TOKEN, SEP_TOKEN, enc
    args = parser.parse_args()

    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    save_every = args.save_every
    num_epochs = args.num_epochs
    # The special tokens occupy the last four ids of the vocabulary.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    fl = [f'{args.input_dir}/{f}' for f in os.listdir(args.input_dir)]
    with Pool(args.num_encode_process) as pool:
        imap = pool.imap(encode_one, fl)
        input_contexts = list(tqdm(imap, total=len(fl)))
    input_indexs = np.random.permutation(len(input_contexts))

    if args.do_eval:
        # Hold out the head of the shuffled index list for evaluation.
        eval_num = int(args.eval_rate * len(input_indexs))
        eval_input_indexs = input_indexs[:eval_num]
        input_indexs = input_indexs[eval_num:]

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu

    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=True,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)

        output = model.get_sequence_output()
        # The pre-training heads are built (outputs discarded) before the
        # saver is created, so the saver below covers their variables too.
        (_, _, _) = get_masked_lm_output(bert_config,
                                         model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)
        # Snapshot the restored weights: the global initializer that runs
        # further down resets every variable, after which these values are
        # assigned back.
        train_vars = tf.trainable_variables()
        restored_weights = {}
        for i in range(len(train_vars)):
            restored_weights[train_vars[i].name] = sess.run(train_vars[i])

        labels = tf.placeholder(tf.float32, [
            None,
        ])

        output_layer = model.get_pooled_output()

        # TF 1.x exposes static dimensions as Dimension objects (`.value`).
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value

        masked_lm_values = tf.placeholder(tf.float32, [None, None])

        with tf.variable_scope("loss"):
            (loss, _) = get_masked_regression_output(
                bert_config, model.get_sequence_output(), masked_lm_positions,
                masked_lm_values, masked_lm_weights)

            opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
            train_vars = tf.trainable_variables()
            opt_grads = tf.gradients(loss, train_vars)
            opt_grads = list(zip(opt_grads, train_vars))
            opt_apply = opt.apply_gradients(opt_grads)
            summaries = tf.summary.scalar('loss', loss)
            summary_log = tf.summary.FileWriter(
                os.path.join(CHECKPOINT_DIR, args.run_name))

            counter = 1
            counter_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'counter')
            if os.path.exists(counter_path):
                # Load the step number if we're resuming a run
                # Add 1 so we don't immediately try to save again
                with open(counter_path, 'r') as fp:
                    counter = int(fp.read()) + 1

            hparams_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'hparams.json')
            maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
            with open(hparams_path, 'w') as fp:
                fp.write(json.dumps(bert_config_params))

            sess.run(tf.global_variables_initializer())  # init output_weights
            # Put the pre-trained values back after the global init.
            restored = 0
            for k, v in restored_weights.items():
                for i in range(len(train_vars)):
                    if train_vars[i].name == k:
                        assign_op = train_vars[i].assign(v)
                        sess.run(assign_op)
                        restored += 1
            assert restored == len(restored_weights), 'fail to restore model.'
            saver = tf.train.Saver(var_list=tf.trainable_variables())

            def save():
                """Write a checkpoint and the current step counter to disk."""
                maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
                print(
                    'Saving',
                    os.path.join(CHECKPOINT_DIR, args.run_name,
                                 'model-{}').format(counter))
                saver.save(sess,
                           os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'model'),
                           global_step=counter)
                with open(counter_path, 'w') as fp:
                    fp.write(str(counter) + '\n')

            # (weighted loss sum, weight sum) of a running average with
            # decay 0.99.
            avg_loss = (0.0, 0.0)
            start_time = time.time()

            def sample_feature(i, eval=False):
                """Build the feed dict for batch ``i`` of the train or eval split.

                Every per-example list is truncated and zero-padded to
                ``max_seq_length`` so the batch is rectangular.
                """
                indexs = eval_input_indexs if eval else input_indexs
                last = min((i + 1) * batch_size, len(indexs))
                _input_ids = []
                _input_masks = []
                _segments = []
                _lm_positions = []
                _lm_vals = []
                _lm_lm_weights = []
                _lm_ids = []
                for j in range(i * batch_size, last, 1):
                    (lm_tokens, lm_positions,
                     lm_imprtances) = input_contexts[indexs[j]]
                    ids = copy(lm_tokens)[:max_seq_length]
                    seg = [1] * len(ids)
                    while len(ids) < max_seq_length:
                        ids.append(0)
                        seg.append(0)
                    _input_ids.append(ids)
                    _input_masks.append(seg)
                    _segments.append(seg)
                    pos = copy(lm_positions)[:max_seq_length]
                    val = copy(lm_imprtances)[:max_seq_length]
                    wei = [1] * len(pos)
                    # Pad on the length of `pos`. The original tested
                    # `len(ids)`, which is already max_seq_length here, so
                    # the padding never ran and the rows stayed ragged.
                    while len(pos) < max_seq_length:
                        pos.append(0)
                        val.append(0)
                        wei.append(0)
                    _lm_positions.append(pos)
                    _lm_ids.append([0] * max_seq_length)
                    _lm_lm_weights.append(wei)
                    _lm_vals.append(val)

                return {
                    input_ids: _input_ids,
                    input_mask: _input_masks,
                    segment_ids: _segments,
                    masked_lm_positions: _lm_positions,
                    masked_lm_ids: _lm_ids,
                    masked_lm_weights: _lm_lm_weights,
                    next_sentence_labels: [0] * len(_input_ids),
                    masked_lm_values: _lm_vals
                }

            try:
                for ep in range(num_epochs):
                    if ep % args.save_every == 0:
                        save()

                    prog = tqdm(range(0, len(input_indexs) // batch_size, 1))
                    for i in prog:
                        (_, v_loss, v_summary) = sess.run(
                            (opt_apply, loss, summaries),
                            feed_dict=sample_feature(i))

                        summary_log.add_summary(v_summary, counter)

                        avg_loss = (avg_loss[0] * 0.99 + v_loss,
                                    avg_loss[1] * 0.99 + 1.0)

                        prog.set_description(
                            '[{ep} | {time:2.0f}] loss={loss:.4f} avg={avg:.4f}'
                            .format(ep=ep,
                                    time=time.time() - start_time,
                                    loss=v_loss,
                                    avg=avg_loss[0] / avg_loss[1]))

                        counter += 1

                    if args.do_eval:
                        eval_losses = []
                        for i in tqdm(
                                range(0,
                                      len(eval_input_indexs) // batch_size,
                                      1)):
                            eval_losses.append(
                                sess.run(loss,
                                         feed_dict=sample_feature(i, True)))
                        print("eval loss:", np.mean(eval_losses))

            except KeyboardInterrupt:
                # Save progress on Ctrl-C; the final save below still runs.
                print('interrupted')
                save()

            save()
def main():
    """Classify every input text with a fine-tuned BERT and report the results.

    Reads ``hparams.json`` (model configuration) and ``idmaps.json``
    (sub-directory name -> label id) from ``args.model``. Every file under
    ``<args.input_dir>/<sub-directory>`` is encoded either line by line
    (``args.train_by_line``) or as a whole, and padded with zeros to the
    model's maximum sequence length.

    Writes a CSV with the columns ``id``, ``y_true`` and ``y_pred`` to
    ``args.output_file`` and a confusion-matrix image next to it, named
    ``<output file without .csv>_map.png``.

    Raises:
        ValueError: if ``hparams.json`` or ``idmaps.json`` is missing.
    """
    args = parser.parse_args()

    if os.path.isfile(args.model + '/hparams.json'):
        with open(args.model + '/hparams.json') as f:
            bert_config_params = json.load(f)
    else:
        raise ValueError('invalid model name.')
    if os.path.isfile(args.model + '/idmaps.json'):
        with open(args.model + '/idmaps.json') as f:
            idmapping_dict = json.load(f)
    else:
        raise ValueError('invalid model name.')

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    # The special tokens occupy the last four ids of the vocabulary.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    num_labels = len(idmapping_dict)
    input_contexts = []
    input_keys = []
    input_names = []
    for f, i in idmapping_dict.items():
        n = 0
        for t in os.listdir(f'{args.input_dir}/{f}'):
            if os.path.isfile(f'{args.input_dir}/{f}/{t}'):
                with open(f'{args.input_dir}/{f}/{t}', encoding='utf-8') as fn:
                    if args.train_by_line:
                        # One example per line: CLS + tokens + EOT + SEP.
                        for ln, p in enumerate(fn.readlines()):
                            tokens = enc.encode(p.strip())[:max_seq_length - 3]
                            tokens = [CLS_TOKEN
                                      ] + tokens + [EOT_TOKEN, SEP_TOKEN]
                            if len(tokens) < max_seq_length:
                                tokens.extend([0] *
                                              (max_seq_length - len(tokens)))
                            input_contexts.append(tokens)
                            input_keys.append(i)
                            input_names.append(f'{f}/{t}#{ln}')
                            n += 1
                    else:
                        # One example per file: CLS + tokens + SEP.
                        p = fn.read()
                        tokens = enc.encode(p.strip())[:max_seq_length - 2]
                        tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
                        if len(tokens) < max_seq_length:
                            tokens.extend([0] * (max_seq_length - len(tokens)))
                        input_contexts.append(tokens)
                        input_keys.append(i)
                        input_names.append(f'{f}/{t}')
                        n += 1
        print(f'{args.input_dir}/{f} mapped for id_{i}, read {n} contexts.')
    input_indexs = np.arange(len(input_contexts))

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu

    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)

        output = model.get_sequence_output()
        # The pre-training heads are built but their outputs are discarded.
        # NOTE(review): presumably kept so the graph matches the one used
        # during training; confirm before removing.
        (_, _, _) = get_masked_lm_output(bert_config,
                                         model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)

        # This saver is replaced below before it is ever used.
        saver = tf.train.Saver()

        # The batch dimension is left open so the final, possibly shorter,
        # batch can be fed. The placeholder is fed but not used by the
        # fetched `probabilities`.
        labels = tf.placeholder(tf.int32, [
            None,
        ])

        output_layer = model.get_pooled_output()

        # TF 1.x exposes static dimensions as Dimension objects (`.value`).
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value

        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)

        saver = tf.train.Saver(var_list=tf.trainable_variables())
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)

        def sample_feature(i):
            """Build the feed dict for batch ``i`` (the last batch may be short)."""
            last = min((i + 1) * batch_size, len(input_indexs))
            _input_ids = [
                input_contexts[idx]
                for idx in input_indexs[i * batch_size:last]
            ]
            # NOTE(review): contexts are already padded to max_seq_length,
            # so these masks and segments come out as all ones; confirm this
            # matches what the model was trained with.
            _input_masks = [[1] * len(input_contexts[idx]) + [0] *
                            (max_seq_length - len(input_contexts[idx]))
                            for idx in input_indexs[i * batch_size:last]]
            _segments = [[1] * len(input_contexts[idx]) + [0] *
                         (max_seq_length - len(input_contexts[idx]))
                         for idx in input_indexs[i * batch_size:last]]
            _labels = [
                input_keys[idx] for idx in input_indexs[i * batch_size:last]
            ]
            return {
                input_ids: _input_ids,
                input_mask: _input_masks,
                segment_ids: _segments,
                masked_lm_positions: np.zeros((len(_input_ids), 0),
                                              dtype=np.int32),
                masked_lm_ids: np.zeros((len(_input_ids), 0), dtype=np.int32),
                masked_lm_weights: np.ones((len(_input_ids), 0),
                                           dtype=np.float32),
                next_sentence_labels: np.zeros((len(_input_ids), ),
                                               dtype=np.int32),
                labels: _labels
            }

        preds = []
        # Round the batch count up so the trailing partial batch is
        # predicted too. With floor division `preds` came out shorter than
        # `input_names` / `input_keys` and the DataFrame below raised.
        num_batches = (len(input_contexts) + batch_size - 1) // batch_size
        prog = tqdm.tqdm(range(0, num_batches, 1))
        for i in prog:
            prob = sess.run(probabilities, feed_dict=sample_feature(i))
            for p in prob:
                pred = np.argmax(p)
                preds.append(pred)

        pd.DataFrame({
            'id': input_names,
            'y_true': input_keys,
            'y_pred': preds
        }).to_csv(args.output_file, index=False)

        # Confusion matrix: rows are true labels, columns are predictions.
        r = np.zeros((num_labels, num_labels), dtype=int)
        for t, p in zip(input_keys, preds):
            r[t, p] += 1
        fig = plt.figure(figsize=(12, 6), dpi=72)
        ax = plt.matshow(r, interpolation='nearest', aspect=.5, cmap='cool')
        for (i, j), z in np.ndenumerate(r):
            # Counts of 1000 or more are abbreviated with a K suffix.
            if z >= 1000:
                plt.text(j - .33,
                         i,
                         '{:0.1f}K'.format(z / 1000),
                         ha='left',
                         va='center',
                         size=9,
                         color='black')
            else:
                plt.text(j - .33,
                         i,
                         f'{z}',
                         ha='left',
                         va='center',
                         size=9,
                         color='black')
        pfile = args.output_file
        if args.output_file.lower().endswith('.csv'):
            pfile = args.output_file[:-4]
        plt.savefig(pfile + '_map.png')
Example #9
0
# Command-line interface: a positional input file, the model directory name,
# two boolean switches and the GPU device list.
parser = argparse.ArgumentParser()
parser.add_argument('input_file')
parser.add_argument('--model', default='gpt2ja-medium')
parser.add_argument('--tokens', action='store_true')
parser.add_argument('--exclude-end', action='store_true')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

# Load the BPE vocabulary (one entry per line) and the emoji table used by
# the encoder.
with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')

with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

# Prefer the hyper-parameters stored alongside the model; otherwise fall
# back to built-in settings selected by the model name.
if os.path.isfile(args.model + '/hparams.json'):
    with open(args.model + '/hparams.json', encoding='utf-8') as f:
        params = json.loads(f.read())
        hparams = HParams(**params)
elif 'small' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12
        })
Example #10
0
 def __init__(self):
     """Set the default sampling parameters and build the tokenizer."""
     # Sampling defaults: temperature 1, top-k 40, top-p 0.
     self.temperature, self.top_k, self.top_p = 1, 40, 0
     # Built last, after the plain attributes are in place.
     self.tokenizer = BPEEncoder_ja(bpe, emoji)
Example #11
0
    from multiprocessing import Pool
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_dir", help="source dir", required=True)
    parser.add_argument("--dst_file", help="destnation file", required=True)
    parser.add_argument("--num_process",
                        help="process num",
                        type=int,
                        default=8)
    parser.add_argument("--split_tag", help="text split tag", default='')
    args = parser.parse_args()

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')
    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())
    enc = BPEEncoder_ja(bpe, emoji)
    split_types = enc.encode('。.。、;:.,「『')
    eot_token = enc.encode('<|endoftext|>')

    array_file = []

    def _proc(i):
        token_chunks = []
        for j, input in enumerate(tqdm(array_file)):
            if not ((j % args.num_process) == i):
                continue
            with open(input, 'r', encoding='utf-8') as fp:
                raw_text = fp.read()
                if args.split_tag == '':
                    tokens = make_rouge(enc.encode(raw_text))
                else:
# Generation options: number of samples, sampling parameters, output
# switches and the GPU device list.
parser.add_argument('--num_generate', type=int, default=1)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0)
parser.add_argument('--temperature', type=float, default=1)
parser.add_argument('--allow_duplicate_line', action='store_true')
parser.add_argument("--full_sentences", action='store_true')
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

# Load the BPE vocabulary (one entry per line) and the emoji table used by
# the encoder.
with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')

with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

# The model directory must provide hparams.json; its 'n_prediction' entry is
# read in addition to the regular hyper-parameters.
if os.path.isfile(args.model + '/hparams.json'):
    with open(args.model + '/hparams.json') as f:
        params = json.loads(f.read())
        hparams = HParams(**params)
        max_length = params['n_prediction']
else:
    raise ValueError('invalid model name.')

# NOTE(review): looks like the context budget left once max_length positions
# and one more position are reserved; confirm the intent of the -1.
length = hparams.n_ctx - max_length - 1
temperature = args.temperature
top_k = args.top_k
top_p = args.top_p
# First token id produced by encoding the '<|byte0|>' marker.
SEP_TOKEN = enc.encode('<|byte0|>')[0]
Example #13
0
def main():
    """Fine-tune a pretrained BERT checkpoint as a directory-labelled classifier.

    Reads ``hparams.json`` from ``args.model`` for the BERT configuration and
    treats every sub-directory of ``args.input_dir`` as one class whose files
    are the training texts.  The pretraining graph is rebuilt so the latest
    checkpoint in ``args.model`` can be restored, a softmax classification
    head is added on the pooled output, and the whole model is trained with
    Adam on a cross-entropy loss.

    Checkpoints, the step counter, ``hparams.json`` and the label mapping
    ``idmaps.json`` are written under ``CHECKPOINT_DIR/args.run_name``.

    Raises:
        ValueError: if ``args.model`` does not contain ``hparams.json``.
    """
    args = parser.parse_args()

    hparams_file = args.model + '/hparams.json'
    if not os.path.isfile(hparams_file):
        raise ValueError('invalid model name.')
    with open(hparams_file) as f:
        bert_config_params = json.load(f)

    vocab_size = bert_config_params['vocab_size']
    max_seq_length = bert_config_params['max_position_embeddings']
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    # Special tokens are taken from the last four vocabulary ids.
    EOT_TOKEN = vocab_size - 4
    MASK_TOKEN = vocab_size - 3  # not used below; kept to document the layout
    CLS_TOKEN = vocab_size - 2
    SEP_TOKEN = vocab_size - 1

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    # Every sub-directory of input_dir is one class; sorting makes the
    # directory-name -> label-id mapping deterministic.
    keys = sorted(
        f for f in os.listdir(args.input_dir)
        if os.path.isdir(args.input_dir + '/' + f)
    )
    num_labels = len(keys)
    input_contexts = []  # token ids, zero-padded to max_seq_length
    input_lengths = []   # number of real (non-padding) tokens per context
    input_keys = []      # label id per context
    idmapping_dict = {}
    for i, f in enumerate(keys):
        n = 0
        for t in os.listdir(f'{args.input_dir}/{f}'):
            path = f'{args.input_dir}/{f}/{t}'
            if not os.path.isfile(path):
                continue
            with open(path, encoding='utf-8') as fn:
                if args.train_by_line:
                    # One example per line: [CLS] tokens [SEP]
                    samples = [
                        [CLS_TOKEN]
                        + enc.encode(p.strip())[:max_seq_length - 2]
                        + [SEP_TOKEN]
                        for p in fn
                    ]
                else:
                    # One example per file: [CLS] tokens [EOT] [SEP]
                    samples = [
                        [CLS_TOKEN]
                        + enc.encode(fn.read().strip())[:max_seq_length - 3]
                        + [EOT_TOKEN, SEP_TOKEN]
                    ]
            for tokens in samples:
                # Remember the unpadded length: it is needed later to build
                # an attention mask that actually excludes the padding.
                n_real = len(tokens)
                tokens.extend([0] * (max_seq_length - n_real))
                input_contexts.append(tokens)
                input_lengths.append(n_real)
                input_keys.append(i)
                n += 1
        print(f'{args.input_dir}/{f} mapped for id_{i}, read {n} contexts.')
        idmapping_dict[f] = i
    # NOTE: the data is shuffled once; every epoch reuses the same order.
    input_indexs = np.random.permutation(len(input_contexts))

    bert_config = BertConfig(**bert_config_params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = args.gpu

    with tf.Session(config=config) as sess:
        input_ids = tf.placeholder(tf.int32, [None, None])
        input_mask = tf.placeholder(tf.int32, [None, None])
        segment_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_positions = tf.placeholder(tf.int32, [None, None])
        masked_lm_ids = tf.placeholder(tf.int32, [None, None])
        masked_lm_weights = tf.placeholder(tf.float32, [None, None])
        next_sentence_labels = tf.placeholder(tf.int32, [None])

        model = BertModel(config=bert_config,
                          is_training=True,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          token_type_ids=segment_ids,
                          use_one_hot_embeddings=False)

        # The pretraining heads are built only for their variables; their
        # outputs are discarded.  Presumably this lets the default Saver
        # below match the variables stored in the pretraining checkpoint.
        (_, _, _) = get_masked_lm_output(bert_config,
                                         model.get_sequence_output(),
                                         model.get_embedding_table(),
                                         masked_lm_positions, masked_lm_ids,
                                         masked_lm_weights)
        (_, _, _) = get_next_sentence_output(bert_config,
                                             model.get_pooled_output(),
                                             next_sentence_labels)

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(args.model)
        saver.restore(sess, ckpt)
        # Snapshot the restored values: the global initializer that runs
        # later (needed for the new head and optimizer state) overwrites them.
        train_vars = tf.trainable_variables()
        restored_weights = {var.name: sess.run(var) for var in train_vars}

        labels = tf.placeholder(tf.int32, [
            None,
        ])

        output_layer = model.get_pooled_output()

        # TF1 exposes static dims as Dimension objects (.value); TF2 as ints.
        if int(tf.__version__[0]) > 1:
            hidden_size = output_layer.shape[-1]
        else:
            hidden_size = output_layer.shape[-1].value

        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

        with tf.variable_scope("loss"):
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probabilities = tf.nn.softmax(logits, axis=-1)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)

            # Mean cross-entropy over the batch.
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)

            opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
            train_vars = tf.trainable_variables()
            opt_grads = tf.gradients(loss, train_vars)
            opt_grads = list(zip(opt_grads, train_vars))
            opt_apply = opt.apply_gradients(opt_grads)
            summaries = tf.summary.scalar('loss', loss)
            summary_log = tf.summary.FileWriter(
                os.path.join(CHECKPOINT_DIR, args.run_name))

            counter = 1
            counter_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'counter')
            if os.path.exists(counter_path):
                # Load the step number if we're resuming a run
                # Add 1 so we don't immediately try to save again
                with open(counter_path, 'r') as fp:
                    counter = int(fp.read()) + 1

            hparams_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'hparams.json')
            maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
            with open(hparams_path, 'w') as fp:
                fp.write(json.dumps(bert_config_params))
            idmaps_path = os.path.join(CHECKPOINT_DIR, args.run_name,
                                       'idmaps.json')
            with open(idmaps_path, 'w') as fp:
                fp.write(json.dumps(idmapping_dict))

            sess.run(tf.global_variables_initializer())  # init output_weights
            # Put the pretrained values back, looking variables up by name.
            vars_by_name = {var.name: var for var in train_vars}
            restored = 0
            for name, value in restored_weights.items():
                if name in vars_by_name:
                    sess.run(vars_by_name[name].assign(value))
                    restored += 1
            assert restored == len(restored_weights), 'fail to restore model.'
            saver = tf.train.Saver(var_list=tf.trainable_variables())

            def save():
                """Write a checkpoint and the current step counter."""
                maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
                print(
                    'Saving',
                    os.path.join(CHECKPOINT_DIR, args.run_name,
                                 'model-{}').format(counter))
                saver.save(sess,
                           os.path.join(CHECKPOINT_DIR, args.run_name,
                                        'model'),
                           global_step=counter)
                with open(counter_path, 'w') as fp:
                    fp.write(str(counter) + '\n')

            # (decayed loss sum, decayed weight sum) for a running average.
            avg_loss = (0.0, 0.0)
            start_time = time.time()

            def sample_feature(i):
                """Build the feed dict for the i-th batch of shuffled data."""
                last = min((i + 1) * batch_size, len(input_indexs))
                batch = input_indexs[i * batch_size:last]
                _input_ids = [input_contexts[idx] for idx in batch]
                # Mask and segments use the recorded unpadded length.  The
                # stored contexts are already padded, so len(context) would
                # always be max_seq_length and padding would never be masked.
                _input_masks = [[1] * input_lengths[idx] + [0] *
                                (max_seq_length - input_lengths[idx])
                                for idx in batch]
                _segments = [[1] * input_lengths[idx] + [0] *
                             (max_seq_length - input_lengths[idx])
                             for idx in batch]
                _labels = [input_keys[idx] for idx in batch]
                return {
                    input_ids:
                    _input_ids,
                    input_mask:
                    _input_masks,
                    segment_ids:
                    _segments,
                    # The pretraining heads are not trained here; feed
                    # empty/zero dummies for their placeholders.
                    masked_lm_positions:
                    np.zeros((len(_input_ids), 0), dtype=np.int32),
                    masked_lm_ids:
                    np.zeros((len(_input_ids), 0), dtype=np.int32),
                    masked_lm_weights:
                    np.ones((len(_input_ids), 0), dtype=np.float32),
                    next_sentence_labels:
                    np.zeros((len(_input_ids), ), dtype=np.int32),
                    labels:
                    _labels
                }

            try:
                for ep in range(num_epochs):
                    if ep % args.save_every == 0:
                        save()

                    # Only full batches are used; a trailing partial batch
                    # is dropped.
                    prog = tqdm.tqdm(range(len(input_contexts) // batch_size))
                    for i in prog:
                        (_, v_loss, v_summary) = sess.run(
                            (opt_apply, loss, summaries),
                            feed_dict=sample_feature(i))

                        summary_log.add_summary(v_summary, counter)

                        avg_loss = (avg_loss[0] * 0.99 + v_loss,
                                    avg_loss[1] * 0.99 + 1.0)

                        prog.set_description(
                            '[{ep} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                            .format(ep=ep,
                                    time=time.time() - start_time,
                                    loss=v_loss,
                                    avg=avg_loss[0] / avg_loss[1]))

                        counter += 1
            except KeyboardInterrupt:
                # Fall through to the final save below; saving here as well
                # would write the same checkpoint twice.
                print('interrupted')

            save()
Example #14
0
import model
from encode_bpe import BPEEncoder_ja

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='gpt2ja-medium')
parser.add_argument('--context', type=str, required=True)
parser.add_argument('--gpu', type=str, default='0')
args = parser.parse_args()

# Decode the vocabulary files explicitly as UTF-8 instead of relying on the
# platform's locale encoding, which mis-decodes or raises on systems whose
# default is not UTF-8.
with open('ja-bpe.txt', encoding='utf-8') as f:
    bpe = f.read().split('\n')

with open('emoji.json', encoding='utf-8') as f:
    emoji = json.loads(f.read())

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

if 'small' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12
        })
elif 'medium' in args.model:
    hparams = HParams(
        **{
            "n_vocab": n_vocab,
Example #15
0
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)
    (_,_,_) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(args.model)
    saver.restore(sess, ckpt)

    with open('ja-bpe.txt', encoding='utf-8') as f:
        bpe = f.read().split('\n')

    with open('emoji.json', encoding='utf-8') as f:
        emoji = json.loads(f.read())

    enc = BPEEncoder_ja(bpe, emoji)

    if args.split_tag != '':
        contexts = args.context.split(args.split_tag)
    else:
        contexts = [args.context]
    _input_ids = []
    _input_masks = []
    _segments = []
    _mask_positions = []
    for context in contexts:
        context_tokens = [enc.encode(c)+[MASK_TOKEN] for c in context.split('[MASK]')]
        context_tokens = sum(context_tokens, [])
        if len(context_tokens) > 1:
            context_tokens = context_tokens[:-1]
        context_tokens = context_tokens[:max_seq_length-3]
Example #16
0
parser.add_argument('--gpu', default='0', help='visible gpu number.')

def maketree(path):
    """Create directory ``path`` together with any missing parents.

    An already existing directory is not an error.  Genuine failures (for
    example a regular file occupying ``path``, or missing permissions) raise
    ``OSError`` instead of being silently swallowed.
    """
    os.makedirs(path, exist_ok=True)

# Load the BPE vocabulary (one entry per line) and the emoji table, then
# build the shared encoder.
with open('ja-bpe.txt', encoding='utf-8') as bpe_file:
    bpe = bpe_file.read().split('\n')

with open('emoji.json', encoding='utf-8') as emoji_file:
    emoji = json.load(emoji_file)

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

def main():
    args = parser.parse_args()

    if os.path.isfile(args.base_model+'/hparams.json'):
        with open(args.base_model+'/hparams.json', encoding='utf-8') as f:
            params = json.loads(f.read())
            hparams = HParams(**params)
    elif 'small' in args.base_model:
        hparams = HParams(**{
          "n_vocab": n_vocab,
          "n_ctx": 1024,
          "n_embd": 768,
          "n_head": 12,
Example #17
0

def maketree(path):
    """Create directory ``path`` together with any missing parents.

    An already existing directory is not an error.  Genuine failures (for
    example a regular file occupying ``path``, or missing permissions) raise
    ``OSError`` instead of being silently swallowed.
    """
    os.makedirs(path, exist_ok=True)


# Load the BPE vocabulary (one entry per line) and the emoji table, then
# build the shared encoder.
with open('ja-bpe.txt', encoding='utf-8') as bpe_file:
    bpe = bpe_file.read().split('\n')

with open('emoji.json', encoding='utf-8') as emoji_file:
    emoji = json.load(emoji_file)

enc = BPEEncoder_ja(bpe, emoji)
n_vocab = len(enc)

# Each special token id is the first id the encoder returns for its marker.
eot_token, sep_token = (
    enc.encode(marker)[0] for marker in ('<|endoftext|>', '<|byte0|>')
)


def get_masked_lm_output(hparams, logits, positions, label_ids, label_weights):
    logits = gather_indexes(logits, positions)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(label_ids, depth=n_vocab, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])