Example no. 1
def chat(kogptqa, sent='0'):
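    # U_TKN, S_TKN, SENT and EOS are assumed to be the chatbot's special tokens
    # (user / system / sentiment markers and end-of-sequence), defined as
    # module-level constants elsewhere in the original script.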
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            while 1:
                input_ids = torch.LongTensor([
                    vocab[U_TKN]] + vocab[q_tok] +
                    vocab[EOS, SENT] + vocab[sent_tokens] +
                    vocab[EOS, S_TKN] +
                    vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                gen = vocab.to_tokens(
                    torch.argmax(
                        pred,
                        dim=-1).squeeze().numpy().tolist())[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
Example no. 2
def Tokenizer(item):
    # `item` is expected to be array-like with .tolist() (e.g. a numpy array / pandas Series)
    item = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    # `max_seqlen` is assumed to be a module-level constant: the padded length
    # of every encoded row.
    out = torch.zeros((0, max_seqlen), dtype=torch.long)

    for i in item:

        toked = tok(i)
        input_ids = torch.tensor([
            vocab[vocab.bos_token],
        ] + vocab[toked]).unsqueeze(0)
        size = input_ids.shape
        # print(input_ids)
        # print(input_ids.shape)
        # right-pad the row with zeros up to max_seqlen and append it to `out`
        y = torch.cat(
            [input_ids,
             torch.zeros(1, max_seqlen - size[1], dtype=torch.long)], dim=1)
        out = torch.cat([out, y], dim=0)

        print(out.shape)

    x_np = out.numpy()
    x_df = pd.DataFrame(x_np)
    x_df.to_csv('./data/encoded.csv', mode='w')
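
# Hypothetical usage (assumption): `item` is a numpy array or pandas Series of
# raw sentences, e.g.
#   Tokenizer(df['sentence'])   # df: a pandas DataFrame loaded elsewhere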
Example no. 3
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
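Example no. 4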
    def __init__(self, args, **kwargs):
        super(KoGPT2Chat, self).__init__()
        self.hparams = args
        # TK TODO
        self.hparams.max_len = 1024

        self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
        self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
Example no. 5
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model("cuda")
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
     self.max_gpu_load_train = 0
     self.max_memory_used_train = 0.0
Example no. 6
 def __init__(self, max_len=32, batch_size=64, lr=5e-5, num_epochs=1):
     super(KoGPT2Chat, self).__init__()
     self.batch_size = batch_size
     self.lr = lr
     self.max_len = max_len
     self.tok_path = get_tokenizer()
     self.num_epochs = num_epochs
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
Example no. 7
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams  # the parsed args go into hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model(
     )  # returned as two objects: the model and the vocabulary
     self.loss_function = torch.nn.CrossEntropyLoss(
         reduction='none'
     )  # the loss is CrossEntropyLoss: classification between the label (ground truth) and GPT-2's output (one-hot encoded)
Example no. 8
    def __init__(self):

        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'
        self.tok=Mecab()
        _, self.vocab = get_pytorch_kogpt2_model()
        
        self.tok_path = get_tokenizer()
        self.tok2 = SentencepieceTokenizer(self.tok_path,  num_best=0, alpha=0)
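Example no. 9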
    def __init__(self, examples):
        super(Dataset, self).__init__()
        self.examples = examples
        self.training = False

        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'
        self.tok = Mecab()
        _, self.vocab = get_pytorch_kogpt2_model()
Example no. 10
        count += 1
    randomNum = random.randint(0, count)
    return vocab.to_tokens(idx[randomNum])


def top_k(lists, vocab, k):
    item, idx = torch.sort(lists, descending=True)
    randomNum = random.randint(0, k - 1)  # randint is inclusive at both ends
    idx = idx.tolist()
    return vocab.to_tokens(idx[randomNum])
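
# Hypothetical usage (assumption): given a 1-D tensor of next-token logits,
# pick one of the k highest-scoring tokens at random, e.g.
#   next_token = top_k(logits[0, -1], vocab, k=5)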


device = torch.device('cpu')
ck = 'mediumcheck.tar'
tok_path = get_tokenizer()
model, vocab = get_pytorch_kogpt2_model(ctx='cpu')
checkpoint = torch.load(ck, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
tok = SentencepieceTokenizer(tok_path)
print(vocab[vocab.bos_token])
print(vocab.bos_token)
sent_p = '세계 200여 개 기업이 '
sent_k = '세계 200여 개 기업이 '
sent_argmax = '세계 200여 개 기업이 '

toked_p = tok(sent_p)
toked_k = tok(sent_k)
toked_argmax = tok(sent_argmax)
sent_cnt = 0
input_ids_p = torch.tensor([
    vocab[vocab.bos_token],
Example no. 11
def main(args):
    # toker = GPT2Tokenizer.from_pretrained('gpt2')
    tok_path = get_tokenizer()
    toker = SentencepieceTokenizer(tok_path)
    _, vocab = get_pytorch_kogpt2_model()
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
                   f'{".".join(attrs)}.db/db')
    else:
        db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with shelve.open(db_path, 'n') as db:
        # reader = open(args.corpus, "r", encoding="utf-8")
        reader = pd.read_csv(args.corpus, sep='\t', header=None)
        chunk = []
        n_chunk = 0
        n_example = 0

        # print("pdb-attach")
        # from pdb_clone import pdb
        # rsock = pdb.set_trace_remote()
        #
        # if rsock.state != rsock.ST_CONNECTED:
        #   input()

        for _, line in tqdm(reader.iterrows(), total=len(reader.index)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker, vocab)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs, toker,
                                          vocab, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {
        'n_example': n_example,
        'chunk_size': args.chunk_size,
        'max_seq_len': args.max_seq_len,
        'reverse': args.reverse,
        'two_turn': args.two_turn
    }
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
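
    # Hypothetical read-back sketch (assumption, not part of the original flow):
    # each chunk can be recovered by reversing the gzip + JSON encoding used above:
    #   with shelve.open(db_path, 'r') as db:
    #       chunk0 = json.loads(gzip.decompress(db['chunk_0']).decode('utf-8'))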
Example no. 12
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
Example no. 13
def main(args):
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    batch_size = args.batch_size
    epochs = args.n_epochs
    learning_rate = 3e-5
    warmup_steps = 2000
    max_seq_len = 1024

    print("Dataset Loading... ", end=" ")
    dataset = synoDataset("./data/korean_naver_2.csv", vocab, tok)
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False)
    print("[[[Done]]]")

    model = model.to(device)
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=-1)
    proc_seq_count = 0
    sum_loss = 0.0
    batch_count = 0
    model.zero_grad()

    models_folder = "trained_models"
    if not os.path.exists(models_folder):
        os.mkdir(models_folder)

    for epoch in range(epochs):
        print(f"Epoch {epoch} started" + "=" * 30)

        for idx, syno in enumerate(data_loader):
            # """  max 시퀀스가 넘으면 슬라이싱 """
            if len(syno) > max_seq_len:
                syno = syno[:max_seq_len]

            syno_tensor = torch.tensor(syno).unsqueeze(0).to(device)

            outputs = model(syno_tensor, labels=syno_tensor)
            loss, logits = outputs[:2]
            loss.backward()
            sum_loss = sum_loss + loss.detach().data

            proc_seq_count = proc_seq_count + 1
            if proc_seq_count == batch_size:
                proc_seq_count = 0
                batch_count += 1
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            if batch_count == args.print_every:
                print(
                    f"average loss over the last {args.print_every} batches: "
                    f"{sum_loss / args.print_every}"
                )
                batch_count = 0
                sum_loss = 0.0

        # Save a checkpoint every args.save_every epochs so the saved models can be compared later
        if epoch % args.save_every == 0:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir, f"gpt2_genre_pad_{epoch}.pt"),
            )
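Example no. 14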
 def __init__(self, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
     self._tok_path = get_tokenizer()
     self.previous_context = [[]]
Example no. 15
import os
import torch
import platform
import sentencepiece
from kogpt2.utils import get_tokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from flask import Flask, request, jsonify, __version__ as flaskver

tok_path = get_tokenizer(cachedir='./bin/')
model, vocab = get_pytorch_kogpt2_model(cachedir='./bin/')
tok = sentencepiece.SentencePieceProcessor(tok_path)

app = Flask(__name__)
port = int(os.getenv('port', '8080'))


@app.route('/', methods=['GET'])
def root():
    env = {
        'python': platform.python_version(),
        'flask': flaskver,
        'pytorch': torch.__version__
    }
    urls = {
        'original': 'https://github.com/SKT-AI/KoGPT2',
        'fork': 'https://github.com/pmh-only/KoGPT2'
    }
    usage = 'GET /job?query=<sentence>[&loop=<loopLimit>]'
    return jsonify(label='kogpt2', urls=urls, env=env, usage=usage)
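
# Hypothetical client-side usage of the documented /job endpoint (assumes this
# app is running locally on the port configured above):
#   import requests
#   r = requests.get('http://localhost:8080/job',
#                    params={'query': '안녕하세요', 'loop': 5})
#   print(r.text)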

Example no. 16
from kogpt2.model.torch_gpt2 import GPT2LMHeadModel
from kogpt2.configuration_gpt2 import GPT2Config

kogpt2_config = {
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50000,
}
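
# A minimal sketch (assumption): the dict above can be turned into a config
# object and an untrained model, mirroring what get_pytorch_kogpt2_model() does
# internally:
#   config = GPT2Config.from_dict(kogpt2_config)
#   untrained_model = GPT2LMHeadModel(config=config)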

tok_path = get_tokenizer()
model, vocab = get_pytorch_kogpt2_model()
tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda:2")
torch.cuda.device("cuda:2")
print(device)
org_path = "trained_models/gpt2_j20_1007.pt"
load_path = "trained_models/gpt2_genre_pad_50.pt"

checkpoint = torch.load(load_path, map_location=device)
# 1013: the state-dict keys changed after training with special tokens, so this remapping is needed
checkpoint_org = torch.load(org_path, map_location=device)

ckpt_final = {
    k: v
Example no. 17
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--use_adapter",
                        default=False,
                        action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="",
                        help="add, attention, ")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="bertGpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--bert_model_path",
                        default="./",
                        type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file",
        default="./vocab.korean.rawtext.list",
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    #tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    #tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_kogpt2_model(
        keyword_module=args.keyword_module, use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)

    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    #if args.fp16:
    #from apex import amp  # Apex is only required if we use fp16 training
    #model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, bert_tokenizer, gpt_tokenizer, gpt_vocab)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels, keyword_scores = batch

        #(lm_loss), *_ = model(input_ids, token_type_ids=token_type_ids, labels=lm_labels)
        (lm_loss), *_ = model(source_ids,
                              target_ids,
                              key_score=keyword_scores,
                              lm_labels=lm_labels)
        loss = lm_loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels, keyword_scores = batch

            #lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids,)
            lm_logits, *_ = model(source_ids,
                                  target_ids,
                                  key_score=keyword_scores)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted), (lm_labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.dataset_path,
                              args.use_adapter, args.keyword_module)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=2)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': model
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        #tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
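

# Hypothetical entry point (assumption): run training when invoked as a script.
if __name__ == "__main__":
    train()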