Exemple #1
0
def test_itf():
    tokenizer = Tokenizer.from_pretrained(Config.model_name)
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    counter, itf = make_itf(train_data, Config.vocab_size, tokenizer)
    # itf = (itf - itf.min()) / (itf.max() - itf.min())
    # for i in range(itf.size(0)):
    #     print(i, itf[i])
    # itf[itf == 0] += 1e-6
    for k, v in counter.most_common(len(counter)):
        print(tokenizer.decode([k]), v)
Exemple #2
0
    seed_everything(Config.seed)
    device = torch.device(Config.device)

    start_epoch = 0

    logging.info('Define Models')
    model = build_model(Config).to(device)
    tokenizer = Tokenizer.from_pretrained(Config.model_name)

    logging.info('Define Loss and Optimizer')
    criterion = LabelSmoothing(tokenizer.vocab_size,
                               pad_id=tokenizer.pad_token_id,
                               smoothing=Config.smoothing)
    _opt = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    optimizer = get_optimizer(_opt, factor=Config.factor, warmup=Config.warmup)

    logging.info('Preparing training data')
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    dataset = DialogDataset(train_data, tokenizer)

    logging.info('Start Training')
    for epoch in range(start_epoch, Config.n_epoch):
        one_cycle(epoch, Config, model, optimizer, criterion,
                  BalancedDataLoader(dataset, tokenizer.pad_token_id),
                  tokenizer, device)
        evaluate(Config, 'もう疲れたー', tokenizer, model, device)