Example #1
                                subchar=args.subchar)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint('best_snu_{}.tar'.format(
        args.pretrained_config))

    config = BertConfig(ptr_config.config)
    model = SentenceClassifier(config,
                               num_classes=model_config.num_classes,
                               vocab=preprocessor.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    # evaluation
    filepath = getattr(data_config, args.dataset)
    ds = Corpus(filepath, preprocessor.preprocess)
    dl = DataLoader(ds, batch_size=model_config.batch_size, num_workers=4)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    summary_manager = SummaryManager(model_dir)
    summary = evaluate(model, dl, {
        'loss': nn.CrossEntropyLoss(),
        'acc': acc
    }, device)

    summary_manager.load('summary_snu_{}.json'.format(args.pretrained_config))
    summary_manager.update({'{}'.format(args.dataset): summary})
    summary_manager.save('summary_snu_{}.json'.format(args.pretrained_config))
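The evaluation snippets in this listing call an evaluate(model, data_loader, metrics, device) helper that is not shown. As a rough sketch only (the repository's actual helper, and the mini-batch layout it assumes, may differ), it could average each metric over the dataset like this:

import torch


def evaluate(model, data_loader, metrics, device):
    # Rough sketch of the assumed helper: run the model without gradients and
    # return each metric averaged over the whole dataset.
    model.eval()
    summary = {name: 0.0 for name in metrics}

    with torch.no_grad():
        for x_mb, y_mb in data_loader:
            x_mb, y_mb = x_mb.to(device), y_mb.to(device)
            score = model(x_mb)
            for name, metric_fn in metrics.items():
                summary[name] += metric_fn(score, y_mb).item() * y_mb.size(0)

    return {name: value / len(data_loader.dataset) for name, value in summary.items()}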
Example #2
def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # creating preprocessor
    tokenizer = JamoTokenizer()
    padder = PadSequence(300)

    # creating model
    model = CharCNN(num_classes=params['model'].get('num_classes'),
                    embedding_dim=params['model'].get('embedding_dim'),
                    dic=tokenizer.token2idx)

    # creating dataset, dataloader
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))

    batch_size = params['training'].get('batch_size')
    tr_ds = Corpus(tr_filepath, tokenizer, padder)
    tr_dl = DataLoader(tr_ds,
                       batch_size=batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(val_filepath, tokenizer, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(),
                     lr=params['training'].get('learning_rate'))

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')

    for epoch in tqdm(range(epochs), desc='epochs'):

        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)

            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
            epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'opt_state_dict': opt.state_dict()
    }

    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)
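Example #2 hands a PadSequence(300) callable to its Corpus. A minimal sketch of such a padder (an assumption, not the project's exact class; the default pad value 0 is also assumed) pads or truncates a list of token indices to a fixed length:

class PadSequence:
    # Sketch: pad with pad_val up to length, or truncate to length.
    def __init__(self, length, pad_val=0):
        self._length = length
        self._pad_val = pad_val

    def __call__(self, indices):
        diff = self._length - len(indices)
        if diff > 0:
            return indices + [self._pad_val] * diff
        return indices[:self._length]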
Example #3
def get_data_loaders(dataset_config, tokenizer, batch_size):
    tr_ds = Corpus(dataset_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(dataset_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
    return tr_dl, val_dl
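Every example wraps a text file in a Corpus and feeds it to a DataLoader. One plausible map-style implementation is sketched below; the tab-separated layout and the 'document' and 'label' column names are assumptions for illustration, and the real class differs across these projects.

import pandas as pd
import torch
from torch.utils.data import Dataset


class Corpus(Dataset):
    # Sketch of a map-style dataset over a labeled text file.
    def __init__(self, filepath, transform_fn):
        self._corpus = pd.read_csv(filepath, sep='\t')
        self._transform = transform_fn

    def __len__(self):
        return len(self._corpus)

    def __getitem__(self, idx):
        row = self._corpus.iloc[idx]
        tokens = torch.tensor(self._transform(row['document']))
        label = torch.tensor(row['label'])
        return tokens, label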
Example #4
def main():
    train_path = Path.cwd() / 'data_in' / 'train.txt'
    val_path = Path.cwd() / 'data_in' / 'val.txt'
    vocab_path = Path.cwd() / 'data_in' / 'vocab.pkl'

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    tokenizer = MeCab()
    padder = PadSequence(length=70, pad_val=vocab.token_to_idx['<pad>'])
    tr_ds = Corpus(train_path, vocab, tokenizer, padder)
    tr_dl = DataLoader(tr_ds, batch_size=1024, shuffle=True, num_workers=1, drop_last=True)
    val_ds = Corpus(val_path, vocab, tokenizer, padder)
    val_dl = DataLoader(val_ds, batch_size=1024)

    model = Net(vocab_len=len(vocab))

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(1):
        model.train()
        index = 0
        acc = 0
        for label, sen1, sen2 in tqdm(tr_dl, disable=True):
            optimizer.zero_grad()

            pre_label = model(sen1, sen2)

            loss = loss_fn(pre_label, label)
            loss.backward()
            optimizer.step()

            pred_cls = pre_label.data.max(1)[1]
            acc += pred_cls.eq(label.data).cpu().sum()

            print("epoch: {}, index: {}, loss: {}".format((epoch + 1), index, loss.item()))
            index += len(label)

        print('Accuracy : %d %%' % (
                100 * acc / index))

    # validation: evaluation mode, no gradient updates
    model.eval()
    index = 0
    acc = 0
    with torch.no_grad():
        for label, sen1, sen2 in tqdm(val_dl, disable=True):
            pre_label = model(sen1, sen2)
            loss = loss_fn(pre_label, label)

            pred_cls = pre_label.data.max(1)[1]
            acc += pred_cls.eq(label.data).cpu().sum()

            print("index: {}, val_loss: {}".format(index, loss.item()))
            index += len(label)

    print('Validation accuracy : %d %%' % (100 * acc / index))
Example #5
    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar')
    model = ConvRec(num_classes=model_config.num_classes,
                    embedding_dim=model_config.embedding_dim,
                    hidden_dim=model_config.hidden_dim,
                    vocab=tokenizer.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    # evaluation
    model.eval()
    summary_manager = SummaryManager(model_dir)
    filepath = getattr(data_config, args.data_name)
    ds = Corpus(filepath,
                tokenizer.split_and_transform,
                min_length=model_config.min_length,
                pad_val=tokenizer.vocab.to_indices(' '))
    dl = DataLoader(ds,
                    batch_size=model_config.batch_size,
                    num_workers=4,
                    collate_fn=batchify)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    summary = evaluate(model, dl, {
        'loss': nn.CrossEntropyLoss(),
        'acc': acc
    }, device)
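The convolutional-recurrent example above passes collate_fn=batchify so that variable-length token sequences can be batched together. A possible sketch (the project's own batchify may pad differently; the default padding value 0 is an assumption):

import torch
from torch.nn.utils.rnn import pad_sequence


def batchify(batch, pad_val=0):
    # Sketch of a collate function: pad token sequences in the mini-batch
    # to a common length and stack the labels.
    tokens, labels = zip(*batch)
    tokens = pad_sequence(tokens, batch_first=True, padding_value=pad_val)
    labels = torch.stack(labels)
    return tokens, labels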
Example #6
    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint("best.tar")
    model = SAN(
        num_classes=model_config.num_classes,
        lstm_hidden_dim=model_config.lstm_hidden_dim,
        hidden_dim=model_config.hidden_dim,
        da=model_config.da,
        r=model_config.r,
        vocab=tokenizer.vocab,
    )
    model.load_state_dict(checkpoint["model_state_dict"])

    # evaluation
    filepath = getattr(data_config, args.dataset)
    ds = Corpus(filepath, tokenizer.split_and_transform)
    dl = DataLoader(
        ds, batch_size=model_config.batch_size, num_workers=4, collate_fn=batchify
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    summary_manager = SummaryManager(model_dir)
    summary = evaluate(model, dl, {"loss": nn.CrossEntropyLoss(), "acc": acc}, device)

    summary_manager.load("summary.json")
    summary_manager.update({"{}".format(args.dataset): summary})
    summary_manager.save("summary.json")

    print("loss: {:.3f}, acc: {:.2%}".format(summary["loss"], summary["acc"]))
Example #7
    def validation_dataloader(self):
        val_ds = Corpus(self.hparams.validation,
                        self.tokenizer.split_and_transform)
        return DataLoader(val_ds,
                          batch_size=self.hparams.batch_size,
                          num_workers=4)
Example #8
def main():
    train_path = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_path = Path.cwd() / '..' / 'data_in' / 'val.txt'
    vocab_path = Path.cwd() / '..' / 'data_in' / 'vocab.pkl'

    length = 70
    dim = 300
    batch_size = 1024
    learning_rate = 0.01
    epochs = 10
    hidden = 50

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_path)).shuffle(
        buffer_size=batch_size).batch(batch_size=batch_size,
                                      drop_remainder=True)
    eval = tf.data.TextLineDataset(str(val_path)).batch(batch_size=batch_size,
                                                        drop_remainder=True)

    tokenizer = MeCab()
    corpus = Corpus(vocab, tokenizer)
    malstm = MaLSTM(length, dim, len(vocab))

    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    # loss and accuracy metrics
    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')

    for epoch in range(epochs):
        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()
        tf.keras.backend.set_learning_phase(1)

        for step, val in tqdm(enumerate(train)):
            sen1, sen2, label = corpus.token2idx(val)
            with tf.GradientTape() as tape:
                logits = malstm(sen1, sen2)
                train_loss = loss_fn(label, logits)

            grads = tape.gradient(target=train_loss,
                                  sources=malstm.trainable_variables)
            opt.apply_gradients(
                grads_and_vars=zip(grads, malstm.trainable_variables))

            train_loss_metric.update_state(train_loss)
            train_acc_metric.update_state(label, logits)

        tr_loss = train_loss_metric.result()

        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}'.format(
            epoch + 1,
            train_acc_metric.result() * 100, tr_loss))
Example #9
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=ptr_tokenizer.tokenize,
                          pad_fn=pad_sequence)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar')
    config = BertConfig('pretrained/bert_config.json')
    model = BertClassifier(config,
                           num_labels=model_config.num_classes,
                           vocab=tokenizer.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    # evaluation
    filepath = getattr(data_config, args.data_name)
    ds = Corpus(filepath, tokenizer.preprocess)
    dl = DataLoader(ds, batch_size=model_config.batch_size, num_workers=4)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    summary_manager = SummaryManager(model_dir)
    summary = evaluate(model, dl, {
        'loss': nn.CrossEntropyLoss(),
        'acc': acc
    }, device)

    summary_manager.load('summary.json')
    summary_manager.update({'{}'.format(args.data_name): summary})
    summary_manager.save('summary.json')
Example #10
def main(argv):
    train_data = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_data = Path.cwd() / '..' / 'data_in' / 'val.txt'
    test_data = Path.cwd() / '..' / 'data_in' / 'test.txt'
    dev_data = Path.cwd() / '..' / 'data_in' / 'dev.txt'
    # init params
    classes = FLAGS.classes
    max_length = FLAGS.length
    epochs = FLAGS.epochs
    learning_rate = FLAGS.learning_rate
    dim = FLAGS.embedding_dim
    global_step = 1000
    batch_size = FLAGS.batch_size

    with open(Path.cwd() / '..' / 'data_in' / 'vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_data)).shuffle(
        buffer_size=batch_size).batch(batch_size=batch_size)
    eval = tf.data.TextLineDataset(str(val_data)).batch(batch_size=batch_size)
    test = tf.data.TextLineDataset(str(test_data)).batch(batch_size=batch_size)
    dev = tf.data.TextLineDataset(str(dev_data)).batch(batch_size=batch_size)

    padder = PadSequence(max_length,
                         pad_val=vocab.to_indices(vocab.padding_token))
    processing = Corpus(vocab=vocab, split_fn=Split(), pad_fn=padder)

    # create model
    char_cnn = CharCNN(vocab=vocab, classes=classes, dim=dim)

    # create optimizer & loss_fn
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')

    # train_summary_writer = tf.summary.create_file_writer('./data_out/summaries/train')
    # eval_summary_writer = tf.summary.create_file_writer('./data_out/summaries/eval')

    # ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=char_cnn)
    # manager = tf.train.CheckpointManager(ckpt, './data_out/tf_ckpts', max_to_keep=3)
    # ckpt.restore(manager.latest_checkpoint)
    #
    # if manager.latest_checkpoint:
    #     print("Restored from {}".format(manager.latest_checkpoint))
    # else:
    #     print("Initializing from scratch.")

    #training
    for epoch in tqdm(range(epochs), desc='epochs'):

        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()
        tf.keras.backend.set_learning_phase(1)

        #with train_summary_writer.as_default():
        for step, val in tqdm(enumerate(train), desc='steps'):
            data, label = processing.token2idex(val)
            with tf.GradientTape() as tape:
                logits = char_cnn(data)
                train_loss = loss_fn(label, logits)

            #ckpt.step.assign_add(1)
            grads = tape.gradient(target=train_loss,
                                  sources=char_cnn.trainable_variables)
            opt.apply_gradients(
                grads_and_vars=zip(grads, char_cnn.trainable_variables))

            train_loss_metric.update_state(train_loss)
            train_acc_metric.update_state(label, logits)

            # if tf.equal(opt.iterations % global_step, 0):
            #     tf.summary.scalar('loss', train_loss_metric.result(), step=opt.iterations)

        tr_loss = train_loss_metric.result()

        #save_path = manager.save()
        #print(save_path)
        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}'.format(
            epoch + 1,
            train_acc_metric.result() * 100, tr_loss))
Example #11
        label_vocab = pickle.load(io)
    token_tokenizer = Tokenizer(token_vocab, split_to_self)
    label_tokenizer = Tokenizer(label_vocab, split_to_self)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + ".tar")
    model = BilstmCRF(label_vocab, token_vocab, model_config.lstm_hidden_dim)
    model.load_state_dict(checkpoint["model_state_dict"])

    # evaluation
    summary_manager = SummaryManager(model_dir)
    filepath = getattr(data_config, args.data_name)
    ds = Corpus(
        filepath,
        token_tokenizer.split_and_transform,
        label_tokenizer.split_and_transform,
    )
    dl = DataLoader(ds,
                    batch_size=model_config.batch_size,
                    num_workers=4,
                    collate_fn=batchify)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    f1_score = get_f1_score(model, dl, device)
    summary_manager.load("summary.json")
    summary_manager._summary[args.data_name].update({"f1": f1_score})
    summary_manager.save("summary.json")
Example #12
# loading trained model
save_path = params['filepath'].get('ckpt')
# save_path = 'tokenize.pth'
ckpt = torch.load(save_path)
config = BertConfig('bert/bert_config.json')
model = BertTagger(config=config, num_labels=len(label_vocab.token_to_idx), vocab=token_vocab)
model.load_state_dict(ckpt['model_state_dict'])

# loading datasets
batch_size = params['training'].get('batch_size')
train_path = params['filepath'].get('train')
val_path = params['filepath'].get('val')
test_path = params['filepath'].get('test')

train_data = Corpus(train_path, token_vocab.to_indices, label_vocab.to_indices)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, num_workers=16,
                          drop_last=True, collate_fn=batchify)
val_data = Corpus(val_path, token_vocab.to_indices, label_vocab.to_indices)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=16,
                        drop_last=True, collate_fn=batchify)
test_data = Corpus(test_path, token_vocab.to_indices, label_vocab.to_indices)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=16,
                         drop_last=True, collate_fn=batchify)


# using gpu
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Example #13
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab,
                                split_fn=ptr_tokenizer.tokenize,
                                pad_fn=pad_sequence)

    # model
    config = BertConfig('pretrained/bert_config.json')
    model = PairwiseClassifier(config,
                               num_classes=model_config.num_classes,
                               vocab=preprocessor.vocab)
    bert_pretrained = torch.load('pretrained/pytorch_model.bin')
    model.load_state_dict(bert_pretrained, strict=False)

    # training
    tr_ds = Corpus(data_config.tr, preprocessor.preprocess)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(data_config.val, preprocessor.preprocess)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam([
        {
            "params": model.bert.parameters(),
            "lr": model_config.learning_rate / 100
        },
        {
Example #14
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo)

    # model
    model = ConvRec(num_classes=model_config.num_classes,
                    embedding_dim=model_config.embedding_dim,
                    hidden_dim=model_config.hidden_dim,
                    vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train,
                   tokenizer.split_and_transform,
                   min_length=model_config.min_length,
                   pad_val=tokenizer.vocab.to_indices(' '))
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       collate_fn=batchify,
                       drop_last=True)
    val_ds = Corpus(data_config.validation,
                    tokenizer.split_and_transform,
                    min_length=model_config.min_length,
                    pad_val=tokenizer.vocab.to_indices(' '))
    val_dl = DataLoader(val_ds,
                        batch_size=model_config.batch_size,
                        collate_fn=batchify)
Example #15
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes,
                lstm_hidden_dim=lstm_hidden_dim,
                hidden_dim=hidden_dim,
                da=da,
                r=r,
                vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds,
                       batch_size=batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True,
                       collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds,
                        batch_size=batch_size,
                        num_workers=4,
                        collate_fn=batchify)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            queries_a_mb, queries_b_mb, y_mb = map(lambda elm: elm.to(device),
                                                   mb)
            queries_mb = (queries_a_mb, queries_b_mb)

            opt.zero_grad()
            score, queries_a_attn_mat, queries_b_attn_mat = model(queries_mb)
            a_reg = regularize(queries_a_attn_mat, r, device)
            b_reg = regularize(queries_b_attn_mat, r, device)
            mb_loss = loss_fn(score, y_mb)
            mb_loss.add_(a_reg)
            mb_loss.add_(b_reg)
            mb_loss.backward()
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {
                    'train': tr_loss / (step + 1),
                    'validation': val_loss
                },
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
            epoch + 1, tr_loss, val_loss))

    ckpt = {
        'model_state_dict': model.state_dict(),
        'opt_state_dict': opt.state_dict()
    }

    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
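Example #15 adds a_reg and b_reg penalties produced by a regularize helper that is not shown. Assuming it follows the penalization term ||A·Aᵀ − I||_F² from "A Structured Self-Attentive Sentence Embedding" (which the SAN model and the r hyperparameter suggest), a sketch could look like this:

import torch


def regularize(attn_mat, r, device):
    # Sketch: Frobenius-norm penalty ||A @ A^T - I||_F^2 on an attention
    # matrix of shape (batch, r, seq_len), averaged over the batch.
    identity = torch.eye(r, device=device)
    penalty = torch.bmm(attn_mat, attn_mat.transpose(1, 2)) - identity
    return penalty.norm(dim=(1, 2)).pow(2).mean()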
Example #16
    preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

    # Load Model
    config_filepath = ptr_dir / "{}-config.json".format(args.type)
    config = BertConfig.from_pretrained(config_filepath,
                                        output_hidden_states=False)
    model = BIIN(config,
                 vocab,
                 model_config.hidden_size,
                 enc_num_layers=len(model_config.hidden_size))

    # Data Loader
    tr_ds = Corpus(data_config.tr_path,
                   preprocessor.preprocess,
                   sep='\t',
                   doc_col='question1',
                   label_col='is_duplicate',
                   is_pair=True,
                   doc_col_second='question2')
    val_ds = Corpus(data_config.dev_path,
                    preprocessor.preprocess,
                    sep='\t',
                    doc_col='question1',
                    label_col='is_duplicate',
                    is_pair=True,
                    doc_col_second='question2')
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
Example #17
                                   pad_val=vocab.to_indices(
                                       vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab,
                                    split_fn=ptr_tokenizer,
                                    pad_fn=pad_sequence)

    # model
    config = BertConfig(ptr_config.config)
    model = PairwiseClassifier(config,
                               num_classes=model_config.num_classes,
                               vocab=preprocessor.vocab)
    # bert_pretrained = torch.load(ptr_config.bert)
    # model.load_state_dict(bert_pretrained, strict=False)

    # training
    tr_ds = Corpus(data_config.train, preprocessor.preprocess)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(data_config.validation, preprocessor.preprocess)
    val_dl = DataLoader(val_ds,
                        batch_size=model_config.batch_size,
                        num_workers=4)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam([
        {
            "params": model.bert.parameters(),
            "lr": model_config.learning_rate / 100
Example #18
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x_mb, y_mb = map(lambda elm: elm.to(device), mb)

            opt.zero_grad()
            mb_loss = loss_fn(model(x_mb), y_mb)
            mb_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1),
                                            'val': val_loss}, epoch * len(tr_dl) + step)

                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict()}

    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
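Examples #15 and #18 call evaluate(model, val_dl, loss_fn, device) during training, a different signature from the metrics-dict version sketched earlier. A minimal sketch of this variant, assuming each mini-batch is an (inputs, targets) pair as in Example #18:

import torch


def evaluate(model, data_loader, loss_fn, device):
    # Sketch: average validation loss per mini-batch, computed without gradients.
    model.eval()
    avg_loss = 0
    with torch.no_grad():
        for x_mb, y_mb in data_loader:
            x_mb, y_mb = x_mb.to(device), y_mb.to(device)
            avg_loss += loss_fn(model(x_mb), y_mb).item()
    return avg_loss / len(data_loader)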
Example #19
    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=split_to_jamo,
                          pad_fn=pad_sequence)

    # model
    model = CharCNN(num_classes=model_config.num_classes,
                    embedding_dim=model_config.embedding_dim,
                    vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds,
                        batch_size=model_config.batch_size,
                        num_workers=4)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
Example #20
def main(argv):
    train_data = Path.cwd() / 'data_in' / 'train.txt'
    val_data = Path.cwd() / 'data_in' / 'val.txt'

    with open(Path.cwd() / 'data_in' / 'vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_data)).shuffle(buffer_size=1000).batch(batch_size=FLAGS.batch_size,
                                                                                     drop_remainder=True)
    eval = tf.data.TextLineDataset(str(val_data)).batch(batch_size=FLAGS.batch_size, drop_remainder=True)

    tokenized = MeCab()
    processing = Corpus(vocab=vocab, tokenizer=tokenized)

    # init params
    classes = FLAGS.classes
    max_length = FLAGS.length
    epochs = FLAGS.epochs
    learning_rate = FLAGS.learning_rate
    global_step = 1000

    # create model
    sen_cnn = SenCNN(vocab=vocab, classes=classes)

    # create optimizer & loss_fn
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

    train_summary_writer = tf.summary.create_file_writer('./data_out/summaries/train')
    eval_summary_writer = tf.summary.create_file_writer('./data_out/summaries/eval')

    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=sen_cnn)
    manager = tf.train.CheckpointManager(ckpt, './data_out/tf_ckpts', max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)

    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):

        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()
        tf.keras.backend.set_learning_phase(1)

        tr_loss = 0
        with train_summary_writer.as_default():
            for step, val in tqdm(enumerate(train), desc='steps'):
                data, label = processing.token2idex(val)
                with tf.GradientTape() as tape:
                    logits = sen_cnn(data)
                    train_loss = loss_fn(label, logits)
                ckpt.step.assign_add(1)
                grads = tape.gradient(target=train_loss, sources=sen_cnn.trainable_variables)
                opt.apply_gradients(grads_and_vars=zip(grads, sen_cnn.trainable_variables))
                # tr_loss += pred_loss.numpy()

                train_loss_metric.update_state(train_loss)
                train_acc_metric.update_state(label, logits)

                if tf.equal(opt.iterations % global_step, 0):
                    tf.summary.scalar('loss', train_loss_metric.result(), step=opt.iterations)

        # else:
        # tr_loss /= (step + 1)
        # print("t_loss {}".format(tr_loss))
        tr_loss = train_loss_metric.result()
        save_path = manager.save()
        print(save_path)

        tf.keras.backend.set_learning_phase(0)

        val_loss = 0
        with eval_summary_writer.as_default():
            for step, val in tqdm(enumerate(eval), desc='steps'):
                data, label = processing.token2idex(val)
                logits = sen_cnn(data)
                val_loss = loss_fn(label, logits)
                # val_loss += mb_loss.numpy()
                val_loss_metric.update_state(val_loss)
                val_acc_metric.update_state(label, logits)
                tf.summary.scalar('loss', val_loss_metric.result(), step=step)

        val_loss = val_loss_metric.result()

        tqdm.write(
            'epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}, val_acc : {:.3f}%, val_loss : {:.3f}'.format(epoch + 1,
                                                                                                          train_acc_metric.result() * 100,
                                                                                                          tr_loss,
                                                                                                          val_acc_metric.result() * 100,
                                                                                                          val_loss))