Example #1
def transform(raw_data, params):
    # Define the data transformation interface
    # raw_data --> batch_data

    for data in raw_data:
        pass

    num_buckets = params.num_buckets
    batch_size = params.batch_size

    responses = raw_data

    batch_idxes = FixedBucketSampler([len(rs) for rs in responses],
                                     batch_size,
                                     num_buckets=num_buckets)
    batch = []

    def index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_rs = []
        batch_pick_index = []
        batch_labels = []
        for idx in batch_idx:
            batch_rs.append([index(r) for r in responses[idx]])
            if len(responses[idx]) <= 1:
                pick_index, labels = [], []
            else:
                pick_index, labels = zip(*[(r[0], 0 if r[1] <= 0 else 1)
                                           for r in responses[idx][1:]])
            batch_pick_index.append(list(pick_index))
            batch_labels.append(list(labels))

        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(rs) for rs in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels))
                                         for labels in batch_labels])
        batch_pick_index = [
            padder(pick_index) for pick_index in batch_pick_index
        ]
        batch.append([
            mx.nd.array(batch_rs),
            mx.nd.array(data_mask),
            mx.nd.array(batch_labels),
            mx.nd.array(batch_pick_index),
            mx.nd.array(label_mask)
        ])

    return batch
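The index helper above interleaves correctness into the item id: a response r = (item_id, score) maps to item_id * 2 when wrong (score <= 0) and item_id * 2 + 1 when right. A minimal sketch with made-up response tuples (the (item_id, score) layout is inferred from how r[0] and r[1] are used above):

def index(r):
    correct = 0 if r[1] <= 0 else 1
    return r[0] * 2 + correct

responses = [(3, 1), (3, 0), (7, 1)]   # toy (item_id, score) pairs
print([index(r) for r in responses])   # [7, 6, 15]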
Example #2
def transform(raw_data, params):
    # Define the data transformation interface
    # raw_data --> batch_data

    num_buckets = params.num_buckets
    batch_size = params.batch_size

    responses = raw_data

    # Samples of different lengths are always assigned to the same fixed bucket
    batch_idxes = FixedBucketSampler([len(rs) for rs in responses], batch_size, num_buckets=num_buckets)
    batch = []

    def index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_rs = []
        batch_pick_index = []
        batch_labels = []
        for idx in batch_idx:
            # 1. Correct and incorrect answers are stored under different ids
            # 2. batch_pick_index[i] and batch_labels[i] are one element shorter than batch_rs[i]
            batch_rs.append([index(r) for r in responses[idx]])
            if len(responses[idx]) <= 1:
                pick_index, labels = [], []
            else:
                pick_index, labels = zip(*[(r[0], 0 if r[1] <= 0 else 1) for r in responses[idx][1:]])
            batch_pick_index.append(list(pick_index))
            batch_labels.append(list(labels))
        # max_len differs from bucket to bucket
        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        # batch_rs: sequences padded to max_len
        # data_mask: the number of valid entries in each sequence
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(rs) for rs in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels)) for labels in batch_labels])
        batch_pick_index = [padder(pick_index) for pick_index in batch_pick_index]
        # Load
        # All sequences are padded to a fixed length
        # data_mask / label_mask record the valid lengths
        # len(batch_labels[i]) + 1 == len(batch_rs[i])
        batch.append(
            [torch.tensor(batch_rs), torch.tensor(data_mask), torch.tensor(batch_labels),
             torch.tensor(batch_pick_index),
             torch.tensor(label_mask)])

    return batch
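The padding block above pairs each padded sequence with its original length so the model can mask out the pad positions. A small sketch of that pattern, assuming gluonnlp.data.PadSequence (pads with pad_val up to the given length):

from gluonnlp.data import PadSequence

padder = PadSequence(5, pad_val=0)
seqs = [[4, 9, 11], [2, 5, 7, 8, 1]]
padded, mask = zip(*[(padder(s), len(s)) for s in seqs])
print(padded)  # ([4, 9, 11, 0, 0], [2, 5, 7, 8, 1])
print(mask)    # (3, 5)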
Example #3
File: etl.py Project: tswsxk/XKT
def transform(raw_data, params):
    # Define the data transformation interface
    # raw_data --> batch_data

    num_buckets = params.num_buckets
    batch_size = params.batch_size

    responses = raw_data

    batch_idxes = FixedBucketSampler([len(rs) for rs in responses],
                                     batch_size,
                                     num_buckets=num_buckets)
    batch = []

    def response_index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    def question_index(r):
        return r[0]

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_qs = []
        batch_rs = []
        batch_labels = []
        for idx in batch_idx:
            batch_qs.append([question_index(r) for r in responses[idx]])
            batch_rs.append([response_index(r) for r in responses[idx]])
            labels = [0 if r[1] <= 0 else 1 for r in responses[idx][:]]
            batch_labels.append(list(labels))

        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        batch_qs, _ = zip(*[(padder(qs), len(qs)) for qs in batch_qs])
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(rs) for rs in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels))
                                         for labels in batch_labels])
        batch.append([
            mx.nd.array(batch_qs, dtype="float32"),
            mx.nd.array(batch_rs, dtype="float32"),
            mx.nd.array(data_mask),
            mx.nd.array(batch_labels),
            mx.nd.array(label_mask)
        ])

    return batch
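FixedBucketSampler is what keeps padding cheap in these transform functions: it groups sample indices into num_buckets buckets by sequence length, so each yielded batch only pads to that bucket's max_len. A quick iteration sketch with toy lengths, assuming gluonnlp.data.FixedBucketSampler as used above:

from gluonnlp.data import FixedBucketSampler

lengths = [3, 12, 4, 11, 2, 10]
sampler = FixedBucketSampler(lengths, batch_size=2, num_buckets=2)
for batch_idx in sampler:
    print(batch_idx)  # a list of indices whose sequences have similar lengths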
Example #4
def load_data(params):     
    tokenizer = JamoTokenizer()
    padder = PadSequence(length=params.get('pad_length'))
    batch_size = params.get('batch_size')
    test_path = params.get('test')
    test_ds = NaverMovieCorpus(test_path, tokenizer, padder)
    test_dl = DataLoader(test_ds, batch_size*2, drop_last=False)    
    return test_dl
Example #5
def load_data(params):
    tokenizer = JamoTokenizer()
    padder = PadSequence(length=params.get('pad_length'))
    train_path = params.get('train')
    valid_path = params.get('valid')
    batch_size = params.get('batch_size')
    train_ds = NaverMovieCorpus(train_path, tokenizer, padder)
    valid_ds = NaverMovieCorpus(valid_path, tokenizer, padder)
    train_dl = DataLoader(train_ds, batch_size, shuffle=True, drop_last=True)
    valid_dl = DataLoader(valid_ds, batch_size * 2, drop_last=False)
    return train_dl, valid_dl
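Both load_data variants only read a few keys via params.get, so any mapping works as the params argument. A hypothetical call (paths and sizes are placeholders; NaverMovieCorpus and JamoTokenizer come from the surrounding project):

params = {
    'pad_length': 32,
    'batch_size': 64,
    'train': 'data/train.txt',
    'valid': 'data/val.txt',
}
train_dl, valid_dl = load_data(params)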
Example #6
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length,
                         pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padder)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    embedding_dim = params['model'].get('embedding_dim')
    k_max = params['model'].get('k_max')

    model = VDCNN(num_classes=num_classes,
                  embedding_dim=embedding_dim,
                  k_max=k_max,
                  vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tst_path = cwd / params['filepath'].get('tst')

    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
    tst_ds = Corpus(tst_path, tokenizer.split_and_transform)
    tst_dl = DataLoader(tst_ds, batch_size=batch_size, num_workers=4)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)
    tst_acc = get_accuracy(model, tst_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(
        tr_acc, val_acc, tst_acc))
Example #7
def transform(raw_data, params):
    # Define the data transformation interface
    # raw_data --> batch_data

    batch_size = params.batch_size
    padding = params.padding
    num_buckets = params.num_buckets
    fixed_length = params.fixed_length

    features, labels = raw_data
    word_feature, word_radical_feature, char_feature, char_radical_feature = features
    batch_idxes = FixedBucketSampler([len(word_f) for word_f in word_feature],
                                     batch_size,
                                     num_buckets=num_buckets)
    batch = []
    for batch_idx in batch_idxes:
        batch_features = [[] for _ in range(len(features))]
        batch_labels = []
        for idx in batch_idx:
            for i, feature in enumerate(batch_features):
                batch_features[i].append(features[i][idx])
            batch_labels.append(labels[idx])
        batch_data = []
        word_mask = []
        char_mask = []
        for i, feature in enumerate(batch_features):
            max_len = max([len(fea) for fea in feature
                           ]) if not fixed_length else fixed_length
            padder = PadSequence(max_len, pad_val=padding)
            feature, mask = zip(*[(padder(fea), len(fea)) for fea in feature])
            if i == 0:
                word_mask = mask
            elif i == 2:
                char_mask = mask
            batch_data.append(mx.nd.array(feature))
        batch_data.append(mx.nd.array(word_mask))
        batch_data.append(mx.nd.array(char_mask))
        batch_data.append(mx.nd.array(batch_labels, dtype=np.int64))
        batch.append(batch_data)
    return batch[::-1]
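When fixed_length is set, every bucket is padded (or clipped) to the same length instead of its own max_len; gluonnlp's PadSequence clips longer sequences by default (clip=True). A tiny sketch of that behavior with toy values:

from gluonnlp.data import PadSequence

padder = PadSequence(4, pad_val=-1)    # e.g. fixed_length=4, padding=-1
print(padder([5, 6]))                  # [5, 6, -1, -1]
print(padder([5, 6, 7, 8, 9]))         # [5, 6, 7, 8], clipped to the fixed length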
Example #8
def main():
    test_path = Path.cwd() / 'data_in' / 'test.txt'
    vocab_path = Path.cwd() / 'data_in' / 'vocab.pkl'

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    tokenizer = MeCab()
    padder = PadSequence(length=70, pad_val=vocab.token_to_idx['<pad>'])
    test_ds = Corpus(test_path, vocab, tokenizer, padder)
    test_dl = DataLoader(test_ds, batch_size=1024)

    model = Net(vocab_len=len(vocab))

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(1):
        model.train()
        index = 0
        acc = 0
        for label, sen1, sen2 in tqdm(test_dl, disable=True):
            optimizer.zero_grad()

            pre_label = model(sen1, sen2)

            loss = loss_fn(pre_label, label)
            loss.backward()
            optimizer.step()

            pred_cls = pre_label.data.max(1)[1]
            acc += pred_cls.eq(label.data).cpu().sum()

            print("epoch: {}, index: {}, loss: {}".format((epoch + 1), index,
                                                          loss.item()))
            index += len(label)

        print('Accuracy : %d %%' % (100 * acc / index))
Example #9
def evaluate(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # restoring model
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    ckpt = torch.load(savepath)

    vocab = ckpt['vocab']
    model = SentenceCNN(num_classes=params['model'].get('num_classes'),
                        vocab=vocab)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)
    tst_filepath = os.path.join(os.getcwd(), params['filepath'].get('tst'))

    tst_ds = Corpus(tst_filepath, vocab, tagger, padder)
    tst_dl = DataLoader(tst_ds, batch_size=128, num_workers=4)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # evaluation
    correct_count = 0
    for x_mb, y_mb in tqdm(tst_dl):
        x_mb = x_mb.to(device)
        y_mb = y_mb.to(device)
        with torch.no_grad():
            y_mb_hat = model(x_mb)
            y_mb_hat = torch.max(y_mb_hat, 1)[1]
            correct_count += (y_mb_hat == y_mb).sum().item()

    print('Acc : {:.2%}'.format(correct_count / len(tst_ds)))
Example #10
counter = gluonnlp.data.count_tokens(itertools.chain.from_iterable(imdb_tok_train))
vocab = gluonnlp.Vocab(counter, bos_token="<s>", eos_token="</s>", min_freq=10)

def encode(toks):
    return [vocab[tok] for tok in toks]

imdb_x_train = [encode(toks) for toks in imdb_tok_train]


# Build data pipeline.
# TODO: Wrap x and y before making a dataset?
maxlen = max([len(x) for x in imdb_x_train])
dataset = SimpleDataset(imdb_x_train)


dataset = dataset.transform(PadSequence(maxlen))
dataset = dataset.transform(mxnet.nd.array)


# Build the model.
model_ctx = mxnet.cpu()
model = mxnet.gluon.nn.Sequential()
with model.name_scope():
    model.add(mxnet.gluon.nn.Embedding(len(vocab), embedding_size))
    model.add(mxnet.gluon.rnn.GRU(64, dropout=.2))
    model.add(mxnet.gluon.nn.Dense(1))
model.initialize(ctx=model_ctx)
loss = mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss()
opt = mxnet.gluon.Trainer(model.collect_params(), "sgd", {"learning_rate": .01})
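At this point dataset holds fixed-length token-id vectors but no labels yet (hence the TODO above). A sketch of feeding it through gluon's DataLoader just to confirm the batch shape (the batch size is made up):

batches = mxnet.gluon.data.DataLoader(dataset, batch_size=32, last_batch='discard')
for x in batches:
    print(x.shape)  # (32, maxlen)
    break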

Example #11
def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    with open(params['filepath'].get('vocab'), mode='rb') as io:
        vocab = pickle.load(io)

    # creating model
    model = SentenceCNN(num_classes=params['model'].get('num_classes'), vocab=vocab)

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)

    batch_size = params['training'].get('batch_size')
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))

    tr_ds = Corpus(tr_filepath, vocab, tagger, padder)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)

    val_ds = Corpus(val_filepath, vocab, tagger, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=params['training'].get('learning_rate'))

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')

    for epoch in tqdm(range(epochs), desc='epochs'):

        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            reg_term = torch.norm(model.fc.weight, p=2)
            tr_loss.add_(.5 * reg_term)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)

            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict(),
            'vocab': vocab}

    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)
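train above expects the JSON config to provide filepath, model, and training sections; the keys below are exactly the ones it reads (values are placeholders):

example_config = {
    "filepath": {"vocab": "data/vocab.pkl", "tr": "data/train.txt",
                 "val": "data/val.txt", "ckpt": "checkpoints/model.tar"},
    "model": {"num_classes": 2},
    "training": {"batch_size": 128, "learning_rate": 1e-3, "epochs": 5},
}
# a file with this structure, passed as cfgpath, would drive train(cfgpath)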
Example #12
                    help='Directory of config.json of data')
parser.add_argument('--model_dir',
                    default='experiments/base_model',
                    help="Directory containing config.json of model")

if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open('data/vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)
    padding = PadSequence(model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padding)

    # model
    model = VDCNN(num_classes=model_config.num_classes,
                  embedding_dim=model_config.embedding_dim,
                  k_max=model_config.k_max,
                  vocab=tokenizer.vocab)
    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
Example #13
def main(argv):
    train_data = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_data = Path.cwd() / '..' / 'data_in' / 'val.txt'
    test_data = Path.cwd() / '..' / 'data_in' / 'test.txt'
    dev_data = Path.cwd() / '..' / 'data_in' / 'dev.txt'
    # init params
    classes = FLAGS.classes
    max_length = FLAGS.length
    epochs = FLAGS.epochs
    learning_rate = FLAGS.learning_rate
    dim = FLAGS.embedding_dim
    global_step = 1000
    batch_size = FLAGS.batch_size

    with open(Path.cwd() / '..' / 'data_in' / 'vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_data)).shuffle(
        buffer_size=batch_size).batch(batch_size=batch_size)
    eval = tf.data.TextLineDataset(str(val_data)).batch(batch_size=batch_size)
    test = tf.data.TextLineDataset(str(test_data)).batch(batch_size=batch_size)
    dev = tf.data.TextLineDataset(str(dev_data)).batch(batch_size=batch_size)

    padder = PadSequence(max_length,
                         pad_val=vocab.to_indices(vocab.padding_token))
    processing = Corpus(vocab=vocab, split_fn=Split(), pad_fn=padder)

    # create model
    char_cnn = CharCNN(vocab=vocab, classes=classes, dim=dim)

    # create optimizer & loss_fn
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')

    # train_summary_writer = tf.summary.create_file_writer('./data_out/summaries/train')
    # eval_summary_writer = tf.summary.create_file_writer('./data_out/summaries/eval')

    # ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=char_cnn)
    # manager = tf.train.CheckpointManager(ckpt, './data_out/tf_ckpts', max_to_keep=3)
    # ckpt.restore(manager.latest_checkpoint)
    #
    # if manager.latest_checkpoint:
    #     print("Restored from {}".format(manager.latest_checkpoint))
    # else:
    #     print("Initializing from scratch.")

    #training
    for epoch in tqdm(range(epochs), desc='epochs'):

        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()
        tf.keras.backend.set_learning_phase(1)

        #with train_summary_writer.as_default():
        for step, val in tqdm(enumerate(train), desc='steps'):
            data, label = processing.token2idex(val)
            with tf.GradientTape() as tape:
                logits = char_cnn(data)
                train_loss = loss_fn(label, logits)

            #ckpt.step.assign_add(1)
            grads = tape.gradient(target=train_loss,
                                  sources=char_cnn.trainable_variables)
            opt.apply_gradients(
                grads_and_vars=zip(grads, char_cnn.trainable_variables))

            train_loss_metric.update_state(train_loss)
            train_acc_metric.update_state(label, logits)

            # if tf.equal(opt.iterations % global_step, 0):
            #     tf.summary.scalar('loss', train_loss_metric.result(), step=opt.iterations)

        tr_loss = train_loss_metric.result()

        #save_path = manager.save()
        #print(save_path)
        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}'.format(
            epoch + 1,
            train_acc_metric.result() * 100, tr_loss))
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', default=10, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    # parser.add_argument('--data_type', default='senCNN')
    parser.add_argument('--classes', default=2, type=int)
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--learning_rate', default=1e-3, type=float)
    # parser.add_argument('--print_freq', default=3000, type=int)
    # parser.add_argument('--weight_decay', default=5e-5, type=float)
    parser.add_argument('--word_dim', default=16, type=int)
    parser.add_argument('--word_max_len', default=300, type=int)
    parser.add_argument('--global_step', default=1000, type=int)
    parser.add_argument('--data_path', default='../data_in')
    parser.add_argument('--file_path', default='../nsmc-master')
    # parser.add_argument('--build_preprocessing', default=False)
    # parser.add_argument('--build_vocab', default=False)

    args = parser.parse_args()
    # p = Preprocessing(args)
    # p.makeProcessing()

    # v = Build_Vocab(args)
    # v.make_vocab()

    with open(args.data_path + '/' + 'vocab_char.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    padder = PadSequence(length=args.word_max_len,
                         pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padder)

    model = EfficientCharCRNN(args, vocab)

    epochs = args.epoch
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    global_step = args.global_step

    tr_ds = Corpus(args.data_path + '/train.txt',
                   tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds,
                       batch_size=batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(args.data_path + '/val.txt', tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    best_val_loss = 1e+10

    for epoch in tqdm(range(args.epoch), desc='epochs'):
        tr_loss = 0
        tr_acc = 0
        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x, y = map(lambda elm: elm.to(device), mb)
            opt.zero_grad()
            y_h = model(x)
            m_loss = loss_fn(y_h, y)
            m_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            with torch.no_grad():
                m_acc = acc(y_h, y)

            tr_loss += m_loss.item()
            tr_acc += m_acc.item()

        else:
            tr_loss /= (step + 1)
            tr_acc /= (step + 1)

            tr_summ = {'loss': tr_loss, 'acc': tr_acc}
            val_summ = evaluate(model, val_dl, {
                'loss': loss_fn,
                'acc': acc
            }, device)
            scheduler.step(val_summ['loss'])
            tqdm.write('epoch : {}, tr_loss: {:.3f}, val_loss: '
                       '{:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'.format(
                           epoch + 1, tr_summ['loss'], val_summ['loss'],
                           tr_summ['acc'], val_summ['acc']))

            val_loss = val_summ['loss']
            is_best = val_loss < best_val_loss

            if is_best:
                state = {
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'opt_state_dict': opt.state_dict()
                }
                summary = {'tr': tr_summ, 'val': val_summ}

                # manager.update_summary(summary)
                # manager.save_summary('summary.json')
                # manager.save_checkpoint(state, 'best.tar')

                best_val_loss = val_loss
Example #15
    params = json.loads(io.read())
    print(params)

# restoring model
savepath = params['filepath'].get('ckpt')
ckpt = torch.load(savepath)

vocab = ckpt['vocab']

model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# create dataset, dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')
model.to(device)

# evaluation
correct_count = 0
for x_mb, y_mb in tqdm(tst_dataloader):
    x_mb = x_mb.to(device)
    y_mb = y_mb.to(device)
    with torch.no_grad():
Example #16
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x_mb, y_mb = map(lambda elm: elm.to(device), mb)

            opt.zero_grad()
            mb_loss = loss_fn(model(x_mb), y_mb)
            mb_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1),
                                            'val': val_loss}, epoch * len(tr_dl) + step)

                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict()}

    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
Example #17
def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # creating preprocessor
    tokenizer = JamoTokenizer()
    padder = PadSequence(300)

    # creating model
    model = CharCNN(num_classes=params['model'].get('num_classes'),
                    embedding_dim=params['model'].get('embedding_dim'),
                    dic=tokenizer.token2idx)

    # creating dataset, dataloader
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))

    batch_size = params['training'].get('batch_size')
    tr_ds = Corpus(tr_filepath, tokenizer, padder)
    tr_dl = DataLoader(tr_ds,
                       batch_size=batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=True)
    val_ds = Corpus(val_filepath, tokenizer, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(),
                     lr=params['training'].get('learning_rate'))

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')

    for epoch in tqdm(range(epochs), desc='epochs'):

        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)

            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
            epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'opt_state_dict': opt.state_dict()
    }

    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)