def transform(raw_data, params):
    # Data-transformation interface: raw_data --> batch_data
    num_buckets = params.num_buckets
    batch_size = params.batch_size
    responses = raw_data

    batch_idxes = FixedBucketSampler([len(rs) for rs in responses], batch_size, num_buckets=num_buckets)
    batch = []

    def index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_rs = []
        batch_pick_index = []
        batch_labels = []
        for idx in batch_idx:
            batch_rs.append([index(r) for r in responses[idx]])
            if len(responses[idx]) <= 1:
                pick_index, labels = [], []
            else:
                pick_index, labels = zip(*[(r[0], 0 if r[1] <= 0 else 1) for r in responses[idx][1:]])
            batch_pick_index.append(list(pick_index))
            batch_labels.append(list(labels))

        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(labels) for labels in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels)) for labels in batch_labels])
        batch_pick_index = [padder(pick_index) for pick_index in batch_pick_index]

        batch.append([mx.nd.array(batch_rs), mx.nd.array(data_mask),
                      mx.nd.array(batch_labels), mx.nd.array(batch_pick_index),
                      mx.nd.array(label_mask)])

    return batch
def transform(raw_data, params):
    # Data-transformation interface: raw_data --> batch_data
    num_buckets = params.num_buckets
    batch_size = params.batch_size
    responses = raw_data

    # Samples of a given length are always assigned to the same fixed bucket.
    batch_idxes = FixedBucketSampler([len(rs) for rs in responses], batch_size, num_buckets=num_buckets)
    batch = []

    def index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_rs = []
        batch_pick_index = []
        batch_labels = []
        for idx in batch_idx:
            # 1. Correct and incorrect answers are stored under different ids.
            # 2. batch_pick_index[i] and batch_labels[i] are one element shorter than batch_rs[i].
            batch_rs.append([index(r) for r in responses[idx]])
            if len(responses[idx]) <= 1:
                pick_index, labels = [], []
            else:
                pick_index, labels = zip(*[(r[0], 0 if r[1] <= 0 else 1) for r in responses[idx][1:]])
            batch_pick_index.append(list(pick_index))
            batch_labels.append(list(labels))

        # max_len differs from bucket to bucket.
        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        # batch_rs is padded to max_len; data_mask records the number of valid entries per row.
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(labels) for labels in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels)) for labels in batch_labels])
        batch_pick_index = [padder(pick_index) for pick_index in batch_pick_index]

        # Every sequence is padded to a fixed length; data_mask / label_mask hold the valid lengths.
        # len(batch_labels[i]) + 1 == len(batch_rs[i])
        batch.append([torch.tensor(batch_rs), torch.tensor(data_mask),
                      torch.tensor(batch_labels), torch.tensor(batch_pick_index),
                      torch.tensor(label_mask)])

    return batch
def transform(raw_data, params):
    # Data-transformation interface: raw_data --> batch_data
    num_buckets = params.num_buckets
    batch_size = params.batch_size
    responses = raw_data

    batch_idxes = FixedBucketSampler([len(rs) for rs in responses], batch_size, num_buckets=num_buckets)
    batch = []

    def response_index(r):
        correct = 0 if r[1] <= 0 else 1
        return r[0] * 2 + correct

    def question_index(r):
        return r[0]

    for batch_idx in tqdm(batch_idxes, "batchify"):
        batch_qs = []
        batch_rs = []
        batch_labels = []
        for idx in batch_idx:
            batch_qs.append([question_index(r) for r in responses[idx]])
            batch_rs.append([response_index(r) for r in responses[idx]])
            labels = [0 if r[1] <= 0 else 1 for r in responses[idx]]
            batch_labels.append(list(labels))

        max_len = max([len(rs) for rs in batch_rs])
        padder = PadSequence(max_len, pad_val=0)
        batch_qs, _ = zip(*[(padder(qs), len(qs)) for qs in batch_qs])
        batch_rs, data_mask = zip(*[(padder(rs), len(rs)) for rs in batch_rs])

        max_len = max([len(labels) for labels in batch_labels])
        padder = PadSequence(max_len, pad_val=0)
        batch_labels, label_mask = zip(*[(padder(labels), len(labels)) for labels in batch_labels])

        batch.append([mx.nd.array(batch_qs, dtype="float32"),
                      mx.nd.array(batch_rs, dtype="float32"),
                      mx.nd.array(data_mask),
                      mx.nd.array(batch_labels),
                      mx.nd.array(label_mask)])

    return batch
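# A minimal, self-contained sketch of the bucketing-plus-padding idea used by the
# transform() variants above, assuming gluonnlp's FixedBucketSampler and PadSequence.
# The toy `responses` list of (question_id, correctness) pairs is made up for illustration.
from gluonnlp.data import FixedBucketSampler, PadSequence

responses = [
    [(3, 1), (5, 0)],                      # a short interaction sequence
    [(1, 1), (2, 1), (4, 0), (7, 1)],      # a longer one
    [(2, 0)],
]

sampler = FixedBucketSampler([len(rs) for rs in responses], batch_size=2, num_buckets=2)
for batch_idx in sampler:
    # encode correct/incorrect responses under different ids, as in index()/response_index()
    seqs = [[r[0] * 2 + (0 if r[1] <= 0 else 1) for r in responses[i]] for i in batch_idx]
    padder = PadSequence(max(len(s) for s in seqs), pad_val=0)
    padded = [padder(s) for s in seqs]      # every row in the batch now has the same length
    lengths = [len(s) for s in seqs]        # kept separately as the mask of valid positions
    print(batch_idx, padded, lengths)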
def load_data(params):
    tokenizer = JamoTokenizer()
    padder = PadSequence(length=params.get('pad_length'))
    batch_size = params.get('batch_size')
    test_path = params.get('test')

    test_ds = NaverMovieCorpus(test_path, tokenizer, padder)
    test_dl = DataLoader(test_ds, batch_size * 2, drop_last=False)
    return test_dl
def load_data(params):
    tokenizer = JamoTokenizer()
    padder = PadSequence(length=params.get('pad_length'))
    train_path = params.get('train')
    valid_path = params.get('valid')
    batch_size = params.get('batch_size')

    train_ds = NaverMovieCorpus(train_path, tokenizer, padder)
    valid_ds = NaverMovieCorpus(valid_path, tokenizer, padder)
    train_dl = DataLoader(train_ds, batch_size, shuffle=True, drop_last=True)
    valid_dl = DataLoader(valid_ds, batch_size * 2, drop_last=False)
    return train_dl, valid_dl
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padder)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    embedding_dim = params['model'].get('embedding_dim')
    k_max = params['model'].get('k_max')
    model = VDCNN(num_classes=num_classes, embedding_dim=embedding_dim,
                  k_max=k_max, vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tst_path = cwd / params['filepath'].get('tst')

    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
    tst_ds = Corpus(tst_path, tokenizer.split_and_transform)
    tst_dl = DataLoader(tst_ds, batch_size=batch_size, num_workers=4)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)
    tst_acc = get_accuracy(model, tst_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))
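# get_accuracy is called above but not defined in this snippet; a minimal sketch of the
# helper it appears to expect (model, DataLoader, device -> float), written as an
# assumption rather than the original implementation.
import torch

def get_accuracy(model, data_loader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x_mb, y_mb in data_loader:
            x_mb, y_mb = x_mb.to(device), y_mb.to(device)
            y_hat = model(x_mb).argmax(dim=1)   # predicted class per example
            correct += (y_hat == y_mb).sum().item()
            total += y_mb.size(0)
    return correct / total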
def transform(raw_data, params):
    # Data-transformation interface: raw_data --> batch_data
    batch_size = params.batch_size
    padding = params.padding
    num_buckets = params.num_buckets
    fixed_length = params.fixed_length

    features, labels = raw_data
    word_feature, word_radical_feature, char_feature, char_radical_feature = features

    batch_idxes = FixedBucketSampler([len(word_f) for word_f in word_feature],
                                     batch_size, num_buckets=num_buckets)
    batch = []
    for batch_idx in batch_idxes:
        batch_features = [[] for _ in range(len(features))]
        batch_labels = []
        for idx in batch_idx:
            for i, feature in enumerate(batch_features):
                batch_features[i].append(features[i][idx])
            batch_labels.append(labels[idx])

        batch_data = []
        word_mask = []
        char_mask = []
        for i, feature in enumerate(batch_features):
            max_len = max([len(fea) for fea in feature]) if not fixed_length else fixed_length
            padder = PadSequence(max_len, pad_val=padding)
            feature, mask = zip(*[(padder(fea), len(fea)) for fea in feature])
            if i == 0:
                word_mask = mask
            elif i == 2:
                char_mask = mask
            batch_data.append(mx.nd.array(feature))
        batch_data.append(mx.nd.array(word_mask))
        batch_data.append(mx.nd.array(char_mask))
        batch_data.append(mx.nd.array(batch_labels, dtype=np.int64))
        batch.append(batch_data)

    return batch[::-1]
def main():
    test_path = Path.cwd() / 'data_in' / 'test.txt'
    vocab_path = Path.cwd() / 'data_in' / 'vocab.pkl'
    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    tokenizer = MeCab()
    padder = PadSequence(length=70, pad_val=vocab.token_to_idx['<pad>'])

    test_ds = Corpus(test_path, vocab, tokenizer, padder)
    test_dl = DataLoader(test_ds, batch_size=1024)

    model = Net(vocab_len=len(vocab))
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(1):
        model.train()
        index = 0
        acc = 0
        for label, sen1, sen2 in tqdm(test_dl, disable=True):
            optimizer.zero_grad()
            pre_label = model(sen1, sen2)
            loss = loss_fn(pre_label, label)
            loss.backward()
            optimizer.step()

            pred_cls = pre_label.data.max(1)[1]
            acc += pred_cls.eq(label.data).cpu().sum()
            print("epoch: {}, index: {}, loss: {}".format((epoch + 1), index, loss.item()))
            index += len(label)
        print('Accuracy : %d %%' % (100 * acc / index))
def evaluate(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # restoring model
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    ckpt = torch.load(savepath)
    vocab = ckpt['vocab']
    model = SentenceCNN(num_classes=params['model'].get('num_classes'), vocab=vocab)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)
    tst_filepath = os.path.join(os.getcwd(), params['filepath'].get('tst'))
    tst_ds = Corpus(tst_filepath, vocab, tagger, padder)
    tst_dl = DataLoader(tst_ds, batch_size=128, num_workers=4)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # evaluation
    correct_count = 0
    for x_mb, y_mb in tqdm(tst_dl):
        x_mb = x_mb.to(device)
        y_mb = y_mb.to(device)
        with torch.no_grad():
            y_mb_hat = model(x_mb)
            y_mb_hat = torch.max(y_mb_hat, 1)[1]
            correct_count += (y_mb_hat == y_mb).sum().item()

    print('Acc : {:.2%}'.format(correct_count / len(tst_ds)))
counter = gluonnlp.data.count_tokens(itertools.chain.from_iterable(imdb_tok_train))
vocab = gluonnlp.Vocab(counter, bos_token="<s>", eos_token="</s>", min_freq=10)

def encode(toks):
    return [vocab[tok] for tok in toks]

imdb_x_train = [encode(toks) for toks in imdb_tok_train]

# Build data pipeline.
# TODO: Wrap x and y before making a dataset?
maxlen = max([len(x) for x in imdb_x_train])
dataset = SimpleDataset(imdb_x_train)
dataset = dataset.transform(PadSequence(maxlen))
dataset = dataset.transform(mxnet.nd.array)

# Build the model.
model_ctx = mxnet.cpu()
model = mxnet.gluon.nn.Sequential()
with model.name_scope():
    model.add(mxnet.gluon.nn.Embedding(len(vocab), embedding_size))
    model.add(mxnet.gluon.rnn.GRU(64, dropout=.2))
    model.add(mxnet.gluon.nn.Dense(1))
model.initialize(ctx=model_ctx)

loss = mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss()
opt = mxnet.gluon.Trainer(model.collect_params(), "sgd", {"learning_rate": .01})
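# The snippet above stops before the training loop; a minimal sketch of one pass over the
# data, assuming labels have been paired with the padded inputs (the TODO above). Names
# such as imdb_y_train and batch_size are assumptions, not part of the original snippet.
import mxnet
from mxnet import autograd
from mxnet.gluon.data import ArrayDataset, DataLoader

train_dl = DataLoader(ArrayDataset(dataset, mxnet.nd.array(imdb_y_train)),
                      batch_size=batch_size, shuffle=True)
for x, y in train_dl:
    with autograd.record():
        out = model(x)          # forward pass on a padded batch
        l = loss(out, y)        # sigmoid binary cross-entropy
    l.backward()
    opt.step(x.shape[0])        # normalize the gradient update by batch size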
def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    with open(params['filepath'].get('vocab'), mode='rb') as io:
        vocab = pickle.load(io)

    # creating model
    model = SentenceCNN(num_classes=params['model'].get('num_classes'), vocab=vocab)

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)
    batch_size = params['training'].get('batch_size')
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))

    tr_ds = Corpus(tr_filepath, vocab, tagger, padder)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_filepath, vocab, tagger, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=params['training'].get('learning_rate'))
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')
    for epoch in tqdm(range(epochs), desc='epochs'):
        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            reg_term = torch.norm(model.fc.weight, p=2)
            tr_loss.add_(.5 * reg_term)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict(),
            'vocab': vocab}
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)
                    help='Directory of config.json of data')
parser.add_argument('--model_dir', default='experiments/base_model',
                    help="Directory containing config.json of model")

if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open('data/vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)
    padding = PadSequence(model_config.length, pad_val=vocab.padding_token)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padding)

    # model
    model = VDCNN(num_classes=model_config.num_classes,
                  embedding_dim=model_config.embedding_dim,
                  k_max=model_config.k_max,
                  vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True,
                       num_workers=4, drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
def main(argv):
    train_data = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_data = Path.cwd() / '..' / 'data_in' / 'val.txt'
    test_data = Path.cwd() / '..' / 'data_in' / 'test.txt'
    dev_data = Path.cwd() / '..' / 'data_in' / 'dev.txt'

    # init params
    classes = FLAGS.classes
    max_length = FLAGS.length
    epochs = FLAGS.epochs
    learning_rate = FLAGS.learning_rate
    dim = FLAGS.embedding_dim
    global_step = 1000
    batch_size = FLAGS.batch_size

    with open(Path.cwd() / '..' / 'data_in' / 'vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_data)).shuffle(buffer_size=batch_size).batch(batch_size=batch_size)
    eval = tf.data.TextLineDataset(str(val_data)).batch(batch_size=batch_size)
    test = tf.data.TextLineDataset(str(test_data)).batch(batch_size=batch_size)
    dev = tf.data.TextLineDataset(str(dev_data)).batch(batch_size=batch_size)

    padder = PadSequence(max_length, pad_val=vocab.to_indices(vocab.padding_token))
    processing = Corpus(vocab=vocab, split_fn=Split(), pad_fn=padder)

    # create model
    char_cnn = CharCNN(vocab=vocab, classes=classes, dim=dim)

    # create optimizer & loss_fn
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

    # train_summary_writer = tf.summary.create_file_writer('./data_out/summaries/train')
    # eval_summary_writer = tf.summary.create_file_writer('./data_out/summaries/eval')

    # ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=char_cnn)
    # manager = tf.train.CheckpointManager(ckpt, './data_out/tf_ckpts', max_to_keep=3)
    # ckpt.restore(manager.latest_checkpoint)
    #
    # if manager.latest_checkpoint:
    #     print("Restored from {}".format(manager.latest_checkpoint))
    # else:
    #     print("Initializing from scratch.")

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()
        tf.keras.backend.set_learning_phase(1)

        # with train_summary_writer.as_default():
        for step, val in tqdm(enumerate(train), desc='steps'):
            data, label = processing.token2idex(val)
            with tf.GradientTape() as tape:
                logits = char_cnn(data)
                train_loss = loss_fn(label, logits)
            # ckpt.step.assign_add(1)
            grads = tape.gradient(target=train_loss, sources=char_cnn.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, char_cnn.trainable_variables))

            train_loss_metric.update_state(train_loss)
            train_acc_metric.update_state(label, logits)

            # if tf.equal(opt.iterations % global_step, 0):
            #     tf.summary.scalar('loss', train_loss_metric.result(), step=opt.iterations)

        tr_loss = train_loss_metric.result()
        # save_path = manager.save()
        # print(save_path)

        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}'.format(
            epoch + 1, train_acc_metric.result() * 100, tr_loss))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', default=10, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    # parser.add_argument('--data_type', default='senCNN')
    parser.add_argument('--classes', default=2, type=int)
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--learning_rate', default=1e-3, type=float)
    # parser.add_argument('--print_freq', default=3000, type=int)
    # parser.add_argument('--weight_decay', default=5e-5, type=float)
    parser.add_argument('--word_dim', default=16, type=int)
    parser.add_argument('--word_max_len', default=300, type=int)
    parser.add_argument('--global_step', default=1000, type=int)
    parser.add_argument('--data_path', default='../data_in')
    parser.add_argument('--file_path', default='../nsmc-master')
    # parser.add_argument('--build_preprocessing', default=False)
    # parser.add_argument('--build_vocab', default=False)
    args = parser.parse_args()

    # p = Preprocessing(args)
    # p.makeProcessing()
    # v = Build_Vocab(args)
    # v.make_vocab()

    with open(args.data_path + '/' + 'vocab_char.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    padder = PadSequence(length=args.word_max_len, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=padder)

    model = EfficientCharCRNN(args, vocab)

    epochs = args.epoch
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    global_step = args.global_step

    tr_ds = Corpus(args.data_path + '/train.txt', tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(args.data_path + '/val.txt', tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    best_val_loss = 1e+10

    for epoch in tqdm(range(args.epoch), desc='epochs'):
        tr_loss = 0
        tr_acc = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x, y = map(lambda elm: elm.to(device), mb)
            opt.zero_grad()
            y_h = model(x)
            m_loss = loss_fn(y_h, y)
            m_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            with torch.no_grad():
                m_acc = acc(y_h, y)
            tr_loss += m_loss.item()
            tr_acc += m_acc.item()
        else:
            tr_loss /= (step + 1)
            tr_acc /= (step + 1)

        tr_summ = {'loss': tr_loss, 'acc': tr_acc}
        val_summ = evaluate(model, val_dl, {'loss': loss_fn, 'acc': acc}, device)
        scheduler.step(val_summ['loss'])
        tqdm.write('epoch : {}, tr_loss: {:.3f}, val_loss: {:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'.format(
            epoch + 1, tr_summ['loss'], val_summ['loss'], tr_summ['acc'], val_summ['acc']))

        val_loss = val_summ['loss']
        is_best = val_loss < best_val_loss

        if is_best:
            state = {'epoch': epoch + 1,
                     'model_state_dict': model.state_dict(),
                     'opt_state_dict': opt.state_dict()}
            summary = {'tr': tr_summ, 'val': val_summ}
            # manager.update_summary(summary)
            # manager.save_summary('summary.json')
            # manager.save_checkpoint(state, 'best.tar')
            best_val_loss = val_loss
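# acc() is used in the training loop above but not defined in this snippet; a plausible
# sketch of an accuracy metric that returns a tensor (so that .item() works as above).
# This is an assumption, not the original helper.
import torch

def acc(yhat, y):
    with torch.no_grad():
        return (yhat.argmax(dim=-1) == y).float().mean()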
params = json.loads(io.read())
print(params)

# restoring model
savepath = params['filepath'].get('ckpt')
ckpt = torch.load(savepath)
vocab = ckpt['vocab']
model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# create dataset, dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# evaluation
correct_count = 0
for x_mb, y_mb in tqdm(tst_dataloader):
    x_mb = x_mb.to(device)
    y_mb = y_mb.to(device)
    with torch.no_grad():
def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0

        model.train()
        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x_mb, y_mb = map(lambda elm: elm.to(device), mb)
            opt.zero_grad()
            mb_loss = loss_fn(model(x_mb), y_mb)
            mb_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1), 'val': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(), 'opt_state_dict': opt.state_dict()}
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)
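# evaluate() is referenced in the training loop above but not defined in these snippets;
# a minimal sketch of the signature it appears to expect
# (model, DataLoader, loss_fn, device -> mean loss), written as an assumption.
import torch

def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    avg_loss = 0
    with torch.no_grad():
        for x_mb, y_mb in data_loader:
            x_mb, y_mb = x_mb.to(device), y_mb.to(device)
            avg_loss += loss_fn(model(x_mb), y_mb).item()
    return avg_loss / len(data_loader)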
def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # creating preprocessor
    tokenizer = JamoTokenizer()
    padder = PadSequence(300)

    # creating model
    model = CharCNN(num_classes=params['model'].get('num_classes'),
                    embedding_dim=params['model'].get('embedding_dim'),
                    dic=tokenizer.token2idx)

    # creating dataset, dataloader
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))
    batch_size = params['training'].get('batch_size')

    tr_ds = Corpus(tr_filepath, tokenizer, padder)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_filepath, tokenizer, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=params['training'].get('learning_rate'))
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')
    for epoch in tqdm(range(epochs), desc='epochs'):
        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
            epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict()}
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)