def main(cfgpath):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tst_filepath = params['filepath'].get('tst')
    tr_ds = create_dataset(tr_filepath, batch_size, False, False)
    val_ds = create_dataset(val_filepath, batch_size, False, False)
    tst_ds = create_dataset(tst_filepath, batch_size, False, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=MeCab().morphs, pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.restore(save_path=tf.train.latest_checkpoint(proj_dir / 'checkpoint'))

    # evaluation
    tr_acc = get_accuracy(model, tr_ds, pre_processor.convert2idx)
    val_acc = get_accuracy(model, val_ds, pre_processor.convert2idx)
    tst_acc = get_accuracy(model, tst_ds, pre_processor.convert2idx)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))

def main(cfgpath, global_step):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tr_ds = create_dataset(tr_filepath, batch_size, True)
    val_ds = create_dataset(val_filepath, batch_size, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=MeCab().morphs, pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = params['training'].get('epochs')
    learning_rate = params['training'].get('learning_rate')
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    writer = tf.summary.create_file_writer(logdir='./runs/exp')

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()

            if tf.equal(opt.iterations % global_step, 0):
                with writer.as_default():
                    val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)
                    tf.summary.scalar('tr_loss', tr_loss / (step + 1), step=opt.iterations)
                    tf.summary.scalar('val_loss', val_loss, step=opt.iterations)
                tf.keras.backend.set_learning_phase(1)
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt_path = proj_dir / params['filepath'].get('ckpt')
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.save(ckpt_path)

def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes, lstm_hidden_dim=lstm_hidden_dim, hidden_dim=hidden_dim,
                da=da, r=r, vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}'.format(tr_acc, val_acc))

def main():
    train_path = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_path = Path.cwd() / '..' / 'data_in' / 'val.txt'
    vocab_path = Path.cwd() / '..' / 'data_in' / 'vocab.pkl'

    length = 70
    batch_size = 1024
    learning_rate = 0.01
    epochs = 10

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_path)).shuffle(buffer_size=1000).batch(batch_size=batch_size,
                                                                                     drop_remainder=True)
    eval = tf.data.TextLineDataset(str(val_path)).batch(batch_size=batch_size, drop_remainder=True)

    tokenizer = MeCab()
    processing = Corpus(vocab, tokenizer)

    linear = Linear(vocab)

    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

    for epoch in range(epochs):
        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()

        tf.keras.backend.set_learning_phase(1)
        for step, val in tqdm(enumerate(train)):
            print(val)

def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model (restore)
    save_path = cwd / params['filepath'].get('ckpt')
    ckpt = torch.load(save_path)
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)
    model.load_state_dict(ckpt['model_state_dict'])

    # evaluation
    batch_size = params['training'].get('batch_size')
    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tst_path = cwd / params['filepath'].get('tst')

    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, num_workers=4)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
    tst_ds = Corpus(tst_path, tokenizer.split_and_transform)
    tst_dl = DataLoader(tst_ds, batch_size=batch_size, num_workers=4)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    tr_acc = get_accuracy(model, tr_dl, device)
    val_acc = get_accuracy(model, val_dl, device)
    tst_acc = get_accuracy(model, tst_dl, device)

    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))

def evaluate(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    # restoring model
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    ckpt = torch.load(savepath)
    vocab = ckpt['vocab']
    model = SentenceCNN(num_classes=params['model'].get('num_classes'), vocab=vocab)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)
    tst_filepath = os.path.join(os.getcwd(), params['filepath'].get('tst'))
    tst_ds = Corpus(tst_filepath, vocab, tagger, padder)
    tst_dl = DataLoader(tst_ds, batch_size=128, num_workers=4)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # evaluation
    correct_count = 0
    for x_mb, y_mb in tqdm(tst_dl):
        x_mb = x_mb.to(device)
        y_mb = y_mb.to(device)
        with torch.no_grad():
            y_mb_hat = model(x_mb)
            y_mb_hat = torch.max(y_mb_hat, 1)[1]
            correct_count += (y_mb_hat == y_mb).sum().item()

    print('Acc : {:.2%}'.format(correct_count / len(tst_ds)))

def main():
    test_path = Path.cwd() / 'data_in' / 'test.txt'
    vocab_path = Path.cwd() / 'data_in' / 'vocab.pkl'

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    tokenizer = MeCab()
    padder = PadSequence(length=70, pad_val=vocab.token_to_idx['<pad>'])

    test_ds = Corpus(test_path, vocab, tokenizer, padder)
    test_dl = DataLoader(test_ds, batch_size=1024)

    model = Net(vocab_len=len(vocab))
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(1):
        model.train()
        index = 0
        acc = 0
        for label, sen1, sen2 in tqdm(test_dl, disable=True):
            optimizer.zero_grad()
            pre_label = model(sen1, sen2)
            loss = loss_fn(pre_label, label)
            loss.backward()
            optimizer.step()

            pred_cls = pre_label.data.max(1)[1]
            acc += pred_cls.eq(label.data).cpu().sum()
            print("epoch: {}, index: {}, loss: {}".format((epoch + 1), index, loss.item()))
            index += len(label)

        print('Accuracy : %d %%' % (100 * acc / index))

def main(argv):
    train_data = Path.cwd() / 'data_in' / 'train.txt'
    val_data = Path.cwd() / 'data_in' / 'val.txt'

    with open(Path.cwd() / 'data_in' / 'vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_data)).shuffle(buffer_size=1000).batch(batch_size=FLAGS.batch_size,
                                                                                     drop_remainder=True)
    eval = tf.data.TextLineDataset(str(val_data)).batch(batch_size=FLAGS.batch_size, drop_remainder=True)

    tokenized = MeCab()
    processing = Corpus(vocab=vocab, tokenizer=tokenized)

    # init params
    classes = FLAGS.classes
    max_length = FLAGS.length
    epochs = FLAGS.epochs
    learning_rate = FLAGS.learning_rate
    global_step = 1000

    # create model
    sen_cnn = SenCNN(vocab=vocab, classes=classes)

    # create optimizer & loss_fn
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

    train_summary_writer = tf.summary.create_file_writer('./data_out/summaries/train')
    eval_summary_writer = tf.summary.create_file_writer('./data_out/summaries/eval')

    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=sen_cnn)
    manager = tf.train.CheckpointManager(ckpt, './data_out/tf_ckpts', max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)

    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()

        tf.keras.backend.set_learning_phase(1)
        tr_loss = 0

        with train_summary_writer.as_default():
            for step, val in tqdm(enumerate(train), desc='steps'):
                data, label = processing.token2idex(val)
                with tf.GradientTape() as tape:
                    logits = sen_cnn(data)
                    train_loss = loss_fn(label, logits)
                ckpt.step.assign_add(1)
                grads = tape.gradient(target=train_loss, sources=sen_cnn.trainable_variables)
                opt.apply_gradients(grads_and_vars=zip(grads, sen_cnn.trainable_variables))
                # tr_loss += pred_loss.numpy()
                train_loss_metric.update_state(train_loss)
                train_acc_metric.update_state(label, logits)
                if tf.equal(opt.iterations % global_step, 0):
                    tf.summary.scalar('loss', train_loss_metric.result(), step=opt.iterations)
            # else:
            #     tr_loss /= (step + 1)
            #     print("t_loss {}".format(tr_loss))

        tr_loss = train_loss_metric.result()
        save_path = manager.save()
        print(save_path)

        tf.keras.backend.set_learning_phase(0)
        val_loss = 0

        with eval_summary_writer.as_default():
            for step, val in tqdm(enumerate(eval), desc='steps'):
                data, label = processing.token2idex(val)
                logits = sen_cnn(data)
                val_loss = loss_fn(label, logits)
                # val_loss += mb_loss.numpy()
                val_loss_metric.update_state(val_loss)
                val_acc_metric.update_state(label, logits)
                tf.summary.scalar('loss', val_loss_metric.result(), step=step)

        val_loss = val_loss_metric.result()
        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}, val_acc : {:.3f}%, val_loss : {:.3f}'.format(
            epoch + 1, train_acc_metric.result() * 100, tr_loss, val_acc_metric.result() * 100, val_loss))

def train(cfgpath):
    # parsing json
    with open(os.path.join(os.getcwd(), cfgpath)) as io:
        params = json.loads(io.read())

    with open(params['filepath'].get('vocab'), mode='rb') as io:
        vocab = pickle.load(io)

    # creating model
    model = SentenceCNN(num_classes=params['model'].get('num_classes'), vocab=vocab)

    # creating dataset, dataloader
    tagger = MeCab()
    padder = PadSequence(length=30)
    batch_size = params['training'].get('batch_size')
    tr_filepath = os.path.join(os.getcwd(), params['filepath'].get('tr'))
    val_filepath = os.path.join(os.getcwd(), params['filepath'].get('val'))
    tr_ds = Corpus(tr_filepath, vocab, tagger, padder)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_filepath, vocab, tagger, padder)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    # training
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=params['training'].get('learning_rate'))

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    epochs = params['training'].get('epochs')
    for epoch in tqdm(range(epochs), desc='epochs'):
        avg_tr_loss = 0
        avg_val_loss = 0
        tr_step = 0
        val_step = 0

        model.train()
        for x_mb, y_mb in tqdm(tr_dl, desc='iters'):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)
            score = model(x_mb)

            opt.zero_grad()
            tr_loss = loss_fn(score, y_mb)
            reg_term = torch.norm(model.fc.weight, p=2)
            tr_loss.add_(.5 * reg_term)
            tr_loss.backward()
            opt.step()

            avg_tr_loss += tr_loss.item()
            tr_step += 1
        else:
            avg_tr_loss /= tr_step

        model.eval()
        for x_mb, y_mb in tqdm(val_dl):
            x_mb = x_mb.to(device)
            y_mb = y_mb.to(device)

            with torch.no_grad():
                score = model(x_mb)
                val_loss = loss_fn(score, y_mb)
                avg_val_loss += val_loss.item()
                val_step += 1
        else:
            avg_val_loss /= val_step

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

    ckpt = {'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'opt_state_dict': opt.state_dict(),
            'vocab': vocab}
    savepath = os.path.join(os.getcwd(), params['filepath'].get('ckpt'))
    torch.save(ckpt, savepath)

def main(argv):
    train_path = Path.cwd() / '..' / 'data_in' / 'train.txt'
    val_path = Path.cwd() / '..' / 'data_in' / 'val.txt'
    test_path = Path.cwd() / '..' / 'data_in' / 'test.txt'
    vocab_path = Path.cwd() / '..' / 'data_in' / 'vocab.pkl'

    batch_size = FLAGS.batch_size
    learning_rate = FLAGS.learning_rate
    epochs = FLAGS.epochs

    with open(vocab_path, mode='rb') as io:
        vocab = pickle.load(io)

    train = tf.data.TextLineDataset(str(train_path)).shuffle(buffer_size=batch_size).batch(batch_size=batch_size,
                                                                                           drop_remainder=True)
    eval = tf.data.TextLineDataset(str(val_path)).batch(batch_size=batch_size, drop_remainder=True)
    test = tf.data.TextLineDataset(str(test_path)).batch(batch_size=batch_size, drop_remainder=True)

    tokenizer = MeCab()
    corpus = Corpus(vocab, tokenizer)

    hbmp = HBMP(vocab_len=len(vocab))
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    # loss, accuracy
    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')
    test_loss_metric = tf.keras.metrics.Mean(name='test_loss')
    test_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    for epoch in range(epochs):
        train_loss_metric.reset_states()
        train_acc_metric.reset_states()
        val_loss_metric.reset_states()
        val_acc_metric.reset_states()

        # training
        tf.keras.backend.set_learning_phase(1)
        for step, val in tqdm(enumerate(train)):
            sen1, sen2, label = corpus.token2idx(val)
            with tf.GradientTape() as tape:
                logits = hbmp(sen1, sen2)
                train_loss = loss_fn(label, logits)
            grads = tape.gradient(target=train_loss, sources=hbmp.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, hbmp.trainable_variables))

            train_loss_metric.update_state(train_loss)
            train_acc_metric.update_state(label, logits)

        tr_loss = train_loss_metric.result()
        tqdm.write('epoch : {}, tr_acc : {:.3f}%, tr_loss : {:.3f}'.format(
            epoch + 1, train_acc_metric.result() * 100, tr_loss))

        # validation: forward pass only, no gradient updates on validation data
        tf.keras.backend.set_learning_phase(0)
        for step, val in tqdm(enumerate(eval)):
            sen1, sen2, label = corpus.token2idx(val)
            logits = hbmp(sen1, sen2)
            val_loss = loss_fn(label, logits)

            val_loss_metric.update_state(val_loss)
            val_acc_metric.update_state(label, logits)

        v_loss = val_loss_metric.result()
        tqdm.write('epoch : {}, val_acc : {:.3f}%, val_loss : {:.3f}'.format(
            epoch + 1, val_acc_metric.result() * 100, v_loss))

        # test: forward pass only, no gradient updates on test data
        tf.keras.backend.set_learning_phase(0)
        test_loss_metric.reset_states()
        test_acc_metric.reset_states()
        for step, val in tqdm(enumerate(test)):
            sen1, sen2, label = corpus.token2idx(val)
            logits = hbmp(sen1, sen2)
            test_loss = loss_fn(label, logits)

            test_loss_metric.update_state(test_loss)
            test_acc_metric.update_state(label, logits)

        t_loss = test_loss_metric.result()
        tqdm.write('epoch : {}, test_acc : {:.3f}%, test_loss : {:.3f}'.format(
            epoch + 1, test_acc_metric.result() * 100, t_loss))

import itertools
import pickle
import pandas as pd
import gluonnlp as nlp
from pathlib import Path
from mecab import MeCab

# train path
train_path = Path.cwd() / 'nsmc-master' / 'ratings_train.txt'

# load the tab-separated train data, keeping the document and label columns
tr = pd.read_csv(train_path, sep='\t').loc[:, ['document', 'label']]

# define the MeCab tokenizer
tokenizer = MeCab()

# split each entry of the document column into MeCab morphemes and collect the results as a list
tokenized = tr['document'].apply(lambda elm: tokenizer.morphs(str(elm))).tolist()

# count every token in the tokenized sentences
counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))

# keep only tokens that occur at least 10 times in the vocab
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

nlp.embedding.list_sources()

# load the fastText embedding trained on the wiki.ko corpus
embedding = nlp.embedding.create('fasttext', source='wiki.ko')

# attach the embedding vectors to the vocab
vocab.set_embedding(embedding)

# save vocab.pkl

    identity = torch.eye(r).to(device)
    p = torch.norm(sim_mat - identity, dim=(1, 2)).mean()
    return p


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    model = SAN(num_classes=model_config.num_classes, lstm_hidden_dim=model_config.lstm_hidden_dim,
                da=model_config.da, r=model_config.r, hidden_dim=model_config.hidden_dim, vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4,
import pandas as pd
import gluonnlp as nlp
import itertools
import pickle
from pathlib import Path
from mecab import MeCab

# loading dataset
cwd = Path.cwd()
tr_path = cwd / 'data' / 'train.txt'
tr = pd.read_csv(tr_path, sep='\t').loc[:, ['document', 'label']]

# extracting morphs from sentences
tokenizer = MeCab().morphs
list_of_tokens = tr['document'].apply(tokenizer).tolist()

# making the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable(list_of_tokens))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving vocab
with open('./data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

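# A minimal sanity-check sketch (an addition, assuming the script above has already written
# ./data/vocab.pkl): reload the pickled vocab and confirm the fastText vectors were attached.
import pickle

with open('./data/vocab.pkl', mode='rb') as io:
    reloaded_vocab = pickle.load(io)

print(len(reloaded_vocab))                                      # vocabulary size
print(reloaded_vocab.to_indices(reloaded_vocab.padding_token))  # index of the '<pad>' token
print(reloaded_vocab.embedding.idx_to_vec.shape)                # (len(vocab), embedding_dim)
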
                    help="Directory containing config.json of model")


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=pad_sequence)

    # model
    model = SenCNN(num_classes=model_config.num_classes, vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size)

import pandas as pd
import gluonnlp as nlp
import itertools
import pickle  # required below for pickle.dump
from pathlib import Path
from sklearn.model_selection import train_test_split
from mecab import MeCab

# loading dataset
cwd = Path.cwd()
dataset = pd.read_csv(cwd / 'data' / 'kor_pair_train.csv').filter(items=['question1', 'question2', 'is_duplicate'])
tst = pd.read_csv(cwd / 'data' / 'kor_pair_test.csv').filter(items=['question1', 'question2', 'is_duplicate'])
total = pd.concat([dataset, tst], axis=0, ignore_index=True, sort=False)
tr, val = train_test_split(total, test_size=.2, random_state=777)

# extracting morphs from sentences
list_of_tokens = pd.concat([tr['question1'], tr['question2']]).apply(MeCab().morphs).tolist()

# making the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable(list_of_tokens))
vocab = nlp.Vocab(counter=counter, min_freq=5, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving vocab
with open(cwd / 'data' / 'vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

tr.to_csv(cwd / 'data' / 'train.txt', index=False)

def __init__(self, language):
    # translation table that deletes punctuation (curly quotes included; '.', '!' and '?' are kept)
    punct = '"“”#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
    self._table = str.maketrans({key: None for key in punct})
    self._morphs = MeCab().morphs
    self._language = language

def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    num_classes = params['model'].get('num_classes')
    lstm_hidden_dim = params['model'].get('lstm_hidden_dim')
    hidden_dim = params['model'].get('hidden_dim')
    da = params['model'].get('da')
    r = params['model'].get('r')
    model = SAN(num_classes=num_classes, lstm_hidden_dim=lstm_hidden_dim, hidden_dim=hidden_dim,
                da=da, r=r, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True,
                       collate_fn=batchify)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4, collate_fn=batchify)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))

    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        model.train()

        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            queries_a_mb, queries_b_mb, y_mb = map(lambda elm: elm.to(device), mb)
            queries_mb = (queries_a_mb, queries_b_mb)

            opt.zero_grad()
            score, queries_a_attn_mat, queries_b_attn_mat = model(queries_mb)
            a_reg = regularize(queries_a_attn_mat, r, device)
            b_reg = regularize(queries_b_attn_mat, r, device)
            mb_loss = loss_fn(score, y_mb)
            mb_loss.add_(a_reg)
            mb_loss.add_(b_reg)
            mb_loss.backward()
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1), 'validation': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(), 'opt_state_dict': opt.state_dict()}
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)

import sys  # required below for sys.argv
import os  # required below for os.path.splitext
import signal
import pickle
import gensim
import numpy as np
from mecab import MeCab
from jumanpp import Jumanpp

signal.signal(signal.SIGINT, signal.SIG_DFL)

num_clusters = int(sys.argv[1])
dimension = int(sys.argv[2])
wikiFile = sys.argv[3]

baseFile = os.path.splitext(wikiFile)[0]
probaVecFile = baseFile + '.pvec'

analyzer = MeCab()
proba_wordvecs = pickle.load(open(probaVecFile, 'rb'))


def scdv(sentence):
    words = analyzer.analysis(sentence)
    vec = np.zeros(num_clusters * dimension, dtype=np.float32)
    for word in words:
        if word in proba_wordvecs:
            vec += proba_wordvecs[word]
    return vec

from mecab import MeCab
from sklearn.model_selection import train_test_split

# loading dataset
tr_filepath = os.path.join(os.getcwd(), 'data/ratings_train.txt')
data = pd.read_table(tr_filepath).loc[:, ['document', 'label']]
data = data.loc[data['document'].isna().apply(lambda elm: not elm), :]
tr_data, val_data = train_test_split(data, test_size=.2)

tst_filepath = os.path.join(os.getcwd(), 'data/ratings_test.txt')
tst_data = pd.read_table(tst_filepath).loc[:, ['document', 'label']]
tst_data = tst_data.loc[tst_data['document'].isna().apply(lambda elm: not elm), :]

# extracting morphs from sentences
tokenizer = MeCab()
data['document'] = data['document'].apply(tokenizer.morphs)

# making the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable([token for token in data['document']]))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving vocab
with open('./data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

# saving tr_data, val_data, tst_data

import re
import string
import unicodedata
from mecab import MeCab
from typing import List

split_morphs = MeCab().morphs


def split_space(sentence: str) -> List[str]:
    return re.split(r'\s+', sentence)


def preprocess(sentence):
    sentence = re.sub(r'[' + re.escape(string.punctuation) + ']', '', sentence)
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    sentence = unicodedata.normalize('NFKD', sentence)
    sentence = sentence.strip()
    sentence = sentence.lower()
    return sentence

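# A minimal usage sketch of the helpers above (an addition; the sample sentence is illustrative only):
# preprocess() strips punctuation, collapses whitespace, normalizes, and lowercases, after which the
# text can be split on whitespace or into MeCab morphemes.
if __name__ == '__main__':
    sample = 'Hello,   World!!'
    cleaned = preprocess(sample)   # -> 'hello world'
    print(split_space(cleaned))    # -> ['hello', 'world']
    print(split_morphs(cleaned))   # MeCab morpheme segmentation of the cleaned sentence
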
parser.add_argument('--data_dir', default='data', help="Directory containing config.json of data")
parser.add_argument('--model_dir', default='experiments/base_model',
                    help="Directory containing config.json of model")


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=pad_sequence)

    # model
    model = SenCNN(num_classes=model_config.num_classes, vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(data_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

def main(json_path):
    cwd = Path.cwd()
    with open(cwd / json_path) as io:
        params = json.loads(io.read())

    # tokenizer
    vocab_path = params['filepath'].get('vocab')
    with open(cwd / vocab_path, mode='rb') as io:
        vocab = pickle.load(io)
    length = params['padder'].get('length')
    padder = PadSequence(length=length, pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=padder)

    # model
    num_classes = params['model'].get('num_classes')
    model = SenCNN(num_classes=num_classes, vocab=tokenizer.vocab)

    # training
    epochs = params['training'].get('epochs')
    batch_size = params['training'].get('batch_size')
    learning_rate = params['training'].get('learning_rate')
    global_step = params['training'].get('global_step')

    tr_path = cwd / params['filepath'].get('tr')
    val_path = cwd / params['filepath'].get('val')
    tr_ds = Corpus(tr_path, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
    val_ds = Corpus(val_path, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(opt, patience=5)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    writer = SummaryWriter('./runs/{}'.format(params['version']))

    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        model.train()

        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            x_mb, y_mb = map(lambda elm: elm.to(device), mb)

            opt.zero_grad()
            mb_loss = loss_fn(model(x_mb), y_mb)
            mb_loss.backward()
            clip_grad_norm_(model._fc.weight, 5)
            opt.step()

            tr_loss += mb_loss.item()

            if (epoch * len(tr_dl) + step) % global_step == 0:
                val_loss = evaluate(model, val_dl, loss_fn, device)
                writer.add_scalars('loss', {'train': tr_loss / (step + 1), 'val': val_loss},
                                   epoch * len(tr_dl) + step)
                model.train()
        else:
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_dl, loss_fn, device)
        scheduler.step(val_loss)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt = {'model_state_dict': model.state_dict(), 'opt_state_dict': opt.state_dict()}
    save_path = cwd / params['filepath'].get('ckpt')
    torch.save(ckpt, save_path)