def train(x, y):
    """Train a TextCNN classifier on (x, y) with SGD; print per-epoch accuracy.

    Args:
        x: indexable sequence of input samples (converted to FloatTensor per batch).
        y: sequence of integer class labels aligned with x.

    Returns:
        The trained model (resident on the GPU).
    """
    model = TextCNN()
    model = model.cuda()
    # Materialize the trainable-parameter list: the original `filter(...)` is a
    # one-shot iterator that is exhausted by the first clip_grad_norm_ call,
    # silently disabling gradient clipping for every later step.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    # reduction='sum' is the modern spelling of the removed size_average=False.
    criterion = nn.CrossEntropyLoss(reduction='sum')
    for epoch in range(100):
        total = 0  # count of correct predictions this epoch
        # Floor division: `len(x) / 64` is a float in Python 3 and breaks range().
        for i in range(len(x) // 64):
            # torch.FloatTensor/LongTensor build tensors directly; the legacy
            # Variable wrapper is a no-op in modern PyTorch.
            batch_x = torch.FloatTensor(x[i * 64:(i + 1) * 64]).cuda()
            batch_y = torch.LongTensor(y[i * 64:(i + 1) * 64]).cuda()
            optimizer.zero_grad()
            model.train()
            pred = model(batch_x, 64)
            loss = criterion(pred, batch_y)
            loss.backward()
            # clip_grad_norm_ is the in-place replacement for the removed
            # nn.utils.clip_grad_norm.
            nn.utils.clip_grad_norm_(parameters, max_norm=3)
            total += np.sum(
                pred.data.max(1)[1].cpu().numpy() == batch_y.data.cpu().numpy())
            optimizer.step()
        # Accuracy is computed over len(x), so the trailing partial batch that is
        # never trained on still counts in the denominator (kept as original).
        print("epoch ", epoch + 1, " acc: ", float(total) / len(x))
    return model
def main(mode):
    """Build the TextCNN graph, then train or evaluate depending on `mode`."""
    # Unknown modes fall through without constructing anything, as before.
    if mode not in ('train', 'test'):
        return
    model = TextCNN(embedding_weights)
    model.bulid_graph()  # (sic) method name as defined on the model class
    if mode == 'train':
        model.train((x_train, y_train), (x_test, y_test))
    else:
        model.test((x_test, y_test), '1528038283')
def train(**kwargs):
    """Train a TextCNN sentiment classifier on the rt-polarity dataset.

    kwargs are merged into the module-level `opt` config object; each epoch
    prints the cumulative training loss and the accuracy on the held-out split.
    """
    opt.parse(kwargs)
    # Pick a specific GPU when CUDA is available, otherwise fall back to CPU.
    device = torch.device(
        "cuda:{}".format(opt.gpu_id) if torch.cuda.is_available() else "cpu")
    opt.device = device
    x_text, y = load_data_and_labels("./data/rt-polarity.pos",
                                     "./data/rt-polarity.neg")
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=opt.test_size)
    train_data = Data(x_train, y_train)
    test_data = Data(x_test, y_test)
    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)
    print("{} train data: {}, test data: {}".format(now(), len(train_data),
                                                    len(test_data)))
    model = TextCNN(opt)
    print("{} init model finished".format(now()))
    if opt.use_gpu:
        model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.lr,
                           weight_decay=opt.weight_decay)
    for epoch in range(opt.epochs):
        total_loss = 0.0
        model.train()
        for step, batch_data in enumerate(train_loader):
            x, labels = batch_data
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(device)
            # NOTE(review): `x` is fed to the model as produced by collate_fn;
            # presumably the model (or collate_fn) handles device placement of x.
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Evaluate on the held-out split after every epoch.
        acc = test(model, test_loader)
        print("{} {} epoch: loss: {}, acc: {}".format(now(), epoch, total_loss,
                                                      acc))
def build_textcnn_model(vocab, config, train=True):
    """Instantiate a TextCNN, set its train/eval mode, and place it on the GPU
    when CUDA is available (CPU otherwise)."""
    net = TextCNN(vocab.vocab_size, config)
    # train() enables dropout/BN updates; eval() freezes them for inference.
    if train:
        net.train()
    else:
        net.eval()
    # Device placement mirrors CUDA availability.
    device_move = net.cuda if torch.cuda.is_available() else net.cpu
    device_move()
    return net
def build_textcnn_model(vocab, config, train=True):
    """Build a TextCNN from the vocabulary size and config, set its mode, and
    move it to GPU when available."""
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()  # training mode: dropout active, BN statistics updated
    else:
        model.eval()  # eval mode: BN and Dropout are frozen to their trained values
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
def train(args): train_iter, dev_iter = data_processor.load_data(args) # 将数据分为训练集和验证集 print('加载数据完成') model = TextCNN(args) if args.cuda: model.cuda() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) steps = 0 best_acc = 0 last_step = 0 model.train() for epoch in range(1, args.epoch + 1): for batch in train_iter: feature, target = batch.text, batch.label # t_()函数表示将(max_len, batch_size)转置为(batch_size, max_len) # feature.data.t_(), target.data.sub_(1) # target减去1 feature = feature.data.t() # x.t() x是不变的,所以重新赋值 # target.data.sub_(1) if args.cuda: feature, target = feature.cuda(), target.cuda() optimizer.zero_grad() logits = model(feature) loss = F.cross_entropy(logits, target) loss.backward() optimizer.step() steps += 1 if steps % args.log_interval == 0: # torch.max(logits, 1)函数:返回每一行中最大值的那个元素,且返回其索引(返回最大元素在这一行的列索引) corrects = (torch.max(logits, 1)[1] == target).sum() train_acc = 100.0 * corrects / batch.batch_size sys.stdout.write( '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format( steps, loss.item(), train_acc, corrects, batch.batch_size)) if steps % args.test_interval == 0: dev_acc = eval(dev_iter, model, args) if dev_acc > best_acc: best_acc = dev_acc last_step = steps if args.save_best: print('Saving best model, acc: {:.4f}%\n'.format( best_acc)) save(model, args.save_dir, 'best', steps) else: if steps - last_step >= args.early_stopping: print('\nearly stop by {} steps, acc: {:.4f}%'.format( args.early_stopping, best_acc)) raise KeyboardInterrupt
def build_textcnn_model(vocab, config, train=True):
    """Build a TextCNN, select train/eval mode, and place it on GPU if possible."""
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()  # call train() before training the model
    else:
        model.eval()  # call eval() before testing: BN and Dropout are frozen to their trained values
        # train() and eval() exist because some layers behave differently in
        # training vs. evaluation — e.g. Batch Normalization and Dropout.
        # BN normalizes each intermediate layer and uses a learned affine
        # transform so the extracted feature distribution is not destroyed;
        # after training its parameters are fixed, so train/test behavior differs.
        # Dropout combats overfitting: during each training batch it ignores a
        # fraction of feature detectors, which noticeably reduces overfitting.
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
def objective(trial):
    """Optuna objective: train a TextCNN with trial-sampled optimizer and
    learning rate, report validation accuracy each epoch, and support pruning.

    Returns the final epoch's validation accuracy.
    """
    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    # Hyperparameters sampled per trial: optimizer class and log-uniform lr.
    optimizer_name = trial.suggest_categorical("optimizer",
                                               ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = []
        for batch in train_iter:
            # t_() transposes in place to (batch, seq) — assumes batch-second
            # layout from the iterator; TODO confirm against the Field config.
            text_idx_batch, label_idx_batch = batch.text.t_().to(
                device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()
        #print(f'Epoch[{epoch}] - Loss:{sum(epoch_loss)/len(epoch_loss)}')
        # Validation pass: accumulate predictions and ground truth as numpy arrays.
        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(
                    device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)
        acc = metrics.accuracy_score(labels_all, predict_all)
        # Report to Optuna so the pruner can stop unpromising trials early.
        trial.report(acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return acc
# 加载模型 net = TextCNN(args, weight).to(device) # 定义损失函数和优化器 criterion = nn.CrossEntropyLoss() # 使用交叉熵损失函数 optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=args.lr) # 模型训练 print('training on ', device) best_acc = 0.0 step = 0 for epoch in range(1, args.num_epochs + 1): net.train() for X, y in train_iter: X, y = X.to(device), y.to(device) y_hat = net(X) # 计算预测概率值 loss = criterion(y_hat, y) # 计算loss值 optimizer.zero_grad() # 梯度置零 loss.backward() # 反向传播 optimizer.step() # 参数更新 step += 1 # 测试 if step % args.test_per_step == 0: net.eval() all_pre = [] all_label = []
def main():
    """Train a binary TextCNN on a weighted-sampled corpus, checkpoint the best
    epoch by test accuracy, then enter an interactive prompt loop."""
    device = torch.device('cuda')
    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')
    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )
    dataset = TextDataset(CORPUS_DIR, text_processor)
    # split into training and test set
    # TODO: fix this splitting sometimes failing when corpus size changes
    train_set, test_set = torch.utils.data.random_split(
        dataset, [
            int(len(dataset) * DATA_SPLIT),
            int(len(dataset) * (1.0 - DATA_SPLIT))
        ])
    # count number of samples in each class (binary labels assumed: 0 or 1)
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1
    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum
    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))
    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]
    # weighted sampler to counter class imbalance
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)
    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))
    # number of filters in each convolutional filter
    N_FILTERS = 64
    # sizes and number of convolutional layers
    FILTER_SIZES = [2, 3]
    # dropout for between conv and dense layers
    DROPOUT = 0.5
    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)
    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    # BCELoss: the model is presumed to emit sigmoid probabilities — TODO confirm.
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    EPOCHS = 12
    best_acc = 0.0
    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data
            # send to device
            x = x.to(device)
            labels = labels.to(device)
            # make predictions
            predictions = model(x).squeeze()
            # calculate loss
            loss = criterion(predictions, labels)
            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # evaluate on the test split; m is a 2x2 confusion matrix
        with torch.no_grad():
            model.eval()
            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]
            for data in test_loader:
                x, label = data
                x = x.to(device)
                predictions = model(x).squeeze()
                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0
                    m[y][y_pred] += 1
                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1
            model.train()
        acc = correct / (correct + wrong)
        # keep only the newest best checkpoint on disk
        if acc > best_acc:
            best_acc = acc
            for file in glob.glob('models/model_*.pth'):
                os.remove(file)
            torch.save(model.state_dict(), f'models/state_{epoch}.pth')
        print()
        print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
        print('[[TN, FP], [FN, TP]]')
        print(m)
        print()
    # put into evaluation mode
    model.eval()
    text_processor.do_standardize = True
    # interactive inference loop (blocks forever; exit with Ctrl-C/EOF)
    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
# NOTE(review): this chunk continues a DataLoader(...) call whose opening
# falls outside this view; the fragment also ends mid-loop below.
batch_size=config.batch_size,
                            num_workers=2)
config.word_num = len(training_set.tok2num)
model = TextCNN(config)
if torch.cuda.is_available():
    model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)
training_lossse = []
# Train the model
for epoch in range(config.epoch):
    model.train()
    for data, label in training_iter:
        if config.cuda and torch.cuda.is_available():
            data = data.cuda()
            # NOTE(review): result bound to `labels` but `label` (CPU tensor)
            # is what criterion() receives below — looks like a bug; verify.
            labels = label.cuda()
        out = model(data)
        loss = criterion(out, label)
        training_lossse.append(loss.item())
        # NOTE(review): no backward()/step() visible in this view — presumably
        # they follow in the truncated remainder; confirm.
        if len(training_lossse) % 100 == 0:
            print("train epoch", epoch, end=' ')
            print("The loss is: %.5f" % (np.average(training_lossse[-100:])))
# -*- coding: utf-8 -*- import tensorflow.keras as keras import numpy as np from sklearn import metrics import os from preprocess import preprocesser from config import Config from model import TextCNN from model import LSTM np.random.seed(42) if __name__ == '__main__': CNN_model = TextCNN() CNN_model.train(5) CNN_model.test() # LSTM_MODEL = LSTM() # LSTM_MODEL.train(5) # LSTM_MODEL.test()
def train():
    """Train a TextCNN on the cnews dataset with periodic validation,
    best-checkpoint saving, and early stopping after 1000 stale batches."""
    # config file
    cf = Config('./config.yaml')
    # use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data,
                                  batch_size=cf.batch_size,
                                  shuffle=True)
    # test data (used here as the validation set)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)
    # pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer over trainable parameters only
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))
    # move the model to the chosen device
    model.to(device)
    # parallelize across GPUs when more than one is present
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    # training
    start_time = time.time()
    total_batch = 0  # total batches seen
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1000  # stop early after this many stale batches
    flag = False
    model.train()
    for epoch_id in trange(cf.epoch, desc="Epoch"):
        for step, batch in enumerate(
                tqdm(train_dataloader, "batch", total=len(train_dataloader))):
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            # The model returns the loss directly in training mode.
            loss = model(segment_ids, label_id)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_batch += 1
            if total_batch % cf.print_per_batch == 0:
                # Periodically evaluate on the current batch and validation set.
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = model.get_loss_acc(
                        segment_ids, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)
                if acc_val > best_acc_val:
                    # save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                    + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))
                model.train()
            # Early stop when validation accuracy has not improved for
            # `require_improvement` batches.
            if total_batch - last_improved > require_improvement:
                print("长时间未优化")  # "no improvement for a long time"
                flag = True
                break
        if flag:
            break
def main(args): print "loadding reviews and labels from dataset" data = pd.read_csv('data/labeledTrainData.tsv.zip', compression='zip', delimiter='\t', header=0, quoting=3) reviews = data["review"] labels = list(data['sentiment']) sentences = [] for review in reviews: if len(review) > 0: sentences.append( utils.review_to_wordlist(review.decode('utf8').strip(), remove_stopwords=True)) print "loaded %d reviews from dataset" % len(sentences) word_dict = utils.build_vocab(sentences, max_words=10000) vec_reviews = utils.vectorize(sentences, word_dict, verbose=True) train_x = vec_reviews[0:20000] train_y = labels[0:20000] train_y = utils.one_hot(train_y, args.nb_classes) test_x = vec_reviews[20000:] test_y = labels[20000:] test_y = utils.one_hot(test_y, args.nb_classes) save_dir = args.save_dir log_dir = args.log_dir if not os.path.exists(save_dir): os.makedirs(save_dir) if not os.path.exists(log_dir): os.makedirs(log_dir) with tf.Graph().as_default(): config_proto = utils.get_config_proto() sess = tf.Session(config=config_proto) if args.model_type == "cnn": model = TextCNN(args, "TextCNN") test_batch = utils.get_batches(test_x, test_y, args.max_size) elif args.model_type in ["rnn", "bi_rnn"]: model = TextRNN(args, "TextRNN") test_batch = utils.get_batches(test_x, test_y, args.max_size, type="rnn") sess.run(tf.global_variables_initializer()) summary_writer = tf.summary.FileWriter(log_dir, sess.graph) for epoch in range(1, args.nb_epochs + 1): print "epoch %d start" % epoch print "- " * 50 loss = 0. total_reviews = 0 accuracy = 0. 
if args.model_type == "cnn": train_batch = utils.get_batches(train_x, train_y, args.batch_size) elif args.model_type in ["rnn", "bi_rnn"]: train_batch = utils.get_batches(train_x, train_y, args.batch_size, type="rnn") epoch_start_time = time.time() step_start_time = epoch_start_time for idx, batch in enumerate(train_batch): reviews, reviews_length, labels = batch _, loss_t, accuracy_t, global_step, batch_size, summaries = model.train( sess, reviews, reviews_length, labels, args.keep_prob) loss += loss_t * batch_size total_reviews += batch_size accuracy += accuracy_t * batch_size summary_writer.add_summary(summaries, global_step) if global_step % 50 == 0: print "epoch %d, step %d, loss %f, accuracy %.4f, time %.2fs" % \ (epoch, global_step, loss_t, accuracy_t, time.time() - step_start_time) step_start_time = time.time() epoch_time = time.time() - epoch_start_time print "%.2f seconds in this epoch" % (epoch_time) print "train loss %f, train accuracy %.4f" % ( loss / total_reviews, accuracy / total_reviews) total_reviews = 0 accuracy = 0. for batch in test_batch: reviews, reviews_length, labels = batch accuracy_t, batch_size = model.test(sess, reviews, reviews_length, labels, 1.0) total_reviews += batch_size accuracy += accuracy_t * batch_size print "accuracy %.4f in %d test reviews" % ( accuracy / total_reviews, total_reviews)
class Trainer:
    """TF1 training driver: loads train/eval data, builds a TextCNN, and runs
    the train/eval loop with summary writing and periodic checkpointing."""

    def __init__(self, config):
        self.config = config
        self.train_data_loader = None
        self.eval_data_loader = None
        # Load the datasets (fills the two loaders above).
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = self.train_data_loader.gen_data(
        )
        self.vocab_size = self.train_data_loader.vocab_size
        self.word_vectors = self.train_data_loader.word_vectors
        print(f"train data size: {len(self.train_labels)}")
        print(f"vocab size: {self.vocab_size}")
        self.label_list = [value for key, value in label_to_idx.items()]
        self.eval_inputs, self.eval_labels = self.eval_data_loader.gen_data()
        # Initialize the model.
        self.model = TextCNN(config=self.config,
                             vocab_size=self.vocab_size,
                             word_vectors=self.word_vectors)

    def load_data(self):
        """Load the train and eval datasets."""
        self.train_data_loader = TrainData(self.config)
        # Use the validation set for in-training testing.
        self.config.test_data = self.config.eval_data
        self.eval_data_loader = TestData(self.config)

    def train(self):
        """Run the training loop; every `ckeckpoint_every` (sic) steps run a
        full eval pass and save a checkpoint."""
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9,
                                    allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())  # initialize variables
            current_step = 0
            # Create train/eval summary paths and writers.
            train_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/train")
            os.makedirs(train_summary_path, exist_ok=True)
            train_summary_writer = tf.summary.FileWriter(
                train_summary_path, sess.graph)
            eval_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/eval")
            os.makedirs(eval_summary_path, exist_ok=True)
            eval_summary_writer = tf.summary.FileWriter(
                eval_summary_path, sess.graph)
            # Train & Eval Process
            for epoch in range(self.config.epochs):
                print(f"----- Epoch {epoch + 1}/{self.config.epochs} -----")
                for batch in self.train_data_loader.next_batch(
                        self.train_inputs, self.train_labels,
                        self.config.batch_size):
                    summary, loss, predictions = self.model.train(
                        sess, batch, self.config.keep_prob)
                    train_summary_writer.add_summary(summary)
                    if self.config.num_classes == 1:
                        acc = get_binary_metrics(pred_y=predictions.tolist(),
                                                 true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))
                    elif self.config.num_classes > 1:
                        acc = get_multi_metrics(pred_y=predictions.tolist(),
                                                true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))
                    current_step += 1
                    if self.eval_data_loader and current_step % self.config.ckeckpoint_every == 0:
                        eval_losses = []
                        eval_accs = []
                        for eval_batch in self.eval_data_loader.next_batch(
                                self.eval_inputs, self.eval_labels,
                                self.config.batch_size):
                            eval_summary, eval_loss, eval_predictions = self.model.eval(
                                sess, eval_batch)
                            eval_summary_writer.add_summary(eval_summary)
                            eval_losses.append(eval_loss)
                            # BUG FIX: accuracy must be computed against the
                            # eval batch's labels, not the last *train* batch's.
                            if self.config.num_classes == 1:
                                acc = get_binary_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                            elif self.config.num_classes > 1:
                                acc = get_multi_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                        print(
                            f"Eval \tloss: {list_mean(eval_losses)}, acc: {list_mean(eval_accs)}"
                        )
                        if self.config.ckpt_model_path:
                            save_path = os.path.join(
                                self.config.BASE_DIR,
                                self.config.ckpt_model_path)
                            os.makedirs(save_path, exist_ok=True)
                            model_save_path = os.path.join(
                                save_path, self.config.model_name)
                            self.model.saver.save(sess,
                                                  model_save_path,
                                                  global_step=current_step)
def main(args): print "loadding data and labels from dataset" train = pd.read_csv(args.train_dir) ch_train = pd.read_csv(args.chtrain_dir) x_train = train["comment_text"] x_chtrain = ch_train["comment_text"] target_cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' ] x = [] x_ch = [] for line in x_train: if len(line) > 0: x.append(utils.review_to_wordlist(line.strip())) print "loaded %d comments from dataset" % len(x) for line in x_chtrain: if len(line) > 0: x_ch.append(utils.review_to_wordlist_char(line.strip())) print "loaded %d comments from dataset" % len(x) y = train[target_cols].values index2word, word2index = utils.load_vocab(args.vocab_dir) index2char, char2index = utils.load_char(args.char_dir) x_vector = utils.vectorize(x, word2index, verbose=False) x_vector = np.array(x_vector) char_vector = utils.vectorize_char(x_ch, char2index, verbose=False) char_vector = np.array(char_vector) print char_vector[0] save_dir = os.path.join(args.save_dir, args.model_type) if not os.path.exists(save_dir): os.makedirs(save_dir) if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]: max_step = args.max_step_cnn max_size = args.max_size_cnn nb_epochs = args.nb_epochs_cnn elif args.model_type in [ "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn" ]: max_step = args.max_step_rnn max_size = args.max_size_rnn nb_epochs = args.nb_epochs_rnn ex_features = add_features("../data/train.csv") nfolds = args.nfolds skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018) test_prob = [] stack_logits = np.zeros((len(x_vector), len(target_cols))) for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)): x_train, x_eval = x_vector[train_index], x_vector[test_index] char_train, char_eval = char_vector[train_index], char_vector[ test_index] y_train, y_eval = y[train_index], y[test_index] with tf.Graph().as_default(): config_proto = utils.get_config_proto() sess = tf.Session(config=config_proto) if args.model_type == "cnn": model = 
TextCNN(args, "TextCNN") elif args.model_type == "cnnfe": model = TextCNNFE(args, "TextCNNFE") elif args.model_type == "rnn": model = TextRNN(args, "TextRNN") elif args.model_type == "rnnfe": model = TextRNNFE(args, "TextRNNFE") elif args.model_type == "rcnn": model = TextRCNN(args, "TextRCNN") elif args.model_type == "attention": model = RNNWithAttention(args, "Attention") elif args.model_type == "chrnn": model = TextRNNChar(args, "TextRNNChar") elif args.model_type == "chcnn": model = TextCNNChar(args, "TextCNNChar") elif args.model_type == "chcnn2": model = TextCNNChar(args, "TextCNNChar2") elif args.model_type == "rnnfe2": model = TextRNNFE2(args, "TextCNNCharFE2") elif args.model_type == "chrnnfe": model = TextRNNCharFE(args, "TextCNNCharFE") else: raise ValueError("Unknown model_type %s" % args.model_type) sess.run(tf.global_variables_initializer()) if args.use_ft: pretrain_dir = args.ft_dir print "use FastText word vector" embedding = utils.load_fasttext(pretrain_dir, index2word) if not args.use_ft: pretrain_dir = args.glove_dir print "use Glove word vector" embedding = utils.load_glove(pretrain_dir, index2word) sess.run(model.embedding_init, {model.embedding_placeholder: embedding}) for line in model.tvars: print line print "training %s model for toxic comments classification" % ( args.model_type) print "%d fold start training" % f for epoch in range(1, nb_epochs + 1): print "epoch %d start with lr %f" % ( epoch, model.learning_rate.eval(session=sess)), "\n", "- " * 50 loss, total_comments = 0.0, 0 if args.model_type in ["cnn", "rnn", "rcnn"]: train_batch = utils.get_batches(x_train, y_train, args.batch_size, args.max_len) valid_batch = utils.get_batches(x_eval, y_eval, max_size, args.max_len, False) elif args.model_type in ["chrnn", "chcnn", "chcnn2"]: train_batch = utils.get_batches_with_char( x_train, char_train, y_train, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_char( x_eval, char_eval, y_eval, max_size, args.max_len, False) 
elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]: train_batch = utils.get_batches_with_fe( x_train, y_train, ex_features, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_fe( x_eval, y_eval, ex_features, max_size, args.max_len, False) elif args.model_type in ["chrnnfe"]: train_batch = utils.get_batches_with_charfe( x_train, char_train, y_train, ex_features, args.batch_size, args.max_len) valid_batch = utils.get_batches_with_charfe( x_eval, char_eval, y_eval, ex_features, max_size, args.max_len, False) epoch_start_time = time.time() step_start_time = epoch_start_time for idx, batch in enumerate(train_batch): if args.model_type in ["cnn", "rnn", "rcnn"]: comments, comments_length, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, labels) elif args.model_type in ["chrnn", "chcnn", "chcnn2"]: comments, comments_length, chs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, chs, labels) elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]: comments, comments_length, exs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, labels, exs) elif args.model_type in ["chrnnfe"]: comments, comments_length, chs, exs, labels = batch _, loss_t, global_step, batch_size = model.train( sess, comments, comments_length, chs, labels, exs) loss += loss_t * batch_size total_comments += batch_size if global_step % 200 == 0: print "epoch %d step %d loss %f time %.2fs" % ( epoch, global_step, loss_t, time.time() - step_start_time) if global_step % 200 == 0: _ = run_valid(valid_batch, model, sess, args.model_type) # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step) step_start_time = time.time() epoch_time = time.time() - epoch_start_time sess.run(model.learning_rate_decay_op) print "%.2f seconds in this epoch with train loss %f" % ( epoch_time, loss / total_comments) 
test_prob.append(run_test(args, model, sess)) stack_logits[test_index] = run_valid(valid_batch, model, sess, args.model_type) preds = np.zeros((test_prob[0].shape[0], len(target_cols))) for prob in test_prob: preds += prob print prob[0] preds /= len(test_prob) print len(test_prob) write_predict(stack_logits, args.model_type) write_results(preds, args.model_type)
# -*- coding: utf-8 -*- """ Created on 2020-07-19 01:32 @Author : Justin Jiang @Email : [email protected] """ import tensorflow.keras as keras import numpy as np from sklearn import metrics import os from preprocess import preprocesser from config import Config from model import TextCNN if __name__ == '__main__': CNN_model = TextCNN() CNN_model.train(3) CNN_model.test()
def train(config): try: split = config["split"] data_path = config["data_path"] pretrained_model_dir = config["pretrained_model_dir"] pretrained_model_file = config["pretrained_model_file"] last_model_path = config["last_model_path"] save_to = config["save_to"] min_freq = config["min_freq"] batch_size = config["batch_size"] max_sent_length = config["max_sent_length"] embed_dim = config["embed_dim"] filter_num = config["filter_num"] filter_widths = config["filter_widths"] learning_rate = config["learning_rate"] patience = config["patience"] lr_decay = config["lr_decay"] max_num_trial = config["max_num_trial"] max_epoch = config["max_epoch"] save_every = config["save_every"] cuda = config["cuda"] debug = config["debug"] except KeyError: print("Input Parameter Error") exit(1) if not Path(save_to).exists(): Path(save_to).mkdir() device = torch.device("cuda:0" if ( torch.cuda.is_available() and cuda) else "cpu") # build torchtext field TEXT = torchtext.data.Field(tokenize='spacy', lower=True) LABEL = torchtext.data.Field(dtype=torch.long) train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path) if debug: train_data, val_data = train_data.split(split_ratio=0.1) train_data, val_data = train_data.split(split_ratio=0.7) train_iter, val_iter = torchtext.data.Iterator.splits( (train_data, val_data), batch_size=batch_size, device=device) if (pretrained_model_file is not None) and (pretrained_model_dir is not None): pretrained_vector = Vectors(name=pretrained_model_file, cache=pretrained_model_dir) TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector) LABEL.build_vocab(train_data) logging.info("saving TEXT/LABEL vocabulary...") with open(f"{save_to}/TEXT_vocab.bin", "wb") as f: dill.dump(TEXT, f) with open(f"{save_to}/LABEL_vocab.bin", "wb") as f: dill.dump(LABEL, f) assert embed_dim == TEXT.vocab.vectors.shape[ -1], "incompatiable embeddings" embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab) model = TextCNN(embed_num, embed_dim, class_num, 
filter_num, filter_widths, from_pretrained=TEXT.vocab.vectors).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor( [0, 0, 1.0, 1.0], device=device)) # class [<unk>,<pad>,'pos','neg'] if last_model_path is not None: # load model logging.info(f'load model from {last_model_path}') params = torch.load(last_model_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) logging.info('restore parameters of the optimizers') optimizer.load_state_dict(torch.load(last_model_path + '.optim')) model.train() epoch = 0 cur_trial = 0 hist_valid_scores = [] train_time = begin_time = time.time() logging.info("begin training!") while True: epoch += 1 train_loss = 0 cum_cnt = 0 step = 0 for batch in iter(train_iter): feature, target = batch.text.T, batch.label.squeeze(0) step += 1 optimizer.zero_grad() res = model(feature) loss = cross_entropy(res, target) train_loss += loss loss.backward() optimizer.step() train_loss = train_loss / step val_loss, accuracy = evaluate(model, val_iter, cross_entropy) logging.info( f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy} speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s' ) train_time = time.time() is_better = len( hist_valid_scores) == 0 or val_loss < min(hist_valid_scores) hist_valid_scores.append(val_loss) if epoch % save_every == 0: model.save(f"{save_to}/model_step_{epoch}") torch.save(optimizer.state_dict(), f"{save_to}/model_step_{epoch}.optim") if is_better: cur_patience = 0 model_save_path = f"{save_to}/model_best" print(f'save currently the best model to [{model_save_path}]') model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif cur_patience < patience: cur_patience += 1 print('hit patience %d' % cur_patience) if cur_patience == patience: cur_trial += 1 print(f'hit 
#{cur_trial} trial') if cur_trial == max_num_trial: print('early stop!') exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * lr_decay logging.info( f'load previously best model and decay learning rate to {lr}' ) # load model params = torch.load(model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) logging.info('restore parameters of the optimizers') optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience cur_patience = 0 if epoch == max_epoch: print('reached maximum number of epochs!') exit(0)
criterion = torch.nn.CrossEntropyLoss() # 训练 t0 = time() nb_epoch = opts.nb_epoch max_patience = opts.max_patience current_patience = 0 root_model = opts.root_model if not os.path.exists(root_model): os.makedirs(root_model) path_model = os.path.join(root_model, 'textcnn.model') best_dev_loss = 1000. for epoch in range(nb_epoch): sys.stdout.write('epoch {0} / {1}: \r'.format(epoch, nb_epoch)) total_loss, dev_loss = 0., 0. text_cnn.train() current_count = 0 for i_batch, sample_batched in enumerate(data_loader_train): optimizer.zero_grad() data = Variable(sample_batched['data']) label = Variable(sample_batched['label']) if use_cuda: data = data.cuda() label = label.cuda() target = text_cnn(data) loss = criterion(target, label) loss.backward() optimizer.step() total_loss += loss.data[0]
    def train(self):
        """Cross-validated training: for each fold, build a fresh TextCNN
        (rand/static/non-static/multichannel embedding modes), train with
        Adadelta plus an L2 max-norm constraint on the final layer, evaluate
        the fold, and checkpoint the best model by validation loss."""
        best_valid_loss = 1e9
        all_valid_loss, all_valid_acc = 0, 0
        # CV loop
        for i in range(self.args.cv_num):
            model = TextCNN(self.vocab_size, self.pad_idx,
                            self.args).to(device)
            # model variations (cf. "rand" is default value)
            if self.args.mode == "static":
                # frozen pre-trained embeddings
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
            elif self.args.mode == "non-static":
                # trainable embeddings initialized from the pre-trained matrix
                model.static_embedding.data.normal_(0, 1)
                model.static_embedding.weight.data.copy_(self.embeddings)
            elif self.args.mode == "multichannel":
                # frozen static channel + trainable non-static channel
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
                model.nonstatic_embedding.weight.data.copy_(self.embeddings)
            optimizer = optim.Adadelta(model.parameters())
            model.train()
            # generate train dataset: fold i is held out as the test set
            print(f'>>> {i+1}th dataset is testset')  ## ??
            dataset = self.dataset_list.copy()
            del dataset[i]  # remove testset
            dataset = functools.reduce(
                lambda x, y: x + y,
                dataset)  # Concatenate datasets consecutively.
            data_loader = DataLoader(dataset=dataset,
                                     batch_size=self.args.batch_size,
                                     shuffle=True,
                                     collate_fn=self.collate_fn)
            for epoch in range(self.args.epochs):  # Epoch loop
                pbar = tqdm(data_loader)
                for text, label in pbar:
                    text = text.to(device)
                    label = label.to(device)
                    optimizer.zero_grad()
                    predictions = model(text).squeeze(1)
                    loss = self.criterion(predictions, label)
                    acc = self._binary_accuracy(predictions, label)
                    loss.backward()
                    optimizer.step()
                    # max_norm_scaling: rescale the classifier weights so
                    # their L2 norm stays within args.l2_constraint.
                    eps = 1e-7
                    param = model.fc.weight
                    norm = torch.norm(param)  # l2_norm
                    if norm > self.args.l2_constraint:
                        param.data *= self.args.l2_constraint / (eps + norm)
                    pbar.set_description(
                        f"loss : {loss.item():.4f}, acc : {acc.item():.4f}")
            valid_loss, valid_acc = self.evaluate(model, i)
            all_valid_loss += valid_loss.item()
            all_valid_acc += valid_acc.item()
            print(
                f'valid loss : {valid_loss.item():.3f}, valid acc : {valid_acc.item():.3f}'
            )
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(
                    model.state_dict(),
                    osp.join(self.args.ck_path, f'{self.args.name}_best.pt'))
            # When cross-validation is disabled, stop after the first fold.
            if not self.args.cv:
                return
        print()
        print(f'Final loss : {all_valid_loss / self.args.cv_num:.3f}')
        print(f'Final acc : {all_valid_acc / self.args.cv_num:.3f}')