class Predictor():
    def __init__(self, args):
        """
        model: the trained model
        word2id: mapping from word to id
        id2label: mapping from id to label
        """
        self.args = args
        # Strip every character outside the CJK unified ideograph range.
        self.rule = re.compile(r"[^\u4e00-\u9fa5]")
        self.cut = word_tokenize
        with open(self.args.vocab + '/' + 'word2id.pkl', 'rb') as f:
            print('Loading word2id...')
            self.word2id = pickle.load(f)
        with open(self.args.vocab + '/' + 'id2label.pkl', 'rb') as f:
            print('Loading id2label...')
            self.id2label = pickle.load(f)
        self.args.embed_num = len(self.word2id)
        self.args.class_num = len(self.id2label)
        print_parameters(self.args)
        self.model = TextCNN(args)
        if self.args.snapshot is not None:
            print('\nLoading model from %s...' % self.args.snapshot)
            self.model.load_state_dict(torch.load(self.args.snapshot))
        if self.args.cuda:
            torch.cuda.set_device(self.args.device)
            self.model = self.model.cuda()
        self.model.eval()

    def predict(self, text):
        """
        Predict the label of a single text.
        :param text: a single piece of text
        :return: the predicted label, or 'UNK' on failure
        """
        try:
            text = self.rule.sub('', text)
            text = self.cut(text)
            # Map unknown words (and NUL padding) to id 0.
            x = Variable(
                torch.LongTensor([[
                    self.word2id[word]
                    if word != '\x00' and word in self.word2id else 0
                    for word in text
                ]]))
            if self.args.cuda:
                x = x.cuda()
            output = self.model(x)
            # prob = F.softmax(output, 1)
            # _, predicted = torch.max(prob, 1)
            _, predicted = torch.max(output, 1)
            return self.id2label[predicted.item()]
        except Exception:
            return 'UNK'
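# A hedged usage sketch for Predictor. The Namespace fields mirror what
# __init__ reads (vocab, snapshot, cuda, device); TextCNN(args) will also
# expect its own hyperparameter fields, omitted here. All paths are
# hypothetical placeholders, not the project's actual layout.
from argparse import Namespace

args = Namespace(vocab='./vocab', snapshot='./snapshots/best.pt',
                 cuda=False, device=0)
predictor = Predictor(args)
print(predictor.predict('这是一条测试文本'))  # prints a label, or 'UNK' on failure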
def get_model(model, Title):
    embedding_dim = len(Title.vocab.vectors[0])
    embedding_weight = Title.vocab.vectors
    if model == 'TextCNN':
        model = TextCNN.TextCNN(max_length=20,
                                vocab_size=len(Title.vocab),
                                embedding_dim=embedding_dim,
                                embedding_weight=embedding_weight,
                                output_size=10)
    elif model == 'LSTM':
        model = LSTM.LSTM(vocab_size=len(Title.vocab),
                          embedding_dim=embedding_dim,
                          embedding_weight=embedding_weight,
                          hidden_size=100,
                          num_layers=4,
                          output_size=10)
    elif model == 'BiLSTM_Attention':
        model = BiLSTM_Attention.BiLSTM_Attention(
            vocab_size=len(Title.vocab),
            embedding_dim=embedding_dim,
            embedding_weight=embedding_weight,
            hidden_size=100,
            num_layers=4,
            output_size=10)
    else:
        # Fall back to fastText for any unrecognized model name.
        model = Fasttext.fasttext(vocab_size=len(Title.vocab),
                                  embedding_dim=embedding_dim,
                                  embedding_weight=embedding_weight,
                                  hidden_size=100,
                                  output_size=10)
    return model
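# A hedged usage sketch. get_model only touches Title.vocab (its length and
# its .vectors matrix), so a minimal duck-typed stand-in is enough to exercise
# the factory; in the real project Title is a torchtext Field on which
# build_vocab(..., vectors=...) has already been called.
import torch

class _Vocab:
    """Stand-in for Title.vocab; only what get_model touches."""
    def __init__(self, n_words=5000, dim=100):
        self.vectors = torch.randn(n_words, dim)
    def __len__(self):
        return self.vectors.size(0)

class _Field:
    def __init__(self):
        self.vocab = _Vocab()

model = get_model('LSTM', _Field())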
def train():
    x = tf.placeholder(tf.int32, [None, None], name='x')
    y = tf.placeholder(tf.int32, [None], name='y')
    # Feed the learning rate through a placeholder so it can actually be
    # decayed; a plain Python float baked into AdamOptimizer at graph build
    # time would never change.
    lr = tf.placeholder(tf.float32, [], name='lr')
    embedding = tf.Variable(embedding_table, dtype=tf.float32, trainable=False)
    # embedding = tf.Variable(tf.random_uniform([TextCNN.VOCAB_SIZE, TextCNN.EMBED_FEATURE], -1.0, 1.0))
    input = tf.nn.embedding_lookup(embedding, x)
    model = TextCNN.TextCNN()
    logits_train = model.inference(input, Training=True)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_train,
                                                          labels=y,
                                                          name='loss')
    loss_ = tf.reduce_mean(loss) + tf.nn.l2_loss(model.fc.get_weights()[0])
    train_op = tf.train.AdamOptimizer(lr).minimize(loss_)

    logits = model.inference(input)
    correct_pred = tf.equal(tf.argmax(logits, axis=1), tf.cast(y, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))
    sum_correct_pred = tf.reduce_sum(tf.cast(correct_pred, dtype=tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        current_lr = TextCNN.INIT_LEARNING_RATE
        for epoch in range(TextCNN.EPOCH):
            for step, (x_, y_) in enumerate(
                    data_helper.batch_iter(x_train, y_train,
                                           TextCNN.BATCH_SIZE)):
                sess.run(train_op, feed_dict={x: x_, y: y_, lr: current_lr})
                if step % 64 == 0:
                    print('epoch :', epoch, 'step :', step, ' train_acc = ',
                          sess.run(accuracy, feed_dict={x: x_, y: y_}))
            # Evaluate on the full test set after every epoch.
            sum_ = 0
            for (x__, y__) in data_helper.batch_iter(x_test, y_test,
                                                     TextCNN.BATCH_SIZE):
                sum_ += sess.run(sum_correct_pred, feed_dict={x: x__, y: y__})
            print('epoch ', epoch, 'acc = ', sum_ / len(y_test))
            # Halve the learning rate every 30 epochs (skip epoch 0).
            if epoch > 0 and epoch % 30 == 0:
                current_lr /= 2
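# train() assumes data_helper.batch_iter yields aligned (x, y) batches; a
# minimal compatible generator sketch, not the project's actual helper:
import numpy as np

def batch_iter(x, y, batch_size, shuffle=True):
    """Yield (x_batch, y_batch) slices of equal-length arrays x and y."""
    indices = np.arange(len(y))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(y), batch_size):
        chosen = indices[start:start + batch_size]
        yield x[chosen], y[chosen]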
    # Tail of a batch-construction helper; its definition is elided in the source.
    x_batch.extend([x_char, x_char_pad_idx])
    return x_batch, y_batch


with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(char_ngram_vocab_size=len(ngrams_dict) + 1,
                      word_ngram_vocab_size=len(words_dict) + 1,
                      char_vocab_size=len(chars_dict) + 1,
                      embedding_size=FLAGS["model.emb_dim"],
                      word_seq_len=FLAGS["data.max_len_words"],
                      char_seq_len=FLAGS["data.max_len_chars"],
                      l2_reg_lambda=FLAGS["train.l2_reg_lambda"],
                      mode=FLAGS["model.emb_mode"],
                      filter_sizes=list(
                          map(int, FLAGS["model.filter_sizes"].split(","))))

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS["train.lr"])
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        print("Writing to {}\n".format(FLAGS["log.output_dir"]))
        if not os.path.exists(FLAGS["log.output_dir"]):
            os.makedirs(FLAGS["log.output_dir"])
def __init__(self,
             file_config,
             config=None,
             model=None,
             corpus=None,
             verbose=True,
             opt_param=None):
    print('Loading data...')
    self.verbose = verbose
    if config is None:
        config = TCNNConfig()
    self.config = config
    self.model_file = "{}epc{}lr{}".format(
        datetime.datetime.now().strftime("%d%m%Y-%H%M%S"),
        self.config.num_epochs, self.config.learning_rate)
    self.file_config = file_config
    if corpus is None:
        corpus = TwitterHashtagCorpus(file_config.train_file,
                                      file_config.vocab_file,
                                      self.config.dev_split,
                                      self.config.seq_length,
                                      self.config.vocab_size)
    # DOING: output-path wiring still in progress.
    self.file_config.model_file = '{}/{}'.format(self.file_config.save_path,
                                                 self.model_file)
    self.file_config.results_train = '{}/{}.epochs'.format(
        self.file_config.result_path, self.model_file)
    self.file_config.results_train_file = None

    self.corpus = corpus
    self.config.vocab_size = len(self.corpus.words)
    self.config.target_names = self.corpus.label_to_id.keys()
    self.config.num_classes = len(self.corpus.label_to_id)
    self.train_data = TensorDataset(torch.LongTensor(self.corpus.x_train),
                                    torch.LongTensor(self.corpus.y_train))
    self.validation_data = TensorDataset(
        torch.LongTensor(self.corpus.x_validation),
        torch.LongTensor(self.corpus.y_validation))
    if corpus.x_test is not None:
        self.test_data = TensorDataset(torch.LongTensor(self.corpus.x_test),
                                       torch.LongTensor(self.corpus.y_test))

    print('Configuring CNN model...')
    if model is None:
        model = TextCNN(self.config)
    self.model = model
    if opt_param is None:
        opt_param = self.model.parameters()
    if self.verbose:
        print(self.corpus)
        print(self.model)
    if use_cuda:
        self.model.cuda()

    # Optimizer and loss function. Summing rather than averaging the loss
    # preserves the original (deprecated) size_average=False behaviour.
    self.criterion = nn.CrossEntropyLoss(
        reduction='sum')  # alternative: nn.MultiLabelSoftMarginLoss()
    self.optimizer = optim.Adam(opt_param, lr=self.config.learning_rate)
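# A hedged construction sketch. The class owning the __init__ above is not
# shown in the source, so `Trainer` is a hypothetical stand-in name, and all
# paths are placeholders.
from types import SimpleNamespace

file_config = SimpleNamespace(train_file='data/train.txt',
                              vocab_file='data/vocab.txt',
                              save_path='checkpoints',
                              result_path='results')
trainer = Trainer(file_config)  # hypothetical enclosing class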
def train(self, bagging_iter):
    for model_idx in range(FLAGS.num_models):
        model_idx += 1
        start_time = time.time()
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto()
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            f_model_path = '../data/f_model/%sth_f_model/%sth_f_model.dat' % (
                bagging_iter, model_idx)
            # Pickled files must be opened in binary mode.
            with open(f_model_path, 'rb') as f:
                model = pickle.load(f)
            with sess.as_default():
                cnn = mycnn.DepCNNv6(model_idx=model_idx,
                                     num_classes=model.type_size,
                                     vocab_size=model.mor_size + 1,
                                     pos_size=model.pos_size + 1,
                                     hc_size=model.hc_feature_size,
                                     embedding_size=FLAGS.embedding_dim,
                                     mlp_size=FLAGS.mlp_size,
                                     l2_reg_lambda=FLAGS.l2_reg_lambda)

                # Define the training procedure.
                optimizer = tf.train.AdamOptimizer(0.0001)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars)
                saver = tf.train.Saver(tf.global_variables(),
                                       max_to_keep=FLAGS.num_checkpoints)

                # Initialize all variables.
                sess.run(tf.global_variables_initializer())

                out_path = '../data/training_data/' + str(bagging_iter) + 'th_out'
                sample_list = dt.make_sample_list_from_input_data(
                    out_path + '/train' + str(model_idx) + '.out', model)
                # Integer division: number of full batches per epoch.
                total_batch = len(sample_list) // FLAGS.batch_size
                print('total_batch = ' + str(total_batch))

                for ep in range(FLAGS.num_epochs):
                    # Training phase: shuffle the samples, then consume them
                    # one batch at a time.
                    random.shuffle(sample_list)
                    batch_number = 0
                    while True:
                        sample_batch = sample_list[
                            batch_number * FLAGS.batch_size:
                            (batch_number + 1) * FLAGS.batch_size]
                        batch_number += 1
                        # Skip empty batches before converting them.
                        if sample_batch == []:
                            if batch_number * FLAGS.batch_size > len(sample_list):
                                break
                            continue
                        x_mor, x_pos, x_left_mor, x_left_pos, x_right_mor, x_right_pos, \
                            x_position_mark_batch, x_child_mor, x_child_pos, x_hc_batch, y_batch \
                            = dt.convert_to_input_vector(sample_batch, model)
                        self.train_step(cnn, sess, train_op, x_mor, x_pos,
                                        x_child_mor, x_child_pos, x_hc_batch,
                                        y_batch)
                        if batch_number % max(1, total_batch // 50) == 0:
                            print('.', end='')
                        if batch_number * FLAGS.batch_size > len(sample_list):
                            break

                    # Test phase.
                    total_arc = 0
                    correct_arc = 0
                    correct_sentence = 0
                    total_sentence = 0
                    correct_arc_with_tag = 0
                    correct_sentence_with_tag = 0
                    cr_test = dt.CorpusReader()
                    cr_test.set_file(
                        '../data/raw_data/valid_data/sejong_test_edit_VV.txt')
                    parser = tp.TransitionParser(model, 1)
                    while True:
                        data = cr_test.get_next(1)
                        if data == []:
                            break
                        if data[0].raw_sentence is None:
                            continue
                        parser.initialize(data)
                        while parser.is_final_state() is False:
                            left_mor, left_pos, right_mor, right_pos, child_mor, child_pos, hc = \
                                parser.make_input_vector(data, mode='train')
                            x_mor = left_mor + right_mor
                            x_pos = left_pos + right_pos
                            hc = model.convert_to_zero_one(
                                hc, model.hc_feature_size, mode='train')
                            next_action = self.test_step(
                                cnn, graph, sess, np.array([x_mor]),
                                np.array([x_pos]), np.array([child_mor]),
                                np.array([child_pos]), np.array([hc]),
                                model_idx)
                            next_action = next_action[0][0]
                            parser.run_action(next_action, model, mode='train')

                        # Evaluate against the gold dependencies.
                        predicts = parser.get_result('train')
                        golds = data[0].correct_dep_list
                        sentence_flag = True
                        sentence_with_tag_flag = True
                        for i in range(len(predicts)):
                            if predicts[i].head == golds[i].head:
                                correct_arc += 1
                                if predicts[i].type == golds[i].type:
                                    correct_arc_with_tag += 1
                                else:
                                    sentence_with_tag_flag = False
                            else:
                                sentence_flag = False
                                sentence_with_tag_flag = False
                            total_arc += 1
                        if sentence_flag is True:
                            correct_sentence += 1
                        if sentence_with_tag_flag is True:
                            correct_sentence_with_tag += 1
                        total_sentence += 1
                        if (total_sentence % 300) == 0:
                            print('.', end='')
                    cr_test.close_file()

                    with open('../data/log/' + str(bagging_iter) +
                              'th_bagging_model.txt', 'a') as f:
                        if ep == 0:
                            print('%sth_total_arc = %s %sth_total_sentence = %s'
                                  % (model_idx, total_arc, model_idx,
                                     total_sentence))
                            f.write(
                                '%sth_total_arc = %s %sth_total_sentence = %s\n'
                                % (model_idx, total_arc, model_idx,
                                   total_sentence))
                        if not os.path.isdir('../data/ckpt/%sth_ckpt' %
                                             (bagging_iter)):
                            os.mkdir('../data/ckpt/%sth_ckpt' % (bagging_iter))
                        if (ep % FLAGS.num_checkpoints) == 0:
                            path = saver.save(
                                sess, '../data/ckpt/%sth_ckpt/%sth_%s' %
                                (bagging_iter, model_idx, ep))
                            f.write('\n')
                            f.write("Saved model checkpoint to {}\n".format(path))
                            print("Saved model checkpoint to {}\n".format(path))
                        f.write(
                            str(model_idx) + 'th model result : ' +
                            'epoch = ' + str(ep + 1) + ', acc = ' +
                            str(correct_arc / float(total_arc)) +
                            ', sen_acc = ' +
                            str(correct_sentence / float(total_sentence)) +
                            ', acc_with_tag = ' +
                            str(correct_arc_with_tag / float(total_arc)) +
                            ', sen_acc_with_tag = ' +
                            str(correct_sentence_with_tag / float(total_sentence)))
                        print(
                            str(model_idx) + 'th model result : ',
                            'epoch = ' + str(ep + 1) + ', acc = ' +
                            str(correct_arc / float(total_arc)) +
                            ', sen_acc = ' +
                            str(correct_sentence / float(total_sentence)) +
                            ', acc_with_tag = ' +
                            str(correct_arc_with_tag / float(total_arc)) +
                            ', sen_acc_with_tag = ' +
                            str(correct_sentence_with_tag / float(total_sentence)))
            print(
                str(bagging_iter) + '_bagging ',
                str(model_idx) + 'th training time : ',
                str(time.time() - start_time))
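# A hedged sketch of the tf.flags this trainer reads. The flag names all
# appear in train() above; every default value here is a hypothetical
# placeholder, not the project's actual configuration.
tf.flags.DEFINE_integer('num_models', 5, 'models per bagging iteration')
tf.flags.DEFINE_integer('num_epochs', 20, 'training epochs per model')
tf.flags.DEFINE_integer('batch_size', 64, 'samples per mini-batch')
tf.flags.DEFINE_integer('embedding_dim', 100, 'embedding dimensionality')
tf.flags.DEFINE_integer('mlp_size', 200, 'hidden size of the MLP layer')
tf.flags.DEFINE_float('l2_reg_lambda', 0.0, 'L2 regularization strength')
tf.flags.DEFINE_integer('num_checkpoints', 5, 'checkpoint keep/save interval')
FLAGS = tf.flags.FLAGS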
# Training
# ==================================================
max_acc = 0.0
with tf.Graph().as_default():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement,
        gpu_options=gpu_options)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=length,  # remember to change this when switching between train and test
            num_classes=1999,
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters)

        # Define the training procedure.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # Keep track of gradient values and sparsity (optional).
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram(
                    "{}/grad/hist".format(v.name), g)  # call truncated in the source; completed with the conventional arguments
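                # Continuation sketch (the source cuts off above): collect each
                # histogram, then merge and write the summaries TF1-style. The
                # summary directory name is a hypothetical placeholder.
                grad_summaries.append(grad_hist_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)
        summary_writer = tf.summary.FileWriter('runs/summaries', sess.graph)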
def train_TextCNN(subject):
    print('Reading Data')
    root = roots[subject]
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    common_texts = dataset['item'].tolist()

    print('Cleaning Data')
    common_texts, word2id, valid_words = filter_pad_words(
        common_texts, max_feature)
    id2word = dict(zip(word2id.values(), word2id.keys()))
    origin_texts = [[id2word[ind] for ind in sentence]
                    for sentence in common_texts]

    print('Training Word2Vec')
    model = Word2Vec(
        origin_texts,
        size=embedding_size,
        min_count=1,  # this min_count is also used to select words in utils.clean_sentence
        workers=3,
        window=5,
        iter=3)

    print('Feeding weights')
    fixed = np.zeros((len(word2id), embedding_size))
    for word, ind in word2id.items():
        fixed[ind] = np.array(model.wv[word])
    fixed = torch.from_numpy(fixed).float()
    Network = TextCNN(fixed, window_size_list, len(word2id), num_topics,
                      len(word2id) - 1, dropout_rate,
                      embedding_size).to(device)
    optimizer = optim.Adam(Network.parameters(), lr_schedule[0])

    print('Creating training/testing set')
    label2id = dict(zip(dataset['label'].unique(), range(num_topics)))
    id2label = dict(zip(label2id.values(), label2id.keys()))
    X = np.array(common_texts)
    y = np.array([label2id[label]
                  for label in dataset['label']]).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=101)
    X_train = torch.tensor(X_train).long()
    y_train = torch.tensor(y_train).long()
    X_test = torch.tensor(X_test).long()
    y_test = torch.tensor(y_test).long()
    train = TensorDataset(X_train, y_train)
    test = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train, 64, True)
    test_loader = DataLoader(test, 64, False)

    print('Training\n')
    # NLLLoss expects log-probabilities, so TextCNN's forward pass is assumed
    # to end in log_softmax.
    criterion = nn.NLLLoss()
    Network = Network.to(device)
    Network.train()
    for i in range(1, epoch + 1):
        log = []
        for X_sample, y_sample in iter(train_loader):
            X_sample = X_sample.to(device)
            y_sample = y_sample.view(-1).to(device)
            logits = Network(X_sample)
            loss = criterion(logits, y_sample)
            log.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {}. Average loss {:.4f}'.format(i, np.mean(log)))
        if i in lr_schedule:
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_schedule[i]

    print('\nTesting\n')
    predictions = []
    Network.eval()
    with torch.no_grad():
        for X_sample, _ in iter(test_loader):
            X_sample = X_sample.to(device)
            logits = Network(X_sample)
            _, index = logits.topk(1, 1)
            index = index.view(-1).cpu().numpy().tolist()
            predictions += index
    y_test = y_test.reshape(-1).tolist()
    y_test = [id2label[ind] for ind in y_test]
    predictions = [id2label[ind] for ind in predictions]
    print('\nTest result for {} :'.format(subject))
    print(classification_report(y_test, predictions))
    # Return the trained network instance, not the class.
    return Network
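# The loop above feeds Network outputs straight into nn.NLLLoss, which
# expects log-probabilities. A minimal self-contained check of the identity
# this relies on: cross_entropy(logits) == nll_loss(log_softmax(logits)).
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)            # raw scores: 4 samples, 10 classes
target = torch.tensor([1, 0, 3, 9])
assert torch.allclose(F.cross_entropy(logits, target),
                      F.nll_loss(F.log_softmax(logits, dim=1), target))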
        # Tail of a train/dev step helper; its definition is elided in the source.
        feed_dict)
    return step, loss, acc


with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(char_ngram_vocab_size=len(ngrams_dict) + 1,
                      word_ngram_vocab_size=len(words_dict) + 1,
                      char_vocab_size=len(chars_dict) + 1,
                      embedding_size=FLAGS.EMB_DIM,
                      word_seq_len=FLAGS.MAX_LENGTH_WORDS,
                      char_seq_len=FLAGS.MAX_LENGTH_CHARS,
                      l2_reg_lambda=FLAGS.L2_REG_LAMBDA,
                      mode=FLAGS.EMB_MODE,
                      filter_sizes=list(
                          map(int, FLAGS.FILTER_SIZES.split(","))))

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.LR)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        print("Writing to {}\n".format(FLAGS.OUTPUT_DIR))
        if not os.path.exists(FLAGS.OUTPUT_DIR):
            os.makedirs(FLAGS.OUTPUT_DIR)
args = parser.parse_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'  # uncomment to force CPU

best_acc = 0
start_epoch = 0

train_data = dataloader.Myarticles(args.csvdir, args.article_dir,
                                   validation=False)
test_data = dataloader.Myarticles(args.csvdir, args.article_dir,
                                  validation=True)
train_loader = data.DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = data.DataLoader(test_data, batch_size=1, shuffle=False)

print('==> Loading Network structure..\n')
args.vocab_size = len(train_data.word2idx)
net = TextCNN.MultiCNNTextBNDeep(args.vocab_size, args.emb_dim,
                                 args.content_dim, args.pooling_dim,
                                 args.linear_dim, args.num_classes)
net = net.to(device)

print('==> Loading cuda...\n')
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                      weight_decay=1e-4)

savepath = './train/' + str(args.mname)
if not os.path.exists(savepath):
    os.makedirs(savepath)


def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
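# The body of train() is truncated in the source; what follows is a minimal
# continuation sketch consistent with the objects defined above (net,
# criterion, optimizer, train_loader, device), not the project's actual loop.
    correct, total = 0, 0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    print('train loss: %.3f | acc: %.3f' %
          (train_loss / (batch_idx + 1), correct / total))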
def run(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Pickled files must be opened in binary mode.
        with open('../data/f_model/%sth_f_model/%sth_f_model.dat' %
                  (self.bagging_iter, self.model_idx), 'rb') as f:
            model = pickle.load(f)
        self.cnn = mycnn.DepCNNv6(model_idx=self.model_idx,
                                  num_classes=model.type_size,
                                  vocab_size=model.mor_size + 1,
                                  pos_size=model.pos_size + 1,
                                  hc_size=model.hc_feature_size,
                                  embedding_size=FLAGS.embedding_dim,
                                  mlp_size=FLAGS.mlp_size,
                                  l2_reg_lambda=FLAGS.l2_reg_lambda)
        checkpoint_file = '../data/ckpt/%sth_ckpt/%sth_%s' % (
            self.bagging_iter, self.model_idx, int(FLAGS.num_epochs) - 1)

        cr_test = dt.CorpusReader()
        if self.bagging_iter == str(0) and self.mode != 'g_predict':
            cr_test.set_file(
                (self.input_path + '/test_%s.txt') % (self.file_idx))
        elif self.mode == 'g_predict':
            cr_test.set_file(self.input_path)
            if self.input_path == 'default':
                self.input_path = '../data/test_data/%sth_test_data'
                cr_test.set_file((self.input_path + '/test_%s.txt') %
                                 (self.bagging_iter, self.file_idx))
        elif self.input_path != 'default' and self.mode != 'g_predict':
            cr_test.set_file(
                (self.input_path + '/test_%s.txt') % (self.file_idx))

        session_conf = tf.ConfigProto()
        session_conf.gpu_options.allow_growth = True
        self.sess = tf.Session(config=session_conf)
        saver = tf.train.Saver()
        saver.restore(self.sess, checkpoint_file)

        data_idx = 0
        parser = tp.TransitionParser(model, FLAGS.batch_size)
        results = []
        while True:
            data = cr_test.get_next()
            data_idx += 1
            # Stop once the reader is exhausted; flush placeholder trees for
            # whatever the final read returned.
            parsing_trees = [0] * len(data) if data else []
            if not data or data[0].raw_sentence is None:
                self.file_write(parsing_trees)
                break
            for batch_idx in range(len(data)):
                parsing_trees[batch_idx] = {
                    'raw_sentence': data[batch_idx].raw_sentence,
                    'tree': [],
                    'eojeol_list': data[batch_idx].eojeol_list
                }
            parser.initialize(data)
            while parser.is_final_state() is False:
                features, hc = parser.make_input_vector(data)
                hc = model.convert_to_zero_one(hc, model.hc_feature_size)
                next_action = self.test_step(features, hc)
                next_action = next_action[0]
                parser.run_action(next_action, model)
            predicts = parser.get_result('test')
            for batch_idx, predict in enumerate(predicts):
                parsing_trees[batch_idx]['tree'] = predict
            results.append(parsing_trees)
            if data_idx % 20 == 1:
                print(str(self.file_idx) + 'th_file , ' +
                      str(self.model_idx) + 'th_model index, ' +
                      str(data_idx) + '_th batch')
            self.file_write(results)
            results = []
        self.file_write(results)