import os
import random

import numpy as np
import sentencepiece as spm
import tensorflow as tf

# project helpers; adjust these import paths to your own layout
from bert.tokenization import FullTokenizer                     # Google BERT's tokenization.py
from xlnet.prepro_utils import preprocess_text, encode_pieces   # XLNet's prepro_utils.py
from preprocess import get_tokenizer, post_processing


class Tuner(object):
    # Base class for fine-tuning; subclasses implement make_input() and tune().

    def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name="bert", model_save_path=None, vocab_fname=None,
                 eval_every=1000, batch_size=32, num_epochs=10,
                 dropout_keep_prob_rate=0.9, model_ckpt_path=None, sp_model_path=None):
        # configurations
        tf.logging.set_verbosity(tf.logging.INFO)
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)
        # define tokenizer
        if self.model_name == "bert":
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        elif self.model_name == "xlnet":
            sp = spm.SentencePieceProcessor()
            sp.Load(sp_model_path)
            self.tokenizer = sp
        else:
            self.tokenizer = get_tokenizer("mecab")
        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
            train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
            test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info("load tokenized corpus : " + tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r') as f1:
                for line in f1:
                    # "\u241E" (record-separator symbol) delimits tokens and label
                    tokens, label = line.strip().split("\u241E")
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            tf.logging.info("tokenize corpus : " + corpus_fname + " > " + tokenized_corpus_fname)
            with open(corpus_fname, 'r') as f2:
                next(f2)  # skip head line
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    if self.model_name == "bert":
                        tokens = self.tokenizer.tokenize(sentence)
                    elif self.model_name == "xlnet":
                        normalized_sentence = preprocess_text(sentence, lower=False)
                        tokens = encode_pieces(self.tokenizer, normalized_sentence,
                                               return_unicode=False, sample=False)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    # binarize the label: anything greater than 0 counts as positive
                    if int(label) > 0.5:
                        int_label = 1
                    else:
                        int_label = 0
                    data_set.append([tokens, int_label])
            # cache the tokenized corpus so later runs can skip tokenization
            with open(tokenized_corpus_fname, 'w') as f3:
                for tokens, label in data_set:
                    f3.writelines(' '.join(tokens) + "\u241E" + str(label) + "\n")
        return data_set, len(data_set)

    def train(self, sess, saver, global_step, output_feed):
        train_batches = self.get_batch(self.train_data, num_epochs=self.num_epochs,
                                       is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (global_step.eval(sess), checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1, is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label:
                    valid_pred += 1
        valid_score = valid_pred / valid_num_data
        tf.logging.info("valid loss %.4f valid score %.4f" % (valid_loss, valid_score))
        # keep a checkpoint whenever the validation accuracy improves
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + "/" + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def get_batch(self, data, num_epochs, is_training=True):
        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        # the final partial batch of each epoch is dropped
        num_batches_per_epoch = int((data_size - 1) / self.batch_size)
        if is_training:
            tf.logging.info("num_batches_per_epoch : " + str(num_batches_per_epoch))
        for epoch in range(num_epochs):
            # reshuffle the data at the start of every epoch
            idx = random.sample(range(data_size), data_size)
            data = np.array(data)[idx]
            for batch_num in range(num_batches_per_epoch):
                batch_sentences = []
                batch_labels = []
                start_index = batch_num * self.batch_size
                end_index = (batch_num + 1) * self.batch_size
                features = data[start_index:end_index]
                for feature in features:
                    sentence, label = feature
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)

    def make_input(self, sentences, labels, is_training):
        raise NotImplementedError

    def tune(self):
        raise NotImplementedError
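# --- usage sketch (not part of the original listing) -----------------------
# Tuner deliberately leaves make_input() and tune() abstract: get_batch()
# yields whatever make_input() returns, train() consumes bare feed dicts,
# and validation() consumes (feed_dict, labels) pairs. A minimal subclass
# might look like the following; the class name, max_seq_length parameter,
# and placeholder set are hypothetical assumptions, while
# FullTokenizer.convert_tokens_to_ids is a real BERT tokenizer call.
class BertTunerSketch(Tuner):

    def __init__(self, max_seq_length=128, **kwargs):
        super().__init__(model_name="bert", **kwargs)
        self.max_seq_length = max_seq_length
        # toy placeholders standing in for a real BERT classification graph;
        # self.logits and self.loss would come from the model definition
        self.input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="input_ids")
        self.input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name="input_mask")
        self.label_ids = tf.placeholder(tf.int32, [None], name="label_ids")

    def make_input(self, sentences, labels, is_training):
        input_ids, input_masks = [], []
        for tokens in sentences:
            # map wordpieces to vocab ids, truncate, then zero-pad to fixed length
            ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.max_seq_length]
            pad = self.max_seq_length - len(ids)
            input_ids.append(ids + [0] * pad)
            input_masks.append([1] * len(ids) + [0] * pad)
        input_feed = {
            self.input_ids: np.array(input_ids),
            self.input_mask: np.array(input_masks),
            self.label_ids: np.array(labels),
        }
        if is_training:
            # train() feeds this dict straight into sess.run(output_feed, ...)
            return input_feed
        # validation() expects (feed_dict, labels) pairs so it can score predictions
        return input_feed, labels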
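# --- corpus format sketch (illustrative) -----------------------------------
# load_or_tokenize_corpus() assumes a raw file with a header line followed by
# "sentence<RS>label" records, where RS is "\u241E" (the record-separator
# symbol); the cached tokenized file stores space-joined tokens in the same
# layout. The file name and example rows below are made up for illustration.
RS = "\u241E"

def write_toy_corpus(fname="toy_train.txt"):
    rows = [("document", "label"),        # header line, skipped via next(f2)
            ("이 영화 정말 재밌어요", "1"),  # positive review -> label 1
            ("지루하고 별로였어요", "0")]    # negative review -> label 0
    with open(fname, "w") as f:
        for sentence, label in rows:
            f.write(sentence + RS + label + "\n")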