Example #1
import os
import random

import numpy as np
import tensorflow as tf
import sentencepiece as spm

# The helper imports below are assumptions about the surrounding project layout
# (Google BERT tokenizer, XLNet preprocessing utilities, and a local preprocess
# module); adjust the module paths to match your repository.
from bert.tokenization import FullTokenizer
from xlnet.prepro_utils import preprocess_text, encode_pieces
from preprocess import get_tokenizer, post_processing


class Tuner(object):

    def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name="bert", model_save_path=None, vocab_fname=None, eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9, model_ckpt_path=None,
                 sp_model_path=None):
        # configurations
        tf.logging.set_verbosity(tf.logging.INFO)
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)
        # define tokenizer
        if self.model_name == "bert":
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        elif self.model_name == "xlnet":
            sp = spm.SentencePieceProcessor()
            sp.Load(sp_model_path)
            self.tokenizer = sp
        else:
            self.tokenizer = get_tokenizer("mecab")
        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info("load tokenized corpus : " + tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r') as f1:
                for line in f1:
                    tokens, label = line.strip().split("\u241E")
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            tf.logging.info("tokenize corpus : " + corpus_fname + " > " + tokenized_corpus_fname)
            with open(corpus_fname, 'r') as f2:
                next(f2)  # skip head line
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    if self.model_name == "bert":
                        tokens = self.tokenizer.tokenize(sentence)
                    elif self.model_name == "xlnet":
                        normalized_sentence = preprocess_text(sentence, lower=False)
                        tokens = encode_pieces(self.tokenizer, normalized_sentence, return_unicode=False, sample=False)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    if int(label) > 0.5:
                        int_label = 1
                    else:
                        int_label = 0
                    data_set.append([tokens, int_label])
            with open(tokenized_corpus_fname, 'w') as f3:
                for tokens, label in data_set:
                    f3.write(' '.join(tokens) + "\u241E" + str(label) + "\n")
        return data_set, len(data_set)
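
    # Note: each line of the tokenized corpus file written above holds one
    # example as the space-joined tokens, the U+241E record-separator
    # character, and a 0/1 label. An illustrative (made-up) line would be:
    #   token1 token2 token3␞1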

    def train(self, sess, saver, global_step, output_feed):
        train_batches = self.get_batch(self.train_data, num_epochs=self.num_epochs, is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (global_step.eval(sess), checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1, is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label:
                    valid_pred += 1
        valid_score = valid_pred / valid_num_data
        tf.logging.info("valid loss %.4f valid score %.4f" %
                        (valid_loss, valid_score))
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + "/" + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def get_batch(self, data, num_epochs, is_training=True):
        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        num_batches_per_epoch = int((data_size - 1) / self.batch_size) + 1
        if is_training:
            tf.logging.info("num_batches_per_epoch : " + str(num_batches_per_epoch))
        for epoch in range(num_epochs):
            idx = random.sample(range(data_size), data_size)
            data = np.array(data)[idx]
            for batch_num in range(num_batches_per_epoch):
                batch_sentences = []
                batch_labels = []
                start_index = batch_num * self.batch_size
                end_index = (batch_num + 1) * self.batch_size
                features = data[start_index:end_index]
                for feature in features:
                    sentence, label = feature
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)

    def make_input(self, sentences, labels, is_training):
        raise NotImplementedError

    def tune(self):
        raise NotImplementedError
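

A minimal sketch of how a concrete subclass could satisfy this interface, assuming hypothetical placeholder tensors (self.input_ids, self.label_ids, self.dropout_keep_prob) and a hypothetical convert_to_ids() helper that are not part of the original code; note that make_input is expected to return a feed dict while training and a (feed dict, labels) pair during evaluation, because validation() unpacks two values per batch:

class HypotheticalClassificationTuner(Tuner):

    def make_input(self, sentences, labels, is_training):
        # Convert the batch into a feed dict for sess.run(); the placeholder
        # attributes and convert_to_ids() below are assumptions about what
        # tune() would define, not part of the Tuner base class.
        input_feed = {
            self.input_ids: self.convert_to_ids(sentences),
            self.label_ids: labels,
            self.dropout_keep_prob: self.dropout_keep_prob_rate if is_training else 1.0,
        }
        if is_training:
            return input_feed
        # validation() iterates over (feed dict, labels) pairs when not training.
        return input_feed, labels

    def tune(self):
        # Build the model graph (defining self.logits and self.loss), create a
        # tf.Session and tf.train.Saver, then delegate the loop to self.train().
        ...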
Example #2
class Tuner(object):
    
    def __init__(self, train_corpus_fname=None,
                 tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name='bert', model_save_path=None, vocab_fname=None,
                 eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9,
                 model_ckpt_path=None):
        
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        
        #tokenizer defining
        if self.model_name == 'bert':
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        else:
            self.tokenizer = get_tokenizer('mecab')
            
        #load or tokenize corpus
        
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
    
    
    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info('load tokenized corpus : ' + tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r') as f1:
                for line in f1:
                    tokens, label = line.strip().split('\u241E')
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
                        
        else :
            with open(corpus_fname, 'r') as f2:
                next(f2) #skip head line
                for line in f2:
                    sentence, label = line.strip().split('\u241E')
                    if self.model_name == 'bert':
                        tokens = self.tokenizer.tokenize(sentence)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    
                    #labelling
                    if int(label) >=1:
                        int_label = 1
                    else:
                        int_label = 0
                    data_set.append([tokens, int_label])
            with open(tokenized_corpus_fname, 'w') as f3:
                for tokens, label in data_set:
                    f3.write(' '.join(tokens) + '\u241E' + str(label) + '\n')
        return data_set, len(data_set)
    
    
    def get_batch(self, data, num_epochs, is_training=True):
        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        num_batches_per_epoch = int((data_size - 1) / self.batch_size) + 1

        for epoch in range(num_epochs):
            if is_training:
                # shuffle the training data once per epoch
                idx = random.sample(range(data_size), data_size)
                data = np.array(data)[idx]
            else:
                data = np.array(data)
            for batch_num in range(num_batches_per_epoch):
                batch_sentences = []
                batch_labels = []
                start_index = batch_num * self.batch_size
                end_index = min((batch_num + 1) * self.batch_size, data_size)
                features = data[start_index:end_index]
                for feature in features:
                    sentence, label = feature
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)
                    
                    
                    
    def train(self, sess, saver, global_step, output_feed):
        train_batches = self.get_batch(self.train_data, self.num_epochs, is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (global_step.eval(sess), checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)
                
                
    def validation(self, sess, saver, global_step):
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1, is_training=False)

        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label:
                    valid_pred += 1
        valid_score = valid_pred / valid_num_data
        tf.logging.info('valid loss %.4f valid score %.4f' % (valid_loss, valid_score))

        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + '/' + str(valid_score)
            saver.save(sess, path, global_step=global_step)
            
            
    def make_input(self, sentences, labels, is_training):
        raise NotImplementedError

    def tune(self):
        raise NotImplementedError