def check(self, filepath, lineno):
    """
    Record a memory leak only when it has occurred two or more times.
    Therefore, the test code must run the same command at least twice.
    :param filepath:
    :param lineno:
    :return:
    """
    if self.enable:
        lineno = int(lineno)
        garbage_len = self.gabage_len()
        leaks_count = garbage_len - self.last_garbage_len
        self.last_garbage_len = garbage_len
        # print('total_bytes:', self.memory.total_memory())
        if leaks_count > 0:  # memory leak detected
            increased_bytes = self.memory.increased_bytes()
            print('increased_bytes:', increased_bytes)
            self.total_leaks_count += leaks_count
            self.total_increased_bytes += increased_bytes
            if self.show_lines:
                line = 'leaks: %s bytes(%s)\n' % (NumUtil.comma_str(increased_bytes), NumUtil.comma_str(leaks_count))
                for i in range(lineno - 1, lineno):
                    # print(linecache.getline(filepath, i).strip())
                    line += '\tincreased:%s bytes\t%s:%s\t%s\n' % (
                        NumUtil.comma_str(increased_bytes), filepath, i, linecache.getline(filepath, i).strip())
                self.total_lines.append(line)
            return increased_bytes
    return 0
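# A minimal usage sketch of check()/summary(); the class name and constructor flags
# below are assumptions for illustration, not the actual API.
#
#     import inspect
#
#     checker = MemoryLeakChecker(enable=True, show_lines=True)  # hypothetical class name
#     leaky_operation()   # first run establishes the baseline garbage count
#     leaky_operation()   # second run: only leaks that repeat are recorded
#     checker.check(__file__, inspect.currentframe().f_lineno)
#     print(checker.summary())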
def collect_characters(sentences_file: str, characters_file: str, max_test: int = 0):
    """
    Read the sentences file and extract the unique characters (syllables).
    They are used later to build corpus-based one-hot vectors.
    :param sentences_file: *.sentences file path
    :param characters_file: *.characters file path
    :param max_test: 0=run all
    :return:
    """
    total = FileUtil.count_lines(sentences_file, gzip_format=True)
    log.info('total: %s' % NumUtil.comma_str(total))
    char_set = set()
    with gzip.open(sentences_file, 'rt') as f:
        for i, sentence in enumerate(f):
            i += 1
            if i % 10000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(characters_file), i / total * 100))
            _char_set = set([c for c in sentence])
            char_set.update(_char_set)
            if 0 < max_test <= i:
                break

    char_list = list(char_set)
    char_list.sort()

    if max_test == 0:  # 0=full
        with open(characters_file, 'w') as f:
            for c in char_list:
                f.write(c)
                f.write('\n')
        log.info('written to %s OK.' % characters_file)
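# A hedged usage sketch of collect_characters(); the paths below are illustrative
# placeholders, and the sentences file is expected to be gzip-compressed.
#
#     collect_characters(sentences_file='ko.wikipedia.org.sentences.gz',
#                        characters_file='ko.wikipedia.org.characters',
#                        max_test=0)  # 0 = scan every sentence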
def dump_urls(mongo_url, db_name, collection_name, urls_file, mongo_query=None, limit=0):
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

    output_dir = os.path.dirname(urls_file)  # basename() returned the file name, not its directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(urls_file, 'wt') as out_f:
        for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
            if i % 1000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(urls_file), i / total * 100))
            out_f.write(row['url'])
            out_f.write('\n')
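# A hedged usage sketch of dump_urls(); the connection values and paths are illustrative
# placeholders, not the project's actual configuration.
#
#     dump_urls(mongo_url='mongodb://localhost:27017', db_name='corpus', collection_name='ko.wikipedia.org',
#               urls_file='ko.wikipedia.org.urls.txt', limit=0)  # limit=0 dumps every document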
def summary(self):
    print('total_bytes:', self.memory.total_memory())
    if self.enable:
        if self.show_lines:
            summary = '[leak summary]\ntotal bytes: %s, total lines: %s, increased: %s bytes\n' % (
                NumUtil.comma_str(self.memory.total_memory()), len(self.total_lines),
                NumUtil.comma_str(self.total_increased_bytes))
            return '\n'.join(self.total_lines) + summary
        else:
            summary = '[leak summary]\ntotal bytes: %s, total increased: %s bytes\n' % (
                NumUtil.comma_str(self.memory.total_memory()), NumUtil.comma_str(self.total_increased_bytes))
            return summary
    else:
        return ''
def dump_corpus(mongo_url, db_name, collection_name, sentences_file, mongo_query=None, limit=None):
    """
    Read documents from MongoDB and save them sentence by sentence.
    (Sentences that contain only one word or no Hangul at all are skipped.)
    :param mongo_url: mongodb://~~~
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param sentences_file: *.sentence file
    :param mongo_query: default={}
    :param limit:
    :return:
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

    output_dir = os.path.dirname(sentences_file)  # basename() returned the file name, not its directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with gzip.open(sentences_file, 'wt') as out_f:
        for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
            # print('url:', row['url'])
            for c in row['content']:
                if i % 1000 == 0:
                    print('%.1f%% written.' % (i / total * 100))
                for s in HangulUtil.text2sentences(c['sentences']):
                    if HangulUtil.has_hangul(s):
                        out_f.write(s)
                        out_f.write('\n')
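# The Hangul filter above relies on HangulUtil.has_hangul(); a rough standalone
# approximation using the Hangul-syllable Unicode range is sketched below
# (the real HangulUtil implementation may differ).

def _has_hangul_sketch(s: str) -> bool:
    """Return True if the string contains at least one Hangul syllable (U+AC00..U+D7A3)."""
    return any('\uAC00' <= c <= '\uD7A3' for c in s)

# _has_hangul_sketch('위키백과는 백과사전이다.')  -> True  (kept)
# _has_hangul_sketch('English only sentence.')    -> False (skipped)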
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector,
             n_hidden1=100, learning_rate=0.01, early_stop_cost=0.001):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                             'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))

    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')
        log.info('create dataset...')

        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

        for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

            features, labels = [], []
            with gzip.open(data_file, 'rt', encoding='utf8') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break

                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)
        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)

    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')

    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)
    train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_interval = 10  # max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('learn...')
        log.info('total: %s' % NumUtil.comma_str(train.size))
        watch.start('learn')
        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                break

            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                # if step % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info('batch_size: %s' % batch_size)
    log.info(watch.summary())
    log.info('\n')
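# WordSpacing.sentence2features_labels() is defined elsewhere; the sketch below only
# illustrates the general idea (one example per character boundary: the feature is the
# surrounding left_gram/right_gram character window with spaces removed, the label is
# whether a space follows). The real implementation may pad or encode differently.
#
#     def sentence2features_labels_sketch(sentence, left_gram=2, right_gram=2):
#         text = sentence.replace(' ', '')
#         space_after = set()
#         idx = -1
#         for ch in sentence:
#             if ch == ' ':
#                 space_after.add(idx)
#             else:
#                 idx += 1
#         features, labels = [], []
#         for i in range(len(text) - 1):
#             left = text[max(0, i - left_gram + 1): i + 1]
#             right = text[i + 1: i + 1 + right_gram]
#             features.append(left + right)
#             labels.append(1 if i in space_after else 0)
#         return features, labels
#
#     # sentence2features_labels_sketch('나는 학교에 간다') ->
#     # (['나는학', '나는학교', '는학교에', '학교에간', '교에간다', '에간다'], [0, 1, 0, 0, 1, 0])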
log.info('test_sentences_file: %s' % test_sentences_file)
characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
log.info('characters_file: %s' % characters_file)
try:
    if len(sys.argv) == 4:
        n_train = int(sys.argv[1])
        left_gram = int(sys.argv[2])
        right_gram = int(sys.argv[3])
    else:
        n_train, left_gram, right_gram = 100, 2, 2  # n_train = int('1,000,000'.replace(',', ''))  # 1M data (training takes about 17 hours)

    ngram = left_gram + right_gram
    n_valid, n_test = 100, 100
    log.info('n_train: %s' % NumUtil.comma_str(n_train))
    log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
    log.info('n_test: %s' % NumUtil.comma_str(n_test))
    log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
    log.info('ngram: %s' % ngram)

    total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
    model_file = os.path.join(KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
                              'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (n_train, left_gram, right_gram))  # .%s' % max_sentences
    log.info('model_file: %s' % model_file)

    batch_size = 500  # mini batch size
    log.info('batch_size: %s' % batch_size)
    total_epoch = 100  # min(100, 1000000 // n_train)  # 1 ~ 100
def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file,
                train_sentences_file, valid_sentences_file, test_sentences_file, mongo_query=None, limit=None):
    """
    Read documents from MongoDB and save them sentence by sentence.
    (Sentences that contain only one word or no Hangul at all are skipped.)
    :param characters_file:
    :param urls_file:
    :param info_file:
    :param mongo_url: mongodb://~~~
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param sentences_file: *.sentence file
    :param train_sentences_file:
    :param valid_sentences_file:
    :param test_sentences_file:
    :param mongo_query: default={}
    :param limit:
    :return:
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total_docs = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

    output_dir = os.path.dirname(sentences_file)  # basename() returned the file name, not its directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with gzip.open(sentences_file, 'wt') as out_f, \
            gzip.open(train_sentences_file, 'wt') as train_f, \
            gzip.open(valid_sentences_file, 'wt') as valid_f, \
            gzip.open(test_sentences_file, 'wt') as test_f, \
            open(info_file, 'wt') as info_f, \
            open(urls_file, 'wt') as urls_f:
        char_set = set()
        n_docs = n_total = n_train = n_valid = n_test = 0
        if limit:
            cursor = corpus_mongo.find(mongo_query, limit=limit)
        else:
            cursor = corpus_mongo.find(mongo_query)

        for i, row in enumerate(cursor, 1):
            if i % 1000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(sentences_file), i / total_docs * 100))

            sentences = []
            for c in row['content']:
                sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True))
            # sentences = HangulUtil.text2sentences(row['content'], remove_only_one_word=True, has_hangul=True)
            log.debug('url: %s, len: %s' % (row['url'], len(sentences)))

            if len(sentences) == 0:
                # log.error(row['content'])
                continue

            urls_f.write(row['url'])
            urls_f.write('\n')
            n_docs += 1

            for s in sentences:
                _char_set = set([c for c in s])
                char_set.update(_char_set)
                n_total += 1
                out_f.write(s)
                out_f.write('\n')

            if len(sentences) >= 10:  # can split
                test_len = valid_len = len(sentences) // 10
                # log.info('train: %s, test: %s, valid: %s' % (len(sentences) - test_len - valid_len, test_len, valid_len))
                for s in sentences[:test_len]:
                    n_test += 1
                    test_f.write(s)
                    test_f.write('\n')
                for s in sentences[test_len:test_len + valid_len]:
                    n_valid += 1
                    valid_f.write(s)
                    valid_f.write('\n')
                for s in sentences[test_len + valid_len:]:
                    n_train += 1
                    train_f.write(s)
                    train_f.write('\n')
            else:  # can't split
                for s in sentences:
                    n_train += 1
                    train_f.write(s)
                    train_f.write('\n')

        char_list = list(char_set)
        char_list.sort()

        log.info('writing to %s...' % characters_file)
        with open(characters_file, 'w') as f:
            for c in char_list:
                f.write(c)
                f.write('\n')
        log.info('written to %s OK.' % characters_file)
        log.info('total docs: %s', NumUtil.comma_str(total_docs))
        log.info('total docs: %s (has hangul sentence)', NumUtil.comma_str(n_docs))
        log.info('total sentences: %s (has hangul sentence)', NumUtil.comma_str(n_total))
        log.info('train: %s', NumUtil.comma_str(n_train))
        log.info('valid: %s', NumUtil.comma_str(n_valid))
        log.info('test: %s', NumUtil.comma_str(n_test))
        log.info('total characters: %s', NumUtil.comma_str(len(char_list)))

        info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
        info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
        info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
        info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
        info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
        info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
        info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))
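# A quick check of the per-document split above (first 10% test, next 10% valid, the
# rest train); a document needs at least 10 sentences to be split at all.
#
#     sentences = ['sentence %d' % n for n in range(25)]
#     test_len = valid_len = len(sentences) // 10          # 2
#     test = sentences[:test_len]                          # 2 sentences
#     valid = sentences[test_len:test_len + valid_len]     # 2 sentences
#     train = sentences[test_len + valid_len:]             # 21 sentences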
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector,
             n_hidden1, learning_rate, dropout_keep_rate, early_stop_cost=0.001):
    n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

    log.info('load characters list...')
    log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                             'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))

    log.info('train_file: %s' % train_file)
    log.info('valid_file: %s' % valid_file)
    log.info('test_file: %s' % test_file)

    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')  # FIXME: out of memory (1M sentences)
        log.info('create dataset...')

        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

        for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
            log.info('noise_rate: %s' % noise_rate)

            features, labels = [], []
            with gzip.open(data_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break

                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    sentence = line.strip()
                    for start in range(0, len(sentence) - window_size + 1):  # generate noise (a blank) character by character
                        chars = sentence[start: start + window_size]
                        for idx in range(len(chars)):
                            noised_chars = StringUtil.replace_with_index(chars, ' ', idx)
                            features.append(noised_chars)
                            labels.append(chars)
                            log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                    # log.info('noise_sampling: %s' % noise_sampling)
                    # for nth_sample in range(noise_sampling):  # generate noise at the jamo level (initial/medial/final consonants)
                    #     for start in range(0, len(sentence) - window_size + 1):
                    #         chars = sentence[start: start + window_size]
                    #         noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                    #         if chars == noised_chars:
                    #             continue
                    #         if i % check_interval == 0 and nth_sample == 0:
                    #             log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                    #         features.append(noised_chars)
                    #         labels.append(chars)

            # print('dataset features:', features)
            # print('dataset labels:', labels)
            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)
        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)

    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')

    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1, learning_rate, watch)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_interval = max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('')
        log.info('learn...')
        log.info('total_epoch: %s' % total_epoch)
        log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
        log.info('check_interval: %s' % check_interval)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        watch.start('learn')
        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                break
            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                if valid_cost < early_stop_cost:
                    break
                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                # if nth_train % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.')
        log.info('')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('')
    log.info('total_epoch: %s' % total_epoch)
    log.info('batch_size: %s' % batch_size)
    log.info('total_input: %s (total_epoch * train.size)' % total_input)
    log.info('')
    log.info(watch.summary())
    log.info('')
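# StringUtil.replace_with_index() is defined elsewhere; the per-character noising above
# is assumed to behave roughly like the sketch below (substitute one position with a
# blank), pairing each noised window with its clean window for the denoising autoencoder.
#
#     def replace_with_index_sketch(s, ch, idx):
#         return s[:idx] + ch + s[idx + 1:]
#
#     window = '위키백과에서'  # window_size = 6
#     noised = [replace_with_index_sketch(window, ' ', i) for i in range(len(window))]
#     # [' 키백과에서', '위 백과에서', '위키 과에서', '위키백 에서', '위키백과 서', '위키백과에 ']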
n_test = min(100, n_train // 10)
if noise_rate is None or window_size is None:  # default
    window_size = 6  # 2 ~ 10  # number of characters per feature window (same as the label)
    noise_rate = max(0.1, 1 / window_size)  # 0.0 ~ 1.0  # noise_rate = noised characters / total characters (ensure at least one noised character per window)

dropout_keep_rate = 1.0  # 0.0 ~ 1.0  # with one-hot vector inputs, training fails when dropout is used
# total_epoch = max(10, 100 // window_size)  # 10 ~ 100  # the larger the window_size, the fewer epochs are needed
total_epoch = min(100, 1000000 // n_train)  # 1 ~ 100
batch_size = min(100, 10 * window_size)  # 1 ~ 100  # with one-hot inputs, a small batch_size trains better; if batch_size is too large, training fails completely
n_hidden1 = min(1000, 10 * window_size)  # 10 ~ 1000
learning_rate = 1 / total_epoch  # 0.01  # 0.1 ~ 0.001  # the larger total_epoch is, the smaller learning_rate can be
early_stop_cost = 0.0001

log.info('')
log.info('n_train (sentences): %s' % NumUtil.comma_str(n_train))
log.info('n_valid (sentences): %s' % NumUtil.comma_str(n_valid))
log.info('n_test (sentences): %s' % NumUtil.comma_str(n_test))
log.info('')
log.info('window_size: %s' % window_size)
log.info('noise_rate: %s' % noise_rate)
log.info('dropout_keep_rate: %s' % dropout_keep_rate)
log.info('')
log.info('n_hidden1: %s' % n_hidden1)
log.info('learning_rate: %s' % learning_rate)
log.info('')
log.info('total_epoch: %s' % total_epoch)
log.info('batch_size: %s' % batch_size)
log.info('early_stop_cost: %s' % early_stop_cost)
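# For concreteness, the derived hyperparameters above evaluate as follows with the
# default window_size=6, assuming n_train=100 purely for illustration:
#
#     noise_rate    = max(0.1, 1 / 6)           # ~0.167: at least one noised character per window
#     total_epoch   = min(100, 1000000 // 100)  # 100
#     batch_size    = min(100, 10 * 6)          # 60
#     n_hidden1     = min(1000, 10 * 6)         # 60
#     learning_rate = 1 / 100                   # 0.01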
            if _valid_cost < min_valid_cost:
                min_valid_cost = _valid_cost
                min_valid_epoch = epoch
            log.info('[epoch: %s, nth_batch: %s] train cost: %.8f, valid cost: %.8f' % (epoch, nth_batch, _train_cost, _valid_cost))
            if min_valid_epoch == epoch:  # save the latest best model
                saver.save(sess, model_file)
            if save_model_each_epochs:
                saver.save(sess, model_file, global_step=epoch)

        log.info('')
        log.info('"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s, %.2f secs (batch_size: %s, total_input_data: %s, total_epochs: %s, total_train_time: %s secs)' % (
            model_name, min_valid_cost, min_valid_epoch, watch.elapsed(), batch_size, NumUtil.comma_str(batch_size * nth_batch), epoch, total_train_time))
        log.info('')
    except:
        log.info(traceback.format_exc())
    finally:
        coordinator.request_stop()
        coordinator.join(threads)  # Wait for threads to finish.
else:  # testing
    x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph(model_name, scope_name, verbose=False)
    test_x_batch, test_y_batch = input_pipeline([test_file], batch_size=n_test, delim='\t', splits=3)

    log.info('')
    log.info('model loaded... %s' % model_file)
    saver = tf.train.Saver(max_to_keep=None)
    saver.restore(sess, model_file)
    log.info('model loaded OK. %s' % model_file)
            if min_valid_epoch == epoch:  # save the latest best model
                saver.save(sess, model_file)
            if save_model_each_epochs:
                saver.save(sess, model_file, global_step=epoch)

        log.info('')
        log.info('"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s, %.2f secs (batch_size: %s, total_input_data: %s, total_epochs: %s, total_train_time: %s secs)' % (
            model_name, min_valid_cost, min_valid_epoch, watch.elapsed(), batch_size, NumUtil.comma_str(batch_size * nth_batch), epoch, total_train_time))
        log.info('')
    except:
        log.info(traceback.format_exc())
    finally:
        coordinator.request_stop()
        coordinator.join(threads)  # Wait for threads to finish.
else:  # testing
    x, y, learning_rate, use_first_pipeline, W1, b1, y_hat, cost, train_step, summary = create_graph(
        model_name, scope_name, first_pipeline=test_pipeline, second_pipeline=test_pipeline, verbose=False)
                    log.info('[epoch: %s] rmse (train/valid): %.1f / %.1f model saved' % (epoch, train_rsme, valid_rsme))
                else:
                    log.info('[epoch: %s] rmse (train/valid): %.1f / %.1f' % (epoch, train_rsme, valid_rsme))

                if valid_rsme < early_stop_cost or valid_rsme > max_cost or math.isnan(valid_rsme):
                    running = False
                    break
        watch.stop('train')

        if model_file_saved and os.path.exists(model_file + '.index'):
            restored = saver.restore(sess, model_file)

        log.info('')
        log.info('--------TEST----------')
        watch.start('test')
        test_rsme, _y_hat = sess.run([rsme, y_hat], feed_dict={x: x_test, y: y_test})
        log.info('%s rmse (test): %.1f (epoch best/total: %s/%s), activation: %s, n_hiddens: %s, learning_rate: %s, weights_initializer: %s' % (
            func.__name__, test_rsme, NumUtil.comma_str(best_epoch), NumUtil.comma_str(epoch), activation.__name__, n_hiddens, learning_rate,
            weights_initializer.__name__))
        # _y_hat = np.round(_y_hat)
        for i in range(min(5, _y_hat.shape[0])):
            log.info('%s\t->\t%.1f\t(label: %d)' % (x_test[i], _y_hat[i], y_test[i]))
        watch.stop('test')
        log.info('--------TEST----------')
        log.info(watch.summary())
    except:
        traceback.print_exc()

log.info('OK.')
def learning(cls, sentences_file, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
             max_sentences=0, learning_rate=0.01, layers=2):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
                              'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' % (max_sentences, left_gram, right_gram))
    validation_file = train_file.replace('.train.', '.validation.')
    test_file = train_file.replace('.train.', '.test.')
    if not os.path.exists(train_file) or not os.path.exists(validation_file) or not os.path.exists(test_file):
        watch.start('create dataset')
        log.info('create dataset...')
        features, labels = [], []
        check_interval = min(10000, math.ceil(max_sentences))
        log.info('total: %s' % NumUtil.comma_str(max_sentences))

        with gzip.open(sentences_file, 'rt') as f:
            for i, line in enumerate(f, 1):
                if max_sentences < i:
                    break

                if i % check_interval == 0:
                    log.info('create dataset... %.1f%% read. data len: %s' % (i / max_sentences * 100, NumUtil.comma_str(len(features))))

                _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                features.extend(_f)
                labels.extend(_l)

        dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name='all')
        log.info('dataset: %s' % dataset)
        log.info('create dataset OK.\n')
        watch.stop('create dataset')

        watch.start('dataset save')
        log.info('split to train, test, validation...')
        datasets = DataSets.to_datasets(dataset, test_rate=0.1, valid_rate=0.1, test_max=10000, valid_max=1000, shuffle=True)
        train, test, validation = datasets.train, datasets.test, datasets.validation
        log.info(train)
        log.info(test)
        log.info(validation)
        # log.info('%s %s' % (test.features[0], test.labels[0]))
        log.info('split to train, test, validation OK.\n')

        log.info('dataset save... %s' % train_file)
        train.save(train_file, verbose=True)  # save as text
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % validation_file)
        validation = validation.convert_to_one_hot_vector(verbose=True)  # save as vector
        validation.save(validation_file, verbose=True)
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % test_file)
        test = test.convert_to_one_hot_vector(verbose=True)  # save as vector
        test.save(test_file, verbose=True)
        log.info('dataset save OK.\n')
        watch.stop('dataset save')
    else:
        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, verbose=True)
        validation = DataSet.load(validation_file, verbose=True)
        test = DataSet.load(test_file, verbose=True)
        log.info(train)
        log.info(validation)
        log.info(test)
        log.info('dataset load OK.\n')
        watch.stop('dataset load')

    log.info('check samples...')
    for i, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
        if i > 2:
            break
        for a, b in zip(features_batch, labels_batch):
            feature, label = a, b
            _feature = feature.reshape((ngram, len(features_vector)))
            chars = ''.join(features_vector.to_values(_feature))
            has_space = np.argmax(label)
            log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' % (i, chars, has_space, feature, len(feature), label, len(label)))
    log.info('check samples OK.\n')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch, layers=layers)
    train_step, X, Y, cost, hypothesis, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['hypothesis'], graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        n_input = 0
        log.info('total: %s' % NumUtil.comma_str(train.size))
        log.info('learn...')
        watch.start('learn')
        for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
            n_input += batch_size
            sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})
            log.info('[%s][%.1f%%] validation cost: %.4f' % (
                NumUtil.comma_str(n_input), n_input / train.size * 100,
                sess.run(cost, feed_dict={X: validation.features, Y: validation.labels})))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('evaluate...')
        watch.start('evaluate...')
        _hypothesis, _correct, _accuracy = sess.run([hypothesis, predicted, accuracy],
                                                    feed_dict={X: test.features, Y: test.labels})  # accuracy report
        watch.stop('evaluate...')
        log.info('evaluate OK.')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info(watch.summary())
    # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
    # log.info('correct: %s %s' % (_correct.shape, _correct))
    log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
    log.info('\n')
if max_test_sentences < max_sentences:  # learned sentences are fewer than the full corpus
    for i, line in enumerate(f, 1):
        if i <= max_sentences:  # skip learned sentences
            if i % 100000 == 0:
                log.info('skip %d th learned sentence.' % i)
            continue
        if len(sentences) >= max_test_sentences:  # read new sentences
            break

        s = line.strip()
        if s.count(' ') > 0 and len(s.replace(' ', '')) > ngram:  # sentence must have one or more spaces
            sentences.append(s)

log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
watch.stop('read sentences')

watch.start('run tensorflow')
accuracies, sims = [], []
with tf.Session() as sess:
    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, layers=layers)
    X, Y, predicted, accuracy = graph['X'], graph['Y'], graph['predicted'], graph['accuracy']

    saver = tf.train.Saver()
    try: