def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100, learning_rate=0.01, early_stop_cost=0.001): ngram = left_gram + right_gram n_features = len(features_vector) * ngram # number of features = 17,380 * 4 n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1 # number of classes = 2 but len=1 log.info('load characters list...') log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector))) watch = WatchUtil() train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing', 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram)) valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing', 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_valid, left_gram, right_gram)) test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing', 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_test, left_gram, right_gram)) if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file): dataset_dir = os.path.dirname(train_file) if not os.path.exists(dataset_dir): os.makedirs(dataset_dir) watch.start('create dataset') log.info('create dataset...') data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False), ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False), ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False)) for name, data_file, total, dataset_file, to_one_hot_vector in data_files: check_interval = 10000 log.info('check_interval: %s' % check_interval) log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total))) features, labels = [], [] with gzip.open(data_file, 'rt', encoding='utf8') as f: for i, line in enumerate(f, 1): if total < i: break if i % check_interval == 0: time.sleep(0.01) # prevent cpu overload percent = i / total * 100 log.info('create dataset... %.1f%% readed. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file)) _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram) features.extend(_f) labels.extend(_l) dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name) log.info('dataset save... %s' % dataset_file) dataset.save(dataset_file, gzip_format=True, verbose=True) log.info('dataset save OK. %s' % dataset_file) log.info('dataset: %s' % dataset) log.info('create dataset OK.') log.info('') watch.stop('create dataset') watch.start('dataset load') log.info('dataset load...') train = DataSet.load(train_file, gzip_format=True, verbose=True) if n_train >= int('100,000'.replace(',', '')): valid = DataSet.load(valid_file, gzip_format=True, verbose=True) else: valid = DataSet.load(train_file, gzip_format=True, verbose=True) log.info('valid.convert_to_one_hot_vector()...') valid = valid.convert_to_one_hot_vector(verbose=True) log.info('valid.convert_to_one_hot_vector() OK.') log.info('train dataset: %s' % train) log.info('valid dataset: %s' % valid) log.info('dataset load OK.') log.info('') watch.stop('dataset load') graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch) train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy'] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) check_interval = 10 # max(1, min(1000, n_train // 10)) nth_train, nth_input, total_input = 0, 0, total_epoch * train.size log.info('learn...') log.info('total: %s' % NumUtil.comma_str(train.size)) watch.start('learn') valid_cost = sys.float_info.max for epoch in range(1, total_epoch + 1): if valid_cost < early_stop_cost: break for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1): if valid_cost < early_stop_cost: log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost)) break nth_train += 1 nth_input += features_batch.shape[0] sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch}) # if step % check_interval == 1: percent = nth_input / total_input * 100 valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels}) log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost)) watch.stop('learn') log.info('learn OK.\n') log.info('model save... %s' % model_file) watch.start('model save...') model_dir = os.path.dirname(model_file) if not os.path.exists(model_dir): os.makedirs(model_dir) saver = tf.train.Saver() saver.save(sess, model_file) watch.stop('model save...') log.info('model save OK. %s' % model_file) log.info('\n') log.info('batch_size: %s' % batch_size) log.info(watch.summary()) log.info('\n')
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector, n_hidden1, learning_rate, dropout_keep_rate, early_stop_cost=0.001): n_features = len(features_vector) * window_size # number of features = 17,382 * 10 log.info('load characters list...') log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector))) watch = WatchUtil() train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction', 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size)) valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction', 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size)) test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction', 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size)) log.info('train_file: %s' % train_file) log.info('valid_file: %s' % valid_file) log.info('test_file: %s' % test_file) if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file): dataset_dir = os.path.dirname(train_file) if not os.path.exists(dataset_dir): os.makedirs(dataset_dir) watch.start('create dataset') # FIXME: out of memory (1M sentences) log.info('create dataset...') data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False), ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False), ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False)) for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files: check_interval = 10000 log.info('check_interval: %s' % check_interval) log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total))) log.info('noise_rate: %s' % noise_rate) features, labels = [], [] with gzip.open(data_file, 'rt') as f: for i, line in enumerate(f, 1): if total < i: break if i % check_interval == 0: time.sleep(0.01) # prevent cpu overload percent = i / total * 100 log.info('create dataset... %.1f%% readed. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file)) sentence = line.strip() for start in range(0, len(sentence) - window_size + 1): # 문자 단위로 노이즈(공백) 생성 chars = sentence[start: start + window_size] for idx in range(len(chars)): noised_chars = StringUtil.replace_with_index(chars, ' ', idx) features.append(noised_chars) labels.append(chars) log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars)) # log.info('noise_sampling: %s' % noise_sampling) # for nth_sample in range(noise_sampling): # 초성, 중성, 종성 단위로 노이즈 생성 # for start in range(0, len(sentence) - window_size + 1): # chars = sentence[start: start + window_size] # noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True) # if chars == noised_chars: # continue # if i % check_interval == 0 and nth_sample == 0: # log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars)) # features.append(noised_chars) # labels.append(chars) # print('dataset features:', features) # print('dataset labels:', labels) dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name) log.info('dataset save... %s' % dataset_file) dataset.save(dataset_file, gzip_format=True, verbose=True) log.info('dataset save OK. %s' % dataset_file) log.info('dataset: %s' % dataset) log.info('create dataset OK.') log.info('') watch.stop('create dataset') watch.start('dataset load') log.info('dataset load...') train = DataSet.load(train_file, gzip_format=True, verbose=True) if n_train >= int('100,000'.replace(',', '')): valid = DataSet.load(valid_file, gzip_format=True, verbose=True) else: valid = DataSet.load(train_file, gzip_format=True, verbose=True) log.info('valid.convert_to_one_hot_vector()...') valid = valid.convert_to_one_hot_vector(verbose=True) log.info('valid.convert_to_one_hot_vector() OK.') log.info('train dataset: %s' % train) log.info('valid dataset: %s' % valid) log.info('dataset load OK.') log.info('') watch.stop('dataset load') X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1, learning_rate, watch) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) check_interval = max(1, min(1000, n_train // 10)) nth_train, nth_input, total_input = 0, 0, total_epoch * train.size log.info('') log.info('learn...') log.info('total_epoch: %s' % total_epoch) log.info('train.size (total features): %s' % NumUtil.comma_str(train.size)) log.info('check_interval: %s' % check_interval) log.info('total_epoch: %s' % total_epoch) log.info('batch_size: %s' % batch_size) log.info('total_input: %s (total_epoch * train.size)' % total_input) log.info('') watch.start('learn') valid_cost = sys.float_info.max for epoch in range(1, total_epoch + 1): if valid_cost < early_stop_cost: log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost)) break for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1): if valid_cost < early_stop_cost: break nth_train += 1 nth_input += features_batch.shape[0] sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate}) # if nth_train % check_interval == 1: percent = nth_input / total_input * 100 valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0}) log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost)) watch.stop('learn') log.info('learn OK.') log.info('') log.info('model save... %s' % model_file) watch.start('model save...') model_dir = os.path.dirname(model_file) if not os.path.exists(model_dir): os.makedirs(model_dir) saver = tf.train.Saver() saver.save(sess, model_file) watch.stop('model save...') log.info('model save OK. %s' % model_file) log.info('') log.info('total_epoch: %s' % total_epoch) log.info('batch_size: %s' % batch_size) log.info('total_input: %s (total_epoch * train.size)' % total_input) log.info('') log.info(watch.summary()) log.info('')