def dump_urls(mongo_url, db_name, collection_name, urls_file, mongo_query=None, limit=0):
    """
    Write the ``url`` field of every document returned by the query to a text file, one url per line.

    :param mongo_url: connection string handed to ``MongodbUtil``
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param urls_file: output text file; its parent directory is created when missing
    :param mongo_query: query passed to ``find``. default={}
    :param limit: forwarded unchanged to ``find``
    :return: None
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

    # The directory that has to exist is the file's parent (dirname).
    # basename would create a directory named like the output file itself.
    output_dir = os.path.dirname(urls_file)
    if output_dir:  # empty when urls_file is a bare file name in the current directory
        os.makedirs(output_dir, exist_ok=True)

    file_name = os.path.basename(urls_file)
    with open(urls_file, 'wt') as out_f:
        for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
            if i % 1000 == 0:
                # count() is called without the query, so the percentage is relative to that number;
                # guard against a zero count instead of raising ZeroDivisionError.
                percent = i / total * 100 if total else 0.0
                log.info('%s %.1f%% writed.' % (file_name, percent))
            out_f.write(row['url'])
            out_f.write('\n')
exit() config = tf.ConfigProto() config.gpu_options.allow_growth = True filenames = [data_file] features_batch, labels_batch = input_pipeline(filenames, batch_size=batch_size, shuffle=shuffle, tokens=2) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) coordinator = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coordinator) log.info('coordinator: %s' % coordinator) log.info('threads: %s, %s' % (len(threads), threads)) try: for nth_batch in range(5): if coordinator.should_stop(): break _features_batch, _labels_batch = sess.run( [features_batch, labels_batch]) log.info('') log.info('nth_batch: %s' % nth_batch) for _f, _l in zip(_features_batch, _labels_batch): log.info('%s %s' % (_f.decode('utf8'), _l.decode('utf8'))) # decode for print except: log.info(traceback.format_exc())
import matplotlib.pyplot as plt import numpy as np import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data from bage_utils.base_util import is_server from nlp4kor_tensorflow.config import MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR, log if __name__ == '__main__': mnist_data = os.path.join(MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR) # input device2use = '/gpu:0' if is_server() else '/cpu:0' model_file = os.path.join(MNIST_DAE_MODEL_DIR, 'dae_mnist_model≤/model') # .%s' % max_sentences log.info('model_file: %s' % model_file) model_dir = os.path.dirname(model_file) if not os.path.exists(model_dir): os.makedirs(model_dir) image_shape = (28, 28) mnist = input_data.read_data_sets(mnist_data, one_hot=True) assert (mnist.train.images.shape[1] == mnist.test.images.shape[1]) n_input_dim = mnist.train.images.shape[ 1] # MNIST data input (img shape: 28*28) n_output_dim = n_input_dim # MNIST data input (img shape: 28*28) n_hidden_1 = 256 # 1st layer num features n_hidden_2 = 256 # 2nd layer num features log.info('n_input_dim: %s' % n_input_dim)
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100, learning_rate=0.01,
             early_stop_cost=0.001):
    """
    Create the word-spacing datasets when they are missing, train the network and save the model.

    Steps: (1) build the train/valid/test dataset files from the sentence files if any of them does not exist,
    (2) load the train and validation datasets, (3) build the graph with ``WordSpacing.build_FFNN``,
    (4) run the training loop with early stopping, (5) save the session to ``model_file``.

    :param total_epoch: maximum number of passes over the training dataset
    :param n_train: number of sentences read for the train dataset (also part of the dataset file name)
    :param n_valid: number of sentences read for the validation dataset
    :param n_test: number of sentences read for the test dataset
    :param batch_size: batch size passed to ``train.next_batch``
    :param left_gram: passed to ``WordSpacing.sentence2features_labels``
    :param right_gram: passed to ``WordSpacing.sentence2features_labels``
    :param model_file: path the trained model is saved to; its directory is created when missing
    :param features_vector: feature vocabulary; ``len(features_vector) * (left_gram + right_gram)`` is the input width
    :param labels_vector: label vocabulary
    :param n_hidden1: hidden layer width passed to ``build_FFNN``
    :param learning_rate: learning rate passed to ``build_FFNN``
    :param early_stop_cost: training stops once the validation cost drops below this value
    :return: None
    """
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram
    # Fewer than three labels are modelled with a single output unit.
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    # NOTE(review): the validation dataset is stored under "...test.gz" and the test dataset under
    # "...valid.gz". The suffixes look swapped, but they are left untouched here because renaming
    # would invalidate dataset files that were already generated - confirm before changing.
    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_valid, left_gram, right_gram))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                             'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_test, left_gram, right_gram))

    if not (os.path.exists(train_file) and os.path.exists(valid_file) and os.path.exists(test_file)):
        os.makedirs(os.path.dirname(train_file), exist_ok=True)

        watch.start('create dataset')
        log.info('create dataset...')

        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file))
        check_interval = 10000  # progress is logged every check_interval sentences

        for name, data_file, total, dataset_file in data_files:
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

            features, labels = [], []
            with gzip.open(data_file, 'rt', encoding='utf8') as f:
                for i, line in enumerate(f, 1):
                    if total < i:  # read at most `total` sentences
                        break

                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% readed. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)

        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)

    # Small runs (fewer than 100,000 training sentences) are validated on the training data itself.
    if n_train >= 100000:
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')

    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)
    train_step, X, Y, cost = graph['train_step'], graph['X'], graph['Y'], graph['cost']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        nth_input, total_input = 0, total_epoch * train.size

        log.info('learn...')
        log.info('total: %s' % NumUtil.comma_str(train.size))
        watch.start('learn')
        valid_cost = sys.float_info.max  # guarantees that the first batch is always trained
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                break
            for features_batch, labels_batch in train.next_batch(batch_size=batch_size):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                # The validation cost is evaluated after every batch on the whole validation set
                # (the interval check that used to guard this was disabled in the original code).
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if model_dir:  # os.makedirs('') raises when model_file has no directory part
            os.makedirs(model_dir, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info('batch_size: %s' % batch_size)
    log.info(watch.summary())
    log.info('\n')
def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=None):
    """
    Build the feed-forward network once and return its graph nodes.

    The network has three ReLU hidden layers (all ``n_hidden1`` wide) followed by a linear output layer.
    The nodes are cached in ``cls.graph_nodes``: when that dict is not empty it is returned as is,
    whatever arguments are passed to this call.

    :param n_features: width of the input placeholder ``X``
    :param n_classes: width of the label placeholder ``Y`` and of the output layer
    :param n_hidden1: width of every hidden layer
    :param learning_rate: learning rate of the Adam optimizer
    :param watch: ``WatchUtil`` used to time the graph creation. A new one is created when omitted
        (a ``WatchUtil()`` default in the signature would be created once and shared by all calls)
    :return: dict with the keys 'predicted', 'accuracy', 'X', 'Y', 'train_step', 'cost'
    """
    if watch is None:
        watch = WatchUtil()

    log.info('\nbuild_FFNN')
    if not cls.graph_nodes:
        n_hidden3 = n_hidden2 = n_hidden1
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')

        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)
        log.info('n_hidden2: %s' % n_hidden2)
        log.info('n_hidden3: %s' % n_hidden3)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')
        Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
        layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

        W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
        b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
        layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

        W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
        b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
        y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')  # logits, no activation

        # The sigmoid is applied inside the loss, so y_hat stays a logit.
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')

        # Author's note: Adam worked very well - sentences=10000, 4 layers, 10 minutes, accuracy 0.9294, cost 0.1839
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

        # A logit above 0 is the same as a sigmoid output above 0.5.
        predicted = tf.cast(y_hat > 0.0, dtype=tf.float32, name='predicted')
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.\n')
        cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
    return cls.graph_nodes
total_spaces = labels1.count(1) # 정답에 있는 공백 개수 correct = total_spaces - incorrect # 정답에 있는 공백과 같은 곳에 공백이 있는지 if total_spaces == 0: sim = 1 else: sim = correct / total_spaces return sim, correct, total_spaces if __name__ == '__main__': train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE log.info('train_sentences_file: %s' % train_sentences_file) log.info('valid_sentences_file: %s' % valid_sentences_file) log.info('test_sentences_file: %s' % test_sentences_file) characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE log.info('characters_file: %s' % characters_file) try: if len(sys.argv) == 4: n_train = int(sys.argv[1]) left_gram = int(sys.argv[2]) right_gram = int(sys.argv[3]) else: n_train, left_gram, right_gram = 100, 2, 2 # n_train = int('1,000,000'.replace(',', '')) # 1M data (학습: 17시간 소요) ngram = left_gram + right_gram
def create_graph(model_name, scope_name, verbose=False):
    """
    Build a one-layer linear model (y_hat = x * W1 + b1) with a mean squared error cost,
    an Adam train step and the tensorboard summaries.

    The layer sizes come from the module-level names ``input_len`` and ``output_len``.

    :param model_name: not used inside this function
    :param scope_name: variable scope in which the summary ops are created
    :param verbose: log the graph nodes when true
    :return: tuple (x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary)
    """
    float_type = tf.float32

    # model nodes are always created under the fixed scope 'common'
    with tf.variable_scope('common'):
        learning_rate = tf.placeholder(dtype=float_type, name='learning_rate')

        weight_init = tf.random_normal_initializer()
        W1 = tf.get_variable(name='W1', dtype=float_type, shape=[input_len, output_len], initializer=weight_init)
        bias_init = tf.constant(0.0, shape=[output_len])
        b1 = tf.get_variable(name='b1', dtype=float_type, initializer=bias_init)

        x = tf.placeholder(dtype=float_type, shape=[None, input_len], name='x')
        y = tf.placeholder(dtype=float_type, shape=[None, output_len], name='y')

        projected = tf.matmul(x, W1)
        y_hat = tf.add(projected, b1, name='y_hat')
        squared_error = tf.square(y_hat - y)
        cost = tf.reduce_mean(squared_error, name='cost')

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = optimizer.minimize(cost, name='train_step')

    # summaries get their own scope, named by the caller
    with tf.variable_scope(scope_name, reuse=None):
        summary_ops = [
            tf.summary.histogram(values=W1, name='_W1'),
            tf.summary.histogram(values=b1, name='_b1'),
            tf.summary.scalar(tensor=cost, name='_cost'),
        ]
        summary = tf.summary.merge(summary_ops, name='summary')

    if verbose:
        # empty strings produce the blank separator lines
        for node in ('', x, W1, b1, '', y, y_hat, cost):
            log.info(node)

    return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary
input_len = 2 # x1, x2 output_len = 1 # y _learning_rate = 0.01 n_train, n_valid, n_test = 1000, 100, 10 if not os.path.exists(train_file): create_data4add(train_file, n_train, digit_max=99) if not os.path.exists(valid_file): create_data4add(valid_file, n_valid, digit_max=99) if not os.path.exists(test_file): create_data4add(test_file, n_test, digit_max=99) for training_mode in [True, False]: # training & testing for batch_size in [1, 10, 100]: tf.reset_default_graph() # Clears the default graph stack and resets the global default graph. log.info('') log.info('training_mode: %s, batch_size: %s, total_train_time: %s secs' % (training_mode, batch_size, total_train_time)) model_name = os.path.basename(__file__).replace('.py', '') model_file = os.path.join(SAMPLE_MODELS_DIR, '%s.n_train_%s.batch_size_%s.total_train_time_%s/model' % (model_name, n_train, batch_size, total_train_time)) model_dir = os.path.dirname(model_file) log.info('model_name: %s' % model_name) log.info('model_file: %s' % model_file) scope_name = '%s.%s.batch_size_%s.total_train_time_%s' % (model_name, DateUtil.current_yyyymmdd_hhmm(), batch_size, total_train_time) log.info('scope_name: %s' % scope_name) with tf.device('/gpu:0'): with tf.Graph().as_default(): # for reusing graph checkpoint = tf.train.get_checkpoint_state(model_dir) is_training = True if training_mode or not checkpoint else False # learning or testing
if __name__ == '__main__': os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # ignore tensorflow warnings tf.logging.set_verbosity(tf.logging.ERROR) # ignore tensorflow info func = multiply # 다른 데이터 생성 함수로 교체해 볼것 add, average n_features = 2 # x1, x2 n_classes = 1 # y digits = list(range(-99, 100, 1)) n_train, n_test = 4000, 100 # 10% of 200 * 200 x_data = np.random.choice(digits, (n_train + n_test, n_features), replace=True) y_data = func(x_data) x_train, x_test = x_data[:n_train], x_data[n_train:] y_train, y_test = y_data[:n_train], y_data[n_train:] log.info('') log.info('func: %s' % func.__name__) log.info('digits: %s ~ %s ' % (min(digits), max(digits))) log.info('x_train: %s' % str(x_train.shape)) log.info(x_data[:5]) log.info('y_train: %s' % str(y_train.shape)) log.info(y_data[:5]) log.info('x_test: %s' % str(x_test.shape)) log.info('y_test %s' % str(y_test.shape)) valid_check_interval = 0.5 bias_value = 0.0 early_stop_cost = 0.1 # stop learning # default values optimizer = tf.train.AdamOptimizer
def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file, train_sentences_file, valid_sentences_file, test_sentences_file,
                mongo_query=None, limit=None):
    """
    Read documents from MongoDB and save them sentence by sentence.
    (Sentences that contain only one word or no Hangul at all are not extracted;
    this filtering is requested from ``HangulUtil.text2sentences`` through its flags.)

    Documents with at least 10 sentences are split: the first tenth goes to the test file,
    the next tenth to the validation file and the rest to the train file.
    Shorter documents go to the train file as a whole.

    :param mongo_url: mongodb://~~~
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param sentences_file: gzip text file that receives every extracted sentence
    :param characters_file: text file that receives the sorted set of characters seen, one per line
    :param info_file: text file that receives the summary counts
    :param urls_file: text file that receives the url of every document that yielded sentences
    :param train_sentences_file: gzip text file for the train split
    :param valid_sentences_file: gzip text file for the validation split
    :param test_sentences_file: gzip text file for the test split
    :param mongo_query: default={}
    :param limit: maximum number of documents; no limit is passed to ``find`` when falsy
    :return: None
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total_docs = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

    # Every output needs its parent directory (dirname). basename would create a directory
    # named like the output file itself and leave the real parent missing.
    output_files = (sentences_file, characters_file, info_file, urls_file,
                    train_sentences_file, valid_sentences_file, test_sentences_file)
    for output_file in output_files:
        output_dir = os.path.dirname(output_file)
        if output_dir:  # empty for a bare file name in the current directory
            os.makedirs(output_dir, exist_ok=True)

    # The sentence files are written as utf8 explicitly because they are read back with encoding='utf8'.
    with gzip.open(sentences_file, 'wt', encoding='utf8') as out_f, \
            gzip.open(train_sentences_file, 'wt', encoding='utf8') as train_f, \
            gzip.open(valid_sentences_file, 'wt', encoding='utf8') as valid_f, \
            gzip.open(test_sentences_file, 'wt', encoding='utf8') as test_f, \
            open(info_file, 'wt') as info_f, \
            open(urls_file, 'wt') as urls_f:

        char_set = set()
        n_docs = n_total = n_train = n_valid = n_test = 0
        if limit:
            cursor = corpus_mongo.find(mongo_query, limit=limit)
        else:
            cursor = corpus_mongo.find(mongo_query)

        file_name = os.path.basename(sentences_file)
        for i, row in enumerate(cursor, 1):
            if i % 1000 == 0:
                percent = i / total_docs * 100 if total_docs else 0.0
                log.info('%s %.1f%% writed.' % (file_name, percent))

            sentences = []
            for c in row['content']:
                sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True, remove_markdown=True))

            log.debug('url: %s, len: %s' % (row['url'], len(sentences)))
            if not sentences:  # nothing usable in this document
                continue

            urls_f.write(row['url'])
            urls_f.write('\n')
            n_docs += 1

            for s in sentences:
                char_set.update(s)  # a str iterates over its characters
            n_total += len(sentences)
            out_f.writelines(s + '\n' for s in sentences)

            if len(sentences) >= 10:  # can split
                test_len = valid_len = len(sentences) // 10
                test_part = sentences[:test_len]
                valid_part = sentences[test_len:test_len + valid_len]
                train_part = sentences[test_len + valid_len:]
            else:  # can't split
                test_part, valid_part, train_part = [], [], sentences

            n_test += len(test_part)
            test_f.writelines(s + '\n' for s in test_part)
            n_valid += len(valid_part)
            valid_f.writelines(s + '\n' for s in valid_part)
            n_train += len(train_part)
            train_f.writelines(s + '\n' for s in train_part)

        char_list = sorted(char_set)
        log.info('writed to %s...' % characters_file)
        with open(characters_file, 'w') as f:
            f.writelines(c + '\n' for c in char_list)
        log.info('writed to %s OK.' % characters_file)

        # The same figures go to the log and to the info file.
        summary = (
            ('total docs: %s', total_docs),
            ('total docs: %s (has hangul sentence)', n_docs),
            ('total sentences: %s (has hangul sentence)', n_total),
            ('train: %s', n_train),
            ('valid: %s', n_valid),
            ('test: %s', n_test),
            ('total characters: %s', len(char_list)),
        )
        for template, count in summary:
            count_str = NumUtil.comma_str(count)
            log.info(template, count_str)
            info_f.write(template % count_str + '\n')
info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs)) info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs)) info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total)) info_f.write('train: %s\n' % NumUtil.comma_str(n_train)) info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid)) info_f.write('test: %s\n' % NumUtil.comma_str(n_test)) info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list))) if __name__ == '__main__': info_file = KO_WIKIPEDIA_ORG_INFO_FILE urls_file = KO_WIKIPEDIA_ORG_URLS_FILE sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE log.info('info_file: %s' % info_file) log.info('urls_file: %s' % urls_file) log.info('sentences_file: %s' % sentences_file) log.info('characters_file: %s' % characters_file) if not os.path.exists(characters_file) or not os.path.exists(sentences_file) or not os.path.exists(info_file) or not os.path.exists(urls_file): try: log.info('create senences file...') TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org', sentences_file=sentences_file, characters_file=characters_file, info_file=info_file, urls_file=urls_file, train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, mongo_query={}) # mongodb -> text file(corpus) log.info('create senences file OK')
def create_graph(tensorboard_scope, mode_scope, input_file, input_len=2, output_len=1, batch_size=1, verbose=True, reuse=None, n_threads=2):
    """
    Build a one-layer linear model (y_hat = x * W1 + b1) fed by ``input_pipeline``,
    with a mean squared error cost, an Adam train step and the tensorboard summaries.

    :param tensorboard_scope: name scope in which the summary ops are created
    :param mode_scope: name scope of the input pipeline and the learning rate placeholder ('train', 'valid', 'test')
    :param input_file: train or valid or test file path
    :param input_len: number of input columns of W1
    :param output_len: number of output columns of W1 and length of b1
    :param batch_size: batch size passed to ``input_pipeline``
    :param verbose: log the graph nodes when true
    :param reuse: ``reuse`` flag of the 'layers1' and 'cost' variable scopes
    :param n_threads: passed to ``input_pipeline``
    :return: tuple (x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary)
    """
    float_type = tf.float32
    layer_no = 1

    # per-mode nodes: not shared between train / valid / test graphs
    with tf.name_scope(mode_scope):
        x, y = input_pipeline([input_file], batch_size=batch_size, delim='\t', splits=3, n_threads=n_threads)
        learning_rate = tf.placeholder(dtype=float_type, name='learning_rate')

    # W and b are looked up through get_variable so they can be shared via `reuse`
    with tf.variable_scope('layers%d' % layer_no, reuse=reuse):
        weight_init = tf.random_normal_initializer()
        W1 = tf.get_variable(name='W1', dtype=float_type, shape=[input_len, output_len], initializer=weight_init)
        bias_init = tf.constant(0.0, shape=[output_len])
        b1 = tf.get_variable(name='b1', dtype=float_type, initializer=bias_init)
        projected = tf.matmul(x, W1)
        y_hat = tf.add(projected, b1, name='y_hat')

    with tf.variable_scope('cost', reuse=reuse):
        squared_error = tf.square(y_hat - y)
        cost = tf.reduce_mean(squared_error, name='cost')
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = optimizer.minimize(cost, name='train_step')

    # summaries: not shared
    with tf.name_scope(tensorboard_scope):
        summary_ops = [
            tf.summary.histogram(values=W1, name='_W1'),
            tf.summary.histogram(values=b1, name='_b1'),
            tf.summary.scalar(tensor=cost, name='_cost'),
        ]
        summary = tf.summary.merge(summary_ops, name='summary')

    if verbose:
        messages = ('', 'mode_scope: %s' % mode_scope, x, W1, b1, y, y_hat, cost, train_step.name)
        for message in messages:
            log.info(message)

    return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary