Python infoの例、nlp4kor_tensorflow.config.log.info Pythonの例

コード例 #1

0

ファイルを表示

    def dump_urls(mongo_url, db_name, collection_name, urls_file, mongo_query=None, limit=0):
        """
        Write the ``url`` field of every matching MongoDB document to a text file, one URL per line.

        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param urls_file: path of the text file to write
        :param mongo_query: query passed to ``find()``. default={}
        :param limit: passed through to ``find()`` as ``limit``
        :return: None
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

        # The directory that has to exist is the file's parent (dirname); basename would
        # create a directory named after the file itself. A bare file name has no parent.
        output_dir = os.path.dirname(urls_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(urls_file, 'wt', encoding='utf8') as out_f:
            for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
                if i % 1000 == 0:
                    # progress report only; guard against an empty collection count
                    percent = i / total * 100 if total else 0.0
                    log.info('%s %.1f%% writed.' % (os.path.basename(urls_file), percent))
                # every row is written, not just the ones that trigger a progress log
                out_f.write(row['url'])
                out_f.write('\n')

コード例 #2

0

ファイルを表示

ファイル: read_korean.py プロジェクト: bage79/nlp4kor-tensorflow

        exit()

    # Session configuration: ask TensorFlow to grow GPU memory usage as needed.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Build the input queue over a single data file; yields (features, labels) batch tensors.
    filenames = [data_file]
    features_batch, labels_batch = input_pipeline(filenames,
                                                  batch_size=batch_size,
                                                  shuffle=shuffle,
                                                  tokens=2)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # Start the queue-runner threads that feed the input pipeline.
        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
        log.info('coordinator: %s' % coordinator)
        log.info('threads: %s, %s' % (len(threads), threads))
        try:
            # Pull at most five batches and log every (feature, label) pair.
            for nth_batch in range(5):
                if coordinator.should_stop():
                    break

                _features_batch, _labels_batch = sess.run(
                    [features_batch, labels_batch])
                log.info('')
                log.info('nth_batch: %s' % nth_batch)
                for _f, _l in zip(_features_batch, _labels_batch):
                    log.info('%s %s' % (_f.decode('utf8'),
                                        _l.decode('utf8')))  # values are bytes; decode for printing
        # NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit and the error is
        # only logged at info level. No coordinator.request_stop()/join(threads) is visible in
        # this excerpt - confirm the threads are shut down further below.
        except:
            log.info(traceback.format_exc())

コード例 #3

0

ファイルを表示

ファイル: dae_mnist.py プロジェクト: bage79/nlp4kor-tensorflow

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from bage_utils.base_util import is_server
from nlp4kor_tensorflow.config import MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR, log

if __name__ == '__main__':
    # NOTE(review): the MNIST input directory is built by joining the data dir with the
    # *model* dir constant - confirm this is intended.
    mnist_data = os.path.join(MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR)  # input
    # Use the GPU when running on a server, otherwise the CPU.
    device2use = '/gpu:0' if is_server() else '/cpu:0'

    # NOTE(review): the directory name below contains a literal U+2264 character, which looks
    # like an encoding artifact - verify against the directory actually used on disk.
    model_file = os.path.join(MNIST_DAE_MODEL_DIR,
                              'dae_mnist_model≤/model')  # .%s' % max_sentences
    log.info('model_file: %s' % model_file)

    # Make sure the directory that will hold the model exists.
    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    image_shape = (28, 28)
    mnist = input_data.read_data_sets(mnist_data, one_hot=True)
    # Train and test images must have the same flattened width.
    assert (mnist.train.images.shape[1] == mnist.test.images.shape[1])
    n_input_dim = mnist.train.images.shape[
        1]  # MNIST data input (img shape: 28*28)
    n_output_dim = n_input_dim  # output has the same width as the input
    n_hidden_1 = 256  # 1st layer num features
    n_hidden_2 = 256  # 2nd layer num features

    log.info('n_input_dim: %s' % n_input_dim)

コード例 #4

0

ファイルを表示

ファイル: word_spacing.py プロジェクト: bage79/nlp4kor-tensorflow

    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
                 learning_rate=0.01, early_stop_cost=0.001):
        """
        Create the word-spacing datasets if they are missing, train the feed-forward network and save the model.

        :param total_epoch: maximum number of passes over the training dataset
        :param n_train: maximum number of sentences read for the train dataset (also part of its file name)
        :param n_valid: maximum number of sentences read for the valid dataset (also part of its file name)
        :param n_test: maximum number of sentences read for the test dataset (also part of its file name)
        :param batch_size: batch size passed to ``train.next_batch()``
        :param left_gram: passed to ``WordSpacing.sentence2features_labels()``
        :param right_gram: passed to ``WordSpacing.sentence2features_labels()``
        :param model_file: path given to ``tf.train.Saver().save()``
        :param features_vector: passed to ``DataSet``; its length determines ``n_features``
        :param labels_vector: passed to ``DataSet``; its length determines ``n_classes``
        :param n_hidden1: hidden layer size passed to ``build_FFNN()``
        :param learning_rate: learning rate passed to ``build_FFNN()``
        :param early_stop_cost: training stops once the validation cost drops below this value
        :return: None
        """
        ngram = left_gram + right_gram
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        # NOTE(review): valid_file uses the '.test.gz' suffix and test_file uses '.valid.gz'.
        # The suffixes look swapped; confirm before renaming, since existing files on disk and
        # other code may rely on these exact names.
        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_valid, left_gram, right_gram))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_test, left_gram, right_gram))
        # If any of the three dataset files is missing, all three are (re)created.
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')
            log.info('create dataset...')

            # (name, source sentences file, max sentences, output dataset file, to_one_hot_vector)
            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

                features, labels = [], []
                with gzip.open(data_file, 'rt', encoding='utf8') as f:
                    for i, line in enumerate(f, 1):
                        # stop after `total` sentences have been read
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% readed. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        # one sentence yields several feature/label pairs
                        _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                        features.extend(_f)
                        labels.extend(_l)

                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        # With fewer than 100,000 training sentences the train dataset doubles as the validation set.
        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)

        train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = 10  # max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('learn...')
            log.info('total: %s' % NumUtil.comma_str(train.size))
            watch.start('learn')
            # start above any reachable cost so the first early-stop checks never trigger
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                # the inner loop broke on early stop; leave the epoch loop as well
                if valid_cost < early_stop_cost:
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                    if valid_cost < early_stop_cost:
                        log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                        break
                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                    # The interval check below is disabled, so the validation cost is
                    # evaluated and logged after every training step.
                    # if step % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
            watch.stop('learn')
            log.info('learn OK.\n')

            # Save the trained variables while the session is still open.
            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info('batch_size: %s' % batch_size)
        log.info(watch.summary())
        log.info('\n')

コード例 #5

0

ファイルを表示

ファイル: word_spacing.py プロジェクト: bage79/nlp4kor-tensorflow

    def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=None):
        """
        Build the feed-forward network (three ReLU hidden layers plus a linear output) once and cache its nodes.

        The nodes are stored in ``cls.graph_nodes``. When that dict is already filled, it is
        returned as is and the arguments of the current call are ignored.

        :param n_features: width of the input placeholder ``X``
        :param n_classes: width of the label placeholder ``Y`` and of the output layer
        :param n_hidden1: size of the first hidden layer; the 2nd and 3rd layers use the same size
        :param learning_rate: learning rate of the Adam optimizer
        :param watch: stopwatch used to time graph creation; a new ``WatchUtil`` is created when omitted
        :return: dict with the keys 'predicted', 'accuracy', 'X', 'Y', 'train_step', 'cost'
        """
        # A `WatchUtil()` default in the signature would be created once at definition time and
        # shared by every call that omits `watch`; create a fresh one per call instead.
        if watch is None:
            watch = WatchUtil()

        log.info('\nbuild_FFNN')
        if not cls.graph_nodes:
            n_hidden3 = n_hidden2 = n_hidden1
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')
            log.info('n_features: %s' % n_features)
            log.info('n_classes: %s' % n_classes)
            log.info('n_hidden1: %s' % n_hidden1)
            log.info('n_hidden2: %s' % n_hidden2)
            log.info('n_hidden3: %s' % n_hidden3)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features], name='X')
            Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
            b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
            layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

            W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
            b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
            layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

            # Output layer produces raw logits; the sigmoid is applied inside the loss.
            W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
            b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
            y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')

            cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')

            # Author's note: very good result with sentences=10000 + layer=4, 10 minutes,
            # accuracy 0.9294, cost: 0.1839
            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

            # logit > 0 is equivalent to sigmoid(logit) > 0.5
            predicted = tf.cast(y_hat > 0.0, dtype=tf.float32, name='predicted')

            accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')
            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.\n')
            cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
        return cls.graph_nodes

コード例 #6

0

ファイルを表示

ファイル: word_spacing.py プロジェクト: bage79/nlp4kor-tensorflow

        total_spaces = labels1.count(1)  # 정답에 있는 공백 개수
        correct = total_spaces - incorrect  # 정답에 있는 공백과 같은 곳에 공백이 있는지

        if total_spaces == 0:
            sim = 1
        else:
            sim = correct / total_spaces
        return sim, correct, total_spaces


if __name__ == '__main__':
    # Input corpora (one sentence file per split) come from project-level constants.
    train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE
    valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE
    test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE
    log.info('train_sentences_file: %s' % train_sentences_file)
    log.info('valid_sentences_file: %s' % valid_sentences_file)
    log.info('test_sentences_file: %s' % test_sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)
    try:
        # Command line: <n_train> <left_gram> <right_gram>; any other argument count uses the defaults.
        if len(sys.argv) == 4:
            n_train = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            n_train, left_gram, right_gram = 100, 2, 2
            # n_train = int('1,000,000'.replace(',', ''))  # 1M data (training takes 17 hours)

        # total context width: characters to the left plus characters to the right
        ngram = left_gram + right_gram

コード例 #7

0

ファイルを表示

def create_graph(model_name, scope_name, verbose=False):
    """
    Build the single-layer linear regression graph together with its tensorboard summaries.

    The model nodes live in the 'common' variable scope; the summary nodes live in ``scope_name``.
    The layer sizes are read from the module-level ``input_len`` / ``output_len``.

    :param model_name: not used inside this function
    :param scope_name: variable scope that holds the summary nodes
    :param verbose: log the graph nodes
    :return: (x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary)
    """
    with tf.variable_scope('common'):  # model nodes
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

        weight_shape = [input_len, output_len]
        W1 = tf.get_variable(name='W1', shape=weight_shape, dtype=tf.float32, initializer=tf.random_normal_initializer())
        bias_init = tf.constant(0.0, shape=[output_len])
        b1 = tf.get_variable(name='b1', dtype=tf.float32, initializer=bias_init)

        x = tf.placeholder(dtype=tf.float32, shape=[None, input_len], name='x')
        y = tf.placeholder(dtype=tf.float32, shape=[None, output_len], name='y')

        # y_hat = x . W1 + b1, trained on the mean squared error
        projected = tf.matmul(x, W1)
        y_hat = tf.add(projected, b1, name='y_hat')
        squared_error = tf.square(y_hat - y)
        cost = tf.reduce_mean(squared_error, name='cost')
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = optimizer.minimize(cost, name='train_step')

    with tf.variable_scope(scope_name, reuse=None):
        weight_hist = tf.summary.histogram(values=W1, name='_W1')
        bias_hist = tf.summary.histogram(values=b1, name='_b1')
        cost_scalar = tf.summary.scalar(tensor=cost, name='_cost')
        summary = tf.summary.merge([weight_hist, bias_hist, cost_scalar], name='summary')  # merge_all()
        if verbose:
            # empty strings act as visual separators in the log
            for node in ('', x, W1, b1, '', y, y_hat, cost):
                log.info(node)
    return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary

コード例 #8

0

ファイルを表示

    # Model dimensions and hyper-parameter for learning y from (x1, x2).
    input_len = 2  # x1, x2
    output_len = 1  # y
    _learning_rate = 0.01

    # Create each data file only when it does not exist yet.
    n_train, n_valid, n_test = 1000, 100, 10
    if not os.path.exists(train_file):
        create_data4add(train_file, n_train, digit_max=99)
    if not os.path.exists(valid_file):
        create_data4add(valid_file, n_valid, digit_max=99)
    if not os.path.exists(test_file):
        create_data4add(test_file, n_test, digit_max=99)

    # First pass trains, second pass tests; each pass runs every batch size.
    for training_mode in [True, False]:  # training & testing
        for batch_size in [1, 10, 100]:
            tf.reset_default_graph()  # Clears the default graph stack and resets the global default graph.
            log.info('')
            log.info('training_mode: %s, batch_size: %s, total_train_time: %s secs' % (training_mode, batch_size, total_train_time))

            # The model directory name encodes the script name and the run settings.
            model_name = os.path.basename(__file__).replace('.py', '')
            model_file = os.path.join(SAMPLE_MODELS_DIR, '%s.n_train_%s.batch_size_%s.total_train_time_%s/model' % (model_name, n_train, batch_size, total_train_time))
            model_dir = os.path.dirname(model_file)
            log.info('model_name: %s' % model_name)
            log.info('model_file: %s' % model_file)

            # The scope name additionally carries the current timestamp.
            scope_name = '%s.%s.batch_size_%s.total_train_time_%s' % (model_name, DateUtil.current_yyyymmdd_hhmm(), batch_size, total_train_time)
            log.info('scope_name: %s' % scope_name)

            with tf.device('/gpu:0'):
                with tf.Graph().as_default():  # for reusing graph
                    checkpoint = tf.train.get_checkpoint_state(model_dir)
                    # Train when asked to, or when no checkpoint exists to test against.
                    is_training = True if training_mode or not checkpoint else False  # learning or testing

コード例 #9

0

ファイルを表示

ファイル: learn_math_functions_demo.py プロジェクト: bage79/nlp4kor-tensorflow

if __name__ == '__main__':
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # ignore tensorflow warnings
    tf.logging.set_verbosity(tf.logging.ERROR)  # ignore tensorflow info

    func = multiply  # try swapping in another data-generating function: add, average
    n_features = 2  # x1, x2
    n_classes = 1  # y
    digits = list(range(-99, 100, 1))
    n_train, n_test = 4000, 100  # 10% of 200 * 200

    # Sample (x1, x2) pairs with replacement, compute the targets, then split into train and test.
    x_data = np.random.choice(digits, (n_train + n_test, n_features), replace=True)
    y_data = func(x_data)
    x_train, x_test = x_data[:n_train], x_data[n_train:]
    y_train, y_test = y_data[:n_train], y_data[n_train:]

    log.info('')
    log.info('func: %s' % func.__name__)
    log.info('digits: %s ~ %s ' % (min(digits), max(digits)))
    log.info('x_train: %s' % str(x_train.shape))
    log.info(x_data[:5])  # the first rows of x_data are also the first rows of x_train
    log.info('y_train: %s' % str(y_train.shape))
    log.info(y_data[:5])  # the first rows of y_data are also the first rows of y_train
    log.info('x_test: %s' % str(x_test.shape))
    log.info('y_test %s' % str(y_test.shape))

    valid_check_interval = 0.5
    bias_value = 0.0
    early_stop_cost = 0.1  # stop learning

    # default values
    optimizer = tf.train.AdamOptimizer

コード例 #10

0

ファイルを表示

    def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file,
                    train_sentences_file, valid_sentences_file, test_sentences_file,
                    mongo_query=None, limit=None):
        """
        Read documents from MongoDB and store them sentence by sentence.
        (Sentences that contain only one word, or no Hangul at all, are not extracted.)

        Documents with at least 10 sentences are split: the first tenth goes to the test file,
        the second tenth to the valid file and the rest to the train file. Shorter documents go
        entirely to the train file.

        :param characters_file: output; every distinct character of the corpus, sorted, one per line
        :param urls_file: output; url of every document that yielded at least one sentence
        :param info_file: output; summary counts
        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param sentences_file: output; all sentences (gzip)
        :param train_sentences_file: output; train split (gzip)
        :param valid_sentences_file: output; valid split (gzip)
        :param test_sentences_file: output; test split (gzip)
        :param mongo_query: default={}
        :param limit: passed to ``find()`` only when truthy
        :return: None
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total_docs = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

        # Each output needs its *parent* directory (dirname); basename would create a directory
        # named after the file itself. A bare file name has no parent to create.
        for output_file in (sentences_file, characters_file, info_file, urls_file,
                            train_sentences_file, valid_sentences_file, test_sentences_file):
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        # The corpus is Korean text, so the encoding is fixed instead of relying on the platform default.
        with gzip.open(sentences_file, 'wt', encoding='utf8') as out_f, \
                gzip.open(train_sentences_file, 'wt', encoding='utf8') as train_f, \
                gzip.open(valid_sentences_file, 'wt', encoding='utf8') as valid_f, \
                gzip.open(test_sentences_file, 'wt', encoding='utf8') as test_f, \
                open(info_file, 'wt', encoding='utf8') as info_f, \
                open(urls_file, 'wt', encoding='utf8') as urls_f:

            char_set = set()
            n_docs = n_total = n_train = n_valid = n_test = 0
            if limit:
                cursor = corpus_mongo.find(mongo_query, limit=limit)
            else:
                cursor = corpus_mongo.find(mongo_query)

            for i, row in enumerate(cursor, 1):
                if i % 1000 == 0:
                    # progress report only; guard against a zero document count
                    percent = i / total_docs * 100 if total_docs else 0.0
                    log.info('%s %.1f%% writed.' % (os.path.basename(sentences_file), percent))

                sentences = []
                for c in row['content']:
                    sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True, remove_markdown=True))

                log.debug('url: %s, len: %s' % (row['url'], len(sentences)))
                if not sentences:
                    continue

                urls_f.write(row['url'])
                urls_f.write('\n')
                n_docs += 1

                for s in sentences:
                    char_set.update(s)  # a str iterates over its characters

                    n_total += 1
                    out_f.write(s)
                    out_f.write('\n')

                # Documents with fewer than 10 sentences cannot be split: holdout_len is 0, so the
                # test and valid slices are empty and every sentence lands in train.
                holdout_len = len(sentences) // 10 if len(sentences) >= 10 else 0
                test_part = sentences[:holdout_len]
                valid_part = sentences[holdout_len:2 * holdout_len]
                train_part = sentences[2 * holdout_len:]
                for split_f, part in ((test_f, test_part), (valid_f, valid_part), (train_f, train_part)):
                    for s in part:
                        split_f.write(s)
                        split_f.write('\n')
                n_test += len(test_part)
                n_valid += len(valid_part)
                n_train += len(train_part)

            char_list = sorted(char_set)
            log.info('writed to %s...' % characters_file)
            with open(characters_file, 'w', encoding='utf8') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
            log.info('writed to %s OK.' % characters_file)

            log.info('total docs: %s', NumUtil.comma_str(total_docs))
            log.info('total docs: %s (has hangul sentence)', NumUtil.comma_str(n_docs))
            log.info('total sentences: %s (has hangul sentence)', NumUtil.comma_str(n_total))
            log.info('train: %s', NumUtil.comma_str(n_train))
            log.info('valid: %s', NumUtil.comma_str(n_valid))
            log.info('test: %s', NumUtil.comma_str(n_test))
            log.info('total characters: %s', NumUtil.comma_str(len(char_list)))

            info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
            info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
            info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
            info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
            info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
            info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
            info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))

コード例 #11

0

ファイルを表示

            info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
            info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
            info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
            info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
            info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
            info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
            info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))


if __name__ == '__main__':
    # Output locations come from project-level constants.
    info_file = KO_WIKIPEDIA_ORG_INFO_FILE
    urls_file = KO_WIKIPEDIA_ORG_URLS_FILE
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('info_file: %s' % info_file)
    log.info('urls_file: %s' % urls_file)
    log.info('sentences_file: %s' % sentences_file)
    log.info('characters_file: %s' % characters_file)

    # Dump the corpus again as soon as any one of the four output files is missing.
    if not os.path.exists(characters_file) or not os.path.exists(sentences_file) or not os.path.exists(info_file) or not os.path.exists(urls_file):
        try:
            log.info('create senences file...')
            TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org', sentences_file=sentences_file,
                                       characters_file=characters_file,
                                       info_file=info_file, urls_file=urls_file,
                                       train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE,
                                       valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE,
                                       test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE,
                                       mongo_query={})  # mongodb -> text file(corpus)
            log.info('create senences file OK')

コード例 #12

0

ファイルを表示

ファイル: learn_add_with_queue.py プロジェクト: bage79/nlp4kor-tensorflow

def create_graph(tensorboard_scope,
                 mode_scope,
                 input_file,
                 input_len=2,
                 output_len=1,
                 batch_size=1,
                 verbose=True,
                 reuse=None,
                 n_threads=2):
    """
    Build (or reuse the variables of) the single-layer linear regression graph fed by an input queue.

    :param tensorboard_scope: name scope that holds the summary nodes
    :param mode_scope: 'train', 'valid', 'test'
    :param input_file: train or valid or test file path
    :param input_len: x1, x2
    :param output_len: y
    :param batch_size: batch size > 0
    :param verbose: log the graph nodes
    :param reuse: passed to the variable scopes; reuse existing variables or not
    :param n_threads: number of example enqueue threads (2 is enough)
    :return: (x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary)
    """

    with tf.name_scope(mode_scope):  # one name scope per mode
        x, y = input_pipeline([input_file],
                              batch_size=batch_size,
                              delim='\t',
                              splits=3,
                              n_threads=n_threads)
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

        with tf.variable_scope('layers%d' % 1, reuse=reuse):  # W, b follow `reuse`
            weight_shape = [input_len, output_len]
            W1 = tf.get_variable(name='W1',
                                 shape=weight_shape,
                                 dtype=tf.float32,
                                 initializer=tf.random_normal_initializer())
            bias_init = tf.constant(0.0, shape=[output_len])
            b1 = tf.get_variable(name='b1',
                                 dtype=tf.float32,
                                 initializer=bias_init)

        # y_hat = x . W1 + b1, trained on the mean squared error
        projected = tf.matmul(x, W1)
        y_hat = tf.add(projected, b1, name='y_hat')
        with tf.variable_scope('cost', reuse=reuse):
            squared_error = tf.square(y_hat - y)
            cost = tf.reduce_mean(squared_error, name='cost')
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            train_step = optimizer.minimize(cost, name='train_step')

    with tf.name_scope(tensorboard_scope):  # summaries get their own name scope
        weight_hist = tf.summary.histogram(values=W1, name='_W1')
        bias_hist = tf.summary.histogram(values=b1, name='_b1')
        cost_scalar = tf.summary.scalar(tensor=cost, name='_cost')
        summary = tf.summary.merge([weight_hist, bias_hist, cost_scalar],
                                   name='summary')  # tf.summary.merge_all()

    nodes = (x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary)
    if not verbose:
        return nodes

    log.info('')
    log.info('mode_scope: %s' % mode_scope)
    for node in (x, W1, b1, y, y_hat, cost, train_step.name):
        log.info(node)
    return nodes