Example #1
    def __build_FFNN_layers2(cls,
                             n_features,
                             n_classes,
                             n_hidden1,
                             learning_rate,
                             watch=WatchUtil()):
        if len(cls.graph_nodes) == 0:
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')
            log.info('n_features: %s' % n_features)
            log.info('n_classes: %s' % n_classes)
            log.info('n_hidden1: %s' % n_hidden1)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features],
                               name='X')  # two characters
            Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]),
                             name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.sigmoid(tf.matmul(X, W1) + b1, name='layer1')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_classes]),
                             name='W2')
            b2 = tf.Variable(tf.random_normal([n_classes]), name='b2')
            hypothesis = tf.sigmoid(tf.matmul(layer1, W2) + b2,
                                    name='hypothesis')

            cost = -tf.reduce_mean(Y * tf.log(hypothesis) +
                                   (1 - Y) * tf.log(1 - hypothesis),
                                   name='cost')  # cost/loss function

            # train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)  # Too bad. sentences=10000 + layer=2, 20 min, Accuracy: 0.689373, cost: 0.8719
            train_step = tf.train.AdamOptimizer(
                learning_rate=learning_rate
            ).minimize(
                cost
            )  # Very good!! sentences=10000 + layer=2, 10 min, accuracy 0.9194, cost: 0.2139

            predicted = tf.cast(hypothesis > 0.5,
                                dtype=tf.float32,
                                name='predicted')  # 0 <= hypothesis <= 1
            accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y),
                                              dtype=tf.float32),
                                      name='accuracy')
            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.\n')
            cls.graph_nodes = {
                'hypothesis': hypothesis,
                'predicted': predicted,
                'accuracy': accuracy,
                'X': X,
                'Y': Y,
                'train_step': train_step,
                'cost': cost
            }
        return cls.graph_nodes
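The dictionary returned by this builder can be wired directly into a session. The sketch below is a minimal, hypothetical usage: the batch arrays are random placeholders, and build_FFNN from Example #3 is assumed to dispatch here when layers=2.

# Hypothetical usage sketch of the returned graph nodes (random data, illustration only).
import numpy as np
import tensorflow as tf

graph = WordSpacing.build_FFNN(n_features=200, n_classes=1, n_hidden1=100, learning_rate=0.01, layers=2)
features_batch = np.random.rand(32, 200).astype(np.float32)               # placeholder batch
labels_batch = np.random.randint(0, 2, size=(32, 1)).astype(np.float32)   # placeholder labels

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        _, cost_value = sess.run([graph['train_step'], graph['cost']],
                                 feed_dict={graph['X']: features_batch, graph['Y']: labels_batch})
    accuracy_value = sess.run(graph['accuracy'],
                              feed_dict={graph['X']: features_batch, graph['Y']: labels_batch})
    print(cost_value, accuracy_value)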
Example #2
    def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):  # TODO: 2 layers
        log.info('\nbuild_FFNN')
        if len(cls.graph_nodes) == 0:
            n_hidden3 = n_hidden2 = n_hidden1
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')
            log.info('n_features: %s' % n_features)
            log.info('n_classes: %s' % n_classes)
            log.info('n_hidden1: %s' % n_hidden1)
            log.info('n_hidden2: %s' % n_hidden2)
            log.info('n_hidden3: %s' % n_hidden3)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features], name='X')  # two characters
            Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

            # W1 = tf.Variable(tf.truncated_normal([n_features, n_hidden1], mean=0.0, stddev=0.1), name='W1')
            # b1 = tf.Variable(tf.constant(0.1, shape=[n_hidden1]), name='b1')
            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
            b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
            layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

            W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
            b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
            layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

            W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
            b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
            y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')

            # cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function
            cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')

            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
                cost)  # Very Very good!! sentences=10000 + layer=4, 10 min, accuracy 0.9294, cost: 0.1839

            predicted = tf.cast(tf.sigmoid(y_hat) > 0.5, dtype=tf.float32, name='predicted')  # y_hat is a logit; apply sigmoid before thresholding at 0.5

            accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.\n')
            cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
        return cls.graph_nodes
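Compared with Example #1, this version keeps y_hat as raw logits and uses tf.nn.sigmoid_cross_entropy_with_logits instead of the hand-written -mean(Y*log(h) + (1-Y)*log(1-h)) cost, which avoids log(0) when the sigmoid saturates. A small self-contained check of why the two agree (pure NumPy, illustrative values only):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

logits = np.array([-8.0, -1.0, 0.0, 1.0, 8.0])
labels = np.array([0.0, 1.0, 1.0, 0.0, 1.0])

# Naive form: -(y*log(p) + (1-y)*log(1-p)) with p = sigmoid(logits); can hit log(0) for large |logits|.
p = sigmoid(logits)
naive = -(labels * np.log(p) + (1 - labels) * np.log(1 - p))

# Numerically stable form documented for tf.nn.sigmoid_cross_entropy_with_logits:
# max(x, 0) - x*z + log(1 + exp(-|x|))
stable = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

print(np.allclose(naive, stable))  # True for these moderate logits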
Example #3
    def build_FFNN(cls,
                   n_features,
                   n_classes,
                   n_hidden1,
                   learning_rate,
                   watch=WatchUtil(),
                   layers=4):
        log.info('\nbuild_FFNN(layers=%s)' % layers)
        if layers == 2:
            return cls.__build_FFNN_layers2(n_features,
                                            n_classes,
                                            n_hidden1,
                                            learning_rate,
                                            watch=watch)
        else:
            return cls.__build_FFNN_layers4(n_features,
                                            n_classes,
                                            n_hidden1,
                                            learning_rate,
                                            watch=watch)
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
                 learning_rate=0.01, early_stop_cost=0.001):
        ngram = left_gram + right_gram
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes; binary classification uses a single output unit

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

                features, labels = [], []
                with gzip.open(data_file, 'rt', encoding='utf8') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                        features.extend(_f)
                        labels.extend(_l)

                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)

        train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = 10  # max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('learn...')
            log.info('total: %s' % NumUtil.comma_str(train.size))
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                    if valid_cost < early_stop_cost:
                        log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                        break
                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                    # if step % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info('batch_size: %s' % batch_size)
        log.info(watch.summary())
        log.info('\n')
        log.info('sample testing...')
        for s in ['아버지가 방에 들어 가신다.', '가는 말이 고와야 오는 말이 곱다.']:  # illustrative sample sentences
            features, labels = WordSpacing.sentence2features_labels(s, left_gram=left_gram, right_gram=right_gram)
            log.info('%s -> %s' % (features, labels))
            log.info('in : "%s"' % s)
            log.info('out: "%s"' % WordSpacing.spacing(s.replace(' ', ''), labels))
        log.info('sample testing OK.\n')

        if not os.path.exists(model_file + '.index') or not os.path.exists(model_file + '.meta'):
            if n_train >= int('100,000'.replace(',', '')):
                SlackUtil.send_message('%s start (max_sentences=%s, left_gram=%s, right_gram=%.1f)' % (sys.argv[0], n_train, left_gram, right_gram))
            WordSpacing.learning(total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector,
                                 n_hidden1=n_hidden1,
                                 learning_rate=learning_rate, early_stop_cost=early_stop_cost)
            if n_train >= int('100,000'.replace(',', '')):
                SlackUtil.send_message('%s end (max_sentences=%s, left_gram=%s, right_gram=%.1f)' % (sys.argv[0], n_train, left_gram, right_gram))

        log.info('check result...')
        watch = WatchUtil()
        watch.start('read sentences')

        sentences = []  # e.g. ['아버지가 방에 들어 가신다.', '가는 말이 고와야 오는 말이 곱다.']
        max_test_sentences = 100

        if n_train >= int('100,000'.replace(',', '')):
            sentences_file = test_sentences_file
        else:
            sentences_file = train_sentences_file

        with gzip.open(sentences_file, 'rt', encoding='utf8') as f:
            for i, line in enumerate(f, 1):
                if len(sentences) >= max_test_sentences:
                    break
Example #6
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector,
                 n_hidden1,
                 learning_rate,
                 dropout_keep_rate, early_stop_cost=0.001):
        n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

        log.info('load characters list...')
        log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))

        log.info('train_file: %s' % train_file)
        log.info('valid_file: %s' % valid_file)
        log.info('test_file: %s' % test_file)
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')  # FIXME: out of memory (1M sentences)
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
                log.info('noise_rate: %s' % noise_rate)

                features, labels = [], []
                with gzip.open(data_file, 'rt') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        sentence = line.strip()
                        for start in range(0, len(sentence) - window_size + 1):  # generate noise (a blank) per character position
                            chars = sentence[start: start + window_size]
                            for idx in range(len(chars)):
                                noised_chars = StringUtil.replace_with_index(chars, ' ', idx)
                                features.append(noised_chars)
                                labels.append(chars)
                                log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                # log.info('noise_sampling: %s' % noise_sampling)
                #         for nth_sample in range(noise_sampling): # generate noise at the jamo level (initial/medial/final)
                #             for start in range(0, len(sentence) - window_size + 1):
                #                 chars = sentence[start: start + window_size]
                #                 noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                #                 if chars == noised_chars:
                #                     continue
                #                 if i % check_interval == 0 and nth_sample == 0:
                #                     log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                #                 features.append(noised_chars)
                #                 labels.append(chars)

                # print('dataset features:', features)
                # print('dataset labels:', labels)
                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1,
                                                                                                       learning_rate, watch)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('')
            log.info('learn...')
            log.info('total_epoch: %s' % total_epoch)
            log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
            log.info('check_interval: %s' % check_interval)
            log.info('batch_size: %s' % batch_size)
            log.info('total_input: %s (total_epoch * train.size)' % total_input)
            log.info('')
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                    if valid_cost < early_stop_cost:
                        break

                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                    # if nth_train % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))

            watch.stop('learn')
            log.info('learn OK.')
            log.info('')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('')
        log.info('total_epoch: %s' % total_epoch)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        log.info(watch.summary())
        log.info('')
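One thing to note in Examples #3 and #6: check_interval is computed, but the gating line is commented out, so the validation cost is evaluated on the full validation set after every single batch. Below is a hedged sketch of the presumably intended pattern, reusing the names from the inner loop above; it is a fragment meant to replace that loop body, not standalone code.

                    # Sketch: validate only every check_interval batches instead of after each one.
                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                    if nth_train % check_interval == 1:
                        percent = nth_input / total_input * 100
                        valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                        log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))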
Example #7
    def build_DAE(cls, n_features, window_size, noise_rate, n_hidden1, learning_rate, watch=WatchUtil()):
        if len(cls.graph) == 0:
            log.info('')
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')

            features_vector_size = n_features // window_size
            log.info('n_features: %s' % n_features)
            log.info('window_size: %s' % window_size)
            log.info('features_vector_size: %s' % features_vector_size)

            log.info('noise_rate: %.1f' % noise_rate)
            log.info('n_hidden1: %s' % n_hidden1)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features], name='X')  # shape=(batch_size, window_size * feature_vector.size)
            Y = tf.placeholder(tf.float32, [None, n_features], name='Y')  # shape=(batch_size, window_size * feature_vector.size)
            dropout_keep_prob = tf.placeholder(tf.float32)

            # layers = 3
            # n_hidden2 = n_hidden1
            # W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            # b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            # layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
            # layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')
            #
            # W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
            # b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
            # layer2 = tf.nn.sigmoid(tf.matmul(layer1_dropout, W2) + b2, name='layer2')
            # layer2_dropout = tf.nn.dropout(layer2, dropout_keep_prob, name='layer2_dropout')
            #
            # W3 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W3')
            # b3 = tf.Variable(tf.random_normal([n_features]), name='b3')
            # y_hat = tf.add(tf.matmul(layer2_dropout, W3), b3, name='y_hat')

            # layers = 2
            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
            layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W2')
            b2 = tf.Variable(tf.random_normal([n_features]), name='b2')
            y_hat = tf.add(tf.matmul(layer1_dropout, W2), b2, name='y_hat')  # shape=(batch_size, window_size * feature_vector.size)

            labels_hat = tf.reshape(y_hat, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)
            labels = tf.reshape(Y, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)

            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=labels_hat, labels=labels), name='cost')
            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

            accuracy = tf.reduce_mean(tf.cast(tf.abs(tf.nn.softmax(y_hat) - Y) < 0.1, dtype=tf.float32), name='accuracy')
            # log.debug('X:', X)
            # log.debug('Y:', Y)
            # log.debug('y_hat:', y_hat)
            # log.debug('labels_hat:', labels_hat)
            # log.debug('labels:', labels)
            # log.debug('cost:', cost)
            # log.debug('accuracy:', accuracy)

            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.')
            log.info('')
            cls.graph = {'X': X, 'Y': Y, 'dropout_keep_prob': dropout_keep_prob,
                         'train_step': train_step, 'cost': cost, 'y_hat': y_hat, 'accuracy': accuracy, }
        return cls.graph['X'], cls.graph['Y'], cls.graph['dropout_keep_prob'], \
               cls.graph['train_step'], cls.graph['cost'], cls.graph['y_hat'], cls.graph['accuracy']
Example #8
                        if is_training:  # training
                            x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph(model_name, scope_name, verbose=False)
                            train_x_batch, train_y_batch = input_pipeline([train_file], batch_size=batch_size, delim='\t', splits=3)
                            valid_x_batch, valid_y_batch = input_pipeline([valid_file], batch_size=n_valid, delim='\t', splits=3)

                            sess.run(tf.global_variables_initializer())
                            saver = tf.train.Saver(max_to_keep=None)

                            train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)
                            valid_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/valid', sess.graph)

                            coordinator = tf.train.Coordinator()  # coordinator for enqueue threads
                            threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)  # start filename queue
                            batch_count = math.ceil(n_train / batch_size)  # batch count for one epoch
                            try:
                                watch = WatchUtil()
                                stop_timer = TimerUtil(interval_secs=total_train_time)
                                valid_timer = TimerUtil(interval_secs=valid_check_interval)
                                watch.start()
                                stop_timer.start()
                                valid_timer.start()

                                nth_batch, min_valid_epoch, min_valid_cost = 0, 0, 1e10
                                epoch, running = 0, True
                                while running:
                                    epoch += 1
                                    for i in range(1, batch_count + 1):
                                        if stop_timer.is_over():
                                            running = False
                                            break
Example #9
    def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding,
              args: argparse.Namespace) -> str:
        batches_in_epoch = int(numpy.ceil(
            len(self.dataloader.dataset) / batch))
        total_batches = batches_in_epoch * iterations
        nth_total_batch = 0
        log.info(f'batches_in_epoch: {batches_in_epoch}')
        log.info(f'total_batches: {total_batches}')

        watch = WatchUtil(auto_stop=False)
        watch.start()
        best_loss = float("inf")
        first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
        last_embedding_file = None

        log.info(Word2VecEmbedding.get_filenpath(args))
        for self.epoch in range(first_epoch, last_epoch):
            log.info(f"[e{self.epoch:2d}] {self}")
            loss_list = []
            for nth, (iword, owords) in enumerate(self.dataloader, 1):
                try:
                    loss = self.sgns(iword, owords)
                except RuntimeError:
                    loss_list = [float('-inf')]
                    break

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                # if nth_batch == 1 and self.scheduler is not None and self.epoch >= self.decay_start_epoch:  # TODO: TEST
                #     self.scheduler.step()

                if self.learning_decay != 0:
                    PytorchUtil.set_learning_rate(self.optim,
                                                  self.epoch,
                                                  gamma=self.learning_decay,
                                                  base_lr=self.init_lr,
                                                  min_lr=1e-10,
                                                  decay_start=2,
                                                  decay_interval=3)

                lr = PytorchUtil.get_learning_rate(self.optim)

                _, negatives = owords.size()
                real_loss = loss.data[0] / float(negatives)

                loss_list.append(real_loss)

                nth_total_batch += 1
                progressed = nth_total_batch / total_batches
                seconds_per_batch = float(
                    watch.elapsed()) / float(nth_total_batch)
                remain_batches = total_batches - nth_total_batch
                remain_secs = int(seconds_per_batch * remain_batches)

                if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}"
                    )

            total_loss = numpy.mean(loss_list)
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}"
            )
            if total_loss > best_loss or total_loss == float(
                    'inf') or total_loss == float(
                        '-inf'):  # loss is worse than before, or diverged
                log.info('')
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK"
                )
                log.info('')
                break
            else:
                if total_loss < best_loss:
                    best_loss = total_loss
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()..."
                )
                args.epoch = self.epoch
                last_embedding_file = embedding.save(
                    idx2vec=trainer.embedding,
                    filepath=Word2VecEmbedding.get_filenpath(args))
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}"
                )
        return last_embedding_file
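PytorchUtil.set_learning_rate and get_learning_rate are project utilities that are not shown in these examples. Judging only from the argument names (gamma, base_lr, min_lr, decay_start, decay_interval) and the `if self.learning_decay != 0` guard above, one plausible interpretation is a step decay with a multiplicative gamma, as in torch.optim.lr_scheduler.StepLR. The sketch below is an assumption about that behavior, not the project's actual implementation.

# Assumed step-decay schedule; the real PytorchUtil helpers may behave differently.
def set_learning_rate(optim, epoch, gamma, base_lr, min_lr=1e-10, decay_start=2, decay_interval=3):
    if epoch < decay_start:
        lr = base_lr
    else:
        steps = (epoch - decay_start) // decay_interval + 1   # number of decay steps so far
        lr = max(min_lr, base_lr * (gamma ** steps))
    for param_group in optim.param_groups:                    # works with any torch.optim optimizer
        param_group['lr'] = lr
    return lr

def get_learning_rate(optim):
    return optim.param_groups[0]['lr']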
Example #10
                        default=Word2VecEmbedding.SUBSAMPLE,
                        type=float,
                        help="subsample threshold (default: 1e-5)")

    parser.add_argument('--learning_rate',
                        default=Word2VecEmbedding.LEARNING_RATE,
                        type=float,
                        help="learning rate for AdamOptimizer")
    parser.add_argument('--learning_decay',
                        default=Word2VecEmbedding.LEARNING_DECAY,
                        type=float,
                        help="exponential decay gamma (default: 0.0=no decay)")
    args = parser.parse_args()
    log.info(args)

    watch = WatchUtil(auto_stop=True)

    try:
        log.info(f'load {args.corpus_file} ...')
        watch.start()
        corpus = Word2VecCorpus.load(filepath=args.corpus_file)
        log.info(
            f'load {args.corpus_file} OK. (elapsed: {watch.elapsed_string()})')
        log.info(corpus.vocab)

        if len(corpus.vocab) > 1e5:  # out of memory (11GB GPU memory)
            args.device_no = None

        log.info('')
        log.info(args)
        log.info('')
Example #11
                            train_writer = tf.summary.FileWriter(
                                TENSORBOARD_LOG_DIR + '/train', sess.graph)
                            valid_writer = tf.summary.FileWriter(
                                TENSORBOARD_LOG_DIR + '/valid', sess.graph)

                            coordinator = tf.train.Coordinator(
                            )  # coordinator for enqueue threads
                            threads = tf.train.start_queue_runners(
                                sess=sess,
                                coord=coordinator)  # start filename queue
                            batch_count = math.ceil(
                                n_train /
                                batch_size)  # batch count for one epoch
                            try:
                                watch = WatchUtil()
                                stop_timer = TimerUtil(
                                    interval_secs=total_train_time)
                                valid_timer = TimerUtil(
                                    interval_secs=valid_check_interval)
                                watch.start()
                                stop_timer.start()
                                valid_timer.start()

                                nth_batch, min_valid_epoch, min_valid_cost = 0, 0, 1e10
                                epoch, running = 0, True
                                while running:
                                    epoch += 1
                                    for i in range(1, batch_count + 1):
                                        if stop_timer.is_over():
                                            running = False
    log.info('weights_initializer: %s' % weights_initializer.__name__)
    log.info('learning_rate: %.4f' % learning_rate)
    log.info('train_time: %s' % train_time)

    how_many_trains = 3 if train_time < 10 else 1
    log.info('how_many_trains: %s' % how_many_trains)
    for _ in range(how_many_trains):
        time.sleep(1)
        tf.reset_default_graph()  # Clears the default graph stack and resets the global default graph.
        tf.set_random_seed(7942)  # 3. make the results reproducible (cost: 600-700)

        scope_name = '%s.%s' % (func.__name__, DateUtil.current_yyyymmdd_hhmmss())
        x, y, y_hat, cost, rsme, train_step, summary = build_graph(scope_name, n_features, n_hiddens, n_classes, learning_rate, activation=activation, weights_initializer=weights_initializer,
                                                                   bias_value=bias_value)
        try:
            watch = WatchUtil()

            model_file_saved = False
            model_file = os.path.join(MODELS_DIR, '%s_%s/model' % (os.path.basename(__file__.replace('.py', '')), func.__name__))
            model_dir = os.path.dirname(model_file)
            # log.info('model_file: %s' % model_file)
            if not os.path.exists(model_dir):
                # log.info('model_dir: %s' % model_dir)
                os.makedirs(model_dir)

            config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
            saver = tf.train.Saver()
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())

                train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)
Example #13
                        allow_growth=True))
                    with tf.Session(config=config) as sess:
                        sess.run(tf.global_variables_initializer())
                        saver = tf.train.Saver(max_to_keep=None)

                        if is_training:  # training
                            train_writer = tf.summary.FileWriter(
                                TENSORBOARD_LOG_DIR + '/train', sess.graph)
                            valid_writer = tf.summary.FileWriter(
                                TENSORBOARD_LOG_DIR + '/valid', sess.graph)

                            batch_count = math.ceil(
                                n_train /
                                batch_size)  # batch count for one epoch
                            try:
                                watch = WatchUtil()
                                stop_timer = TimerUtil(
                                    interval_secs=total_train_time)
                                valid_timer = TimerUtil(
                                    interval_secs=valid_check_interval)
                                watch.start()
                                stop_timer.start()
                                valid_timer.start()

                                nth_batch, min_valid_epoch, min_valid_cost = 0, 0, 1e10
                                epoch, running = 0, True
                                while running:
                                    epoch += 1
                                    for _x_batch, _y_batch in next_batch_in_memory(
                                            'train'):
                                        if stop_timer.is_over():
Example #14
    def learning(cls,
                 sentences_file,
                 batch_size,
                 left_gram,
                 right_gram,
                 model_file,
                 features_vector,
                 labels_vector,
                 n_hidden1=100,
                 max_sentences=0,
                 learning_rate=0.01,
                 layers=2):
        ngram = left_gram + right_gram
        n_features = len(
            features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(
            labels_vector) >= 3 else 1  # binary classification uses a single output unit

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' %
                 NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(
            KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
            'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' %
            (max_sentences, left_gram, right_gram))
        validation_file = train_file.replace('.train.', '.validation.')
        test_file = train_file.replace('.train.', '.test.')
        if not os.path.exists(train_file) or not os.path.exists(
                validation_file) or not os.path.exists(test_file):
            watch.start('create dataset')
            log.info('create dataset...')
            features, labels = [], []
            check_interval = min(10000, math.ceil(max_sentences))
            log.info('total: %s' % NumUtil.comma_str(max_sentences))

            with gzip.open(sentences_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if max_sentences < i:
                        break

                    if i % check_interval == 0:
                        log.info(
                            'create dataset... %.1f%% read. data len: %s' %
                            (i / max_sentences * 100,
                             NumUtil.comma_str(len(features))))

                    _f, _l = WordSpacing.sentence2features_labels(
                        line.strip(),
                        left_gram=left_gram,
                        right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features,
                              labels=labels,
                              features_vector=features_vector,
                              labels_vector=labels_vector,
                              name='all')
            log.info('dataset: %s' % dataset)
            log.info('create dataset OK.\n')
            watch.stop('create dataset')

            watch.start('dataset save')
            log.info('split to train, test, validation...')
            datasets = DataSets.to_datasets(dataset,
                                            test_rate=0.1,
                                            valid_rate=0.1,
                                            test_max=10000,
                                            valid_max=1000,
                                            shuffle=True)
            train, test, validation = datasets.train, datasets.test, datasets.validation
            log.info(train)
            log.info(test)
            log.info(validation)
            # log.info('%s %s' % (test.features[0], test.labels[0]))
            log.info('split to train, test, validation OK.\n')

            log.info('dataset save... %s' % train_file)
            train.save(train_file, verbose=True)  # save as text
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % validation_file)
            validation = validation.convert_to_one_hot_vector(
                verbose=True)  # save as vector
            validation.save(validation_file, verbose=True)
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % test_file)
            test = test.convert_to_one_hot_vector(verbose=True)
            test.save(test_file, verbose=True)  # save as vector
            log.info('dataset save OK.\n')
            watch.stop('dataset save')
        else:
            watch.start('dataset load')
            log.info('dataset load...')
            train = DataSet.load(train_file, verbose=True)
            validation = DataSet.load(validation_file, verbose=True)
            test = DataSet.load(test_file, verbose=True)
            log.info(train)
            log.info(validation)
            log.info(test)
            log.info('dataset load OK.\n')
            watch.stop('dataset load')

        log.info('check samples...')
        for i, (features_batch, labels_batch) in enumerate(
                train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
            if i > 2:
                break
            for a, b in zip(features_batch, labels_batch):
                feature, label = a, b
                _feature = feature.reshape((ngram, len(features_vector)))
                chars = ''.join(features_vector.to_values(_feature))
                has_space = np.argmax(label)
                log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' %
                         (i, chars, has_space, feature, len(feature), label,
                          len(label)))
        log.info('check samples OK.\n')

        graph = WordSpacing.build_FFNN(n_features,
                                       n_classes,
                                       n_hidden1,
                                       learning_rate,
                                       watch,
                                       layers=layers)

        train_step, X, Y, cost, hypothesis, predicted, accuracy = graph[
            'train_step'], graph['X'], graph['Y'], graph['cost'], graph[
                'hypothesis'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            n_input = 0
            log.info('total: %s' % NumUtil.comma_str(train.size))
            log.info('learn...')
            watch.start('learn')
            for step, (features_batch, labels_batch) in enumerate(
                    train.next_batch(batch_size=batch_size), 1):
                n_input += batch_size
                sess.run(train_step,
                         feed_dict={
                             X: features_batch,
                             Y: labels_batch
                         })
                log.info(
                    '[%s][%.1f%%] validation cost: %.4f' %
                    (NumUtil.comma_str(n_input), n_input / train.size * 100,
                     sess.run(cost,
                              feed_dict={
                                  X: validation.features,
                                  Y: validation.labels
                              })))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('evaluate...')
            watch.start('evaluate...')
            _hypothesis, _correct, _accuracy = sess.run(
                [hypothesis, predicted, accuracy],
                feed_dict={
                    X: test.features,
                    Y: test.labels
                })  # Accuracy report
            watch.stop('evaluate...')
            log.info('evaluate OK.')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info(watch.summary())
        # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
        # log.info('correct: %s %s' % (_correct.shape, _correct))
        log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
        log.info('\n')
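None of the examples above show the restore side of tf.train.Saver. Below is a minimal, hypothetical sketch of loading the checkpoint written by learning() and re-running the evaluation; the graph must first be rebuilt with the same shapes, and n_features, n_classes, n_hidden1, learning_rate, layers, model_file and test are assumed to come from the surrounding script.

# Hypothetical restore sketch; assumes the same WordSpacing graph and dataset objects as above.
import tensorflow as tf

graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, layers=layers)
X, Y, accuracy = graph['X'], graph['Y'], graph['accuracy']

with tf.Session() as sess:
    tf.train.Saver().restore(sess, model_file)  # no global_variables_initializer needed after restore
    test_accuracy = sess.run(accuracy, feed_dict={X: test.features, Y: test.labels})
    log.info('accuracy: %s' % test_accuracy)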
Example #15
    print('%s -> %s -> %s -> %s -> %s' % (x_train.shape[1], n_hiddens, activation.__name__, n_hiddens, 1))
    print('weights_initializer: %s' % weights_initializer.__name__)
    print('learning_rate: %.4f' % learning_rate)
    print('train_time: %s' % train_time)

    how_many_trains = 3 if train_time <= 1 else 1  # run the experiment 3 times when training for only 1 second, otherwise once
    for _ in range(how_many_trains):
        # time.sleep(1)
        tf.reset_default_graph()  # reset the existing graph/session
        tf.set_random_seed(7942)  # required because tf.random_normal_initializer is used

        scope_name = '%s.%s' % (func.__name__, DateUtil.current_yyyymmdd_hhmmss())  # recommended: name the scope with func + timestamp so graphs do not overlap
        x, y, y_hat, cost, rsme, train_step, summary = build_graph(scope_name, n_features, n_hiddens, n_classes, learning_rate, activation=activation, weights_initializer=weights_initializer, bias_value=bias_value)

        try:
            watch = WatchUtil()

            model_file_saved = False
            model_file = os.path.join('%s/workspace/nlp4kor/models/%s_%s/model' % (os.getcwd(), os.path.basename(__file__.replace('.py', '')), func.__name__))
            model_dir = os.path.dirname(model_file)
            # print('model_file: %s' % model_file)
            if not os.path.exists(model_dir):
                # print('model_dir: %s' % model_dir)
                os.makedirs(model_dir)

            config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
            saver = tf.train.Saver()  # by default only the 5 most recent checkpoints are kept; set max_to_keep=None to keep them all
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())

                train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)