Example #1
    info_file = KO_WIKIPEDIA_ORG_INFO_FILE
    urls_file = KO_WIKIPEDIA_ORG_URLS_FILE
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('info_file: %s' % info_file)
    log.info('urls_file: %s' % urls_file)
    log.info('sentences_file: %s' % sentences_file)
    log.info('characters_file: %s' % characters_file)

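    # Regenerate the corpus files only when any of them is missing.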
    if not all(os.path.exists(f) for f in (characters_file, sentences_file, info_file, urls_file)):
        try:
            log.info('create sentences file...')
            TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org', sentences_file=sentences_file,
                                       characters_file=characters_file,
                                       info_file=info_file, urls_file=urls_file,
                                       train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE,
                                       valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE,
                                       test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE,
                                       mongo_query={})  # mongodb -> text file(corpus)
            log.info('create sentences file OK')
        except Exception:
            log.error(traceback.format_exc())
            # Clean up partially written output files so the next run starts fresh.
            for f in (sentences_file, info_file, urls_file, characters_file):
                if os.path.exists(f):
                    os.remove(f)
Example #2
                s = line.strip()
                sentences.append(s)
        log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
        watch.stop('read sentences')

        watch.start('run tensorflow')
        accuracies, costs, sims = [], [], []
        with tf.Session() as sess:
            X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1,
                                                                                                           learning_rate, watch)

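            # Restore the trained denoising autoencoder weights; fail fast if the checkpoint cannot be loaded.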
            saver = tf.train.Saver()
            try:
                restored = saver.restore(sess, model_file)
            except Exception as e:
                log.error('restore failed. model_file: %s' % model_file)
                raise e

            train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                      'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
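            # Load the training set twice: keep one raw copy and one copy converted to one-hot vectors.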
            train = DataSet.load(train_file, gzip_format=True, verbose=True)
            train_vector = DataSet.load(train_file, gzip_format=True, verbose=True)
            train_vector.convert_to_one_hot_vector()
            try:

                total_test_sampling = 1
                for i, sentence in enumerate(sentences):
                    for nth in range(total_test_sampling):
                        noised_sentence = SpellingErrorCorrection.encode_noise(sentence, noise_rate=noise_rate)
                        # log.info('[%s] noise(%.1f) "%s" -> "%s"' % (nth, noise_rate, sentence, noised_sentence))
                        log.info('')
Example #3
            capacity=capacity,
            min_after_dequeue=min_after_dequeue)
    else:
        features_batch, labels_batch = tf.train.batch([x, y],
                                                      batch_size=batch_size,
                                                      capacity=capacity)
    return features_batch, labels_batch


if __name__ == '__main__':
    shuffle = False
    batch_size = 5
    data_file = os.path.join(DATA_DIR, 'en2kor.tsv')

    if not os.path.exists(data_file):
        log.error('file not exists. %s' % data_file)
        exit()

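    # Allow GPU memory to be allocated on demand instead of reserving it all upfront.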
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    filenames = [data_file]
    features_batch, labels_batch = input_pipeline(filenames,
                                                  batch_size=batch_size,
                                                  shuffle=shuffle,
                                                  tokens=2)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

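        # Start the queue-runner threads that feed the input pipeline, under a coordinator for clean shutdown.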
        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
Example #4
                if s.count(' ') > 0:  # keep only sentences that contain at least one space.
                    sentences.append(s)
        log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
        watch.stop('read sentences')

        watch.start('run tensorflow')
        accuracies, sims = [], []
        with tf.Session() as sess:
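            # Rebuild the word-spacing feed-forward network graph and unpack the tensors needed for inference.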
            graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate)
            X, Y, predicted, accuracy = graph['X'], graph['Y'], graph['predicted'], graph['accuracy']

            saver = tf.train.Saver()
            try:
                restored = saver.restore(sess, model_file)
            except Exception as e:
                log.error('restore failed. model_file: %s' % model_file)
                raise e
            try:
                for i, s in enumerate(sentences):
                    log.info('')
                    log.info('[%s] in : "%s"' % (i, s))
                    _features, _labels = WordSpacing.sentence2features_labels(s, left_gram, right_gram)
                    dataset = DataSet(features=_features, labels=_labels, features_vector=features_vector, labels_vector=labels_vector)
                    dataset.convert_to_one_hot_vector()
                    if len(dataset) > 0:
                        _predicted, _accuracy = sess.run([predicted, accuracy], feed_dict={X: dataset.features, Y: dataset.labels})  # Accuracy report

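                        # Strip the spaces, re-insert them from the model's predictions, and measure similarity against the original sentence.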
                        sentence_hat = WordSpacing.spacing(s.replace(' ', ''), _predicted)
                        sim, correct, total = WordSpacing.sim_two_sentence(s, sentence_hat, left_gram=left_gram, right_gram=right_gram)

                        accuracies.append(_accuracy)
                        sims.append(sim)
Example #5
                                epoch, running = 0, True
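                                # Run whole epochs until the stop timer expires.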
                                while running:
                                    epoch += 1
                                    for _x_batch, _y_batch in next_batch(
                                            [train_file],
                                            data_size=n_train,
                                            batch_size=batch_size,
                                            delim='\t',
                                            splits=3,
                                            shuffle=False):
                                        if stop_timer.is_over():
                                            running = False
                                            break

                                        if len(_x_batch) != batch_size:
                                            log.error('unexpected batch: len(_x_batch)=%s, shape=%s' % (len(_x_batch), _x_batch.shape))

                                        nth_batch += 1
                                        _, _train_cost, _summary = sess.run(
                                            [train_step, cost, summary],
                                            feed_dict={
                                                x: _x_batch,
                                                y: _y_batch,
                                                learning_rate: _learning_rate
                                            })
                                        train_writer.add_summary(
                                            _summary, global_step=nth_batch)

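                                        # When the validation timer fires, evaluate on the validation set.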
                                        if valid_timer.is_over():
                                            for _x_valid_batch, _y_valid_batch in next_batch(
                                                [valid_file],