info_file = KO_WIKIPEDIA_ORG_INFO_FILE
urls_file = KO_WIKIPEDIA_ORG_URLS_FILE
sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
log.info('info_file: %s' % info_file)
log.info('urls_file: %s' % urls_file)
log.info('sentences_file: %s' % sentences_file)
log.info('characters_file: %s' % characters_file)

if not os.path.exists(characters_file) or not os.path.exists(sentences_file) \
        or not os.path.exists(info_file) or not os.path.exists(urls_file):
    try:
        log.info('create sentences file...')
        TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org',
                                   sentences_file=sentences_file, characters_file=characters_file,
                                   info_file=info_file, urls_file=urls_file,
                                   train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE,
                                   valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE,
                                   test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE,
                                   mongo_query={})  # mongodb -> text file (corpus)
        log.info('create sentences file OK')
    except Exception:
        log.error(traceback.format_exc())
        # remove partially written files so the next run starts from scratch
        if os.path.exists(sentences_file):
            os.remove(sentences_file)
        if os.path.exists(info_file):
            os.remove(info_file)
        if os.path.exists(urls_file):
            os.remove(urls_file)
        if os.path.exists(characters_file):
            os.remove(characters_file)
        s = line.strip()
        sentences.append(s)

log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
watch.stop('read sentences')

watch.start('run tensorflow')
accuracies, costs, sims = [], [], []
with tf.Session() as sess:
    X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(
        n_features, window_size, noise_rate, n_hidden1, learning_rate, watch)

    saver = tf.train.Saver()
    try:
        restored = saver.restore(sess, model_file)
    except Exception as e:
        log.error('restore failed. model_file: %s' % model_file)
        raise e

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
    train = DataSet.load(train_file, gzip_format=True, verbose=True)
    train_vector = DataSet.load(train_file, gzip_format=True, verbose=True)
    train_vector.convert_to_one_hot_vector()

    try:
        total_test_sampling = 1
        for i, sentence in enumerate(sentences):
            for nth in range(total_test_sampling):
                noised_sentence = SpellingErrorCorrection.encode_noise(sentence, noise_rate=noise_rate)
                # log.info('[%s] noise(%.1f) "%s" -> "%s"' % (nth, noise_rate, sentence, noised_sentence))
                log.info('')
                                                           capacity=capacity, min_after_dequeue=min_after_dequeue)
    else:
        features_batch, labels_batch = tf.train.batch([x, y], batch_size=batch_size, capacity=capacity)
    return features_batch, labels_batch


if __name__ == '__main__':
    shuffle = False
    batch_size = 5
    data_file = os.path.join(DATA_DIR, 'en2kor.tsv')
    if not os.path.exists(data_file):
        log.error('file does not exist: %s' % data_file)
        exit()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    filenames = [data_file]
    features_batch, labels_batch = input_pipeline(filenames, batch_size=batch_size, shuffle=shuffle, tokens=2)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
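        # Illustrative sketch (not from the original source): one common way to consume the
        # batches produced by the queue runners and then shut the pipeline down cleanly.
        # The number of demo iterations below is an arbitrary assumption.
        try:
            for _ in range(3):
                _features, _labels = sess.run([features_batch, labels_batch])
                log.info('features_batch: %s, labels_batch: %s' % (_features, _labels))
        except tf.errors.OutOfRangeError:
            log.info('input queue exhausted.')
        finally:
            coordinator.request_stop()
            coordinator.join(threads)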
    if s.count(' ') > 0:  # sentence must have one or more spaces.
        sentences.append(s)

log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
watch.stop('read sentences')

watch.start('run tensorflow')
accuracies, sims = [], []
with tf.Session() as sess:
    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate)
    X, Y, predicted, accuracy = graph['X'], graph['Y'], graph['predicted'], graph['accuracy']

    saver = tf.train.Saver()
    try:
        restored = saver.restore(sess, model_file)
    except Exception:
        log.error('restore failed. model_file: %s' % model_file)

    try:
        for i, s in enumerate(sentences):
            log.info('')
            log.info('[%s] in : "%s"' % (i, s))
            _features, _labels = WordSpacing.sentence2features_labels(s, left_gram, right_gram)
            dataset = DataSet(features=_features, labels=_labels,
                              features_vector=features_vector, labels_vector=labels_vector)
            dataset.convert_to_one_hot_vector()
            if len(dataset) > 0:
                _predicted, _accuracy = sess.run([predicted, accuracy],
                                                 feed_dict={X: dataset.features, Y: dataset.labels})  # accuracy report

                sentence_hat = WordSpacing.spacing(s.replace(' ', ''), _predicted)
                sim, correct, total = WordSpacing.sim_two_sentence(s, sentence_hat,
                                                                   left_gram=left_gram, right_gram=right_gram)
                accuracies.append(_accuracy)
                sims.append(sim)
epoch, running = 0, True
while running:
    epoch += 1
    for _x_batch, _y_batch in next_batch([train_file], data_size=n_train, batch_size=batch_size,
                                         delim='\t', splits=3, shuffle=False):
        if stop_timer.is_over():
            running = False
            break

        if len(_x_batch) != batch_size:
            log.error('len(_x_batch): %s' % _x_batch.shape)

        nth_batch += 1
        _, _train_cost, _summary = sess.run([train_step, cost, summary],
                                            feed_dict={x: _x_batch, y: _y_batch, learning_rate: _learning_rate})
        train_writer.add_summary(_summary, global_step=nth_batch)

        if valid_timer.is_over():
            for _x_valid_batch, _y_valid_batch in next_batch([valid_file],