def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, "
                         "DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader")

    if FLAGS.decode:
        # Decode test sentences.
        with tf.Session() as session:
            model = create_model(session, True, FLAGS.model_path, config=config)
            print("Loaded model. Beginning decoding.")
            corrective_tokens = get_corrective_tokens(data_reader, FLAGS.train_path)
            evaluate_accuracy(session, model, data_reader, corrective_tokens,
                              FLAGS.test_path, max_samples=None)
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)
Example #2
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, "
                         "DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader")

    if FLAGS.decode:
        # Decode test sentences.
        with tf.Session() as session:
            model = create_model(session,
                                 True,
                                 FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session,
                               model=model,
                               data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path),
                               verbose=False)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)
Example #3
def test(config, model_path, test_path):
    data_reader = MovieDialogReader(config, test_path)
    with tf.Session() as session:
        model = create_model(session, True, model_path, config=config)
        print("Loaded model. Beginning decoding.")
        corrective_tokens = get_corrective_tokens(data_reader, test_path)
        evaluate_accuracy(session,
                          model,
                          data_reader,
                          corrective_tokens,
                          test_path,
                          max_samples=None)
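
A minimal sketch of how this helper might be invoked, assuming a trained model already exists on disk; both paths below are placeholders:

# Hypothetical invocation of test(); the paths must point at an existing
# trained model directory and test file.
test(DefaultMovieDialogConfig(),
     model_path="movie_dialog_model",
     test_path="movie_dialog_test.txt")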
Example #4
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, "
                         "DefaultMovieDialogConfig")
    # Set the model path.
    if not FLAGS.decode and not FLAGS.decode_sentence:
        model_path = os.path.join(FLAGS.output_path, "model")
    else:
        model_path = os.path.join(FLAGS.input_path, "model")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # Set the max_steps.
    config.max_steps = FLAGS.num_steps
    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument %s not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader" % FLAGS.data_reader_type)

    if FLAGS.decode_sentence:
        # Correct user's sentences.
        with tf.Session() as session:
            model = create_model(session, True, model_path, config=config)
            corrective_tokens = get_corrective_tokens(data_reader,
                                                      FLAGS.train_path)
            print("Enter a sentence you'd like to correct")
            correct_new_sentence = raw_input()
            while correct_new_sentence.lower() != 'no':
                decode_sentence(session, model=model, data_reader=data_reader,
                                sentence=correct_new_sentence,
                                corrective_tokens=corrective_tokens)
                print("Enter a sentence you'd like to correct, or type 'no' "
                      "to quit")
                correct_new_sentence = raw_input()
    elif FLAGS.decode:
        # Decode test sentences.
        with tf.Session() as session:
            model = create_model(session, True, model_path, config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session, model=model, data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path),
                               corrective_tokens=get_corrective_tokens(
                                   data_reader, FLAGS.train_path))
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, model_path)
        copy_train_data()
Example #5
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    elif FLAGS.config == "DefaultWikiConfig":
        config = DefaultWikiConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, DefaultWikiConfig, "
                         "DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
        train_path = FLAGS.train_path
        val_path = FLAGS.val_path
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
        train_path = FLAGS.train_path
        val_path = FLAGS.val_path
    elif FLAGS.data_reader_type == "WikiDataReader":
        train_path = [
            os.path.join(FLAGS.train_path,
                         "wiki2017CleanChainLifetime.enz_train.txt"),
            os.path.join(FLAGS.train_path,
                         "wiki2017CleanChainLifetime.enu_train.txt")
        ]
        val_path = [
            os.path.join(FLAGS.val_path,
                         "wiki2017CleanChainLifetime.enz_val.txt"),
            os.path.join(FLAGS.val_path,
                         "wiki2017CleanChainLifetime.enu_val.txt")
        ]
        data_reader = WikiDataReader(config, train_path)
    else:
        raise ValueError(
            "data_reader_type argument not recognized; must be "
            "one of: MovieDialogReader, PTBDataReader, WikiDataReader")

    if FLAGS.task == "decode":
        print('creating session')
        # Decode test sentences.
        with tf.Session() as session:
            print("creating model")
            model = create_model(session,
                                 True,
                                 FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            if FLAGS.test_string != "":
                decodings = decode_sentence(session, model, data_reader,
                                            FLAGS.test_string)
            else:
                decodings = decode(session,
                                   model=model,
                                   data_reader=data_reader,
                                   data_to_decode=data_reader.read_tokens(
                                       FLAGS.test_path),
                                   verbose=True)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    elif FLAGS.task == "serve":
        print('creating session')
        # Decode test sentences.
        with tf.Session() as session:
            print("creating model")
            model = create_model(session,
                                 True,
                                 FLAGS.model_path,
                                 config=config)
            HttpHandler.model = model
            HttpHandler.data_reader = data_reader
            HttpHandler.session = session
            HttpHandler.model_name = FLAGS.model_path
            httpd = HTTPServer(("0.0.0.0", 8080), HttpHandler)
            try:
                print("Starting server...")
                httpd.serve_forever()
            except KeyboardInterrupt:
                pass
            httpd.server_close()
    else:
        print("Training model.")
        train(data_reader, train_path, val_path, FLAGS.model_path)
Example #6
from correct_text import train, decode, decode_sentence, evaluate_accuracy, \
    create_model, get_corrective_tokens, DefaultPTBConfig, \
    DefaultMovieDialogConfig
from text_corrector_data_readers import PTBDataReader, MovieDialogReader

root_data_path = "E:/Deep Learning/deep-text-corrector-master/deep-text-corrector-master/"
train_path = os.path.join(root_data_path, "movie_dialog_train.txt")
val_path = os.path.join(root_data_path, "movie_dialog_val.txt")
test_path = os.path.join(root_data_path, "movie_dialog_test.txt")
model_path = os.path.join(root_data_path, "movie_dialog_model")
config = DefaultMovieDialogConfig()

#data_reader = MovieDialogReader(config, train_path)
#train(data_reader, train_path, val_path, model_path)

data_reader = MovieDialogReader(config,
                                train_path,
                                dropout_prob=0.25,
                                replacement_prob=0.25,
                                dataset_copies=1)
corrective_tokens = get_corrective_tokens(data_reader, train_path)
import pickle
with open(os.path.join(root_data_path, "corrective_tokens.pickle"), "wb") as f:
    pickle.dump(corrective_tokens, f)
with open(os.path.join(root_data_path, "token_to_id.pickle"), "wb") as f:
    pickle.dump(data_reader.token_to_id, f)
sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)

decoded = decode_sentence(sess,
                          model,
                          data_reader,
                          "you must be insane",  # hypothetical example sentence
                          corrective_tokens=corrective_tokens)
Example #7
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, decode_sentence, evaluate_accuracy, \
    create_model, get_corrective_tokens, DefaultPTBConfig, \
    DefaultMovieDialogConfig
from text_corrector_data_readers import PTBDataReader, MovieDialogReader

root_data_path = "data/cornell-movie-dialogs-corpus"
train_path = os.path.join(root_data_path, "processed_movie_lines.txt")
val_path = os.path.join(root_data_path, "processed_movie_lines.txt")
test_path = os.path.join(root_data_path, "processed_movie_lines.txt")
model_path = os.path.join(root_data_path, "dialog_correcter_model_testnltk")
config = DefaultMovieDialogConfig()

data_reader = MovieDialogReader(config, train_path)

train(data_reader, train_path, val_path, model_path)
Example #8
token_to_id_filename = "token_to_id.pickle"
token_to_id_path = os.path.join(ROOT_DATA_PATH, token_to_id_filename)
download(s3_client, token_to_id_filename, local_path=token_to_id_path)

# Load model.
config = DefaultMovieDialogConfig()
sess = tf.Session()
print("Loading model")
model = create_model(sess, True, MODEL_PATH, config=config)
print("Loaded model")

with open(corrective_tokens_path, "rb") as f:
    corrective_tokens = pickle.load(f)
with open(token_to_id_path, "rb") as f:
    token_to_id = pickle.load(f)
data_reader = MovieDialogReader(config, token_to_id=token_to_id)
print("Done initializing.")


def process_event(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    outputs = decode_sentence(sess,
                              model,
                              data_reader,
                              event["text"],
                              corrective_tokens=corrective_tokens,
                              verbose=False)
    return {"input": event["text"], "output": " ".join(outputs)}
Example #9
from flask import Flask, request
import tensorflow as tf
from correct_text import create_model, DefaultMovieDialogConfig, decode
from text_corrector_data_readers import MovieDialogReader

data_path = '/input/data/movie_dialog_train.txt'
model_path = '/input/model'

tfs = tf.Session()

config = DefaultMovieDialogConfig()
print('Loading model from path: %s' % model_path)
model = create_model(tfs, True, model_path, config=config)
print('Using data from path: %s' % data_path)
data_reader = MovieDialogReader(config, data_path)

app = Flask('deep-text-corrector')


@app.route('/<path:path>', methods=['POST'])
def correct_handler(path):
    # Note: rebuilding corrective_tokens on every request is wasteful; it
    # could be computed once at startup instead.
    corrective_tokens = data_reader.read_tokens(data_path)
    text = request.get_data(as_text=True)
    decodings = decode(tfs,
                       model=model,
                       data_reader=data_reader,
                       data_to_decode=[text.split()],
                       corrective_tokens=corrective_tokens)
    return ' '.join(next(decodings))
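
A quick way to exercise this route without standing up a real server, assuming the module above is importable; Flask's built-in test client handles the POST:

# Hypothetical smoke test using Flask's test client; any POST path matches
# the catch-all route.
with app.test_client() as client:
    resp = client.post('/correct', data='you must be insane')
    print(resp.get_data(as_text=True))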

Example #10
def train(config, model_path, train_path, val_path):
    data_reader = MovieDialogReader(config, train_path)
    train_data = data_reader.build_dataset(train_path)
    val_data = data_reader.build_dataset(val_path)
    with tf.Session() as sess:
        model = create_model(sess, False, model_path, config=config)
        train_bucket_sizes = [
            len(train_data[b]) for b in range(len(config.buckets))
        ]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]
        # Training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while current_step < config.max_steps:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_data, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            print(step_loss)
            step_time += (time.time() -
                          start_time) / config.steps_per_checkpoint
            loss += step_loss / config.steps_per_checkpoint
            current_step += 1
            if current_step % config.steps_per_checkpoint == 0:
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f" %
                      (model.global_step.eval(), model.learning_rate.eval(),
                       step_time, perplexity))
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                checkpoint_path = os.path.join(model_path, "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                # Evaluate on the dev set.
                step_time, loss = 0.0, 0.0
                for bucket_id in range(len(config.buckets)):
                    if len(val_data[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = \
                        model.get_batch(val_data, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = math.exp(
                        float(eval_loss)) if eval_loss < 300 else float("inf")
                    print(" eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))
                sys.stdout.flush()
Example #11
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    elif FLAGS.config == "DefaultFCEConfig":
        config = DefaultFCEConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, DefaultFCEConfig, "
                         "DefaultMovieDialogConfig")
                         
    is_train = not (FLAGS.correct or FLAGS.evaluate or FLAGS.decode)
                         
    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path) if is_train else None
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "FCEReader":
        data_reader = FCEReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader")

    import pickle
    if not is_train:
        with open(os.path.join(FLAGS.model_path, "token_to_id.pickle"), "rb") as f:
            token_to_id = pickle.load(f)
        if FLAGS.data_reader_type == "MovieDialogReader":
            data_reader = MovieDialogReader(config, None, token_to_id, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)
        elif FLAGS.data_reader_type == "PTBDataReader":
            data_reader = PTBDataReader(config, None, token_to_id, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)
        elif FLAGS.data_reader_type == "FCEReader":
            data_reader = FCEReader(config, None, token_to_id, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)
        corrective_tokens = get_corrective_tokens(data_reader, FLAGS.train_path)
    else:
        corrective_tokens = get_corrective_tokens(data_reader, FLAGS.train_path)
        with open(os.path.join(FLAGS.model_path, "corrective_tokens.pickle"), "wb") as f:
            pickle.dump(corrective_tokens, f)
        with open(os.path.join(FLAGS.model_path, "token_to_id.pickle"), "wb") as f:
            pickle.dump(data_reader.token_to_id, f)

    sess_config = tf.ConfigProto(device_count={"CPU": config.cpu_num},
                                 inter_op_parallelism_threads=0,
                                 intra_op_parallelism_threads=0)

    if FLAGS.correct:
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path, config=config)
            print("Loaded model. Beginning correction.")
            while True:
                sentence = input("Enter a sentence to correct, or 'exit' to quit\n")
                if sentence:
                    if sentence.lower() == 'exit':
                        break
                    decoded = decode_sentence(session, model=model,
                                              data_reader=data_reader,
                                              sentence=sentence,
                                              corrective_tokens=corrective_tokens,
                                              verbose=True)
                    sys.stdout.flush()
    elif FLAGS.evaluate:
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path, config=config)
            print("Loaded model. Beginning evaluation.")
            errors = evaluate_accuracy(session, model=model,
                                       data_reader=data_reader,
                                       corrective_tokens=corrective_tokens,
                                       test_path=FLAGS.test_path)
            print(errors)
            sys.stdout.flush()
    elif FLAGS.decode:
        # Decode test sentences.
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path, config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session, model=model, data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path), corrective_tokens=corrective_tokens, verbose=False)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        sys.stdout.flush()
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)
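
A hypothetical command line for the branches above, assuming this main() lives in correct_text.py and the flags are defined elsewhere in the script as used here:

# python correct_text.py --config=DefaultMovieDialogConfig \
#     --data_reader_type=MovieDialogReader \
#     --train_path=movie_dialog_train.txt \
#     --val_path=movie_dialog_val.txt \
#     --model_path=movie_dialog_model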