def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be: "
                         "MovieDialogReader")

    if FLAGS.decode:
        # Evaluate the model's accuracy on the test set.
        with tf.Session() as session:
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            corrective_tokens = get_corrective_tokens(data_reader,
                                                      FLAGS.train_path)
            evaluate_accuracy(session, model, data_reader, corrective_tokens,
                              FLAGS.test_path, max_samples=None)
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, "
                         "DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader")

    if FLAGS.decode:
        # Decode test sentences.
        with tf.Session() as session:
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session, model=model, data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path),
                               verbose=False)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)
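# The main() variants in this file all read command-line flags from a
# module-level FLAGS object that is not shown in the snippets. Below is a
# minimal sketch of the flag definitions they assume: the flag names are
# taken from the FLAGS references in the code above, but the defaults and
# help strings are placeholders, not the project's actual values.
import tensorflow as tf

tf.app.flags.DEFINE_string("config", "DefaultMovieDialogConfig",
                           "Name of the model/data config to use.")
tf.app.flags.DEFINE_string("data_reader_type", "MovieDialogReader",
                           "Name of the DataReader implementation to use.")
tf.app.flags.DEFINE_string("train_path", "", "Path to the training data.")
tf.app.flags.DEFINE_string("val_path", "", "Path to the validation data.")
tf.app.flags.DEFINE_string("test_path", "", "Path to the test data.")
tf.app.flags.DEFINE_string("model_path", "",
                           "Directory for model checkpoints.")
tf.app.flags.DEFINE_boolean("decode", False,
                            "Decode the test set instead of training.")
FLAGS = tf.app.flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # Parses the flags and calls main(_).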
def test(config, model_path, test_path):
    data_reader = MovieDialogReader(config, test_path)
    with tf.Session() as session:
        model = create_model(session, True, model_path, config=config)
        print("Loaded model. Beginning decoding.")
        corrective_tokens = get_corrective_tokens(data_reader, test_path)
        evaluate_accuracy(session, model, data_reader, corrective_tokens,
                          test_path, max_samples=None)
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, "
                         "DefaultMovieDialogConfig")

    # Set the model path.
    if not FLAGS.decode and not FLAGS.decode_sentence:
        model_path = os.path.join(FLAGS.output_path, "model")
    else:
        model_path = os.path.join(FLAGS.input_path, "model")
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Set the max_steps.
    config.max_steps = FLAGS.num_steps

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument %s not recognized; must "
                         "be one of: MovieDialogReader, PTBDataReader"
                         % FLAGS.data_reader_type)

    if FLAGS.decode_sentence:
        # Interactively correct sentences typed by the user.
        with tf.Session() as session:
            model = create_model(session, True, model_path, config=config)
            print("Enter a sentence you'd like to correct")
            correct_new_sentence = raw_input()
            while correct_new_sentence.lower() != 'no':
                decode_sentence(session, model=model, data_reader=data_reader,
                                sentence=correct_new_sentence,
                                corrective_tokens=data_reader.read_tokens(
                                    FLAGS.train_path))
                print("Enter a sentence you'd like to correct, or type 'no' "
                      "to quit")
                correct_new_sentence = raw_input()
    elif FLAGS.decode:
        # Decode test sentences.
        with tf.Session() as session:
            model = create_model(session, True, model_path, config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session, model=model, data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path),
                               corrective_tokens=data_reader.read_tokens(
                                   FLAGS.train_path))
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        train(data_reader, FLAGS.train_path, FLAGS.val_path, model_path)
        copy_train_data()
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    elif FLAGS.config == "DefaultWikiConfig":
        config = DefaultWikiConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, DefaultWikiConfig, "
                         "DefaultMovieDialogConfig")

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(config, FLAGS.train_path)
        train_path = FLAGS.train_path
        val_path = FLAGS.val_path
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
        train_path = FLAGS.train_path
        val_path = FLAGS.val_path
    elif FLAGS.data_reader_type == "WikiDataReader":
        train_path = [
            os.path.join(FLAGS.train_path,
                         "wiki2017CleanChainLifetime.enz_train.txt"),
            os.path.join(FLAGS.train_path,
                         "wiki2017CleanChainLifetime.enu_train.txt")
        ]
        val_path = [
            os.path.join(FLAGS.val_path,
                         "wiki2017CleanChainLifetime.enz_val.txt"),
            os.path.join(FLAGS.val_path,
                         "wiki2017CleanChainLifetime.enu_val.txt")
        ]
        data_reader = WikiDataReader(config, train_path)
    else:
        raise ValueError(
            "data_reader_type argument not recognized; must be "
            "one of: MovieDialogReader, PTBDataReader, WikiDataReader")

    if FLAGS.task == "decode":
        print("Creating session.")
        # Decode test sentences.
        with tf.Session() as session:
            print("Creating model.")
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            if FLAGS.test_string != "":
                decodings = decode_sentence(session, model, data_reader,
                                            FLAGS.test_string)
            else:
                decodings = decode(session, model=model,
                                   data_reader=data_reader,
                                   data_to_decode=data_reader.read_tokens(
                                       FLAGS.test_path),
                                   verbose=True)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    elif FLAGS.task == "serve":
        print("Creating session.")
        # Serve corrections over HTTP.
        with tf.Session() as session:
            print("Creating model.")
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            HttpHandler.model = model
            HttpHandler.data_reader = data_reader
            HttpHandler.session = session
            HttpHandler.model_name = FLAGS.model_path

            httpd = HTTPServer(("0.0.0.0", 8080), HttpHandler)
            try:
                print("Starting server...")
                httpd.serve_forever()
            except KeyboardInterrupt:
                pass
            httpd.server_close()
    else:
        print("Training model.")
        train(data_reader, train_path, val_path, FLAGS.model_path)
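# The "serve" branch above populates class attributes on an HttpHandler
# before handing it to HTTPServer, but the handler itself is not shown in
# the snippet. The following is a hypothetical minimal sketch of such a
# class, assuming a plain-text POST body and the same positional
# decode_sentence() call used in the "decode" branch; the real handler may
# differ.
from http.server import BaseHTTPRequestHandler


class HttpHandler(BaseHTTPRequestHandler):
    # Filled in by main() before the server starts.
    model = None
    data_reader = None
    session = None
    model_name = None

    def do_POST(self):
        # Read the raw sentence from the request body.
        length = int(self.headers.get("Content-Length", 0))
        sentence = self.rfile.read(length).decode("utf-8")
        # Run the model and return the corrected sentence as plain text.
        tokens = decode_sentence(HttpHandler.session, HttpHandler.model,
                                 HttpHandler.data_reader, sentence)
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(" ".join(tokens).encode("utf-8"))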
import os
import pickle

import tensorflow as tf

from correct_text import train, create_model, decode_sentence, \
    get_corrective_tokens, DefaultPTBConfig, DefaultMovieDialogConfig
from text_corrector_data_readers import PTBDataReader, MovieDialogReader

root_data_path = "E:/Deep Learning/deep-text-corrector-master/deep-text-corrector-master/"
train_path = os.path.join(root_data_path, "movie_dialog_train.txt")
val_path = os.path.join(root_data_path, "movie_dialog_val.txt")
test_path = os.path.join(root_data_path, "movie_dialog_test.txt")
model_path = os.path.join(root_data_path, "movie_dialog_model")

config = DefaultMovieDialogConfig()

# data_reader = MovieDialogReader(config, train_path)
# train(data_reader, train_path, val_path, model_path)

data_reader = MovieDialogReader(config, train_path, dropout_prob=0.25,
                                replacement_prob=0.25, dataset_copies=1)
corrective_tokens = get_corrective_tokens(data_reader, train_path)

with open(os.path.join(root_data_path, "corrective_tokens.pickle"), "wb") as f:
    pickle.dump(corrective_tokens, f)
with open(os.path.join(root_data_path, "token_to_id.pickle"), "wb") as f:
    pickle.dump(data_reader.token_to_id, f)

sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)
# Placeholder input sentence for the demo.
decoded = decode_sentence(sess, model, data_reader,
                          "this is a example of sentence to correct",
                          corrective_tokens=corrective_tokens)
from __future__ import print_function

import os
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, decode_sentence, evaluate_accuracy, \
    create_model, get_corrective_tokens, DefaultPTBConfig, \
    DefaultMovieDialogConfig
from text_corrector_data_readers import PTBDataReader, MovieDialogReader

root_data_path = "data/cornell-movie-dialogs-corpus"
train_path = os.path.join(root_data_path, "processed_movie_lines.txt")
val_path = os.path.join(root_data_path, "processed_movie_lines.txt")
test_path = os.path.join(root_data_path, "processed_movie_lines.txt")
model_path = os.path.join(root_data_path, "dialog_correcter_model_testnltk")

config = DefaultMovieDialogConfig()
data_reader = MovieDialogReader(config, train_path)
train(data_reader, train_path, val_path, model_path)
token_to_id_filename = "token_to_id.pickle"
token_to_id_path = os.path.join(ROOT_DATA_PATH, token_to_id_filename)
download(s3_client, token_to_id_filename, local_path=token_to_id_path)

# Load model.
config = DefaultMovieDialogConfig()
sess = tf.Session()
print("Loading model")
model = create_model(sess, True, MODEL_PATH, config=config)
print("Loaded model")

# Pickle files must be opened in binary mode.
with open(corrective_tokens_path, "rb") as f:
    corrective_tokens = pickle.load(f)
with open(token_to_id_path, "rb") as f:
    token_to_id = pickle.load(f)
data_reader = MovieDialogReader(config, token_to_id=token_to_id)
print("Done initializing.")


def process_event(event, context):
    print("Received event: " + json.dumps(event, indent=2))
    outputs = decode_sentence(sess, model, data_reader, event["text"],
                              corrective_tokens=corrective_tokens,
                              verbose=False)
    return {"input": event["text"], "output": " ".join(outputs)}
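# A hypothetical local invocation of the Lambda-style handler above; the
# event shape ({"text": ...}) matches what process_event() reads, but the
# sample sentence is made up.
if __name__ == "__main__":
    result = process_event({"text": "i am looking forward to see you"}, None)
    print(result["output"])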
from flask import Flask, request
import tensorflow as tf

from correct_text import create_model, DefaultMovieDialogConfig, decode
from text_corrector_data_readers import MovieDialogReader

data_path = '/input/data/movie_dialog_train.txt'
model_path = '/input/model'

tfs = tf.Session()
config = DefaultMovieDialogConfig()
print('Loading model from path: %s' % model_path)
model = create_model(tfs, True, model_path, config=config)
print('Using data from path: %s' % data_path)
data_reader = MovieDialogReader(config, data_path)

app = Flask('deep-text-corrector')


@app.route('/<path:path>', methods=['POST'])
def correct_handler(path):
    corrective_tokens = data_reader.read_tokens(data_path)
    request.get_data()
    decodings = decode(tfs, model=model, data_reader=data_reader,
                       data_to_decode=[request.data.split()],
                       corrective_tokens=corrective_tokens)
    return ' '.join(next(decodings))
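# The snippet above defines the Flask app but never starts it. A minimal way
# to run it, assuming the built-in development server; the host and port
# here are placeholders, and any WSGI server would also work.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)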
import math
import os
import sys
import time

import numpy as np
import tensorflow as tf

from correct_text import create_model
from text_corrector_data_readers import MovieDialogReader


def train(config, model_path, train_path, val_path):
    data_reader = MovieDialogReader(config, train_path)
    train_data = data_reader.build_dataset(train_path)
    val_data = data_reader.build_dataset(val_path)

    with tf.Session() as sess:
        model = create_model(sess, False, model_path, config=config)

        train_bucket_sizes = [
            len(train_data[b]) for b in range(len(config.buckets))
        ]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # Training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while current_step < config.max_steps:
            # Pick a bucket with probability proportional to its size.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_data, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            print(step_loss)
            step_time += (time.time() -
                          start_time) / config.steps_per_checkpoint
            loss += step_loss / config.steps_per_checkpoint
            current_step += 1

            if current_step % config.steps_per_checkpoint == 0:
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f" % (model.global_step.eval(),
                                           model.learning_rate.eval(),
                                           step_time, perplexity))
                # Decay the learning rate if the loss stopped improving.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)

                checkpoint_path = os.path.join(model_path, "translate.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)

                # Evaluate on the dev set.
                step_time, loss = 0.0, 0.0
                for bucket_id in range(len(config.buckets)):
                    if len(val_data[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % bucket_id)
                        continue
                    encoder_inputs, decoder_inputs, target_weights = \
                        model.get_batch(val_data, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = math.exp(
                        float(eval_loss)) if eval_loss < 300 else float("inf")
                    print("  eval: bucket %d perplexity %.2f" % (bucket_id,
                                                                 eval_ppx))
                sys.stdout.flush()
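# Standalone illustration of the bucket-sampling scheme used in train()
# above: a bucket is chosen with probability proportional to its size by
# comparing a uniform draw against the cumulative size scale. The bucket
# sizes below are made-up examples.
import numpy as np

bucket_sizes = [100, 300, 600]
total = float(sum(bucket_sizes))
scale = [sum(bucket_sizes[:i + 1]) / total for i in range(len(bucket_sizes))]
# scale == [0.1, 0.4, 1.0]: bucket 0 is picked 10% of the time, bucket 1
# 30% of the time, and bucket 2 60% of the time.
r = np.random.random_sample()
bucket_id = min(i for i in range(len(scale)) if scale[i] > r)
print(bucket_id)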
def main(_):
    # Determine which config we should use.
    if FLAGS.config == "TestConfig":
        config = TestConfig()
    elif FLAGS.config == "DefaultMovieDialogConfig":
        config = DefaultMovieDialogConfig()
    elif FLAGS.config == "DefaultPTBConfig":
        config = DefaultPTBConfig()
    elif FLAGS.config == "DefaultFCEConfig":
        config = DefaultFCEConfig()
    else:
        raise ValueError("config argument not recognized; must be one of: "
                         "TestConfig, DefaultPTBConfig, DefaultFCEConfig, "
                         "DefaultMovieDialogConfig")

    is_train = not (FLAGS.correct or FLAGS.evaluate or FLAGS.decode)

    # Determine which kind of DataReader we want to use.
    if FLAGS.data_reader_type == "MovieDialogReader":
        data_reader = MovieDialogReader(
            config, FLAGS.train_path) if is_train else None
    elif FLAGS.data_reader_type == "PTBDataReader":
        data_reader = PTBDataReader(config, FLAGS.train_path)
    elif FLAGS.data_reader_type == "FCEReader":
        data_reader = FCEReader(config, FLAGS.train_path)
    else:
        raise ValueError("data_reader_type argument not recognized; must be "
                         "one of: MovieDialogReader, PTBDataReader, "
                         "FCEReader")

    import pickle
    if not is_train:
        # Rebuild the reader from the saved vocabulary instead of the
        # training data.
        with open(os.path.join(FLAGS.model_path, "token_to_id.pickle"),
                  "rb") as f:
            token_to_id = pickle.load(f)
        if FLAGS.data_reader_type == "MovieDialogReader":
            data_reader = MovieDialogReader(config, None, token_to_id,
                                            dropout_prob=0.25,
                                            replacement_prob=0.25,
                                            dataset_copies=1)
        elif FLAGS.data_reader_type == "PTBDataReader":
            data_reader = PTBDataReader(config, None, token_to_id,
                                        dropout_prob=0.25,
                                        replacement_prob=0.25,
                                        dataset_copies=1)
        elif FLAGS.data_reader_type == "FCEReader":
            data_reader = FCEReader(config, None, token_to_id,
                                    dropout_prob=0.25, replacement_prob=0.25,
                                    dataset_copies=1)
        corrective_tokens = get_corrective_tokens(data_reader,
                                                  FLAGS.train_path)
    else:
        corrective_tokens = get_corrective_tokens(data_reader,
                                                  FLAGS.train_path)
        sys.stdout.flush()
        with open(os.path.join(FLAGS.model_path, "corrective_tokens.pickle"),
                  "wb") as f:
            pickle.dump(corrective_tokens, f)
        with open(os.path.join(FLAGS.model_path, "token_to_id.pickle"),
                  "wb") as f:
            pickle.dump(data_reader.token_to_id, f)

    sess_config = tf.ConfigProto(device_count={"CPU": config.cpu_num},
                                 inter_op_parallelism_threads=0,
                                 intra_op_parallelism_threads=0)

    if FLAGS.correct:
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning correcting.")
            while True:
                sentence = input("Input a sentence, or 'exit' to quit\n")
                if sentence:
                    if sentence.lower() == 'exit':
                        break
                    decoded = decode_sentence(
                        session, model=model, data_reader=data_reader,
                        sentence=sentence,
                        corrective_tokens=corrective_tokens, verbose=True)
                    sys.stdout.flush()
    elif FLAGS.evaluate:
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning evaluating.")
            errors = evaluate_accuracy(session, model=model,
                                       data_reader=data_reader,
                                       corrective_tokens=corrective_tokens,
                                       test_path=FLAGS.test_path)
            print(errors)
            sys.stdout.flush()
    elif FLAGS.decode:
        # Decode test sentences.
        with tf.Session(config=sess_config) as session:
            model = create_model(session, True, FLAGS.model_path,
                                 config=config)
            print("Loaded model. Beginning decoding.")
            decodings = decode(session, model=model, data_reader=data_reader,
                               data_to_decode=data_reader.read_tokens(
                                   FLAGS.test_path),
                               corrective_tokens=corrective_tokens,
                               verbose=False)
            # Write the decoded tokens to stdout.
            for tokens in decodings:
                print(" ".join(tokens))
                sys.stdout.flush()
    else:
        print("Training model.")
        sys.stdout.flush()
        train(data_reader, FLAGS.train_path, FLAGS.val_path, FLAGS.model_path)