def telecomChatBotSessionOpen(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 TelecomChatBot!')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)

    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    # Prepare the model
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    )
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    if self.args.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)

    # Initialize embeddings with pre-trained word2vec vectors
    if self.args.initEmbeddings:
        self.loadEmbedding(self.sess)

def load(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to the Chinese version!')
    #print(user_input)
    print('hi')
    #print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
    # each word of the vocabulary / decoder input
    # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
    # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
    # (remap the word2id/id2word variables).

    #if self.args.createDataset:
    #    print('Dataset created! Thanks for using this program')
    #    return  # No need to go further

    # Prepare the model
    #with tf.device(self.getDevice()):
    print("start gpu training")
    with tf.device("/gpu:0"):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)

    # TODO: Fixed seed (WARNING: If shuffling the dataset, make sure to do it after saving the
    # dataset; otherwise everything that comes after the shuffling won't be replicable when
    # reloading the dataset). How to restore the seed after loading?
    # Also fix the seed for random.shuffle (does it work globally for all files?)

    # Running session
    config = tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    if self.args.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

def main(self, args=None):
    print('TensorFlow detected: v{}'.format(tf.__version__))
    self.args = self.parseArgs(args)
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()
    self.loadModelParams()
    self.textData = TextData(self.args)
    if self.args.createDataset:
        return
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False))
    if self.args.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)
    if self.args.initEmbeddings:
        self.loadEmbedding(self.sess)
    if self.args.test:
        if self.args.test == Chatbot.TestMode.INTERACTIVE:
            self.mainTestInteractive(self.sess)
        elif self.args.test == Chatbot.TestMode.ALL:
            print('Start predicting...')
            self.predictTestset(self.sess)
            print('All predictions done')
        elif self.args.test == Chatbot.TestMode.DAEMON:
            print('Daemon mode, running in background...')
        else:
            raise RuntimeError('Unknown test mode: {}'.format(self.args.test))
    else:
        self.mainTrain(self.sess)
    if self.args.test != Chatbot.TestMode.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")

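# Hedged usage sketch: the test-mode dispatch above is driven entirely by the
# --test flag parsed in parseArgs() (see the full argument definitions further
# down). Assuming the standard DeepQA entry point (a Chatbot instance whose
# main() accepts an argv-style list), the three modes would be reached as:
#
#   chatbot = Chatbot()
#   chatbot.main(['--test', 'interactive'])  # -> mainTestInteractive()
#   chatbot.main(['--test'])                 # const=TestMode.ALL -> predictTestset()
#   chatbot.main([])                         # no --test -> mainTrain()
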
def main(self, args=None):
    ## (1) Parameter and object initialisation
    # General initialisation
    argsUtil = ArgsUtil()
    argsUtil.parseArgs(argsUtil, args)
    self.args = argsUtil.args
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    self.textData = TextData(self.args)
    self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter("save/model-" + self.args.modelTag)
    self.saver = tf.train.Saver(max_to_keep=200)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    ## (2) Initialise the TF variables
    self.sess.run(tf.global_variables_initializer())

    ## (3) Train in the TF session
    #self.managePreviousModel(self.sess)
    self.mainTrain(self.sess, self.args)

def main(self, args=None):
    ## (1) Parameter and object initialisation
    # General initialisation
    argsUtil = ArgsUtil()
    argsUtil.parseArgs(argsUtil, args)
    self.args = argsUtil.args
    #self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    self.textData = TextData(self.args)
    self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter("save/model-" + self.args.modelTag)
    self.saver = tf.train.Saver(max_to_keep=200)

    # Running sessions
    self.sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    self.sess2 = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    ## (2) Initialise the TF variables
    self.sess.run(tf.global_variables_initializer())
    self.sess2.run(tf.global_variables_initializer())

    ## (3) Predict in the TF session
    self.managePreviousModel(self.sess, self.args.modelTag)

def load_model(self):
    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
    # each word of the vocabulary / decoder input
    # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
    # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
    # (remap the word2id/id2word variables).

    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    # Only create the model if we need to, to avoid errors
    if self.model is None:
        # Prepare the model
        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

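# Hedged usage note: thanks to the `self.model is None` guard above, load_model()
# is safe to call repeatedly (e.g. once per request in a server process); the
# graph is only built on the first call. `bot` below is a hypothetical instance
# of the surrounding class:
#
#   bot.load_model()  # builds TextData and the Model
#   bot.load_model()  # rebuilds TextData but reuses the existing Model
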
def run(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    self.args = self.parseArgs(args)
    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)

    tf.reset_default_graph()

    # Prepare the model
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    self.saver = tf.train.Saver(max_to_keep=200)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    )
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    self.sess.run(tf.global_variables_initializer())

    # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
    self.managePreviousModel(self.sess)
    print("Done")

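# Hedged note: the tf.reset_default_graph() call above is what lets run() be
# invoked more than once in the same process (e.g. from a notebook); without it,
# rebuilding Model would hit variable-reuse errors. A usage sketch, assuming the
# surrounding class is instantiated as `bot`:
#
#   bot.run(['--modelTag', 'v1'])
#   bot.run(['--modelTag', 'v2'])  # safe: the previous graph was discarded
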
def main(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)

    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
    # each word of the vocabulary / decoder input
    # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
    # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
    # (remap the word2id/id2word variables).

    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    # Prepare the model
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)

    # TODO: Fixed seed (WARNING: If shuffling the dataset, make sure to do it after saving the
    # dataset; otherwise everything that comes after the shuffling won't be replicable when
    # reloading the dataset). How to restore the seed after loading?
    # Also fix the seed for random.shuffle (does it work globally for all files?)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    )
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    if self.args.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)

    # Initialize embeddings with pre-trained word2vec vectors
    if self.args.initEmbeddings:
        self.loadEmbedding(self.sess)

    if self.args.tts:
        mixer.init()
        self.tts_client = texttospeech.TextToSpeechClient()
        self.tts_voice = texttospeech.types.VoiceSelectionParams(
            language_code='es-MX',
            ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
        self.tts_audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    if self.args.test:
        if self.args.test == Chatbot.TestMode.INTERACTIVE:
            self.mainTestInteractive(self.sess)
        elif self.args.test == Chatbot.TestMode.ALL:
            print('Start predicting...')
            self.predictTestset(self.sess)
            print('All predictions done')
        elif self.args.test == Chatbot.TestMode.DAEMON:
            print('Daemon mode, running in background...')
        else:
            raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
    else:
        self.mainTrain(self.sess)

    if self.args.test != Chatbot.TestMode.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")

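# Hedged sketch of how the TTS objects initialised above are typically used.
# The synthesize_speech() call follows the pre-2.0 google-cloud-texttospeech
# API implied by the texttospeech.types/enums names; the pygame mixer playback
# and the 'answer.mp3' filename are assumptions, not code from this fork:
#
#   synthesis_input = texttospeech.types.SynthesisInput(text=answer_text)
#   response = self.tts_client.synthesize_speech(
#       synthesis_input, self.tts_voice, self.tts_audio_config)
#   with open('answer.mp3', 'wb') as out:
#       out.write(response.audio_content)
#   mixer.music.load('answer.mp3')
#   mixer.music.play()
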
class Chatbot:
    """
    Main class which launches the training or testing mode
    """

    class TestMode:
        """ Simple structure representing the different testing modes
        """
        ALL = 'all'
        INTERACTIVE = 'interactive'  # The user can write his own questions
        DAEMON = 'daemon'  # The chatbot runs in the background and can regularly be called to predict something

    def __init__(self):
        """
        """
        # Model/dataset parameters
        self.args = None  # Task specific object

        self.textData = None  # Dataset
        self.model = None  # Sequence to sequence model

        # TensorFlow utilities for convenient saving/logging
        self.writer = None
        self.saver = None
        self.modelDir = ''  # Where the model is saved
        self.globStep = 0  # Number of iterations for the current model

        # TensorFlow main session (we keep track of it for the daemon)
        self.sess = None

        # Filename and directory constants
        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.3'
        self.TEST_IN_NAME = 'data/test/samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.REFERENCES_SUFFIX = '_reference.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    @staticmethod
    def parseArgs(args):
        """
        Parse the arguments from the given command line
        Args:
            args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed
        """
        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--test', nargs='?',
                                choices=[Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON],
                                const=Chatbot.TestMode.ALL, default=None,
                                help='if present, the program will try to answer all sentences from data/test/ with'
                                     ' the defined model(s); in interactive mode, the user can write his own sentences;'
                                     ' use daemon mode to integrate the chatbot into another program')
        globalArgs.add_argument('--createDataset', action='store_true',
                                help='if present, the program will only generate the dataset from the corpus (no training/testing)')
        globalArgs.add_argument('--playDataset', type=int, nargs='?', const=10, default=None,
                                help='if set, the program will randomly play some samples (can be used jointly with createDataset if this is the only action you want to perform)')
        globalArgs.add_argument('--reset', action='store_true',
                                help='use this if you want to ignore the previous model present in the model directory (Warning: the model will be destroyed with all the folder content)')
        globalArgs.add_argument('--verbose', action='store_true',
                                help='when testing, plot the outputs at the same time they are computed')
        globalArgs.add_argument('--keepAll', action='store_true',
                                help='if this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)')  # TODO: Add an option to delimit the max size
        globalArgs.add_argument('--modelTag', type=str, default=None,
                                help='tag to differentiate which model to store/load')
        globalArgs.add_argument('--rootDir', type=str, default=None,
                                help='folder where to look for the models and data')
        globalArgs.add_argument('--watsonMode', action='store_true',
                                help='invert the questions and answers when training (the network tries to guess the question)')
        globalArgs.add_argument('--device', type=str, default=None,
                                help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM); allows choosing on which hardware to run the model')
        globalArgs.add_argument('--seed', type=int, default=None,
                                help='random seed for replication')

        # Dataset options
        datasetArgs = parser.add_argument_group('Dataset options')
        datasetArgs.add_argument('--corpus', type=str, default='cornell',
                                 help='corpus on which to extract the dataset: cornell or nutrition or healthy-comments')
        datasetArgs.add_argument('--healthy_flag', type=int, default=0,
                                 help='whether to append a healthy/unhealthy flag at the end of the input meal')
        datasetArgs.add_argument('--encode_food_descrips', type=int, default=0,
                                 help='whether to encode food descriptions')
        datasetArgs.add_argument('--encode_food_ids', type=int, default=0,
                                 help='whether to encode food IDs')
        datasetArgs.add_argument('--encode_single_food_descrip', type=int, default=0,
                                 help='whether to encode single food descriptions')
        datasetArgs.add_argument('--match_encoder_decoder_input', type=int, default=0,
                                 help='whether to use the same input for encoder and decoder')
        datasetArgs.add_argument('--motivate_only', type=int, default=0,
                                 help='only use the first AMT response, the motivational support')
        datasetArgs.add_argument('--advice_only', type=int, default=0,
                                 help='only use the 2nd AMT response, the advice part')
        datasetArgs.add_argument('--datasetTag', type=str, default=None,
                                 help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions')  # The samples are computed from the corpus if they do not exist already. They are saved in 'data/samples/'
        datasetArgs.add_argument('--ratioDataset', type=float, default=1.0,
                                 help='ratio of dataset used to avoid using the whole dataset')  # Not implemented, useless?
        datasetArgs.add_argument('--maxLength', type=int, default=10,
                                 help='maximum length of the sentence (for input and output); defines the maximum number of steps of the RNN')
        datasetArgs.add_argument('--augment', type=int, default=0,
                                 help='whether to include additional meals with similar foods')
        datasetArgs.add_argument('--finetune', type=int, default=0,
                                 help='whether to continue training on nutrition data')
        datasetArgs.add_argument('--all_data', type=int, default=0,
                                 help='whether to use the full model trained on all data')

        # Network options (Warning: if modifying something here, also make the change in save/loadParams())
        nnArgs = parser.add_argument_group('Network options', 'architecture related options')
        nnArgs.add_argument('--hiddenSize', type=int, default=50,
                            help='number of hidden units in each RNN cell')
        nnArgs.add_argument('--numLayers', type=int, default=1,
                            help='number of rnn layers')
        nnArgs.add_argument('--embeddingSize', type=int, default=64,
                            help='embedding size of the word representation')
        nnArgs.add_argument('--softmaxSamples', type=int, default=0,
                            help='number of samples in the sampled softmax loss function. A value of 0 deactivates sampled softmax')
        nnArgs.add_argument('--attention', type=int, default=0,
                            help='whether to use an RNN with attention')
        nnArgs.add_argument('--food_context', type=int, default=0,
                            help='whether to use a decoder with a food context vector')
        nnArgs.add_argument('--first_step', type=int, default=0,
                            help='whether to limit the food context vector to the first decode step and input zeros for the rest')
        nnArgs.add_argument('--beam_search', type=int, default=1,
                            help='whether to decode using beam search')
        nnArgs.add_argument('--beam_size', type=int, default=10,
                            help='number of candidate paths to keep on the beam during beam search decoding')
        nnArgs.add_argument('--MMI', type=int, default=0,
                            help='whether to rank decoded candidates with the MMI criterion')
        nnArgs.add_argument('--lambda_wt', type=float, default=0.1,
                            help='weight controlling how much to penalize the target response in the final MMI score')
        nnArgs.add_argument('--gamma_wt', type=int, default=1,
                            help='number of words in the target to penalize/weight for the length term of the MMI score')

        # Training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs', type=int, default=30,
                                  help='maximum number of epochs to run')
        trainingArgs.add_argument('--saveEvery', type=int, default=1000,
                                  help='number of mini-batch steps between model checkpoints')
        trainingArgs.add_argument('--batchSize', type=int, default=10,
                                  help='mini-batch size')
        trainingArgs.add_argument('--learningRate', type=float, default=0.001,
                                  help='learning rate')

        return parser.parse_args(args)

    def main(self, args=None):
        """
        Launch the training and/or the interactive mode
        """
        print('Welcome to DeepQA v0.1 !')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # General initialisation
        self.args = self.parseArgs(args)

        if self.args.corpus == 'nutrition':
            self.args.maxLength = 100
            if self.args.encode_food_descrips:
                self.MODEL_DIR_BASE = 'save/food-meal-model'
                self.SENTENCES_PREFIX = ['Input food: ', 'Output meal: ']
            elif self.args.encode_single_food_descrip:
                self.MODEL_DIR_BASE = 'save/single-food-meal-model'
                self.SENTENCES_PREFIX = ['Input food: ', 'Output meal: ']
            elif self.args.encode_food_ids:
                self.MODEL_DIR_BASE = 'save/foodID-meal-model'
                self.SENTENCES_PREFIX = ['Input food: ', 'Output meal: ']
            else:
                self.MODEL_DIR_BASE = 'save/meal-model'
                self.SENTENCES_PREFIX = ['Input meal: ', 'Output meal: ']
        elif self.args.corpus == 'healthy-comments':
            self.args.maxLength = 100
            self.args.usda_vecs = load_usda_vecs()
            if self.args.all_data:
                self.MODEL_DIR_BASE = 'save_allData/healthy-comments'
            else:
                self.MODEL_DIR_BASE = 'save/healthy-comments'
            if self.args.healthy_flag:
                self.MODEL_DIR_BASE = 'save/healthy-comments-flag'
            elif self.args.encode_food_ids:
                self.MODEL_DIR_BASE = 'save/healthy-comments-foodID'
            self.SENTENCES_PREFIX = ['Input meal: ', 'Output comment: ']
            self.TEST_IN_NAME = 'data/test/healthy_comments_test.txt'
            if self.args.motivate_only:
                self.MODEL_DIR_BASE += '-motivate'
            elif self.args.advice_only:
                self.MODEL_DIR_BASE += '-advice'
            if self.args.match_encoder_decoder_input:
                self.MODEL_DIR_BASE += '-match-decoder'
        if self.args.attention:
            self.MODEL_DIR_BASE += '-attention'
            self.args.softmaxSamples = 512
        if self.args.food_context:
            self.MODEL_DIR_BASE += '-context'
            self.args.softmaxSamples = 512
        if self.args.first_step:
            self.MODEL_DIR_BASE += '-firstStep'
        if self.args.augment:
            self.MODEL_DIR_BASE += '-augment'
        if self.args.numLayers == 2:
            self.MODEL_DIR_BASE += '-deep'
        if self.args.finetune:
            self.MODEL_DIR_BASE += '-finetune'
        if self.args.MMI:
            self.args.beam_size = 200
        #if self.args.beam_search:
        #    self.MODEL_DIR_BASE += '-beam'

        '''
        # create ranker model
        if self.args.test and self.args.food_context:
            import sys
            sys.path.append('/usr/users/korpusik/LanaServer/Server/Model')
            from ranker import Ranker
            self.args.model = Ranker()
        '''

        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()  # Use the current working directory

        #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

        self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

        self.textData = TextData(self.args)
        # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
        # each word of the vocabulary / decoder input
        # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
        # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
        # (remap the word2id/id2word variables).

        if self.args.createDataset:
            print('Dataset created! Thanks for using this program')
            return  # No need to go further

        if self.args.MMI:
            # Create a bigram language model for MMI scoring of the decoder output
            self.probDist = nltk.ConditionalProbDist(
                nltk.ConditionalFreqDist(nltk.bigrams(self.textData.responseWords)),
                nltk.MLEProbDist)

        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        # Saver/summaries
        self.writer = tf.summary.FileWriter(self._getSummaryName())
        self.saver = tf.train.Saver(max_to_keep=200, write_version=tf.train.SaverDef.V1)  # Arbitrary limit?

        # TODO: Fixed seed (WARNING: If shuffling the dataset, make sure to do it after saving the
        # dataset; otherwise everything that comes after the shuffling won't be replicable when
        # reloading the dataset). How to restore the seed after loading?
        # Also fix the seed for random.shuffle (does it work globally for all files?)

        # Running session
        self.sess = tf.Session()  # TODO: Replace all sess by self.sess (not necessarily a good idea)?

        print('Initialize variables...')
        self.sess.run(tf.global_variables_initializer())

        # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
        if self.args.test != Chatbot.TestMode.ALL:
            self.managePreviousModel(self.sess)

        if self.args.test:
            if self.args.test == Chatbot.TestMode.INTERACTIVE:
                self.mainTestInteractive(self.sess)
            elif self.args.test == Chatbot.TestMode.ALL:
                print('Start predicting...')
                self.predictTestset(self.sess)
                print('All predictions done')
            elif self.args.test == Chatbot.TestMode.DAEMON:
                print('Daemon mode, running in background...')
            else:
                raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
        else:
            self.mainTrain(self.sess)

        if self.args.test != Chatbot.TestMode.DAEMON:
            self.sess.close()
            print("The End! Thanks for using this program")

    def mainTrain(self, sess):
        """ Training loop
        Args:
            sess: The current running session
        """
        # Specific training dependent loading
        self.textData.makeLighter(self.args.ratioDataset)  # Limit the number of training samples

        mergedSummaries = tf.summary.merge_all()  # Define the summary operator (Warning: Won't appear on the tensorboard graph)
        if self.globStep == 0:  # Not restoring from a previous run
            self.writer.add_graph(sess.graph)  # First time only

        # If restoring a model, restore the progress bar? and current batch?

        print('Start training (press Ctrl+C to save and exit)...')

        try:  # If the user exits while training, we still try to save the model
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(e + 1, self.args.numEpochs, self.args.learningRate))

                batches = self.textData.getBatches()

                # TODO: Also update learning parameters eventually

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2  # training, loss
                    _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()
                print("Epoch finished in {}".format(toc - tic))  # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer
        except (KeyboardInterrupt, SystemExit):  # If the user presses Ctrl+C while training is in progress
            print('Interruption detected, exiting the program...')

        self._saveSession(sess)  # Ultimate saving before complete exit

    def predictTestset(self, sess):
        """ Try predicting the sentences from the samples.txt file.
        The sentences are saved in the modelDir under the same name
        Args:
            sess: The current running session
        """
        modelList = self._getModelList()
        if not modelList:
            print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir))
            return

        if self.args.corpus == 'healthy-comments':
            lines = []
            responses = None
            responses_motivate = []
            responses_advice = []
            corpusDir = '/usr/users/korpusik/nutrition/Talia_data/'
            files = ['salad1.csv', 'salad2.csv', 'salad3.csv', 'dinner1.csv', 'dinner2.csv',
                     'dinner3.csv', 'pasta1.csv', 'pasta2.csv', 'pasta3.csv', 'pasta4.csv']
            count = 0
            for filen in files:
                csvfile = open(corpusDir + filen)
                reader = csv.DictReader(csvfile)
                for row in reader:
                    count += 1
                    # Use every 10th line for testing
                    if count % 10 != 0:
                        continue
                    #print(row['Input.meal_response'])
                    lines.append(row['Input.meal_response'])
                    responses_motivate.append(row['Answer.description1'])
                    responses_advice.append(row['Answer.description2'])
            assert len(lines) == len(responses_motivate) == len(responses_advice)
        else:
            # Loading the file to predict
            with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
                lines = f.readlines()
            responses = None

        # Predicting for each model present in modelDir
        meal_response_map = {}  # Maps meals to the list of candidate responses
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + '_predictions_0.1_full_data'  # We remove the model extension and add the prediction suffix
            if self.args.MMI:
                saveName += '_MMI_' + str(self.args.lambda_wt) + '_' + str(self.args.gamma_wt)
            saveName += '.txt'

            if self.args.corpus == 'healthy-comments':
                reference_f1 = open(modelName[:-len(self.MODEL_EXT)] + '_reference_motivate.txt', 'w')
                reference_f2 = open(modelName[:-len(self.MODEL_EXT)] + '_reference_advice.txt', 'w')
                meal_f = open('test_meals.txt', 'w')
            else:
                reference_f = open(modelName[:-len(self.MODEL_EXT)] + self.REFERENCES_SUFFIX, 'w')

            with open(saveName, 'w') as f:
                nbIgnored = 0
                for i, line in enumerate(tqdm(lines, desc='Sentences')):
                    question = line[:-1]  # Remove the endl character

                    if responses:
                        response = responses[i]
                        reference_f.write(response + '\n')
                    elif self.args.corpus == 'healthy-comments':
                        reference_f1.write(responses_motivate[i] + '\n')
                        reference_f2.write(responses_advice[i] + '\n')
                        meal_f.write(question + '\n')

                    answer, predict_responses = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    output = self.textData.sequence2str(answer, clean=True)
                    predict_responses = [self.textData.sequence2str(reply, clean=True)
                                         for reply in predict_responses]
                    meal_response_map[question] = predict_responses

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, output, x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(output + '\n')

                print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

            if self.args.corpus == 'healthy-comments':
                reference_f1.close()
                reference_f2.close()
                meal_f.close()
            else:
                reference_f.close()

            with open(modelName[:-len(self.MODEL_EXT)] + 'predict_candidates.json', 'w') as fp:
                json.dump(meal_response_map, fp)

            print('Output: ', modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX)
            print('Refs: ', modelName[:-len(self.MODEL_EXT)] + self.REFERENCES_SUFFIX)

    def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
        # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
        # TODO: Log the questions asked for later re-use (merge with test/samples.txt)

        print('Testing: Launch interactive mode:')
        print('')
        print('Welcome to the interactive mode, here you can ask Deep Q&A any sentence you want. Don\'t have high '
              'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')

        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            questionSeq = []  # Will contain the question as seen by the encoder
            answer, candidates = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue  # Back to the beginning, try again

            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

            if self.args.verbose:
                print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                print(self.textData.sequence2str(answer))

            print()

    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (List<int>): output argument. If given will contain the input batch sequence
        Return:
            list<int>, list<list<int>>: the word ids corresponding to the answer, and the candidate answers
        """
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None, None  # Callers unpack (answer, candidates)
        if questionSeq is not None:  # If the caller wants to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch, self.args.match_encoder_decoder_input)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)

        candidates = []
        if self.args.beam_search:
            # Collect all candidates on the beam by backtracking through the stored paths
            probs, path, symbol, output = output[-1], output[-3], output[-2], output[:-3]
            #print('probs', probs)
            #print('path', path)
            #print('symbol', symbol)
            paths = []
            log_probs = []  # Total log prob of each candidate path
            num_steps = len(path)
            lastTokenIndex = [num_steps] * self.args.beam_size
            for kk in range(self.args.beam_size):
                paths.append([])
                log_probs.append(0.0)
                for i in range(num_steps - 1):
                    if symbol[i][kk] == self.textData.eosToken:
                        lastTokenIndex[kk] = i
                        break
            curr = list(range(self.args.beam_size))
            for i in range(num_steps - 1, -1, -1):  # Walk backwards from the last step
                for kk in range(self.args.beam_size):
                    if i > lastTokenIndex[kk]:
                        continue
                    paths[kk].append(symbol[i][curr[kk]])
                    log_probs[kk] = log_probs[kk] + probs[i][curr[kk]]
                    curr[kk] = path[i][curr[kk]]
            #print("Replies ---------------------->")
            reply_score_map = {}
            best_score = None
            for kk in range(self.args.beam_size):
                foutputs = [int(logit) for logit in paths[kk][::-1]]  # Reverse: paths were built back-to-front
                candidates.append(foutputs)
                #print(foutputs)
                reply = self.textData.sequence2str(foutputs, clean=True)
                if reply in reply_score_map:
                    continue
                if self.args.MMI:
                    length_term = self.args.gamma_wt * len(paths[kk])
                    log_LM_penalty = 0.0
                    prevWord = "<start>"
                    for wordID in foutputs[:self.args.gamma_wt]:
                        currWord = self.textData.id2word[wordID]
                        bigramP = self.probDist[prevWord].prob(currWord)
                        # TODO: try Kneser-Ney smoothing
                        if bigramP > 0:
                            log_LM_penalty += math.log(bigramP)
                        prevWord = currWord
                    # TODO: try with a product of probs instead of a sum of logs
                    LM_term = self.args.lambda_wt * log_LM_penalty
                    score = log_probs[kk] - LM_term + length_term
                    reply_score_map[reply] = score
                    #print(score, log_probs[kk], LM_term, length_term, reply)
                else:
                    print(reply)
                    score = log_probs[kk]
                    reply_score_map[reply] = log_probs[kk]
                if kk == 0:
                    answer = foutputs
                    best_score = score
                elif score > best_score:
                    answer = foutputs
                    best_score = score
            # Rerank replies based on the MMI scores
            if self.args.MMI:
                sorted_replies = sorted(reply_score_map.items(), key=operator.itemgetter(1), reverse=True)
                for i, (reply, score) in enumerate(sorted_replies):
                    if self.args.test == 'interactive':
                        print(i, score, reply)
        else:
            answer = self.textData.deco2sentence(output)
            candidates.append(answer)

        return answer, candidates

    def daemonPredict(self, sentence):
        """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning)
        Args:
            sentence (str): the raw input sentence
        Return:
            str: the human readable sentence
        """
        answer, _ = self.singlePredict(sentence)  # singlePredict() returns (answer, candidates)
        return self.textData.sequence2str(answer, clean=True)

    def daemonClose(self):
        """ A utility function to close the daemon when finished
        """
        print('Exiting the daemon mode...')
        self.sess.close()
        print('Daemon closed.')

    def managePreviousModel(self, sess):
        """ Restore or reset the model, depending on the parameters
        If the destination directory already contains some files, it will handle the conflict as follows:
         * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training
           restarts from scratch (globStep etc. reinitialized)
         * Otherwise, it depends on the directory content. If the directory contains:
           * No model files (only summary logs): works as a reset (restart from scratch)
           * Other model files, but modelName not found (surely the keepAll option changed): raise an error, the user
             should decide by himself what to do
           * The right model file (eventually some others): no problem, simply resume the training
        In any case, the directory will exist as it has been created by the summary writer
        Args:
            sess: The current running session
        """

        print('WARNING: ', end='')

        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            if self.args.reset:
                print('Reset: Destroying previous model at {}'.format(self.modelDir))
            # Analysing directory content
            elif os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
                print('Model restored.')
            elif self._getModelList():
                print('Conflict with previous models.')
                raise RuntimeError('Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)'.format(self.modelDir))
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args.reset = True

            if self.args.reset:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from clean directory: {}'.format(self.modelDir))

    def _saveSession(self, sess):
        """ Save the model parameters and the variables
        Args:
            sess: the current session
        """
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        self.saver.save(sess, self._getModelName())  # TODO: Put a limit size (ex: 3GB for the modelDir)
        tqdm.write('Model saved.')

    def _getModelList(self):
        """ Return the list of the model files inside the model directory
        """
        return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)]

    def loadModelParams(self):
        """ Load some values associated with the current model, like the current globStep value
        For now, this function does not need to be called before loading the model (no parameters restored). However,
        the modelDir name will be initialized here so it is required to call this function before managePreviousModel(),
        _getModelName() or _getSummaryName()
        Warning: if you modify this function, make sure the changes mirror saveModelParams; also check if the
        parameters should be reset in managePreviousModel
        """
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if not self.args.reset and not self.args.createDataset and os.path.exists(configName):
            # Loading
            config = configparser.ConfigParser()
            config.read(configName)

            # Check the version
            currentVersion = config['General'].get('version')
            if currentVersion != self.CONFIG_VERSION:
                raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

            # Restoring the parameters
            self.globStep = config['General'].getint('globStep')
            self.args.maxLength = config['General'].getint('maxLength')  # We need to restore the model length because of the associated textData and the vocabulary size (TODO: Compatibility mode between different maxLength)
            self.args.watsonMode = config['General'].getboolean('watsonMode')
            #self.args.datasetTag = config['General'].get('datasetTag')
            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.softmaxSamples = config['Network'].getint('softmaxSamples')

            # No restoring for training params, batch size or other non model dependent parameters

            # Show the restored params
            print()
            print('Warning: Restoring parameters:')
            print('globStep: {}'.format(self.globStep))
            print('maxLength: {}'.format(self.args.maxLength))
            print('watsonMode: {}'.format(self.args.watsonMode))
            print('hiddenSize: {}'.format(self.args.hiddenSize))
            print('numLayers: {}'.format(self.args.numLayers))
            print('embeddingSize: {}'.format(self.args.embeddingSize))
            print('softmaxSamples: {}'.format(self.args.softmaxSamples))
            print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

        if self.args.watsonMode:
            self.SENTENCES_PREFIX.reverse()

    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['maxLength'] = str(self.args.maxLength)
        config['General']['watsonMode'] = str(self.args.watsonMode)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['softmaxSamples'] = str(self.args.softmaxSamples)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training; those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir

    def _getModelName(self):
        """ Parse the argument to decide where to save/load the model
        This function is called at each checkpoint and the first time the model is loaded. If the keepAll option is
        set, the globStep value will be included in the name.
        Return:
            str: The path and name where the model needs to be saved
        """
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:  # We do not erase the previously saved model by including the current step in the name
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT

    def getDevice(self):
        """ Parse the argument to decide on which device to run the model
        Return:
            str: The name of the device on which to run the program
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:  # No specified device (default)
            return None
        else:
            print('Warning: Error in the device name: {}, using the default device'.format(self.args.device))
            return None

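# Minimal launch sketch for the class above (hedged: the module name 'chatbot'
# and the main.py entry point mirror the upstream DeepQA layout and are
# assumptions here, not shown in this file):
#
#   from chatbot import Chatbot
#
#   if __name__ == '__main__':
#       chatbot = Chatbot()
#       chatbot.main()  # or chatbot.main(['--test', 'daemon']) and then:
#       # answer = chatbot.daemonPredict('Hello!')
#       # chatbot.daemonClose()
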
def main(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)

    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
    # each word of the vocabulary / decoder input
    # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
    # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
    # (remap the word2id/id2word variables).

    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    # Prepare the model
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    if '12' in tf.__version__:  # HACK: Work around the new tf Saver V2 format
        self.saver = tf.train.Saver(max_to_keep=200, write_version=1)  # Arbitrary limit?
    else:
        self.saver = tf.train.Saver(max_to_keep=200)

    # TODO: Fixed seed (WARNING: If shuffling the dataset, make sure to do it after saving the
    # dataset; otherwise everything that comes after the shuffling won't be replicable when
    # reloading the dataset). How to restore the seed after loading?
    # Also fix the seed for random.shuffle (does it work globally for all files?)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    )
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    print("Displaying summary statistics from TextData object:")
    self.textData.printWordCountStats()
    print("self.textData.getWordIdCalledCount is: " + str(self.textData.getWordIdCalledCount))

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    print("Displaying summary statistics from TextData object:")
    self.textData.printWordCountStats()
    print("self.textData.getWordIdCalledCount is: " + str(self.textData.getWordIdCalledCount))

    # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)

    # Initialize embeddings with pre-trained word2vec vectors
    if self.args.initEmbeddings:
        print("Loading pre-trained embeddings from GoogleNews-vectors-negative300.bin")
        self.loadEmbedding(self.sess)

    print("Displaying summary statistics from TextData object:")
    self.textData.printWordCountStats()
    print("self.textData.getWordIdCalledCount is: " + str(self.textData.getWordIdCalledCount))

    if self.args.test:
        if self.args.test == Chatbot.TestMode.INTERACTIVE:
            self.mainTestInteractive(self.sess)
        elif self.args.test == Chatbot.TestMode.ALL:
            print('Start predicting...')
            self.predictTestset(self.sess)
            print('All predictions done')
        elif self.args.test == Chatbot.TestMode.DAEMON:
            print('Daemon mode, running in background...')
        else:
            raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
    else:
        self.mainTrain(self.sess)

    if self.args.test != Chatbot.TestMode.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")

def main(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)

    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

    self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the Model, but must be called before _getSummaryName()

    self.textData = TextData(self.args)
    # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
    # each word of the vocabulary / decoder input
    # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
    # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
    # (remap the word2id/id2word variables).

    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    # Prepare the model
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)

    # TODO: Fixed seed (WARNING: If shuffling the dataset, make sure to do it after saving the
    # dataset; otherwise everything that comes after the shuffling won't be replicable when
    # reloading the dataset). How to restore the seed after loading?
    # Also fix the seed for random.shuffle (does it work globally for all files?)

    # Running session
    self.sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,  # Allows a backup device for operations without a GPU kernel (when forcing GPU)
        log_device_placement=False)  # Too verbose?
    )
    # TODO: Replace all sess by self.sess (not necessarily a good idea)?

    if self.args.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    # Eventually reload the model (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)

    # Initialize embeddings with pre-trained word2vec vectors
    if self.args.initEmbeddings:
        self.loadEmbedding(self.sess)

    if self.args.test:
        if self.args.test == Chatbot.TestMode.INTERACTIVE:
            self.mainTestInteractive(self.sess)
        elif self.args.test == Chatbot.TestMode.ALL:
            print('Start predicting...')
            self.predictTestset(self.sess)
            print('All predictions done')
        elif self.args.test == Chatbot.TestMode.DAEMON:
            print('Daemon mode, running in background...')
        else:
            raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
    else:
        self.mainTrain(self.sess)

    if self.args.test != Chatbot.TestMode.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")

def main(self, args=None):
    """ Launch the training and/or the interactive mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # General initialisation
    self.args = self.parseArgs(args)

    if not self.args.rootDir:
        self.args.rootDir = os.getcwd()  # Use the current working directory

    self.loadModelParams()

    self.textData = TextData(self.args)
    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return  # No need to go further

    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Saver/summaries
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    # tf 0.12 and before: self.writer = tf.train.SummaryWriter(self._getSummaryName())
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=200)  # Arbitrary limit?

    # Running session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    # Eventually reload the model (if it exists); in testing mode,
    # the models are not loaded here (but in predictTestset)
    if self.args.test != Chatbot.TestMode.ALL:
        self.managePreviousModel(self.sess)

    # Use the attention mechanism in the model by creating an embedding_attention model
    if self.args.useAttentions:
        print('Using attention mechanism in model')
        if self.args.softmaxSamples == 0:
            print('Warning: Using the attention mechanism without softmax samples '
                  'requires more memory and may raise an OOM exception.')
            print('Recommendation: rerun the program and train the model '
                  'with the softmaxSamples and useAttentions arguments')

    # Initialize embeddings with pre-trained word2vec vectors
    if self.args.initEmbeddings:
        print("Loading pre-trained embeddings from GoogleNews-vectors-negative300.bin")
        self.loadEmbedding(self.sess)

    if self.args.test:
        if self.args.test == Chatbot.TestMode.INTERACTIVE:
            self.mainTestInteractive(self.sess)
        elif self.args.test == Chatbot.TestMode.ALL:
            print('Start predicting...')
            self.predictTestset(self.sess)
            print('All predictions done')
        elif self.args.test == Chatbot.TestMode.DAEMON:
            print('Daemon mode, running in background...')
        else:
            raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
    else:
        self.mainTrain(self.sess)

    if self.args.test != Chatbot.TestMode.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")

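# Following the warning printed above, a hedged invocation sketch: enabling
# attention together with sampled softmax. The --useAttentions and
# --softmaxSamples flag names come from this fork's code; the main.py entry
# point is an assumption:
#
#   python main.py --useAttentions --softmaxSamples 512
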
class Chatbot: """ Main class which launches the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # The user can write his own questions DAEMON = 'daemon' # The chatbot runs in the background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represents the number of iterations for the current model # TensorFlow main session (we keep track of it for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save' + os.sep + 'model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' self.CONFIG_VERSION = '0.5' self.TEST_IN_NAME = 'data' + os.sep + 'test' + os.sep + 'samples.txt' self.TEST_OUT_SUFFIX = '_predictions.txt' self.SENTENCES_PREFIX = ['Q: ', 'A: '] def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to Conversation with Shakespeare') # General initialisation self.args = args print(self.args.numEpochs) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory ## Function to load any parameters self.loadModelParams() self.textData = TextData(self.args) if self.args.createDataset: print('Dataset created.') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, everything which comes after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it work globally for all files ?) # Running session self.sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, # Allows a backup device for non-GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) # TODO: Replace all sess by self.sess (not necessarily a good idea) ? if self.args.debug: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: self.loadEmbedding(self.sess) self.mainTrain(self.sess) def mainTrain(self, sess): """ Training loop Args: sess: The current running session """ # Specific training dependent loading self.textData.makeLighter(self.args.ratioDataset) # Limit the number of training samples mergedSummaries = tf.summary.merge_all() # Define the summary operator (Warning: Won't appear on the tensorboard graph) if self.globStep == 0: # Not restoring from previous run self.writer.add_graph(sess.graph) # First time only # If restoring a model, restore the progress bar ? and current batch ? 
print('Start training (press Ctrl+C to save and exit)...') try: # If the user exits while training, we still try to save the model for e in range(self.args.numEpochs): print() print("----- Epoch {}/{} ; (lr={}) -----".format(e+1, self.args.numEpochs, self.args.learningRate)) batches = self.textData.getBatches() # TODO: Also update learning parameters eventually tic = datetime.datetime.now() for nextBatch in tqdm(batches, desc="Training"): # Training pass ops, feedDict = self.model.step(nextBatch) assert len(ops) == 2 # training, loss _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict) self.writer.add_summary(summary, self.globStep) self.globStep += 1 # Output training status if self.globStep % 100 == 0: perplexity = math.exp(float(loss)) if loss < 300 else float("inf") tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity)) # Checkpoint if self.globStep % self.args.saveEvery == 0: self._saveSession(sess) toc = datetime.datetime.now() print("Epoch finished in {}".format(toc-tic)) # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't very pretty except (KeyboardInterrupt, SystemExit): # If the user presses Ctrl+C while training is in progress print('Interruption detected, exiting the program...') self._saveSession(sess) # Final save before the complete exit def loadEmbedding(self, sess): """ Initialize embeddings with pre-trained word2vec vectors Will modify the embedding weights of the currently loaded model Uses the GoogleNews pre-trained values (path hardcoded) """ # Fetch embedding variables from model with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True): em_out = tf.get_variable("embedding") # Disable training for embeddings variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) variables.remove(em_in) variables.remove(em_out) # If restoring a model, we can leave here if self.globStep != 0: return # New model, we load the pre-trained word2vec data and initialize embeddings embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource) embeddings_format = os.path.splitext(embeddings_path)[1][1:] print("Loading pre-trained word embeddings from %s " % embeddings_path) with open(embeddings_path, "rb") as f: header = f.readline() vocab_size, vector_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * vector_size initW = np.random.uniform(-0.25,0.25,(len(self.textData.word2id), vector_size)) for line in tqdm(range(vocab_size)): word = [] while True: ch = f.read(1) if ch == b' ': word = b''.join(word).decode('utf-8') break if ch != b'\n': word.append(ch) if word in self.textData.word2id: if embeddings_format == 'bin': vector = np.fromstring(f.read(binary_len), dtype='float32') elif embeddings_format == 'vec': vector = np.fromstring(f.readline(), sep=' ', dtype='float32') else: raise Exception("Unknown format for embeddings: %s " % embeddings_format) initW[self.textData.word2id[word]] = vector else: if embeddings_format == 'bin': f.read(binary_len) elif embeddings_format == 'vec': f.readline() else: raise Exception("Unknown format for embeddings: %s " % embeddings_format) # PCA Decomposition to reduce word2vec dimensionality if self.args.embeddingSize < vector_size: U, s, Vt = np.linalg.svd(initW, full_matrices=False) S = np.zeros((vector_size, vector_size), dtype=complex) S[:vector_size, :vector_size] = 
np.diag(s) initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize]) # Initialize input and output embeddings sess.run(em_in.assign(initW)) sess.run(em_out.assign(initW)) def _saveSession(self, sess): """ Save the model parameters and the variables Args: sess: the current session """ tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...') self.saveModelParams() model_name = self._getModelName() with open(model_name, 'w') as f: # HACK: Simulate the old model existence to avoid rewriting the file parser f.write('This file is used internally by DeepQA to check the model existence. Please do not remove.\n') self.saver.save(sess, model_name) # TODO: Put a limit size (ex: 3GB for the modelDir) tqdm.write('Model saved.') def _getModelList(self): """ Return the list of the model files inside the model directory """ return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)] def loadModelParams(self): self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE) if self.args.modelTag: self.modelDir += '-' + self.args.modelTag self.args.maxLengthEnco = self.args.maxLength self.args.maxLengthDeco = self.args.maxLength + 2 if self.args.watsonMode: self.SENTENCES_PREFIX.reverse() def saveModelParams(self): """ Save the params of the model, like the current globStep value Warning: if you modify this function, make sure the changes mirror loadModelParams """ config = configparser.ConfigParser() config['General'] = {} config['General']['version'] = self.CONFIG_VERSION config['General']['globStep'] = str(self.globStep) config['General']['watsonMode'] = str(self.args.watsonMode) config['General']['autoEncode'] = str(self.args.autoEncode) config['General']['corpus'] = str(self.args.corpus) config['Dataset'] = {} config['Dataset']['datasetTag'] = str(self.args.datasetTag) config['Dataset']['maxLength'] = str(self.args.maxLength) config['Dataset']['filterVocab'] = str(self.args.filterVocab) config['Dataset']['skipLines'] = str(self.args.skipLines) config['Dataset']['vocabularySize'] = str(self.args.vocabularySize) config['Network'] = {} config['Network']['hiddenSize'] = str(self.args.hiddenSize) config['Network']['numLayers'] = str(self.args.numLayers) config['Network']['softmaxSamples'] = str(self.args.softmaxSamples) config['Network']['initEmbeddings'] = str(self.args.initEmbeddings) config['Network']['embeddingSize'] = str(self.args.embeddingSize) config['Network']['embeddingSource'] = str(self.args.embeddingSource) # Keep track of the learning params (but without restoring them) config['Training (won\'t be restored)'] = {} config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate) config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize) config['Training (won\'t be restored)']['dropout'] = str(self.args.dropout) with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile: config.write(configFile) def _getSummaryName(self): """ Parse the argument to decide where to save the summary, in the same place as the model The folder could already contain logs if we restore the training, those will be merged Return: str: The path and name of the summary """ return self.modelDir def _getModelName(self): """ Parse the argument to decide where to save/load the model This function is called at each checkpoint and the first time the model is loaded. If keepAll option is set, the globStep value will be included in the name. 
Return: str: The path and name where the model needs to be saved """ modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE) if self.args.keepAll: # We do not erase the previously saved model by including the current step on the name modelName += '-' + str(self.globStep) return modelName + self.MODEL_EXT def getDevice(self): """ Parse the argument to decide on which device to run the model Return: str: The name of the device on which to run the program """ if self.args.device == 'cpu': return '/cpu:0' elif self.args.device == 'gpu': return '/gpu:0' elif self.args.device is None: # No specified device (default) return None else: print('Warning: Error in the device name: {}, using the default device'.format(self.args.device)) return None
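# A small sketch of how the string returned by getDevice() is consumed
# (assumes TF 1.x; '/gpu:0' needs a visible GPU, otherwise allow_soft_placement
# lets TF fall back to CPU; illustrative only):
import tensorflow as tf

device = '/cpu:0'  # e.g. the getDevice() result for --device cpu
with tf.device(device):  # tf.device(None) is also valid and means "no constraint"
    a = tf.constant([1.0, 2.0])
    b = a * 2.0

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    print(sess.run(b))  # [2. 4.]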
def main(self, args=None): """ Launch the training and/or the interactive mode """ # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams() # Update the self.modelDir and self.globStep, for now, not used when loading Model (but needs to be called before _getSummaryName) self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model is trained for a specific dataset (because of the maxLength which defines the # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver( max_to_keep=200, write_version=tf.train.SaverDef.V1 ) # TODO: See GitHub for format name issue (when restoring the model) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, everything which comes after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it work globally for all files ?) # Running session self.sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, # Allows a backup device for non-GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) if self.args.debug: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Reload the model if it exists; in testing mode, the models are not loaded here (but in predictTestset) self.managePreviousModel(self.sess) # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: print( "Loading pre-trained embeddings from GoogleNews-vectors-negative300.bin" ) self.loadEmbedding(self.sess) if self.args.test: if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format( self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End!")
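# The variant above pins the checkpoint format with
# write_version=tf.train.SaverDef.V1 (the old single-file format mentioned in
# the format-name TODO). A minimal stand-alone sketch of that Saver setup
# (TF 1.x API; illustrative only, not part of the original file):
import tensorflow as tf

step = tf.Variable(0, name='globStep')  # a Saver needs at least one variable in the graph
saver = tf.train.Saver(max_to_keep=200, write_version=tf.train.SaverDef.V1)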
def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to DeepQA v0.1 !') print() print('TensorFlow detected: v{}'.format(tf.__version__)) # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams() # Update the self.modelDir and self.globStep, for now, not used when loading Model (but needs to be called before _getSummaryName) self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model is trained for a specific dataset (because of the maxLength which defines the # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.train.SummaryWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # Arbitrary limit ? # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, everything which comes after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it work globally for all files ?) # Running session self.sess = tf.Session() # TODO: Replace all sess by self.sess (not necessarily a good idea) ? print('Initialize variables...') self.sess.run(tf.initialize_all_variables()) # Reload the model if it exists; in testing mode, the models are not loaded here (but in predictTestset) if self.args.test != Chatbot.TestMode.ALL: self.managePreviousModel(self.sess) if self.args.test: # TODO: For testing, add a mode where instead of taking the most likely output after the <go> token, # takes the second or third so it generates new sentences for the same input. Difficult to implement, # probably have to modify the TensorFlow source code if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.ALL: print('Start predicting...') self.predictTestset(self.sess) print('All predictions done') elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format(self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End! Thanks for using this program")
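# The variant above targets a pre-1.0 TensorFlow: tf.train.SummaryWriter and
# tf.initialize_all_variables() were renamed in TF 1.x. A hedged compatibility
# sketch (the fallback branches only matter on very old TF installs):
import tensorflow as tf

if hasattr(tf, 'summary') and hasattr(tf.summary, 'FileWriter'):
    FileWriter = tf.summary.FileWriter          # TF >= 1.0 name
else:
    FileWriter = tf.train.SummaryWriter         # pre-1.0 name

if hasattr(tf, 'global_variables_initializer'):
    init_op = tf.global_variables_initializer()  # TF >= 1.0 name
else:
    init_op = tf.initialize_all_variables()      # pre-1.0 name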
def _init_deep_qa_bot(self, chatbot, args=None): """ Launch the training and/or the interactive mode Almost the same as chatbot.main(), except that the print calls are replaced with logging """ # General initialisation chatbot.args = chatbot.parseArgs(args) if not chatbot.args.rootDir: chatbot.args.rootDir = os.getcwd() # Use the current working directory # tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL chatbot.loadModelParams() # Update the chatbot.modelDir and chatbot.globStep, for now, not used when loading Model (but needs to be called before _getSummaryName) chatbot.textData = TextData(chatbot.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model is trained for a specific dataset (because of the maxLength which defines the # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary ( # remap the word2id/id2word variables). if chatbot.args.createDataset: self._logger.info( '[{}] Dataset created! Thanks for using this program'.format( chatbot.args.modelTag)) return # No need to go further # Prepare the model with tf.device(chatbot.getDevice()): chatbot.model = Model(chatbot.args, chatbot.textData) # Saver/summaries chatbot.writer = tf.summary.FileWriter(chatbot._getSummaryName()) chatbot.saver = tf.train.Saver(max_to_keep=200) # Running session chatbot.sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, # Allows a backup device for non-GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) if chatbot.args.debug: chatbot.sess = tf_debug.LocalCLIDebugWrapperSession(chatbot.sess) chatbot.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self._logger.info('[{}] Initialize variables...'.format( chatbot.args.modelTag)) chatbot.sess.run(tf.global_variables_initializer()) # Reload the model if it exists; in testing mode, the models are not loaded here (but in predictTestset) if chatbot.args.test != Chatbot.TestMode.ALL: chatbot.managePreviousModel(chatbot.sess) # Initialize embeddings with pre-trained word2vec vectors if chatbot.args.initEmbeddings: chatbot.loadEmbedding(chatbot.sess) if chatbot.args.test: if chatbot.args.test == Chatbot.TestMode.INTERACTIVE: chatbot.mainTestInteractive(chatbot.sess) elif chatbot.args.test == Chatbot.TestMode.ALL: self._logger.info('[{}] Start predicting...'.format( chatbot.args.modelTag)) chatbot.predictTestset(chatbot.sess) self._logger.info('[{}] All predictions done'.format( chatbot.args.modelTag)) elif chatbot.args.test == Chatbot.TestMode.DAEMON: self._logger.info( '[{}] Daemon mode, running in background...'.format( chatbot.args.modelTag)) else: raise RuntimeError('[{}] Unknown test mode: {}'.format( chatbot.args.modelTag, chatbot.args.test)) # Should never happen else: chatbot.mainTrain(chatbot.sess) if chatbot.args.test != Chatbot.TestMode.DAEMON: chatbot.sess.close() self._logger.info( "[{}] The End! Thanks for using this program".format( chatbot.args.modelTag))
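# The wrapper above reports through self._logger instead of print(); a minimal
# sketch of a compatible stdlib logger setup (the logger name and the sample
# message are assumptions for illustration, not from the original file):
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
_logger = logging.getLogger('deepqa.daemon')
_logger.info('[%s] Initialize variables...', 'someModelTag')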
# Copyright 2015 Conchylicultor. All Rights Reserved.
class Chatbot: """ Main class which launches the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # Interactive mode: the user can write his own questions DAEMON = 'daemon' # Daemon mode: the chatbot runs in the background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represents the number of iterations for the current model # TensorFlow main session (we keep track of it for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save' + os.sep + 'model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' # Name of the configuration file self.CONFIG_VERSION = '0.5' # Version of the configuration file self.TEST_IN_NAME = 'data' + os.sep + 'test' + os.sep + 'samples.txt' # Path of the test input file self.TEST_OUT_SUFFIX = '_predictions.txt' # Suffix of the test output files self.SENTENCES_PREFIX = ['Q: ', 'A: '] # Prefixes for the dialogue lines # Parse the arguments @staticmethod def parseArgs(args): """ Parse the arguments from the given command line Args: args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed """ parser = argparse.ArgumentParser() # Global options globalArgs = parser.add_argument_group('Global options') globalArgs.add_argument( '--test', nargs='?', choices=[ Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON ], const=Chatbot.TestMode.ALL, default=None, help= 'if present, launch the program to try to answer all sentences from data/test/ with' ' the defined model(s); in interactive mode, the user can write his own sentences,' ' use daemon mode to integrate the chatbot in another program') # Generate the dataset from the corpus globalArgs.add_argument( '--createDataset', action='store_true', help= 'if present, the program will only generate the dataset from the corpus (no training/testing)' ) # Randomly play some samples globalArgs.add_argument( '--playDataset', type=int, nargs='?', const=10, default=None, help= 'if set, the program will randomly play some samples (can be used in conjunction with createDataset if this is the only action you want to perform)' ) # Train the model from scratch globalArgs.add_argument( '--reset', action='store_true', help= 'use this if you want to ignore the previous model present on the model directory (Warning: the model will be destroyed with all the folder content)' ) globalArgs.add_argument( '--verbose', action='store_true', help= 'When testing, will plot the outputs at the same time they are computed' ) # TODO: Look into TensorFlow's debug mode globalArgs.add_argument( '--debug', action='store_true', help= 'run DeepQA with Tensorflow debug mode. Read TF documentation for more details on this.'
) # Keep all the models saved during training globalArgs.add_argument( '--keepAll', action='store_true', help= 'If this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)' ) # TODO: Add an option to delimit the max size # Specify the model name; the model will be loaded from and saved under it globalArgs.add_argument( '--modelTag', type=str, default=None, help='tag to differentiate which model to store/load') # Specify the root directory; models and data will be loaded from there globalArgs.add_argument( '--rootDir', type=str, default=None, help='folder where to look for the models and data') # Swap the questions and answers globalArgs.add_argument( '--watsonMode', action='store_true', help= 'Invert the questions and answers when training (the network tries to guess the question)' ) # Randomly pick the question or the answer and use it as both input and output globalArgs.add_argument( '--autoEncode', action='store_true', help= 'Randomly pick the question or the answer and use it both as input and output' ) # Specify whether to run in CPU or GPU mode globalArgs.add_argument( '--device', type=str, default=None, help= '\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allows choosing on which hardware to run the model' ) globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication') # Dataset options datasetArgs = parser.add_argument_group('Dataset options') # Corpus from which to extract the dataset datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus from which to extract the dataset.') # Dataset tag, to reuse preprocessed corpus data datasetArgs.add_argument( '--datasetTag', type=str, default='', help= 'add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.' ) # The samples are computed from the corpus if it does not exist already. They are saved in \'data/samples/\' # Not implemented, useless ? Ratio of the dataset to use; this is not implemented yet datasetArgs.add_argument( '--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset') # Maximum length (default was 10); this defines the maximum number of RNN steps for input and output; changed here to 100 datasetArgs.add_argument( '--maxLength', type=int, default=100, help= 'maximum length of the sentence (for input and output), defines the maximum number of steps of the RNN' ) # Remove rarely used words (by default, words used only once); 0 keeps all words datasetArgs.add_argument( '--filterVocab', type=int, default=1, help= 'remove rarely used words (by default words used only once). 0 to keep all words.' ) # datasetArgs.add_argument( '--skipLines', action='store_true', help= 'Generate training samples by only using even conversation lines as questions (and odd lines as answer). Useful to train the network on a particular person.' ) # Vocabulary size, default 40000; 0 means unlimited datasetArgs.add_argument( '--vocabularySize', type=int, default=40000, help='Limit the number of words in the vocabulary (0 for unlimited)' ) # Network options (Warning: if modifying something here, also make the change on save/loadParams() ) nnArgs = parser.add_argument_group('Network options', 'architecture related option') # Size of the hidden layers: 512 nnArgs.add_argument('--hiddenSize', type=int, default=512, help='number of hidden units in each RNN cell') # Number of network layers, default 2 nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers') # Sampled softmax nnArgs.add_argument( '--softmaxSamples', type=int, default=0, help= 'Number of samples in the sampled softmax loss function. 
A value of 0 deactivates sampled softmax' ) # Initialize the embeddings with pre-trained word vectors nnArgs.add_argument( '--initEmbeddings', action='store_true', help= 'if present, the program will initialize the embeddings with pre-trained word2vec vectors' ) # Size of the word embeddings nnArgs.add_argument('--embeddingSize', type=int, default=64, help='embedding size of the word representation') # Uses Google's word vectors by default nnArgs.add_argument( '--embeddingSource', type=str, default="GoogleNews-vectors-negative300.bin", help='embedding file to use for the word representation') # Training options trainingArgs = parser.add_argument_group('Training options') # Training parameters: number of epochs (default 30); a checkpoint is created every 2000 steps trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run') trainingArgs.add_argument( '--saveEvery', type=int, default=2000, help='nb of mini-batch steps before creating a model checkpoint') # Batch size, 256 trainingArgs.add_argument('--batchSize', type=int, default=256, help='mini-batch size') # Learning rate, default 0.002 trainingArgs.add_argument('--learningRate', type=float, default=0.002, help='Learning rate') # Dropout (keep probability), default 0.9 trainingArgs.add_argument('--dropout', type=float, default=0.9, help='Dropout rate (keep probabilities)') return parser.parse_args(args) def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to chat v0.1 !') print() print('TensorFlow detected: v{}'.format( tf.__version__)) # Developed with TensorFlow 1.10.0; the test server runs 1.13.1 # General initialisation; parse the arguments (e.g. --modelTag pretrainedv2 --test interactive) self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams() # Update the self.modelDir and self.globStep, for now, not used when loading Model (but needs to be called before _getSummaryName) # Load the data object self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model is trained for a specific dataset (because of the maxLength which defines the # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, everything which comes after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it work globally for all files ?) # Running session self.sess = tf.Session( config=tf.ConfigProto( allow_soft_placement=True, # If the specified device does not exist, allow TF to pick a device automatically log_device_placement=False ) # Too verbose ? Whether to print the device placement log ) # TODO: Replace all sess by self.sess (not necessarily a good idea) ? 
if self.args.debug: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Reload the model if it exists; in testing mode, the models are not loaded here (but in predictTestset) if self.args.test != Chatbot.TestMode.ALL: self.managePreviousModel(self.sess) # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: self.loadEmbedding(self.sess) if self.args.test: # Real-time chat mode if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.ALL: print('Start predicting...') self.predictTestset(self.sess) print('All predictions done') elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format( self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End! Thanks for using this program") def mainTrain(self, sess): """ Training loop Args: sess: The current running session """ # Specific training dependent loading self.textData.makeLighter(self.args.ratioDataset) # Limit the size of the training set mergedSummaries = tf.summary.merge_all() # Define the summary operator (Warning: Won't appear on the tensorboard graph) if self.globStep == 0: # Not restoring from previous run self.writer.add_graph(sess.graph) # First time only # If restoring a model, restore the progress bar ? and current batch ? print('Start training (press Ctrl+C to save and exit)...') try: # If the user exits while training, we still try to save the model for e in range(self.args.numEpochs): print() print("----- Epoch {}/{} ; (lr={}) -----".format( e + 1, self.args.numEpochs, self.args.learningRate)) batches = self.textData.getBatches() # TODO: Also update learning parameters eventually tic = datetime.datetime.now() for nextBatch in tqdm(batches, desc="Training"): # Training pass ops, feedDict = self.model.step(nextBatch) assert len(ops) == 2 # training, loss _, loss, summary = sess.run(ops + (mergedSummaries, ), feedDict) self.writer.add_summary(summary, self.globStep) self.globStep += 1 # Output training status every 100 steps if self.globStep % 100 == 0: perplexity = math.exp( float(loss)) if loss < 300 else float("inf") tqdm.write( "----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity)) # Checkpoint if self.globStep % self.args.saveEvery == 0: self._saveSession(sess) toc = datetime.datetime.now() print( "Epoch finished in {}".format(toc - tic) ) # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't very pretty except (KeyboardInterrupt, SystemExit): # If the user presses Ctrl+C while training is in progress print('Interruption detected, exiting the program...') self._saveSession(sess) # Final save before the complete exit def predictTestset(self, sess): """ Try predicting the sentences from the samples.txt file. The sentences are saved in the modelDir under the same name Args: sess: The current running session """ # Loading the file to predict; load the given file and inspect the predictions with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f: lines = f.readlines() modelList = self._getModelList() # Get the list of model files if not modelList: print( 'Warning: No model found in \'{}\'. 
Please train a model before trying to predict' .format(self.modelDir)) return # Predicting for each model present in modelDir for modelName in sorted(modelList): # TODO: Natural sorting print('Restoring previous model from {}'.format(modelName)) # Load the model self.saver.restore(sess, modelName) print('Testing...') # Strip the model extension and append the prediction suffix saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX with open(saveName, 'w') as f: nbIgnored = 0 for line in tqdm(lines, desc='Sentences'): question = line[:-1] # TODO: Strips the last character (the newline); somewhat fragile answer = self.singlePredict(question) if not answer: nbIgnored += 1 continue # Back to the beginning, try again predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format( question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX) if self.args.verbose: tqdm.write(predString) f.write(predString) print( 'Prediction finished, {}/{} sentences ignored (too long)'. format(nbIgnored, len(lines))) def mainTestInteractive(self, sess): """ Try predicting the sentences that the user will enter in the console Args: sess: The current running session """ # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also) # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode) # TODO: Log the questions asked for later re-use (merge with test/samples.txt) print('Testing: Launch interactive mode:') print('') print( 'Welcome to the interactive mode, here you can ask Deep Q&A any sentence you want. Don\'t have high ' 'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.' ) while True: question = input(self.SENTENCES_PREFIX[0]) if question == '' or question == 'exit': break questionSeq = [ ] # Will contain the question as seen by the encoder answer = self.singlePredict(question, questionSeq) if not answer: print( 'Warning: sentence too long, sorry. Maybe try a simpler sentence.' ) continue # Back to the beginning, try again print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True))) if self.args.verbose: print( self.textData.batchSeq2str(questionSeq, clean=True, reverse=True)) print(self.textData.sequence2str(answer)) print() def singlePredict(self, question, questionSeq=None): """ Predict the sentence Args: question (str): the raw input sentence questionSeq (List<int>): output argument. If given will contain the input batch sequence Return: list <int>: the word ids corresponding to the answer """ # Create the input batch batch = self.textData.sentence2enco(question) if not batch: return None if questionSeq is not None: # If the caller wants to have the real input questionSeq.extend(batch.encoderSeqs) # Run the model ops, feedDict = self.model.step(batch) output = self.sess.run( ops[0], feedDict) # TODO: Summarize the output too (histogram, ...) 
# Convert the raw output into an actual sentence answer = self.textData.deco2sentence(output) return answer # Predict a single sentence def daemonPredict(self, sentence): """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning) Args: sentence (str): the raw input sentence Return: str: the human readable sentence """ return self.textData.sequence2str(self.singlePredict(sentence), clean=True) def daemonClose(self): """ A utility function to close the daemon when finished """ print('Exiting the daemon mode...') self.sess.close() print('Daemon closed.') # Word embeddings, initialized from Google's pre-trained GoogleNews vectors def loadEmbedding(self, sess): """ Initialize embeddings with pre-trained word2vec vectors Will modify the embedding weights of the currently loaded model Uses the GoogleNews pre-trained values (path hardcoded) """ # Fetch embedding variables from model with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True): em_out = tf.get_variable("embedding") # Disable training for embeddings variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) variables.remove(em_in) variables.remove(em_out) # If restoring a model, we can leave here if self.globStep != 0: return # New model, we load the pre-trained word2vec data and initialize embeddings embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource) embeddings_format = os.path.splitext(embeddings_path)[1][1:] print("Loading pre-trained word embeddings from %s " % embeddings_path) with open(embeddings_path, "rb") as f: header = f.readline() vocab_size, vector_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * vector_size initW = np.random.uniform( -0.25, 0.25, (len(self.textData.word2id), vector_size)) for line in tqdm(range(vocab_size)): word = [] while True: ch = f.read(1) if ch == b' ': word = b''.join(word).decode('utf-8') break if ch != b'\n': word.append(ch) if word in self.textData.word2id: if embeddings_format == 'bin': vector = np.fromstring(f.read(binary_len), dtype='float32') elif embeddings_format == 'vec': vector = np.fromstring(f.readline(), sep=' ', dtype='float32') else: raise Exception("Unknown format for embeddings: %s " % embeddings_format) initW[self.textData.word2id[word]] = vector else: if embeddings_format == 'bin': f.read(binary_len) elif embeddings_format == 'vec': f.readline() else: raise Exception("Unknown format for embeddings: %s " % embeddings_format) # PCA Decomposition to reduce word2vec dimensionality if self.args.embeddingSize < vector_size: U, s, Vt = np.linalg.svd(initW, full_matrices=False) S = np.zeros((vector_size, vector_size), dtype=complex) S[:vector_size, :vector_size] = np.diag(s) initW = np.dot( U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize]) # Initialize input and output embeddings sess.run(em_in.assign(initW)) sess.run(em_out.assign(initW)) # Manage previous models def managePreviousModel(self, sess): """ Restore or reset the model, depending on the parameters If the destination directory already contains some files, it will handle the conflict as follows: * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training restarts from scratch (globStep & cie reinitialized) * Otherwise, it will depend on the directory content. 
If the directory contains: * No model files (only summary logs): works as a reset (restart from scratch) * Other model files, but modelName not found (probably the keepAll option changed): raise error, the user should decide by himself what to do * The right model file (possibly some others): no problem, simply resume the training In any case, the directory will exist as it has been created by the summary writer Args: sess: The current running session """ print('WARNING: ', end='') modelName = self._getModelName() if os.listdir(self.modelDir): # Train from scratch if self.args.reset: print('Reset: Destroying previous model at {}'.format( self.modelDir)) # Analysing directory content elif os.path.exists(modelName): # Restore the model print('Restoring previous model from {}'.format(modelName)) self.saver.restore( sess, modelName ) # Will crash when --reset is not activated and the model has not been saved yet elif self._getModelList(): print('Conflict with previous models.') print(self.modelDir) raise RuntimeError( 'Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)' .format(self.modelDir)) else: # No other model to conflict with (probably summary files) print( 'No previous model found, but some files found at {}. Cleaning...' .format(self.modelDir)) # Warning: No confirmation asked self.args.reset = True if self.args.reset: fileList = [ os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) ] for f in fileList: print('Removing {}'.format(f)) os.remove(f) else: print('No previous model found, starting from clean directory: {}'. format(self.modelDir)) def _saveSession(self, sess): """ Save the model parameters and the variables Args: sess: the current session """ tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...') self.saveModelParams() model_name = self._getModelName() with open( model_name, 'w' ) as f: # HACK: Simulate the old model existence to avoid rewriting the file parser f.write( 'This file is used internally by DeepQA to check the model existence. Please do not remove.\n' ) self.saver.save( sess, model_name) # TODO: Put a limit size (ex: 3GB for the modelDir) tqdm.write('Model saved.') # Modified to save in binary format def _getModelList(self): """ Return the list of the model files inside the model directory """ return [ os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT) ] def loadModelParams(self): """ Load some values associated with the current model, like the current globStep value For now, this function does not need to be called before loading the model (no parameters restored). 
However, the modelDir name will be initialized here so it is required to call this function before managePreviousModel(), _getModelName() or _getSummaryName() Warning: if you modify this function, make sure the changes mirror saveModelParams, also check if the parameters should be reset in managePreviousModel """ # Compute the current model path self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE) if self.args.modelTag: self.modelDir += '-' + self.args.modelTag # If there is a previous model, restore some parameters configName = os.path.join(self.modelDir, self.CONFIG_FILENAME) print('configName', configName) if not self.args.reset and not self.args.createDataset and os.path.exists( configName): # Loading config = configparser.ConfigParser() config.read(configName) # Check the version currentVersion = config['General'].get('version') if currentVersion != self.CONFIG_VERSION: raise UserWarning( 'Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\'' .format(currentVersion, self.CONFIG_VERSION, configName)) # Restoring the parameters self.globStep = config['General'].getint('globStep') self.args.watsonMode = config['General'].getboolean('watsonMode') self.args.autoEncode = config['General'].getboolean('autoEncode') self.args.corpus = config['General'].get('corpus') self.args.datasetTag = config['Dataset'].get('datasetTag') # We need to restore the model length because of the textData associated # and the vocabulary size (TODO: Compatibility mode between different maxLength) self.args.maxLength = config['Dataset'].getint('maxLength') self.args.filterVocab = config['Dataset'].getint('filterVocab') self.args.skipLines = config['Dataset'].getboolean('skipLines') self.args.vocabularySize = config['Dataset'].getint( 'vocabularySize') self.args.hiddenSize = config['Network'].getint('hiddenSize') self.args.numLayers = config['Network'].getint('numLayers') self.args.softmaxSamples = config['Network'].getint( 'softmaxSamples') self.args.initEmbeddings = config['Network'].getboolean( 'initEmbeddings') self.args.embeddingSize = config['Network'].getint('embeddingSize') self.args.embeddingSource = config['Network'].get( 'embeddingSource') # No restoring for training params, batch size or other non model dependent parameters # Show the restored params print() print('Warning: Restoring parameters:') print('globStep: {}'.format(self.globStep)) print('watsonMode: {}'.format(self.args.watsonMode)) print('autoEncode: {}'.format(self.args.autoEncode)) print('corpus: {}'.format(self.args.corpus)) print('datasetTag: {}'.format(self.args.datasetTag)) print('maxLength: {}'.format(self.args.maxLength)) print('filterVocab: {}'.format(self.args.filterVocab)) print('skipLines: {}'.format(self.args.skipLines)) print('vocabularySize: {}'.format(self.args.vocabularySize)) print('hiddenSize: {}'.format(self.args.hiddenSize)) print('numLayers: {}'.format(self.args.numLayers)) print('softmaxSamples: {}'.format(self.args.softmaxSamples)) print('initEmbeddings: {}'.format(self.args.initEmbeddings)) print('embeddingSize: {}'.format(self.args.embeddingSize)) print('embeddingSource: {}'.format(self.args.embeddingSource)) print() # For now, not arbitrary independent maxLength between encoder and decoder self.args.maxLengthEnco = self.args.maxLength self.args.maxLengthDeco = self.args.maxLength + 2 if self.args.watsonMode: self.SENTENCES_PREFIX.reverse() def saveModelParams(self): """ Save the params of the model, like the current globStep value Warning: if you modify this 
function, make sure the changes mirror loadModelParams """ config = configparser.ConfigParser() config['General'] = {} config['General']['version'] = self.CONFIG_VERSION config['General']['globStep'] = str(self.globStep) config['General']['watsonMode'] = str(self.args.watsonMode) config['General']['autoEncode'] = str(self.args.autoEncode) config['General']['corpus'] = str(self.args.corpus) config['Dataset'] = {} config['Dataset']['datasetTag'] = str(self.args.datasetTag) config['Dataset']['maxLength'] = str(self.args.maxLength) config['Dataset']['filterVocab'] = str(self.args.filterVocab) config['Dataset']['skipLines'] = str(self.args.skipLines) config['Dataset']['vocabularySize'] = str(self.args.vocabularySize) config['Network'] = {} config['Network']['hiddenSize'] = str(self.args.hiddenSize) config['Network']['numLayers'] = str(self.args.numLayers) config['Network']['softmaxSamples'] = str(self.args.softmaxSamples) config['Network']['initEmbeddings'] = str(self.args.initEmbeddings) config['Network']['embeddingSize'] = str(self.args.embeddingSize) config['Network']['embeddingSource'] = str(self.args.embeddingSource) # Keep track of the learning params (but without restoring them) config['Training (won\'t be restored)'] = {} config['Training (won\'t be restored)']['learningRate'] = str( self.args.learningRate) config['Training (won\'t be restored)']['batchSize'] = str( self.args.batchSize) config['Training (won\'t be restored)']['dropout'] = str( self.args.dropout) with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile: config.write(configFile) def _getSummaryName(self): """ Parse the argument to decide where to save the summary, in the same place as the model The folder could already contain logs if we restore the training, those will be merged Return: str: The path and name of the summary """ return self.modelDir def _getModelName(self): """ Parse the argument to decide where to save/load the model This function is called at each checkpoint and the first time the model is loaded. If keepAll option is set, the globStep value will be included in the name. Return: str: The path and name where the model needs to be saved """ modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE) if self.args.keepAll: # We do not erase the previously saved model by including the current step on the name modelName += '-' + str(self.globStep) return modelName + self.MODEL_EXT def getDevice(self): """ Parse the argument to decide on which device to run the model Return: str: The name of the device on which to run the program """ if self.args.device == 'cpu': return '/cpu:0' elif self.args.device == 'gpu': return '/gpu:0' elif self.args.device is None: # No specified device (default) return None else: print( 'Warning: Error in the device name: {}, using the default device' .format(self.args.device)) return None
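# saveModelParams()/loadModelParams() above persist settings through the
# stdlib configparser; a tiny self-contained round-trip sketch using the same
# section layout (the values below are placeholders, not real model params):
import configparser

config = configparser.ConfigParser()
config['General'] = {'version': '0.5', 'globStep': '2000'}
config['Network'] = {'hiddenSize': '512', 'numLayers': '2'}
with open('params.ini', 'w') as configFile:
    config.write(configFile)  # same call the class uses at each checkpoint

restored = configparser.ConfigParser()
restored.read('params.ini')
assert restored['General'].getint('globStep') == 2000  # typed getters, as in loadModelParams()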
class Chatbot: """ Main class which launches the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # The user can write his own questions DAEMON = 'daemon' # The chatbot runs in the background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represents the number of iterations for the current model # TensorFlow main session (we keep track of it for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save' + os.sep + 'model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' self.CONFIG_VERSION = '0.5' self.TEST_IN_NAME = 'data' + os.sep + 'test' + os.sep + 'samples.txt' self.TEST_OUT_SUFFIX = '_predictions.txt' self.SENTENCES_PREFIX = ['Q: ', 'A: '] @staticmethod def parseArgs(args): """ Parse the arguments from the given command line Args: args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed """ parser = argparse.ArgumentParser() # Global options globalArgs = parser.add_argument_group('Global options') globalArgs.add_argument('--test', nargs='?', choices=[Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON], const=Chatbot.TestMode.ALL, default=None, help='if present, launch the program to try to answer all sentences from data/test/ with' ' the defined model(s); in interactive mode, the user can write his own sentences,' ' use daemon mode to integrate the chatbot in another program') globalArgs.add_argument('--createDataset', action='store_true', help='if present, the program will only generate the dataset from the corpus (no training/testing)') globalArgs.add_argument('--playDataset', type=int, nargs='?', const=10, default=None, help='if set, the program will randomly play some samples (can be used in conjunction with createDataset if this is the only action you want to perform)') globalArgs.add_argument('--reset', action='store_true', help='use this if you want to ignore the previous model present on the model directory (Warning: the model will be destroyed with all the folder content)') globalArgs.add_argument('--verbose', action='store_true', help='When testing, will plot the outputs at the same time they are computed') globalArgs.add_argument('--debug', action='store_true', help='run DeepQA with Tensorflow debug mode. 
Read TF documentation for more details on this.') globalArgs.add_argument('--keepAll', action='store_true', help='If this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)') # TODO: Add an option to delimit the max size globalArgs.add_argument('--modelTag', type=str, default=None, help='tag to differentiate which model to store/load') globalArgs.add_argument('--rootDir', type=str, default=None, help='folder where to look for the models and data') globalArgs.add_argument('--watsonMode', action='store_true', help='Invert the questions and answers when training (the network tries to guess the question)') globalArgs.add_argument('--autoEncode', action='store_true', help='Randomly pick the question or the answer and use it both as input and output') globalArgs.add_argument('--device', type=str, default=None, help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allows choosing on which hardware to run the model') globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication') # Dataset options datasetArgs = parser.add_argument_group('Dataset options') datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus from which to extract the dataset.') datasetArgs.add_argument('--datasetTag', type=str, default='', help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.') # The samples are computed from the corpus if it does not exist already. They are saved in \'data/samples/\' datasetArgs.add_argument('--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset') # Not implemented, useless ? datasetArgs.add_argument('--maxLength', type=int, default=10, help='maximum length of the sentence (for input and output), defines the maximum number of steps of the RNN') datasetArgs.add_argument('--filterVocab', type=int, default=1, help='remove rarely used words (by default words used only once). 0 to keep all words.') datasetArgs.add_argument('--skipLines', action='store_true', help='Generate training samples by only using even conversation lines as questions (and odd lines as answer). Useful to train the network on a particular person.') datasetArgs.add_argument('--vocabularySize', type=int, default=40000, help='Limit the number of words in the vocabulary (0 for unlimited)') # Network options (Warning: if modifying something here, also make the change on save/loadParams() ) nnArgs = parser.add_argument_group('Network options', 'architecture related option') nnArgs.add_argument('--hiddenSize', type=int, default=512, help='number of hidden units in each RNN cell') nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers') nnArgs.add_argument('--softmaxSamples', type=int, default=0, help='Number of samples in the sampled softmax loss function. 
A value of 0 deactivates sampled softmax') nnArgs.add_argument('--initEmbeddings', action='store_true', help='if present, the program will initialize the embeddings with pre-trained word2vec vectors') nnArgs.add_argument('--embeddingSize', type=int, default=64, help='embedding size of the word representation') nnArgs.add_argument('--embeddingSource', type=str, default="GoogleNews-vectors-negative300.bin", help='embedding file to use for the word representation') # Training options trainingArgs = parser.add_argument_group('Training options') trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run') trainingArgs.add_argument('--saveEvery', type=int, default=2000, help='nb of mini-batch steps before creating a model checkpoint') trainingArgs.add_argument('--batchSize', type=int, default=256, help='mini-batch size') trainingArgs.add_argument('--learningRate', type=float, default=0.002, help='Learning rate') trainingArgs.add_argument('--dropout', type=float, default=0.9, help='Dropout rate (keep probabilities)') return parser.parse_args(args) def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to DeepQA v0.1 !') print() print('TensorFlow detected: v{}'.format(tf.__version__)) # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams() # Update the self.modelDir and self.globStep, for now, not used when loading Model (but needs to be called before _getSummaryName) self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model is trained for a specific dataset (because of the maxLength which defines the # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, everything which comes after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it work globally for all files ?) # Running session self.sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, # Allows a backup device for non-GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) # TODO: Replace all sess by self.sess (not necessarily a good idea) ? 
    def main(self, args=None):
        """ Launch the training and/or the interactive mode
        """
        print('Welcome to DeepQA v0.1 !')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # General initialisation
        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()  # Use the current working directory

        #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

        self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the model for now (but needs to be called before _getSummaryName)

        self.textData = TextData(self.args)
        # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
        # each word of the vocabulary / decoder input
        # TODO: For now, the model is trained for a specific dataset (because of maxLength, which defines the
        # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
        # (remap the word2id/id2word variables).
        if self.args.createDataset:
            print('Dataset created! Thanks for using this program')
            return  # No need to go further

        # Prepare the model
        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        # Saver/summaries
        self.writer = tf.summary.FileWriter(self._getSummaryName())
        self.saver = tf.train.Saver(max_to_keep=200)

        # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the
        # dataset, otherwise everything which comes after the shuffling won't be replicable when
        # reloading the dataset). How to restore the seed after loading?
        # Also fix the seed for random.shuffle (does it work globally for all files?)

        # Running session
        self.sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,  # Allows backup device for non GPU-available operations (when forcing GPU)
            log_device_placement=False)  # Too verbose?
        )  # TODO: Replace all sess by self.sess (not necessarily a good idea)?

        if self.args.debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

        print('Initialize variables...')
        self.sess.run(tf.global_variables_initializer())

        # Reload the model eventually (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
        if self.args.test != Chatbot.TestMode.ALL:
            self.managePreviousModel(self.sess)

        # Initialize embeddings with pre-trained word2vec vectors
        if self.args.initEmbeddings:
            self.loadEmbedding(self.sess)

        if self.args.test:
            if self.args.test == Chatbot.TestMode.INTERACTIVE:
                self.mainTestInteractive(self.sess)
            elif self.args.test == Chatbot.TestMode.ALL:
                print('Start predicting...')
                self.predictTestset(self.sess)
                print('All predictions done')
            elif self.args.test == Chatbot.TestMode.DAEMON:
                print('Daemon mode, running in background...')
            else:
                raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
        else:
            self.mainTrain(self.sess)

        if self.args.test != Chatbot.TestMode.DAEMON:
            self.sess.close()
            print("The End! Thanks for using this program")

    def mainTrain(self, sess):
        """ Training loop
        Args:
            sess: The current running session
        """
        # Specific training dependent loading
        self.textData.makeLighter(self.args.ratioDataset)  # Limit the number of training samples

        mergedSummaries = tf.summary.merge_all()  # Define the summary operator (Warning: won't appear on the tensorboard graph)
        if self.globStep == 0:  # Not restoring from a previous run
            self.writer.add_graph(sess.graph)  # First time only

        # If restoring a model, restore the progression bar? and current batch?

        print('Start training (press Ctrl+C to save and exit)...')

        try:  # If the user exits while training, we still try to save the model
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(e + 1, self.args.numEpochs, self.args.learningRate))

                batches = self.textData.getBatches()

                # TODO: Also update learning parameters eventually

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2  # training, loss
                    _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    # Output training status
                    if self.globStep % 100 == 0:
                        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                        tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity))

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()
                print("Epoch finished in {}".format(toc - tic))  # Warning: overflows if an epoch takes more than 24 hours, and the output isn't really nicer
        except (KeyboardInterrupt, SystemExit):  # If the user presses Ctrl+C while training is in progress
            print('Interruption detected, exiting the program...')

        self._saveSession(sess)  # Ultimate saving before complete exit
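    # NOTE (illustrative sketch, not from the original DeepQA code): the
    # `loss < 300` guard in mainTrain() exists because math.exp overflows
    # float64 slightly above exp(709):
    #
    #     >>> import math
    #     >>> math.exp(5.0)    # a typical cross-entropy loss -> perplexity ~148
    #     148.4131591025766
    #     >>> math.exp(800.0)  # OverflowError: math range error
    #
    # so any loss above the (conservative) threshold is simply reported as
    # infinite perplexity instead of crashing the status line.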
    def predictTestset(self, sess):
        """ Try predicting the sentences from the samples.txt file.
        The sentences are saved on the modelDir under the same name
        Args:
            sess: The current running session
        """
        # Loading the file to predict
        with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
            lines = f.readlines()

        modelList = self._getModelList()
        if not modelList:
            print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir))
            return

        # Predicting for each model present in modelDir
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX  # We remove the model extension and add the prediction suffix
            with open(saveName, 'w') as f:
                nbIgnored = 0
                for line in tqdm(lines, desc='Sentences'):
                    question = line[:-1]  # Remove the end-of-line character

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)
                print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

    def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
        # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
        # TODO: Log the questions asked for later reuse (merge with test/samples.txt)

        print('Testing: Launch interactive mode:')
        print('')
        print('Welcome to the interactive mode, here you can ask Deep Q&A any sentence you want. Don\'t have high '
              'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')

        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            questionSeq = []  # Will contain the question as seen by the encoder
            answer = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue  # Back to the beginning, try again

            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

            if self.args.verbose:
                print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                print(self.textData.sequence2str(answer))

            print()
    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (List<int>): output argument. If given will contain the input batch sequence
        Return:
            list <int>: the word ids corresponding to the answer
        """
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:  # If the caller wants to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)
        answer = self.textData.deco2sentence(output)

        return answer

    def daemonPredict(self, sentence):
        """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning)
        Args:
            sentence (str): the raw input sentence
        Return:
            str: the human readable sentence
        """
        return self.textData.sequence2str(
            self.singlePredict(sentence),
            clean=True
        )

    def daemonClose(self):
        """ A utility function to close the daemon when finished
        """
        print('Exiting the daemon mode...')
        self.sess.close()
        print('Daemon closed.')

    def loadEmbedding(self, sess):
        """ Initialize embeddings with pre-trained word2vec vectors
        Will modify the embedding weights of the current loaded model
        Uses the GoogleNews pre-trained values (path hardcoded)
        """
        # Fetch embedding variables from the model
        with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True):
            em_in = tf.get_variable("embedding")
        with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True):
            em_out = tf.get_variable("embedding")

        # Disable training for the embeddings
        variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
        variables.remove(em_in)
        variables.remove(em_out)

        # If restoring a model, we can leave here
        if self.globStep != 0:
            return

        # New model: load the pre-trained word2vec data and initialize the embeddings
        embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource)
        embeddings_format = os.path.splitext(embeddings_path)[1][1:]
        print("Loading pre-trained word embeddings from %s " % embeddings_path)
        with open(embeddings_path, "rb") as f:
            header = f.readline()
            vocab_size, vector_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * vector_size
            initW = np.random.uniform(-0.25, 0.25, (len(self.textData.word2id), vector_size))
            for line in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = b''.join(word).decode('utf-8')
                        break
                    if ch != b'\n':
                        word.append(ch)
                if word in self.textData.word2id:
                    if embeddings_format == 'bin':
                        vector = np.fromstring(f.read(binary_len), dtype='float32')
                    elif embeddings_format == 'vec':
                        vector = np.fromstring(f.readline(), sep=' ', dtype='float32')
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)
                    initW[self.textData.word2id[word]] = vector
                else:
                    if embeddings_format == 'bin':
                        f.read(binary_len)
                    elif embeddings_format == 'vec':
                        f.readline()
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)

        # PCA decomposition to reduce word2vec dimensionality
        if self.args.embeddingSize < vector_size:
            U, s, Vt = np.linalg.svd(initW, full_matrices=False)
            S = np.zeros((vector_size, vector_size), dtype='float32')  # Bug fix: was dtype=complex, which made initW complex and would break the assign() calls below
            S[:vector_size, :vector_size] = np.diag(s)
            initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize])

        # Initialize input and output embeddings
        sess.run(em_in.assign(initW))
        sess.run(em_out.assign(initW))
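    # NOTE (illustrative sketch, hypothetical values, not from the original
    # DeepQA code): the SVD block above projects the word2vec vectors down to
    # args.embeddingSize dimensions by keeping the top singular components.
    # On a toy matrix the same reduction looks like:
    #
    #     import numpy as np
    #     W = np.random.uniform(-0.25, 0.25, (1000, 300))   # vocab x vector_size
    #     U, s, Vt = np.linalg.svd(W, full_matrices=False)  # U: (1000, 300)
    #     k = 64                                            # target embeddingSize
    #     W_reduced = U[:, :k] * s[:k]                      # (1000, 64), same as U[:, :k] @ np.diag(s[:k])
    #
    # i.e. the diagonal matrix S in the code is just a way of rescaling each
    # kept column of U by its singular value.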
    def managePreviousModel(self, sess):
        """ Restore or reset the model, depending on the parameters
        If the destination directory already contains some files, it will handle the conflict as follows:
         * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training
           restarts from scratch (globStep & co. reinitialized)
         * Otherwise, it depends on the directory content. If the directory contains:
           * No model files (only summary logs): works as a reset (restart from scratch)
           * Other model files, but modelName not found (surely the keepAll option changed): raise an error, the user
             should decide by himself what to do
           * The right model file (eventually some others): no problem, simply resume the training
        In any case, the directory will exist as it has been created by the summary writer
        Args:
            sess: The current running session
        """
        print('WARNING: ', end='')

        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            if self.args.reset:
                print('Reset: Destroying previous model at {}'.format(self.modelDir))
            # Analysing directory content
            elif os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
            elif self._getModelList():
                print('Conflict with previous models.')
                raise RuntimeError('Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)'.format(self.modelDir))
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args.reset = True

            if self.args.reset:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from a clean directory: {}'.format(self.modelDir))

    def _saveSession(self, sess):
        """ Save the model parameters and the variables
        Args:
            sess: the current session
        """
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        model_name = self._getModelName()
        with open(model_name, 'w') as f:  # HACK: Simulate the old model existence to avoid rewriting the file parser
            f.write('This file is used internally by DeepQA to check the model existence. Please do not remove.\n')
        self.saver.save(sess, model_name)  # TODO: Put a size limit (ex: 3GB for the modelDir)
        tqdm.write('Model saved.')

    def _getModelList(self):
        """ Return the list of the model files inside the model directory
        """
        return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)]
    def loadModelParams(self):
        """ Load some of the values associated with the current model, like the current globStep value
        For now, this function does not need to be called before loading the model (no parameters restored). However,
        the modelDir name will be initialized here, so it is required to call this function before managePreviousModel(),
        _getModelName() or _getSummaryName()
        Warning: if you modify this function, make sure the changes mirror saveModelParams; also check if the parameters
        should be reset in managePreviousModel
        """
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if not self.args.reset and not self.args.createDataset and os.path.exists(configName):
            # Loading
            config = configparser.ConfigParser()
            config.read(configName)

            # Check the version
            currentVersion = config['General'].get('version')
            if currentVersion != self.CONFIG_VERSION:
                raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

            # Restoring the parameters
            self.globStep = config['General'].getint('globStep')
            self.args.watsonMode = config['General'].getboolean('watsonMode')
            self.args.autoEncode = config['General'].getboolean('autoEncode')
            self.args.corpus = config['General'].get('corpus')

            self.args.datasetTag = config['Dataset'].get('datasetTag')
            self.args.maxLength = config['Dataset'].getint('maxLength')  # We need to restore the model length because of the associated textData and the vocabulary size (TODO: Compatibility mode between different maxLength)
            self.args.filterVocab = config['Dataset'].getint('filterVocab')
            self.args.skipLines = config['Dataset'].getboolean('skipLines')
            self.args.vocabularySize = config['Dataset'].getint('vocabularySize')

            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.softmaxSamples = config['Network'].getint('softmaxSamples')
            self.args.initEmbeddings = config['Network'].getboolean('initEmbeddings')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.embeddingSource = config['Network'].get('embeddingSource')

            # No restoring for training params, batch size or other non model dependent parameters

            # Show the restored params
            print()
            print('Warning: Restoring parameters:')
            print('globStep: {}'.format(self.globStep))
            print('watsonMode: {}'.format(self.args.watsonMode))
            print('autoEncode: {}'.format(self.args.autoEncode))
            print('corpus: {}'.format(self.args.corpus))
            print('datasetTag: {}'.format(self.args.datasetTag))
            print('maxLength: {}'.format(self.args.maxLength))
            print('filterVocab: {}'.format(self.args.filterVocab))
            print('skipLines: {}'.format(self.args.skipLines))
            print('vocabularySize: {}'.format(self.args.vocabularySize))
            print('hiddenSize: {}'.format(self.args.hiddenSize))
            print('numLayers: {}'.format(self.args.numLayers))
            print('softmaxSamples: {}'.format(self.args.softmaxSamples))
            print('initEmbeddings: {}'.format(self.args.initEmbeddings))
            print('embeddingSize: {}'.format(self.args.embeddingSize))
            print('embeddingSource: {}'.format(self.args.embeddingSource))
            print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

        if self.args.watsonMode:
            self.SENTENCES_PREFIX.reverse()
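    # NOTE (illustrative sketch, hypothetical values, not from the original
    # DeepQA code): loadModelParams() above and saveModelParams() below
    # round-trip a params.ini that looks roughly like:
    #
    #     [General]
    #     version = 0.5
    #     globstep = 12000
    #     watsonmode = False
    #     ...
    #     [Network]
    #     hiddensize = 512
    #     ...
    #     [Training (won't be restored)]
    #     learningrate = 0.002
    #
    # (configparser lower-cases keys when writing; the exact values depend on the run.)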
    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['watsonMode'] = str(self.args.watsonMode)
        config['General']['autoEncode'] = str(self.args.autoEncode)
        config['General']['corpus'] = str(self.args.corpus)

        config['Dataset'] = {}
        config['Dataset']['datasetTag'] = str(self.args.datasetTag)
        config['Dataset']['maxLength'] = str(self.args.maxLength)
        config['Dataset']['filterVocab'] = str(self.args.filterVocab)
        config['Dataset']['skipLines'] = str(self.args.skipLines)
        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['softmaxSamples'] = str(self.args.softmaxSamples)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['embeddingSource'] = str(self.args.embeddingSource)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)
        config['Training (won\'t be restored)']['dropout'] = str(self.args.dropout)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training; those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir

    def _getModelName(self):
        """ Parse the argument to decide where to save/load the model
        This function is called at each checkpoint and the first time the model is loaded. If the keepAll option is set,
        the globStep value will be included in the name.
        Return:
            str: The path and name where the model needs to be saved
        """
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:  # We do not erase the previously saved model by including the current step in the name
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT

    def getDevice(self):
        """ Parse the argument to decide on which device to run the model
        Return:
            str: The name of the device on which to run the program
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:  # No specified device (default)
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
            return None
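# NOTE (hypothetical launcher sketch, not part of the original file): DeepQA
# normally starts the bot from a separate entry-point script, along the lines of:
#
#     from chatbot import Chatbot   # assumes this module is named chatbot.py
#
#     if __name__ == '__main__':
#         bot = Chatbot()
#         bot.main()                # pass ['--test', 'interactive'] to chat in the console
#
# Keeping the launcher out of this module avoids triggering training on import.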
class Chatbot:
    """ Main class which launches the training or testing mode
    """

    class TestMode:
        """ Simple structure representing the different testing modes
        """
        ALL = 'all'
        INTERACTIVE = 'interactive'  # The user can write his own questions
        DAEMON = 'daemon'  # The chatbot runs in the background and can regularly be called to predict something

    def __init__(self):
        """
        """
        # Model/dataset parameters
        self.args = None  # Task specific object

        self.textData = None  # Dataset
        self.model = None  # Sequence to sequence model

        # TensorFlow utilities for convenience saving/logging
        self.writer = None
        self.saver = None
        self.modelDir = ''  # Where the model is saved
        self.globStep = 0  # Represents the number of iterations for the current model

        # TensorFlow main session (we keep track of it for the daemon)
        self.sess = None

        # Filename and directory constants
        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.2'
        self.TEST_IN_NAME = 'data/test/samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    @staticmethod
    def parseArgs(args):
        """ Parse the arguments from the given command line
        Args:
            args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed
        """
        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--test',
                                nargs='?',
                                choices=[Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON],
                                const=Chatbot.TestMode.ALL, default=None,
                                help='if present, the program will try to answer all sentences from data/test/ with'
                                     ' the defined model(s); in interactive mode, the user can write his own sentences;'
                                     ' use daemon mode to integrate the chatbot into another program')
        globalArgs.add_argument('--createDataset', action='store_true', help='if present, the program will only generate the dataset from the corpus (no training/testing)')
        globalArgs.add_argument('--playDataset', type=int, nargs='?', const=10, default=None, help='if set, the program will randomly play some samples (can be used conjointly with createDataset if this is the only action you want to perform)')
        globalArgs.add_argument('--reset', action='store_true', help='use this if you want to ignore the previous model present in the model directory (Warning: the model will be destroyed with all the folder content)')
        globalArgs.add_argument('--verbose', action='store_true', help='when testing, will plot the outputs at the same time they are computed')
        globalArgs.add_argument('--keepAll', action='store_true', help='if this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)')  # TODO: Add an option to delimit the max size
        globalArgs.add_argument('--modelTag', type=str, default=None, help='tag to differentiate which model to store/load')
        globalArgs.add_argument('--rootDir', type=str, default=None, help='folder where to look for the models and data')
        globalArgs.add_argument('--watsonMode', action='store_true', help='invert the questions and answers when training (the network tries to guess the question)')
        globalArgs.add_argument('--device', type=str, default=None, help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allows choosing on which hardware to run the model')
        globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication')
        # Dataset options
        datasetArgs = parser.add_argument_group('Dataset options')
        datasetArgs.add_argument('--corpus', type=str, default='cornell', help='corpus from which to extract the dataset. Only one corpus available right now (Cornell)')
        datasetArgs.add_argument('--datasetTag', type=str, default=None, help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions')  # The samples are computed from the corpus if they do not exist already. They are saved in 'data/samples/'
        datasetArgs.add_argument('--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset')  # Not implemented, useless?
        datasetArgs.add_argument('--maxLength', type=int, default=10, help='maximum length of the sentence (for input and output), defines the maximum number of steps of the RNN')

        # Network options (Warning: if modifying something here, also make the change on save/loadParams())
        nnArgs = parser.add_argument_group('Network options', 'architecture related options')
        nnArgs.add_argument('--hiddenSize', type=int, default=256, help='number of hidden units in each RNN cell')
        nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers')
        nnArgs.add_argument('--embeddingSize', type=int, default=32, help='embedding size of the word representation')

        # Training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run')
        trainingArgs.add_argument('--saveEvery', type=int, default=1000, help='number of mini-batch steps before creating a model checkpoint')
        trainingArgs.add_argument('--batchSize', type=int, default=10, help='mini-batch size')
        trainingArgs.add_argument('--learningRate', type=float, default=0.001, help='learning rate')

        return parser.parse_args(args)
    def main(self, args=None):
        """ Launch the training and/or the interactive mode
        """
        print('Welcome to DeepQA v0.1 !')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        # General initialisation
        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()  # Use the current working directory

        #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

        self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the model for now (but needs to be called before _getSummaryName)

        self.textData = TextData(self.args)
        # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
        # each word of the vocabulary / decoder input
        # TODO: For now, the model is trained for a specific dataset (because of maxLength, which defines the
        # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary
        # (remap the word2id/id2word variables).
        if self.args.createDataset:
            print('Dataset created! Thanks for using this program')
            return  # No need to go further

        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        # Saver/summaries
        self.writer = tf.train.SummaryWriter(self._getSummaryName())  # Pre-1.0 TF API (later renamed tf.summary.FileWriter)
        self.saver = tf.train.Saver(max_to_keep=200)  # Arbitrary limit?

        # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the
        # dataset, otherwise everything which comes after the shuffling won't be replicable when
        # reloading the dataset). How to restore the seed after loading?
        # Also fix the seed for random.shuffle (does it work globally for all files?)

        # Running session
        self.sess = tf.Session()

        # TODO: Replace all sess by self.sess (not necessarily a good idea)?

        print('Initialize variables...')
        self.sess.run(tf.initialize_all_variables())  # Pre-1.0 TF API (later renamed tf.global_variables_initializer)

        # Reload the model eventually (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
        if self.args.test != Chatbot.TestMode.ALL:
            self.managePreviousModel(self.sess)

        if self.args.test:
            # TODO: For testing, add a mode where, instead of taking the most likely output after the <go> token,
            # it takes the second or third, so it generates new sentences for the same input. Difficult to implement;
            # we probably have to modify the TensorFlow source code
            if self.args.test == Chatbot.TestMode.INTERACTIVE:
                self.mainTestInteractive(self.sess)
            elif self.args.test == Chatbot.TestMode.ALL:
                print('Start predicting...')
                self.predictTestset(self.sess)
                print('All predictions done')
            elif self.args.test == Chatbot.TestMode.DAEMON:
                print('Daemon mode, running in background...')
            else:
                raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
        else:
            self.mainTrain(self.sess)

        if self.args.test != Chatbot.TestMode.DAEMON:
            self.sess.close()
            print("The End! Thanks for using this program")

    def mainTrain(self, sess):
        """ Training loop
        Args:
            sess: The current running session
        """
        # Specific training dependent loading
        self.textData.makeLighter(self.args.ratioDataset)  # Limit the number of training samples

        mergedSummaries = tf.merge_all_summaries()  # Define the summary operator (Warning: won't appear on the tensorboard graph)
        if self.globStep == 0:  # Not restoring from a previous run
            self.writer.add_graph(sess.graph)  # First time only

        # If restoring a model, restore the progression bar? and current batch?

        print('Start training (press Ctrl+C to save and exit)...')

        try:  # If the user exits while training, we still try to save the model
            for e in range(self.args.numEpochs):
                print("--- Epoch {}/{} ; (lr={})".format(e + 1, self.args.numEpochs, self.args.learningRate))
                print()

                batches = self.textData.getBatches()

                # TODO: Also update learning parameters eventually

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2  # training, loss
                    _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()
                print("Epoch finished in {}".format(toc - tic))  # Warning: overflows if an epoch takes more than 24 hours, and the output isn't really nicer
        except (KeyboardInterrupt, SystemExit):  # If the user presses Ctrl+C while training is in progress
            print('Interruption detected, exiting the program...')

        self._saveSession(sess)  # Ultimate saving before complete exit
    def predictTestset(self, sess):
        """ Try predicting the sentences from the samples.txt file.
        The sentences are saved on the modelDir under the same name
        Args:
            sess: The current running session
        """
        # Loading the file to predict
        with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
            lines = f.readlines()

        modelList = self._getModelList()
        if not modelList:
            print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir))
            return

        # Predicting for each model present in modelDir
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX  # We remove the model extension and add the prediction suffix
            with open(saveName, 'w') as f:
                nbIgnored = 0
                for line in tqdm(lines, desc='Sentences'):
                    question = line[:-1]  # Remove the end-of-line character

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)
                print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

    def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
        # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
        print('Testing: Launch interactive mode:')
        print('')
        print('Welcome to the interactive mode, here you can ask Deep Q&A any sentence you want. Don\'t have high '
              'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')
        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            answer = self.singlePredict(question)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue  # Back to the beginning, try again

            # TODO: print(self.textData.batchSeq2str(batch.encoderSeqs, clean=True, reverse=True))
            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))
            print(self.textData.sequence2str(answer))
            print()

    def singlePredict(self, question):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
        Return:
            list <int>: the word ids corresponding to the answer
        """
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)
        answer = self.textData.deco2sentence(output)

        return answer

    def daemonPredict(self, sentence):
        """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning)
        Args:
            sentence (str): the raw input sentence
        Return:
            str: the human readable sentence
        """
        return self.textData.sequence2str(self.singlePredict(sentence), clean=True)

    def daemonClose(self):
        """ A utility function to close the daemon when finished
        """
        print('Exiting the daemon mode...')
        self.sess.close()
        print('Daemon closed.')
    def managePreviousModel(self, sess):
        """ Restore or reset the model, depending on the parameters
        If the destination directory already contains some files, it will handle the conflict as follows:
         * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training
           restarts from scratch (globStep & co. reinitialized)
         * Otherwise, it depends on the directory content. If the directory contains:
           * No model files (only summary logs): works as a reset (restart from scratch)
           * Other model files, but modelName not found (surely the keepAll option changed): raise an error, the user
             should decide by himself what to do
           * The right model file (eventually some others): no problem, simply resume the training
        In any case, the directory will exist as it has been created by the summary writer
        Args:
            sess: The current running session
        """
        print('WARNING: ', end='')

        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            if self.args.reset:
                print('Reset: Destroying previous model at {}'.format(self.modelDir))
            # Analysing directory content
            elif os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
                print('Model restored.')
            elif self._getModelList():
                print('Conflict with previous models.')
                raise RuntimeError('Some models are already present in \'{}\'. You should check them first'.format(self.modelDir))
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args.reset = True

            if self.args.reset:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from a clean directory: {}'.format(self.modelDir))

    def _saveSession(self, sess):
        """ Save the model parameters and the variables
        Args:
            sess: the current session
        """
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        self.saver.save(sess, self._getModelName())  # TODO: Put a size limit (ex: 3GB for the modelDir)
        tqdm.write('Model saved.')

    def _getModelList(self):
        """ Return the list of the model files inside the model directory
        """
        return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)]
    def loadModelParams(self):
        """ Load some of the values associated with the current model, like the current globStep value
        For now, this function does not need to be called before loading the model (no parameters restored). However,
        the modelDir name will be initialized here, so it is required to call this function before managePreviousModel(),
        _getModelName() or _getSummaryName()
        Warning: if you modify this function, make sure the changes mirror saveModelParams; also check if the parameters
        should be reset in managePreviousModel
        """
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if not self.args.reset and not self.args.createDataset and os.path.exists(configName):
            # Loading
            config = configparser.ConfigParser()
            config.read(configName)

            # Check the version
            currentVersion = config['General'].get('version')
            if currentVersion != self.CONFIG_VERSION:
                raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

            # Restoring the parameters
            self.globStep = config['General'].getint('globStep')
            self.args.maxLength = config['General'].getint('maxLength')  # We need to restore the model length because of the associated textData and the vocabulary size (TODO: Compatibility mode between different maxLength)
            self.args.watsonMode = config['General'].getboolean('watsonMode')
            #self.args.datasetTag = config['General'].get('datasetTag')

            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')

            # No restoring for training params, batch size or other non model dependent parameters

            # Show the restored params
            print()
            print('Warning: Restoring parameters:')
            print('globStep: {}'.format(self.globStep))
            print('maxLength: {}'.format(self.args.maxLength))
            print('watsonMode: {}'.format(self.args.watsonMode))
            print('hiddenSize: {}'.format(self.args.hiddenSize))
            print('numLayers: {}'.format(self.args.numLayers))
            print('embeddingSize: {}'.format(self.args.embeddingSize))
            print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

        if self.args.watsonMode:
            self.SENTENCES_PREFIX.reverse()

    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['maxLength'] = str(self.args.maxLength)
        config['General']['watsonMode'] = str(self.args.watsonMode)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training; those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir
    def _getModelName(self):
        """ Parse the argument to decide where to save/load the model
        This function is called at each checkpoint and the first time the model is loaded. If the keepAll option is set,
        the globStep value will be included in the name.
        Return:
            str: The path and name where the model needs to be saved
        """
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:  # We do not erase the previously saved model by including the current step in the name
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT

    def getDevice(self):
        """ Parse the argument to decide on which device to run the model
        Return:
            str: The name of the device on which to run the program
        """
        if self.args.device == 'cpu':
            return '/cpu:0'  # Bug fix: was '"/cpu:0' (stray quote), which is not a valid device name
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:  # No specified device (default)
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
            return None
class Chatbot:
    """ Main class which launches the training or testing mode
    """

    class TestMode:
        """ Simple structure representing the different testing modes
        """
        ALL = 'all'
        INTERACTIVE = 'interactive'  # The user can write his own questions
        DAEMON = 'daemon'  # The chatbot runs in the background and can regularly be called to predict something

    def __init__(self):
        """
        """
        # Model/dataset parameters
        self.args = None  # Task specific object

        self.textData = None  # Dataset
        self.model = None  # Sequence to sequence model

        # TensorFlow utilities for convenience saving/logging
        self.writer = None
        self.saver = None
        self.modelDir = ''  # Where the model is saved
        self.globStep = 0  # Represents the number of iterations for the current model

        # TensorFlow main session (we keep track of it for the daemon)
        self.sess = None

        # Filename and directory constants
        self.MODEL_DIR_BASE = 'save' + os.sep + 'model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.5'
        self.TEST_IN_NAME = 'data' + os.sep + 'test' + os.sep + 'samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    @staticmethod
    def parseArgs(args):
        """ Parse the arguments from the given command line
        Args:
            args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed
        """
        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--test', action='store_true', default=True)  # Note: with store_true and default=True, this flag is always True in this variant
        globalArgs.add_argument('--createDataset', action='store_true', help='if present, the program will only generate the dataset from the corpus (no training/testing)')
        globalArgs.add_argument('--playDataset', type=int, nargs='?', const=10, default=None, help='if set, the program will randomly play some samples (can be used conjointly with createDataset if this is the only action you want to perform)')
        globalArgs.add_argument('--reset', action='store_true', default=False, help='use this if you want to ignore the previous model present in the model directory (Warning: the model will be destroyed with all the folder content)')
        globalArgs.add_argument('--verbose', action='store_true', help='when testing, will plot the outputs at the same time they are computed')
        globalArgs.add_argument('--debug', action='store_true', help='run DeepQA with Tensorflow debug mode. Read TF documentation for more details on this.')
        globalArgs.add_argument('--keepAll', action='store_true', help='if this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)')  # TODO: Add an option to delimit the max size
        globalArgs.add_argument('--modelTag', type=str, default=None, help='tag to differentiate which model to store/load')
        globalArgs.add_argument('--rootDir', type=str, default="", help='folder where to look for the models and data')
        globalArgs.add_argument('--watsonMode', action='store_true', help='invert the questions and answers when training (the network tries to guess the question)')
        globalArgs.add_argument('--autoEncode', action='store_true', help='randomly pick the question or the answer and use it as both input and output')
        globalArgs.add_argument('--device', type=str, default=None, help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allows choosing on which hardware to run the model')
        globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication')

        # Dataset options
        datasetArgs = parser.add_argument_group('Dataset options')
        datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus from which to extract the dataset.')
        datasetArgs.add_argument('--datasetTag', type=str, default='', help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.')  # The samples are computed from the corpus if they do not exist already. They are saved in 'data/samples/'
        datasetArgs.add_argument('--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset')  # Not implemented, useless?
        datasetArgs.add_argument('--maxLength', type=int, default=10, help='maximum length of the sentence (for input and output), defines the maximum number of steps of the RNN')
        datasetArgs.add_argument('--filterVocab', type=int, default=1, help='remove rarely used words (by default, words used only once). 0 to keep all words.')
        datasetArgs.add_argument('--skipLines', action='store_true', help='generate training samples by only using even conversation lines as questions (and odd lines as answers). Useful to train the network on a particular person.')
        datasetArgs.add_argument('--vocabularySize', type=int, default=40000, help='limit the number of words in the vocabulary (0 for unlimited)')

        # Network options (Warning: if modifying something here, also make the change on save/loadParams())
        nnArgs = parser.add_argument_group('Network options', 'architecture related options')
        nnArgs.add_argument('--hiddenSize', type=int, default=512, help='number of hidden units in each RNN cell')
        nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers')
        nnArgs.add_argument('--softmaxSamples', type=int, default=0, help='number of samples in the sampled softmax loss function. A value of 0 deactivates sampled softmax')
        nnArgs.add_argument('--initEmbeddings', action='store_true', help='if present, the program will initialize the embeddings with pre-trained word2vec vectors')
        nnArgs.add_argument('--embeddingSize', type=int, default=64, help='embedding size of the word representation')
        nnArgs.add_argument('--embeddingSource', type=str, default="GoogleNews-vectors-negative300.bin", help='embedding file to use for the word representation')

        # Training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run')
        trainingArgs.add_argument('--saveEvery', type=int, default=2000, help='number of mini-batch steps before creating a model checkpoint')
        trainingArgs.add_argument('--batchSize', type=int, default=256, help='mini-batch size')
        trainingArgs.add_argument('--learningRate', type=float, default=0.002, help='learning rate')
        trainingArgs.add_argument('--dropout', type=float, default=0.9, help='dropout rate (keep probability)')

        return parser.parse_args(args)

    def run(self, args=None):
        """ Launch the training and/or the interactive mode
        """
        print('Welcome to DeepQA v0.1 !')
        print()
        print('TensorFlow detected: v{}'.format(tf.__version__))

        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()  # Use the current working directory

        self.loadModelParams()  # Update self.modelDir and self.globStep; not used when loading the model for now (but needs to be called before _getSummaryName)

        self.textData = TextData(self.args)

        tf.reset_default_graph()

        # Prepare the model
        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        self.saver = tf.train.Saver(max_to_keep=200)

        # Running session
        self.sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,  # Allows backup device for non GPU-available operations (when forcing GPU)
            log_device_placement=False)  # Too verbose?
        )  # TODO: Replace all sess by self.sess (not necessarily a good idea)?

        self.sess.run(tf.global_variables_initializer())

        # Reload the model eventually (if it exists); in testing mode, the models are not loaded here (but in predictTestset)
        self.managePreviousModel(self.sess)
        print("Done")

    def predict(self, inp):
        if inp == '' or inp == 'exit':
            return

        questionSeq = []  # Will contain the question as seen by the encoder
        answer = self.singlePredict(inp, questionSeq)
        if not answer:
            return

        return self.textData.sequence2str(answer, clean=True)
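    # NOTE (hypothetical usage sketch, not from the original DeepQA code): this
    # daemon-style variant splits setup and inference, so a host program can do:
    #
    #     bot = Chatbot()
    #     bot.run([])                    # builds the graph and restores the checkpoint
    #     print(bot.predict('Hello'))    # returns the decoded answer, or None if too long
    #
    # Unlike main() in the other variants, run() only loads the model; it never trains.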
    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (List<int>): output argument. If given will contain the input batch sequence
        Return:
            list <int>: the word ids corresponding to the answer
        """
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:  # If the caller wants to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)
        answer = self.textData.deco2sentence(output)

        return answer

    def loadEmbedding(self, sess):
        """ Initialize embeddings with pre-trained word2vec vectors
        Will modify the embedding weights of the current loaded model
        Uses the GoogleNews pre-trained values (path hardcoded)
        """
        # Fetch embedding variables from the model
        with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True):
            em_in = tf.get_variable("embedding")
        with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True):
            em_out = tf.get_variable("embedding")

        # Disable training for the embeddings
        variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
        variables.remove(em_in)
        variables.remove(em_out)

        # If restoring a model, we can leave here
        if self.globStep != 0:
            return

        # New model: load the pre-trained word2vec data and initialize the embeddings
        embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource)
        embeddings_format = os.path.splitext(embeddings_path)[1][1:]
        print("Loading pre-trained word embeddings from %s " % embeddings_path)
        with open(embeddings_path, "rb") as f:
            header = f.readline()
            vocab_size, vector_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * vector_size
            initW = np.random.uniform(-0.25, 0.25, (len(self.textData.word2id), vector_size))
            for line in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = b''.join(word).decode('utf-8')
                        break
                    if ch != b'\n':
                        word.append(ch)
                if word in self.textData.word2id:
                    if embeddings_format == 'bin':
                        vector = np.fromstring(f.read(binary_len), dtype='float32')
                    elif embeddings_format == 'vec':
                        vector = np.fromstring(f.readline(), sep=' ', dtype='float32')
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)
                    initW[self.textData.word2id[word]] = vector
                else:
                    if embeddings_format == 'bin':
                        f.read(binary_len)
                    elif embeddings_format == 'vec':
                        f.readline()
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)

        # PCA decomposition to reduce word2vec dimensionality
        if self.args.embeddingSize < vector_size:
            U, s, Vt = np.linalg.svd(initW, full_matrices=False)
            S = np.zeros((vector_size, vector_size), dtype='float32')  # Bug fix: was dtype=complex, which made initW complex and would break the assign() calls below
            S[:vector_size, :vector_size] = np.diag(s)
            initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize])

        # Initialize input and output embeddings
        sess.run(em_in.assign(initW))
        sess.run(em_out.assign(initW))
    def managePreviousModel(self, sess):
        """ Restore or reset the model, depending on the parameters
        If the destination directory already contains some files, it will handle the conflict as follows:
         * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training
           restarts from scratch (globStep & co. reinitialized)
         * Otherwise, it depends on the directory content. If the directory contains:
           * No model files (only summary logs): works as a reset (restart from scratch)
           * Other model files, but modelName not found (surely the keepAll option changed): raise an error, the user
             should decide by himself what to do
           * The right model file (eventually some others): no problem, simply resume the training
        In any case, the directory will exist as it has been created by the summary writer
        Args:
            sess: The current running session
        """
        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            if os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args.reset = True
        else:
            print('No previous model found, starting from a clean directory: {}'.format(self.modelDir))
    def loadModelParams(self):
        """ Load some of the values associated with the current model, like the current globStep value
        For now, this function does not need to be called before loading the model (no parameters restored). However,
        the modelDir name will be initialized here, so it is required to call this function before managePreviousModel(),
        _getModelName() or _getSummaryName()
        Warning: if you modify this function, make sure the changes mirror saveModelParams; also check if the parameters
        should be reset in managePreviousModel
        """
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)

        # Loading
        config = configparser.ConfigParser()
        config.read(configName)

        # Check the version
        currentVersion = config['General'].get('version')
        if currentVersion != self.CONFIG_VERSION:
            raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

        # Restoring the parameters
        self.globStep = config['General'].getint('globStep')
        self.args.watsonMode = config['General'].getboolean('watsonMode')
        self.args.autoEncode = config['General'].getboolean('autoEncode')
        self.args.corpus = config['General'].get('corpus')

        self.args.datasetTag = config['Dataset'].get('datasetTag')
        self.args.maxLength = config['Dataset'].getint('maxLength')  # We need to restore the model length because of the associated textData and the vocabulary size (TODO: Compatibility mode between different maxLength)
        self.args.filterVocab = config['Dataset'].getint('filterVocab')
        self.args.skipLines = config['Dataset'].getboolean('skipLines')
        self.args.vocabularySize = config['Dataset'].getint('vocabularySize')

        self.args.hiddenSize = config['Network'].getint('hiddenSize')
        self.args.numLayers = config['Network'].getint('numLayers')
        self.args.softmaxSamples = config['Network'].getint('softmaxSamples')
        self.args.initEmbeddings = config['Network'].getboolean('initEmbeddings')
        self.args.embeddingSize = config['Network'].getint('embeddingSize')
        self.args.embeddingSource = config['Network'].get('embeddingSource')

        # No restoring for training params, batch size or other non model dependent parameters

        # Show the restored params
        print()
        print('Warning: Restoring parameters:')
        print('globStep: {}'.format(self.globStep))
        print('watsonMode: {}'.format(self.args.watsonMode))
        print('autoEncode: {}'.format(self.args.autoEncode))
        print('corpus: {}'.format(self.args.corpus))
        print('datasetTag: {}'.format(self.args.datasetTag))
        print('maxLength: {}'.format(self.args.maxLength))
        print('filterVocab: {}'.format(self.args.filterVocab))
        print('skipLines: {}'.format(self.args.skipLines))
        print('vocabularySize: {}'.format(self.args.vocabularySize))
        print('hiddenSize: {}'.format(self.args.hiddenSize))
        print('numLayers: {}'.format(self.args.numLayers))
        print('softmaxSamples: {}'.format(self.args.softmaxSamples))
        print('initEmbeddings: {}'.format(self.args.initEmbeddings))
        print('embeddingSize: {}'.format(self.args.embeddingSize))
        print('embeddingSource: {}'.format(self.args.embeddingSource))
        print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training; those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir
Return: str: The path and name were the model need to be saved """ modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE) if self.args.keepAll: # We do not erase the previously saved model by including the current step on the name modelName += '-' + str(self.globStep) return modelName + self.MODEL_EXT def getDevice(self): """ Parse the argument to decide on which device run the model Return: str: The name of the device on which run the program """ if self.args.device == 'cpu': return '/cpu:0' elif self.args.device == 'gpu': return '/gpu:0' elif self.args.device is None: # No specified device (default) return None else: print( 'Warning: Error in the device name: {}, use the default device' .format(self.args.device)) return None
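# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original sources): the
# loadModelParams()/saveModelParams() pair above is essentially a configparser
# round-trip over 'params.ini'. The minimal standalone example below shows the
# same pattern. The section/key names mirror the code above, but the
# demoSaveParams/demoLoadParams helpers, the '0.5' version string, and the
# file path are hypothetical, introduced here only for illustration.
import configparser

def demoSaveParams(path, globStep, hiddenSize):
    config = configparser.ConfigParser()
    config['General'] = {'version': '0.5', 'globStep': str(globStep)}
    config['Network'] = {'hiddenSize': str(hiddenSize)}
    with open(path, 'w') as configFile:
        config.write(configFile)

def demoLoadParams(path):
    config = configparser.ConfigParser()
    if not config.read(path):  # read() returns the list of files successfully parsed
        raise FileNotFoundError(path)
    if config['General'].get('version') != '0.5':
        raise UserWarning('Configuration version mismatch')
    # getint() performs the str -> int conversion that demoSaveParams undid with str()
    return (config['General'].getint('globStep'),
            config['Network'].getint('hiddenSize'))

# Usage: demoSaveParams('demo_params.ini', 100, 512)
#        assert demoLoadParams('demo_params.ini') == (100, 512)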
class Chatbot: """ Main class which launch the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # The user can write his own questions DAEMON = 'daemon' # The chatbot runs on background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represent the number of iteration for the current model # TensorFlow main session (we keep track for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save/model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' self.CONFIG_VERSION = '0.4' self.TEST_IN_NAME = 'data/test/samples.txt' self.TEST_OUT_SUFFIX = '_predictions.txt' self.SENTENCES_PREFIX = ['Q: ', 'A: '] @staticmethod def parseArgs(args): """ Parse the arguments from the given command line Args: args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed """ parser = argparse.ArgumentParser() # Global options globalArgs = parser.add_argument_group('Global options') globalArgs.add_argument( '--test', nargs='?', choices=[ Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON ], const=Chatbot.TestMode.ALL, default=None, help= 'if present, launch the program try to answer all sentences from data/test/ with' ' the defined model(s), in interactive mode, the user can wrote his own sentences,' ' use daemon mode to integrate the chatbot in another program') globalArgs.add_argument( '--createDataset', action='store_true', help= 'if present, the program will only generate the dataset from the corpus (no training/testing)' ) globalArgs.add_argument( '--playDataset', type=int, nargs='?', const=10, default=None, help= 'if set, the program will randomly play some samples(can be use conjointly with createDataset if this is the only action you want to perform)' ) globalArgs.add_argument( '--reset', action='store_true', help= 'use this if you want to ignore the previous model present on the model directory (Warning: the model will be destroyed with all the folder content)' ) globalArgs.add_argument( '--verbose', action='store_true', help= 'When testing, will plot the outputs at the same time they are computed' ) globalArgs.add_argument( '--keepAll', action='store_true', help= 'If this option is set, all saved model will be keep (Warning: make sure you have enough free disk space or increase saveEvery)' ) # TODO: Add an option to delimit the max size globalArgs.add_argument( '--modelTag', type=str, default=None, help='tag to differentiate which model to store/load') globalArgs.add_argument( '--rootDir', type=str, default=None, help='folder where to look for the models and data') globalArgs.add_argument( '--watsonMode', action='store_true', help= 'Inverse the questions and answer when training (the network try to guess the question)' ) globalArgs.add_argument( '--device', type=str, default=None, help= '\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allow to choose on which hardware run the model' ) globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication') # Dataset options datasetArgs = 
parser.add_argument_group('Dataset options') datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus on which extract the dataset.') datasetArgs.add_argument( '--datasetTag', type=str, default='', help= 'add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions' ) # The samples are computed from the corpus if it does not exist already. There are saved in \'data/samples/\' datasetArgs.add_argument( '--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset' ) # Not implemented, useless ? datasetArgs.add_argument( '--maxLength', type=int, default=10, help= 'maximum length of the sentence (for input and output), define number of maximum step of the RNN' ) # Network options (Warning: if modifying something here, also make the change on save/loadParams() ) nnArgs = parser.add_argument_group('Network options', 'architecture related option') nnArgs.add_argument('--hiddenSize', type=int, default=256, help='number of hidden units in each RNN cell') nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers') nnArgs.add_argument('--embeddingSize', type=int, default=32, help='embedding size of the word representation') nnArgs.add_argument( '--initEmbeddings', action='store_true', help= 'if present, the program will initialize the embeddings with pre-trained word2vec vectors' ) nnArgs.add_argument( '--softmaxSamples', type=int, default=0, help= 'Number of samples in the sampled softmax loss function. A value of 0 deactivates sampled softmax' ) nnArgs.add_argument( '--useAttentions', action='store_true', help='if present, the program will use attention mechanism') # Training options trainingArgs = parser.add_argument_group('Training options') trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run') trainingArgs.add_argument( '--saveEvery', type=int, default=1000, help='nb of mini-batch step before creating a model checkpoint') trainingArgs.add_argument('--batchSize', type=int, default=10, help='mini-batch size') trainingArgs.add_argument('--learningRate', type=float, default=0.001, help='Learning rate') trainingArgs.add_argument('--dropout', type=float, default=0.9, help='Dropout rate (keep probabilities)') return parser.parse_args(args) def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to DeepQA v0.1 !') print() print('TensorFlow detected: v{}'.format(tf.__version__)) # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd( ) # Use the current working directory self.loadModelParams() self.textData = TextData(self.args) if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) # tf0.12 and before:self.writer = tf.train.SummaryWriter(self._getSummaryName()) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=200) # Arbitrary limit ? 
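        # Note (added comment): max_to_keep=200 makes tf.train.Saver retain the
        # 200 most recent checkpoint files before deleting older ones (the TF
        # default is 5). With --keepAll, _getModelName() appends globStep to
        # each checkpoint name, so every save is a distinct file and this limit
        # is what ultimately bounds disk usage.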
# Running session config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Reload the model eventually (if it exist.), on testing mode, # the models are not loaded here (but in predictTestset) if self.args.test != Chatbot.TestMode.ALL: self.managePreviousModel(self.sess) # Use attention mechanism in model by creating embedding_attention model if self.args.useAttentions: print('Using attention mechanism in model') if self.args.softmaxSamples == 0: print( 'Warning: Use attention mechanism without softmax samples ' 'requires larger memory space and may raise OOM exception.' ) print('Recommend to rerun the program and train the model ' 'with softmaxSamples and useAttentions arguments') # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: print( "Loading pre-trained embeddings from GoogleNews-vectors-negative300.bin" ) self.loadEmbedding(self.sess) if self.args.test: if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.ALL: print('Start predicting...') self.predictTestset(self.sess) print('All predictions done') elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format( self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End! Thanks for using this program") def mainTrain(self, sess): """ Training loop Args: sess: The current running session """ # Specific training dependent loading self.textData.makeLighter( self.args.ratioDataset) # Limit the number of training samples mergedSummaries = tf.summary.merge_all() if self.globStep == 0: # Not restoring from previous run self.writer.add_graph(sess.graph) # First time only # If restoring a model, restore the progression bar ? and current batch ? print('Start training (press Ctrl+C to save and exit)...') try: # If the user exit while training, we still try to save the model for e in range(self.args.numEpochs): print() print("----- Epoch {}/{} ; (lr={}) -----".format( e + 1, self.args.numEpochs, self.args.learningRate)) batches = self.textData.getBatches() # TODO: Also update learning parameters eventually tic = datetime.datetime.now() for nextBatch in tqdm(batches, desc="Training"): # Training pass ops, feedDict = self.model.step(nextBatch) assert len(ops) == 2 # training, loss _, loss, summary = sess.run(ops + (mergedSummaries, ), feedDict) self.writer.add_summary(summary, self.globStep) self.globStep += 1 # Output training status if self.globStep % 100 == 0: perplexity = math.exp( float(loss)) if loss < 300 else float("inf") tqdm.write( "----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity)) # Checkpoint if self.globStep % self.args.saveEvery == 0: self._saveSession(sess) toc = datetime.datetime.now() print( "Epoch finished in {}".format(toc - tic) ) # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer except (KeyboardInterrupt, SystemExit): # If the user press Ctrl+C while testing progress print('Interruption detected, exiting the program...') self._saveSession(sess) # Ultimate saving before complete exit def predictTestset(self, sess): """ Try predicting the sentences from the samples.txt file. 
The sentences are saved on the modelDir under the same name Args: sess: The current running session """ lines = [] hypseqs = [] refs = [] hyps = [] average_bleu = 0 average_gleu = 0 average_uni_ratio = 0 average_bi_ratio = 0 total_token = 0 uni_dict = {} bi_dict = {} av_total_uni_ratio = 0.0 av_total_bi_ratio = 0.0 flag = self.textData.loadTestData(self.textData.testSamplesDir) if not flag: # Loading the file to predict with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f: lines = f.readlines() else: for sample in self.textData.testingSamples: lines.append(self.textData.sequence2str(sample[0], clean=True)) hypseqs.append( self.textData.sequence2str(sample[1], clean=True)) modelList = self._getModelList() if not modelList: print( 'Warning: No model found in \'{}\'. Please train a model before trying to predict' .format(self.modelDir)) return # Predicting for each model present in modelDir for modelName in sorted(modelList): # TODO: Natural sorting print('Restoring previous model from {}'.format(modelName)) self.saver.restore(sess, modelName) print('Testing...') saveName = modelName[:-len( self.MODEL_EXT )] + self.TEST_OUT_SUFFIX # We remove the model extension and add the prediction suffix with open(saveName, 'w') as f: nbIgnored = 0 index = 0 for line in tqdm(lines, desc='Sentences'): if not flag: question = line[:-1] # Remove the endl character else: question = line answer = self.singlePredict(question) if not answer: nbIgnored += 1 continue # Back to the beginning, try again if not flag: predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format( question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX) else: predString = '{x[0]}{0}\n{x[1]}{1}\n{y}{2}\n\n'.format( question, self.textData.sequence2str(answer, clean=True), hypseqs[index], x=self.SENTENCES_PREFIX, y='T: ') ref = nltk.word_tokenize( self.textData.sequence2str(answer, clean=True)) refs.append([ref]) hyp = nltk.word_tokenize(hypseqs[index]) hyps.append(hyp) tokens = len(ref) total_token += tokens dic1 = {k: ref.count(k) for k in set(ref)} uni_types = len(dic1) dic2 = {} for i in range(len(ref) - 1): item = ref[i] + ' ' + ref[i + 1] if item in dic2.keys(): dic2[item] += 1 else: dic2[item] = 1 bi_types = len(dic2) if tokens > 0: uni_ratio = float(uni_types) / float(tokens) else: uni_ratio = 0 if tokens > 1: bi_ratio = float(bi_types) / float(tokens - 1) else: bi_ratio = 0 for it1 in dic1.keys(): if it1 in uni_dict.keys(): uni_dict[it1] += 1 else: uni_dict[it1] = 1 for it2 in dic2.keys(): if it2 in bi_dict.keys(): bi_dict[it2] += 1 else: bi_dict[it2] = 1 # bleu = bleu_score.sentence_bleu([ref], hyp, smoothing_function=bleu_score.SmoothingFunction().method2, weights=[0.3, 0.3, 0.2, 0.2]) bleu = bleu_score.sentence_bleu( [ref], hyp, smoothing_function=bleu_score.SmoothingFunction( ).method2) try: gleu = gleu_score.sentence_gleu(ref, hyp) except (ZeroDivisionError): print( "Error: Division by zero, need smoothing function." 
) gleu = 0.0 predString = predString + ( "Sentence BLEU %.4f, Sentence Google-BlEU %.4f.\n" "Unigram diversity %.4f, Bigram diversity %.4f.\n\n" % (bleu, gleu, uni_ratio, bi_ratio)) average_bleu += bleu average_gleu += gleu average_bi_ratio += bi_ratio average_uni_ratio += uni_ratio if self.args.verbose: tqdm.write(predString) f.write(predString) index += 1 if flag: average_bleu /= (len(lines) - nbIgnored) average_gleu /= (len(lines) - nbIgnored) average_uni_ratio /= (len(lines) - nbIgnored) average_bi_ratio /= (len(lines) - nbIgnored) av_total_uni_ratio = float( len(uni_dict)) / float(total_token) av_total_bi_ratio = float( len(bi_dict)) / float(total_token - len(lines) + nbIgnored) # corpus_bleu = bleu_score.corpus_bleu(refs, hyps, # smoothing_function=bleu_score.SmoothingFunction().method2, # weights=[0.3, 0.3, 0.2, 0.2]) corpus_bleu = bleu_score.corpus_bleu( refs, hyps, smoothing_function=bleu_score.SmoothingFunction( ).method2) f.write( "Average BLEU %.4f, Average Google-BLEU %.4f, Corpus BLEU %.4f.\n" "Average Unigram diversity %.4f, Average Bigram diversity %.4f.\n" "Average T-Unigram diversity %.4f, Average T-Bigram diversity %.4f." % (average_bleu, average_gleu, corpus_bleu, average_uni_ratio, average_bi_ratio, av_total_uni_ratio, av_total_bi_ratio)) print( 'Prediction finished, {}/{} sentences ignored (too long)'. format(nbIgnored, len(lines))) def mainTestInteractive(self, sess): """ Try predicting the sentences that the user will enter in the console Args: sess: The current running session """ # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also) # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode) # TODO: Log the questions asked for latter re-use (merge with test/samples.txt) print('Testing: Launch interactive mode:') print('') print( 'Welcome to the interactive mode, here you can ask to Deep Q&A the sentence you want. Don\'t have high ' 'expectation. Type \'exit\' or just press ENTER to quit the program. Have fun.' ) while True: question = input(self.SENTENCES_PREFIX[0]) if question == '' or question == 'exit': break questionSeq = [ ] # Will be contain the question as seen by the encoder answer = self.singlePredict(question, questionSeq) if not answer: print( 'Warning: sentence too long, sorry. Maybe try a simpler sentence.' ) continue # Back to the beginning, try again print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True))) if self.args.verbose: print( self.textData.batchSeq2str(questionSeq, clean=True, reverse=True)) print(self.textData.sequence2str(answer)) print() def singlePredict(self, question, questionSeq=None): """ Predict the sentence Args: question (str): the raw input sentence questionSeq (List<int>): output argument. If given will contain the input batch sequence Return: list <int>: the word ids corresponding to the answer """ # Create the input batch batch = self.textData.sentence2enco(question) if not batch: return None if questionSeq is not None: # If the caller want to have the real input questionSeq.extend(batch.encoderSeqs) # Run the model ops, feedDict = self.model.step(batch) output = self.sess.run( ops[0], feedDict) # TODO: Summarize the output too (histogram, ...) 
answer = self.textData.deco2sentence(output) return answer def daemonPredict(self, sentence): """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning) Args: sentence (str): the raw input sentence Return: str: the human readable sentence """ return self.textData.sequence2str(self.singlePredict(sentence), clean=True) def daemonClose(self): """ A utility function to close the daemon when finish """ print('Exiting the daemon mode...') self.sess.close() print('Daemon closed.') def loadEmbedding(self, sess): """ Initialize embeddings with pre-trained word2vec vectors Will modify the embedding weights of the current loaded model Uses the GoogleNews pre-trained values (path hardcoded) """ # Fetch embedding variables from model if self.args.useAttentions: with tf.variable_scope( "embedding_attention_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope( "embedding_attention_seq2seq/embedding_attention_decoder", reuse=True): em_out = tf.get_variable("embedding") else: with tf.variable_scope( "embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope( "embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True): em_out = tf.get_variable("embedding") # Disable training for embeddings variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) variables.remove(em_in) variables.remove(em_out) # If restoring a model, we can leave here if self.globStep != 0: return # New model, we load the pre-trained word2vec data and initialize embeddings with open( os.path.join( self.args.rootDir, 'data/word2vec/GoogleNews-vectors-negative300.bin'), "rb", 0) as f: header = f.readline() vocab_size, vector_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * vector_size initW = np.random.uniform( -0.25, 0.25, (len(self.textData.word2id), vector_size)) for line in tqdm(range(vocab_size)): word = [] while True: ch = f.read(1) if ch == b' ': word = b''.join(word).decode('utf-8') break if ch != b'\n': word.append(ch) if word in self.textData.word2id: initW[self.textData.word2id[word]] = np.fromstring( f.read(binary_len), dtype='float32') else: f.read(binary_len) # PCA Decomposition to reduce word2vec dimensionality if self.args.embeddingSize < vector_size: U, s, Vt = np.linalg.svd(initW, full_matrices=False) S = np.zeros((vector_size, vector_size), dtype=complex) S[:vector_size, :vector_size] = np.diag(s) initW = np.dot( U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize]) # Initialize input and output embeddings sess.run(em_in.assign(initW)) sess.run(em_out.assign(initW)) def managePreviousModel(self, sess): """ Restore or reset the model, depending of the parameters If the destination directory already contains some file, it will handle the conflict as following: * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training restart from scratch (globStep & cie reinitialized) * Otherwise, it will depend of the directory content. 
If the directory contains: * No model files (only summary logs): works as a reset (restart from scratch) * Other model files, but modelName not found (surely keepAll option changed): raise error, the user should decide by himself what to do * The right model file (eventually some other): no problem, simply resume the training In any case, the directory will exist as it has been created by the summary writer Args: sess: The current running session """ # print('WARNING: ', end='') modelName = self._getModelName() ckpt = tf.train.get_checkpoint_state(self.modelDir) if os.listdir(self.modelDir): if self.args.reset: print('Reset: Destroying previous model at {}'.format( self.modelDir)) # Analysing directory content elif ckpt and tf.train.checkpoint_exists( ckpt.model_checkpoint_path ) and modelName == ckpt.model_checkpoint_path: # os.path.exists(modelName): # Restore the model print('Restoring previous model from {}'.format(modelName)) self.saver.restore(sess, ckpt.model_checkpoint_path) # self.saver.restore(sess, modelName) # Will crash when --reset is not activated and the model has not been saved yet print('Model restored.') elif self._getModelList(): print('Conflict with previous models.') raise RuntimeError( 'Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)' .format(self.modelDir)) else: # No other model to conflict with (probably summary files) print( 'No previous model found, but some files found at {}. Cleaning...' .format(self.modelDir)) # Warning: No confirmation asked self.args.reset = True if self.args.reset: fileList = [ os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) ] for f in fileList: print('Removing {}'.format(f)) os.remove(f) else: print('No previous model found, starting from clean directory: {}'. format(self.modelDir)) def _saveSession(self, sess): """ Save the model parameters and the variables Args: sess: the current session """ tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...') self.saveModelParams() self.saver.save(sess, self._getModelName() ) # TODO: Put a limit size (ex: 3GB for the modelDir) tqdm.write('Model saved.') def _getModelList(self): """ Return the list of the model files inside the model directory """ return [ os.path.join(self.modelDir, f[0:f.index('.ckpt') + 5]) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT + '.index') ] def loadModelParams(self): """ Load the some values associated with the current model, like the current globStep value For now, this function does not need to be called before loading the model (no parameters restored). 
However, the modelDir name will be initialized here so it is required to call this function before managePreviousModel(), _getModelName() or _getSummaryName() Warning: if you modify this function, make sure the changes mirror saveModelParams, also check if the parameters should be reset in managePreviousModel """ # Compute the current model path self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE) if self.args.modelTag: self.modelDir += '-' + self.args.modelTag # If there is a previous model, restore some parameters configName = os.path.join(self.modelDir, self.CONFIG_FILENAME) if not self.args.reset and not self.args.createDataset and os.path.exists( configName): # Loading config = configparser.ConfigParser() config.read(configName) # Check the version currentVersion = config['General'].get('version') if currentVersion != self.CONFIG_VERSION: raise UserWarning( 'Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\'' .format(currentVersion, self.CONFIG_VERSION, configName)) # Restoring the the parameters self.globStep = config['General'].getint('globStep') self.args.maxLength = config['General'].getint( 'maxLength' ) # We need to restore the model length because of the textData associated and the vocabulary size (TODO: Compatibility mode between different maxLength) self.args.watsonMode = config['General'].getboolean('watsonMode') self.args.corpus = config['General'].get('corpus') self.args.datasetTag = config['General'].get('datasetTag', '') self.args.hiddenSize = config['Network'].getint('hiddenSize') self.args.numLayers = config['Network'].getint('numLayers') self.args.embeddingSize = config['Network'].getint('embeddingSize') self.args.initEmbeddings = config['Network'].getboolean( 'initEmbeddings') self.args.softmaxSamples = config['Network'].getint( 'softmaxSamples') self.args.useAttentions = config['Network'].getboolean( 'useAttentions') # No restoring for training params, batch size or other non model dependent parameters # Show the restored params print() print('Warning: Restoring parameters:') print('globStep: {}'.format(self.globStep)) print('maxLength: {}'.format(self.args.maxLength)) print('watsonMode: {}'.format(self.args.watsonMode)) print('corpus: {}'.format(self.args.corpus)) print('datasetTag: {}'.format(self.args.datasetTag)) print('hiddenSize: {}'.format(self.args.hiddenSize)) print('numLayers: {}'.format(self.args.numLayers)) print('embeddingSize: {}'.format(self.args.embeddingSize)) print('initEmbeddings: {}'.format(self.args.initEmbeddings)) print('softmaxSamples: {}'.format(self.args.softmaxSamples)) print('useAttentions: {}'.format(self.args.useAttentions)) print() # For now, not arbitrary independent maxLength between encoder and decoder self.args.maxLengthEnco = self.args.maxLength self.args.maxLengthDeco = self.args.maxLength + 2 if self.args.watsonMode: self.SENTENCES_PREFIX.reverse() def saveModelParams(self): """ Save the params of the model, like the current globStep value Warning: if you modify this function, make sure the changes mirror loadModelParams """ config = configparser.ConfigParser() config['General'] = {} config['General']['version'] = self.CONFIG_VERSION config['General']['globStep'] = str(self.globStep) config['General']['maxLength'] = str(self.args.maxLength) config['General']['watsonMode'] = str(self.args.watsonMode) config['General']['corpus'] = str(self.args.corpus) config['General']['datasetTag'] = str(self.args.datasetTag) config['Network'] = {} config['Network']['hiddenSize'] = 
str(self.args.hiddenSize) config['Network']['numLayers'] = str(self.args.numLayers) config['Network']['embeddingSize'] = str(self.args.embeddingSize) config['Network']['initEmbeddings'] = str(self.args.initEmbeddings) config['Network']['softmaxSamples'] = str(self.args.softmaxSamples) config['Network']['useAttentions'] = str(self.args.useAttentions) # Keep track of the learning params (but without restoring them) config['Training (won\'t be restored)'] = {} config['Training (won\'t be restored)']['learningRate'] = str( self.args.learningRate) config['Training (won\'t be restored)']['batchSize'] = str( self.args.batchSize) config['Training (won\'t be restored)']['dropout'] = str( self.args.dropout) with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile: config.write(configFile) def _getSummaryName(self): """ Parse the argument to decide were to save the summary, at the same place that the model The folder could already contain logs if we restore the training, those will be merged Return: str: The path and name of the summary """ return self.modelDir def _getModelName(self): """ Parse the argument to decide were to save/load the model This function is called at each checkpoint and the first time the model is load. If keepAll option is set, the globStep value will be included in the name. Return: str: The path and name were the model need to be saved """ modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE) if self.args.keepAll: # We do not erase the previously saved model by including the current step on the name modelName += '-' + str(self.globStep) return modelName + self.MODEL_EXT def getDevice(self): """ Parse the argument to decide on which device run the model Return: str: The name of the device on which run the program """ if self.args.device == 'cpu': return '/cpu:0' elif self.args.device == 'gpu': return '/gpu:0' elif self.args.device is None: # No specified device (default) return None else: print( 'Warning: Error in the device name: {}, use the default device' .format(self.args.device)) return None
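# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original sources): the evaluation loop in
# predictTestset() above combines smoothed sentence-level BLEU/GLEU with
# unigram/bigram "distinct-n" diversity ratios. The standalone example below
# reproduces those metrics with NLTK, assuming the punkt tokenizer data is
# available (nltk.download('punkt')). Note that recent NLTK versions expect a
# *list* of references, i.e. sentence_gleu([ref], hyp); the bare
# sentence_gleu(ref, hyp) call above matches an older API.
# demoSentenceMetrics is a hypothetical helper name.
import nltk
from nltk.translate import bleu_score, gleu_score

def demoSentenceMetrics(reference, hypothesis):
    ref = nltk.word_tokenize(reference)
    hyp = nltk.word_tokenize(hypothesis)
    # method2 is add-one smoothing on the n-gram precisions, which avoids
    # zero BLEU scores on short sentences
    bleu = bleu_score.sentence_bleu(
        [ref], hyp, smoothing_function=bleu_score.SmoothingFunction().method2)
    gleu = gleu_score.sentence_gleu([ref], hyp)
    # Diversity: distinct unigram/bigram types over the token (pair) count,
    # as computed over the model answer in predictTestset()
    unigrams = set(ref)
    bigrams = {(ref[i], ref[i + 1]) for i in range(len(ref) - 1)}
    uni_ratio = len(unigrams) / len(ref) if ref else 0.0
    bi_ratio = len(bigrams) / (len(ref) - 1) if len(ref) > 1 else 0.0
    return bleu, gleu, uni_ratio, bi_ratio

# Usage: demoSentenceMetrics('the cat sat on the mat', 'the cat is on the mat')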
class Chatbot: """ Main class which launch the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # The user can write his own questions DAEMON = 'daemon' # The chatbot runs on background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represent the number of iteration for the current model # TensorFlow main session (we keep track for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save/model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' self.CONFIG_VERSION = '0.5' self.TEST_IN_NAME = 'data/test/samples.txt' self.TEST_OUT_SUFFIX = '_predictions.txt' self.SENTENCES_PREFIX = ['Q: ', 'A: '] @staticmethod def parseArgs(args): """ Parse the arguments from the given command line Args: args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed """ parser = argparse.ArgumentParser() # Global options globalArgs = parser.add_argument_group('Global options') globalArgs.add_argument( '--test', nargs='?', choices=[ Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON ], const=Chatbot.TestMode.ALL, default=None, help= 'if present, launch the program try to answer all sentences from data/test/ with' ' the defined model(s), in interactive mode, the user can wrote his own sentences,' ' use daemon mode to integrate the chatbot in another program') globalArgs.add_argument( '--createDataset', action='store_true', help= 'if present, the program will only generate the dataset from the corpus (no training/testing)' ) globalArgs.add_argument( '--playDataset', type=int, nargs='?', const=10, default=None, help= 'if set, the program will randomly play some samples(can be use conjointly with createDataset if this is the only action you want to perform)' ) globalArgs.add_argument( '--reset', action='store_true', help= 'use this if you want to ignore the previous model present on the model directory (Warning: the model will be destroyed with all the folder content)' ) globalArgs.add_argument( '--verbose', action='store_true', help= 'When testing, will plot the outputs at the same time they are computed' ) globalArgs.add_argument( '--debug', action='store_true', help= 'run DeepQA with Tensorflow debug mode. Read TF documentation for more details on this.' 
) globalArgs.add_argument( '--keepAll', action='store_true', help= 'If this option is set, all saved model will be kept (Warning: make sure you have enough free disk space or increase saveEvery)' ) # TODO: Add an option to delimit the max size globalArgs.add_argument( '--modelTag', type=str, default=None, help='tag to differentiate which model to store/load') globalArgs.add_argument( '--rootDir', type=str, default=None, help='folder where to look for the models and data') globalArgs.add_argument( '--watsonMode', action='store_true', help= 'Inverse the questions and answer when training (the network try to guess the question)' ) globalArgs.add_argument( '--autoEncode', action='store_true', help= 'Randomly pick the question or the answer and use it both as input and output' ) globalArgs.add_argument( '--device', type=str, default=None, help= '\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allow to choose on which hardware run the model' ) globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication') # Dataset options datasetArgs = parser.add_argument_group('Dataset options') datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus on which extract the dataset.') datasetArgs.add_argument( '--datasetTag', type=str, default='', help= 'add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.' ) # The samples are computed from the corpus if it does not exist already. There are saved in \'data/samples/\' datasetArgs.add_argument( '--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset' ) # Not implemented, useless ? datasetArgs.add_argument( '--maxLength', type=int, default=10, help= 'maximum length of the sentence (for input and output), define number of maximum step of the RNN' ) datasetArgs.add_argument( '--filterVocab', type=int, default=1, help= 'remove rarelly used words (by default words used only once). 0 to keep all words.' ) datasetArgs.add_argument( '--skipLines', action='store_true', help= 'Generate training samples by only using even conversation lines as questions (and odd lines as answer). Useful to train the network on a particular person.' ) datasetArgs.add_argument( '--vocabularySize', type=int, default=40000, help='Limit the number of words in the vocabulary (0 for unlimited)' ) # Network options (Warning: if modifying something here, also make the change on save/loadParams() ) nnArgs = parser.add_argument_group('Network options', 'architecture related option') nnArgs.add_argument('--hiddenSize', type=int, default=512, help='number of hidden units in each RNN cell') nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers') nnArgs.add_argument( '--softmaxSamples', type=int, default=0, help= 'Number of samples in the sampled softmax loss function. 
A value of 0 deactivates sampled softmax' ) nnArgs.add_argument( '--initEmbeddings', action='store_true', help= 'if present, the program will initialize the embeddings with pre-trained word2vec vectors' ) nnArgs.add_argument('--embeddingSize', type=int, default=64, help='embedding size of the word representation') nnArgs.add_argument( '--embeddingSource', type=str, default="GoogleNews-vectors-negative300.bin", help='embedding file to use for the word representation') # Training options trainingArgs = parser.add_argument_group('Training options') trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run') trainingArgs.add_argument( '--saveEvery', type=int, default=2000, help='nb of mini-batch step before creating a model checkpoint') trainingArgs.add_argument('--batchSize', type=int, default=256, help='mini-batch size') trainingArgs.add_argument('--learningRate', type=float, default=0.002, help='Learning rate') trainingArgs.add_argument('--dropout', type=float, default=0.9, help='Dropout rate (keep probabilities)') return parser.parse_args(args) def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to DeepQA v0.1 !') print() print('TensorFlow detected: v{}'.format(tf.__version__)) # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd( ) # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams( ) # Update the self.modelDir and self.globStep, for now, not used when loading Model (but need to be called before _getSummaryName) self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model are trained for a specific dataset (because of the maxLength which define the # vocabulary). Add a compatibility mode which allow to launch a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, all which cames after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it works globally for all files ?) # Running session self.sess = tf.Session( config=tf.ConfigProto( allow_soft_placement= True, # Allows backup device for non GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) # TODO: Replace all sess by self.sess (not necessary a good idea) ? 
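        # Note (added comment): allow_soft_placement=True lets TensorFlow fall
        # back to another device (typically the CPU) when an op has no kernel
        # for the requested device, which matters when forcing '--device gpu';
        # log_device_placement would print one line per op placement, hence
        # the "too verbose" remark above. The tfdbg wrapper below (enabled by
        # --debug) opens the TensorFlow CLI debugger, and the 'has_inf_or_nan'
        # filter flags tensors containing inf or NaN values.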
if self.args.debug: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Reload the model eventually (if it exist.), on testing mode, the models are not loaded here (but in predictTestset) if self.args.test != Chatbot.TestMode.ALL: self.managePreviousModel(self.sess) # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: self.loadEmbedding(self.sess) if self.args.test: if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.ALL: print('Start predicting...') self.predictTestset(self.sess) print('All predictions done') elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format( self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End! Thanks for using this program") def mainTrain(self, sess): """ Training loop Args: sess: The current running session """ # Specific training dependent loading self.textData.makeLighter( self.args.ratioDataset) # Limit the number of training samples mergedSummaries = tf.summary.merge_all( ) # Define the summary operator (Warning: Won't appear on the tensorboard graph) if self.globStep == 0: # Not restoring from previous run self.writer.add_graph(sess.graph) # First time only # If restoring a model, restore the progression bar ? and current batch ? print('Start training (press Ctrl+C to save and exit)...') try: # If the user exit while training, we still try to save the model for e in range(self.args.numEpochs): print() print("----- Epoch {}/{} ; (lr={}) -----".format( e + 1, self.args.numEpochs, self.args.learningRate)) batches = self.textData.getBatches() # TODO: Also update learning parameters eventually tic = datetime.datetime.now() for nextBatch in tqdm(batches, desc="Training"): # Training pass ops, feedDict = self.model.step(nextBatch) assert len(ops) == 2 # training, loss _, loss, summary = sess.run(ops + (mergedSummaries, ), feedDict) self.writer.add_summary(summary, self.globStep) self.globStep += 1 # Output training status if self.globStep % 100 == 0: perplexity = math.exp( float(loss)) if loss < 300 else float("inf") tqdm.write( "----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity)) # Checkpoint if self.globStep % self.args.saveEvery == 0: self._saveSession(sess) toc = datetime.datetime.now() print( "Epoch finished in {}".format(toc - tic) ) # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer except (KeyboardInterrupt, SystemExit): # If the user press Ctrl+C while testing progress print('Interruption detected, exiting the program...') self._saveSession(sess) # Ultimate saving before complete exit def predictTestset(self, sess): """ Try predicting the sentences from the samples.txt file. The sentences are saved on the modelDir under the same name Args: sess: The current running session """ # Loading the file to predict with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f: lines = f.readlines() modelList = self._getModelList() if not modelList: print( 'Warning: No model found in \'{}\'. 
Please train a model before trying to predict'.format(self.modelDir))
            return

        # Predicting for each model present in modelDir
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX  # We remove the model extension and add the prediction suffix
            with open(saveName, 'w') as f:
                nbIgnored = 0
                for line in tqdm(lines, desc='Sentences'):
                    question = line[:-1]  # Remove the endl character

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(
                        question,
                        self.textData.sequence2str(answer, clean=True),
                        x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)

            print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

    def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
        # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
        # TODO: Log the questions asked for later re-use (merge with test/samples.txt)

        # Edited by Rishabh for the RentBot application
        trigger_words = [
            'apartment', 'apartments', 'place', 'places', 'house', 'houses',
            'condo', 'condos', 'rent'
        ]  # We can add more trigger words, if need be

        print('Testing: Launch interactive mode:')
        print('')
        print('Welcome to the interactive mode, here you can ask Deep Q&A the sentence you want. Don\'t have high '
              'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')

        name = input('Hi, I am RentApt. I will be helping you to search for apartments today! May I know whom I am talking to?: ')
        print('Howdy %s, how may I help you today?' % name)

        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            call_function_flag = False  # Reset for each question (the flag was never cleared before)
            for word in question.split():  # TODO: Add other delimiters too - Rishabh
                if word.lower() in trigger_words:
                    call_function_flag = True

            if call_function_flag:
                self.rentAnswers(question)
            else:
                questionSeq = []  # Will contain the question as seen by the encoder
                answer = self.singlePredict(question, questionSeq)
                if not answer:
                    print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                    continue  # Back to the beginning, try again

                print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

                if self.args.verbose:
                    print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                    print(self.textData.sequence2str(answer))
                print()

    # By Rishabh
    def rentAnswers(self, question):
        """ Merge the AI bot with the rent bot: answer apartment questions from a CSV
        instead of querying the seq2seq model.
        """
        # Load the apartment data and normalize the price/location columns
        df = pd.read_csv('AptData.csv')
        df['minPrice'] = df['minPrice'].replace('[\$,)]', '', regex=True).replace('[(]', '-', regex=True).astype(float)
        df['maxPrice'] = df['maxPrice'].replace('[\$,)]', '', regex=True).replace('[(]', '-', regex=True).astype(float)
        df['location'] = df['location'].str.lower()

        print('I will be happy to assist you in finding an apartment')
        print('What type of apartment are you looking for? Options: Studio, 1bedroom, 2bedroom, '
              '3bedroom, 4bedroom and 5bedroom. You can choose multiple options separated by spaces')
        question = input(self.SENTENCES_PREFIX[0]).split()

        # Keep the rows matching any of the requested apartment types
        # (equivalent to the original chain of 1-5 hardcoded option cases)
        mask = df[question[0]].str.contains('yes')
        for aptType in question[1:]:
            mask |= df[aptType].str.contains('yes')
        df = df[mask]

        print('How do you want to search your apartment: name based, location based or rent based?')
        question = input(self.SENTENCES_PREFIX[0]).lower()

        # Currently one search criterion at a time; combining criteria could be added later
        if 'location' in question:
            self.locationBased(df)  # Call the location based search function
        elif 'rent' in question:
            self.rentBased(df)
        elif 'name' in question:
            self.nameBased(df)
        else:
            print('Wrong input, please enter again')
        # TODO: Let the user narrow down the search by another parameter (rent/type)

    # By Rishabh
    def locationBased(self, df):
        # Currently searches only by street; other options may be added later on
        print('Please enter the names of the streets you want me to search for, separated by spaces.')
        inLoop = True
        while inLoop:
            question = input(self.SENTENCES_PREFIX[0]).lower()
            streets = question.split()
            for street in streets:
                new_df = df[df['location'].str.contains(street)]  # Search based on location
                new_df = new_df.reset_index(drop=True)  # Re-indexing
                print('On {}, I found {} apartments.'.format(street, len(new_df)))
                for i in range(len(new_df)):
                    inLoop = False  # At least one result found, stop prompting
                    print('{} located at {} has units in the price range of {} to {}, more info is available @ {}'.format(
                        new_df.iloc[i]['name'], new_df.iloc[i]['location'],
                        new_df.iloc[i]['minPrice'], new_df.iloc[i]['maxPrice'],
                        new_df.iloc[i]['website']))

    def nameBased(self, df):
        # Same loop as locationBased, but matching on the apartment name
        print('Please enter the names of the apartments you want me to search for, separated by spaces.')
        inLoop = True
        while inLoop:
            question = input(self.SENTENCES_PREFIX[0]).lower()
            names = question.split()
            for aptName in names:
                new_df = df[df['name'].str.contains(aptName)]  # Search based on name
                new_df = new_df.reset_index(drop=True)  # Re-indexing
                print('For {}, I found {} apartments.'.format(aptName, len(new_df)))
                for i in range(len(new_df)):
                    inLoop = False  # At least one result found, stop prompting
                    print('{} located at {} has units in the price range of {} to {}, more info is available @ {}'.format(
                        new_df.iloc[i]['name'], new_df.iloc[i]['location'],
                        new_df.iloc[i]['minPrice'], new_df.iloc[i]['maxPrice'],
                        new_df.iloc[i]['website']))

    def rentBased(self, df):
        # Rent based search: keep apartments whose price range covers the requested bounds
        print('Please enter the price range separated by a space')
        question = input(self.SENTENCES_PREFIX[0]).lower().split()
        new_df = df[(df['minPrice'] <= float(question[0])) & (df['maxPrice'] >= float(question[1]))]
        print('I found {} apartments.'.format(len(new_df)))
        for i in range(len(new_df)):
            print('{} located at {} has units in the price range of {} to {}, more info is available @ {}'.format(
                new_df.iloc[i]['name'], new_df.iloc[i]['location'],
                new_df.iloc[i]['minPrice'], new_df.iloc[i]['maxPrice'],
                new_df.iloc[i]['website']))

    # def typeBased(self, df):  # TODO: Write the type based search algorithm
    # def multipleCriteria(self, df):  # TODO: Write the multiple criteria based search algorithm

    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (List<int>): output argument. If given will contain the input batch sequence
        Return:
            list<int>: the word ids corresponding to the answer
        """
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:  # If the caller wants to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)
answer = self.textData.deco2sentence(output) return answer def daemonPredict(self, sentence): """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning) Args: sentence (str): the raw input sentence Return: str: the human readable sentence """ return self.textData.sequence2str(self.singlePredict(sentence), clean=True) def daemonClose(self): """ A utility function to close the daemon when finish """ print('Exiting the daemon mode...') self.sess.close() print('Daemon closed.') def loadEmbedding(self, sess): """ Initialize embeddings with pre-trained word2vec vectors Will modify the embedding weights of the current loaded model Uses the GoogleNews pre-trained values (path hardcoded) """ # Fetch embedding variables from model with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True): em_out = tf.get_variable("embedding") # Disable training for embeddings variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) variables.remove(em_in) variables.remove(em_out) # If restoring a model, we can leave here if self.globStep != 0: return # New model, we load the pre-trained word2vec data and initialize embeddings embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource) embeddings_format = os.path.splitext(embeddings_path)[1][1:] print("Loading pre-trained word embeddings from %s " % embeddings_path) with open(embeddings_path, "rb") as f: header = f.readline() vocab_size, vector_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * vector_size initW = np.random.uniform( -0.25, 0.25, (len(self.textData.word2id), vector_size)) for line in tqdm(range(vocab_size)): word = [] while True: ch = f.read(1) if ch == b' ': word = b''.join(word).decode('utf-8') break if ch != b'\n': word.append(ch) if word in self.textData.word2id: if embeddings_format == 'bin': vector = np.fromstring(f.read(binary_len), dtype='float32') elif embeddings_format == 'vec': vector = np.fromstring(f.readline(), sep=' ', dtype='float32') else: raise Exception("Unkown format for embeddings: %s " % embeddings_format) initW[self.textData.word2id[word]] = vector else: if embeddings_format == 'bin': f.read(binary_len) elif embeddings_format == 'vec': f.readline() else: raise Exception("Unkown format for embeddings: %s " % embeddings_format) # PCA Decomposition to reduce word2vec dimensionality if self.args.embeddingSize < vector_size: U, s, Vt = np.linalg.svd(initW, full_matrices=False) S = np.zeros((vector_size, vector_size), dtype=complex) S[:vector_size, :vector_size] = np.diag(s) initW = np.dot( U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize]) # Initialize input and output embeddings sess.run(em_in.assign(initW)) sess.run(em_out.assign(initW)) def managePreviousModel(self, sess): """ Restore or reset the model, depending of the parameters If the destination directory already contains some file, it will handle the conflict as following: * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training restart from scratch (globStep & cie reinitialized) * Otherwise, it will depend of the directory content. 
If the directory contains: * No model files (only summary logs): works as a reset (restart from scratch) * Other model files, but modelName not found (surely keepAll option changed): raise error, the user should decide by himself what to do * The right model file (eventually some other): no problem, simply resume the training In any case, the directory will exist as it has been created by the summary writer Args: sess: The current running session """ print('WARNING: ', end='') modelName = self._getModelName() if os.listdir(self.modelDir): if self.args.reset: print('Reset: Destroying previous model at {}'.format( self.modelDir)) # Analysing directory content elif os.path.exists(modelName): # Restore the model print('Restoring previous model from {}'.format(modelName)) self.saver.restore( sess, modelName ) # Will crash when --reset is not activated and the model has not been saved yet elif self._getModelList(): print('Conflict with previous models.') raise RuntimeError( 'Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)' .format(self.modelDir)) else: # No other model to conflict with (probably summary files) print( 'No previous model found, but some files found at {}. Cleaning...' .format(self.modelDir)) # Warning: No confirmation asked self.args.reset = True if self.args.reset: fileList = [ os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) ] for f in fileList: print('Removing {}'.format(f)) os.remove(f) else: print('No previous model found, starting from clean directory: {}'. format(self.modelDir)) def _saveSession(self, sess): """ Save the model parameters and the variables Args: sess: the current session """ tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...') self.saveModelParams() model_name = self._getModelName() with open( model_name, 'w' ) as f: # HACK: Simulate the old model existance to avoid rewriting the file parser f.write( 'This file is used internally by DeepQA to check the model existance. Please do not remove.\n' ) self.saver.save( sess, model_name) # TODO: Put a limit size (ex: 3GB for the modelDir) tqdm.write('Model saved.') def _getModelList(self): """ Return the list of the model files inside the model directory """ return [ os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT) ] def loadModelParams(self): """ Load the some values associated with the current model, like the current globStep value For now, this function does not need to be called before loading the model (no parameters restored). However, the modelDir name will be initialized here so it is required to call this function before managePreviousModel(), _getModelName() or _getSummaryName() Warning: if you modify this function, make sure the changes mirror saveModelParams, also check if the parameters should be reset in managePreviousModel """ # Compute the current model path self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE) if self.args.modelTag: self.modelDir += '-' + self.args.modelTag # If there is a previous model, restore some parameters configName = os.path.join(self.modelDir, self.CONFIG_FILENAME) if not self.args.reset and not self.args.createDataset and os.path.exists( configName): # Loading config = configparser.ConfigParser() config.read(configName) # Check the version currentVersion = config['General'].get('version') if currentVersion != self.CONFIG_VERSION: raise UserWarning( 'Present configuration version {0} does not match {1}. 
You can try manual changes on \'{2}\'' .format(currentVersion, self.CONFIG_VERSION, configName)) # Restoring the the parameters self.globStep = config['General'].getint('globStep') self.args.watsonMode = config['General'].getboolean('watsonMode') self.args.autoEncode = config['General'].getboolean('autoEncode') self.args.corpus = config['General'].get('corpus') self.args.datasetTag = config['Dataset'].get('datasetTag') self.args.maxLength = config['Dataset'].getint( 'maxLength' ) # We need to restore the model length because of the textData associated and the vocabulary size (TODO: Compatibility mode between different maxLength) self.args.filterVocab = config['Dataset'].getint('filterVocab') self.args.skipLines = config['Dataset'].getboolean('skipLines') self.args.vocabularySize = config['Dataset'].getint( 'vocabularySize') self.args.hiddenSize = config['Network'].getint('hiddenSize') self.args.numLayers = config['Network'].getint('numLayers') self.args.softmaxSamples = config['Network'].getint( 'softmaxSamples') self.args.initEmbeddings = config['Network'].getboolean( 'initEmbeddings') self.args.embeddingSize = config['Network'].getint('embeddingSize') self.args.embeddingSource = config['Network'].get( 'embeddingSource') # No restoring for training params, batch size or other non model dependent parameters # Show the restored params print() print('Warning: Restoring parameters:') print('globStep: {}'.format(self.globStep)) print('watsonMode: {}'.format(self.args.watsonMode)) print('autoEncode: {}'.format(self.args.autoEncode)) print('corpus: {}'.format(self.args.corpus)) print('datasetTag: {}'.format(self.args.datasetTag)) print('maxLength: {}'.format(self.args.maxLength)) print('filterVocab: {}'.format(self.args.filterVocab)) print('skipLines: {}'.format(self.args.skipLines)) print('vocabularySize: {}'.format(self.args.vocabularySize)) print('hiddenSize: {}'.format(self.args.hiddenSize)) print('numLayers: {}'.format(self.args.numLayers)) print('softmaxSamples: {}'.format(self.args.softmaxSamples)) print('initEmbeddings: {}'.format(self.args.initEmbeddings)) print('embeddingSize: {}'.format(self.args.embeddingSize)) print('embeddingSource: {}'.format(self.args.embeddingSource)) print() # For now, not arbitrary independent maxLength between encoder and decoder self.args.maxLengthEnco = self.args.maxLength self.args.maxLengthDeco = self.args.maxLength + 2 if self.args.watsonMode: self.SENTENCES_PREFIX.reverse() def saveModelParams(self): """ Save the params of the model, like the current globStep value Warning: if you modify this function, make sure the changes mirror loadModelParams """ config = configparser.ConfigParser() config['General'] = {} config['General']['version'] = self.CONFIG_VERSION config['General']['globStep'] = str(self.globStep) config['General']['watsonMode'] = str(self.args.watsonMode) config['General']['autoEncode'] = str(self.args.autoEncode) config['General']['corpus'] = str(self.args.corpus) config['Dataset'] = {} config['Dataset']['datasetTag'] = str(self.args.datasetTag) config['Dataset']['maxLength'] = str(self.args.maxLength) config['Dataset']['filterVocab'] = str(self.args.filterVocab) config['Dataset']['skipLines'] = str(self.args.skipLines) config['Dataset']['vocabularySize'] = str(self.args.vocabularySize) config['Network'] = {} config['Network']['hiddenSize'] = str(self.args.hiddenSize) config['Network']['numLayers'] = str(self.args.numLayers) config['Network']['softmaxSamples'] = str(self.args.softmaxSamples) config['Network']['initEmbeddings'] = 
    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['watsonMode'] = str(self.args.watsonMode)
        config['General']['autoEncode'] = str(self.args.autoEncode)
        config['General']['corpus'] = str(self.args.corpus)

        config['Dataset'] = {}
        config['Dataset']['datasetTag'] = str(self.args.datasetTag)
        config['Dataset']['maxLength'] = str(self.args.maxLength)
        config['Dataset']['filterVocab'] = str(self.args.filterVocab)
        config['Dataset']['skipLines'] = str(self.args.skipLines)
        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['softmaxSamples'] = str(self.args.softmaxSamples)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['embeddingSource'] = str(self.args.embeddingSource)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)
        config['Training (won\'t be restored)']['dropout'] = str(self.args.dropout)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training, those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir

    def _getModelName(self):
        """ Parse the argument to decide where to save/load the model
        This function is called at each checkpoint and the first time the model is loaded. If the keepAll option
        is set, the globStep value will be included in the name.
        Return:
            str: The path and name where the model needs to be saved
        """
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:  # We do not erase the previously saved model by including the current step in the name
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT

    def getDevice(self):
        """ Parse the argument to decide on which device to run the model
        Return:
            str: The name of the device on which to run the program
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:  # No specified device (default)
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
            return None
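
# A minimal, self-contained sketch (not part of the original file, values made up) of
# the params.ini round trip performed by saveModelParams()/loadModelParams() above:
#
#   import configparser
#
#   config = configparser.ConfigParser()
#   config['General'] = {'version': '0.5', 'globStep': '2000'}
#   config['Network'] = {'hiddenSize': '512'}
#   with open('params.ini', 'w') as configFile:
#       config.write(configFile)
#
#   config = configparser.ConfigParser()
#   config.read('params.ini')
#   assert config['General'].getint('globStep') == 2000   # option lookup is case-insensitive
#   assert config['Network'].getint('hiddenSize') == 512
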
class Chatbot:
    """ Main class which launches the training or testing mode
    """

    class TestMode:
        """ Simple structure representing the different testing modes
        """
        INTERACTIVE = 'interactive'  # The user can write his own questions
        DAEMON = 'daemon'  # The chatbot runs in the background and can regularly be called to predict something

    def __init__(self):
        """
        """
        # Model/dataset parameters
        self.args = None  # Task specific object

        self.textData = None  # Dataset
        self.model = None  # Sequence to sequence model

        # TensorFlow utilities for convenience saving/logging
        self.writer = None
        self.saver = None
        self.modelDir = ''  # Where the model is saved
        self.globStep = 0  # Represents the number of iterations for the current model

        # TensorFlow main session (we keep track of it for the daemon)
        self.sess = None

        # Filename and directory constants
        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.4'
        self.TEST_IN_NAME = 'data/test/samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    @staticmethod
    def parseArgs(args):
        """ Parse the arguments from the given command line
        Args:
            args (list<str>): List of arguments to parse. If None, the default sys.argv will be parsed
        """
        parser = argparse.ArgumentParser()

        # Global options
        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--test',
                                nargs='?',
                                choices=[Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON],
                                default=None,
                                help='if present, launch the program to try to answer all sentences from data/test/ with'
                                     ' the defined model(s), in interactive mode, the user can write his own sentences,'
                                     ' use daemon mode to integrate the chatbot in another program')
        globalArgs.add_argument('--createDataset', action='store_true',
                                help='if present, the program will only generate the dataset from the corpus (no training/testing)')
        globalArgs.add_argument('--reset', action='store_true',
                                help='use this if you want to ignore the previous model present in the model directory (Warning: the model will be destroyed with all the folder content)')
        globalArgs.add_argument('--verbose', action='store_true',
                                help='when testing, will plot the outputs at the same time they are computed')
        globalArgs.add_argument('--debug', action='store_true',
                                help='run DeepQA with the TensorFlow debug mode. Read the TF documentation for more details on this.')
        globalArgs.add_argument('--keepAll', action='store_true',
                                help='if this option is set, all saved models will be kept (Warning: make sure you have enough free disk space or increase saveEvery)')  # TODO: Add an option to delimit the max size
        globalArgs.add_argument('--modelTag', type=str, default=None,
                                help='tag to differentiate which model to store/load')
        globalArgs.add_argument('--rootDir', type=str, default=None,
                                help='folder where to look for the models and data')
        globalArgs.add_argument('--device', type=str, default=None,
                                help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allows choosing on which hardware to run the model')

        # Dataset options
        datasetArgs = parser.add_argument_group('Dataset options')
        datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0],
                                 help='corpus on which to extract the dataset.')
        datasetArgs.add_argument('--datasetTag', type=str, default='',
                                 help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.')  # The samples are computed from the corpus if they do not exist already. They are saved in 'data/samples/'
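        # Note on the '--test' flag defined above (standard argparse behaviour, not
        # project-specific): with nargs='?', an absent flag yields the default (None),
        # '--test interactive' yields that value, and a bare '--test' falls back to
        # const, which is also None here since no const is given. The later variant of
        # this class passes const=TestMode.ALL so that a bare '--test' becomes useful.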
        datasetArgs.add_argument('--maxLength', type=int, default=10,
                                 help='maximum length of the sentence (for input and output), defines the maximum number of steps of the RNN')
        datasetArgs.add_argument('--lightweightFile', type=str, default=None,
                                 help='file containing our lightweight-formatted corpus')

        # Network options (Warning: if modifying something here, also make the change in save/loadParams())
        nnArgs = parser.add_argument_group('Network options', 'architecture related options')
        nnArgs.add_argument('--hiddenSize', type=int, default=256,
                            help='number of hidden units in each RNN cell')
        nnArgs.add_argument('--numLayers', type=int, default=2,
                            help='number of rnn layers')
        nnArgs.add_argument('--embeddingSize', type=int, default=32,
                            help='embedding size of the word representation')
        nnArgs.add_argument('--initEmbeddings', action='store_true',
                            help='if present, the program will initialize the embeddings with pre-trained word2vec vectors')
        nnArgs.add_argument('--softmaxSamples', type=int, default=0,
                            help='number of samples in the sampled softmax loss function. A value of 0 deactivates sampled softmax')

        # Training options
        trainingArgs = parser.add_argument_group('Training options')
        trainingArgs.add_argument('--numEpochs', type=int, default=1,
                                  help='maximum number of epochs to run')
        trainingArgs.add_argument('--saveEvery', type=int, default=1000,
                                  help='nb of mini-batch steps before creating a model checkpoint')
        trainingArgs.add_argument('--batchSize', type=int, default=10,
                                  help='mini-batch size')
        trainingArgs.add_argument('--learningRate', type=float, default=0.001,
                                  help='learning rate')

        return parser.parse_args(args)
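    # Usage sketch (for illustration only): parseArgs accepts an explicit argument
    # list, which is convenient for tests or notebooks:
    #
    #   args = Chatbot.parseArgs(['--modelTag', 'demo', '--keepAll', '--maxLength', '20'])
    #   assert args.modelTag == 'demo' and args.keepAll and args.maxLength == 20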
    def main(self, args=None):
        """
        Launch the training and/or the interactive mode
        """
        # General initialisation
        self.args = self.parseArgs(args)

        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()  # Use the current working directory

        #tf.logging.set_verbosity(tf.logging.INFO)  # DEBUG, INFO, WARN (default), ERROR, or FATAL

        self.loadModelParams()  # Update the self.modelDir and self.globStep, for now, not used when loading the model (but needs to be called before _getSummaryName)

        self.textData = TextData(self.args)
        # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for
        # each word of the vocabulary / decoder input
        # TODO: For now, the models are trained for a specific dataset (because of the maxLength which defines the
        # vocabulary). Add a compatibility mode which allows launching a model trained on a different vocabulary (
        # remap the word2id/id2word variables).
        if self.args.createDataset:
            print('Dataset created! Thanks for using this program')
            return  # No need to go further

        # Prepare the model
        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        # Saver/summaries
        self.writer = tf.summary.FileWriter(self._getSummaryName())
        self.saver = tf.train.Saver(max_to_keep=200, write_version=tf.train.SaverDef.V1)  # TODO: See GitHub for the format name issue (when restoring the model)

        # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the
        # dataset, otherwise everything that comes after the shuffling won't be replicable when
        # reloading the dataset). How to restore the seed after loading ?
        # Also fix the seed for random.shuffle (does it work globally for all files ?)

        # Running session
        self.sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,  # Allows a backup device for non GPU-available operations (when forcing GPU)
            log_device_placement=False)  # Too verbose ?
        )
        # TODO: Replace all sess by self.sess (not necessarily a good idea) ?

        if self.args.debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

        print('Initialize variables...')
        self.sess.run(tf.global_variables_initializer())

        # Reload the model if it exists; in testing mode, the models are not loaded here (but in predictTestset)
        self.managePreviousModel(self.sess)

        # Initialize embeddings with pre-trained word2vec vectors
        if self.args.initEmbeddings:
            print("Loading pre-trained embeddings from GoogleNews-vectors-negative300.bin")
            self.loadEmbedding(self.sess)

        if self.args.test:
            if self.args.test == Chatbot.TestMode.INTERACTIVE:
                self.mainTestInteractive(self.sess)
            elif self.args.test == Chatbot.TestMode.DAEMON:
                print('Daemon mode, running in background...')
            else:
                raise RuntimeError('Unknown test mode: {}'.format(self.args.test))  # Should never happen
        else:
            self.mainTrain(self.sess)

        if self.args.test != Chatbot.TestMode.DAEMON:
            self.sess.close()
            print("The End!")

    def mainTrain(self, sess):
        """ Training loop
        Args:
            sess: The current running session
        """
        # Specific training-dependent loading
        self.textData.makeLighter(self.args.ratioDataset)  # Limit the number of training samples

        mergedSummaries = tf.summary.merge_all()  # Define the summary operator (Warning: Won't appear on the tensorboard graph)
        if self.globStep == 0:  # Not restoring from a previous run
            self.writer.add_graph(sess.graph)  # First time only

        # If restoring a model, restore the progression bar ? and the current batch ?

        print('Start training (press Ctrl+C to save and exit)...')

        try:  # If the user exits while training, we still try to save the model
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(e + 1, self.args.numEpochs, self.args.learningRate))

                batches = self.textData.getBatches()

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2  # training, loss
                    _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    # Output training status
                    if self.globStep % 100 == 0:
                        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                        tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity))

                    # Checkpoint
                    if self.globStep % self.args.saveEvery == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()
                print("Epoch finished in {}".format(toc - tic))  # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer
        except (KeyboardInterrupt, SystemExit):  # If the user presses Ctrl+C while training
            print('Interruption detected, exiting the program...')

        self._saveSession(sess)  # Ultimate saving before the complete exit
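    # Worked example for the status line above: perplexity is exp(loss) for a
    # cross-entropy loss in nats, so a loss of 4.60 gives exp(4.60) ~ 99.5, i.e. the
    # model is roughly as uncertain as a uniform choice over ~100 words. The
    # 'loss < 300' guard only avoids the float overflow of math.exp for degenerate losses.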
    def predictTestset(self, sess):
        """ Try predicting the sentences from the samples.txt file.
        The sentences are saved in the modelDir under the same name
        Args:
            sess: The current running session
        """
        # Loading the file to predict
        with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
            lines = f.readlines()

        modelList = self._getModelList()
        if not modelList:
            print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir))
            return

        # Predicting with each model present in modelDir
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX  # We remove the model extension and add the prediction suffix
            with open(saveName, 'w') as f:
                nbIgnored = 0
                for line in tqdm(lines, desc='Sentences'):
                    question = line[:-1]  # Remove the endl character

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)
                print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

    def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        print('Testing: Launch interactive mode:')
        print('')
        print('Type \'exit\' or just press ENTER to quit the program. Have fun!')

        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            questionSeq = []  # Will contain the question as seen by the encoder
            answer = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue  # Back to the beginning, try again

            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

            if self.args.verbose:
                print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                print(self.textData.sequence2str(answer))

            print()
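    # Illustration of the predString format used in predictTestset above; with the
    # default SENTENCES_PREFIX ['Q: ', 'A: '] each prediction file contains blocks like:
    #
    #   Q: how are you ?
    #   A: i am fine .
    #   (blank line)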
    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (List<int>): output argument. If given, will contain the input batch sequence
        Return:
            list<int>: the word ids corresponding to the answer
        """
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:  # If the caller wants to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)
        answer = self.textData.deco2sentence(output)

        return answer

    def daemonPredict(self, sentence):
        """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning)
        Args:
            sentence (str): the raw input sentence
        Return:
            str: the human-readable sentence
        """
        return self.textData.sequence2str(self.singlePredict(sentence), clean=True)

    def daemonClose(self):
        """ A utility function to close the daemon when finished
        """
        print('Exiting the daemon mode...')
        self.sess.close()
        print('Daemon closed.')

    def loadEmbedding(self, sess):
        """ Initialize embeddings with pre-trained word2vec vectors
        Will modify the embedding weights of the currently loaded model
        Uses the GoogleNews pre-trained values (path hardcoded)
        """
        # Fetch embedding variables from the model
        with tf.variable_scope("embedding_rnn_seq2seq/RNN/EmbeddingWrapper", reuse=tf.AUTO_REUSE):
            em_in = tf.get_variable("word_embeddings", [self.textData.getVocabularySize(), self.args.embeddingSize])
        with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=tf.AUTO_REUSE):
            em_out = tf.get_variable("word_embeddings", [self.textData.getVocabularySize(), self.args.embeddingSize])

        # Disable training for the embeddings
        variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
        variables.remove(em_in)
        variables.remove(em_out)

        # If restoring a model, we can leave here
        if self.globStep != 0:
            return

        # New model, we load the pre-trained word2vec data and initialize the embeddings
        with open(os.path.join(self.args.rootDir, 'data/word2vec/GoogleNews-vectors-negative300.bin'), "rb", 0) as f:
            header = f.readline()
            vocab_size, vector_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * vector_size
            initW = np.random.uniform(-0.25, 0.25, (len(self.textData.word2id), vector_size))
            for line in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = b''.join(word).decode('utf-8')
                        break
                    if ch != b'\n':
                        word.append(ch)
                if word in self.textData.word2id:
                    initW[self.textData.word2id[word]] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)

        # PCA decomposition to reduce the word2vec dimensionality
        if self.args.embeddingSize < vector_size:
            U, s, Vt = np.linalg.svd(initW, full_matrices=False)
            S = np.zeros((vector_size, vector_size), dtype=np.float32)  # Singular values are real; the original complex dtype would break the float32 assign below
            S[:vector_size, :vector_size] = np.diag(s)
            initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize])

        # Initialize the input and output embeddings
        sess.run(em_in.assign(initW))
        sess.run(em_out.assign(initW))
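    # Note on the reduction above: np.linalg.svd factors initW as U @ diag(s) @ Vt with
    # singular values s in decreasing order, so U[:, :k] @ diag(s[:k]) projects each
    # 300-d word vector onto the k strongest directions, preserving as much of the
    # matrix structure as any k-dimensional linear projection can (strictly speaking,
    # PCA would first center the data; the code skips that step).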
    def managePreviousModel(self, sess):
        """ Restore or reset the model, depending on the parameters
        If the destination directory already contains some files, it will handle the conflict as follows:
         * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training
           restarts from scratch (globStep & cie reinitialized)
         * Otherwise, it will depend on the directory content. If the directory contains:
            * No model files (only summary logs): works as a reset (restart from scratch)
            * Other model files, but modelName not found (the keepAll option was probably changed): raise an error,
              the user should decide what to do
            * The right model file (and possibly some others): no problem, simply resume the training
        In any case, the directory will exist as it has been created by the summary writer
        Args:
            sess: The current running session
        """
        print('WARNING: ', end='')

        modelName = self._getModelName()
        print(modelName)

        if os.listdir(self.modelDir):
            if self.args.reset:
                print('Reset: Destroying previous model at {}'.format(self.modelDir))
            # Analysing directory content
            elif os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
            elif self._getModelList():
                print('Conflict with previous models.')
                raise RuntimeError('Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)'.format(self.modelDir))
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args.reset = True

            if self.args.reset:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from clean directory: {}'.format(self.modelDir))

    def _saveSession(self, sess):
        """ Save the model parameters and the variables
        Args:
            sess: the current session
        """
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        self.saver.save(sess, self._getModelName())  # TODO: Put a size limit (ex: 3GB for the modelDir)
        tqdm.write('Model saved.')

    def _getModelList(self):
        """ Return the list of the model files inside the model directory
        """
        return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)]
    def loadModelParams(self):
        """ Load some of the values associated with the current model, like the current globStep value
        For now, this function does not need to be called before loading the model (no parameters restored). However,
        the modelDir name will be initialized here so it is required to call this function before managePreviousModel(),
        _getModelName() or _getSummaryName()
        Warning: if you modify this function, make sure the changes mirror saveModelParams, also check if the parameters
        should be reset in managePreviousModel
        """
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if not self.args.reset and not self.args.createDataset and os.path.exists(configName):
            # Loading
            config = configparser.ConfigParser()
            config.read(configName)

            # Check the version
            currentVersion = config['General'].get('version')
            if currentVersion != self.CONFIG_VERSION:
                raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

            # Restoring the parameters
            self.globStep = config['General'].getint('globStep')
            self.args.maxLength = config['General'].getint('maxLength')  # We need to restore the model length because of the associated textData and the vocabulary size (TODO: Compatibility mode between different maxLength)
            self.args.corpus = config['General'].get('corpus')
            self.args.datasetTag = config['General'].get('datasetTag', '')
            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.initEmbeddings = config['Network'].getboolean('initEmbeddings')
            self.args.softmaxSamples = config['Network'].getint('softmaxSamples')

            # No restoring for training params, batch size or other non-model-dependent parameters

            # Show the restored params
            print()
            print('Warning: Restoring parameters:')
            print('globStep: {}'.format(self.globStep))
            print('maxLength: {}'.format(self.args.maxLength))
            print('corpus: {}'.format(self.args.corpus))
            print('datasetTag: {}'.format(self.args.datasetTag))
            print('hiddenSize: {}'.format(self.args.hiddenSize))
            print('numLayers: {}'.format(self.args.numLayers))
            print('embeddingSize: {}'.format(self.args.embeddingSize))
            print('initEmbeddings: {}'.format(self.args.initEmbeddings))
            print('softmaxSamples: {}'.format(self.args.softmaxSamples))
            print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['maxLength'] = str(self.args.maxLength)
        config['General']['corpus'] = str(self.args.corpus)
        config['General']['datasetTag'] = str(self.args.datasetTag)
        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['softmaxSamples'] = str(self.args.softmaxSamples)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        """ Parse the argument to decide where to save the summary, at the same place as the model
        The folder could already contain logs if we restore the training, those will be merged
        Return:
            str: The path and name of the summary
        """
        return self.modelDir

    def _getModelName(self):
        """ Parse the argument to decide where to save/load the model
        This function is called at each checkpoint and the first time the model is loaded. If the keepAll option
        is set, the globStep value will be included in the name.
        Return:
            str: The path and name where the model needs to be saved
        """
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:  # We do not erase the previously saved model by including the current step in the name
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT
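    # Example of the resulting checkpoint names (made-up values): with modelTag 'demo'
    # and globStep 2000, _getModelName() returns 'save/model-demo/model-2000.ckpt' when
    # --keepAll is set, and 'save/model-demo/model.ckpt' (overwritten at each
    # checkpoint) otherwise.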
    def getDevice(self):
        """ Parse the argument to decide on which device to run the model
        Return:
            str: The name of the device on which to run the program
        """
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:  # No specified device (default)
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
            return None
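
# Hedged usage sketch (not part of the original file; assumes this module is importable
# as `chatbot`, which is a guess): how a host application could drive the daemon mode
# exposed by the class above.
#
#   from chatbot import Chatbot
#
#   bot = Chatbot()
#   bot.main(['--test', 'daemon', '--modelTag', 'demo'])   # loads the model, keeps the session open
#   print(bot.daemonPredict('Hello there'))                # one cleaned answer per call
#   bot.daemonClose()                                      # releases the TensorFlow session
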
class Chatbot:
    class TestMode:
        ALL = 'all'
        INTERACTIVE = 'interactive'
        DAEMON = 'daemon'  # Referenced by parseArgs() and main() below; it was missing from this condensed variant

    def __init__(self):
        self.args = None
        self.textData = None
        self.model = None
        self.writer = None
        self.saver = None
        self.modelDir = ''
        self.globStep = 0
        self.sess = None

        self.MODEL_DIR_BASE = 'save' + os.sep + 'model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.5'
        self.TEST_IN_NAME = 'data' + os.sep + 'test' + os.sep + 'samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    @staticmethod
    def parseArgs(args):
        parser = argparse.ArgumentParser()

        globalArgs = parser.add_argument_group('Global options')
        globalArgs.add_argument('--test',
                                nargs='?',
                                choices=[Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON],
                                const=Chatbot.TestMode.ALL,
                                default=None,
                                help='if present, launch the program to try to answer all sentences from data/test/ with'
                                     ' the defined model(s), in interactive mode, the user can write his own sentences,'
                                     ' use daemon mode to integrate the chatbot in another program')
        # The remaining argument definitions are omitted in this condensed variant
        nnArgs = parser.add_argument_group('Network options', 'architecture related options')
        trainingArgs = parser.add_argument_group('Training options')

        return parser.parse_args(args)

    def main(self, args=None):
        print('TensorFlow detected: v{}'.format(tf.__version__))

        self.args = self.parseArgs(args)
        if not self.args.rootDir:
            self.args.rootDir = os.getcwd()

        self.loadModelParams()

        self.textData = TextData(self.args)
        if self.args.createDataset:
            return

        with tf.device(self.getDevice()):
            self.model = Model(self.args, self.textData)

        self.writer = tf.summary.FileWriter(self._getSummaryName())
        self.saver = tf.train.Saver(max_to_keep=200)

        self.sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False))

        if self.args.debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

        print('Initialize variables...')
        self.sess.run(tf.global_variables_initializer())

        # In TestMode.ALL, the models are restored one by one inside predictTestset()
        if self.args.test != Chatbot.TestMode.ALL:
            self.managePreviousModel(self.sess)

        if self.args.initEmbeddings:
            self.loadEmbedding(self.sess)

        if self.args.test:
            if self.args.test == Chatbot.TestMode.INTERACTIVE:
                self.mainTestInteractive(self.sess)
            elif self.args.test == Chatbot.TestMode.ALL:
                print('Start predicting...')
                self.predictTestset(self.sess)
                print('All predictions done')
            elif self.args.test == Chatbot.TestMode.DAEMON:
                print('Daemon mode, running in background...')
            else:
                raise RuntimeError('Unknown test mode: {}'.format(self.args.test))
        else:
            self.mainTrain(self.sess)

        if self.args.test != Chatbot.TestMode.DAEMON:
            self.sess.close()
            print("The End! Thanks for using this program")
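    # Note: with '--test' given bare on the command line, nargs='?' falls back to
    # const=TestMode.ALL, so plain '--test' predicts data/test/samples.txt against
    # every checkpoint kept in the model directory (see predictTestset below).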
    def mainTrain(self, sess):
        self.textData.makeLighter(self.args.ratioDataset)

        mergedSummaries = tf.summary.merge_all()
        if self.globStep == 0:
            self.writer.add_graph(sess.graph)

        print('Start training (press Ctrl+C to save and exit)...')

        try:
            for e in range(self.args.numEpochs):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(e + 1, self.args.numEpochs, self.args.learningRate))

                batches = self.textData.getBatches()

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2
                    _, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    if self.globStep % 100 == 0:
                        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                        tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % (self.globStep, loss, perplexity))

                    if self.globStep % self.args.saveEvery == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()
                print("Epoch finished in {}".format(toc - tic))
        except (KeyboardInterrupt, SystemExit):
            print('Interruption detected, exiting the program...')

        self._saveSession(sess)

    def predictTestset(self, sess):
        with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f:
            lines = f.readlines()

        modelList = self._getModelList()
        if not modelList:
            print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir))
            return

        for modelName in sorted(modelList):
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX
            with open(saveName, 'w') as f:
                nbIgnored = 0
                for line in tqdm(lines, desc='Sentences'):
                    question = line[:-1]

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue

                    predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX)
                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)
                print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines)))

    def mainTestInteractive(self, sess):
        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break

            questionSeq = []
            answer = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue

            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

            if self.args.verbose:
                print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                print(self.textData.sequence2str(answer))

            print()
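    # Caveat illustrated (this is why the earlier variant carries a 'TODO: Natural
    # sorting' next to the same loop): sorted() on checkpoint paths is lexicographic,
    # so with --keepAll the models are visited as
    #   model-1000.ckpt, model-2000.ckpt, model-500.ckpt
    # rather than in numeric step order.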
    def singlePredict(self, question, questionSeq=None):
        # Create the input batch
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)
        answer = self.textData.deco2sentence(output)

        return answer

    def daemonPredict(self, sentence):
        return self.textData.sequence2str(self.singlePredict(sentence), clean=True)

    def daemonClose(self):
        print('Exiting the daemon mode...')
        self.sess.close()
        print('Daemon closed.')

    def loadEmbedding(self, sess):
        # Fetch embedding variables from the model
        with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True):
            em_in = tf.get_variable("embedding")
        with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True):
            em_out = tf.get_variable("embedding")

        # Disable training for the embeddings
        variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)
        variables.remove(em_in)
        variables.remove(em_out)

        # If restoring a model, we can leave here
        if self.globStep != 0:
            return

        # New model, load the pre-trained word embeddings
        embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource)
        embeddings_format = os.path.splitext(embeddings_path)[1][1:]
        print("Loading pre-trained word embeddings from %s " % embeddings_path)
        with open(embeddings_path, "rb") as f:
            header = f.readline()
            vocab_size, vector_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * vector_size
            initW = np.random.uniform(-0.25, 0.25, (len(self.textData.word2id), vector_size))
            for line in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = b''.join(word).decode('utf-8')
                        break
                    if ch != b'\n':
                        word.append(ch)
                if word in self.textData.word2id:
                    if embeddings_format == 'bin':
                        vector = np.fromstring(f.read(binary_len), dtype='float32')
                    elif embeddings_format == 'vec':
                        vector = np.fromstring(f.readline(), sep=' ', dtype='float32')
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)
                    initW[self.textData.word2id[word]] = vector
                else:
                    if embeddings_format == 'bin':
                        f.read(binary_len)
                    elif embeddings_format == 'vec':
                        f.readline()
                    else:
                        raise Exception("Unknown format for embeddings: %s " % embeddings_format)

        # PCA decomposition to reduce the embedding dimensionality if needed
        if self.args.embeddingSize < vector_size:
            U, s, Vt = np.linalg.svd(initW, full_matrices=False)
            S = np.zeros((vector_size, vector_size), dtype=np.float32)  # Singular values are real; the original complex dtype would break the float32 assign below
            S[:vector_size, :vector_size] = np.diag(s)
            initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize])

        # Initialize the input and output embeddings
        sess.run(em_in.assign(initW))
        sess.run(em_out.assign(initW))
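    # File-format note for loadEmbedding above (standard word2vec conventions): both
    # formats start with a text header 'vocab_size vector_size\n'. In '.bin' each entry
    # is the word, a space, then vector_size raw float32 values; in '.vec' each entry
    # is a text line of space-separated numbers. The word itself is read byte by byte
    # up to the separating space in both cases.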
    def managePreviousModel(self, sess):
        print('WARNING: ', end='')

        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            if self.args.reset:
                print('Reset: Destroying previous model at {}'.format(self.modelDir))
            elif os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)
            elif self._getModelList():
                print('Conflict with previous models.')
                raise RuntimeError('Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)'.format(self.modelDir))
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))
                self.args.reset = True

            if self.args.reset:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from clean directory: {}'.format(self.modelDir))

    def _saveSession(self, sess):
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        model_name = self._getModelName()
        with open(model_name, 'w') as f:  # Simulate the old model existence to avoid rewriting the file parser
            f.write('This file is used internally by DeepQA to check the model existence. Please do not remove.\n')
        self.saver.save(sess, model_name)
        tqdm.write('Model saved.')

    def _getModelList(self):
        return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)]

    def loadModelParams(self):
        # Compute the current model path
        self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE)
        if self.args.modelTag:
            self.modelDir += '-' + self.args.modelTag

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if not self.args.reset and not self.args.createDataset and os.path.exists(configName):
            config = configparser.ConfigParser()
            config.read(configName)

            # Check the version
            currentVersion = config['General'].get('version')
            if currentVersion != self.CONFIG_VERSION:
                raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName))

            # Restoring the parameters
            self.globStep = config['General'].getint('globStep')
            self.args.watsonMode = config['General'].getboolean('watsonMode')
            self.args.autoEncode = config['General'].getboolean('autoEncode')
            self.args.corpus = config['General'].get('corpus')

            self.args.datasetTag = config['Dataset'].get('datasetTag')
            self.args.maxLength = config['Dataset'].getint('maxLength')
            self.args.filterVocab = config['Dataset'].getint('filterVocab')
            self.args.skipLines = config['Dataset'].getboolean('skipLines')
            self.args.vocabularySize = config['Dataset'].getint('vocabularySize')

            self.args.hiddenSize = config['Network'].getint('hiddenSize')
            self.args.numLayers = config['Network'].getint('numLayers')
            self.args.softmaxSamples = config['Network'].getint('softmaxSamples')
            self.args.initEmbeddings = config['Network'].getboolean('initEmbeddings')
            self.args.embeddingSize = config['Network'].getint('embeddingSize')
            self.args.embeddingSource = config['Network'].get('embeddingSource')

            # Show the restored params
            print()
            print('Warning: Restoring parameters:')
            print('globStep: {}'.format(self.globStep))
            print('watsonMode: {}'.format(self.args.watsonMode))
            print('autoEncode: {}'.format(self.args.autoEncode))
            print('corpus: {}'.format(self.args.corpus))
            print('datasetTag: {}'.format(self.args.datasetTag))
            print('maxLength: {}'.format(self.args.maxLength))
            print('filterVocab: {}'.format(self.args.filterVocab))
            print('skipLines: {}'.format(self.args.skipLines))
            print('vocabularySize: {}'.format(self.args.vocabularySize))
            print('hiddenSize: {}'.format(self.args.hiddenSize))
            print('numLayers: {}'.format(self.args.numLayers))
            print('softmaxSamples: {}'.format(self.args.softmaxSamples))
            print('initEmbeddings: {}'.format(self.args.initEmbeddings))
            print('embeddingSize: {}'.format(self.args.embeddingSize))
            print('embeddingSource: {}'.format(self.args.embeddingSource))
            print()

        # For now, no arbitrary independent maxLength between encoder and decoder
        self.args.maxLengthEnco = self.args.maxLength
        self.args.maxLengthDeco = self.args.maxLength + 2

        if self.args.watsonMode:
            self.SENTENCES_PREFIX.reverse()
    def saveModelParams(self):
        config = configparser.ConfigParser()
        config['General'] = {}
        config['General']['version'] = self.CONFIG_VERSION
        config['General']['globStep'] = str(self.globStep)
        config['General']['watsonMode'] = str(self.args.watsonMode)
        config['General']['autoEncode'] = str(self.args.autoEncode)
        config['General']['corpus'] = str(self.args.corpus)

        config['Dataset'] = {}
        config['Dataset']['datasetTag'] = str(self.args.datasetTag)
        config['Dataset']['maxLength'] = str(self.args.maxLength)
        config['Dataset']['filterVocab'] = str(self.args.filterVocab)
        config['Dataset']['skipLines'] = str(self.args.skipLines)
        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)

        config['Network'] = {}
        config['Network']['hiddenSize'] = str(self.args.hiddenSize)
        config['Network']['numLayers'] = str(self.args.numLayers)
        config['Network']['softmaxSamples'] = str(self.args.softmaxSamples)
        config['Network']['initEmbeddings'] = str(self.args.initEmbeddings)
        config['Network']['embeddingSize'] = str(self.args.embeddingSize)
        config['Network']['embeddingSource'] = str(self.args.embeddingSource)

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {}
        config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
        config['Training (won\'t be restored)']['batchSize'] = str(self.args.batchSize)
        config['Training (won\'t be restored)']['dropout'] = str(self.args.dropout)

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getSummaryName(self):
        return self.modelDir

    def _getModelName(self):
        modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE)
        if self.args.keepAll:
            modelName += '-' + str(self.globStep)
        return modelName + self.MODEL_EXT

    def getDevice(self):
        if self.args.device == 'cpu':
            return '/cpu:0'
        elif self.args.device == 'gpu':
            return '/gpu:0'
        elif self.args.device is None:
            return None
        else:
            print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
            return None
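
# Self-contained sketch (illustration only, independent of the classes above) of the
# word2vec '.bin' parsing loop used by loadEmbedding(): build a two-word file in memory
# and read it back the same way the method does.
#
#   import io
#   import numpy as np
#
#   buf = io.BytesIO()
#   buf.write(b'2 3\n')   # header: vocab_size vector_size
#   for word, vec in [(b'hello', [1., 2., 3.]), (b'world', [4., 5., 6.])]:
#       buf.write(word + b' ' + np.asarray(vec, dtype='float32').tobytes())
#   buf.seek(0)
#
#   vocab_size, vector_size = map(int, buf.readline().split())
#   binary_len = np.dtype('float32').itemsize * vector_size
#   for _ in range(vocab_size):
#       word = b''
#       while True:
#           ch = buf.read(1)
#           if ch == b' ':
#               break
#           if ch != b'\n':   # skip the newline some files put after each vector
#               word += ch
#       vector = np.frombuffer(buf.read(binary_len), dtype='float32')
#       print(word.decode('utf-8'), vector)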