def preprocess(self):
    """Run the preprocessing phase and return the generated learning data.

    Reads the raw input text, builds the char->int and int->char lookup
    tables (persisting both to their configured checkpoint files) and
    generates the training patterns.

    Returns:
        Tuple of (X, Y, char2intDict, int2CharDict).
    """
    print('Starting Preprocessing Phase...')
    started_at = datetime.datetime.now()

    raw_data = FileHelper.read_data_lower(
        self.config['preprocessing']['input_file'])

    # Model the characters as integers and checkpoint both mappings.
    char2intDict = self.generate_char_to_int_dictionary(raw_data)
    FileHelper.save_object_to_file(
        self.config['preprocessing']['checkpoints']['char2intDict_file'],
        char2intDict)

    int2CharDict = self.generate_int_to_char_dictionary(raw_data)
    FileHelper.save_object_to_file(
        self.config['preprocessing']['checkpoints']['int2charDict_file'],
        int2CharDict)

    # Generate the text patterns used as training samples and labels.
    X, Y = self.generate_training_patterns(raw_data, char2intDict)

    elapsed = datetime.datetime.now() - started_at
    print('Preprocessing finished: %ds' % elapsed.total_seconds())
    return X, Y, char2intDict, int2CharDict
def generate_char_to_int_dictionary(self, data):
    """Create the mapping of unique characters to unique integers.

    As a side effect, persists the sorted vocabulary to the configured
    checkpoint file.

    Args:
        data: the raw input text (string).

    Returns:
        dict mapping each unique character to its index in the sorted
        vocabulary.
    """
    # sorted() accepts any iterable, so the set can be sorted directly —
    # no intermediate list copy needed. A set keeps only unique characters.
    vocab = sorted(set(data))
    FileHelper.save_object_to_file(
        self.config['preprocessing']['checkpoints']['vocabulary_file'],
        vocab)

    # NOTE: fixed typo in the log message ("Vocabular" -> "Vocabulary").
    print(
        'Input data consists of %d Total Characters and a Vocabulary of %d Characters'
        % (len(data), len(vocab)))

    # Dict comprehension is the idiomatic form of dict((k, v) for ...).
    return {character: index for index, character in enumerate(vocab)}
def main():
    """Entry point: obtain the training data, build/train the LSTM model
    and optionally generate text from it."""
    config = FileHelper.load_config('config.json')
    seq_length = config['preprocessing']['sequence_chars_length']
    checkpoints = config['preprocessing']['checkpoints']

    # Either run preprocessing from scratch (checkpointing its results)
    # or restore the previously checkpointed data from disk.
    if config['preprocessing']['exec_preprocessing']:
        preprocessing = Preprocessing(config)
        X, Y, char2intDict, int2charDict = preprocessing.preprocess()
        FileHelper.save_object_to_file(checkpoints['X_file'], X)
        FileHelper.save_object_to_file(checkpoints['Y_file'], Y)
    else:
        X = FileHelper.load_object_from_file(checkpoints['X_file'])
        Y = FileHelper.load_object_from_file(checkpoints['Y_file'])
        char2intDict = FileHelper.load_object_from_file(
            checkpoints['char2intDict_file'])
        int2charDict = FileHelper.load_object_from_file(
            checkpoints['int2charDict_file'])

    # The vocabulary checkpoint is written during preprocessing, so it is
    # available in both branches above.
    vocabulary = FileHelper.load_object_from_file(
        checkpoints['vocabulary_file'])

    # Keep the unshaped version of X — it is needed for seed generation later.
    X_unshaped = X

    # The LSTM expects input shaped as [samples, timesteps, features].
    X = numpy.reshape(X, (len(X), seq_length, 1))
    # Normalize/rescale all integers to the range 0-1.
    X = X / float(len(vocabulary))
    # One-hot encode the categorical output variables
    # (vector of zeros with a single 1 --> 0..N-1 categories).
    Y = np_utils.to_categorical(Y)

    training = Training(config)
    model = training.define_model(X, Y)

    if config['training']['exec_training']:
        model = training.train(X, Y, char2intDict, vocabulary, model)
    else:
        # Just restore the previously trained weights for the model.
        model.load_weights(config['training']['load_weights_filename'])
        model.compile(loss='categorical_crossentropy', optimizer='adam')

    if config['generation']['exec_generation']:
        # Random seed used as the starting value for text generation.
        seed = generate_random_seed(X_unshaped)
        generatedText = generate_text(
            config['generation']['text_chars_length'], int2charDict,
            vocabulary, seed, model)

        # Save the generated text to a timestamped output file.
        outputFilename = config['generation']['foldername'] + '/' + \
            datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S') + '.txt'
        FileHelper.write_data(outputFilename, generatedText)