Code example #1
0
    def preprocess(self):
        """Execute the preprocessing phase that generates the training data.

        Reads the raw input text (lower-cased), builds the character<->integer
        mapping dictionaries, persists them to the configured checkpoint
        files, and generates the (input, target) training patterns.

        Returns:
            tuple: ``(X, Y, char2intDict, int2CharDict)`` where ``X``/``Y``
            are the training patterns and the dicts map characters to
            integers and back.
        """
        print('Starting Preprocessing Phase...')
        start = datetime.datetime.now()

        raw_data = FileHelper.read_data_lower(
            self.config['preprocessing']['input_file'])

        # Model the characters as integers; persist both mappings so a later
        # run can skip preprocessing and restore them from the checkpoints.
        char2intDict = self.generate_char_to_int_dictionary(raw_data)
        FileHelper.save_object_to_file(
            self.config['preprocessing']['checkpoints']['char2intDict_file'],
            char2intDict)
        int2CharDict = self.generate_int_to_char_dictionary(raw_data)
        FileHelper.save_object_to_file(
            self.config['preprocessing']['checkpoints']['int2charDict_file'],
            int2CharDict)

        # Generate the text patterns used for supervised learning
        X, Y = self.generate_training_patterns(raw_data, char2intDict)

        deltaTime = datetime.datetime.now() - start
        print('Preprocessing finished: %ds' % deltaTime.total_seconds())

        return X, Y, char2intDict, int2CharDict
Code example #2
0
    def generate_char_to_int_dictionary(self, data):
        """Create and persist the mapping of unique characters to integers.

        Args:
            data: Raw input text whose distinct characters form the
                vocabulary.

        Returns:
            dict: Maps each vocabulary character to its index within the
            sorted vocabulary.
        """
        # sorted() accepts any iterable, so the set of unique characters can
        # be sorted directly into the vocabulary list (no intermediate list).
        vocab = sorted(set(data))
        FileHelper.save_object_to_file(
            self.config['preprocessing']['checkpoints']['vocabulary_file'],
            vocab)

        # Fixed typo in the status message ("Vocabular" -> "Vocabulary").
        print(
            'Input data consists of %d Total Characters and a Vocabulary of %d Characters'
            % (len(data), len(vocab)))
        return {character: index for index, character in enumerate(vocab)}
Code example #3
0
def main():
    """Entry point: obtain the preprocessed data, build the model, train it
    (or restore saved weights), and optionally generate text to a file.
    """
    config = FileHelper.load_config('config.json')
    seq_length = config['preprocessing']['sequence_chars_length']
    checkpoints = config['preprocessing']['checkpoints']

    # Either run preprocessing from scratch (checkpointing the results) or
    # restore the previously checkpointed data from disk.
    if config['preprocessing']['exec_preprocessing']:
        preprocessing = Preprocessing(config)
        X, Y, char2intDict, int2charDict = preprocessing.preprocess()
        FileHelper.save_object_to_file(checkpoints['X_file'], X)
        FileHelper.save_object_to_file(checkpoints['Y_file'], Y)
    else:
        X = FileHelper.load_object_from_file(checkpoints['X_file'])
        Y = FileHelper.load_object_from_file(checkpoints['Y_file'])
        char2intDict = FileHelper.load_object_from_file(
            checkpoints['char2intDict_file'])
        int2charDict = FileHelper.load_object_from_file(
            checkpoints['int2charDict_file'])

    vocabulary = FileHelper.load_object_from_file(
        checkpoints['vocabulary_file'])

    # Keep the unshaped version of X around — text generation needs it later
    X_unshaped = X

    # Reshape into the [samples, timesteps, features] layout the LSTM expects
    X = numpy.reshape(X, (len(X), seq_length, 1))
    # Rescale the integer codes into the 0-1 range
    X = X / float(len(vocabulary))
    # One-hot encode the categorical output variables (vector of zeros with
    # a single 1 per sample, covering categories 0..N-1)
    Y = np_utils.to_categorical(Y)

    training = Training(config)
    model = training.define_model(X, Y)

    if config['training']['exec_training']:
        model = training.train(X, Y, char2intDict, vocabulary, model)
    else:
        # Skip training: restore previously trained weights instead
        model.load_weights(config['training']['load_weights_filename'])
        model.compile(loss='categorical_crossentropy', optimizer='adam')

    if config['generation']['exec_generation']:
        # A random seed sequence is the starting point for text generation
        seed = generate_random_seed(X_unshaped)
        generated_text = generate_text(
            config['generation']['text_chars_length'], int2charDict,
            vocabulary, seed, model)

        # Persist the generated text under a timestamped filename
        output_filename = config['generation']['foldername'] + '/' + \
            datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S') + '.txt'
        FileHelper.write_data(output_filename, generated_text)