def test_preprocess_deu_eng_none_length(self):
        '''
        Check preprocessing of Deu-Eng pairs when no length limit is given.

        None is passed as the second argument to preprocess; the test
        expects every sentence pair of the input to be returned.
        '''
        # Raw input: SENTENCE [TAB] TRANSLATION [TAB] LICENSE INFO, one pair per line
        raw_pairs = (
            'Go.\tGeh.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272\n'
            'Hi.\tHallo!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123\n'
            "I'm game.\tIch bin dabei.\tCC-BY 2.0 (France)\n"
            'Please keep me updated.\tBitte halten Sie mich auf dem Laufenden.\tCC-BY\n'
        )

        # Normalized tokens expected for the first column of each line
        wanted_destination = [
            ['go'],
            ['hi'],
            ['im', 'game'],
            ['please', 'keep', 'me', 'updated'],
        ]
        # Normalized tokens expected for the second column of each line
        wanted_target = [
            ['geh'],
            ['hallo'],
            ['ich', 'bin', 'dabei'],
            ['bitte', 'halten', 'sie', 'mich', 'auf', 'dem', 'laufenden'],
        ]

        # No maximum length: all sentences must come back
        actual_destination, actual_target = TextPreprocessor.preprocess(
            raw_pairs, None)

        self.assertEqual(actual_destination, wanted_destination)
        self.assertEqual(actual_target, wanted_target)
    def test_preprocess_nietzsche_zarathustra(self):
        '''
        Check German preprocessing on an inline excerpt of Zarathustra.

        Only the first two normalized sentences are compared.
        '''
        # Inline excerpt, because Zarathustra is not part of the NLTK corpus
        excerpt = (
            '[5/0011] '
            'I. '
            'Als Zarathustra dreissig Jahr alt war, verliess er '
            'seine Heimat und den See seiner Heimat und gieng in '
            'das Gebirge. Hier genoss er seines Geistes und seiner '
            'Einsamkeit und wurde dessen zehn Jahre nicht müde. '
            'Endlich aber verwandelte sich sein Herz, — und eines '
            'Morgens stand er mit der Morgenröthe auf, trat vor '
            'die Sonne hin und sprach zu ihr also:'
        )

        # Expected tokens for the normalized sentences at index 0 and 1
        expected_first = [
            'i', 'zarathustra', 'dreissig', 'jahr', 'alt', 'verliess',
            'heimat', 'see', 'heimat', 'gieng', 'gebirge',
        ]
        expected_second = [
            'genoss', 'geistes', 'einsamkeit', 'wurde', 'zehn', 'jahre',
            'müde',
        ]

        normalized = TextPreprocessor.preprocess(excerpt, "german")

        # Compare sentence by sentence, in index order
        for index, expected in enumerate((expected_first, expected_second)):
            self.assertEqual(normalized[index], expected)
Beispiel #3
0
    def prepare_data(cls, input_file=None, language="english", vocabulary_size=10000):
        '''
        Read an input file, normalize its text and map it to a dictionary.

        input_file: input file handed to TextPreprocessor.read_file.
        language: language name forwarded to TextPreprocessor.preprocess.
        vocabulary_size: vocabulary size forwarded to
                         TextPreprocessor.convert_to_dictionary.

        Returns the result of TextPreprocessor.convert_to_dictionary
        (mapping, dict and reverse_dict according to the original comment).
        '''
        # Step 1: read the raw document from disk
        raw_text = TextPreprocessor.read_file(input_file)

        # Step 2: preprocess and normalize the document
        normalized = TextPreprocessor.preprocess(raw_text, language)

        # Step 3: build the mapping from the normalized document
        mapped = TextPreprocessor.convert_to_dictionary(normalized, vocabulary_size)
        return mapped
Beispiel #4
0
    def prepare_data(cls,
                     input_file,
                     max_vocab_size=None,
                     dest_max_length=None) -> Dict:
        '''
        Read a sentence-pair file and convert both sides to padded, reshaped data.

        input_file: File document, having this structure:
                   DESTINATION_SENTENCE [TAB] TARGET_SENTENCE [TAB] ADD_INFO [NEWLINE]
        max_vocab_size: maximal vocabulary size to use.
        dest_max_length: Max length for DESTINATION_SENTENCE

        Returns a dict holding, for each of the prefixes "destination" and
        "target", the keys PREFIX_data, PREFIX_tokenizer, PREFIX_vocab_size
        and PREFIX_max_length.
        '''
        # Read the raw document and split it into normalized sentence lists
        raw_text = text_pre.read_file(input_file)
        dest_sentences, targ_sentences = text_pre.preprocess(
            raw_text, dest_max_length)

        # Vectorize both sides first, then unpack the two results
        dest_converted = text_pre.convert_to_dictionary(dest_sentences,
                                                        max_vocab_size)
        targ_converted = text_pre.convert_to_dictionary(targ_sentences,
                                                        max_vocab_size)
        dest_ids, dest_tokenizer, dest_vocab, dest_len = dest_converted
        targ_ids, targ_tokenizer, targ_vocab, targ_len = targ_converted

        # Pad each side to its own maximal length
        dest_ids = text_pre.add_padding(dest_ids, dest_len)
        targ_ids = text_pre.add_padding(targ_ids, targ_len)

        # Sequential models need a 3-dimensional input:
        #  TrainSize x PadLength x 1 (word-int)
        dest_ids = dest_ids.reshape((-1, dest_ids.shape[-1], 1))
        # Sparse_categorical_crossentropy requires 3-dimensional labels,
        # so a trailing axis of size 1 is appended
        targ_ids = targ_ids.reshape(*targ_ids.shape, 1)

        # Destination entries first, then target entries
        return {
            'destination_data': dest_ids,
            'destination_tokenizer': dest_tokenizer,
            'destination_vocab_size': dest_vocab,
            'destination_max_length': dest_len,
            'target_data': targ_ids,
            'target_tokenizer': targ_tokenizer,
            'target_vocab_size': targ_vocab,
            'target_max_length': targ_len,
        }
    def test_preprocess_deu_eng_from_file(self):
        '''
        Check preprocessing of Deu-Eng pairs that are read from a file.

        Only the first four sentence pairs are compared.
        '''
        # Read the sentence pairs from the file at self.FILE_PATH
        raw_pairs = TextPreprocessor.read_file(self.FILE_PATH)

        # Expected tokens of the first four pairs
        wanted_destination = [['go'], ['hi'], ['hi'], ['run']]
        wanted_target = [['geh'], ['hallo'], ['grüß', 'gott'], ['lauf']]

        # preprocess is called without a second argument here
        actual_destination, actual_target = TextPreprocessor.preprocess(raw_pairs)

        self.assertEqual(actual_destination[0:4], wanted_destination)
        self.assertEqual(actual_target[0:4], wanted_target)
    def test_preprocess_king_james_bible(self):
        '''
        Check English preprocessing on the first sentences of the King James Bible.

        The sentences come from gutenberg.sents('bible-kjv.txt'); only the
        first eleven are preprocessed, and indices 1 and 10 are compared.
        '''
        # Sentences of the King James Bible
        sentences = gutenberg.sents('bible-kjv.txt')

        # Expected tokens, keyed by position in the normalized result
        expected_by_index = (
            (1, ['old', 'testament', 'king', 'james', 'bible']),
            (10, [
                'god', 'said', 'let', 'firmament', 'midst', 'waters', 'let',
                'divide', 'waters', 'waters',
            ]),
        )

        normalized = TextPreprocessor.preprocess(sentences[:11], "english")

        # Compare in ascending index order
        for index, expected in expected_by_index:
            self.assertEqual(normalized[index], expected)
    def test_preprocess_nietzsche_zarathustra_from_file(self):
        '''
        Check German preprocessing on Zarathustra read from a file.

        The normalized sentences at index 19 and 20 are compared.
        '''
        # Zarathustra is read from self.FILE_PATH (it is not in the NLTK corpus)
        document = TextPreprocessor.read_file(self.FILE_PATH)

        # Expected tokens, keyed by position in the normalized result
        expected_by_index = (
            (19, [
                'i', 'zarathustra', 'dreissig', 'jahr', 'alt', 'verliess',
                'heimat', 'see', 'heimat', 'gieng', 'gebirge',
            ]),
            (20, [
                'genoss', 'geistes', 'einsamkeit', 'wurde', 'zehn', 'jahre',
                'müde',
            ]),
        )

        normalized = TextPreprocessor.preprocess(document, "german")

        # Compare in ascending index order
        for index, expected in expected_by_index:
            self.assertEqual(normalized[index], expected)