def test_read_file(self):
        '''
        Testing File reading.
        '''
        expected = ('Ein Buch\n' 'für\n' 'Alle und Keinen')
        doc = TextPreprocessor.read_file(self.FILE_PATH)

        self.assertEqual(len(doc), 120483)
        self.assertEqual(doc[92:120], expected)
    def test_read(self):
        """Reading the tab-separated corpus yields the expected length and slice."""
        document = TextPreprocessor.read_file(self.FILE_PATH)

        head = 'Hallo!\tCC-BY 2.0 (France) Att'
        self.assertEqual(len(document), 34382099)
        self.assertEqual(document[91:120], head)
# Example #3 (score: 0)
    def prepare_data(cls, input_file=None, language="english", vocabulary_size=10000):
        """Read *input_file*, normalize it, and convert it to a mapped dictionary.

        input_file: path of the raw text document to load.
        language: language used during preprocessing/normalization.
        vocabulary_size: upper bound for the vocabulary to build.
        """
        # Raw text from disk -> normalized token stream.
        raw_text = TextPreprocessor.read_file(input_file)
        normalized = TextPreprocessor.preprocess(raw_text, language)

        # The converter returns the mapping together with dict / reverse_dict.
        return TextPreprocessor.convert_to_dictionary(normalized, vocabulary_size)
# Example #4 (score: 0)
    def prepare_data(cls,
                     input_file,
                     max_vocab_size=None,
                     dest_max_length=None) -> Dict:
        """Preprocess *input_file* and return a dict of vectorized training data.

        input_file: file document with this line structure:
                   DESTINATION_SENTENCE [TAB] TARGET_SENTENCE [TAB] ADD_INFO [NEWLINE]
        max_vocab_size: maximal vocabulary size to use.
        dest_max_length: max length for DESTINATION_SENTENCE.

        Returns a dict keyed '<side>_data', '<side>_tokenizer',
        '<side>_vocab_size', '<side>_max_length' for the sides
        'destination' and 'target'.
        """
        # Load the raw corpus from disk.
        raw_doc = text_pre.read_file(input_file)

        # Normalize; preprocess() splits the corpus into the two sentence sides.
        norm_dest, norm_targ = text_pre.preprocess(raw_doc, dest_max_length)

        # Vectorize each side: (data, tokenizer, vocab_size, max_length).
        dest_data, dest_tok, dest_vocab_size, dest_length = \
            text_pre.convert_to_dictionary(norm_dest, max_vocab_size)
        targ_data, targ_tok, targ_vocab_size, targ_length = \
            text_pre.convert_to_dictionary(norm_targ, max_vocab_size)

        # Pad both vectors to each side's maximum sentence length.
        dest_data = text_pre.add_padding(dest_data, dest_length)
        targ_data = text_pre.add_padding(targ_data, targ_length)

        # Reshape input — sequential models need a 3-dimensional input:
        #  TrainSize x PadLength x 1 (word-int)
        dest_data = dest_data.reshape((-1, dest_data.shape[-1], 1))
        # sparse_categorical_crossentropy requires the labels in 3 dimensions too.
        targ_data = targ_data.reshape(*targ_data.shape, 1)

        # Flatten both per-side tuples into one result dictionary.
        sides = {
            'destination': (dest_data, dest_tok, dest_vocab_size, dest_length),
            'target': (targ_data, targ_tok, targ_vocab_size, targ_length),
        }
        suffixes = ("_data", "_tokenizer", "_vocab_size", "_max_length")
        return {
            side + suffix: part
            for side, parts in sides.items()
            for suffix, part in zip(suffixes, parts)
        }
    def test_preprocess_deu_eng_from_file(self):
        """Preprocessing splits Deu-Eng pairs read from a file into both sides."""
        raw_pairs = TextPreprocessor.read_file(self.FILE_PATH)

        want_destination = [['go'], ['hi'], ['hi'], ['run']]
        want_target = [['geh'], ['hallo'], ['grüß', 'gott'], ['lauf']]

        dest_side, targ_side = TextPreprocessor.preprocess(raw_pairs)

        self.assertEqual(dest_side[:4], want_destination)
        self.assertEqual(targ_side[:4], want_target)
    def test_preprocess_nietzsche_zarathustra_from_file(self):
        """Preprocessing normalizes Zarathustra sentences loaded from a file."""
        # Zarathustra is not part of the NLTK corpus, so read it from disk.
        zarathustra_text = TextPreprocessor.read_file(self.FILE_PATH)

        want_nineteen = [
            'i', 'zarathustra', 'dreissig', 'jahr', 'alt', 'verliess',
            'heimat', 'see', 'heimat', 'gieng', 'gebirge'
        ]
        want_twenty = [
            'genoss', 'geistes', 'einsamkeit', 'wurde', 'zehn', 'jahre', 'müde'
        ]

        normalized = TextPreprocessor.preprocess(zarathustra_text, "german")

        self.assertEqual(normalized[19], want_nineteen)
        self.assertEqual(normalized[20], want_twenty)