Example #1
    def prepare_data(cls, input_file=None, language="english", vocabulary_size=10000):
        '''
        Preprocess a given input file, and convert it to a mapped dictionary.
        '''
        # Load data from disk
        document = TextPreprocessor.read_file(input_file)

        # Preprocess and normalize
        norm_doc = TextPreprocessor.preprocess(document, language)

        # Generate mapping, dict and reverse_dict
        return TextPreprocessor.convert_to_dictionary(norm_doc, vocabulary_size)
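# Hedged usage sketch (hypothetical file name; assumes prepare_data lives on
# TextPreprocessor as the body suggests, and returns the four-element tuple
# exercised by test_convert_to_dictionary below):
#
#   data, tokenizer, vocab_size, max_length = TextPreprocessor.prepare_data(
#       input_file="deu-eng.txt", language="german", vocabulary_size=10000)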
    def test_preprocess_deu_eng_from_file(self):
        '''
        Testing Preprocessing on Deu-Eng pairs from a file.
        '''
        # Load the Deu-Eng pairs file
        input_doc = TextPreprocessor.read_file(self.FILE_PATH)

        # Expected
        expected_destination = [['go'], ['hi'], ['hi'], ['run']]
        expected_target = [['geh'], ['hallo'], ['grüß', 'gott'], ['lauf']]

        destinations, targets = TextPreprocessor.preprocess(input_doc)

        self.assertEqual(destinations[0:4], expected_destination)
        self.assertEqual(targets[0:4], expected_target)
    def test_convert_to_dictionary(self):
        '''
        Testing conversion of a normalized doc to a dictionary and mapped dataset.
        '''
        # Setup small norm_doc
        norm_doc = [['lie', 'low'], ['lock', 'it'],
                    ['i', 'loved', 'that', 'house'],
                    [
                        'may', 'i', 'speak', 'to', 'you', 'outside', 'for',
                        'a', 'minute'
                    ], ["Hooray"], ['lock', 'it']]

        expected_data = np.array([[4, 5], [1, 2], [3, 6, 7, 8],
                                  [9, 3, 10, 11, 12, 13, 14, 15, 16], [17],
                                  [1, 2]], dtype=object)  # ragged rows need dtype=object

        result_tuple = TextPreprocessor.convert_to_dictionary(norm_doc, 100)
        data = result_tuple[0]
        tokenizer = result_tuple[1]
        vocab_size = result_tuple[2]
        max_length = result_tuple[3]

        self.assertTrue((data == expected_data).all())
        self.assertIsInstance(tokenizer, Tokenizer)
        self.assertLessEqual(len(tokenizer.word_index), 100)
        self.assertEqual(vocab_size, 18)
        self.assertEqual(max_length, 9)
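# A minimal sketch of the mapping the assertions above imply, assuming the Keras
# Tokenizer backend suggested by the isinstance check: fit on the token lists,
# map each sentence to word indices, and report the vocabulary size (word_index
# plus the reserved padding index 0) and the longest sentence. Illustration
# only, not the project's actual implementation.
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

def convert_to_dictionary_sketch(norm_doc, vocabulary_size=None):
    tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(norm_doc)            # accepts lists of tokens, lowercases them
    data = np.array(tokenizer.texts_to_sequences(norm_doc), dtype=object)
    vocab_size = len(tokenizer.word_index) + 1  # 18 for the norm_doc above
    max_length = max(len(seq) for seq in data)  # 9 for the norm_doc above
    return data, tokenizer, vocab_size, max_length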
    def test_preprocess_deu_eng_none_length(self):
        '''
        Testing Preprocessing on Deu-Eng pairs.
        '''
        # Set Input
        deu_eng = (
            'Go.\tGeh.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272\n'
            'Hi.\tHallo!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123\n'
            'I\'m game.\tIch bin dabei.\tCC-BY 2.0 (France)\n'
            'Please keep me updated.\tBitte halten Sie mich auf dem Laufenden.\tCC-BY\n'
        )

        # Expected
        expected_destination = [['go'], ['hi'], ['im', 'game'],
                                ['please', 'keep', 'me', 'updated']]
        expected_target = [['geh'], ['hallo'], ['ich', 'bin', 'dabei'],
                           [
                               'bitte', 'halten', 'sie', 'mich', 'auf', 'dem',
                               'laufenden'
                           ]]

        # Passing None as the max length keeps sentences of all lengths
        destination, target = TextPreprocessor.preprocess(deu_eng, None)

        self.assertEqual(destination, expected_destination)
        self.assertEqual(target, expected_target)
    def test_preprocess_nietzsche_zarathustra(self):
        '''
        Testing Preprocessing on Zarathustra.
        '''
        # Set Zarathustra (not in NLTK corpus)
        zarathustra = ('[5/0011] '
                       'I. '
                       'Als Zarathustra dreissig Jahr alt war, verliess er '
                       'seine Heimat und den See seiner Heimat und gieng in '
                       'das Gebirge. Hier genoss er seines Geistes und seiner '
                       'Einsamkeit und wurde dessen zehn Jahre nicht müde. '
                       'Endlich aber verwandelte sich sein Herz, — und eines '
                       'Morgens stand er mit der Morgenröthe auf, trat vor '
                       'die Sonne hin und sprach zu ihr also:')

        # Expected
        expected_zero = [
            'i', 'zarathustra', 'dreissig', 'jahr', 'alt', 'verliess',
            'heimat', 'see', 'heimat', 'gieng', 'gebirge'
        ]
        expected_one = [
            'genoss', 'geistes', 'einsamkeit', 'wurde', 'zehn', 'jahre', 'müde'
        ]

        norm_zarathustra = TextPreprocessor.preprocess(zarathustra, "german")

        self.assertEqual(norm_zarathustra[0], expected_zero)
        self.assertEqual(norm_zarathustra[1], expected_one)
    def test_read(self):
        '''
        Testing File reading.
        '''
        expected = ('Hallo!\tCC-BY 2.0 (France) Att')
        doc = TextPreprocessor.read_file(self.FILE_PATH)

        self.assertEqual(len(doc), 34382099)
        self.assertEqual(doc[91:120], expected)
    def test_read_file(self):
        '''
        Testing File reading.
        '''
        expected = ('Ein Buch\n' 'für\n' 'Alle und Keinen')
        doc = TextPreprocessor.read_file(self.FILE_PATH)

        self.assertEqual(len(doc), 120483)
        self.assertEqual(doc[92:120], expected)
Example #8
    def tokenizeDocument(self, doc, tokenize='sentence', returnDoc=False):

        if tokenize == 'sentence':
            spt = nltk.PunktSentenceTokenizer()

            if isinstance(doc, (pd.DataFrame, pd.Series)):
                tokens = doc.apply(
                    lambda row: spt.tokenize(' '.join(row))).values
            elif isinstance(doc, list):
                tokens = list()
                for each in doc:
                    tokens.append(spt.tokenize(each))
            else:
                tokens = spt.tokenize(doc)
        elif tokenize == 'word':

            wpt = nltk.WordPunctTokenizer()

            if isinstance(doc, (pd.DataFrame, pd.Series)):
                tokens = doc.apply(
                    lambda row: wpt.tokenize(' '.join(row))).values
            elif isinstance(doc, list):
                tokens = list()
                for each in doc:
                    tokens.append(wpt.tokenize(each))
            else:
                tokens = wpt.tokenize(doc)
        else:
            # Guard against 'tokens' being unbound for unsupported modes
            raise ValueError("tokenize must be 'sentence' or 'word'")

        preProcObj = TextPreprocessor()

        #final_tokens = preProcObj.preprocess_text(tokens,[preProcObj.removeStopWords,preProcObj.removeNumbers,preProcObj.removeEmptyString],strFlag=False)
        #final_tokens = preProcObj.preprocess_text(tokens,[preProcObj.lowercase,preProcObj.lemmatize,preProcObj.removePunctuation,preProcObj.removeEmptyString,preProcObj.removehypen],strFlag=False)
        final_tokens = preProcObj.preprocess_text(tokens,
                                                  [preProcObj.lowercase],
                                                  strFlag=False)

        if returnDoc:
            # re-create document from filtered tokens
            doc = ' '.join(final_tokens)
            return final_tokens, doc
        else:
            return final_tokens
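# Hedged usage sketch: tokenizeDocument accepts a plain string, a list of
# strings, or a pandas DataFrame/Series, and tokenizes by sentence or by word
# before the lowercase pass. The owning class is not shown in this snippet, so
# DocumentProcessor below is a hypothetical stand-in.
proc = DocumentProcessor()  # hypothetical owner of tokenizeDocument
sentences = proc.tokenizeDocument("First sentence. Second one.", tokenize='sentence')
words, rebuilt = proc.tokenizeDocument("First sentence. Second one.",
                                       tokenize='word', returnDoc=True)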
    def test_map_to_string(self):
        '''
        Map a given vector to an unpadded string.
        '''

        norm_doc = [['lie', 'low'], ['lock', 'it'],
                    ['i', 'loved', 'that', 'house'],
                    [
                        'may', 'i', 'speak', 'to', 'you', 'outside', 'for',
                        'a', 'minute'
                    ]]
        result_tuple = TextPreprocessor.convert_to_dictionary(norm_doc)
        tokenizer = result_tuple[1]

        input_vector = np.asarray([[1, 6, 12, 13, 14, 15, 16, 0, 0]])
        expected_sentence = ["i loved you outside for a minute"]

        mapped_sentence = TextPreprocessor.map_to_string(
            input_vector, tokenizer)
        self.assertEqual(mapped_sentence, expected_sentence)
    def test_convert_to_dictionary(self):
        '''
        Testing conversion of a normalized doc to a dictionary and mapped dataset.
        '''
        # Setup small norm_doc
        norm_doc = [['old', 'testament', 'king', 'james', 'bible'],
                    [
                        'god', 'said', 'let', 'firmament', 'midst', 'waters',
                        'let', 'divide', 'waters', 'waters'
                    ]]

        expected_dictionary = {
            '<unk>': 0,
            'bible': 7,
            'divide': 12,
            'firmament': 10,
            'god': 8,
            'james': 6,
            'king': 5,
            'let': 2,
            'midst': 11,
            'old': 3,
            'said': 9,
            'testament': 4,
            'waters': 1
        }
        expected_reverse_dictionary = {
            0: '<unk>',
            1: 'waters',
            2: 'let',
            3: 'old',
            4: 'testament',
            5: 'king',
            6: 'james',
            7: 'bible',
            8: 'god',
            9: 'said',
            10: 'firmament',
            11: 'midst',
            12: 'divide'
        }
        expected_data = [[3, 4, 5, 6, 7], [8, 9, 2, 10, 11, 1, 2, 12, 1, 1]]

        data, dictionary, reverse_dictionary = TextPreprocessor.convert_to_dictionary(
            norm_doc, 100)

        for i in range(12):
            word = reverse_dictionary[i]
            idx = dictionary[word]
            self.assertEqual(i, idx)

        self.assertEqual(data, expected_data)
        self.assertEqual(dictionary, expected_dictionary)
        self.assertEqual(reverse_dictionary, expected_reverse_dictionary)
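# Hedged sketch of the dictionary layout the expectations above describe:
# index 0 is reserved for '<unk>' and the remaining indices are assigned in
# order of descending word frequency ('waters' occurs three times, 'let' twice,
# ties keep first-seen order). Illustration only; the project's own
# convert_to_dictionary may differ.
import collections

def build_dictionaries_sketch(norm_doc, vocabulary_size):
    words = [word for sentence in norm_doc for word in sentence]
    dictionary = {'<unk>': 0}
    for word, _ in collections.Counter(words).most_common(vocabulary_size - 1):
        dictionary[word] = len(dictionary)
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    data = [[dictionary.get(word, 0) for word in sentence] for sentence in norm_doc]
    return data, dictionary, reverse_dictionary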
    def test_preprocess_nietzsche_zarathustra_from_file(self):
        '''
        Testing Preprocessing on Zarathustra from a file.
        '''
        # Load Zarathustra from file (not in NLTK corpus)
        zarathustra = TextPreprocessor.read_file(self.FILE_PATH)

        # Expected
        expected_nineteen = [
            'i', 'zarathustra', 'dreissig', 'jahr', 'alt', 'verliess',
            'heimat', 'see', 'heimat', 'gieng', 'gebirge'
        ]
        expected_twenty = [
            'genoss', 'geistes', 'einsamkeit', 'wurde', 'zehn', 'jahre', 'müde'
        ]

        norm_zarathustra = TextPreprocessor.preprocess(zarathustra, "german")

        self.assertEqual(norm_zarathustra[19], expected_nineteen)
        self.assertEqual(norm_zarathustra[20], expected_twenty)
Example #12
    def run_translation(self, model_path, dictionaries_path):
        '''
        Use a saved model (plus Dictionaries) to translate from Destination -> Target:
        '''

        model, model_dicts = self.__load_model_params(model_path,
                                                      dictionaries_path)

        input_length = model_dicts.get("params").get("input_length")
        input_tokenizer = model_dicts.get("input_tokenizer")
        output_tokenizer = model_dicts.get("output_tokenizer")

        # Run while loop to handle inputs
        input_sentence = ''
        while input_sentence != 'exit':
            # Ask for input
            input_sentence = input("Sentence to translate or enter 'exit': ")

            if input_sentence == 'exit':
                break

            # Convert the input sentence to a padded vector
            processed_input = text_pre.map_to_vector(input_sentence,
                                                     input_tokenizer,
                                                     input_length)

            # Do not process inputs longer than the allowed length
            if len(processed_input) > input_length:
                print("Sorry, input longer than: %d" % input_length)
                break

            pred = model.predict(processed_input)
            # Pred is a list of predictions, one per input sample.
            # We recover the word indices via argmax over the last axis.
            for entry in pred:
                vector = [np.argmax(entry, axis=-1)]
                output_sentence = text_pre.map_to_string(
                    vector, output_tokenizer)
                print(output_sentence)
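# Small illustration of the argmax step above (shapes are assumptions): each
# prediction entry is a (time_steps, output_vocab_size) matrix of per-word
# probabilities; argmax over the last axis picks one word index per time step,
# which map_to_string then turns back into text.
import numpy as np

entry = np.array([[0.1, 0.7, 0.2],   # step 1 -> index 1
                  [0.6, 0.3, 0.1]])  # step 2 -> index 0 (padding / out-of-vocabulary)
print(np.argmax(entry, axis=-1))     # [1 0]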
    def test_map_to_vector(self):
        '''
        Map a given sentence to a padded input vector.
        '''

        norm_doc = [['lie', 'low'], ['lock', 'it'],
                    ['i', 'loved', 'that', 'house'],
                    [
                        'may', 'i', 'speak', 'to', 'you', 'outside', 'for',
                        'a', 'minute'
                    ]]
        max_length = 9
        result_tuple = TextPreprocessor.convert_to_dictionary(norm_doc)
        tokenizer = result_tuple[1]

        input_sentence = "I loved you, outside for a minute."
        expected_vector = np.asarray([[[1], [6], [12], [13], [14], [15], [16],
                                       [0], [0]]])

        mapped_vector = TextPreprocessor.map_to_vector(input_sentence,
                                                       tokenizer, max_length)
        self.assertTrue((mapped_vector == expected_vector).all())
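# Hedged sketch of the shape handling the expected (1, max_length, 1) array
# above implies: the tokenizer lowercases and strips punctuation, the sequence
# is post-padded to max_length, and a trailing feature axis is added.
# Illustration only, not the project's actual map_to_vector.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def map_to_vector_sketch(sentence, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded.reshape((-1, max_length, 1))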
    def test_preprocess_king_james_bible(self):
        '''
        Testing Preprocessing on the KingJamesBible.
        '''
        # Load King James Bible
        bible = gutenberg.sents('bible-kjv.txt')

        # Expected
        expected_one = ['old', 'testament', 'king', 'james', 'bible']
        expected_ten = [
            'god', 'said', 'let', 'firmament', 'midst', 'waters', 'let',
            'divide', 'waters', 'waters'
        ]

        norm_bible = TextPreprocessor.preprocess(bible[:11], "english")

        self.assertEqual(norm_bible[1], expected_one)
        self.assertEqual(norm_bible[10], expected_ten)
    def test_padding(self):
        '''
        Testing padding of sequences to a fixed length.
        '''
        # Setup small norm_doc
        sequences = np.array([[2, 3], [4, 5], [1, 6, 7, 8],
                              [9, 1, 10, 11, 12, 13, 14, 15, 16]],
                             dtype=object)  # ragged rows need dtype=object

        expected_sequences = np.array([[2, 3, 0, 0, 0, 0, 0, 0, 0, 0],
                                       [4, 5, 0, 0, 0, 0, 0, 0, 0, 0],
                                       [1, 6, 7, 8, 0, 0, 0, 0, 0, 0],
                                       [9, 1, 10, 11, 12, 13, 14, 15, 16, 0]])

        padding_length = 10
        padded_sequences = TextPreprocessor.add_padding(
            sequences, padding_length)

        self.assertTrue((padded_sequences == expected_sequences).all())
Example #16
    def prepare_data(cls,
                     input_file,
                     max_vocab_size=None,
                     dest_max_length=None) -> Dict:
        '''
        Preprocess a given input file, and convert it to a mapped dictionary.

        input_file: File document, having this structure:
                   DESTINATION_SENTENCE [TAB] TARGET_SENTENCE [TAB] ADD_INFO [NEWLINE]
        max_vocab_size: maximal vocabulary size to use.
        dest_max_length: Max length for DESTINATION_SENTENCE
        '''
        # Load data from disk
        document = text_pre.read_file(input_file)

        # Preprocess and normalize
        norm_destination, norm_target = text_pre.preprocess(
            document, dest_max_length)

        # Convert destination and target to vectors
        dest_tuple = text_pre.convert_to_dictionary(norm_destination,
                                                    max_vocab_size)
        targ_tuple = text_pre.convert_to_dictionary(norm_target,
                                                    max_vocab_size)
        # Unpack
        dest_data, dest_tok, dest_vocab_size, dest_length = dest_tuple
        targ_data, targ_tok, targ_vocab_size, targ_length = targ_tuple

        # Pad both vectors
        dest_data = text_pre.add_padding(dest_data, dest_length)
        targ_data = text_pre.add_padding(targ_data, targ_length)

        # Reshape Input - Sequential models need a 3-dimensional input:
        #  TrainSize x PadLength x 1 (word-int)
        dest_data = dest_data.reshape((-1, dest_data.shape[-1], 1))
        # Sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
        targ_data = targ_data.reshape(*targ_data.shape, 1)

        result_dict = {}
        dest_tuple = (dest_data, dest_tok, dest_vocab_size, dest_length)
        targ_tuple = (targ_data, targ_tok, targ_vocab_size, targ_length)
        for key, value in {
                'destination': dest_tuple,
                'target': targ_tuple
        }.items():
            result_dict[key + "_data"] = value[0]
            result_dict[key + "_tokenizer"] = value[1]
            result_dict[key + "_vocab_size"] = value[2]
            result_dict[key + "_max_length"] = value[3]

        return result_dict
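# Hedged usage sketch (hypothetical file name and model call): the dict built
# above exposes matching *_data / *_tokenizer / *_vocab_size / *_max_length
# entries per side, already reshaped for a Keras sequence model as the comments
# above describe.
#
#   prep = prepare_data("deu-eng.txt", max_vocab_size=10000, dest_max_length=20)
#   x = prep["destination_data"]   # (samples, destination_max_length, 1)
#   y = prep["target_data"]        # (samples, target_max_length, 1)
#   model.fit(x, y, ...)           # some seq2seq model, not shown in this snippet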
    def test_normalize_english(self):
        '''
        Testing normalization of english sentences.
        '''
        input_doc = (
            "The symbol ⟨β⟩ is the Greek letter beta. "
            "Song, J.; Choo, Y. -J.; Cho, J. -C. (2008). \"Perlucidibaca piscinae gen. "
            "nov., sp. nov., a freshwater bacterium belonging to the family "
            "Moraxellaceae\" "
            "The leaves range from 2 to 12 centimeters (0.79 to 4.72 in) in length and "
            "1 to 5 centimeters (0.39 to 1.97 in) in breadth. ")

        expected_doc = (
            "symbol greek letter beta "
            "song j choo j cho j c perlucidibaca piscinae gen nov sp nov freshwater "
            "bacterium belonging family moraxellaceae "
            "leaves range centimeters length centimeters breadth")

        normalized_doc = TextPreprocessor.normalize(input_doc, "english")
        self.assertTrue(normalized_doc == expected_doc)
    def test_normalize_german(self):
        '''
        Testing normalization of german sentences.
        '''
        input_doc = (
            "Felix Heuberger (* 7. März 1888 in Wien; † 25. Jänner 1968 in Hall "
            "in Tirol) war ein österreichischer Maler, Radierer und Ingenieur. "
            "Die Pfarrkirche wurde 1066 als „ecclesia Grazluppa“ erstmals urkundlich "
            "erwähnt. "
            "Spanisch ​[⁠β⁠]​: jedes b und v, das nicht im absoluten Anlaut steht und "
            "auch einem Nasal folgt. ")

        expected_doc = (
            "felix heuberger märz wien jänner hall tirol österreichischer "
            "maler radierer ingenieur "
            "pfarrkirche wurde ecclesia grazluppa erstmals urkundlich erwähnt "
            "spanisch b v absoluten anlaut steht nasal folgt")

        normalized_doc = TextPreprocessor.normalize(input_doc, "german")

        self.assertTrue(normalized_doc == expected_doc)
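# Hedged sketch of the normalization both tests describe: lowercase, keep only
# letter runs (umlauts included), then drop NLTK stopwords for the requested
# language. Illustration only; the project's normalize() is not shown here and
# may differ in detail.
import re
from nltk.corpus import stopwords

def normalize_sketch(doc, language):
    stops = set(stopwords.words(language))
    words = re.findall(r"[a-zäöüß]+", doc.lower())
    return " ".join(word for word in words if word not in stops)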
Example #19
    ModelType.NB: True,
    ModelType.SVM: True,
    ModelType.DT: True,
    ModelType.RF: True
}
start_time = get_time_in_millis()

models_for_test = test_model.keys()
# e.g. [ModelType.LDA, ModelType.LSA, ModelType.NB, ModelType.LDA_Sklearn, ModelType.SVM, ModelType.RF, ModelType.DT]

tester = GeneralTester(log_writer, start_time)
datasets_helper = Dataset_Helper(preprocess=False)
datasets_helper.set_wanted_datasets([12])
# The list passed to set_wanted_datasets must contain valid indexes (0 to len(data_sets) - 1) of the datasets in data_sets
while datasets_helper.next_dataset():  #range(len(data_sets)):
    topic_names = TextPreprocessor.load_csv(
        [datasets_helper.get_dataset_folder_path() + "\\topic-names.csv"])
    tester.set_new_dataset(datasets_helper.get_num_of_topics(), topic_names)
    statistics_to_merge = []
    models_params = {
        ModelType.LDA: {
            "topic_count": datasets_helper.get_num_of_topics(),
            "topic_word_count": 15,
            "kappa": 0.51,
            "tau": 2.0,
            "passes": 25,
            "iterations": 25
        },
        ModelType.LSA: {
            "topic_count": datasets_helper.get_num_of_topics(),
            "topic_word_count": 15,
            "one_pass": False,
Example #20
    lsa_variations = []
    create_variations(0, [], lsa_all_vals, lsa_variations)
    statistics_to_merge = []
    for index, preproces_settings in enumerate(preproces_variations):
        seed = 5
        settings = {
            'strip_nums': preproces_settings[0],
            'use_stemmer': preproces_settings[1],
            'use_lemmatizer': preproces_settings[2],
            'strip_short': preproces_settings[3]
        }
        log_writer.add_log(
            "Initializing text preprocessor with strip_nums: {}, use_stemmer: {}, use_lemmatizer {}, strip_short: {}."
            .format(preproces_settings[0], preproces_settings[1],
                    preproces_settings[2], preproces_settings[3]))
        text_preprocessor = TextPreprocessor(settings)

        log_writer.add_log(
            "Starting preprocessing texts of {} for training".format(
                data_sets[i][0]))
        texts_for_train = text_preprocessor.load_and_prep_csv(
            [data_sets[i][0]], "eng", False, 1, ';')
        log_writer.add_log("Preprocessing finished")

        log_writer.add_log(
            "Starting preprocessing texts of {} for topic assessment".format(
                data_sets[i][0]))
        texts_for_topic_asses = text_preprocessor.load_and_prep_csv(
            [data_sets[i][0]], "eng", True, 1, ';')
        log_writer.add_log("Preprocessing finished")