コード例 #1
0
    def test_dataset_save_and_load_with_preprocessing_options(self):
        """
        Checks that a dataset with custom preprocessing options is unchanged after a save/load round trip.

        A copy of self.dataset receives a DatasetPreprocessingOptions object, is saved to disk,
        loaded back, and compared with the copy that was saved.
        """
        trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
        options = DatasetPreprocessingOptions(normalize=True,
                                              lowercase=True,
                                              stopwords=False,
                                              contractions=False,
                                              vulgar_words=True,
                                              emails=True,
                                              punctuation=False,
                                              ngrams='tri',
                                              ngrams_model_func=trigrams_func,
                                              lemmatize=True,
                                              stem=True,
                                              apostrophes=True,
                                              chars=True)
        # Work on a copy so that the dataset shared through self.dataset is not modified
        dataset = deepcopy(self.dataset)
        dataset.preprocessing_options = options

        # Save the dataset to disk
        dataset.save('test_dataset', SAVED_OBJECTS_PATH)

        try:
            # Load the dataset from disk
            dataset_from_disk = TwentyNewsGroupsDataset.load(
                'test_dataset', SAVED_OBJECTS_PATH,
                TwentyNewsGroupsDataset.DATASET_PATH)
        finally:
            # Remove the dataset previously stored on disk, even if loading it failed,
            # so a failing run does not leave files behind for later runs
            rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_dataset'))

        # Check that the original dataset and the dataset saved and loaded are the same
        self.assertEqual(dataset, dataset_from_disk)
コード例 #2
0
    def test_save_and_load_func_on_disk(self):
        """
        Checks that a function saved with save_func_to_disk and loaded back with
        load_func_from_disk returns the same results as the original function.
        """
        def test_func(x):
            return x**3

        save_func_to_disk(test_func, 'test_func', SAVED_FUNCS_PATH)
        try:
            test_func_from_disk = load_func_from_disk('test_func',
                                                      SAVED_FUNCS_PATH)
        finally:
            # Delete the stored file even if loading it failed,
            # so a failing run does not leave it behind for later runs
            os.remove(join_paths(SAVED_FUNCS_PATH, 'test_func.dill'))

        # To compare the functions, we have to use them
        inputs = range(1, 5)
        test_func_result_list = [test_func(x) for x in inputs]
        test_func_from_disk_result_list = [
            test_func_from_disk(x) for x in inputs
        ]

        self.assertEqual(test_func_result_list,
                         test_func_from_disk_result_list)
コード例 #3
0
    def test_preprocess_text(self):
        """
        Checks the output of preprocess_text on a short sample text when trigrams
        are requested and the trigrams function stored on disk is used as the model.
        """
        raw_text = """
        Windows DOS is a family of disk operating systems, hence the name. It isn't a very good OS. What do u think, [email protected]?
        Mike's house is too big! N.A.S.A.
        """
        expected_result = 'window disk_operating_system family disk_operating system good mike house big'

        # The trigrams model function is loaded from the saved functions folder
        trigrams_model = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)

        actual_result = preprocess_text(
            raw_text,
            ngrams='tri',
            ngrams_model_func=trigrams_model,
        )

        self.assertEqual(expected_result, actual_result)
コード例 #4
0
    def test_dataset_preprocessing_options_as_dict(self):
        """
        Checks that DatasetPreprocessingOptions.as_dict() returns the options given to the constructor.

        The ngrams_model_func entry is verified by calling it, because functions can't be
        directly compared; the remaining entries are compared against a plain dict.
        """
        trigrams_model = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
        preprocessing_options = DatasetPreprocessingOptions(
            normalize=True,
            lowercase=True,
            stopwords=False,
            contractions=False,
            vulgar_words=True,
            emails=True,
            punctuation=False,
            ngrams='tri',
            ngrams_model_func=trigrams_model,
            lemmatize=True,
            stem=True,
            apostrophes=True,
            chars=True,
        )
        # Every option except ngrams_model_func, which is checked separately below
        expected_plain_options = dict(
            normalize=True,
            lowercase=True,
            stopwords=False,
            contractions=False,
            vulgar_words=True,
            emails=True,
            punctuation=False,
            ngrams='tri',
            lemmatize=True,
            stem=True,
            apostrophes=True,
            chars=True,
        )

        actual_options = preprocessing_options.as_dict()

        # Step 1: the function stored in the dict must behave like the one passed in
        sample_words = ['windows', 'disk', 'operating', 'system']
        merged_ngrams = ['windows', 'disk_operating_system']
        self.assertEqual(merged_ngrams, trigrams_model(sample_words))
        self.assertEqual(trigrams_model(sample_words),
                         actual_options['ngrams_model_func'](sample_words))

        # Step 2: drop the function entry and compare the remaining options
        del actual_options['ngrams_model_func']
        self.assertEqual(expected_plain_options, actual_options)
コード例 #5
0
    def test_dataset_preprocessing_options_save_and_load(self):
        """
        Checks that a DatasetPreprocessingOptions object is unchanged after a save/load round trip.

        The options are compared with assertEqual, and the ngrams_model_func of both
        objects is additionally called on the same input to compare their behaviour.
        """
        trigrams_func = load_func_from_disk('trigrams_func', SAVED_FUNCS_PATH)
        options = DatasetPreprocessingOptions(normalize=True,
                                              lowercase=True,
                                              stopwords=False,
                                              contractions=False,
                                              vulgar_words=True,
                                              emails=True,
                                              punctuation=False,
                                              ngrams='tri',
                                              ngrams_model_func=trigrams_func,
                                              lemmatize=True,
                                              stem=True,
                                              apostrophes=True,
                                              chars=True)

        # Save the options to disk
        options.save('test_options', SAVED_OBJECTS_PATH)

        try:
            # Load the options from disk
            options_from_disk = DatasetPreprocessingOptions.load(
                'test_options', SAVED_OBJECTS_PATH)
        finally:
            # Remove the options previously stored on disk, even if loading them failed,
            # so a failing run does not leave files behind for later runs
            rmtree(join_paths(SAVED_OBJECTS_PATH, 'test_options'))

        # Check that the original options and the options saved and loaded are the same.
        # This doesn't check that the ngrams_model_func behave the same.
        # It only checks if both are None or not None.
        self.assertEqual(options, options_from_disk)

        # Check that the ngrams_model_func behave the same
        words_list = ['windows', 'disk', 'operating', 'system']
        expected_ngrams = ['windows', 'disk_operating_system']
        self.assertEqual(expected_ngrams,
                         options.ngrams_model_func(words_list))
        self.assertEqual(options.ngrams_model_func(words_list),
                         options_from_disk.ngrams_model_func(words_list))
コード例 #6
0
    def load(cls, name: str, parent_folder_path: str = None) -> 'DatasetPreprocessingOptions':
        """
        Builds a DatasetPreprocessingOptions object from the files of a previously saved one.

        Two items are read from the folder <parent_folder_path>/<name>: an object named
        '<name>_options_except_ngrams_model_func' (used as a dict of constructor keyword
        arguments) and a function named '<name>_ngrams_model_func'.

        :param name: Name of the folder that contains the DatasetPreprocessingOptions object files.
        :param parent_folder_path: Path of the folder that contains the folder with the object files.
        :return: The DatasetPreprocessingOptions object loaded from disk.
        """
        object_dir = join_paths(parent_folder_path, name)

        # Every option except the n-grams model function is stored together as one dict
        # noinspection PyTypeChecker
        constructor_kwargs: dict = load_obj_from_disk(
            name + '_options_except_ngrams_model_func', object_dir)

        # The n-grams model function is stored separately; add it to the same dict
        constructor_kwargs['ngrams_model_func'] = load_func_from_disk(
            name + '_ngrams_model_func', object_dir)

        # The merged dict provides all the keyword arguments of the constructor
        return cls(**constructor_kwargs)