Beispiel #1
0
    def test_get_k_best_sentences_of_text(self):
        """
        Checks that TextRank.get_k_best_sentences_of_text(), called with its default arguments
        on a fixed text, returns the same value as the expected result stored on disk.
        """
        # Fixed input text. The stored expected result was calculated from it, so it must not be modified.
        text = """The baptism of Jesus is described in the gospels of Matthew, Mark and Luke. 
        John's gospel does not directly describe Jesus' baptism.
        Most modern theologians view the baptism of Jesus by John the Baptist as a historical event 
        to which a high degree of certainty can be assigned.[1][2][3][4][5] Along with the crucifixion 
        of Jesus, most biblical scholars view it as one of the two historically certain facts about him, 
        and often use it as the starting point for the study of the historical Jesus.[6]
        The baptism is one of the five major milestones in the gospel narrative of the life of Jesus, 
        the others being the Transfiguration, Crucifixion, Resurrection, and Ascension.[7][8] 
        Most Christian denominations view the baptism of Jesus as an important event and a basis for the 
        Christian rite of baptism (see also Acts 19:1–7). In Eastern Christianity, Jesus' baptism is commemorated 
        on 6 January (the Julian calendar date of which corresponds to 19 January on the Gregorian calendar), 
        the feast of Epiphany.[9] In the Roman Catholic Church, the Anglican Communion, the Lutheran Churches and 
        some other Western denominations, it is recalled on a day within the following week, the feast of the 
        baptism of the Lord. In Roman Catholicism, the baptism of Jesus is one of the Luminous Mysteries sometimes 
        added to the Rosary. It is a Trinitarian feast in the Eastern Orthodox Churches."""

        # The reference value was calculated beforehand and is read back from disk
        expected = load_obj_from_disk('test_get_k_best_sentences_of_text_expected_result',
                                      SAVED_OBJECTS_PATH)

        # A freshly created TextRank model selects the best sentences of the text
        actual = TextRank().get_k_best_sentences_of_text(text)

        self.assertEqual(expected, actual)
    def test_save_and_load_obj_on_disk(self):
        """
        Checks that an object stored with save_obj_to_disk() is equal to the object
        that load_obj_from_disk() returns for the same name and folder.
        """
        test_list = [1, 2, 3, 4]
        save_obj_to_disk(test_list, 'test_list', SAVED_OBJECTS_PATH)

        try:
            test_list_from_disk = load_obj_from_disk('test_list',
                                                     SAVED_OBJECTS_PATH)
        finally:
            # The file created by this test is removed even if loading fails,
            # so a failing run doesn't leave test_list.pickle behind in SAVED_OBJECTS_PATH
            os.remove(join_paths(SAVED_OBJECTS_PATH, 'test_list.pickle'))

        self.assertEqual(test_list, test_list_from_disk)
    def test_get_topics(self):
        """
        Checks that get_topics() of an LdaMalletModel loaded from disk returns
        the same value as the expected result stored on disk.
        """
        # The model under test is an LdaMalletModel that was previously saved on disk
        lda_mallet_model = LdaMalletModel.load('lda-mallet-model', SAVED_TOPICS_MODELS_PATH)

        # The reference value was calculated beforehand and is read back from disk
        expected = load_obj_from_disk('test_get_topics_expected_result', SAVED_OBJECTS_PATH)

        actual = lda_mallet_model.get_topics()

        self.assertEqual(expected, actual)
    def test_get_k_most_representative_docs_of_topic_as_df(self):
        """
        Checks that get_k_most_repr_docs_of_topic_as_df(topic=0, k=5) of an LdaMalletModel
        loaded from disk gives a result for which expected_result.equals(result) is True.
        """
        # The model under test is an LdaMalletModel that was previously saved on disk
        lda_mallet_model = LdaMalletModel.load('lda-mallet-model', SAVED_TOPICS_MODELS_PATH)

        # The reference value was calculated beforehand and is read back from disk
        expected_df = load_obj_from_disk(
            'test_get_k_most_representative_docs_of_topic_as_df_expected_result',
            SAVED_OBJECTS_PATH)

        actual_df = lda_mallet_model.get_k_most_repr_docs_of_topic_as_df(topic=0, k=5)

        # The comparison is delegated to the equals() method of the expected object
        # noinspection PyUnresolvedReferences
        are_equal = expected_df.equals(actual_df)
        self.assertTrue(are_equal)
    def test_get_dominant_topic_of_each_doc_as_df(self):
        """
        Checks that get_dominant_topic_of_each_doc_as_df() of an LdaGensimModel loaded from disk
        gives a result for which expected_result.equals(result) is True.
        """
        # An LdaGensimModel is used here instead of an LdaMalletModel,
        # because LdaMallet is extremely slow to generate the docs_topics_df
        lda_gensim_model = LdaGensimModel.load('lda-gensim-model', SAVED_TOPICS_MODELS_PATH)

        # Loading the model also loads docs_topics_df from disk, so it is discarded here
        # (presumably to make the method under test generate it again - confirm in the model class)
        lda_gensim_model.docs_topics_df = None

        # The reference value was calculated beforehand and is read back from disk
        expected_df = load_obj_from_disk(
            'test_get_dominant_topic_of_each_doc_as_df_expected_result',
            SAVED_OBJECTS_PATH)

        actual_df = lda_gensim_model.get_dominant_topic_of_each_doc_as_df()

        # The comparison is delegated to the equals() method of the expected object
        # noinspection PyUnresolvedReferences
        are_equal = expected_df.equals(actual_df)
        self.assertTrue(are_equal)
    def test_predict_topic_prob_on_text(self):
        """
        Checks that predict_topic_prob_on_text() of an LdaMalletModel loaded from disk,
        called on a fixed text with print_table=False, returns the same value
        as the expected result stored on disk.
        """
        # The model under test is an LdaMalletModel that was previously saved on disk
        lda_mallet_model = LdaMalletModel.load('lda-mallet-model', SAVED_TOPICS_MODELS_PATH)

        # Fixed input text. The stored expected result was calculated from it, so it must not be modified.
        text = """The baptism of Jesus is described in the gospels of Matthew, Mark and Luke. John's gospel does not
        directly describe Jesus' baptism. Most modern theologians view the baptism of Jesus by John the Baptist as a
        historical event to which a high degree of certainty can be assigned.[1][2][3][4][5] Along with the crucifixion
        of Jesus, most biblical scholars view it as one of the two historically certain facts about him, and often use 
        it as the starting point for the study of the historical Jesus.[6]"""

        # The reference value was calculated beforehand and is read back from disk
        expected = load_obj_from_disk('test_predict_topic_prob_on_text_expected_result',
                                      SAVED_OBJECTS_PATH)

        actual = lda_mallet_model.predict_topic_prob_on_text(text, print_table=False)

        self.assertEqual(expected, actual)
Beispiel #7
0
    def load(cls, name: str, parent_folder_path: str = None) -> 'DatasetPreprocessingOptions':
        """
        Creates a DatasetPreprocessingOptions object from the files of a previously saved one.

        The options are read from two files inside the <parent_folder_path>/<name> folder:
        one with all the options except ngrams_model_func, and another one with the ngrams_model_func.

        :param name: Name of the folder that contains the DatasetPreprocessingOptions object files.
        :param parent_folder_path: Path of the folder that contains the folder with the object files.
        :return: The DatasetPreprocessingOptions object loaded from disk.
        """
        # NOTE(review): parent_folder_path defaults to None and is passed unchanged to join_paths().
        # Confirm that join_paths() accepts None as its first argument.
        options_folder = join_paths(parent_folder_path, name)

        # Every option except the ngrams_model_func is stored together in a single dict
        # noinspection PyTypeChecker
        stored_options: dict = load_obj_from_disk(name + '_options_except_ngrams_model_func',
                                                  options_folder)

        # The ngrams_model_func is stored separately and is read with load_func_from_disk
        stored_ngrams_model_func = load_func_from_disk(name + '_ngrams_model_func', options_folder)

        # The constructor receives all the options (the ngrams_model_func included) as keyword arguments
        return cls(**{**stored_options, 'ngrams_model_func': stored_ngrams_model_func})
Beispiel #8
0
 def test_as_documents_content_list_method(self):
     """
     Checks that as_documents_content_list() of self.dataset returns
     the same value as the expected result stored on disk.
     """
     # The reference value is read from disk
     expected = load_obj_from_disk('as_documents_content_list', SAVED_OBJECTS_PATH)

     actual = self.dataset.as_documents_content_list()

     self.assertEqual(expected, actual)
    def load(cls,
             name: str,
             parent_dir_path: str = None,
             dataset_path: str = None) -> 'Dataset':
        """
        Loads a saved dataset object from disk.

        :param name: Name of the folder where the dataset object files are stored.
        :param parent_dir_path: Path to the folder where the dataset object folder is stored on disk.
            If None, the path returned by get_abspath_from_project_source_root('saved-elements/objects') is used.
        :param dataset_path: Path to the folder that contains the original dataset documents.
            If given, it replaces the dataset_path attribute of the loaded object.
            If None, the stored value is kept and a warning is issued, because that value may be outdated.
        :return: The dataset loaded from disk.

        For example, consider the following directory structure:

        * stored-datasets-objects/dataset_obj_1/dataset_obj_1_preprocessing_options/...
        * stored-datasets-objects/dataset_obj_1/dataset_obj_1_except_preprocessing_options.pickle
        * datasets/20_newsgroups

        Where 20_newsgroups contains the original 20_newsgroups dataset documents and dataset_obj_1 contains
        the files of a previously stored dataset object (with the save() method).

        To load the dataset_obj_1 dataset object that contains a dataset object of the 20 newsgroups dataset,
        this method should be called this way:

        >>> from topics_and_summary.datasets.common import Dataset
        >>> dataset = Dataset.load('dataset_obj_1', 'path/to/stored-datasets-objects', 'path/to/datasets')
        """
        if parent_dir_path is None:
            parent_dir_path = get_abspath_from_project_source_root(
                'saved-elements/objects')

        files_folder = join_paths(parent_dir_path, name)

        # Load the dataset (except the preprocessing options)
        # noinspection PyTypeChecker
        dataset: Dataset = load_obj_from_disk(
            name + '_except_preprocessing_options', files_folder)

        # If the <dataset-name>_preprocessing_options folder exists, it means that the preprocessing_options were saved
        # In that case, the preprocessing_options are loaded
        if os.path.exists(
                join_paths(files_folder, name + '_preprocessing_options')):
            dataset.preprocessing_options = \
                DatasetPreprocessingOptions.load(name + '_preprocessing_options', files_folder)
        else:
            dataset.preprocessing_options = None

        # Update the dataset_path of the object if a value is given
        if dataset_path is not None:
            dataset.dataset_path = dataset_path
        else:
            # If the path to the files of the dataset has changed after the dataset object was stored,
            # the dataset_path attribute of the loaded object is wrong, but in this class we don't know the current
            # path of the dataset files, so the user needs to check if the path is ok or it needs to be updated
            message = (
                "\nThe dataset_path attribute of the loaded dataset object may need to be updated. "
                "It's current value is: {0}.\n"
                "If the path to the files of the dataset has changed after the dataset object was stored, "
                "the dataset_path attribute of the loaded object is wrong and needs to be changed manually.\n"
                "There are 2 ways to update the dataset path:\n"
                "\t1. Change it directly in the loaded model: dataset_obj.dataset_path = <path>\n"
                "\t2. Load the dataset again with load(), specifying the path in the dataset_path parameter"
            ).format(dataset.dataset_path)
            # stacklevel=2 attributes the warning to the code that called load(), instead of to this line,
            # so the user can see which load() call needs the dataset_path parameter
            warnings.warn(message, stacklevel=2)

        return dataset