Example no. 1
    def test_check_same_csr_matrix(self):
        """複数回の変換を実施して、同一のcsr_matrixになることを確認する
        """
        n_joblib_tasks = 2

        data_csr_matrix1 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict,
            ngram=1,
            n_jobs=n_joblib_tasks
        )
        assert isinstance(data_csr_matrix1, DataCsrMatrix)
        csr_matrix_1 = data_csr_matrix1.csr_matrix_
        label_group_dict_1 = data_csr_matrix1.label2id_dict
        vocabulary_1 = data_csr_matrix1.vocabulary
        n_doc_distri_1 = data_csr_matrix1.n_docs_distribution
        n_term_distri_1 = data_csr_matrix1.n_term_freq_distribution
        dense_matrix_1 = csr_matrix_1.toarray()

        data_csr_matrix2 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict,
            ngram=1,
            n_jobs=n_joblib_tasks
        )
        assert isinstance(data_csr_matrix2, DataCsrMatrix)
        csr_matrix_2 = data_csr_matrix2.csr_matrix_
        label_group_dict_2 = data_csr_matrix2.label2id_dict
        vocabulary_2 = data_csr_matrix2.vocabulary
        n_doc_distri_2 = data_csr_matrix2.n_docs_distribution
        n_term_distri_2 = data_csr_matrix2.n_term_freq_distribution
        dense_matrix_2 = data_csr_matrix2.csr_matrix_.toarray()

        data_csr_matrix3 = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict,
            ngram=1,
            n_jobs=n_joblib_tasks
        )
        assert isinstance(data_csr_matrix3, DataCsrMatrix)
        csr_matrix_3 = data_csr_matrix3.csr_matrix_
        label_group_dict_3 = data_csr_matrix3.label2id_dict
        vocabulary_3 = data_csr_matrix3.vocabulary
        n_doc_distri_3 = data_csr_matrix3.n_docs_distribution
        n_term_distri_3 = data_csr_matrix3.n_term_freq_distribution
        dense_matrix_3 = data_csr_matrix3.csr_matrix_.toarray()

        assert numpy.array_equal(dense_matrix_1, dense_matrix_2)
        assert numpy.array_equal(dense_matrix_2, dense_matrix_3)
        assert numpy.array_equal(dense_matrix_1, dense_matrix_3)

        assert vocabulary_1 == vocabulary_2
        assert vocabulary_2 == vocabulary_3
        assert vocabulary_1 == vocabulary_3
    def setUp(self):
        input_dict = {
            "label_a": [["I", "aa", "aa", "aa", "aa", "aa"],
                        ["bb", "aa", "aa", "aa", "aa", "aa"],
                        ["I", "aa", "hero", "some", "ok", "aa"]],
            "label_b": [
                ["bb", "bb", "bb"],
                ["bb", "bb", "bb"],
                ["hero", "ok", "bb"],
                ["hero", "cc", "bb"],
            ],
            "label_c": [
                ["cc", "cc", "cc"],
                ["cc", "cc", "bb"],
                ["xx", "xx", "cc"],
                ["aa", "xx", "cc"],
            ]
        }

        data_csr_matrix = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict, n_jobs=5)
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        self.label2id_dict = data_csr_matrix.label2id_dict
        self.csr_matrix_ = data_csr_matrix.csr_matrix_
        self.n_docs_distribution = data_csr_matrix.n_docs_distribution
        self.vocabulary = data_csr_matrix.vocabulary
    def test_bns_cython(self):
        incorrect_input_dict = {
            "label_a": [
                ["I", "aa", "aa", "aa", "aa", "aa"],
                ["bb", "aa", "aa", "aa", "aa", "aa"],
                ["I", "aa", "hero", "some", "ok", "aa"]
            ],
            "label_b": [
                ["bb", "bb", "bb"],
                ["bb", "bb", "bb"],
                ["hero", "ok", "bb"],
                ["hero", "cc", "bb"],
                ["cc", "cc", "cc"],
                ["cc", "cc", "bb"],
                ["xx", "xx", "cc"],
                ["aa", "xx", "cc"],
            ]
        }

        data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=incorrect_input_dict,
            n_jobs=5
        )
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution

        result_bns = bns_python3.BNS().fit_transform(X=csr_matrix_,
                                                     y=None,
                                                     unit_distribution=n_docs_distribution,
                                                     use_cython=True)
        print(result_bns)
    def test_check_input_error(self):
        incorrect_input_dict = {
            "label_a": [
                ["I", "aa", "aa", "aa", "aa", "aa"],
                ["bb", "aa", "aa", "aa", "aa", "aa"],
                ["I", "aa", "hero", "some", "ok", "aa"]
            ],
            "label_b": [
                ["bb", "bb", "bb"],
                ["bb", "bb", "bb"],
                ["hero", "ok", "bb"],
                ["hero", "cc", "bb"],
                ["cc", "cc", "cc"],
                ["cc", "cc", "bb"],
                ["xx", "xx", "cc"],
                ["aa", "xx", "cc"],
            ],
            "label_c":[
                ["aa", "xx", "cc"]
            ]
        }

        data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=incorrect_input_dict,
            n_jobs=5
        )
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution
        # BNS requires exactly two labels, so the three-label input above is expected to raise an error.
        try:
            bns_python3.BNS().fit_transform(X=csr_matrix_, y=None, unit_distribution=n_docs_distribution)
        except Exception:
            pass
    def test_fit_transform(self):

        data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.correct_input,
            n_jobs=5
        )
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        label2id_dict = data_csr_matrix.label2id_dict
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution
        vocabulary = data_csr_matrix.vocabulary

        bns_score_csr_matrix = bns_python3.BNS().fit_transform(X=csr_matrix_,
                                                               y=None,
                                                               unit_distribution=n_docs_distribution,
                                                               verbose=True)
        assert isinstance(bns_score_csr_matrix, csr_matrix)

        bns_scores_dict = ScoredResultObject(
            scored_matrix=bns_score_csr_matrix,
            label2id_dict=label2id_dict,
            feature2id_dict=vocabulary
        ).ScoreMatrix2ScoreDictionary()
        assert isinstance(bns_scores_dict, list)
        import pprint
        pprint.pprint(bns_scores_dict)
Example no. 6
    def test_complex_feature_convertion(self):
        """"""
        csr_matrix_information = data_converter.DataConverter(
        ).labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict_complex_feature, n_jobs=1)
        assert isinstance(csr_matrix_information, DataCsrMatrix)
        csr_matrix_ = csr_matrix_information.csr_matrix_
        label_group_dict = csr_matrix_information.label2id_dict
        vocabulary = csr_matrix_information.vocabulary

        assert isinstance(csr_matrix_, csr_matrix)
        assert isinstance(label_group_dict, dict)
        assert isinstance(vocabulary, dict)

        n_correct_sample = 3
        n_correct_feature = 5

        assert csr_matrix_.shape[0] == n_correct_sample
        assert csr_matrix_.shape[1] == n_correct_feature

        dense_matrix_constructed_matrix = csr_matrix_.toarray()

        # vocabulary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2}
        # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1}
        correct_array_numpy = numpy.array([
            [1.0, 3.0, 0.0, 2.0, 0.0],
            [0.0, 0.0, 1.0, 3.0, 1.0],
            [1.0, 1.0, 0.0, 2.0, 1.0],
        ]).astype(numpy.int64)
        assert numpy.array_equal(correct_array_numpy,
                                 dense_matrix_constructed_matrix)
    def setUp(self):
        input_dict = {
            "label_a": [["I", "aa", "aa", "aa", "aa", "aa"],
                        ["bb", "aa", "aa", "aa", "aa", "aa"],
                        ["I", "aa", "hero", "some", "ok", "aa"]],
            "label_b": [
                ["bb", "bb", "bb"],
                ["bb", "bb", "bb"],
                ["hero", "ok", "bb"],
                ["hero", "cc", "bb"],
            ],
            "label_c": [
                ["cc", "cc", "cc"],
                ["cc", "cc", "bb"],
                ["xx", "xx", "cc"],
                ["aa", "xx", "cc"],
            ]
        }

        tf_matrix = numpy.array([[2, 12, 1, 0, 1, 1, 1, 0],
                                 [0, 0, 8, 1, 2, 1, 0, 0],
                                 [0, 1, 1, 7, 0, 0, 0, 3]])

        data_csr_matrix = data_converter.DataConverter(
        ).labeledMultiDocs2DocFreqMatrix(labeled_documents=input_dict,
                                         ngram=1,
                                         n_jobs=-1)
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        self.label2id_dict = data_csr_matrix.label2id_dict
        self.csr_matrix_ = data_csr_matrix.csr_matrix_
        self.n_docs_distribution = data_csr_matrix.n_docs_distribution
        self.vocabulary = data_csr_matrix.vocabulary

        numpy.array_equal(data_csr_matrix.csr_matrix_.toarray(), tf_matrix)
Example no. 8
    def test_multi_process_convert_data(self):
        """checks if it works or not when n_process is more than 1

        :return:
        """

        data_csr_object = data_converter.DataConverter(
        ).labeledMultiDocs2DocFreqMatrix(labeled_documents=self.input_dict,
                                         n_jobs=5)

        assert isinstance(data_csr_object.csr_matrix_, csr_matrix)
        assert isinstance(data_csr_object.label2id_dict, dict)
        assert isinstance(data_csr_object.vocabulary, dict)
Example no. 9
    def test_soa_doc_freq(self):
        data_csr_matrix = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=self.input_dict, n_jobs=5)
        assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix)
        label2id_dict = data_csr_matrix.label2id_dict
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution
        vocabulary = data_csr_matrix.vocabulary

        scored_matrix_doc_freq = soa_python3.SOA().fit_transform(
            X=csr_matrix_, unit_distribution=n_docs_distribution, verbose=True)

        soa_scores_doc_freq = ScoredResultObject(
            scored_matrix=scored_matrix_doc_freq,
            label2id_dict=label2id_dict,
            feature2id_dict=vocabulary).convert_score_matrix2score_record()
        self.assertTrue(isinstance(soa_scores_doc_freq, list))
Example no. 10
    def test_soa_doc_freq(self):
        data_csr_matrix = data_converter.DataConverter(
        ).labeledMultiDocs2DocFreqMatrix(labeled_documents=self.input_dict,
                                         n_jobs=5)
        assert isinstance(data_csr_matrix, data_converter.DataCsrMatrix)
        label2id_dict = data_csr_matrix.label2id_dict
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution
        vocabulary = data_csr_matrix.vocabulary

        scored_matrix_doc_freq = soa_python3.SOA().fit_transform(
            X=csr_matrix_, unit_distribution=n_docs_distribution, verbose=True)

        soa_scores_doc_freq = ScoredResultObject(
            scored_matrix=scored_matrix_doc_freq,
            label2id_dict=label2id_dict,
            feature2id_dict=vocabulary).ScoreMatrix2ScoreDictionary()

        import pprint
        print('doc freq based soa')
        pprint.pprint(soa_scores_doc_freq)
Example no. 11
    def test_basic_convert_data(self):
        """checks it works of not when n_jobs=1, n_process=1

        data convert過程のミスが疑われるので、整合性のチェックをする

        :return:
        """

        csr_matrix_information = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict,
            ngram=1,
            n_jobs=5
        )
        assert isinstance(csr_matrix_information, DataCsrMatrix)
        csr_matrix_ = csr_matrix_information.csr_matrix_
        label_group_dict = csr_matrix_information.label2id_dict
        vocabulary = csr_matrix_information.vocabulary

        assert isinstance(csr_matrix_, csr_matrix)
        assert isinstance(label_group_dict, dict)
        assert isinstance(vocabulary, dict)

        n_correct_sample = 3
        n_correct_feature = 8

        assert csr_matrix_.shape[0] == n_correct_sample
        assert csr_matrix_.shape[1] == n_correct_feature

        dense_matrix_constructed_matrix = csr_matrix_.toarray()

        # vocabulary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2}
        # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1}
        correct_array_numpy = numpy.array(
            [[2, 3, 1, 0, 1, 1, 1, 0],
             [0, 0, 4, 1, 2, 1, 0, 0],
             [0, 1, 1, 4, 0, 0, 0, 2]]).astype(numpy.int64)
        assert numpy.array_equal(correct_array_numpy, dense_matrix_constructed_matrix)
Example no. 12
    def test_fit_transform(self):

        data_csr_matrix = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=self.correct_input, n_jobs=5)
        assert isinstance(data_csr_matrix, DataCsrMatrix)
        label2id_dict = data_csr_matrix.label2id_dict
        csr_matrix_ = data_csr_matrix.csr_matrix_
        n_docs_distribution = data_csr_matrix.n_docs_distribution
        vocabulary = data_csr_matrix.vocabulary

        bns_score_csr_matrix = bns_python3.BNS().fit_transform(
            X=csr_matrix_,
            y=None,
            unit_distribution=n_docs_distribution,
            verbose=True)
        assert isinstance(bns_score_csr_matrix, csr_matrix)

        bns_scores_dict = ScoredResultObject(
            scored_matrix=bns_score_csr_matrix,
            label2id_dict=label2id_dict,
            feature2id_dict=vocabulary).convert_score_matrix2score_record()
        self.assertIsInstance(bns_scores_dict, list)
def run_feature_selection(input_dict: AvailableInputTypes,
                          method: str,
                          use_cython: bool = False,
                          is_use_cache: bool = False,
                          is_use_memmap: bool = False,
                          cache_backend: str = 'PersistentDict',
                          path_working_dir: str = None,
                          matrix_form=None,
                          n_jobs: int = 1) -> ScoredResultObject:
    """A interface function of DocumentFeatureSelection package.

    * Args
    - input_dict: Dict-object which has category-name as key and list of features as value.
        - You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict
    - method: A method name of feature selection metric
    - use_cython: boolean flag to use cython code for computation.
    It's much faster to use cython than native-python code
    - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
    - is_use_memmap: boolean flag to use memmap for keeping matrix object.
    - path_working_dir: str object.
        - The file path to directory where you save cache file or memmap matrix object. If you leave it None,
        it finds some directory and save files in it.
    - cache_backend
        - Named of cache backend if you put True on is_use_cache. [PersistentDict, SqliteDict]

    """
    if method not in METHOD_NAMES:
        raise Exception('method name must be either of {}. Yours: {}'.format(
            METHOD_NAMES, method))

    if (is_use_cache or is_use_memmap) and path_working_dir is None:
        path_working_dir = mkdtemp()
        logger.info(
            "Temporary files are created under {}".format(path_working_dir))

    if method == 'tf_idf':
        """You get scored-matrix with term-frequency.
        ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir,
            cache_backend=cache_backend)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        scored_sparse_matrix = TFIDF().fit_transform(
            X=matrix_data_object.csr_matrix_)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method in ['soa', 'pmi'] and matrix_form is None:
        """You get scored-matrix with either of soa or pmi.
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        if method == 'pmi':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = PMI().fit_transform(
                X=matrix_data_object.csr_matrix_,
                n_docs_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        elif method == 'soa':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = SOA().fit_transform(
                X=matrix_data_object.csr_matrix_,
                unit_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        else:
            raise Exception('Unknown method: {}'.format(method))

    elif method == 'soa' and matrix_form == 'term_freq':
        # You get a score-matrix with soa computed from a term-frequency matrix.
        # ATTENTION: the input here MUST be a term-frequency matrix, NOT a document-frequency matrix
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = SOA().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_docs_distribution,
            n_jobs=n_jobs,
            joblib_backend=backend_strategy)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method == 'bns':
        # You get a scored-matrix with bns.
        # ATTENTION: the number of labels must always be 2.
        # The shorter label name is treated as the positive label
        # (positive and negative do NOT have any special meaning in this context) #
        positive_label_name = sorted(input_dict.keys(),
                                     key=lambda x: len(x))[0]

        if len(input_dict.keys()) >= 3:
            raise KeyError(
                'input_dict must not have more than 2 keys if you would like to use BNS.'
            )

        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        true_class_index = matrix_data_object.label2id_dict[
            positive_label_name]
        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = BNS().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_term_freq_distribution,
            n_jobs=n_jobs,
            true_index=true_class_index,
            joblib_backend=backend_strategy,
            use_cython=use_cython)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    else:
        raise Exception('Unsupported combination: method={}, matrix_form={}'.format(method, matrix_form))
    logger.info('Done computation.')

    # delete tmp file directory
    if is_use_cache or is_use_memmap:
        logger.debug("Delete temporary files {}".format(path_working_dir))
        shutil.rmtree(path_working_dir)

    return ScoredResultObject(scored_matrix=scored_sparse_matrix,
                              label2id_dict=matrix_data_object.label2id_dict,
                              feature2id_dict=matrix_data_object.vocabulary,
                              method=method,
                              matrix_form=matrix_form,
                              frequency_matrix=matrix_data_object.csr_matrix_)
Example no. 14
    def test_get_pmi_feature_dictionary(self):
        """checks if it works or not, that getting scored dictionary object from scored_matrix

        :return:
        """
        data_csr_object = data_converter.DataConverter(
        ).labeledMultiDocs2DocFreqMatrix(labeled_documents=self.input_dict,
                                         ngram=1,
                                         n_jobs=5)

        assert isinstance(data_csr_object.csr_matrix_, csr_matrix)
        assert isinstance(data_csr_object.label2id_dict, dict)
        assert isinstance(data_csr_object.vocabulary, dict)

        pmi_scored_matrix = PMI_python3.PMI().fit_transform(
            X=data_csr_object.csr_matrix_,
            n_jobs=5,
            n_docs_distribution=data_csr_object.n_docs_distribution)

        # main part of test
        # when sort is True, cut_zero is True, outformat is dict
        pmi_scored_dictionary_objects = ScoredResultObject(
            scored_matrix=pmi_scored_matrix,
            label2id_dict=data_csr_object.label2id_dict,
            feature2id_dict=data_csr_object.vocabulary
        ).ScoreMatrix2ScoreDictionary(outformat='dict',
                                      sort_desc=True,
                                      n_jobs=5)
        assert isinstance(pmi_scored_dictionary_objects, dict)
        logging.debug(pmi_scored_dictionary_objects)

        # when sort is True, cut_zero is True, outformat is items
        pmi_scored_dictionary_objects = ScoredResultObject(
            scored_matrix=pmi_scored_matrix,
            label2id_dict=data_csr_object.label2id_dict,
            feature2id_dict=data_csr_object.vocabulary
        ).ScoreMatrix2ScoreDictionary(outformat='items',
                                      sort_desc=True,
                                      n_jobs=5)
        assert isinstance(pmi_scored_dictionary_objects, list)
        for d in pmi_scored_dictionary_objects:
            assert isinstance(d, dict)
        logging.debug(pmi_scored_dictionary_objects)

        # when sort is True, cut_zero is False, outformat is dict
        pmi_scored_dictionary_objects = ScoredResultObject(
            scored_matrix=pmi_scored_matrix,
            label2id_dict=data_csr_object.label2id_dict,
            feature2id_dict=data_csr_object.vocabulary
        ).ScoreMatrix2ScoreDictionary(outformat='dict',
                                      sort_desc=True,
                                      n_jobs=5)
        assert isinstance(pmi_scored_dictionary_objects, dict)
        logging.debug(pmi_scored_dictionary_objects)

        # when sort is True, cut_zero is False, outformat is items
        pmi_scored_dictionary_objects = ScoredResultObject(
            scored_matrix=pmi_scored_matrix,
            label2id_dict=data_csr_object.label2id_dict,
            feature2id_dict=data_csr_object.vocabulary
        ).ScoreMatrix2ScoreDictionary(outformat='items',
                                      sort_desc=True,
                                      n_jobs=5)
        assert isinstance(pmi_scored_dictionary_objects, list)
        for d in pmi_scored_dictionary_objects:
            assert isinstance(d, dict)
        logging.debug(pmi_scored_dictionary_objects)