Example #1
    def fit_transform(self,
                      X: Union[csr_matrix, memmap],
                      n_docs_distribution,
                      n_jobs=1,
                      verbose=False,
                      joblib_backend='multiprocessing',
                      use_cython: bool = False):
        """Main method of PMI class.
        """
        assert isinstance(X, (memmap, csr_matrix))
        assert isinstance(n_docs_distribution, numpy.ndarray)

        matrix_size = X.shape
        sample_range = list(range(0, matrix_size[0]))
        feature_range = list(range(0, matrix_size[1]))
        n_total_document = sum(n_docs_distribution)

        logger.debug(msg='Start calculating PMI')
        logger.debug(
            msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1]))

        if use_cython:
            import pyximport
            pyximport.install()
            from DocumentFeatureSelection.pmi.pmi_cython import main
            logger.warning(
                msg='n_jobs parameter is ignored when use_cython=True')
            pmi_score_csr_source = main(
                X=X,
                n_docs_distribution=n_docs_distribution,
                sample_range=sample_range,
                feature_range=feature_range,
                n_total_doc=n_total_document,
                verbose=verbose)

        else:
            self.pmi = pmi
            pmi_score_csr_source = joblib.Parallel(
                n_jobs=n_jobs,
                backend=joblib_backend)(joblib.delayed(self.docId_word_PMI)(
                    X=X,
                    n_docs_distribution=n_docs_distribution,
                    feature_index=feature_index,
                    sample_index=sample_index,
                    n_total_doc=n_total_document,
                    verbose=verbose) for sample_index in sample_range
                                        for feature_index in feature_range)

        row_list = [t[0] for t in pmi_score_csr_source]
        col_list = [t[1] for t in pmi_score_csr_source]
        data_list = [t[2] for t in pmi_score_csr_source]

        pmi_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)),
                                             shape=(X.shape[0], X.shape[1]))

        logger.debug(msg='End calculating PMI')

        return pmi_featured_csr_matrix
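For reference, here is a minimal sketch (not the package's docId_word_PMI implementation) of the PMI value such a per-cell worker is expected to produce for one (label, word) pair, using the standard definition PMI = log(P(word, label) / (P(word) * P(label))). The function name and arguments below are illustrative.

import numpy

def pmi_single_cell(n_word_label, n_word, n_label, n_total):
    """PMI(label, word) = log( P(word, label) / (P(word) * P(label)) ).

    n_word_label: documents of this label that contain the word
    n_word:       documents that contain the word, over all labels
    n_label:      documents that belong to this label
    n_total:      total number of documents
    """
    if n_word_label == 0:
        return 0.0
    p_joint = n_word_label / n_total
    return float(numpy.log(p_joint / ((n_word / n_total) * (n_label / n_total))))

# 5 of 100 documents are "label_a" documents containing "hero";
# "hero" appears in 10 documents overall and "label_a" has 20 documents.
print(pmi_single_cell(5, 10, 20, 100))  # log(0.05 / (0.1 * 0.2)) = log(2.5) ~= 0.916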
Example #2
    def fit_transform(self,
                      X: Union[memmap, csr_matrix],
                      unit_distribution: numpy.ndarray,
                      n_jobs: int = 1,
                      verbose=False,
                      joblib_backend: str = 'multiprocessing',
                      use_cython: bool = False):
        """* What you can do
        - Get SOA weighted-score matrix.
        - You can get fast-speed with Cython
        """
        assert isinstance(X, (memmap, csr_matrix))
        assert isinstance(unit_distribution, numpy.ndarray)

        matrix_size = X.shape
        sample_range = list(range(0, matrix_size[0]))
        feature_range = list(range(0, matrix_size[1]))
        n_total_document = sum(unit_distribution)

        logger.debug(msg='Start calculating SOA')
        logger.debug(
            msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1]))

        if use_cython:
            import pyximport
            pyximport.install()
            from DocumentFeatureSelection.soa.soa_cython import main
            logger.warning(
                msg='n_jobs parameter is ignored when use_cython=True')
            soa_score_csr_source = main(X=X,
                                        n_docs_distribution=unit_distribution,
                                        n_total_doc=n_total_document,
                                        sample_range=sample_range,
                                        feature_range=feature_range,
                                        verbose=verbose)
        else:
            self.soa = soa
            soa_score_csr_source = joblib.Parallel(
                n_jobs=n_jobs,
                backend=joblib_backend)(joblib.delayed(self.docId_word_soa)(
                    X=X,
                    unit_distribution=unit_distribution,
                    feature_index=feature_index,
                    sample_index=sample_index,
                    n_total_doc=n_total_document,
                    verbose=verbose) for sample_index in sample_range
                                        for feature_index in feature_range)

        row_list = [t[0] for t in soa_score_csr_source]
        col_list = [t[1] for t in soa_score_csr_source]
        data_list = [t[2] for t in soa_score_csr_source]

        soa_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)),
                                             shape=(X.shape[0], X.shape[1]))

        logger.debug(msg='End calculating SOA')

        return soa_featured_csr_matrix
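Similarly, here is a per-cell sketch of a strength-of-association (SOA) score under one common formulation, SOA(t, label) = log2( (freq(t, label) * freq(other labels)) / (freq(label) * freq(t, other labels)) ). Treat the exact formula and the zero-count handling as assumptions, since docId_word_soa is not shown in this excerpt.

import numpy

def soa_single_cell(freq_t_label, freq_t_other, freq_label, freq_other):
    """SOA(t, label) = log2( (freq(t, label) * freq(other)) / (freq(label) * freq(t, other)) ).
    Zero products are replaced by 1 so the log stays defined (a crude smoothing choice)."""
    numerator = (freq_t_label * freq_other) or 1
    denominator = (freq_label * freq_t_other) or 1
    return float(numpy.log2(numerator / denominator))

# "hero" occurs in 5 of label_a's 20 documents and in 1 of the remaining 80 documents.
print(soa_single_cell(freq_t_label=5, freq_t_other=1, freq_label=20, freq_other=80))  # log2(400 / 20) ~= 4.32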
    def convert_multi_docs2document_frequency_matrix(
            self,
            labeled_documents: AvailableInputTypes,
            is_use_cache: bool = False,
            is_use_memmap: bool = False,
            path_working_dir: str = None,
            n_jobs: int = 1) -> DataCsrMatrix:
        """This function makes document-frequency matrix. Document-frequency matrix is scipy.csr_matrix.

        * Input object
        - "labeled_structure" is either of Dict object or shelve.DbfilenameShelf. The example format is below
            >>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"],["bb", "aa", "aa", "aa", "aa", "aa"],["I", "aa", "hero", "some", "ok", "aa"]],
            >>> "label_b": [["bb", "bb", "bb"],["bb", "bb", "bb"],["hero", "ok", "bb"],["hero", "cc", "bb"],],
            >>> "label_c": [["cc", "cc", "cc"],["cc", "cc", "bb"],["xx", "xx", "cc"],["aa", "xx", "cc"],]}

        * Output
        - DataCsrMatrix object.
        """
        labeled_documents = self.make_feature_object2json_string(
            labeled_documents)

        logger.debug(msg='Now pre-processing before CSR matrix')
        # convert data structure
        set_document_information = func_data_converter.make_multi_docs2doc_freq_info(
            labeled_documents, n_jobs=n_jobs)
        assert isinstance(set_document_information,
                          func_data_converter.SetDocumentInformation)

        # count n(docs) per label
        n_docs_distribution = self.count_document_distribution(
            labeled_documents=labeled_documents,
            label2id=set_document_information.label2id)
        # count term-frequency per label
        term_frequency_distribution = self.count_term_frequency_distribution(
            labeled_documents=labeled_documents,
            label2id=set_document_information.label2id)
        return DataCsrMatrix(
            csr_matrix_=set_document_information.matrix_object,
            label2id_dict=set_document_information.label2id,
            vocabulary=set_document_information.feature2id,
            n_docs_distribution=n_docs_distribution,
            n_term_freq_distribution=term_frequency_distribution,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
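As a usage sketch, the example dict from the docstring can be fed to this converter as shown below. The import path is an assumption (run_feature_selection further down only references data_converter.DataConverter); the csr_matrix_ and label2id_dict attributes are the ones read later in this file.

from DocumentFeatureSelection.common import data_converter  # assumed import path

labeled_documents = {
    "label_a": [["I", "aa", "aa", "aa", "aa", "aa"],
                ["bb", "aa", "aa", "aa", "aa", "aa"],
                ["I", "aa", "hero", "some", "ok", "aa"]],
    "label_b": [["bb", "bb", "bb"], ["bb", "bb", "bb"],
                ["hero", "ok", "bb"], ["hero", "cc", "bb"]],
    "label_c": [["cc", "cc", "cc"], ["cc", "cc", "bb"],
                ["xx", "xx", "cc"], ["aa", "xx", "cc"]],
}

data_csr = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix(
    labeled_documents=labeled_documents, n_jobs=1)
print(data_csr.csr_matrix_.shape)   # (number of labels, number of features)
print(data_csr.label2id_dict)       # e.g. {'label_a': 0, 'label_b': 1, 'label_c': 2}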
    def convert_multi_docs2term_frequency_matrix(
            self,
            labeled_documents: AvailableInputTypes,
            is_use_cache: bool = False,
            is_use_memmap: bool = False,
            path_working_dir: str = tempfile.mkdtemp(),
            cache_backend: str = 'PersistentDict',
            n_jobs: int = 1):
        """* What you can do
        - This function makes TERM-frequency matrix for TF-IDF calculation.
        - TERM-frequency matrix is scipy.csr_matrix.

        * Params
        - labeled_documents: Dict object which has category-name as key, and list of features as value
        - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
        - path_working_dir: path to directory for saving cache files
        """
        labeled_documents = self.make_feature_object2json_string(
            labeled_documents)

        logger.debug(msg='Now pre-processing before CSR matrix')
        # convert data structure
        set_document_information = func_data_converter.make_multi_docs2term_freq_info(
            labeled_documents)

        # count n(docs) per label
        n_docs_distribution = self.count_document_distribution(
            labeled_documents=labeled_documents,
            label2id=set_document_information.label2id)
        # count term-frequency per label
        term_frequency_distribution = self.count_term_frequency_distribution(
            labeled_documents=labeled_documents,
            label2id=set_document_information.label2id)

        return DataCsrMatrix(
            csr_matrix_=set_document_information.matrix_object,
            label2id_dict=set_document_information.label2id,
            vocabulary=set_document_information.feature2id,
            n_docs_distribution=n_docs_distribution,
            n_term_freq_distribution=term_frequency_distribution,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir,
            cache_backend=cache_backend)
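The practical difference between the two converters: in a document-frequency matrix a word is counted at most once per document, while in a term-frequency matrix every occurrence counts, which is why TF-IDF needs the latter. A tiny illustration:

# Toy label with two documents.
docs_of_label = [["aa", "aa", "bb"], ["aa"]]

document_frequency_aa = sum(1 for doc in docs_of_label if "aa" in doc)  # 2 documents contain "aa"
term_frequency_aa = sum(doc.count("aa") for doc in docs_of_label)       # "aa" occurs 3 times in total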
def run_feature_selection(input_dict: AvailableInputTypes,
                          method: str,
                          use_cython: bool = False,
                          is_use_cache: bool = False,
                          is_use_memmap: bool = False,
                          cache_backend: str = 'PersistentDict',
                          path_working_dir: str = None,
                          matrix_form=None,
                          n_jobs: int = 1) -> ScoredResultObject:
    """A interface function of DocumentFeatureSelection package.

    * Args
    - input_dict: Dict-object which has category-name as key and list of features as value.
        - You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict
    - method: A method name of feature selection metric
    - use_cython: boolean flag to use cython code for computation.
    It's much faster to use cython than native-python code
    - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
    - is_use_memmap: boolean flag to use memmap for keeping matrix object.
    - path_working_dir: str object.
        - The file path to directory where you save cache file or memmap matrix object. If you leave it None,
        it finds some directory and save files in it.
    - cache_backend
        - Named of cache backend if you put True on is_use_cache. [PersistentDict, SqliteDict]

    """
    if method not in METHOD_NAMES:
        raise Exception('method name must be either of {}. Yours: {}'.format(
            METHOD_NAMES, method))

    if (is_use_cache or is_use_memmap) and path_working_dir is None:
        path_working_dir = mkdtemp()
        logger.info(
            "Temporary files are created under {}".format(path_working_dir))

    if method == 'tf_idf':
        """You get scored-matrix with term-frequency.
        ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir,
            cache_backend=cache_backend)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        scored_sparse_matrix = TFIDF().fit_transform(
            X=matrix_data_object.csr_matrix_)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method in ['soa', 'pmi'] and matrix_form is None:
        """You get scored-matrix with either of soa or pmi.
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        if method == 'pmi':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = PMI().fit_transform(
                X=matrix_data_object.csr_matrix_,
                n_docs_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        elif method == 'soa':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = SOA().fit_transform(
                X=matrix_data_object.csr_matrix_,
                unit_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        else:
            raise Exception('Unexpected method: {}'.format(method))

    elif method == 'soa' and matrix_form == 'term_freq':
        # You get a scored matrix with SOA computed from a term-frequency matrix.
        # ATTENTION: the input here MUST be a term-frequency matrix, NOT a document-frequency matrix
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = SOA().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_docs_distribution,
            n_jobs=n_jobs,
            joblib_backend=backend_strategy)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method == 'bns':
        # You get a scored matrix with BNS.
        # ATTENTION: the number of labels must always be 2.
        # The shorter label name is treated as the positive label
        # (positive and negative do NOT carry any special meaning in this context) #
        positive_label_name = sorted(input_dict.keys(),
                                     key=lambda x: len(x))[0]

        if len(input_dict.keys()) >= 3:
            raise KeyError(
                'input_dict must not have more than 2 keys if you would like to use BNS.'
            )

        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        true_class_index = matrix_data_object.label2id_dict[
            positive_label_name]
        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = BNS().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_term_freq_distribution,
            n_jobs=n_jobs,
            true_index=true_class_index,
            joblib_backend=backend_strategy,
            use_cython=use_cython)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    else:
        raise Exception(
            'Unsupported combination of method={} and matrix_form={}'.format(
                method, matrix_form))
    logger.info('Done computation.')

    # delete tmp file directory
    if is_use_cache or is_use_memmap:
        logger.debug("Delete temporary files {}".format(path_working_dir))
        shutil.rmtree(path_working_dir)

    return ScoredResultObject(scored_matrix=scored_sparse_matrix,
                              label2id_dict=matrix_data_object.label2id_dict,
                              feature2id_dict=matrix_data_object.vocabulary,
                              method=method,
                              matrix_form=matrix_form,
                              frequency_matrix=matrix_data_object.csr_matrix_)
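A hedged end-to-end sketch of calling this interface function. The input dict follows the documented format; the attribute names read from the result are the same ones passed to the ScoredResultObject constructor above, which is assumed to expose them unchanged.

input_dict = {
    "label_a": [["aa", "aa", "hero"], ["bb", "aa"]],
    "label_b": [["bb", "bb"], ["hero", "bb"]],
}

result = run_feature_selection(input_dict=input_dict, method='pmi', use_cython=False)

print(result.scored_matrix.shape)   # (n_labels, n_features) csr_matrix of PMI scores
print(result.label2id_dict)         # label name -> row index
print(result.feature2id_dict)       # feature string -> column index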
Example #6
    def get_feature_dictionary(
            self,
            weighted_matrix: csr_matrix,
            vocabulary: Dict[str, int],
            label_group_dict: Dict[str, int],
            cache_backend: str = 'PersistentDict',
            is_use_cache: bool = True,
            frequency_matrix: csr_matrix = None) -> List[Dict[str, Any]]:
        """* What you can do
        - Get dictionary structure from weighted-featured scores.
        """
        assert isinstance(weighted_matrix, csr_matrix)
        assert isinstance(vocabulary, dict)
        assert isinstance(label_group_dict, dict)

        logger.debug(
            msg='Start making scored dictionary object from scored matrix')
        logger.debug(msg='Input matrix size= {} * {}'.format(
            weighted_matrix.shape[0], weighted_matrix.shape[1]))

        weight_value_index_items = self.make_non_zero_information(
            weighted_matrix)
        if frequency_matrix is not None:
            frequency_value_index_items = self.make_non_zero_information(
                frequency_matrix)
            dict_position2value = {
                (t_col_row.col, t_col_row.row): t_col_row.val
                for t_col_row in frequency_value_index_items
            }
        else:
            dict_position2value = None

        if is_use_cache:
            dict_index_information = self.initialize_cache_dict_object(
                cache_backend, file_name='dict_index_information')
        else:
            dict_index_information = {}

        dict_index_information['id2label'] = {
            value: key
            for key, value in label_group_dict.items()
        }
        dict_index_information['id2vocab'] = {
            value: key
            for key, value in vocabulary.items()
        }
        if isinstance(dict_index_information, SqliteDict):
            dict_index_information.commit()
        elif isinstance(dict_index_information, PersistentDict):
            dict_index_information.sync()
        else:
            pass

        # TODO: maybe this function takes too much time; consider Cython.
        seq_score_objects = [None] * len(
            weight_value_index_items)  # type: List[Dict[str,Any]]
        for i, weight_row_col_val_tuple in enumerate(weight_value_index_items):
            seq_score_objects[i] = self.SUB_FUNC_feature_extraction(
                weight_row_col_val_tuple, dict_index_information,
                dict_position2value)

        logger.debug(msg='Finished making scored dictionary')

        return seq_score_objects
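make_non_zero_information is not shown in this excerpt. Below is a minimal sketch of what such a helper could look like, assuming it returns one (row, col, val) record per non-zero cell, which is how the records are consumed above; the namedtuple name is illustrative.

from collections import namedtuple
from scipy.sparse import csr_matrix

ROW_COL_VAL = namedtuple('ROW_COL_VAL', ['row', 'col', 'val'])

def make_non_zero_information_sketch(matrix: csr_matrix):
    """Return one (row, col, val) record per non-zero cell of a csr_matrix."""
    coo = matrix.tocoo()
    return [ROW_COL_VAL(int(r), int(c), float(v))
            for r, c, v in zip(coo.row, coo.col, coo.data)]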
Example #7
"""This example shows you how to work on huge dataset.
For the persisted-dict object you can choose PersistentDict or SqliteDict.
"""
# Imports needed to make this excerpt self-contained.
# The PersistentDict import path follows the run_feature_selection docstring above.
from typing import List
from collections import Counter
import logging

from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from DocumentFeatureSelection.models import PersistentDict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='train')

DATA_LIMIT = 100000


def run_nltk_lemma(subject_name: str) -> List[str]:
    return [
        lemmatizer.lemmatize(t).strip(':?!><')
        for t in subject_name.lower().split()
    ]


category_names = newsgroups_train.target_names
logger.debug("20-news has {} categories".format(len(category_names)))
logger.debug("Now pre-processing on subject text...")
news_lemma = [run_nltk_lemma(d) for d in newsgroups_train.data[:DATA_LIMIT]]

index2category = {i: t for i, t in enumerate(newsgroups_train.target_names)}
dict_index2label = {
    i: index2category[t_no]
    for i, t_no in enumerate(newsgroups_train.target[:DATA_LIMIT])
}
logger.info("Subject distribution")
for k, v in dict(Counter(dict_index2label.values())).items():
    logger.info("{} is {}, {}%".format(k, v, v / len(dict_index2label) * 100))

# Case of PersistentDict
logger.info("Putting documents into dict object...")
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
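The excerpt ends here. A hedged continuation would group the lemmatized documents by label inside the persisted dict and hand it to run_feature_selection, which accepts a PersistentDict per its docstring; the DocumentFeatureSelection.interface import path and the dict-like behavior of PersistentDict are assumptions.

from DocumentFeatureSelection import interface  # assumed location of run_feature_selection

# Group the lemmatized documents under their category label
# (assumes PersistentDict behaves like a regular dict until sync()).
for doc_index, tokens in enumerate(news_lemma):
    persistent_dict_obj.setdefault(dict_index2label[doc_index], []).append(tokens)
persistent_dict_obj.sync()  # flush the dict to demo.json

scored_result = interface.run_feature_selection(input_dict=persistent_dict_obj,
                                                method='pmi',
                                                use_cython=True)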
    def fit_transform(self,
                      X: Union[memmap, csr_matrix],
                      y=None,
                      **fit_params):
        """* What you can do

        * Args
        - X; scipy.csr_matrix or numpy.memmap: Matrix object

        * Params
        - unit_distribution; list or ndarray: The number of document frequency per label. Ex. [10, 20]
        - n_jobs: The number of cores when you use joblib.
        - joblib_backend: "multiprocessing" or "multithreding"
        - true_index: The index number of True label.
        - use_cython; boolean: True, then Use Cython for computation. False, not.
        """
        assert isinstance(X, csr_matrix)

        # --------------------------------------------------------
        # Check parameters
        if 'unit_distribution' not in fit_params:
            raise Exception('You must pass the unit_distribution parameter')
        assert isinstance(fit_params['unit_distribution'], (list, ndarray))
        self.__check_matrix_form(X)

        unit_distribution = fit_params['unit_distribution']

        n_jobs = fit_params.get('n_jobs', 1)
        true_index = fit_params.get('true_index', 0)
        verbose = fit_params.get('verbose', False)
        joblib_backend = fit_params.get('joblib_backend', 'multiprocessing')
        is_use_cython = fit_params.get('use_cython', False)
        # --------------------------------------------------------

        matrix_size = X.shape
        sample_range = list(range(0, matrix_size[0]))
        feature_range = list(range(0, matrix_size[1]))

        logger.debug(
            msg='Start calculating BNS with n(process)={}'.format(n_jobs))
        logger.debug(
            msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1]))

        if is_use_cython:
            import pyximport
            pyximport.install()
            from DocumentFeatureSelection.bns.bns_cython import main
            logger.warning(
                msg='n_jobs parameter is ignored when use_cython=True')
            bns_score_csr_source = main(X=X,
                                        unit_distribution=unit_distribution,
                                        sample_range=sample_range,
                                        feature_range=feature_range,
                                        true_index=true_index,
                                        verbose=verbose)
        else:
            bns_score_csr_source = joblib.Parallel(
                n_jobs=n_jobs,
                backend=joblib_backend)(joblib.delayed(self.docId_word_BNS)(
                    X=X,
                    feature_index=feature_index,
                    sample_index=sample_index,
                    true_index=true_index,
                    unit_distribution=unit_distribution,
                    verbose=verbose) for sample_index in sample_range
                                        for feature_index in feature_range)

        row_list = [t[0] for t in bns_score_csr_source]
        col_list = [t[1] for t in bns_score_csr_source]
        data_list = [t[2] for t in bns_score_csr_source]

        bns_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)),
                                             shape=(X.shape[0], X.shape[1]))

        logger.debug(msg='End calculating BNS')

        return bns_featured_csr_matrix
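For reference, here is a minimal sketch of the per-word value a worker such as docId_word_BNS is expected to compute, following Forman's Bi-Normal Separation definition BNS = |F^-1(tpr) - F^-1(fpr)|, where F^-1 is the inverse normal CDF. The clipping bounds and helper name are illustrative, not taken from the package.

from scipy.stats import norm

def bns_single_word(n_word_pos, n_word_neg, n_pos, n_neg):
    """BNS = |F^-1(tpr) - F^-1(fpr)|, with tpr = n_word_pos / n_pos and fpr = n_word_neg / n_neg."""
    clip = lambda p: min(max(p, 0.0005), 1.0 - 0.0005)  # keep norm.ppf finite at 0 and 1
    return float(abs(norm.ppf(clip(n_word_pos / n_pos)) - norm.ppf(clip(n_word_neg / n_neg))))

# A word appearing in 30 of 100 positive-label documents and 5 of 100 negative-label documents.
print(bns_single_word(30, 5, 100, 100))  # ~1.12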