def multiDocs2DocFreqInfo(labeled_documents:AvailableInputTypes,
                          n_jobs:int=1,
                          path_working_dir:str=None,
                          is_use_cache: bool = True)->SetDocumentInformation:
    """This function generates the information needed to construct a document-frequency matrix.
    """
    # avoid tempfile.mkdtemp() as a default argument: defaults are evaluated
    # once at import time, so every call would share the same directory
    if path_working_dir is None:
        path_working_dir = tempfile.mkdtemp()
    assert isinstance(labeled_documents, (SqliteDict, dict))
    type_flag = {judge_feature_type(docs) for docs in labeled_documents.values()}
    assert len(type_flag) == 1, 'all documents must use the same feature type'

    counted_frequency = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(generate_document_dict)(key, docs)
        for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]))

    ### construct [{}] structure for input of DictVectorizer() ###
    seq_feature_documents = (dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency)

    ### Save index-string dictionary
    if is_use_cache:
        dict_matrix_index = init_cache_object('matrix_element_object', path_working_dir)
    else:
        dict_matrix_index = {}

    # use sklearn feature-extraction
    vec = DictVectorizer()
    dict_matrix_index['matrix_object'] = vec.fit_transform(seq_feature_documents).tocsr()
    dict_matrix_index['feature2id'] = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())}
    dict_matrix_index['label2id'] = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)}

    return SetDocumentInformation(dict_matrix_index)
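# --- Usage sketch (illustrative, not from the package) ---
# A minimal, self-contained sketch of the DictVectorizer step above: one
# frequency dict per label becomes one row of a CSR matrix. The toy labels,
# features, and counts are assumptions for illustration only.
# Note: scikit-learn >= 1.0 exposes get_feature_names_out(); the older
# get_feature_names() used above was removed in scikit-learn 1.2.
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

counted_frequency_example = [
    ('label_a', Counter({'feat_x': 2, 'feat_y': 1})),
    ('label_b', Counter({'feat_y': 3, 'feat_z': 1})),
]
seq_feature_documents_example = (dict(freq) for _, freq in counted_frequency_example)

vec_example = DictVectorizer()
matrix_example = vec_example.fit_transform(seq_feature_documents_example).tocsr()
feature2id_example = {feat: i for i, feat in enumerate(vec_example.get_feature_names_out())}
label2id_example = {label: i for i, (label, _) in enumerate(counted_frequency_example)}
print(matrix_example.shape)  # (2, 3): one row per label, one column per feature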
    def make_feature_object2json_string(
        self, labeled_document: AvailableInputTypes
    ) -> Dict[str, AvailableInputTypes]:
        """* What u can do
        - This function converts feature-object in sequence object into json string.
        - This function make every object into json string.
            - string object -> json array which has one string. Ex. "feature" -> '["feature"]'
            - list object -> json array. Ex. ["feature", "feature"] -> '["feature", "feature"]'
            - tuple object -> json array. Ex. ("feature", "feature") -> '["feature", "feature"]'
        * Parameters
        - labeled_document: dict object which has key of 'label-name', and value is 2-dim list of features.

        """
        assert isinstance(labeled_document, (dict, PersistentDict, SqliteDict))
        replaced_labeled_document = {key: [] for key in labeled_document}
        for key, docs_in_label in labeled_document.items():
            assert isinstance(docs_in_label, list)
            replaced_docs_in_label = [None] * len(docs_in_label)
            for i, doc_label in enumerate(docs_in_label):
                replaced_docs_in_label[i] = self.__make_feature_object2json_string(doc_label)
            replaced_labeled_document[key] = replaced_docs_in_label
        return replaced_labeled_document
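# --- Conversion-rule sketch (illustrative) ---
# The private helper __make_feature_object2json_string is not shown here; the
# function below is a hypothetical stand-in implementing the rules from the
# docstring, using only the standard library:
import json

def feature_object2json_string_example(feature_object) -> str:
    if isinstance(feature_object, str):
        return json.dumps([feature_object])      # "feature" -> '["feature"]'
    elif isinstance(feature_object, (list, tuple)):
        return json.dumps(list(feature_object))  # ("f1", "f2") -> '["f1", "f2"]'
    else:
        raise TypeError('unsupported feature type: {}'.format(type(feature_object)))

assert feature_object2json_string_example('feature') == '["feature"]'
assert feature_object2json_string_example(('f1', 'f2')) == '["f1", "f2"]'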
def multiDocs2DocFreqInfo(labeled_documents:AvailableInputTypes,
                          n_jobs:int=1)->SetDocumentInformation:
    """This function generates information for constructing document-frequency matrix.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()])
    assert len(type_flag)==1

    counted_frequency = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(generate_document_dict)(key, docs)
        for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]))
    feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency]

    # use sklearn feature-extraction
    vec = DictVectorizer()
    matrix_object = vec.fit_transform(feature_documents).tocsr()
    feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())}
    label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)}

    return SetDocumentInformation(matrix_object, label2id, feature2id)
def multiDocs2TermFreqInfo(labeled_documents:AvailableInputTypes):
    """This function generates information to construct term-frequency matrix
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))

    counted_frequency = [(label, Counter(itertools.chain.from_iterable(documents)))
                         for label, documents in labeled_documents.items()]
    feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency]

    # use sklearn feature-extraction
    vec = DictVectorizer()
    matrix_object = vec.fit_transform(feature_documents).tocsr()
    feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())}
    label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)}

    return SetDocumentInformation(matrix_object, label2id, feature2id)
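# --- Counting sketch (illustrative) ---
# Self-contained example of the term-frequency counting above: each label's
# documents (lists of tokens) are flattened with itertools.chain and counted.
# The toy labels and tokens are assumptions for illustration only.
import itertools
from collections import Counter

labeled_documents_example = {
    'positive': [['good', 'great'], ['good']],
    'negative': [['bad'], ['bad', 'awful']],
}
counted_frequency_example = [(label, Counter(itertools.chain.from_iterable(documents)))
                             for label, documents in labeled_documents_example.items()]
# e.g. ('positive', Counter({'good': 2, 'great': 1}))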
    def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray:
        """This method count n(docs) per label.
        """
        assert isinstance(labeled_documents, (SqliteDict, dict))
        assert isinstance(label2id, dict)

        # count n(docs) per label
        n_doc_distribution = {
            label: len(document_lists)
            for label, document_lists
            in labeled_documents.items()
        }

        # make list of distribution
        n_doc_distribution_list = [0] * len(labeled_documents)

        for label_string, n_doc in n_doc_distribution.items():
            docs_index = label2id[label_string]
            n_doc_distribution_list[docs_index] = n_doc

        return numpy.array(n_doc_distribution_list, dtype='i8')
    def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray:
        """This method counts the total term frequency per label."""
        assert isinstance(labeled_documents, (SqliteDict, dict))
        assert isinstance(label2id, dict)

        # count total term-frequency per label
        term_frequency_distribution = {
            label: len(list(utils.flatten(document_lists)))
            for label, document_lists
            in labeled_documents.items()
        }

        # make list of distribution
        term_frequency_distribution_list = [0] * len(labeled_documents)

        for label_string, n_terms in term_frequency_distribution.items():
            term_index = label2id[label_string]
            term_frequency_distribution_list[term_index] = n_terms

        return numpy.array(term_frequency_distribution_list, dtype='i8')
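# --- Distribution sketch (illustrative) ---
# Both counting methods above on toy data: n(docs) per label and total term
# frequency per label, each placed at the row index given by label2id.
# itertools.chain stands in for utils.flatten on two-level lists.
import itertools
import numpy

labeled_documents_example = {
    'positive': [['good', 'great'], ['good']],
    'negative': [['bad']],
}
label2id_example = {'positive': 0, 'negative': 1}

n_docs_example = [0] * len(label2id_example)
n_terms_example = [0] * len(label2id_example)
for label, docs in labeled_documents_example.items():
    n_docs_example[label2id_example[label]] = len(docs)
    n_terms_example[label2id_example[label]] = len(list(itertools.chain.from_iterable(docs)))

print(numpy.array(n_docs_example, dtype='i8'))   # [2 1]
print(numpy.array(n_terms_example, dtype='i8'))  # [3 1]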
def make_multi_docs2term_freq_info(labeled_documents: AvailableInputTypes,
                                   is_use_cache: bool = True,
                                   path_work_dir: str = None):
    """* What you can do
    - This function generates the information needed to construct a term-frequency matrix.
    """
    # avoid tempfile.mkdtemp() as a default argument: it runs once at import
    # time, so every call would otherwise share the same directory
    if path_work_dir is None:
        path_work_dir = tempfile.mkdtemp()
    assert isinstance(labeled_documents, (SqliteDict, dict))

    counted_frequency = [
        (label, Counter(itertools.chain.from_iterable(documents)))
        for label, documents in labeled_documents.items()
    ]
    feature_documents = [
        dict(label_freqCounter_tuple[1])
        for label_freqCounter_tuple in counted_frequency
    ]

    if is_use_cache:
        dict_matrix_index = init_cache_object('matrix_element_objects',
                                              path_work_dir=path_work_dir)
    else:
        dict_matrix_index = {}

    # use sklearn feature-extraction
    vec = DictVectorizer()
    dict_matrix_index['matrix_object'] = vec.fit_transform(
        feature_documents).tocsr()
    dict_matrix_index['feature2id'] = {
        feat: feat_id
        for feat_id, feat in enumerate(vec.get_feature_names())
    }
    dict_matrix_index['label2id'] = {
        label_freqCounter_tuple[0]: label_id
        for label_id, label_freqCounter_tuple in enumerate(counted_frequency)
    }

    return SetDocumentInformation(dict_matrix_index)
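# --- Call sketch (illustrative) ---
# A hypothetical call to the function above, assuming this module's imports
# (init_cache_object, SetDocumentInformation, etc.) are available; with
# is_use_cache=False the intermediate objects stay in a plain in-memory dict.
input_dict_example = {
    'positive': [['good', 'great'], ['good']],
    'negative': [['bad', 'awful']],
}
doc_info_example = make_multi_docs2term_freq_info(input_dict_example, is_use_cache=False)
# doc_info_example wraps the CSR term-frequency matrix plus the
# feature2id/label2id mappings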
def run_feature_selection(input_dict: AvailableInputTypes,
                          method: str,
                          use_cython: bool = False,
                          is_use_cache: bool = False,
                          is_use_memmap: bool = False,
                          cache_backend: str = 'PersistentDict',
                          path_working_dir: str = None,
                          matrix_form=None,
                          n_jobs: int = 1) -> ScoredResultObject:
    """A interface function of DocumentFeatureSelection package.

    * Args
    - input_dict: Dict-object which has category-name as key and list of features as value.
        - You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict
    - method: A method name of feature selection metric
    - use_cython: boolean flag to use cython code for computation.
    It's much faster to use cython than native-python code
    - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
    - is_use_memmap: boolean flag to use memmap for keeping matrix object.
    - path_working_dir: str object.
        - The file path to directory where you save cache file or memmap matrix object. If you leave it None,
        it finds some directory and save files in it.
    - cache_backend
        - Named of cache backend if you put True on is_use_cache. [PersistentDict, SqliteDict]

    """
    if method not in METHOD_NAMES:
        raise Exception('method name must be one of {}. Yours: {}'.format(
            METHOD_NAMES, method))

    if (is_use_cache or is_use_memmap) and path_working_dir is None:
        path_working_dir = mkdtemp()
        logger.info(
            "Temporary files are created under {}".format(path_working_dir))

    if method == 'tf_idf':
        """You get scored-matrix with term-frequency.
        ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir,
            cache_backend=cache_backend)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        scored_sparse_matrix = TFIDF().fit_transform(
            X=matrix_data_object.csr_matrix_)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method in ['soa', 'pmi'] and matrix_form is None:
        """You get scored-matrix with either of soa or pmi.
        """
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        if method == 'pmi':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = PMI().fit_transform(
                X=matrix_data_object.csr_matrix_,
                n_docs_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        elif method == 'soa':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = SOA().fit_transform(
                X=matrix_data_object.csr_matrix_,
                unit_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        else:
            raise Exception('unexpected method: {}'.format(method))

    elif method == 'soa' and matrix_form == 'term_freq':
        # You get a score-matrix with soa from a term-frequency matrix.
        # ATTENTION: the input for SOA in this branch MUST be a term-frequency matrix, NOT a document-frequency matrix
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = SOA().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_docs_distribution,
            n_jobs=n_jobs,
            joblib_backend=backend_strategy)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method == 'bns':
        # You get a scored-matrix with bns.
        # ATTENTION: the number of labels must always be 2.
        # The shorter label name is treated as the positive label
        # ('positive' and 'negative' do NOT carry any meaning in this context) #
        positive_label_name = min(input_dict.keys(), key=len)

        if len(input_dict.keys()) >= 3:
            raise KeyError(
                'input_dict must not have more than 2 keys if you would like to use BNS.'
            )

        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)

        true_class_index = matrix_data_object.label2id_dict[
            positive_label_name]
        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = BNS().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_term_freq_distribution,
            n_jobs=n_jobs,
            true_index=true_class_index,
            joblib_backend=backend_strategy,
            use_cython=use_cython)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    else:
        raise Exception('unsupported method/matrix_form combination: {} / {}'.format(method, matrix_form))
    logger.info('Done computation.')

    # delete tmp file directory
    if is_use_cache or is_use_memmap:
        logger.debug("Delete temporary files {}".format(path_working_dir))
        shutil.rmtree(path_working_dir)

    return ScoredResultObject(scored_matrix=scored_sparse_matrix,
                              label2id_dict=matrix_data_object.label2id_dict,
                              feature2id_dict=matrix_data_object.vocabulary,
                              method=method,
                              matrix_form=matrix_form,
                              frequency_matrix=matrix_data_object.csr_matrix_)
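# --- Call sketch (illustrative) ---
# A hypothetical end-to-end call against the signature above. The
# ScoreMatrix2ScoreDictionary() conversion is taken from the
# DocumentFeatureSelection README; treat both names as assumptions here.
input_dict_example = {
    'positive': [['good', 'great'], ['good']],
    'negative': [['bad', 'awful'], ['bad']],
}
scored_object_example = run_feature_selection(input_dict_example, method='pmi', n_jobs=1)
# convert the scored CSR matrix back into {label: [{feature, score}, ...]}
score_dict_example = scored_object_example.ScoreMatrix2ScoreDictionary()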
def run_feature_selection(input_dict:AvailableInputTypes,
                          method:str,
                          use_cython:bool=False,
                          is_use_cache:bool=False,
                          is_use_memmap:bool=False,
                          path_working_dir:str=None,
                          matrix_form=None,
                          joblib_backend='auto',
                          n_jobs:int=1,
                          ngram:int=1)->ScoredResultObject:
    """A interface function of DocumentFeatureSelection package.

    * Parameters
    - input_dict: Dict-object which has category-name as key and list of features as value.
        You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict
    - method: A method name of feature selection metric
    - use_cython: boolean flag to use cython code for computation. It's much faster to use cython than native-python code
    - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
    - is_use_memmap: boolean flag to use memmap for keeping matrix object.
    - path_working_dir: str object.
        The file path to directory where you save cache file or memmap matrix object. If you leave it None, it finds some directory and save files in it.
    """
    if method not in METHOD_NAMES:
        raise Exception('method name must be one of {}. Yours: {}'.format(METHOD_NAMES, method))

    if method == 'tf_idf':
        """You get scored-matrix with term-frequency.
        ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
        """
        matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
            labeled_documents=input_dict,
            ngram=ngram,
            n_jobs=n_jobs,
            joblib_backend=joblib_backend,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir
        )
        assert isinstance(matrix_data_object, DataCsrMatrix)

        scored_sparse_matrix = TFIDF().fit_transform(X=matrix_data_object.csr_matrix_)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method in ['soa', 'pmi'] and matrix_form is None:
        """You get scored-matrix with either of soa or pmi.
        """
        matrix_data_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=input_dict,
            ngram=ngram,
            n_jobs=n_jobs,
            joblib_backend=joblib_backend,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir
        )
        assert isinstance(matrix_data_object, DataCsrMatrix)
        if method == 'pmi':
            backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary)
            scored_sparse_matrix = PMI().fit_transform(X=matrix_data_object.csr_matrix_,
                                                       n_docs_distribution=matrix_data_object.n_docs_distribution,
                                                       n_jobs=n_jobs,
                                                       joblib_backend=backend_strategy,
                                                       use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        elif method == 'soa':
            backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary)
            scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_,
                                                       unit_distribution=matrix_data_object.n_docs_distribution,
                                                       n_jobs=n_jobs,
                                                       joblib_backend=backend_strategy,
                                                       use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        else:
            raise Exception()

    elif method == 'soa' and matrix_form == 'term_freq':
        """You get score-matrix with soa from term-frequency matrix.
        ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
        """
        matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
            labeled_documents=input_dict,
            ngram=ngram,
            n_jobs=n_jobs,
            joblib_backend=joblib_backend,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir
        )
        assert isinstance(matrix_data_object, DataCsrMatrix)

        backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary)
        scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_,
                                                   unit_distribution=matrix_data_object.n_docs_distribution,
                                                   n_jobs=n_jobs,
                                                   joblib_backend=backend_strategy)
        assert isinstance(scored_sparse_matrix, csr_matrix)

    elif method == 'bns':
        """You get scored-matrix with bns.
        ATTENTION: #label should be 2 always.
        """
        if 'positive' not in input_dict:
            raise KeyError('input_dict must have "positive" key')
        if 'negative' not in input_dict:
            raise KeyError('input_dict must have "negative" key')
        if len(input_dict.keys()) >= 3:
            raise KeyError('input_dict must not have more than 2 keys if you would like to use BNS.')

        matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
            labeled_documents=input_dict,
            ngram=ngram,
            n_jobs=n_jobs,
            joblib_backend=joblib_backend,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir
        )
        assert isinstance(matrix_data_object, DataCsrMatrix)

        true_class_index = matrix_data_object.label2id_dict['positive']
        backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary)
        scored_sparse_matrix = BNS().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_term_freq_distribution,
            n_jobs=n_jobs,
            true_index=true_class_index,
            joblib_backend=backend_strategy
        )
        assert isinstance(scored_sparse_matrix, csr_matrix)
    else:
        raise Exception('unsupported method/matrix_form combination: {} / {}'.format(method, matrix_form))

    return ScoredResultObject(
        scored_matrix=scored_sparse_matrix,
        label2id_dict=matrix_data_object.label2id_dict,
        feature2id_dict=matrix_data_object.vocabulary,
        method=method,
        matrix_form=matrix_form
    )
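# --- BNS call sketch (illustrative) ---
# A hypothetical call against this older signature; note the hard requirement
# above that input_dict carries exactly the keys 'positive' and 'negative'.
input_dict_example = {
    'positive': [['good', 'great'], ['good']],
    'negative': [['bad', 'awful']],
}
scored_object_example = run_feature_selection(input_dict_example, method='bns')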