Exemple #1
0
    def __init__(self,
                 X: csr_matrix,
                 term_vocabulary: List[str],
                 mX: Optional[csr_matrix] = None,
                 y: Optional[np.ndarray] = None,
                 category_names: Optional[List[str]] = None,
                 metadata_vocabulary: Optional[List[str]] = None,
                 text_df: Optional[pd.DataFrame] = None,
                 text_col: Optional[str] = None,
                 parsed_col: Optional[str] = None,
                 category_col: Optional[str] = None,
                 unigram_frequency_path: Optional[str] = None):
        '''
        Store a term-document matrix together with its vocabularies, labels
        and optional per-document text.

        Parameters
        ----------
        X: csr_matrix; term-document frequency matrix; columns represent terms and rows documents
        term_vocabulary: List[str]; Each entry corresponds to a term (one per column of X)
        mX: Optional[csr_matrix]; metadata csr matrix; only used when metadata_vocabulary is given
        y: Optional[np.ndarray[int]]; indices of category names for each document.
            When omitted, every document is assigned to a single category named '_'.
        category_names: Optional[List[str]], names of categories for y; must be None when y is None
        metadata_vocabulary: Optional[List[str]]; each entry corresponds to a column of mX
        text_df: pd.DataFrame with a column containing the raw document text
        text_col: str; name of column containing the text of each document
        parsed_col: str; name of column containing the parsed text of each document
        category_col: str; name of a column of text_df; stored as-is on the instance
        unigram_frequency_path: str (see TermDocMatrix)

        '''
        assert X.shape[1] == len(term_vocabulary)
        self.X = X
        self.term_idx_store = IndexStoreFromList.build(term_vocabulary)

        if y is None:
            assert category_names is None
            # The builtin ``int`` replaces the ``np.int`` alias, which was
            # removed from NumPy.
            self.y = np.zeros(self.X.shape[0], dtype=int)
            self.category_idx_store = IndexStoreFromList.build(['_'])
        else:
            assert category_names is not None, \
                'category_names is required when y is given'
            # Every named category must occur at least once in y.
            assert len(category_names) == len(set(y))
            self.y = y
            self.category_idx_store = IndexStoreFromList.build(category_names)

        if metadata_vocabulary is not None:
            assert mX is not None, \
                'mX is required when metadata_vocabulary is given'
            # Compare against the vocabulary *size*; comparing the column
            # count to the list itself can never be true.
            assert mX.shape[1] == len(metadata_vocabulary)
            self.mX = mX
            self.metadata_idx_store = IndexStoreFromList.build(
                metadata_vocabulary)
        else:
            # Without a vocabulary the metadata matrix is empty; any mX
            # passed in is not stored.
            self.mX = csr_matrix((0, 0))
            self.metadata_idx_store = IndexStore()

        self.text_df = text_df
        # Any column name that is supplied must exist in text_df.
        for column_name in (parsed_col, text_col, category_col):
            if column_name is not None:
                assert text_df is not None, \
                    'text_df is required when a column name is given'
                assert column_name in text_df
        self.category_col = category_col
        self.text_col = text_col
        self.parsed_col = parsed_col
        self.unigram_frequency_path = unigram_frequency_path
	def _get_build_kwargs(self):
		'''
		Assemble the constructor keyword arguments from ``self.term_freq_df``.

		The frame's values are transposed into ``X``, its index supplies the
		terms, its columns supply the categories, and ``y`` enumerates the
		columns (0 .. number of columns - 1). Metadata is left empty.

		:return: dict of keyword arguments
		'''
		freq_df = self.term_freq_df
		category_labels = freq_df.columns
		return {
			'X': csr_matrix(freq_df.values.T),
			'mX': csr_matrix((0, 0)),
			'y': np.array(range(len(category_labels))),
			'term_idx_store': IndexStoreFromList.build(freq_df.index.values),
			'metadata_idx_store': IndexStore(),
			'category_idx_store': IndexStoreFromList.build(category_labels),
			'unigram_frequency_path': self.unigram_frequency_path,
		}
 def change_category_names(self, new_category_names):
     '''
     Return the result of ``self._make_new_term_doc_matrix`` called with a
     category index store built from ``new_category_names``.

     :param new_category_names: sized collection of names; its length must
         equal ``self.get_num_categories()``
     :raises Exception: if the number of names does not match the number of
         categories in the corpus
     '''
     num_given = len(new_category_names)
     num_expected = self.get_num_categories()
     if num_given == num_expected:
         renamed_store = IndexStoreFromList.build(new_category_names)
         return self._make_new_term_doc_matrix(
             new_category_idx_store=renamed_store
         )
     raise Exception("The number of category names passed (%s) needs to equal "
                     "the number of categories in the corpus (%s)." %
                     (num_given, num_expected))
	def _get_build_kwargs(self):
		'''
		Collect the constructor keyword arguments from this object's state.

		``X``, ``y`` and ``unigram_frequency_path`` are passed through
		unchanged; the term store is built from ``self.feature_vocabulary``
		and the category store from ``self.category_names``. Metadata is
		left empty.

		:return: dict of keyword arguments
		'''
		constructor_kwargs = {}
		constructor_kwargs['X'] = self.X
		constructor_kwargs['mX'] = csr_matrix((0, 0))
		constructor_kwargs['y'] = self.y
		constructor_kwargs['term_idx_store'] = IndexStoreFromDict.build(self.feature_vocabulary)
		constructor_kwargs['metadata_idx_store'] = IndexStore()
		constructor_kwargs['category_idx_store'] = IndexStoreFromList.build(self.category_names)
		constructor_kwargs['unigram_frequency_path'] = self.unigram_frequency_path
		return constructor_kwargs
 def _get_build_kwargs(self):
     '''
     Build the keyword-argument dict for the constructor.

     Uses ``self.X`` and ``self.y`` directly, an empty metadata matrix and
     metadata store, a term store built from ``self.feature_vocabulary``
     and a category store built from ``self.category_names``.

     :return: dict of keyword arguments
     '''
     return dict(
         X=self.X,
         mX=csr_matrix((0, 0)),
         y=self.y,
         term_idx_store=IndexStoreFromDict.build(self.feature_vocabulary),
         metadata_idx_store=IndexStore(),
         category_idx_store=IndexStoreFromList.build(self.category_names),
         unigram_frequency_path=self.unigram_frequency_path,
     )
    def recategorize(self, new_categories):
        '''
        Build a new term-document matrix whose documents are labelled with
        ``new_categories`` instead of the current categories.

        Parameters
        ----------
        new_categories : array like
            String names of new categories, one per document. Length should
            be equal to number of documents. Category indices are assigned
            in order of first appearance.

        Returns
        -------
        TermDocMatrix
        '''
        assert len(new_categories) == self.get_num_docs()

        # De-duplicate while keeping first-appearance order. Iterating a plain
        # ``set`` of strings yields a different order from run to run (string
        # hashes are randomised per process), which made the category -> index
        # mapping non-reproducible.
        unique_categories = list(dict.fromkeys(new_categories))
        new_category_idx_store = IndexStoreFromList.build(unique_categories)
        new_y = np.array(new_category_idx_store.getidxstrictbatch(new_categories))

        # All-True boolean array, one entry per document (previously spelled
        # ``new_y == new_y``); presumably a "keep every document" mask --
        # confirm against _make_new_term_doc_matrix.
        all_docs_mask = np.ones(new_y.shape, dtype=bool)

        return self._make_new_term_doc_matrix(self._X,
                                              self._mX,
                                              new_y,
                                              self._term_idx_store,
                                              new_category_idx_store,
                                              self._metadata_idx_store,
                                              all_docs_mask)
    def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
        '''
        Makes the metadata of a new TermDocMatrix a copy of the term-document matrix, except each term is prefixed
        by its document's label followed by the separator.

        The new metadata matrix has one column per (label, term) pair, with labels taken in sorted order. Only
        terms with a positive total count across the whole corpus are kept.

        :param doc_labels: list[str], should be the same size as the number of documents in the TermDocMatrix.
        :param separator: str, default is '_'
        :param replace_metadata: bool, default True. Accepted for interface compatibility; the new metadata is
            always derived from the term matrix regardless of this flag.
        :return: the object produced by ``self._make_new_term_doc_matrix``; ``self`` is not modified here
        '''

        assert len(doc_labels) == self.get_num_docs()

        doc_labels = np.array(doc_labels)
        terms_in_corpus = np.array(self._term_idx_store.values())
        ordered_doc_labels = sorted(set(doc_labels))
        X = self._X

        # The term mask is computed over the whole corpus, so it is identical
        # for every label; compute it (and the surviving terms) once.
        # NOTE(review): a per-label mask may have been intended -- confirm
        # before changing, since that would alter the metadata vocabulary.
        term_mask = (X.sum(axis=0) > 0).A1
        kept_terms = terms_in_corpus[term_mask]

        new_metadata_list = []
        new_meta_X = None
        label_doc_masks = []

        # Stack one block of rows per label; each block's columns sit to the
        # right of all previous labels' columns.
        for doc_label in ordered_doc_labels:
            label_doc_mask = doc_labels == doc_label
            label_doc_masks.append(label_doc_mask)
            label_X = X[label_doc_mask, :]
            label_X = label_X[:, term_mask]
            cols_to_pad = len(new_metadata_list)

            new_metadata_list += [doc_label + separator + term
                                  for term in kept_terms]
            if new_meta_X is None:
                new_meta_X = label_X
            else:
                # Empty matrix covering the columns of the earlier labels.
                label_X_pad = (CSRMatrixFactory()
                               .set_last_col_idx(cols_to_pad - 1)
                               .set_last_row_idx(label_doc_mask.sum() - 1)
                               .get_csr_matrix())
                padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
                # Widen the accumulated rows so both pieces have equal width.
                new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
                new_meta_X = scipy.sparse.vstack([new_meta_X,
                                                  padded_label_X])

        new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
        new_meta_X = new_meta_X.tocsr()

        # new_meta_X is grouped by label; copy its rows back into the
        # original document order.
        new_mX = (CSRMatrixFactory()
                  .set_last_col_idx(new_meta_X.shape[1] - 1)
                  .set_last_row_idx(new_meta_X.shape[0] - 1)
                  .get_csr_matrix().tolil())
        start_row = 0
        for label_doc_mask in label_doc_masks:
            num_rows = label_doc_mask.sum()
            new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
            start_row += num_rows

        new_mX = new_mX.tocsr()
        new_tdm = self._make_new_term_doc_matrix(self._X,
                                                 new_mX,
                                                 self._y,
                                                 self._term_idx_store,
                                                 self._category_idx_store,
                                                 new_metadata_idx_store,
                                                 self._y == self._y)
        return new_tdm