def __init__(self,
             X: csr_matrix,
             term_vocabulary: List[str],
             mX: Optional[csr_matrix] = None,
             y: Optional[np.ndarray] = None,
             category_names: Optional[List[str]] = None,
             metadata_vocabulary: Optional[List[str]] = None,
             text_df: Optional[pd.DataFrame] = None,
             text_col: Optional[str] = None,
             parsed_col: Optional[str] = None,
             category_col: Optional[str] = None,
             unigram_frequency_path: Optional[str] = None):
    '''
    Store the matrices, vocabularies and optional text columns on the instance.

    Parameters
    ----------
    X: csr_matrix; term-document frequency matrix; columns represent terms
        and rows documents
    term_vocabulary: List[str]; each entry corresponds to a column of X
    mX: Optional[csr_matrix]; metadata csr matrix. Only used when
        metadata_vocabulary is given; otherwise an empty (0, 0) matrix is
        stored instead.
    y: Optional[np.ndarray[int]]; indices of category names for each
        document. When omitted, every document is assigned to a single
        placeholder category named '_'.
    category_names: Optional[List[str]]; names of categories for y. Must be
        None when y is None, and must have one entry per distinct value of y
        otherwise.
    metadata_vocabulary: Optional[List[str]]; each entry corresponds to a
        column of mX
    text_df: Optional[pd.DataFrame]; data frame holding the document columns
        named below
    text_col: Optional[str]; name of the column containing the text of each
        document
    parsed_col: Optional[str]; name of the column containing the parsed text
        of each document
    category_col: Optional[str]; name of a column of text_df (stored as-is)
    unigram_frequency_path: Optional[str] (see TermDocMatrix)

    Raises
    ------
    AssertionError
        If the shapes, vocabularies or column names are inconsistent.
    '''
    assert X.shape[1] == len(term_vocabulary), \
        'X must have one column per entry of term_vocabulary'
    self.X = X
    self.term_idx_store = IndexStoreFromList.build(term_vocabulary)

    if y is None:
        assert category_names is None, \
            'category_names cannot be given without y'
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the
        # equivalent dtype.
        self.y = np.zeros(X.shape[0], dtype=int)
        self.category_idx_store = IndexStoreFromList.build(['_'])
    else:
        assert category_names is not None, \
            'category_names is required when y is given'
        assert len(category_names) == len(set(y)), \
            'category_names must have one entry per distinct value of y'
        self.y = y
        self.category_idx_store = IndexStoreFromList.build(category_names)

    if metadata_vocabulary is not None:
        assert mX is not None, \
            'mX is required when metadata_vocabulary is given'
        # Compare against the vocabulary's length: comparing the column
        # count to the list itself can never succeed.
        assert mX.shape[1] == len(metadata_vocabulary), \
            'mX must have one column per entry of metadata_vocabulary'
        self.mX = mX
        self.metadata_idx_store = IndexStoreFromList.build(metadata_vocabulary)
    else:
        self.mX = csr_matrix((0, 0))
        self.metadata_idx_store = IndexStore()

    self.text_df = text_df
    for column_name in (parsed_col, text_col, category_col):
        if column_name is not None:
            assert text_df is not None and column_name in text_df, \
                'column %r not found in text_df' % (column_name,)
    self.category_col = category_col
    self.text_col = text_col
    self.parsed_col = parsed_col
    self.unigram_frequency_path = unigram_frequency_path
def _get_build_kwargs(self):
    '''
    Assemble the build keyword arguments from the term frequency data frame.

    The frame's index supplies the terms and its columns supply the
    categories; the transposed values become X, so each category column is
    one row of X and y simply numbers those rows. Metadata is left empty.

    Returns
    -------
    dict
    '''
    freq_df = self.term_freq_df
    category_labels = freq_df.columns
    return {
        'X': csr_matrix(freq_df.values.T),
        'mX': csr_matrix((0, 0)),
        'y': np.array(range(len(category_labels))),
        'term_idx_store': IndexStoreFromList.build(freq_df.index.values),
        'metadata_idx_store': IndexStore(),
        'category_idx_store': IndexStoreFromList.build(category_labels),
        'unigram_frequency_path': self.unigram_frequency_path,
    }
def change_category_names(self, new_category_names):
    '''
    Build a new term-document matrix whose categories carry the given names.

    Parameters
    ----------
    new_category_names : list-like
        Replacement names; there must be exactly one per existing category.

    Returns
    -------
    The result of self._make_new_term_doc_matrix with a category index store
    built from new_category_names.

    Raises
    ------
    Exception
        If the number of names differs from the number of categories.
    '''
    num_passed = len(new_category_names)
    num_expected = self.get_num_categories()
    if num_passed != num_expected:
        message = ("The number of category names passed (%s) needs to equal "
                   "the number of categories in the corpus (%s).")
        raise Exception(message % (num_passed, num_expected))

    renamed_store = IndexStoreFromList.build(new_category_names)
    return self._make_new_term_doc_matrix(new_category_idx_store=renamed_store)
def _get_build_kwargs(self):
    '''
    Assemble the build keyword arguments from this object's attributes.

    X and y are passed through unchanged, the term index store is built from
    the feature vocabulary mapping, and the category index store from the
    category name list. Metadata is always empty: a (0, 0) matrix paired with
    a fresh IndexStore.

    Returns
    -------
    dict
    '''
    term_store = IndexStoreFromDict.build(self.feature_vocabulary)
    category_store = IndexStoreFromList.build(self.category_names)
    return {
        'X': self.X,
        'mX': csr_matrix((0, 0)),
        'y': self.y,
        'term_idx_store': term_store,
        'metadata_idx_store': IndexStore(),
        'category_idx_store': category_store,
        'unigram_frequency_path': self.unigram_frequency_path,
    }
def _get_build_kwargs(self):
    '''
    Collect the keyword arguments describing this object's matrices and
    index stores.

    The term-document matrix and labels are taken as-is; the term and
    category index stores are rebuilt from feature_vocabulary and
    category_names. No metadata is carried over: mX is an empty (0, 0)
    matrix and the metadata index store is empty.

    Returns
    -------
    dict
    '''
    build_kwargs = {'X': self.X, 'mX': csr_matrix((0, 0)), 'y': self.y}
    build_kwargs['term_idx_store'] = IndexStoreFromDict.build(self.feature_vocabulary)
    build_kwargs['metadata_idx_store'] = IndexStore()
    build_kwargs['category_idx_store'] = IndexStoreFromList.build(self.category_names)
    build_kwargs['unigram_frequency_path'] = self.unigram_frequency_path
    return build_kwargs
def recategorize(self, new_categories):
    '''
    Build a new term-document matrix whose documents are assigned to the
    given categories.

    Parameters
    ----------
    new_categories : array like
        String names of new categories. Length should be equal to number of
        documents.

    Returns
    -------
    TermDocMatrix
        Categories are indexed in order of first appearance in
        new_categories.

    Raises
    ------
    AssertionError
        If the number of category labels differs from the number of
        documents.
    '''
    assert len(new_categories) == self.get_num_docs(), \
        'new_categories must contain exactly one entry per document'

    # De-duplicate in first-appearance order. Iterating a set of strings
    # yields an order that changes between interpreter runs (hash
    # randomization), which made the category indices non-reproducible.
    unique_categories = list(dict.fromkeys(new_categories))
    new_category_idx_store = IndexStoreFromList.build(unique_categories)
    new_y = np.array(new_category_idx_store.getidxstrictbatch(new_categories))

    # Every document is kept, so the mask passed along is all True.
    keep_all_docs = np.ones(new_y.shape, dtype=bool)
    return self._make_new_term_doc_matrix(self._X,
                                          self._mX,
                                          new_y,
                                          self._term_idx_store,
                                          new_category_idx_store,
                                          self._metadata_idx_store,
                                          keep_all_docs)
def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
    '''
    Makes the metadata of a new TermDocMatrix a copy of the term-document
    matrix, except each term is prefixed by its document's label followed by
    the separator.

    :param doc_labels: list[str], should be the same size as the number of
        documents in the TermDocMatrix.
    :param separator: str, default is '_'
    :param replace_metadata: bool, accepted for backward compatibility but
        currently has no effect: the new metadata is always derived from the
        term-document matrix.
    :return: the object produced by self._make_new_term_doc_matrix, carrying
        the new metadata matrix and metadata index store
    '''
    assert len(doc_labels) == self.get_num_docs()
    doc_labels = np.array(doc_labels)
    terms_in_corpus = np.array(self._term_idx_store.values())
    ordered_doc_labels = sorted(set(doc_labels))
    X = self._X

    # Terms with a non-zero total count. This does not depend on the label,
    # so it is computed once rather than once per label.
    # NOTE(review): the mask is corpus-wide, so every label receives a
    # metadata column for every such term, even terms that never occur under
    # that label. A per-label mask (from label_X) may have been intended;
    # confirm before changing, since it would alter the metadata vocabulary.
    term_mask = np.asarray(X.sum(axis=0) > 0).ravel()
    kept_terms = terms_in_corpus[term_mask]

    # Pass 1: stack one block of rows per label, each block shifted right
    # past the columns of the labels before it (block-diagonal layout).
    # Rows are therefore grouped by label here, not in document order.
    new_metadata_list = []
    new_meta_X = None
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        label_X = X[label_doc_mask, :][:, term_mask]
        cols_to_pad = len(new_metadata_list)
        new_metadata_list += [doc_label + separator + term for term in kept_terms]
        if new_meta_X is None:
            new_meta_X = label_X
        else:
            # Empty block standing in for the columns of the earlier labels.
            label_X_pad = (CSRMatrixFactory()
                           .set_last_col_idx(cols_to_pad - 1)
                           .set_last_row_idx(int(label_doc_mask.sum()) - 1)
                           .get_csr_matrix())
            padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
            # Widen the rows accumulated so far to the new column count
            # before appending this label's rows underneath.
            new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
            new_meta_X = scipy.sparse.vstack([new_meta_X, padded_label_X])

    new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
    new_meta_X = new_meta_X.tocsr()

    # Pass 2: scatter the label-grouped rows back to the original document
    # positions. LIL format is used because rows are assigned in place.
    new_mX = (CSRMatrixFactory()
              .set_last_col_idx(new_meta_X.shape[1] - 1)
              .set_last_row_idx(new_meta_X.shape[0] - 1)
              .get_csr_matrix()
              .tolil())
    start_row = 0
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        num_rows = int(label_doc_mask.sum())
        new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
        start_row += num_rows
    new_mX = new_mX.tocsr()

    # `self._y == self._y` is an all-True mask: every document is retained.
    return self._make_new_term_doc_matrix(self._X,
                                          new_mX,
                                          self._y,
                                          self._term_idx_store,
                                          self._category_idx_store,
                                          new_metadata_idx_store,
                                          self._y == self._y)