def __init__(self, X, mX, y, term_idx_store, category_idx_store, metadata_idx_store, raw_texts,
             unigram_frequency_path=None):
    '''
    Term-document matrix that additionally retains the raw document texts.

    Parameters
    ----------
    X : csr_matrix
        term document matrix
    mX : csr_matrix
        metadata-document matrix
    y : np.array
        category index array
    term_idx_store : IndexStore
        Term indices
    category_idx_store : IndexStore
        Category indices
    metadata_idx_store : IndexStore
        Document metadata indices
    raw_texts : np.array or pd.Series
        Raw texts
    unigram_frequency_path : str or None
        Path to term frequency file.
    '''
    # Delegate all matrix/index wiring to the base class; this subclass
    # only adds storage of the original document texts.
    TermDocMatrix.__init__(self, X, mX, y, term_idx_store, category_idx_store,
                           metadata_idx_store, unigram_frequency_path)
    self._raw_texts = raw_texts
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a term-document matrix from labeled, pre-tokenized text.

    Parameters
    ----------
    category_text_iter : iterator of (string category name, one line per
        sentence, whitespace-delimited text) pairs

    Returns
    -------
    TermDocMatrix
        Counts unigrams and bigrams per document; the metadata matrix is
        left empty.
    '''
    y = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    for doc_i, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_counts = Counter()
        # NOTE(review): strip(string.punctuation) only trims punctuation at
        # the very ends of the whole document, not per token — confirm this
        # is intended rather than a per-token cleanup.
        for sentence in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sentence.strip().split()
            bigrams = [' '.join(pair) for pair in zip(unigrams[:-1], unigrams[1:])]
            for term in unigrams + bigrams:
                term_counts[term_idx_store.getidx(term)] += 1
        for term_i, count in term_counts.items():
            X_factory[doc_i, term_i] = count
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
def build(self):
    '''
    Assemble a TermDocMatrix from the builder's accumulated state.

    Returns
    -------
    TermDocMatrix
    '''
    # All constructor arguments are gathered by the helper; just expand them.
    return TermDocMatrix(**self._get_build_kwargs())
def _apply_pipeline_and_get_build_instance(self,
                                           X_factory,
                                           mX_factory,
                                           category_idx_store,
                                           df,
                                           parse_pipeline,
                                           term_idx_store,
                                           metadata_idx_store,
                                           y):
    '''
    Run the parse pipeline over each row of ``df`` and assemble the result.

    ``parse_pipeline.parse`` is applied for its side effects: it populates
    the matrix factories and index stores row by row. The filled factories
    are then converted to sparse matrices and packed into a TermDocMatrix.

    Returns
    -------
    TermDocMatrix
    '''
    df.apply(parse_pipeline.parse, axis=1)
    labels = np.array(y)
    X, mX = self._build_sparse_matrices(labels, X_factory, mX_factory)
    return TermDocMatrix(X, mX, labels,
                         term_idx_store, category_idx_store, metadata_idx_store)
def build(self):
    '''
    Build the most specific corpus type the accumulated state supports.

    Returns a ParsedCorpus when a parsed column is present, a CorpusDF when
    only a raw-text column is present, and a plain TermDocMatrix when there
    is no backing dataframe.

    Returns
    -------
    CorpusDF
    '''
    # Arguments shared by every corpus type.
    common = dict(X=self.X,
                  mX=self.mX,
                  y=self.y,
                  term_idx_store=self.term_idx_store,
                  category_idx_store=self.category_idx_store,
                  metadata_idx_store=self.metadata_idx_store,
                  unigram_frequency_path=self.unigram_frequency_path)
    if self.text_df is not None:
        if self.parsed_col is not None:
            if self.category_col is None:
                # Materialize a Category column from the label indices so
                # ParsedCorpus always has a category column to point at.
                self.text_df = self.text_df.assign(
                    Category=self.category_idx_store.getvalbatch(self.y))
                self.category_col = 'Category'
            return ParsedCorpus(df=self.text_df,
                                parsed_col=self.parsed_col,
                                category_col=self.category_col,
                                **common)
        if self.text_col is not None:
            return CorpusDF(df=self.text_df,
                            text_col=self.text_col,
                            **common)
    return TermDocMatrix(**common)
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
    '''
    Build a TermDocMatrix from labeled spaCy documents.

    Parameters
    ----------
    category_doc_iter : iterator of (string category name,
        spacy.tokens.doc.Doc) pairs

    Returns
    ----------
    t : TermDocMatrix
    '''
    # Fresh index stores; the extraction helper fills them as a side effect.
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
        category_doc_iter,
        category_idx_store,
        term_idx_store,
        metadata_idx_store)
    return TermDocMatrix(X, mX, y,
                         term_idx_store=term_idx_store,
                         category_idx_store=category_idx_store,
                         metadata_idx_store=metadata_idx_store)