def __init__(self, df, X, mX, y, term_idx_store, category_idx_store, metadata_idx_store,
             parsed_col, category_col, unigram_frequency_path=None):
    '''
    Remember the source data frame and its column names, then delegate the
    rest of the set-up to Corpus.__init__.

    Parameters
    ----------
    df : pd.DataFrame
        Contains parsed_col and metadata.
    X : csr_matrix
    mX : csr_matrix
    y : np.array
    term_idx_store : IndexStore
    category_idx_store : IndexStore
    metadata_idx_store : IndexStore (presumably, like the other index stores)
        Forwarded unchanged to Corpus.__init__.
    parsed_col : str
        Column in df containing parsed documents.
    category_col : str
        Column in df containing category.
    unigram_frequency_path : str
        None by default, path of unigram counts file.
    '''
    self._df, self._parsed_col, self._category_col = df, parsed_col, category_col
    # The parsed-document column is what Corpus.__init__ receives as its texts argument.
    parsed_docs = self._df[self._parsed_col]
    Corpus.__init__(
        self,
        X,
        mX,
        y,
        term_idx_store,
        category_idx_store,
        metadata_idx_store,
        parsed_docs,
        unigram_frequency_path,
    )
def __init__(self, df, X, mX, y, text_col, term_idx_store, category_idx_store, metadata_idx_store,
             unigram_frequency_path=None):
    '''
    Store the data frame and the text column key, then delegate the rest of
    the set-up to Corpus.__init__.

    Parameters
    ----------
    df : object indexable by text_col (presumably a pd.DataFrame)
        Source of the raw texts; kept on the instance as ``_df``.
    X : csr_matrix
        term document matrix
    mX : csr_matrix
        metadata-document matrix
    y : np.array
        category index array
    text_col : key into df (presumably a column name)
        ``df[text_col]`` is handed to Corpus.__init__ as the raw texts.
    term_idx_store : IndexStore
        Term indices
    category_idx_store : IndexStore
        Category indices
    metadata_idx_store : IndexStore
        Document metadata indices
    unigram_frequency_path : str or None
        Path to term frequency file.
    '''
    self._df, self._text_col = df, text_col
    raw_texts = df[text_col]
    Corpus.__init__(
        self,
        X,
        mX,
        y,
        term_idx_store,
        category_idx_store,
        metadata_idx_store,
        raw_texts,
        unigram_frequency_path,
    )
def _apply_pipeline_and_get_build_instance(self, X_factory, mX_factory, category_idx_store, df, parse_pipeline,
                                           term_idx_store, metadata_idx_store, y):
    '''
    Run the parse pipeline over every row of df and assemble a Corpus.

    Parameters
    ----------
    X_factory, mX_factory
        Passed to build_sparse_matrices to produce X and mX.
    category_idx_store, term_idx_store, metadata_idx_store
        Forwarded unchanged to the Corpus constructor.
    df : object with ``apply`` and column indexing (presumably a pd.DataFrame)
        Rows are fed to parse_pipeline.parse; ``df[self._text_col]`` supplies
        the raw texts.
    parse_pipeline
        Object whose ``parse`` method is applied to each row.
    y
        Converted to an np.array after parsing.

    Returns
    -------
    Corpus
    '''
    # The result of apply is discarded: the call matters only for whatever
    # side effects parse has (presumably filling the factories and y -- confirm).
    df.apply(parse_pipeline.parse, axis=1)
    # Convert only after the parse pass, in case parsing mutates y.
    category_ids = np.array(y)
    term_doc_matrix, metadata_doc_matrix = build_sparse_matrices(category_ids, X_factory, mX_factory)
    return Corpus(
        term_doc_matrix,
        metadata_doc_matrix,
        category_ids,
        term_idx_store,
        category_idx_store,
        metadata_idx_store,
        df[self._text_col],
    )