Esempio n. 1
0
    def __init__(self,
                 df,
                 X,
                 mX,
                 y,
                 term_idx_store,
                 category_idx_store,
                 metadata_idx_store,
                 parsed_col,
                 category_col,
                 unigram_frequency_path=None):
        '''
        Keep the source data frame and column names, then initialize Corpus.

        Parameters
        ----------
        df : pd.DataFrame
            Contains parsed_col and metadata.
        X : csr_matrix
        mX : csr_matrix
        y : np.array
        term_idx_store : IndexStore
        category_idx_store : IndexStore
        metadata_idx_store
            Passed straight through to Corpus.__init__.
            NOTE(review): presumably an IndexStore like the other two
            stores -- confirm.
        parsed_col : str
            Column in df containing parsed documents; df[parsed_col] is
            what gets handed to Corpus.__init__ as the documents.
        category_col : str
            Column in df containing category.  This constructor only
            records it on the instance.
        unigram_frequency_path : str, None by default
            Path of unigram counts file.
        '''
        # Instance state is set before delegating to Corpus.__init__.
        self._df, self._parsed_col, self._category_col = (
            df, parsed_col, category_col)
        parsed_docs = self._df[self._parsed_col]
        Corpus.__init__(
            self, X, mX, y,
            term_idx_store, category_idx_store, metadata_idx_store,
            parsed_docs,
            unigram_frequency_path)
Esempio n. 2
0
 def __init__(self,
              df,
              X,
              mX,
              y,
              text_col,
              term_idx_store,
              category_idx_store,
              metadata_idx_store,
              unigram_frequency_path=None):
     '''
     Record the data frame and its text column, then initialize Corpus.

     Parameters
     ----------
     df : presumably pd.DataFrame (TODO confirm)
         Source table.  It is kept on the instance and indexed as
         df[text_col] to obtain the raw texts.
     X : csr_matrix
         Term-document matrix.
     mX : csr_matrix
         Metadata-document matrix.
     y : np.array
         Category index array.
     text_col : key into df
         Selects the column of df that holds the raw texts.
     term_idx_store : IndexStore
         Term indices.
     category_idx_store : IndexStore
         Category indices.
     metadata_idx_store : IndexStore
         Document metadata indices.
     unigram_frequency_path : str or None
         Path to term frequency file.
     '''
     self._df = df
     self._text_col = text_col
     raw_texts = df[text_col]
     Corpus.__init__(
         self, X, mX, y,
         term_idx_store, category_idx_store, metadata_idx_store,
         raw_texts,
         unigram_frequency_path)
Esempio n. 3
0
    def __init__(self,
                 df,
                 X,
                 mX,
                 y,
                 term_idx_store,
                 category_idx_store,
                 metadata_idx_store,
                 parsed_col,
                 category_col,
                 unigram_frequency_path=None):
        '''
        Store the data frame and column names, then delegate to Corpus.

        Parameters
        ----------
        df : pd.DataFrame
            Contains parsed_col and metadata.
        X : csr_matrix
        mX : csr_matrix
        y : np.array
        term_idx_store : IndexStore
        category_idx_store : IndexStore
        metadata_idx_store
            Forwarded unchanged to Corpus.__init__.
            NOTE(review): presumably an IndexStore -- confirm.
        parsed_col : str
            Column in df containing parsed documents.
        category_col : str
            Column in df containing category; stored on the instance and
            not otherwise used by this constructor.
        unigram_frequency_path : str, None by default
            Path of unigram counts file.
        '''
        self._df = df
        self._parsed_col = parsed_col
        self._category_col = category_col

        # The parsed-document column is what Corpus.__init__ receives as
        # its documents argument.
        documents = self._df[self._parsed_col]
        Corpus.__init__(self,
                        X,
                        mX,
                        y,
                        term_idx_store,
                        category_idx_store,
                        metadata_idx_store,
                        documents,
                        unigram_frequency_path)
Esempio n. 4
0
 def _apply_pipeline_and_get_build_instance(self, X_factory, mX_factory,
                                            category_idx_store, df,
                                            parse_pipeline, term_idx_store,
                                            metadata_idx_store, y):
     '''
     Run the parse pipeline over df and assemble a Corpus from the result.

     parse_pipeline.parse is applied to df with axis=1 and the value
     returned by apply is discarded.
     NOTE(review): presumably parse fills X_factory / mX_factory as a
     side effect -- confirm against the pipeline class.

     Returns
     -------
     Corpus
         Built from the matrices returned by build_sparse_matrices, y as
         an np.array, the three index stores, and df[self._text_col] as
         the raw texts.
     '''
     df.apply(parse_pipeline.parse, axis=1)
     labels = np.array(y)
     term_doc_matrix, metadata_doc_matrix = build_sparse_matrices(
         labels, X_factory, mX_factory)
     texts = df[self._text_col]
     return Corpus(term_doc_matrix,
                   metadata_doc_matrix,
                   labels,
                   term_idx_store,
                   category_idx_store,
                   metadata_idx_store,
                   texts)
Esempio n. 5
0
 def __init__(self,
              df,
              X,
              mX,
              y,
              text_col,
              term_idx_store,
              category_idx_store,
              metadata_idx_store,
              unigram_frequency_path=None):
     '''
     Remember df and text_col on the instance and initialize Corpus.

     Parameters
     ----------
     df : presumably pd.DataFrame (TODO confirm)
         Source table; df[text_col] supplies the raw texts given to
         Corpus.__init__.
     X : csr_matrix
         Term-document matrix.
     mX : csr_matrix
         Metadata-document matrix.
     y : np.array
         Category index array.
     text_col : key into df
         Identifies the column of df holding the raw texts.
     term_idx_store : IndexStore
         Term indices.
     category_idx_store : IndexStore
         Category indices.
     metadata_idx_store : IndexStore
         Document metadata indices.
     unigram_frequency_path : str or None
         Path to term frequency file.
     '''
     self._df, self._text_col = df, text_col
     texts = df[text_col]
     Corpus.__init__(self, X, mX, y,
                     term_idx_store, category_idx_store,
                     metadata_idx_store,
                     texts, unigram_frequency_path)