def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		'''Wraps the supplied matrices and index stores in a new corpus object.

		The data frame handed to the new corpus is ``self._df`` indexed by
		``new_y_mask``; every other ``new_*`` argument is forwarded unchanged.

		Returns
		-------
		ParsedCorpus when ``self._parsed_col`` is set and is contained in
		``self._df``, otherwise CorpusDF.
		'''
		parsed_col_in_df = (self._parsed_col is not None
		                    and self._parsed_col in self._df)
		if not parsed_col_in_df:
			masked_df = self._df[new_y_mask]
			# The text column, indexed by the same mask, is passed as the
			# final positional argument of CorpusDF.
			masked_texts = self._df[self._text_col][new_y_mask]
			return CorpusDF(masked_df,
			                new_X,
			                new_mX,
			                new_y,
			                self._text_col,
			                new_term_idx_store,
			                new_category_idx_store,
			                new_metadata_idx_store,
			                masked_texts)
		masked_df = self._df[new_y_mask]
		return ParsedCorpus(masked_df,
		                    new_X,
		                    new_mX,
		                    new_y,
		                    new_term_idx_store,
		                    new_category_idx_store,
		                    new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)
Beispiel #2
0
 def _make_new_term_doc_matrix(self, new_X, new_mX, new_y,
                               new_term_idx_store, new_category_idx_store,
                               new_metadata_idx_store, new_y_mask):
     '''Wraps the supplied matrices and index stores in a new ParsedCorpus.

     The data frame given to the new corpus is ``self._df`` indexed by
     ``new_y_mask``; the parsed and category column names are carried
     over from this instance.

     Returns
     -------
     ParsedCorpus
     '''
     masked_df = self._df[new_y_mask]
     return ParsedCorpus(
         masked_df,
         new_X,
         new_mX,
         new_y,
         new_term_idx_store,
         new_category_idx_store,
         new_metadata_idx_store,
         self._parsed_col,
         self._category_col)
	def build(self):
		'''Constructs the term doc matrix and wraps it in a corpus object.

		Returns
		-------
		ParsedCorpus when ``self._parsed_col`` is set and is contained in
		``self._df``, otherwise CorpusDF.
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		# Run for its side effects only; the result of apply is discarded.
		self._df.apply(self._add_to_x_factory, axis=1)
		last_row_idx = len(self._y) - 1
		self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		if self._parsed_col is None or self._parsed_col not in self._df:
			# No usable parsed column: build the text-column based corpus.
			return CorpusDF(self._df,
			                self._X,
			                self._mX,
			                self._y,
			                self._text_col,
			                self._term_idx_store,
			                self._category_idx_store,
			                self._metadata_idx_store)
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)
    def build(self, show_progress=False):
        '''Constructs the term doc matrix.

        Parameters
        ----------
        show_progress : bool, default False
            When exactly ``True``, rows are processed with
            ``DataFrame.progress_apply`` instead of ``DataFrame.apply``.
            NOTE(review): ``progress_apply`` is presumably registered by
            ``tqdm.pandas()`` elsewhere -- confirm before enabling.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        y = self._get_y_and_populate_category_idx_store(
            self._df[self._category_col])
        # Both apply variants are run for their side effects only; the
        # returned object is discarded.
        if show_progress is True:
            self._df.progress_apply(self._add_to_x_factory, axis=1)
        else:
            self._df.apply(self._add_to_x_factory, axis=1)
        last_row_idx = len(y) - 1
        # Build the metadata matrix once and hand the same object to the
        # corpus instead of constructing an identical matrix a second time.
        self._mX = self._mX_factory.set_last_row_idx(
            last_row_idx).get_csr_matrix()
        return ParsedCorpus(
            df=self._df,
            X=self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix(),
            mX=self._mX,
            y=y,
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col)
    def build(self):
        '''Constructs the term doc matrix and wraps it in a ParsedCorpus.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        # Run for its side effects only; the result of apply is discarded.
        self._df.apply(self._add_to_x_factory, axis=1)
        self._X = self._X_factory.get_csr_matrix()
        self._mX = self._mX_factory.get_csr_matrix()
        corpus_args = (
            self._df,
            self._X,
            self._mX,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            self._metadata_idx_store,
            self._parsed_col,
            self._category_col,
        )
        return ParsedCorpus(*corpus_args)
	def _make_new_term_doc_matrix(self,
	                              new_X=None,
	                              new_mX=None,
	                              new_y=None,
	                              new_term_idx_store=None,
	                              new_category_idx_store=None,
	                              new_metadata_idx_store=None,
	                              new_y_mask=None):
		'''Builds a new ParsedCorpus, reusing this instance's state for omitted arguments.

		Every argument left as ``None`` falls back to the corresponding
		attribute of this instance (``self._X``, ``self._mX``, ``self._y``,
		``self._term_idx_store``, ``self._category_idx_store``,
		``self._metadata_idx_store``).

		Parameters
		----------
		new_y_mask : optional
			When given, ``self._df`` is indexed by it before being passed to
			the new corpus; when ``None`` the data frame is passed as is.

		Returns
		-------
		ParsedCorpus
		'''
		# Compare against None explicitly: truth-testing a mask raises
		# ValueError for multi-element NumPy arrays / pandas Series, and a
		# one-element all-False array would silently skip the filtering.
		return ParsedCorpus(self._df[new_y_mask] if new_y_mask is not None else self._df,
		                    self._X if new_X is None else new_X,
		                    self._mX if new_mX is None else new_mX,
		                    self._y if new_y is None else new_y,
		                    self._term_idx_store if new_term_idx_store is None else new_term_idx_store,
		                    self._category_idx_store if new_category_idx_store is None else new_category_idx_store,
		                    self._metadata_idx_store if new_metadata_idx_store is None else new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)
Beispiel #7
0
 def build(self):
     '''Builds the most specific corpus type the configured fields allow.

     Returns
     -------
     ParsedCorpus when both ``text_df`` and ``parsed_col`` are set,
     CorpusDF when ``text_df`` and ``text_col`` are set (and
     ``parsed_col`` is not), otherwise TermDocMatrix.
     '''
     # Keyword arguments common to all three constructors.
     shared_kwargs = dict(
         X=self.X,
         mX=self.mX,
         y=self.y,
         term_idx_store=self.term_idx_store,
         category_idx_store=self.category_idx_store,
         metadata_idx_store=self.metadata_idx_store,
         unigram_frequency_path=self.unigram_frequency_path)
     has_text_df = self.text_df is not None
     if has_text_df and self.parsed_col is not None:
         if self.category_col is None:
             # No category column configured: add a 'Category' column
             # derived from y via the category index store.
             self.text_df = self.text_df.assign(
                 Category=self.category_idx_store.getvalbatch(self.y))
             self.category_col = 'Category'
         return ParsedCorpus(
             df=self.text_df,
             parsed_col=self.parsed_col,
             category_col=self.category_col,
             **shared_kwargs)
     if has_text_df and self.text_col is not None:
         return CorpusDF(
             df=self.text_df,
             text_col=self.text_col,
             **shared_kwargs)
     return TermDocMatrix(**shared_kwargs)