def __init__(self,
	             df,
	             category_col,
	             text_col,
	             feature_col,
	             metadata_col=None,
	             parsed_col=None):
		'''
		Remember the source data frame and the relevant column names, and
		create the empty index stores and sparse-matrix factories that a
		later build step fills in.

		Parameters
		----------
		df : pd.DataFrame
			source data; the result of df.reset_index() is what gets stored
		category_col : str
			name of the category column in df
		text_col : str
			name of the column which contains each document's raw text
		feature_col : str
			name of the column in df holding a feature dictionary
		metadata_col : str, optional
			name of the column in df holding a metadata dictionary
		parsed_col : str, optional
			name of the column in df holding parsed documents
		'''
		self._df = df.reset_index()

		# Column names are stored verbatim.
		self._category_col = category_col
		self._text_col = text_col
		self._feature_col = feature_col
		self._metadata_col = metadata_col
		self._parsed_col = parsed_col

		# One index store per vocabulary (categories, terms, metadata).
		self._category_idx_store = IndexStore()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()

		# One matrix factory each for the term matrix (X) and metadata matrix (mX).
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
    def build(self):
        '''Build the term-document matrix for this object's data frame.

        Fresh matrix factories and index stores are created, handed to a
        ParsePipelineFactoryWithoutCategories together with the NLP object
        from self.get_nlp(), and then passed with the cleaned data frame to
        self._apply_pipeline_and_get_build_instance.

        Returns
        -------
        TermDocMatrix
        '''
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        term_index, meta_index = IndexStore(), IndexStore()

        pipeline = ParsePipelineFactoryWithoutCategories(
            self.get_nlp(),
            term_factory,
            meta_factory,
            term_index,
            meta_index,
            self)
        cleaned_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
        return self._apply_pipeline_and_get_build_instance(
            term_factory,
            meta_factory,
            cleaned_df,
            pipeline,
            term_index,
            meta_index)
コード例 #3
0
    def add_doc_names_as_metadata(self, doc_names):
        '''
        Attach one metadata entry per document, named after that document.

        :param doc_names: array-like[str], the name of each document, in document order
        :return: Corpus-like object with doc names as metadata. If two documents share the same
        name, the occurrence number is appended to each, e.g. "name (1)", "name (2)".
        :raises Exception: if len(doc_names) differs from the number of documents
        '''
        num_docs = self.get_num_docs()
        if len(doc_names) != num_docs:
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document." %
                            (len(doc_names), num_docs))

        # Materialise once so names are always read by position. Indexing
        # doc_names[i] directly is label-based for a pandas Series, which
        # breaks when its index is not 0..n-1.
        names = list(doc_names)
        doc_names_counter = collections.Counter(np.array(names))
        metafact = CSRMatrixFactory()
        metaidxstore = IndexStore()
        doc_id_uses = collections.Counter()
        for i, doc_id in enumerate(names):
            if doc_names_counter[doc_id] > 1:
                # Repeated name: number each occurrence, starting at 1.
                doc_id_uses[doc_id] += 1
                doc_name_idx = metaidxstore.getidx(
                    '%s (%s)' % (doc_id, doc_id_uses[doc_id]))
            else:
                doc_name_idx = metaidxstore.getidx(doc_id)
            # NOTE(review): the name's index is stored as the cell *value* at
            # (i, i); confirm this is the layout add_metadata expects.
            metafact[i, i] = doc_name_idx
        return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
コード例 #4
0
	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=None):

		'''
		Store the data frame and column names, and create the empty index
		stores and matrix factories used when the corpus is built.

		Parameters
		----------
		df : pd.DataFrame
			contains category_col and parsed_col, where parsed_col is entirely spacy docs;
			the result of df.reset_index() is what gets stored
		category_col : str
			name of category column in df
		parsed_col : str
			name of spacy parsed column in df
		feats_from_spacy_doc : FeatsFromSpacyDoc, optional
			feature extractor; a new FeatsFromSpacyDoc() is created when omitted
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		# Create the default extractor per instance. A default written in the
		# signature is evaluated once and then shared by every instance.
		if feats_from_spacy_doc is None:
			feats_from_spacy_doc = FeatsFromSpacyDoc()
		self._feats_from_spacy_doc = feats_from_spacy_doc
コード例 #5
0
    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=None):
        '''
        Store the data frame and column names, and create the empty index
        stores and matrix factories used when the corpus is built.

        Parameters
        ----------
        df : pd.DataFrame
            contains category_col and parsed_col, where parsed_col is entirely spacy docs;
            the result of df.reset_index() is what gets stored
        category_col : str
            name of category column in df
        parsed_col : str
            name of spacy parsed column in df
        feats_from_spacy_doc : FeatsFromSpacyDoc, optional
            feature extractor; a new FeatsFromSpacyDoc() is created when omitted
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        # Create the default extractor per instance. A default written in the
        # signature is evaluated once and then shared by every instance.
        if feats_from_spacy_doc is None:
            feats_from_spacy_doc = FeatsFromSpacyDoc()
        self._feats_from_spacy_doc = feats_from_spacy_doc
コード例 #6
0
 def __init__(self,
              df,
              parsed_col,
              feat_and_offset_getter,
              category_col=None):
     '''
     Store the data frame, column names and feature/offset getter, and
     create the empty containers that a later build step fills in.

     Parameters
     ----------
     df : pd.DataFrame
         contains parsed_col (and optionally category_col), where parsed_col
         is entirely spacy docs; the result of df.reset_index() is stored
     parsed_col : str
         name of spacy parsed column in df
     feat_and_offset_getter : object
         stored as-is for later use; its interface is not visible in this method
     category_col : str, Optional
         name of category column in df; if None, all category names will be '_'
     '''
     self._df = df.reset_index()

     # Inputs kept verbatim.
     self._parsed_col = parsed_col
     self._category_col = category_col
     self._feat_and_offset_getter = feat_and_offset_getter

     # Vocabularies for categories, terms and metadata.
     self._category_idx_store = IndexStore()
     self._term_idx_store = IndexStore()
     self._metadata_idx_store = IndexStore()

     # Factories for the term matrix (X) and the metadata matrix (mX).
     self._X_factory = CSRMatrixFactory()
     self._mX_factory = CSRMatrixFactory()

     # Offset lookups start out empty.
     self._term_offsets = dict()
     self._metadata_offsets = dict()
コード例 #7
0
class CorpusFromParsedDocuments(object):
	'''Builds a ParsedCorpus from a data frame whose parsed column holds spacy docs.'''

	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=None):

		'''
		Parameters
		----------
		df : pd.DataFrame
			contains category_col and parsed_col, where parsed_col is entirely spacy docs;
			the result of df.reset_index() is what gets stored
		category_col : str
			name of category column in df
		parsed_col : str
			name of spacy parsed column in df
		feats_from_spacy_doc : FeatsFromSpacyDoc, optional
			feature extractor; a new FeatsFromSpacyDoc() is created when omitted
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		# Create the default extractor per instance. A default written in the
		# signature is evaluated once and then shared by every instance.
		if feats_from_spacy_doc is None:
			feats_from_spacy_doc = FeatsFromSpacyDoc()
		self._feats_from_spacy_doc = feats_from_spacy_doc

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		# Fills both matrix factories one data-frame row at a time.
		self._df.apply(self._add_to_x_factory, axis=1)
		# Pin the row count to the number of labels before materialising.
		last_row_idx = len(self._y) - 1
		self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)

	def _get_y_and_populate_category_idx_store(self):
		'''Return an array of category indices, one per row, registering each category in the category index store.'''
		return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		'''Record one row's term counts and document metadata in the matrix factories, keyed by row.name.'''
		parsed_text = row[self._parsed_col]
		for term, count in self._feats_from_spacy_doc.get_feats(parsed_text).items():
			term_idx = self._term_idx_store.getidx(term)
			self._X_factory[row.name, term_idx] = count
		for meta, val in self._feats_from_spacy_doc.get_doc_metadata(parsed_text).items():
			meta_idx = self._metadata_idx_store.getidx(meta)
			self._mX_factory[row.name, meta_idx] = val
コード例 #8
0
    def init_term_doc_matrix_variables():
        '''Create the empty containers used while assembling a term-document matrix.

        Returns
        -------
        tuple
            (X_factory, mX_factory, category_idx_store, term_idx_store,
            metadata_idx_store, y): two new CSRMatrixFactory objects, three
            new IndexStore objects, and an empty list.
        '''
        matrix_factories = (CSRMatrixFactory(), CSRMatrixFactory())
        index_stores = (IndexStore(), IndexStore(), IndexStore())
        return matrix_factories + index_stores + ([],)
コード例 #9
0
    def build(values):
        '''
        Create an IndexStore pre-populated with values, indexed in order.

        Parameters
        ----------
        values: iterable of terms; the i-th item receives index i

        Returns
        -------
        IndexStore
        '''
        # Copy once. Storing the caller's list directly would alias it with
        # the store's internal list, so a change to either would silently
        # change the other.
        vals = list(values)
        idxstore = IndexStore()
        idxstore._i2val = vals
        idxstore._val2i = {term: i for i, term in enumerate(vals)}
        idxstore._next_i = len(vals)
        return idxstore
コード例 #10
0
 def test_main(self):
     # Covers the basic IndexStore API: index assignment, reverse lookup,
     # membership, and enumeration.
     store = IndexStore()
     # Indices are expected in first-seen order and stable on repeat lookups.
     for value, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
         self.assertEqual(store.getidx(value), expected_idx)
     for idx, expected_value in enumerate('ab'):
         self.assertEqual(store.getval(idx), expected_value)
     # Membership is tested on values, not on indices.
     self.assertTrue('a' in store)
     self.assertFalse('c' in store)
     self.assertEqual(set(store.values()), {'a', 'b'})
     self.assertFalse(0 in store)
     self.assertTrue(store.hasidx(0))
     self.assertFalse(store.hasidx(2))
     self.assertEqual(store.getnumvals(), 2)
     self.assertEqual(list(store.items()), list(enumerate('ab')))
コード例 #11
0
	def build(values):
		'''
		Create an IndexStore pre-populated with values, indexed in order.

		Parameters
		----------
		values: iterable of terms; the i-th item receives index i

		Returns
		-------
		IndexStore
		'''
		# Materialise once. The input used to be traversed three times (list,
		# enumerate, len), which breaks for iterators and generators.
		vals = list(values)
		idxstore = IndexStore()
		idxstore._i2val = vals
		idxstore._val2i = {term: i for i, term in enumerate(vals)}
		idxstore._next_i = len(vals)
		return idxstore
コード例 #12
0
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a TermDocMatrix of unigram and bigram counts from pre-tokenized text.

    Each text is stripped of leading/trailing punctuation, lower-cased and
    split into lines; every line is split on whitespace into unigrams, and
    adjacent unigrams within a line are joined by a space to form bigrams.

    Parameters
    ----------
    category_text_iter : iterator of (string category name, text) pairs, where
        text has one sentence per line and whitespace-delimited tokens

    Returns
    -------
    TermDocMatrix
    '''
    X_factory, mX_factory = CSRMatrixFactory(), CSRMatrixFactory()
    term_idx_store, category_idx_store = IndexStore(), IndexStore()
    metadata_idx_store = IndexStore()  # stays empty; no metadata is extracted here
    labels = []

    for doc_idx, (category, text) in enumerate(category_text_iter):
        labels.append(category_idx_store.getidx(category))
        counts = Counter()
        normalized = text.strip(string.punctuation).lower()
        for line in normalized.split('\n'):
            tokens = line.strip().split()
            pairs = [' '.join(pair) for pair in zip(tokens[:-1], tokens[1:])]
            # Unigrams are registered before bigrams, line by line.
            counts.update(term_idx_store.getidx(term) for term in tokens + pairs)
        for term_idx, count in counts.items():
            X_factory[doc_idx, term_idx] = count

    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(labels),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
コード例 #13
0
 def test_getidxstrict(self):
     # getidxstrict is expected to raise KeyError for a value never added.
     store = IndexStore()
     for value, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
         self.assertEqual(store.getidx(value), expected_idx)
     self.assertRaises(KeyError, store.getidxstrict, 'c')
コード例 #14
0
 def test_batch_delete(self):
     # Checks batch deletion by value and by index: both are expected to
     # return a new store and leave the original store's indices untouched.
     def expect_indices(store, value_index_pairs):
         # getidx may add unseen values, so the order of the pairs matters.
         for value, idx in value_index_pairs:
             self.assertEqual(store.getidx(value), idx)

     original = IndexStore()
     expect_indices(original, [('a', 0), ('b', 1), ('c', 2), ('d', 3)])

     # Deleting a value that is absent ('e') must fail.
     self.assertRaises(KeyError, original.batch_delete_vals, ['e', 'c'])
     pruned = original.batch_delete_vals(['b', 'c'])
     expect_indices(pruned, [('a', 0), ('c', 2), ('e', 3)])
     expect_indices(original, [('d', 3), ('c', 2), ('b', 1), ('a', 0)])

     # Deleting an index that is absent (5) must fail.
     self.assertRaises(ValueError, original.batch_delete_idx, [5, 1])
     pruned = original.batch_delete_idx([2, 1])
     expect_indices(pruned, [('a', 0), ('c', 2), ('e', 3)])
コード例 #15
0
    def add_doc_names_as_metadata(self, doc_names):
        '''
        Attach one metadata entry per document, named after that document.

        :param doc_names: array-like[str], the name of each document, in document order
        :return: Corpus-like object with doc names as metadata. If two documents share the same
        name, the occurrence number is appended to each, e.g. "name (1)", "name (2)".
        :raises Exception: if len(doc_names) differs from the number of documents
        '''
        num_docs = self.get_num_docs()
        if len(doc_names) != num_docs:
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document." % (len(doc_names), num_docs))

        # Materialise once so names are always read by position. Indexing
        # doc_names[i] directly is label-based for a pandas Series, which
        # breaks when its index is not 0..n-1.
        names = list(doc_names)
        doc_names_counter = collections.Counter(np.array(names))
        metafact = CSRMatrixFactory()
        metaidxstore = IndexStore()
        doc_id_uses = collections.Counter()
        for i, doc_id in enumerate(names):
            if doc_names_counter[doc_id] > 1:
                # Repeated name: number each occurrence, starting at 1.
                doc_id_uses[doc_id] += 1
                doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
            else:
                doc_name_idx = metaidxstore.getidx(doc_id)
            # NOTE(review): the name's index is stored as the cell *value* at
            # (i, i); confirm this is the layout add_metadata expects.
            metafact[i, i] = doc_name_idx
        return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
コード例 #16
0
    def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Build a TermDocMatrix from (category, parsed document) pairs.

        New, empty index stores are created for terms, categories and
        metadata, passed to
        self._get_features_and_labels_from_documents_and_indexes, and then
        handed to the resulting TermDocMatrix.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        ----------
        t : TermDocMatrix
        '''
        stores = dict(term_idx_store=IndexStore(),
                      category_idx_store=IndexStore(),
                      metadata_idx_store=IndexStore())
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
            category_doc_iter,
            stores['category_idx_store'],
            stores['term_idx_store'],
            stores['metadata_idx_store'])
        return TermDocMatrix(X, mX, y, **stores)
コード例 #17
0
 def test_getidxstrictbatch(self):
     # getidxstrictbatch is expected to return indices in request order,
     # repeats included.
     store = IndexStore()
     for expected_idx, value in enumerate('abcdef'):
         self.assertEqual(store.getidx(value), expected_idx)
     looked_up = store.getidxstrictbatch(['b', 'f', 'b', 'a'])
     self.assertEqual(looked_up, [1, 5, 1, 0])
コード例 #18
0
    def test_batch_delete_extra(self):
        # After deleting values, the survivors are expected to be renumbered
        # contiguously; deleting nothing is expected to leave the items unchanged.
        store = IndexStore()
        for expected_idx, value in enumerate('abcdef'):
            self.assertEqual(store.getidx(value), expected_idx)

        pruned = store.batch_delete_vals(['b', 'e'])
        self.assertEqual(list(pruned.items()), list(enumerate('acdf')))

        unchanged = pruned.batch_delete_vals([])
        self.assertEqual(list(pruned.items()), list(unchanged.items()))
コード例 #19
0
ファイル: ParsedCorpus.py プロジェクト: zzmjohn/scattertext
    def term_group_freq_df(self, group_col):
        # type: (str) -> pd.DataFrame
        '''
        Returns a dataframe indexed on the number of groups a term occurred in.

        The term matrix is first re-keyed by group, its positive entries are
        set to one, and the group rows are then re-keyed by category before
        the frequency data frame is computed.

        Parameters
        ----------
        group_col
            passed through to self._get_group_docids_and_index_store to
            identify each document's group

        Returns
        -------
        pd.DataFrame
        '''
        group_idx_store = IndexStore()
        X = self._X
        group_idx_to_cat_idx, row_group_cat \
            = self._get_group_docids_and_index_store(X, group_col, group_idx_store)
        newX = self._change_document_type_in_matrix(X, row_group_cat)
        newX = self._make_all_positive_data_ones(newX)
        group_row = newX.tocoo().row
        # Build the masks from the untouched group ids and write into a copy.
        # Masking the array being rewritten would remap a row a second time
        # whenever a later group index equals an already-assigned category index.
        category_row = group_row.copy()
        for group_idx, cat_idx in group_idx_to_cat_idx.items():
            category_row[group_row == group_idx] = cat_idx
        catX = self._change_document_type_in_matrix(newX, category_row)
        return self._term_freq_df_from_matrix(catX)
コード例 #20
0
class OffsetCorpusFactory(object):
    '''Builds an OffsetCorpus from a data frame of parsed documents, recording term and metadata offsets.'''

    def __init__(self,
                 df,
                 parsed_col,
                 feat_and_offset_getter,
                 category_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            contains parsed_col (and optionally category_col), where parsed_col
            is entirely spacy docs; the result of df.reset_index() is stored
        parsed_col : str
            name of spacy parsed column in df
        feat_and_offset_getter : object
            must provide get_term_offsets(doc) and get_metadata_offsets(doc),
            each yielding (name, (value, offsets)) pairs
        category_col : str, Optional
            name of category column in df; if None, all category names will be '_'
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feat_and_offset_getter = feat_and_offset_getter
        self._term_offsets = {}
        self._metadata_offsets = {}

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        OffsetCorpus
        '''
        self._ensure_category_col_is_in_df()

        y = self._get_y_and_populate_category_idx_store(
            self._df[self._category_col])
        # Fills both matrix factories and both offset maps, row by row.
        self._df.apply(self._add_to_x_factory, axis=1)
        # Pin the row count to the number of labels before materialising.
        last_row_idx = len(y) - 1
        X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        return OffsetCorpus(
            df=self._df,
            X=X,
            mX=self._mX,
            y=y,
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_offsets=self._term_offsets,
            metadata_offsets=self._metadata_offsets)

    def _ensure_category_col_is_in_df(self):
        '''Guarantee that self._category_col names a column of self._df.

        When the configured column is missing (including category_col=None),
        an unused column name is chosen and the column is created with the
        single placeholder category '_'.
        '''
        if self._category_col not in self._df:
            self._category_col = 'Category'
            while self._category_col in self._df:
                # np.random.choice needs a 1-D sequence; a bare str is
                # treated as a 0-d array and rejected.
                suffix = ''.join(
                    np.random.choice(list(string.ascii_letters), 5))
                self._category_col = 'Category_' + suffix
            # Without this the chosen name would not exist in the frame and
            # build() would fail on the category lookup.
            self._df[self._category_col] = '_'

    def _get_y_and_populate_category_idx_store(self, categories):
        '''Return an array of category indices for categories, registering each one in the category index store.'''
        return np.array(categories.apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        '''Record one row's term and metadata values, plus any offsets, keyed by row.name.'''
        parsed_text = row[self._parsed_col]
        term_items = self._feat_and_offset_getter.get_term_offsets(parsed_text)
        for term, (count, offsets) in term_items:
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
            if offsets is not None:
                self._term_offsets.setdefault(term, {}).setdefault(
                    row.name, []).extend(offsets)

        meta_items = self._feat_and_offset_getter.get_metadata_offsets(parsed_text)
        for meta, (val, offsets) in meta_items:
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
            if offsets is not None:
                self._metadata_offsets.setdefault(meta, {}).setdefault(
                    row.name, []).extend(offsets)
コード例 #21
0
class CorpusFromParsedDocuments(object):
	'''Builds a ParsedCorpus from a data frame whose parsed column holds spacy docs.'''

	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=None):

		'''
		Parameters
		----------
		df : pd.DataFrame
			contains category_col and parsed_col, where parsed_col is entirely spacy docs;
			the result of df.reset_index() is what gets stored
		category_col : str
			name of category column in df
		parsed_col : str
			name of spacy parsed column in df
		feats_from_spacy_doc : FeatsFromSpacyDoc, optional
			feature extractor; a new FeatsFromSpacyDoc() is created when omitted
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		# Create the default extractor per instance. A default written in the
		# signature is evaluated once and then shared by every instance.
		if feats_from_spacy_doc is None:
			feats_from_spacy_doc = FeatsFromSpacyDoc()
		self._feats_from_spacy_doc = feats_from_spacy_doc

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		# Fills both matrix factories one data-frame row at a time.
		self._df.apply(self._add_to_x_factory, axis=1)
		# Pin the row count to the number of labels before materialising.
		last_row_idx = len(self._y) - 1
		self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)

	def _get_y_and_populate_category_idx_store(self):
		'''Return an array of category indices, one per row, registering each category in the category index store.'''
		return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		'''Record one row's term counts and document metadata in the matrix factories, keyed by row.name.'''
		parsed_text = row[self._parsed_col]
		for term, count in self._feats_from_spacy_doc.get_feats(parsed_text).items():
			term_idx = self._term_idx_store.getidx(term)
			self._X_factory[row.name, term_idx] = count
		for meta, val in self._feats_from_spacy_doc.get_doc_metadata(parsed_text).items():
			meta_idx = self._metadata_idx_store.getidx(meta)
			self._mX_factory[row.name, meta_idx] = val

	def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		'''Return a ParsedCorpus over the rows of the stored data frame selected by new_y_mask, using the supplied matrices, labels and index stores.'''
		return ParsedCorpus(self._df[new_y_mask],
		                    new_X,
		                    new_mX,
		                    new_y,
		                    new_term_idx_store,
		                    new_category_idx_store,
		                    new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)
class CorpusFromFeatureDict(object):
	'''Builds a corpus from a data frame whose rows carry precomputed feature (and optionally metadata) dictionaries.'''

	def __init__(self,
	             df,
	             category_col,
	             text_col,
	             feature_col,
	             metadata_col=None,
	             parsed_col=None):

		'''
		Parameters
		----------
		df : pd.DataFrame
			source data; the result of df.reset_index() is what gets stored
		category_col : str
			name of the category column in df
		text_col : str
			name of the column which contains each document's raw text
		feature_col : str
			name of the column in df holding a feature dictionary
		metadata_col : str, optional
			name of the column in df holding a metadata dictionary
		parsed_col : str, optional
			name of the column in df holding parsed documents
		'''
		self._df = df.reset_index()

		# Column names are stored verbatim.
		self._category_col = category_col
		self._text_col = text_col
		self._feature_col = feature_col
		self._metadata_col = metadata_col
		self._parsed_col = parsed_col

		# One index store per vocabulary (categories, terms, metadata).
		self._category_idx_store = IndexStore()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()

		# One matrix factory each for the term matrix (X) and metadata matrix (mX).
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		ParsedCorpus when parsed_col was given and is a column of the data
		frame, otherwise CorpusDF
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)
		# Pin the row count to the number of labels before materialising.
		last_row_idx = len(self._y) - 1
		self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()

		if self._parsed_col is None or self._parsed_col not in self._df:
			return CorpusDF(self._df,
			                self._X,
			                self._mX,
			                self._y,
			                self._text_col,
			                self._term_idx_store,
			                self._category_idx_store,
			                self._metadata_idx_store)
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)

	def _get_y_and_populate_category_idx_store(self):
		'''Return an array of category indices, one per row, registering each category in the category index store.'''
		categories = self._df[self._category_col]
		return np.array(categories.apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		'''Copy one row's feature dictionary (and metadata dictionary, if that column exists) into the matrix factories, keyed by row.name.'''
		features = row[self._feature_col]
		for feature_name, feature_count in features.items():
			column = self._term_idx_store.getidx(feature_name)
			self._X_factory[row.name, column] = feature_count

		# No metadata column in the frame: nothing more to record.
		if self._metadata_col not in self._df:
			return
		metadata = row[self._metadata_col]
		for meta_name, meta_count in metadata.items():
			column = self._metadata_idx_store.getidx(meta_name)
			self._mX_factory[row.name, column] = meta_count

	def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		'''Return a corpus of the same kind build() would produce, restricted to the rows selected by new_y_mask.'''
		selected_rows = self._df[new_y_mask]
		if self._parsed_col is None or self._parsed_col not in self._df:
			# NOTE(review): this passes one more positional argument than the
			# CorpusDF call in build() does; confirm against CorpusDF's signature.
			return CorpusDF(selected_rows,
			                new_X,
			                new_mX,
			                new_y,
			                self._text_col,
			                new_term_idx_store,
			                new_category_idx_store,
			                new_metadata_idx_store,
			                self._df[self._text_col][new_y_mask])
		return ParsedCorpus(selected_rows,
		                    new_X,
		                    new_mX,
		                    new_y,
		                    new_term_idx_store,
		                    new_category_idx_store,
		                    new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)