def use_external_metadata_lists(self, metadata_lists):
    '''
    Takes a list of string lists. Each list holds the metadata strings to
    associate with its corresponding document.

    :param metadata_lists: List[List[str]], one metadata list per document
    :return: new TermDocMatrix

    Raises
    ------
    ValueError
        If the number of metadata lists does not match the number of documents.
    '''
    # Fix: replaced `assert` input validation (stripped under -O) with an
    # explicit ValueError, and removed leftover debug print statements and
    # commented-out debugging code.
    if len(metadata_lists) != self.get_num_docs():
        raise ValueError(
            "metadata_lists contains %s elements; it should have %s "
            "elements, one per document."
            % (len(metadata_lists), self.get_num_docs()))
    metadata_index_store = IndexStore()
    metadata_csr_factory = CSRMatrixFactory()
    # Mark each (document, metadatum) pair with a 1 in the metadata matrix.
    for doc_i, metadata_list in enumerate(metadata_lists):
        for metadatum in metadata_list:
            metadata_csr_factory[
                doc_i, metadata_index_store.getidx(metadatum)] = 1
    # new_y_mask of all-True keeps every document in the new matrix.
    return self._make_new_term_doc_matrix(
        new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
        new_metadata_idx_store=metadata_index_store,
        new_y_mask=self._y == self._y)
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
    '''
    Find indices of terms to drop when compacting: terms whose summed
    frequency falls below self.minimum_term_count, plus terms that appear
    redundant with a longer term containing them.

    Parameters
    ----------
    term_freqs : pd.DataFrame
        Term-frequency table; index holds term strings, one column per
        category (assumed from usage -- TODO confirm against caller).

    Returns
    -------
    np.array
        Indices (into the corpus term index store) of terms to remove.
    '''
    fact = CSRMatrixFactory()
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Terms must reach the minimum total count to survive compaction.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    # Build a binary term-by-token matrix; lengths records each term's
    # whitespace-token count.
    lengths = []
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # mat * mat.T counts tokens shared by each pair of terms; a zero in
    # coocs marks a pair where one term's tokens are fully covered by the
    # other's (candidate containment).
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    # Discard self-pairs (a term trivially covers itself).
    pairs = pairs.T[(pairs[0] != pairs[1])]
    # Keep only pairs where the second term's text literally occurs inside
    # the first's, making the second the redundancy candidate.
    pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
    # The contained term is redundant only if it is never more frequent
    # than its container in any column.
    pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]],
                         axis=1)]
    idx_store = self.term_doc_matrix._term_idx_store
    # Map redundant term strings back to corpus-level indices.
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
        pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
    '''
    Find indices of terms to drop when compacting term_doc_matrix: terms
    below self.minimum_term_count, plus terms redundant with a containing
    bigram (filtering delegated to the self._limit_* helpers).

    Parameters
    ----------
    term_freqs : pd.DataFrame
        Term-frequency table; index holds term strings.
    term_doc_matrix : TermDocMatrix-like
        Source of the index store via _get_relevant_idx_store.
    non_text : bool
        Selects the metadata vs. text index store -- presumably; confirm
        against _get_relevant_idx_store.

    Returns
    -------
    np.array
        Indices of terms to remove.
    '''
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Terms must reach the minimum total count to survive compaction.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    # Build a binary term-by-token matrix; lengths records each term's
    # whitespace-token count.
    lengths = []
    fact = CSRMatrixFactory()
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # Zero entries mark term pairs where one term's tokens are fully
    # covered by the other's.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(
        pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
    idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
    # Map the redundant (second-position) terms back to corpus indices.
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
        pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def build(self):
    '''Construct and return the term-document matrix.

    Returns
    -------
    TermDocMatrix
    '''
    # Fresh index stores and sparse-matrix builders for this build.
    term_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    pipeline = ParsePipelineFactoryWithoutCategories(
        self.get_nlp(), X_factory, mX_factory,
        term_idx_store, metadata_idx_store, self)
    cleaned_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
    return self._apply_pipeline_and_get_build_instance(
        X_factory, mX_factory, cleaned_df, pipeline,
        term_idx_store, metadata_idx_store)
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix):
    '''
    Find indices of terms to drop when compacting term_doc_matrix: terms
    below self.minimum_term_count, plus unigrams redundant with a
    containing bigram (filtering delegated to the self._limit_* helpers).

    Parameters
    ----------
    term_freqs : pd.DataFrame
        Term-frequency table; index holds term strings.
    term_doc_matrix : TermDocMatrix-like
        Supplies the term index store used to map strings back to indices.

    Returns
    -------
    np.array
        Indices of terms to remove.
    '''
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Terms must reach the minimum total count to survive compaction.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    # Build a binary term-by-token matrix; lengths records each term's
    # whitespace-token count.
    lengths = []
    fact = CSRMatrixFactory()
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # Zero entries mark term pairs where one term's tokens are fully
    # covered by the other's.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
    idx_store = term_doc_matrix._term_idx_store
    # Map the redundant (second-position) terms back to corpus indices.
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=FeatsFromSpacyDoc()):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Must contain category_col and parsed_col; parsed_col holds only
        spacy docs.
    category_col : str
        Name of the category column in df.
    parsed_col : str
        Name of the spacy-parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc
        Feature extractor applied to each parsed document.
    '''
    # Column configuration and feature extractor.
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._feats_from_spacy_doc = feats_from_spacy_doc
    # reset_index gives a clean 0..n-1 row index for matrix building.
    self._df = df.reset_index()
    # Sparse-matrix builders and index stores populated during build.
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
def add_doc_names_as_metadata(self, doc_names):
    '''
    :param doc_names: array-like[str], document names, one per document
    :return: Corpus-like object with doc names as metadata. If two
        documents share the same name, (doc number) will be appended to
        their names.
    '''
    num_docs = self.get_num_docs()
    if len(doc_names) != num_docs:
        raise Exception("The parameter doc_names contains %s elements. "
                        "It should have %s elements, one per document."
                        % (len(doc_names), num_docs))
    # How many documents carry each name; names used more than once get a
    # running counter appended to disambiguate them.
    name_counts = collections.Counter(np.array(doc_names))
    uses_so_far = collections.Counter()
    fact = CSRMatrixFactory()
    idx_store = IndexStore()
    for doc_i in range(num_docs):
        name = doc_names[doc_i]
        if name_counts[name] > 1:
            uses_so_far[name] += 1
            meta_idx = idx_store.getidx(
                '%s (%s)' % (name, uses_so_far[name]))
        else:
            meta_idx = idx_store.getidx(name)
        fact[doc_i, doc_i] = meta_idx
    return self.add_metadata(fact.get_csr_matrix(), idx_store)
def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=FeatsFromSpacyDoc()):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains category_col and parsed_col; parsed_col holds only spacy
        docs.
    category_col : str
        Name of category column in df.
    parsed_col : str
        Name of spacy parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc
        Extractor producing term and metadata features per document.
    '''
    # Normalize the row index so documents map to rows 0..n-1.
    self._df = df.reset_index()
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._feats_from_spacy_doc = feats_from_spacy_doc
    # Index stores assign a stable integer id to each distinct value.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    # Incremental builders for the term (X) and metadata (mX) matrices.
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
def __init__(self, df, parsed_col, feat_and_offset_getter, category_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains parsed_col (and optionally category_col); parsed_col
        holds only spacy docs.
    parsed_col : str
        Name of the spacy-parsed column in df.
    feat_and_offset_getter
        Object yielding features with their counts and character offsets
        -- presumably a FeatAndOffsetGetter; confirm against caller.
    category_col : str, Optional
        Name of category column in df; if None, all category names will
        be '_'.
    '''
    self._df = df.reset_index()
    self._parsed_col = parsed_col
    self._category_col = category_col
    self._feat_and_offset_getter = feat_and_offset_getter
    # Index stores and sparse-matrix builders populated during build.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
    # Per-feature {doc row -> [offsets]} maps, filled in while building.
    self._term_offsets = {}
    self._metadata_offsets = {}
def __init__(self, df, category_col, text_col, feature_col, metadata_col=None, parsed_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains category_col, text_col and feature_col; optional columns
        below.
    category_col : str
        Name of category column in df.
    text_col : str
        The name of the column which contains each document's raw text.
    feature_col : str
        Name of column in df with a feature dictionary.
    metadata_col : str, optional
        Name of column in df with a metadata dictionary.
    parsed_col : str, optional
        Name of column in df with parsed strings.
    '''
    # Column configuration.
    self._category_col = category_col
    self._text_col = text_col
    self._feature_col = feature_col
    self._parsed_col = parsed_col
    self._metadata_col = metadata_col
    # Normalize the row index so documents map to rows 0..n-1.
    self._df = df.reset_index()
    # Index stores and sparse-matrix builders populated during build.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
class CorpusFromParsedDocuments(object):
    '''Builds a ParsedCorpus from a dataframe of pre-parsed spacy documents.'''

    def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col; parsed_col holds only
            spacy docs.
        category_col : str
            Name of category column in df.
        parsed_col : str
            Name of spacy parsed column in df.
        feats_from_spacy_doc : FeatsFromSpacyDoc
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._feats_from_spacy_doc = feats_from_spacy_doc
        # Index stores and sparse-matrix builders filled during build().
        self._category_idx_store = IndexStore()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        last_row = len(self._y) - 1
        self._X = self._X_factory.set_last_row_idx(last_row).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(last_row).get_csr_matrix()
        return ParsedCorpus(self._df, self._X, self._mX, self._y,
                            self._term_idx_store, self._category_idx_store,
                            self._metadata_idx_store, self._parsed_col,
                            self._category_col)

    def _get_y_and_populate_category_idx_store(self):
        # Interning categories into the index store happens as a side effect.
        return np.array(self._df[self._category_col].apply(
            self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        doc = row[self._parsed_col]
        for term, count in self._feats_from_spacy_doc.get_feats(doc).items():
            self._X_factory[row.name, self._term_idx_store.getidx(term)] = count
        for meta, val in self._feats_from_spacy_doc.get_doc_metadata(doc).items():
            self._mX_factory[row.name, self._metadata_idx_store.getidx(meta)] = val
def test_main(self):
    # Two entries imply a 2x6 matrix; unset cells default to zero.
    factory = CSRMatrixFactory()
    factory[0, 0] = 4
    factory[1, 5] = 3
    result = factory.get_csr_matrix()
    self.assertEqual(type(result), csr_matrix)
    expected = np.array([[4, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 3]])
    np.testing.assert_array_almost_equal(expected, result.todense())
def init_term_doc_matrix_variables():
    '''Return fresh builders for a term-document matrix: two CSR matrix
    factories, three index stores, and an empty label list.'''
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    category_idx_store = IndexStore()
    term_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    y = []
    return (X_factory, mX_factory, category_idx_store,
            term_idx_store, metadata_idx_store, y)
def use_categories_as_metadata_and_replace_terms(self):
    '''
    Returns a TermDocMatrix which is identical to self except the metadata
    values are now identical to the categories present and the
    term-doc-matrix is now the metadata matrix.

    :return: TermDocMatrix
    '''
    # One-hot matrix marking each document's category.
    factory = CSRMatrixFactory()
    for doc_i, category_idx in enumerate(self.get_category_ids()):
        factory[doc_i, category_idx] = 1
    category_metadata = factory.get_csr_matrix()
    # The old metadata matrix becomes the term matrix; categories become
    # the metadata. The all-True mask keeps every document.
    return self._make_new_term_doc_matrix(
        self._mX,
        category_metadata,
        self._y,
        self._metadata_idx_store,
        self._category_idx_store,
        copy(self._category_idx_store),
        self._y == self._y)
def feats_from_doc(self, raw_text):
    '''
    Parameters
    ----------
    raw_text : str
        Uncleaned text for parsing out features.

    Returns
    -------
    csr_matrix
        Single-row feature matrix aligned with the existing term index.
    '''
    doc = self._nlp(self._clean_function(raw_text))
    factory = CSRMatrixFactory()
    # Pin the column count to the known vocabulary so the single-row
    # matrix lines up with the corpus term matrix.
    factory.set_last_col_idx(self._term_idx_store.getnumvals() - 1)
    freqs = self._get_features_from_parsed_text(doc, self._term_idx_store)
    self._register_document_features_with_X_factory(factory, 0, freqs)
    return factory.get_csr_matrix()
def test_typing(self):
    '''Check get_csr_matrix dtype handling: default, bool, and int32.'''
    mat_factory = CSRMatrixFactory()
    mat_factory[0, 0] = 4
    mat_factory[1, 5] = 3.1
    mat = mat_factory.get_csr_matrix()
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[4, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 3.1]]),
        mat.todense())

    # Fix: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the documented replacement and behaves identically
    # as a dtype here.
    mat = mat_factory.get_csr_matrix(dtype=bool)
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[1, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 1]]),
        mat.todense())

    mat = mat_factory.get_csr_matrix(dtype=np.int32)
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[4, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 3]]),
        mat.todense())
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Parameters
    ----------
    category_text_iter : iterator of (string category name, one line per
        sentence, whitespace-delimited text) pairs

    Returns
    -------
    TermDocMatrix
    '''
    y = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    for doc_i, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        # One sentence per line; count unigrams and adjacent-word bigrams.
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sent.strip().split()
            bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for term_idx, freq in term_freq.items():
            X_factory[doc_i, term_idx] = freq
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
def add_doc_names_as_metadata(self, doc_names):
    '''
    :param doc_names: array-like[str], document names, one per document
    :return: Corpus-like object with doc names as metadata. If two
        documents share the same name, (doc number) will be appended to
        their names.
    '''
    n_docs = self.get_num_docs()
    if len(doc_names) != n_docs:
        raise Exception("The parameter doc_names contains %s elements. "
                        "It should have %s elements, one per document." % (
                            len(doc_names), n_docs))
    # Names shared by multiple documents get a running "(k)" suffix.
    name_frequencies = collections.Counter(np.array(doc_names))
    times_seen = collections.Counter()
    meta_factory = CSRMatrixFactory()
    meta_store = IndexStore()
    for doc_i in range(n_docs):
        name = doc_names[doc_i]
        if name_frequencies[name] > 1:
            times_seen[name] += 1
            label = '%s (%s)' % (name, times_seen[name])
        else:
            label = name
        meta_factory[doc_i, doc_i] = meta_store.getidx(label)
    return self.add_metadata(meta_factory.get_csr_matrix(), meta_store)
def _get_features_and_labels_from_documents_and_indexes(self, category_doc_iter, category_idx_store, term_idx_store, metadata_idx_store):
    '''Register every (category, parsed document) pair into fresh term and
    metadata matrix factories and return the built matrices and labels.

    Returns
    -------
    (csr_matrix, csr_matrix, np.array)
        Term matrix X, metadata matrix mX, and category label array y.
    '''
    labels = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    for doc_i, (category, parsed_text) in enumerate(category_doc_iter):
        self._register_doc_and_category(X_factory, mX_factory, category,
                                        category_idx_store, doc_i,
                                        parsed_text, term_idx_store,
                                        metadata_idx_store, labels)
    return (X_factory.get_csr_matrix(),
            mX_factory.get_csr_matrix(),
            np.array(labels))
class CorpusFromFeatureDict(object):
    '''Builds a corpus from a dataframe whose feature_col already holds
    per-document feature dictionaries (term -> count).'''

    def __init__(self,
                 df,
                 category_col,
                 text_col,
                 feature_col,
                 metadata_col=None,
                 parsed_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col, text_col and feature_col; optional
            columns below.
        category_col : str
            name of category column in df
        text_col : str
            The name of the column which contains each document's raw text.
        feature_col : str
            name of column in df with a feature dictionary
        metadata_col : str, optional
            name of column in df with a metadata dictionary
        parsed_col : str, optional
            name of column in df with parsed strings
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._text_col = text_col
        self._feature_col = feature_col
        self._parsed_col = parsed_col
        self._metadata_col = metadata_col
        # Index stores and sparse-matrix builders populated during build().
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus (when parsed_col is present)
        or CorpusDF otherwise.
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        self._X = self._X_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
        if self._parsed_col is not None and self._parsed_col in self._df:
            return ParsedCorpus(self._df,
                                self._X,
                                self._mX,
                                self._y,
                                self._term_idx_store,
                                self._category_idx_store,
                                self._metadata_idx_store,
                                self._parsed_col,
                                self._category_col)
        else:
            return CorpusDF(self._df,
                            self._X,
                            self._mX,
                            self._y,
                            self._text_col,
                            self._term_idx_store,
                            self._category_idx_store,
                            self._metadata_idx_store)

    def _get_y_and_populate_category_idx_store(self):
        # Interning categories into the index store happens as a side effect.
        return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        # Features come pre-counted in the feature-dictionary column.
        for feat, count in row[self._feature_col].items():
            feat_idx = self._term_idx_store.getidx(feat)
            self._X_factory[row.name, feat_idx] = count
        # Metadata is optional; only registered when the column exists.
        if self._metadata_col in self._df:
            for meta, count in row[self._metadata_col].items():
                meta_idx = self._metadata_idx_store.getidx(meta)
                self._mX_factory[row.name, meta_idx] = count

    def _make_new_term_doc_matrix(self,
                                  new_X,
                                  new_mX,
                                  new_y,
                                  new_term_idx_store,
                                  new_category_idx_store,
                                  new_metadata_idx_store,
                                  new_y_mask):
        # Rebuilds a corpus of the same concrete type over a document subset.
        if self._parsed_col is not None and self._parsed_col in self._df:
            return ParsedCorpus(self._df[new_y_mask],
                                new_X,
                                new_mX,
                                new_y,
                                new_term_idx_store,
                                new_category_idx_store,
                                new_metadata_idx_store,
                                self._parsed_col,
                                self._category_col)
        else:
            # NOTE(review): this CorpusDF call passes a trailing text-series
            # argument that build()'s CorpusDF call does not -- confirm
            # against the CorpusDF constructor signature.
            return CorpusDF(self._df[new_y_mask],
                            new_X,
                            new_mX,
                            new_y,
                            self._text_col,
                            new_term_idx_store,
                            new_category_idx_store,
                            new_metadata_idx_store,
                            self._df[self._text_col][new_y_mask])
class CorpusFromParsedDocuments(object):
    '''Builds a ParsedCorpus from a dataframe of pre-parsed spacy documents.'''

    def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col; parsed_col holds only
            spacy docs.
        category_col : str
            name of category column in df
        parsed_col : str
            name of spacy parsed column in df
        feats_from_spacy_doc : FeatsFromSpacyDoc
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        self._X = self._X_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
        return ParsedCorpus(self._df, self._X, self._mX, self._y,
                            self._term_idx_store, self._category_idx_store,
                            self._metadata_idx_store, self._parsed_col,
                            self._category_col)

    def _get_y_and_populate_category_idx_store(self):
        # Interning categories into the index store happens as a side effect.
        return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        parsed_text = row[self._parsed_col]
        for term, count in self._feats_from_spacy_doc.get_feats(parsed_text).items():
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
        for meta, val in self._feats_from_spacy_doc.get_doc_metadata(parsed_text).items():
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val

    def _make_new_term_doc_matrix(self, new_X, new_mX, new_y,
                                  new_term_idx_store, new_category_idx_store,
                                  new_metadata_idx_store, new_y_mask):
        # BUG FIX: the original had a bare `return` statement with the
        # ParsedCorpus(...) expression stranded on the following line, so
        # this method always returned None and the corpus construction was
        # unreachable.
        return ParsedCorpus(self._df[new_y_mask], new_X, new_mX, new_y,
                            new_term_idx_store, new_category_idx_store,
                            new_metadata_idx_store, self._parsed_col,
                            self._category_col)
def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
    '''
    Makes the metadata of a new TermDocMatrix a copy of the term-document
    matrix, except each term is prefixed by its document's label followed
    by the separator.

    :param doc_labels: list[str], should be the same size as the number of
        documents in the TermDocMatrix.
    :param separator: str, default is '_'
    :param replace_metadata: bool, kept for interface compatibility --
        currently has no effect (see NOTE below).
    :return: TermDocMatrix
    '''
    # Fix: removed commented-out dead code; the `if replace_metadata:`
    # branch was a no-op (it re-assigned X = self._X, which was already the
    # value), so it has been folded away without changing behavior.
    # NOTE(review): a commented-out line suggested `X = self._mX` may have
    # been intended when replace_metadata is False -- confirm before wiring
    # the parameter up.
    assert len(doc_labels) == self.get_num_docs()
    doc_labels = np.array(doc_labels)
    terms_in_corpus = np.array(self._term_idx_store.values())
    new_metadata_list = []
    new_meta_X = None
    ordered_doc_labels = list(sorted(set(doc_labels)))
    X = self._X
    # Build one horizontal band of labeled-term columns per document label.
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        label_X = X[label_doc_mask, :]
        # NOTE(review): this mask is computed over the whole corpus (X),
        # not just the label's rows -- confirm this is intentional.
        label_term_mask = (X.sum(axis=0) > 0).A1
        label_X = label_X[:, label_term_mask]
        cols_to_pad = len(new_metadata_list)
        new_metadata_list += [doc_label + separator + term
                              for term in terms_in_corpus[label_term_mask]]
        if new_meta_X is None:
            new_meta_X = label_X
        else:
            # Left-pad with empty columns so this label's block starts
            # after all previously emitted columns.
            label_X_pad = (CSRMatrixFactory()
                           .set_last_col_idx(cols_to_pad - 1)
                           .set_last_row_idx(sum(label_doc_mask) - 1)
                           .get_csr_matrix())
            padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
            new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
            new_meta_X = scipy.sparse.vstack([new_meta_X, padded_label_X])
    new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
    new_meta_X = new_meta_X.tocsr()
    # Reorder the stacked rows back into original document order.
    new_mX = (CSRMatrixFactory()
              .set_last_col_idx(new_meta_X.shape[1] - 1)
              .set_last_row_idx(new_meta_X.shape[0] - 1)
              .get_csr_matrix().tolil())
    start_row = 0
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        num_rows = sum(label_doc_mask)
        new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
        start_row += num_rows
    new_mX = new_mX.tocsr()
    new_tdm = self._make_new_term_doc_matrix(self._X,
                                             new_mX,
                                             self._y,
                                             self._term_idx_store,
                                             self._category_idx_store,
                                             new_metadata_idx_store,
                                             self._y == self._y)
    return new_tdm
class OffsetCorpusFactory(object):
    '''Builds an OffsetCorpus: a corpus that also records character offsets
    for each term and metadata feature.'''

    def __init__(self, df, parsed_col, feat_and_offset_getter, category_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains parsed_col (and optionally category_col); parsed_col
            holds only spacy docs.
        parsed_col : str
            name of spacy parsed column in df
        feat_and_offset_getter
            Object yielding features with their counts and offsets.
        category_col : str, Optional
            name of category column in df; if None, a placeholder category
            column is generated.
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feat_and_offset_getter = feat_and_offset_getter
        # Per-feature {doc row -> [offsets]} maps populated during build().
        self._term_offsets = {}
        self._metadata_offsets = {}

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        OffsetCorpus
        '''
        self._ensure_category_col_is_in_df()
        # Fix: y and mX were previously computed twice -- the first
        # self._mX assignment was discarded and rebuilt inside the
        # OffsetCorpus call, and the second
        # _get_y_and_populate_category_idx_store call re-interned every
        # category. Each is now computed exactly once.
        y = self._get_y_and_populate_category_idx_store(
            self._df[self._category_col])
        self._df.apply(self._add_to_x_factory, axis=1)
        last_row = len(y) - 1
        X = self._X_factory.set_last_row_idx(last_row).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(last_row).get_csr_matrix()
        return OffsetCorpus(
            df=self._df,
            X=X,
            mX=self._mX,
            y=y,
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_offsets=self._term_offsets,
            metadata_offsets=self._metadata_offsets)

    def _ensure_category_col_is_in_df(self):
        # When no category column was supplied (or it clashes), synthesize
        # a column name that is guaranteed absent from the dataframe.
        if self._category_col not in self._df:
            self._category_col = 'Category'
            while self._category_col in self._df:
                self._category_col = 'Category_' + ''.join(
                    np.random.choice(string.ascii_letters) for _ in range(5))

    def _get_y_and_populate_category_idx_store(self, categories):
        # Interning categories into the index store happens as a side effect.
        return np.array(categories.apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        parsed_text = row[self._parsed_col]
        for term, (count, offsets) in self._feat_and_offset_getter.get_term_offsets(
                parsed_text):
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
            if offsets is not None:
                self._term_offsets.setdefault(term, {}).setdefault(
                    row.name, []).extend(offsets)
        for meta, (val, offsets) in self._feat_and_offset_getter.get_metadata_offsets(
                parsed_text):
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
            if offsets is not None:
                self._metadata_offsets.setdefault(meta, {}).setdefault(
                    row.name, []).extend(offsets)