def __init__(self,
             df,
             category_col,
             text_col,
             feature_col,
             metadata_col=None,
             parsed_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Data frame holding the columns named below. The result of
        df.reset_index() is what gets stored.
    category_col : str
        Name of the category column in df.
    text_col : str
        Name of the column which contains each document's raw text.
    feature_col : str
        Name of the column in df with a feature dictionary.
    metadata_col : str, optional
        Name of the column in df with a metadata dictionary.
    parsed_col : str, optional
        Name of the column in df with parsed documents.
    '''
    # Column names, kept verbatim for later lookups into self._df.
    self._category_col = category_col
    self._text_col = text_col
    self._feature_col = feature_col
    self._metadata_col = metadata_col
    self._parsed_col = parsed_col

    # Index stores assign integer ids to categories, terms and metadata.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()

    # Sparse-matrix accumulators for terms (X) and metadata (mX).
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()

    self._df = df.reset_index()
def build(self):
    '''Constructs the term doc matrix.

    Fresh matrix factories and index stores are created on every call and
    handed both to the parse pipeline and to the final build step.

    Returns
    -------
    TermDocMatrix
    '''
    term_factory = CSRMatrixFactory()
    meta_factory = CSRMatrixFactory()
    term_index = IndexStore()
    meta_index = IndexStore()
    pipeline = ParsePipelineFactoryWithoutCategories(
        self.get_nlp(),
        term_factory,
        meta_factory,
        term_index,
        meta_index,
        self
    )
    cleaned_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
    return self._apply_pipeline_and_get_build_instance(
        term_factory,
        meta_factory,
        cleaned_df,
        pipeline,
        term_index,
        meta_index
    )
def add_doc_names_as_metadata(self, doc_names):
    '''
    Add one metadata entry per document, named after that document.

    :param doc_names: array-like[str], document names of each document
    :return: Corpus-like object with doc names as metadata. If two documents share the same name
    (doc number) will be appended to their names.
    :raises Exception: if doc_names does not contain exactly one name per document
    '''
    num_docs = self.get_num_docs()
    if len(doc_names) != num_docs:
        raise Exception("The parameter doc_names contains %s elements. "
                        "It should have %s elements, one per document." % (len(doc_names), num_docs))

    doc_names_counter = collections.Counter(np.array(doc_names))
    metafact = CSRMatrixFactory()
    metaidxstore = IndexStore()
    doc_id_uses = collections.Counter()
    # Walk the names positionally. Indexing with doc_names[i] is a label
    # lookup on a pandas Series, which picks the wrong element (or raises)
    # whenever the Series index is not 0..n-1.
    for i, doc_id in enumerate(doc_names):
        if doc_names_counter[doc_id] > 1:
            # Shared name: disambiguate with a running 1-based use count.
            doc_id_uses[doc_id] += 1
            doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
        else:
            doc_name_idx = metaidxstore.getidx(doc_id)
        # NOTE(review): the entry is written at column i with the name's
        # index as its value (so the value is 0 when doc_name_idx is 0),
        # rather than at column doc_name_idx -- confirm this is what
        # add_metadata expects.
        metafact[i, i] = doc_name_idx
    return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
def __init__(self,
             df,
             category_col,
             parsed_col,
             feats_from_spacy_doc=FeatsFromSpacyDoc()):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains category_col and parsed_col, where parsed_col is entirely
        spacy docs. The result of df.reset_index() is what gets stored.
    category_col : str
        Name of the category column in df.
    parsed_col : str
        Name of the spacy parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc
        Feature extractor stored on the instance.
    '''
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._feats_from_spacy_doc = feats_from_spacy_doc

    # Index stores assign integer ids to categories, terms and metadata.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()

    # Sparse-matrix accumulators for terms (X) and metadata (mX).
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()

    self._df = df.reset_index()
def __init__(self,
             df,
             parsed_col,
             feat_and_offset_getter,
             category_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains parsed_col (and category_col, when given), where
        parsed_col is entirely spacy docs. The result of df.reset_index()
        is what gets stored.
    parsed_col : str
        Name of the spacy parsed column in df.
    feat_and_offset_getter : object
        Stored on the instance; presumably supplies features together
        with their offsets -- confirm against the getter's class.
    category_col : str, Optional
        Name of the category column in df; if None, all category names
        will be '_'.
    '''
    self._parsed_col = parsed_col
    self._category_col = category_col
    self._feat_and_offset_getter = feat_and_offset_getter

    # Index stores assign integer ids to categories, terms and metadata.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()

    # Sparse-matrix accumulators for terms (X) and metadata (mX).
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()

    # Offset bookkeeping, empty until documents are processed.
    self._term_offsets = {}
    self._metadata_offsets = {}

    self._df = df.reset_index()
class CorpusFromParsedDocuments(object):
    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col, where parsed_col is
            entirely spacy docs.
        category_col : str
            Name of the category column in df.
        parsed_col : str
            Name of the spacy parsed column in df.
        feats_from_spacy_doc : FeatsFromSpacyDoc
            Supplies get_feats and get_doc_metadata for each parsed doc.
        '''
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._feats_from_spacy_doc = feats_from_spacy_doc
        self._category_idx_store = IndexStore()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._df = df.reset_index()

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        # Pin the row count to the number of documents before building.
        final_row = len(self._y) - 1
        term_factory = self._X_factory.set_last_row_idx(final_row)
        self._X = term_factory.get_csr_matrix()
        meta_factory = self._mX_factory.set_last_row_idx(final_row)
        self._mX = meta_factory.get_csr_matrix()
        return ParsedCorpus(
            self._df,
            self._X,
            self._mX,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            self._metadata_idx_store,
            self._parsed_col,
            self._category_col
        )

    def _get_y_and_populate_category_idx_store(self):
        '''Map every category value to its index, registering new ones.'''
        category_values = self._df[self._category_col]
        category_ids = category_values.apply(self._category_idx_store.getidx)
        return np.array(category_ids)

    def _add_to_x_factory(self, row):
        '''Record one row's term counts and doc metadata in the factories.'''
        doc = row[self._parsed_col]
        extractor = self._feats_from_spacy_doc
        # row.name is the row's index label in self._df.
        doc_row = row.name
        for term, count in extractor.get_feats(doc).items():
            self._X_factory[doc_row, self._term_idx_store.getidx(term)] = count
        for meta, val in extractor.get_doc_metadata(doc).items():
            self._mX_factory[doc_row, self._metadata_idx_store.getidx(meta)] = val
def init_term_doc_matrix_variables():
    '''
    Create the empty containers used while building a term doc matrix.

    Returns
    -------
    tuple
        (X_factory, mX_factory, category_idx_store, term_idx_store,
        metadata_idx_store, y), where the factories are new
        CSRMatrixFactory objects, the stores are new IndexStore objects
        and y is an empty list.
    '''
    return (
        CSRMatrixFactory(),  # X_factory
        CSRMatrixFactory(),  # mX_factory
        IndexStore(),        # category_idx_store
        IndexStore(),        # term_idx_store
        IndexStore(),        # metadata_idx_store
        [],                  # y
    )
def build(values):
    '''
    Create an IndexStore pre-populated with values, indexed in order.

    Parameters
    ----------
    values: [term, ...]
        Any iterable of terms; it is copied, so later changes to the
        caller's object do not affect the store.

    Returns
    -------
    IndexStore
    '''
    # Snapshot once: the store must not alias the caller's list, and a
    # one-shot iterable can only be consumed a single time.
    terms = list(values)
    idxstore = IndexStore()
    idxstore._i2val = terms
    idxstore._val2i = {term: i for i, term in enumerate(terms)}
    idxstore._next_i = len(terms)
    return idxstore
def test_main(self):
    '''Check lookup, membership and enumeration on a two-value store.'''
    store = IndexStore()
    # Asking for 'a' a second time must return its existing index.
    for val, expected_idx in [('a', 0), ('b', 1), ('a', 0)]:
        self.assertEqual(store.getidx(val), expected_idx)
    for idx, expected_val in enumerate(['a', 'b']):
        self.assertEqual(store.getval(idx), expected_val)
    self.assertIn('a', store)
    self.assertNotIn('c', store)
    self.assertEqual(set(store.values()), {'a', 'b'})
    # Membership is checked against values, not indices.
    self.assertNotIn(0, store)
    self.assertTrue(store.hasidx(0))
    self.assertFalse(store.hasidx(2))
    self.assertEqual(store.getnumvals(), 2)
    self.assertEqual(list(store.items()), [(0, 'a'), (1, 'b')])
def build(values):
    '''
    Create an IndexStore pre-populated with values, indexed in order.

    Parameters
    ----------
    values: [term, ...]
        Any iterable of terms; it is copied, so later changes to the
        caller's object do not affect the store.

    Returns
    -------
    IndexStore
    '''
    # Snapshot once and reuse it. Calling list(values) and then
    # enumerating values again leaves a one-shot iterable exhausted for
    # the second pass, and len() is undefined on it.
    terms = list(values)
    idxstore = IndexStore()
    idxstore._i2val = terms
    idxstore._val2i = {term: i for i, term in enumerate(terms)}
    idxstore._next_i = len(terms)
    return idxstore
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a term doc matrix of unigram and bigram counts from
    whitespace-tokenized text.

    Parameters
    ----------
    category_text_iter iterator of (string category name, one line per sentence, whitespace-delimited text) pairs

    Returns
    -------
    TermDocMatrix
    '''
    labels = []
    term_factory = CSRMatrixFactory()
    term_index = IndexStore()
    category_index = IndexStore()
    meta_factory = CSRMatrixFactory()
    for doc_num, (category, text) in enumerate(category_text_iter):
        labels.append(category_index.getidx(category))
        doc_counts = Counter()
        # Punctuation is stripped only from the ends of the whole text.
        for sentence in text.strip(string.punctuation).lower().split('\n'):
            tokens = sentence.strip().split()
            # Bigrams never cross a sentence (line) boundary.
            pairs = [' '.join(pair) for pair in zip(tokens[:-1], tokens[1:])]
            doc_counts.update(term_index.getidx(term) for term in tokens + pairs)
        for term_idx, count in doc_counts.items():
            term_factory[doc_num, term_idx] = count
    empty_metadata_index = IndexStore()
    return TermDocMatrix(
        X=term_factory.get_csr_matrix(),
        mX=meta_factory.get_csr_matrix(),
        y=np.array(labels),
        term_idx_store=term_index,
        metadata_idx_store=empty_metadata_index,
        category_idx_store=category_index
    )
def test_getidxstrict(self):
    '''getidxstrict must raise KeyError for a value that was never added.'''
    store = IndexStore()
    for val, expected_idx in [('a', 0), ('b', 1), ('a', 0)]:
        self.assertEqual(store.getidx(val), expected_idx)
    with self.assertRaises(KeyError):
        store.getidxstrict('c')
def test_batch_delete(self):
    '''Check batch_delete_vals and batch_delete_idx, including the
    errors raised for unknown values or indices, and that the source
    store still maps every value to its original index afterwards.'''
    source = IndexStore()
    for expected_idx, val in enumerate(['a', 'b', 'c', 'd']):
        self.assertEqual(source.getidx(val), expected_idx)

    # 'e' was never added, so deleting it by value is an error.
    with self.assertRaises(KeyError):
        source.batch_delete_vals(['e', 'c'])
    without_vals = source.batch_delete_vals(['b', 'c'])
    self.assertEqual(without_vals.getidx('a'), 0)
    self.assertEqual(without_vals.getidx('c'), 2)
    self.assertEqual(without_vals.getidx('e'), 3)

    # The source store must be unaffected by the deletion.
    for val, expected_idx in [('d', 3), ('c', 2), ('b', 1), ('a', 0)]:
        self.assertEqual(source.getidx(val), expected_idx)

    # Index 5 does not exist, so deleting it by index is an error.
    with self.assertRaises(ValueError):
        source.batch_delete_idx([5, 1])
    without_idxs = source.batch_delete_idx([2, 1])
    self.assertEqual(without_idxs.getidx('a'), 0)
    self.assertEqual(without_idxs.getidx('c'), 2)
    self.assertEqual(without_idxs.getidx('e'), 3)
def add_doc_names_as_metadata(self, doc_names):
    '''
    Add one metadata entry per document, named after that document.

    :param doc_names: array-like[str], document names of each document
    :return: Corpus-like object with doc names as metadata. If two documents share the same name
    (doc number) will be appended to their names.
    :raises Exception: if doc_names does not contain exactly one name per document
    '''
    num_docs = self.get_num_docs()
    if len(doc_names) != num_docs:
        raise Exception("The parameter doc_names contains %s elements. "
                        "It should have %s elements, one per document." % (len(doc_names), num_docs))

    doc_names_counter = collections.Counter(np.array(doc_names))
    metafact = CSRMatrixFactory()
    metaidxstore = IndexStore()
    doc_id_uses = collections.Counter()
    # Walk the names positionally. Indexing with doc_names[i] is a label
    # lookup on a pandas Series, which picks the wrong element (or raises)
    # whenever the Series index is not 0..n-1.
    for i, doc_id in enumerate(doc_names):
        if doc_names_counter[doc_id] > 1:
            # Shared name: disambiguate with a running 1-based use count.
            doc_id_uses[doc_id] += 1
            doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
        else:
            doc_name_idx = metaidxstore.getidx(doc_id)
        # NOTE(review): the entry is written at column i with the name's
        # index as its value (so the value is 0 when doc_name_idx is 0),
        # rather than at column doc_name_idx -- confirm this is what
        # add_metadata expects.
        metafact[i, i] = doc_name_idx
    return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
    '''
    Build a TermDocMatrix from (category, parsed document) pairs.

    Parameters
    ----------
    category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

    Returns
    ----------
    t : TermDocMatrix
    '''
    term_index = IndexStore()
    category_index = IndexStore()
    metadata_index = IndexStore()
    # The helper fills the three index stores while producing the matrices.
    X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
        category_doc_iter,
        category_index,
        term_index,
        metadata_index
    )
    return TermDocMatrix(
        X,
        mX,
        y,
        term_idx_store=term_index,
        category_idx_store=category_index,
        metadata_idx_store=metadata_index
    )
def test_getidxstrictbatch(self):
    '''getidxstrictbatch returns indices in request order, repeats included.'''
    store = IndexStore()
    for expected_idx, val in enumerate(['a', 'b', 'c', 'd', 'e', 'f']):
        self.assertEqual(store.getidx(val), expected_idx)
    looked_up = store.getidxstrictbatch(['b', 'f', 'b', 'a'])
    self.assertEqual(looked_up, [1, 5, 1, 0])
def test_batch_delete_extra(self):
    '''Deleting values renumbers the survivors contiguously; deleting
    nothing yields a store with identical items.'''
    store = IndexStore()
    for expected_idx, val in enumerate(['a', 'b', 'c', 'd', 'e', 'f']):
        self.assertEqual(store.getidx(val), expected_idx)
    pruned = store.batch_delete_vals(['b', 'e'])
    self.assertEqual(list(pruned.items()),
                     [(0, 'a'), (1, 'c'), (2, 'd'), (3, 'f')])
    pruned_again = pruned.batch_delete_vals([])
    self.assertEqual(list(pruned.items()), list(pruned_again.items()))
def term_group_freq_df(self, group_col):
    # type: (str) -> pd.DataFrame
    '''
    Returns a dataframe indexed on the number of groups a term occurred in.

    Parameters
    ----------
    group_col : str
        Passed through to _get_group_docids_and_index_store to assign
        each document to a group.

    Returns
    -------
    pd.DataFrame
    '''
    group_idx_store = IndexStore()
    X = self._X
    group_idx_to_cat_idx, row_group_cat \
        = self._get_group_docids_and_index_store(X, group_col, group_idx_store)
    # Collapse documents into groups, then reduce counts to presence flags.
    newX = self._change_document_type_in_matrix(X, row_group_cat)
    newX = self._make_all_positive_data_ones(newX)
    # Remap group rows to category rows against an untouched snapshot.
    # Matching against the array being rewritten would let a row that has
    # already been mapped to a category index be mapped again whenever
    # that index equals a later group index.
    group_row = newX.tocoo().row
    category_row = group_row.copy()
    for group_idx, cat_idx in group_idx_to_cat_idx.items():
        category_row[group_row == group_idx] = cat_idx
    catX = self._change_document_type_in_matrix(newX, category_row)
    return self._term_freq_df_from_matrix(catX)
class OffsetCorpusFactory(object):
    def __init__(self, df, parsed_col, feat_and_offset_getter, category_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains parsed_col (and category_col, when given), where
            parsed_col is entirely spacy docs.
        parsed_col : str
            Name of the spacy parsed column in df.
        feat_and_offset_getter : object
            Must provide get_term_offsets(doc) and
            get_metadata_offsets(doc), each yielding
            (name, (value, offsets)) pairs.
        category_col : str, Optional
            Name of the category column in df; if None, all category
            names will be '_'.
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feat_and_offset_getter = feat_and_offset_getter
        self._term_offsets = {}
        self._metadata_offsets = {}

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        OffsetCorpus
        '''
        self._ensure_category_col_is_in_df()
        y = self._get_y_and_populate_category_idx_store(
            self._df[self._category_col])
        self._df.apply(self._add_to_x_factory, axis=1)
        # Pin the row count to the number of documents before building.
        last_row_idx = len(y) - 1
        self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        return OffsetCorpus(
            df=self._df,
            X=X,
            mX=self._mX,
            y=y,
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_offsets=self._term_offsets,
            metadata_offsets=self._metadata_offsets)

    def _ensure_category_col_is_in_df(self):
        '''Guarantee self._df has a category column, creating one if needed.

        When the configured column is absent (for example category_col is
        None), an unused column name is chosen and filled with '_'.
        '''
        if self._category_col not in self._df:
            self._category_col = 'Category'
            letters = list(string.ascii_letters)
            while self._category_col in self._df:
                # np.random.choice needs a 1-D sequence; a bare string is
                # treated as a 0-d array and rejected.
                self._category_col = 'Category_' + ''.join(
                    np.random.choice(letters) for _ in range(5))
            # Create the column so build() can read it back.
            self._df[self._category_col] = '_'

    def _get_y_and_populate_category_idx_store(self, categories):
        '''Map every category value to its index, registering new ones.'''
        return np.array(categories.apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        '''Record one row's term and metadata values and their offsets.'''
        parsed_text = row[self._parsed_col]
        # row.name is the row's index label in self._df.
        for term, (count, offsets) in self._feat_and_offset_getter.get_term_offsets(
                parsed_text):
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
            if offsets is not None:
                # Offsets are grouped as term -> row label -> list.
                self._term_offsets.setdefault(term, {}).setdefault(
                    row.name, []).extend(offsets)
        for meta, (val, offsets) in self._feat_and_offset_getter.get_metadata_offsets(
                parsed_text):
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
            if offsets is not None:
                self._metadata_offsets.setdefault(meta, {}).setdefault(
                    row.name, []).extend(offsets)
class CorpusFromParsedDocuments(object):
    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col, where parsed_col is
            entirely spacy docs.
        category_col : str
            Name of the category column in df.
        parsed_col : str
            Name of the spacy parsed column in df.
        feats_from_spacy_doc : FeatsFromSpacyDoc
            Supplies get_feats and get_doc_metadata for each parsed doc.
        '''
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._feats_from_spacy_doc = feats_from_spacy_doc
        self._category_idx_store = IndexStore()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._df = df.reset_index()

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        # Pin the row count to the number of documents before building.
        final_row = len(self._y) - 1
        term_factory = self._X_factory.set_last_row_idx(final_row)
        self._X = term_factory.get_csr_matrix()
        meta_factory = self._mX_factory.set_last_row_idx(final_row)
        self._mX = meta_factory.get_csr_matrix()
        return ParsedCorpus(
            self._df,
            self._X,
            self._mX,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            self._metadata_idx_store,
            self._parsed_col,
            self._category_col
        )

    def _get_y_and_populate_category_idx_store(self):
        '''Map every category value to its index, registering new ones.'''
        category_values = self._df[self._category_col]
        category_ids = category_values.apply(self._category_idx_store.getidx)
        return np.array(category_ids)

    def _add_to_x_factory(self, row):
        '''Record one row's term counts and doc metadata in the factories.'''
        doc = row[self._parsed_col]
        extractor = self._feats_from_spacy_doc
        # row.name is the row's index label in self._df.
        doc_row = row.name
        for term, count in extractor.get_feats(doc).items():
            self._X_factory[doc_row, self._term_idx_store.getidx(term)] = count
        for meta, val in extractor.get_doc_metadata(doc).items():
            self._mX_factory[doc_row, self._metadata_idx_store.getidx(meta)] = val

    def _make_new_term_doc_matrix(self,
                                  new_X,
                                  new_mX,
                                  new_y,
                                  new_term_idx_store,
                                  new_category_idx_store,
                                  new_metadata_idx_store,
                                  new_y_mask):
        '''Wrap the given matrices and stores in a ParsedCorpus over the
        rows of self._df selected by new_y_mask.'''
        selected_rows = self._df[new_y_mask]
        return ParsedCorpus(
            selected_rows,
            new_X,
            new_mX,
            new_y,
            new_term_idx_store,
            new_category_idx_store,
            new_metadata_idx_store,
            self._parsed_col,
            self._category_col
        )
class CorpusFromFeatureDict(object):
    def __init__(self,
                 df,
                 category_col,
                 text_col,
                 feature_col,
                 metadata_col=None,
                 parsed_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Data frame holding the columns named below.
        category_col : str
            Name of the category column in df.
        text_col : str
            Name of the column which contains each document's raw text.
        feature_col : str
            Name of the column in df with a feature dictionary.
        metadata_col : str, optional
            Name of the column in df with a metadata dictionary.
        parsed_col : str, optional
            Name of the column in df with parsed documents.
        '''
        self._category_col = category_col
        self._text_col = text_col
        self._feature_col = feature_col
        self._metadata_col = metadata_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._df = df.reset_index()

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        ParsedCorpus when parsed_col was given and is a column of the
        data frame, otherwise CorpusDF.
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        self._df.apply(self._add_to_x_factory, axis=1)
        # Pin the row count to the number of documents before building.
        final_row = len(self._y) - 1
        self._X = self._X_factory.set_last_row_idx(final_row).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(final_row).get_csr_matrix()
        if self._parsed_col is None or self._parsed_col not in self._df:
            return CorpusDF(
                self._df,
                self._X,
                self._mX,
                self._y,
                self._text_col,
                self._term_idx_store,
                self._category_idx_store,
                self._metadata_idx_store
            )
        return ParsedCorpus(
            self._df,
            self._X,
            self._mX,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            self._metadata_idx_store,
            self._parsed_col,
            self._category_col
        )

    def _get_y_and_populate_category_idx_store(self):
        '''Map every category value to its index, registering new ones.'''
        category_values = self._df[self._category_col]
        category_ids = category_values.apply(self._category_idx_store.getidx)
        return np.array(category_ids)

    def _add_to_x_factory(self, row):
        '''Copy one row's feature (and, if present, metadata) dictionary
        into the matrix factories.'''
        # row.name is the row's index label in self._df.
        doc_row = row.name
        for feat, count in row[self._feature_col].items():
            self._X_factory[doc_row, self._term_idx_store.getidx(feat)] = count
        if self._metadata_col not in self._df:
            return
        for meta, count in row[self._metadata_col].items():
            self._mX_factory[doc_row, self._metadata_idx_store.getidx(meta)] = count

    def _make_new_term_doc_matrix(self,
                                  new_X,
                                  new_mX,
                                  new_y,
                                  new_term_idx_store,
                                  new_category_idx_store,
                                  new_metadata_idx_store,
                                  new_y_mask):
        '''Wrap the given matrices and stores in the same corpus type that
        build() produces, over the rows selected by new_y_mask.'''
        selected_rows = self._df[new_y_mask]
        if self._parsed_col is None or self._parsed_col not in self._df:
            # NOTE(review): this passes a ninth positional argument (the
            # masked text column) that the CorpusDF call in build() does
            # not pass -- confirm it matches CorpusDF's signature.
            return CorpusDF(
                selected_rows,
                new_X,
                new_mX,
                new_y,
                self._text_col,
                new_term_idx_store,
                new_category_idx_store,
                new_metadata_idx_store,
                self._df[self._text_col][new_y_mask]
            )
        return ParsedCorpus(
            selected_rows,
            new_X,
            new_mX,
            new_y,
            new_term_idx_store,
            new_category_idx_store,
            new_metadata_idx_store,
            self._parsed_col,
            self._category_col
        )