def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Store the inputs and create empty index stores and matrix factories.

        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col, where parsed_col consists
            entirely of spaCy docs.  Stored after calling reset_index().
        category_col : str
            Name of the category column in df.
        parsed_col : str
            Name of the spaCy-parsed column in df.
        feats_from_spacy_doc : FeatsFromSpacyDoc
            Feature extractor kept for later use.
        '''
        # Inputs.
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col

        # Fresh, empty builders (created in the same order as before).
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()

        self._feats_from_spacy_doc = feats_from_spacy_doc
Exemple #2
0
class CorpusFromParsedDocuments(object):
    '''Builds a ParsedCorpus from a data frame of categories and parsed docs.'''

    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            Contains category_col and parsed_col, where parsed_col consists
            entirely of spaCy docs.  Stored after calling reset_index().
        category_col : str
            Name of the category column in df.
        parsed_col : str
            Name of the spaCy-parsed column in df.
        feats_from_spacy_doc : FeatsFromSpacyDoc
            Supplies get_feats and get_doc_metadata for each parsed document.
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        # apply() is used purely for its side effect of filling both factories.
        self._df.apply(self._add_to_x_factory, axis=1)

        # Both matrices are given the same last row index: one row per label.
        last_row_idx = len(self._y) - 1
        self._X = self._X_factory.set_last_row_idx(
            last_row_idx).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(
            last_row_idx).get_csr_matrix()

        return ParsedCorpus(
            self._df,
            self._X,
            self._mX,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            self._metadata_idx_store,
            self._parsed_col,
            self._category_col,
        )

    def _get_y_and_populate_category_idx_store(self):
        '''Map each row's category through the category index store.

        Returns
        -------
        np.array of the values returned by getidx, one per row of the frame.
        '''
        categories = self._df[self._category_col]
        category_indices = categories.apply(self._category_idx_store.getidx)
        return np.array(category_indices)

    def _add_to_x_factory(self, row):
        '''Write one row's term counts and document metadata into the factories.

        Parameters
        ----------
        row : row of the stored data frame; row.name is used as the matrix
            row index.
        '''
        doc = row[self._parsed_col]
        doc_idx = row.name
        extractor = self._feats_from_spacy_doc

        term_counts = extractor.get_feats(doc)
        for term, count in term_counts.items():
            self._X_factory[doc_idx, self._term_idx_store.getidx(term)] = count

        doc_metadata = extractor.get_doc_metadata(doc)
        for meta, val in doc_metadata.items():
            self._mX_factory[doc_idx,
                             self._metadata_idx_store.getidx(meta)] = val
Exemple #3
0
    def init_term_doc_matrix_variables():
        '''Create the empty containers used to assemble a term-document matrix.

        Returns
        -------
        tuple
            (X_factory, mX_factory, category_idx_store, term_idx_store,
            metadata_idx_store, y), where the factories are new
            CSRMatrixFactory objects, the stores are new IndexStore objects
            and y is an empty list.
        '''
        labels = []
        term_matrix_factory = CSRMatrixFactory()
        metadata_matrix_factory = CSRMatrixFactory()
        categories = IndexStore()
        terms = IndexStore()
        metadata = IndexStore()
        return (term_matrix_factory, metadata_matrix_factory, categories,
                terms, metadata, labels)
Exemple #4
0
 def test_main(self):
     '''Check index assignment, reverse lookup, membership and enumeration.'''
     store = IndexStore()

     # Repeating a value must give back the index it was first assigned.
     for val, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
         self.assertEqual(store.getidx(val), expected_idx)

     for idx, expected_val in ((0, 'a'), (1, 'b')):
         self.assertEqual(store.getval(idx), expected_val)

     self.assertIn('a', store)
     self.assertNotIn('c', store)
     self.assertEqual(set(store.values()), {'a', 'b'})

     # Membership is tested against values; indices are checked with hasidx.
     self.assertNotIn(0, store)
     self.assertTrue(store.hasidx(0))
     self.assertFalse(store.hasidx(2))

     self.assertEqual(store.getnumvals(), 2)
     self.assertEqual(list(store.items()), [(0, 'a'), (1, 'b')])
Exemple #5
0
 def test_getidxstrict(self):
     '''getidxstrict must raise KeyError for a value that was never added.'''
     store = IndexStore()
     for val, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
         self.assertEqual(store.getidx(val), expected_idx)
     self.assertRaises(KeyError, store.getidxstrict, 'c')
    def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Build a TermDocMatrix from categorized spaCy documents.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        ----------
        t : TermDocMatrix
        '''
        terms = IndexStore()
        categories = IndexStore()
        metadata = IndexStore()

        # The helper receives the three fresh stores along with the documents
        # and hands back the two matrices and the label vector.
        features_and_labels = \
            self._get_features_and_labels_from_documents_and_indexes(
                category_doc_iter, categories, terms, metadata)
        X, mX, y = features_and_labels

        return TermDocMatrix(
            X, mX, y,
            term_idx_store=terms,
            category_idx_store=categories,
            metadata_idx_store=metadata)
Exemple #7
0
 def test_batch_delete(self):
     '''Batch deletion by value and by index returns a new, re-indexed store.'''
     store = IndexStore()
     for val, expected_idx in (('a', 0), ('b', 1), ('c', 2), ('d', 3)):
         self.assertEqual(store.getidx(val), expected_idx)

     # Deleting a value that is not present is an error.
     self.assertRaises(KeyError, store.batch_delete_vals, ['e', 'c'])

     pruned = store.batch_delete_vals(['b', 'c'])
     # 'c' and 'e' are absent from the pruned store, so getidx hands out the
     # next indices for them.
     for val, expected_idx in (('a', 0), ('c', 2), ('e', 3)):
         self.assertEqual(pruned.getidx(val), expected_idx)

     # The original store still reports its original indices.
     for val, expected_idx in (('d', 3), ('c', 2), ('b', 1), ('a', 0)):
         self.assertEqual(store.getidx(val), expected_idx)

     # Deleting an index that is not present is an error.
     self.assertRaises(ValueError, store.batch_delete_idx, [5, 1])

     pruned = store.batch_delete_idx([2, 1])
     for val, expected_idx in (('a', 0), ('c', 2), ('e', 3)):
         self.assertEqual(pruned.getidx(val), expected_idx)
Exemple #8
0
    def test_batch_delete_extra(self):
        '''Deleting values compacts the indices; deleting nothing changes nothing.'''
        store = IndexStore()
        for expected_idx, val in enumerate('abcdef'):
            self.assertEqual(store.getidx(val), expected_idx)

        pruned = store.batch_delete_vals(['b', 'e'])
        expected_items = [(0, 'a'), (1, 'c'), (2, 'd'), (3, 'f')]
        self.assertEqual(list(pruned.items()), expected_items)

        # An empty deletion list yields a store with the same items.
        pruned_again = pruned.batch_delete_vals([])
        self.assertEqual(list(pruned.items()), list(pruned_again.items()))
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a TermDocMatrix of unigram and bigram counts from pre-tokenized text.

    Parameters
    ----------
    category_text_iter : iterator of (str, str) pairs
        (category name, text) pairs, where text holds one sentence per line
        and tokens are whitespace-delimited.

    Returns
    -------
    TermDocMatrix
        X holds per-document term counts, y holds the category index of each
        document, and the metadata matrix and metadata index store are empty.
    '''
    y = []
    X_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    mX_factory = CSRMatrixFactory()
    metadata_idx_store = IndexStore()
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        # Punctuation is stripped only from the two ends of the whole text,
        # not from individual tokens; the text is lower-cased before splitting.
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sent.split()
            # Bigrams never span a line break because each line is handled
            # on its own.
            bigrams = [' '.join(pair) for pair in zip(unigrams, unigrams[1:])]
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    if y:
        # Size both matrices to one row per document, so that trailing
        # documents with no terms (and the never-written metadata matrix)
        # still line up with y.  Presumably the factory otherwise sizes
        # itself from the largest row index written -- confirm.
        last_row_idx = len(y) - 1
        X = X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        mX = mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
    else:
        # No documents: keep whatever the untouched factories produce.
        X = X_factory.get_csr_matrix()
        mX = mX_factory.get_csr_matrix()
    return TermDocMatrix(X=X,
                         mX=mX,
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
    def term_group_freq_df(self, group_col):
        # type: (str) -> pd.DataFrame
        '''
        Returns a dataframe indexed on the number of groups a term occurred in.

        Parameters
        ----------
        group_col : str
            Passed to _get_group_docids_and_index_store to assign documents
            to groups.

        Returns
        -------
        pd.DataFrame
        '''
        group_idx_store = IndexStore()
        X = self._X
        group_idx_to_cat_idx, row_group_cat \
            = self._get_group_docids_and_index_store(X, group_col, group_idx_store)
        # Re-key the matrix rows by group, then reduce every positive entry
        # to one so each group counts at most once per term.
        newX = self._change_document_type_in_matrix(X, row_group_cat)
        newX = self._make_all_positive_data_ones(newX)
        group_row = newX.tocoo().row
        # Remap group indices to category indices against an untouched copy.
        # Rewriting in place while comparing against the same array would
        # remap a row twice whenever a category index written by one step
        # equals a group index handled by a later step.
        category_row = group_row.copy()
        for group_idx, cat_idx in group_idx_to_cat_idx.items():
            category_row[group_row == group_idx] = cat_idx
        catX = self._change_document_type_in_matrix(newX, category_row)
        return self._term_freq_df_from_matrix(catX)