def __init__(self, corpus):
    '''
    Build a boolean sentence-by-term matrix from a parsed corpus.

    Each sentence of each parsed document is scanned token by token. The
    lower-cased token text is looked up with the corpus' term index store
    via ``getidxstrict``; tokens for which that lookup raises are skipped.
    A sentence is recorded only once at least one of its tokens resolves.

    Parameters
    ----------
    corpus : ParsedCorpus
        Supplies the parsed documents (``get_parsed_docs()``), the term
        index store (``_term_idx_store``) and per-document labels (``_y``).

    Raises
    ------
    AssertionError
        If ``corpus`` is not a ``ParsedCorpus`` instance.

    Notes
    -----
    Attributes set here: ``corpus``, ``termidxstore``, ``doclabs`` (left
    empty by this method), ``sentlabs`` (document label per recorded
    sentence), ``sentdocs`` (document index per recorded sentence) and
    ``sentX`` (the factory's CSR matrix cast to ``bool``).

    NOTE(review): the row counter is incremented *before* its first use,
    so the first recorded sentence is written to row 1 while its label is
    stored at ``sentlabs[0]``; row 0 is never written by this loop. That
    numbering is preserved as-is because code outside this method may
    depend on it -- confirm before changing.
    '''
    assert isinstance(corpus, ParsedCorpus)
    self.corpus = corpus
    self.termidxstore = corpus._term_idx_store
    matfact = CSRMatrixFactory()
    self.doclabs = []
    self.sentlabs = []
    self.sentdocs = []
    senti = 0
    for doci, doc in enumerate(corpus.get_parsed_docs()):
        for sent in doc.sents:
            validsent = False
            for t in sent:
                try:
                    termi = self.termidxstore.getidxstrict(t.lower_)
                except Exception:
                    # Best-effort skip of tokens that cannot be resolved.
                    # Deliberately not a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    continue
                if not validsent:
                    # First resolvable token: open a new sentence row and
                    # remember which document (and label) it came from.
                    senti += 1
                    self.sentlabs.append(corpus._y[doci])
                    self.sentdocs.append(doci)
                    validsent = True
                matfact[senti, termi] = 1
    self.sentX = matfact.get_csr_matrix().astype(bool)
def test_add_metadata(self):
    '''
    Exercise ``add_metadata`` on the Hamlet term-document matrix.

    Metadata built with one diagonal entry per document must be accepted
    and yield an object that compares unequal to the original. Metadata
    built for five fewer documents must raise ``AssertionError``.
    '''

    def diagonal_metadata(entry_count):
        # Entry (k, k) holds the index the store hands out for str(k).
        store = IndexStore()
        factory = CSRMatrixFactory()
        for k in range(entry_count):
            factory[k, k] = store.getidx(str(k))
        return factory, store

    term_doc_matrix = get_hamlet_term_doc_matrix()

    # Case 1: one entry per document -> a new, different object comes back.
    full_factory, full_store = diagonal_metadata(term_doc_matrix.get_num_docs())
    with_metadata = term_doc_matrix.add_metadata(
        full_factory.get_csr_matrix(),
        full_store,
    )
    assert with_metadata != term_doc_matrix

    # Case 2: five documents short -> rejected with an AssertionError.
    short_factory, short_store = diagonal_metadata(
        term_doc_matrix.get_num_docs() - 5
    )
    with self.assertRaises(AssertionError):
        term_doc_matrix.add_metadata(
            short_factory.get_csr_matrix(),
            short_store,
        )
def test_main(self):
    '''
    Check the colours ``CategoryColorAssigner`` reports for a small corpus.

    Covers the per-category colours, a couple of per-term colours and the
    total number of coloured terms, then repeats the term-colour check on
    a corpus whose metadata marks each document's category with
    ``use_non_text_features=True``.
    '''
    categories, documents = get_docs_categories()
    frame = pd.DataFrame({'category': categories, 'text': documents})
    corpus = CorpusFromPandas(frame, 'category', 'text', nlp=whitespace_nlp).build()

    # Category-level colours.
    expected_category_colors = {
        '???': [255, 127, 14],
        'hamlet': [174, 199, 232],
        'jay-z/r. kelly': [31, 119, 180],
    }
    category_colors = CategoryColorAssigner(corpus).get_category_colors()
    self.assertEqual(category_colors.to_dict(), expected_category_colors)

    # Term-level colours: spot-check two terms and the overall count.
    text_term_colors = CategoryColorAssigner(corpus).get_term_colors()
    self.assertEqual(text_term_colors['this time'], 'aec7e8')
    self.assertEqual(text_term_colors['sire'], '1f77b4')
    self.assertEqual(len(text_term_colors), corpus.get_num_terms())

    # Attach metadata with a 1 at (document row, category column).
    meta_factory = CSRMatrixFactory()
    meta_store = IndexStore()
    for row, category in enumerate(frame['category']):
        column = meta_store.getidx(category)
        meta_factory[row, column] = 1
    corpus_with_meta = corpus.add_metadata(meta_factory.get_csr_matrix(), meta_store)

    # With non-text features enabled the "terms" are the category names.
    expected_meta_colors = {
        'hamlet': 'aec7e8',
        'jay-z/r. kelly': '1f77b4',
        '???': 'ff7f0e',
    }
    meta_colors = CategoryColorAssigner(
        corpus_with_meta,
        use_non_text_features=True,
    ).get_term_colors()
    self.assertEqual(meta_colors, expected_meta_colors)

    # The default (text) term colours must differ from the metadata ones.
    self.assertNotEqual(
        CategoryColorAssigner(corpus_with_meta).get_term_colors(),
        meta_colors,
    )