Esempio n. 1
0
    def __init__(self, corpus):
        '''
        Index every sentence of the corpus by the known terms it contains.

        Walks each parsed document sentence by sentence and records, in a
        boolean sparse matrix (``self.sentX``), which indexed terms occur in
        which sentence.  Tokens whose lower-cased form cannot be resolved by
        the corpus term index are skipped, and a sentence without any
        resolvable token gets no row, label, or document entry.

        Parameters
        ----------
        corpus : ParsedCorpus
            Supplies the parsed documents, the term index store
            (``_term_idx_store``) and the per-document labels (``_y``).

        Raises
        ------
        AssertionError
            If ``corpus`` is not a ``ParsedCorpus``.
        '''
        assert isinstance(corpus, ParsedCorpus)
        self.corpus = corpus
        self.termidxstore = corpus._term_idx_store
        matfact = CSRMatrixFactory()
        # Initialised here but never filled by this constructor.
        self.doclabs = []
        # Parallel lists: one entry per sentence that has at least one
        # resolvable term (label of the owning document / its index).
        self.sentlabs = []
        self.sentdocs = []
        senti = 0
        for doci, doc in enumerate(corpus.get_parsed_docs()):
            for sent in doc.sents:
                validsent = False
                for t in sent:
                    try:
                        termi = self.termidxstore.getidxstrict(t.lower_)
                    except Exception:
                        # Best-effort skip of tokens the index cannot resolve
                        # (presumably a lookup error for unknown terms --
                        # confirm against the index store).  Unlike a bare
                        # ``except``, this lets KeyboardInterrupt and
                        # SystemExit propagate out of a long-running build.
                        continue
                    if validsent is False:
                        # NOTE(review): the counter is bumped before the first
                        # write, so row indices start at 1 and row 0 is never
                        # written; sentlabs[k] / sentdocs[k] therefore line up
                        # with row k + 1.  Left unchanged because other code
                        # may rely on this layout -- confirm.
                        senti += 1
                        self.sentlabs.append(corpus._y[doci])
                        self.sentdocs.append(doci)
                        validsent = True
                    matfact[senti, termi] = 1
        # Only presence/absence of a term in a sentence is kept.
        self.sentX = matfact.get_csr_matrix().astype(bool)
	def __init__(self, corpus):
		'''
		Index every sentence of the corpus by the known terms it contains.

		Walks each parsed document sentence by sentence and records, in a
		boolean sparse matrix (``self.sentX``), which indexed terms occur in
		which sentence.  Tokens whose lower-cased form cannot be resolved by
		the corpus term index are skipped, and a sentence without any
		resolvable token gets no row, label, or document entry.

		Parameters
		----------
		corpus : ParsedCorpus
			Supplies the parsed documents, the term index store
			(``_term_idx_store``) and the per-document labels (``_y``).

		Raises
		------
		AssertionError
			If ``corpus`` is not a ``ParsedCorpus``.
		'''
		assert isinstance(corpus, ParsedCorpus)
		self.corpus = corpus
		self.termidxstore = corpus._term_idx_store
		matfact = CSRMatrixFactory()
		# Initialised here but never filled by this constructor.
		self.doclabs = []
		# Parallel lists: one entry per sentence that has at least one
		# resolvable term (label of the owning document / its index).
		self.sentlabs = []
		self.sentdocs = []
		senti = 0
		for doci, doc in enumerate(corpus.get_parsed_docs()):
			for sent in doc.sents:
				validsent = False
				for t in sent:
					try:
						termi = self.termidxstore.getidxstrict(t.lower_)
					except Exception:
						# Best-effort skip of tokens the index cannot resolve
						# (presumably a lookup error for unknown terms --
						# confirm against the index store).  Unlike a bare
						# ``except``, this lets KeyboardInterrupt and
						# SystemExit propagate out of a long-running build.
						continue
					if validsent is False:
						# NOTE(review): the counter is bumped before the first
						# write, so row indices start at 1 and row 0 is never
						# written; sentlabs[k] / sentdocs[k] therefore line up
						# with row k + 1.  Left unchanged because other code
						# may rely on this layout -- confirm.
						senti += 1
						self.sentlabs.append(corpus._y[doci])
						self.sentdocs.append(doci)
						validsent = True
					matfact[senti, termi] = 1
		# Only presence/absence of a term in a sentence is kept.
		self.sentX = matfact.get_csr_matrix().astype(bool)
 def test_add_metadata(self):
     '''
     add_metadata yields an object unequal to the original, and raises
     AssertionError for metadata built for fewer documents than the matrix has.
     '''
     hamlet = get_hamlet_term_doc_matrix()

     # Metadata covering every document: one diagonal entry per document.
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs()):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     other_hamlet = hamlet.add_metadata(meta_fact.get_csr_matrix(),
                                        meta_index_store)
     # Use the TestCase assertion rather than a bare ``assert`` so the check
     # still runs under ``python -O`` and reports both operands on failure.
     self.assertNotEqual(other_hamlet, hamlet)

     # Metadata covering five documents fewer than the matrix must be rejected.
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs() - 5):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     with self.assertRaises(AssertionError):
         hamlet.add_metadata(meta_fact.get_csr_matrix(),
                             meta_index_store)
 def test_main(self):
     '''
     Check the colors assigned to categories and to terms, first on the plain
     text corpus and then on the same corpus with per-category metadata added.
     '''
     categories, documents = get_docs_categories()
     frame = pd.DataFrame({'category': categories, 'text': documents})
     corpus = CorpusFromPandas(frame, 'category', 'text', nlp=whitespace_nlp).build()

     # Category -> RGB triple.
     category_colors = CategoryColorAssigner(corpus).get_category_colors().to_dict()
     expected_category_colors = {
         '???': [255, 127, 14],
         'hamlet': [174, 199, 232],
         'jay-z/r. kelly': [31, 119, 180],
     }
     self.assertEqual(category_colors, expected_category_colors)

     # Term -> hex color; every term in the corpus must receive one.
     term_colors = CategoryColorAssigner(corpus).get_term_colors()
     for term, expected_hex in (('this time', 'aec7e8'), ('sire', '1f77b4')):
         self.assertEqual(term_colors[term], expected_hex)
     self.assertEqual(len(term_colors), corpus.get_num_terms())

     # Attach one metadata feature per document: its own category name.
     category_matrix = CSRMatrixFactory()
     category_index = IndexStore()
     for row, category in enumerate(frame['category']):
         column = category_index.getidx(category)
         category_matrix[row, column] = 1
     corpus_with_meta = corpus.add_metadata(category_matrix.get_csr_matrix(), category_index)

     meta_assigner = CategoryColorAssigner(corpus_with_meta, use_non_text_features=True)
     meta_colors = meta_assigner.get_term_colors()
     expected_meta_colors = {'hamlet': 'aec7e8', 'jay-z/r. kelly': '1f77b4', '???': 'ff7f0e'}
     self.assertEqual(meta_colors, expected_meta_colors)

     # Without the flag the assigner must not return the metadata coloring.
     text_only_colors = CategoryColorAssigner(corpus_with_meta).get_term_colors()
     self.assertNotEqual(text_only_colors, meta_colors)
Esempio n. 5
0
 def test_add_metadata(self):
     '''
     add_metadata yields an object unequal to the original, and raises
     AssertionError for metadata built for fewer documents than the matrix has.
     '''
     hamlet = get_hamlet_term_doc_matrix()

     # Metadata covering every document: one diagonal entry per document.
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs()):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     other_hamlet = hamlet.add_metadata(meta_fact.get_csr_matrix(),
                                        meta_index_store)
     # Use the TestCase assertion rather than a bare ``assert`` so the check
     # still runs under ``python -O`` and reports both operands on failure.
     self.assertNotEqual(other_hamlet, hamlet)

     # Metadata covering five documents fewer than the matrix must be rejected.
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs() - 5):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     with self.assertRaises(AssertionError):
         hamlet.add_metadata(meta_fact.get_csr_matrix(), meta_index_store)