def setUp(cls):
    """Create the fixture DataFrame and the corpus built from its raw text.

    The frame is stored on ``cls.df`` (columns ``category`` and ``text``)
    and the built corpus on ``cls.corpus``.
    """
    labels, texts = get_docs_categories()
    cls.df = pd.DataFrame({'category': labels, 'text': texts})
    factory = CorpusFromPandas(cls.df, 'category', 'text', nlp=whitespace_nlp)
    cls.corpus = factory.build()
コード例 #2
0
 def setUp(cls):
     """Build ``cls.df`` from the sample documents and ``cls.corpus`` from it.

     The corpus is produced by ``CorpusFromPandas`` reading the ``category``
     and ``text`` columns, tokenised with ``whitespace_nlp``.
     """
     categories, documents = get_docs_categories()
     columns = {'category': categories, 'text': documents}
     cls.df = pd.DataFrame(columns)
     corpus_builder = CorpusFromPandas(
         cls.df, 'category', 'text', nlp=whitespace_nlp)
     cls.corpus = corpus_builder.build()
コード例 #3
0
	def setUp(cls):
		"""Parse the sample documents and prepare a parsed-document corpus factory.

		Sets ``cls.categories``, ``cls.documents``, ``cls.parsed_docs``,
		``cls.df`` and ``cls.corpus_fact``. The factory is stored as-is;
		``build()`` is not called here.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		frame_columns = {'category': cls.categories, 'parsed': cls.parsed_docs}
		cls.df = pd.DataFrame(frame_columns)
		cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
コード例 #4
0
	def setUp(cls):
		"""Set up parsed documents, their DataFrame, and an unbuilt corpus factory.

		Each document returned by ``get_docs_categories`` is run through
		``whitespace_nlp``; the results populate the ``parsed`` column of
		``cls.df``, which backs ``cls.corpus_fact``.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = list(map(whitespace_nlp, cls.documents))
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'parsed': cls.parsed_docs,
		})
		cls.corpus_fact = CorpusFromParsedDocuments(
			cls.df, 'category', 'parsed')
コード例 #5
0
	def setUp(cls):
		"""Build two corpora over the same documents.

		``cls.parsed_corpus`` is built from the pre-parsed ``parsed`` column;
		``cls.corpus`` is built from the upper-cased raw text in ``orig``
		using ``whitespace_nlp``.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		upper_cased = [text.upper() for text in cls.documents]
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'parsed': cls.parsed_docs,
			'orig': upper_cased,
		})
		parsed_factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
		cls.parsed_corpus = parsed_factory.build()
		raw_factory = CorpusFromPandas(cls.df, 'category', 'orig', nlp=whitespace_nlp)
		cls.corpus = raw_factory.build()
コード例 #6
0
	def setUp(cls):
		"""Create a parsed-document corpus whose frame also carries metadata columns.

		Besides ``category`` and ``parsed``, ``cls.df`` gets a fixed ``author``
		column (ten entries) and ``document_lengths`` (``len`` of each raw
		document). The built corpus is stored on ``cls.corpus``.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		authors = ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e']
		lengths = [len(text) for text in cls.documents]
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'author': authors,
			'parsed': cls.parsed_docs,
			'document_lengths': lengths,
		})
		factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
		cls.corpus = factory.build()
コード例 #7
0
	def setUp(cls):
		"""Prepare ``cls.df`` with author/length metadata and build ``cls.corpus``.

		Documents are parsed with ``whitespace_nlp``; the corpus is built by
		``CorpusFromParsedDocuments`` from the ``category`` and ``parsed``
		columns.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = list(map(whitespace_nlp, cls.documents))
		frame_columns = {
			'category': cls.categories,
			# One hard-coded author label per sample document.
			'author': ['a', 'a', 'c', 'c', 'c',
			           'c', 'd', 'd', 'e', 'e'],
			'parsed': cls.parsed_docs,
			'document_lengths': list(map(len, cls.documents)),
		}
		cls.df = pd.DataFrame(frame_columns)
		cls.corpus = CorpusFromParsedDocuments(
			cls.df, 'category', 'parsed').build()
    def test_term_category_matrix_from_pandas_without_categories(self):
        """Check the category-less matrix against one built with categories.

        The matrix from ``get_term_doc_matrix_without_categories`` must be a
        ``TermDocMatrixWithoutCategories`` and share its terms, document
        count, and term-document matrix data with a regular
        ``TermDocMatrixFromPandas`` build over the same documents.
        """
        tdm = get_term_doc_matrix_without_categories()

        categories, documents = get_docs_categories()
        labelled_df = pd.DataFrame({'text': documents, 'categories': categories})
        reg_factory = TermDocMatrixFromPandas(
            labelled_df,
            text_col='text',
            category_col='categories',
            nlp=whitespace_nlp,
        )
        reg_tdm = reg_factory.build()

        self.assertIsInstance(tdm, TermDocMatrixWithoutCategories)
        self.assertEqual(tdm.get_terms(), reg_tdm.get_terms())
        self.assertEqual(tdm.get_num_docs(), reg_tdm.get_num_docs())
        actual_data = tdm.get_term_doc_mat().data
        expected_data = reg_tdm.get_term_doc_mat().data
        np.testing.assert_equal(actual_data, expected_data)
コード例 #9
0
	def test_get_term_df(self):
		"""Verify per-category term frequencies returned by ``get_term_freq_df``.

		Builds a term-document matrix from the sample documents and checks
		the frequency rows for the terms ``'speak up'`` and ``'that'``.
		"""
		categories, documents = get_docs_categories()
		df = pd.DataFrame({'category': categories,
		                   'text': documents})
		tdm_factory = TermDocMatrixFromPandas(df,
		                                      'category',
		                                      'text',
		                                      nlp=whitespace_nlp)
		term_doc_matrix = tdm_factory.build()

		term_df = term_doc_matrix.get_term_freq_df()
		# Use label-based ``.loc``: the ``.ix`` indexer was removed in
		# pandas 1.0 and raises AttributeError on current versions.
		self.assertEqual(dict(term_df.loc['speak up']),
		                 {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1})
		self.assertEqual(dict(term_df.loc['that']),
		                 {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0})
コード例 #10
0
    def test_get_term_df(self):
        """Verify per-category term frequencies returned by ``get_term_freq_df``.

        Builds a term-document matrix from the sample documents and checks
        the frequency rows for the terms ``'speak up'`` and ``'that'``.
        """
        categories, documents = get_docs_categories()
        df = pd.DataFrame({'category': categories,
                           'text': documents})
        tdm_factory = TermDocMatrixFromPandas(df,
                                              'category',
                                              'text',
                                              nlp=whitespace_nlp)
        term_doc_matrix = tdm_factory.build()

        term_df = term_doc_matrix.get_term_freq_df()
        # Use label-based ``.loc``: the ``.ix`` indexer was removed in
        # pandas 1.0 and raises AttributeError on current versions.
        self.assertEqual(dict(term_df.loc['speak up']),
                         {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1})
        self.assertEqual(dict(term_df.loc['that']),
                         {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0})
コード例 #11
0
 def setUp(cls):
     """Build a parsed-document corpus and a raw-text corpus from one frame.

     ``cls.df`` holds the categories, the ``whitespace_nlp``-parsed
     documents (``parsed``) and the upper-cased originals (``orig``).
     ``cls.parsed_corpus`` comes from the ``parsed`` column and
     ``cls.corpus`` from the ``orig`` column.
     """
     cls.categories, cls.documents = get_docs_categories()
     cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
     upper_cased = [text.upper() for text in cls.documents]
     cls.df = pd.DataFrame({'category': cls.categories,
                            'parsed': cls.parsed_docs,
                            'orig': upper_cased})
     parsed_factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
     cls.parsed_corpus = parsed_factory.build()
     raw_factory = CorpusFromPandas(cls.df, 'category', 'orig',
                                    nlp=whitespace_nlp)
     cls.corpus = raw_factory.build()
 def test_main(self):
     """Exercise ``CategoryColorAssigner`` on text terms and on metadata.

     First checks the category colours and selected term colours for a
     plain corpus, then attaches a one-hot category matrix as metadata and
     checks the colours produced with ``use_non_text_features=True``.
     """
     categories, documents = get_docs_categories()
     df = pd.DataFrame({'category': categories, 'text': documents})
     corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()

     expected_category_colors = {
         '???': [255, 127, 14],
         'hamlet': [174, 199, 232],
         'jay-z/r. kelly': [31, 119, 180],
     }
     category_colors = CategoryColorAssigner(corpus).get_category_colors()
     self.assertEqual(category_colors.to_dict(), expected_category_colors)

     term_colors = CategoryColorAssigner(corpus).get_term_colors()
     self.assertEqual(term_colors['this time'], 'aec7e8')
     self.assertEqual(term_colors['sire'], '1f77b4')
     self.assertEqual(len(term_colors), corpus.get_num_terms())

     # Mark each document's own category with a 1 in the metadata matrix.
     matrix_factory = CSRMatrixFactory()
     category_index = IndexStore()
     for row, category in enumerate(df['category']):
         matrix_factory[row, category_index.getidx(category)] = 1
     meta_corpus = corpus.add_metadata(matrix_factory.get_csr_matrix(),
                                       category_index)

     meta_assigner = CategoryColorAssigner(meta_corpus, use_non_text_features=True)
     meta_colors = meta_assigner.get_term_colors()
     self.assertEqual(
         meta_colors,
         {'hamlet': 'aec7e8', 'jay-z/r. kelly': '1f77b4', '???': 'ff7f0e'})
     text_colors = CategoryColorAssigner(meta_corpus).get_term_colors()
     self.assertNotEqual(text_colors, meta_colors)
 def setUp(cls):
     """Create ``cls.df`` with a parsed column and build ``cls.corpus`` from it.

     The ``parsed`` column is produced by applying ``whitespace_nlp`` to
     every entry of the ``text`` column.
     """
     labels, texts = get_docs_categories()
     cls.df = pd.DataFrame({'category': labels, 'text': texts})
     cls.df['parsed'] = cls.df['text'].apply(whitespace_nlp)
     factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
     cls.corpus = factory.build()
def get_term_doc_matrix_without_categories():
    """Build a term-document matrix from the sample documents, ignoring categories.

    Returns:
        The result of ``TermDocMatrixWithoutCategoriesFromPandas(...).build()``
        over a single-column (``text``) DataFrame of the sample documents,
        tokenised with ``whitespace_nlp``.
    """
    # Only the documents are needed; the category labels are discarded.
    _, documents = get_docs_categories()
    df = pd.DataFrame({'text': documents})
    return TermDocMatrixWithoutCategoriesFromPandas(df, 'text', nlp=whitespace_nlp).build()