def test_get_term_df(self):
		"""Check per-category term frequencies in the term frequency frame.

		Builds a term-document matrix from a two-column DataFrame
		('category', 'text') using ``whitespace_nlp`` and verifies the
		frequency rows for the terms 'speak up' and 'that'.
		"""
		categories, documents = get_docs_categories()
		df = pd.DataFrame({'category': categories,
		                   'text': documents})
		tdm_factory = TermDocMatrixFromPandas(df,
		                                      'category',
		                                      'text',
		                                      nlp=whitespace_nlp)
		term_doc_matrix = tdm_factory.build()

		term_df = term_doc_matrix.get_term_freq_df()
		# Use label-based .loc: the .ix indexer was deprecated and later
		# removed from pandas, so .ix raises AttributeError on modern versions.
		self.assertEqual(dict(term_df.loc['speak up']),
		                 {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1})
		self.assertEqual(dict(term_df.loc['that']),
		                 {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0})
	def test_main(self):
		"""Build a TermDocMatrix from a DataFrame and sanity-check it.

		Verifies the built object's type, its category set, its document
		count, and the total frequency of the term 'of' across categories.
		"""
		categories, documents = get_docs_categories()
		df = pd.DataFrame({'category': categories,
		                   'text': documents})
		tdm_factory = TermDocMatrixFromPandas(df,
		                                      'category',
		                                      'text',
		                                      nlp=whitespace_nlp)
		term_doc_matrix = tdm_factory.build()
		self.assertIsInstance(term_doc_matrix, TermDocMatrix)
		self.assertEqual(set(term_doc_matrix.get_categories()),
		                 {'hamlet', 'jay-z/r. kelly'})
		self.assertEqual(term_doc_matrix.get_num_docs(), 9)
		term_doc_df = term_doc_matrix.get_term_freq_df()
		# Use label-based .loc: the .ix indexer was deprecated and later
		# removed from pandas, so .ix raises AttributeError on modern versions.
		self.assertEqual(term_doc_df.loc['of'].sum(), 3)
Example #3
0
	def test_main(self):
		"""Build a TermDocMatrix from a DataFrame and sanity-check it.

		Asserts on the type of the built matrix, the set of categories it
		reports, the number of documents, and the summed frequency of the
		term 'of' in the term frequency frame.
		"""
		categories, documents = get_docs_categories()
		df = pd.DataFrame({'category': categories,
		                   'text': documents})
		tdm_factory = TermDocMatrixFromPandas(df,
		                                      'category',
		                                      'text',
		                                      nlp=whitespace_nlp)
		term_doc_matrix = tdm_factory.build()
		self.assertIsInstance(term_doc_matrix, TermDocMatrix)
		self.assertEqual(set(term_doc_matrix.get_categories()),
		                 {'hamlet', 'jay-z/r. kelly'})
		self.assertEqual(term_doc_matrix.get_num_docs(), 9)
		term_doc_df = term_doc_matrix.get_term_freq_df()
		# Label lookup via .loc; .ix no longer exists in current pandas
		# releases and would raise AttributeError here.
		self.assertEqual(term_doc_df.loc['of'].sum(), 3)
    def test_get_term_df(self):
        """Check per-category term frequencies in the term frequency frame.

        Builds a term-document matrix from a two-column DataFrame
        ('category', 'text') using ``whitespace_nlp`` and verifies the
        frequency rows for the terms 'speak up' and 'that'.
        """
        categories, documents = get_docs_categories()
        df = pd.DataFrame({'category': categories,
                           'text': documents})
        tdm_factory = TermDocMatrixFromPandas(df,
                                              'category',
                                              'text',
                                              nlp=whitespace_nlp)
        term_doc_matrix = tdm_factory.build()

        term_df = term_doc_matrix.get_term_freq_df()
        # Label lookup via .loc; .ix no longer exists in current pandas
        # releases and would raise AttributeError here.
        self.assertEqual(dict(term_df.loc['speak up']),
                         {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1})
        self.assertEqual(dict(term_df.loc['that']),
                         {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0})