def test_inject_metadata_term_lists(self):
    """Exercise ``ScatterChart.inject_metadata_term_lists``.

    Expectations encoded below:
    * a chart over ``build_hamlet_jz_term_doc_mat()`` raises
      ``TermDocMatrixHasNoMetadataException``;
    * on the metadata corpus, each malformed argument raises ``TypeError``;
    * a valid mapping returns a ``ScatterChart`` and ``to_dict`` then
      exposes a ``'metalists'`` key next to ``'info'`` and ``'data'``.
    """
    plain_tdm = build_hamlet_jz_term_doc_mat()
    plain_chart = ScatterChart(term_doc_matrix=plain_tdm, minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        plain_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

    scatter_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                                 minimum_term_frequency=0,
                                 use_non_text_features=True)
    malformed_arguments = (
        {'blash': [3, 1]},     # list entries are not strings
        {3: ['a', 'b']},       # key is not a string
        {'a': {'a', 'b'}},     # value is a set, not a list
        3,                     # not a dict at all
    )
    for malformed in malformed_arguments:
        with self.assertRaises(TypeError):
            scatter_chart.inject_metadata_term_lists(malformed)

    returned = scatter_chart.inject_metadata_term_lists({'a': ['a', 'b']})
    self.assertEqual(type(returned), ScatterChart)

    chart_dict = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metalists'})
    expected_info_keys = {
        'not_category_name',
        'category_name',
        'category_terms',
        'not_category_terms',
        'category_internal_name',
        'not_category_internal_names',
        'extra_category_internal_names',
        'neutral_category_internal_names',
        'categories',
    }
    self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)
def test_inject_metadata_term_lists(self):
    """Exercise ``ScatterChart.inject_metadata_term_lists``.

    Expectations encoded below:
    * a chart over ``build_hamlet_jz_term_doc_mat()`` raises
      ``TermDocMatrixHasNoMetadataException``;
    * on the metadata corpus, each malformed argument raises ``TypeError``;
    * a valid mapping returns a ``ScatterChart`` and ``to_dict`` then
      exposes a ``'metalists'`` key next to ``'info'`` and ``'data'``.
    """
    plain_tdm = build_hamlet_jz_term_doc_mat()
    plain_chart = ScatterChart(term_doc_matrix=plain_tdm, minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        plain_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

    scatter_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                                 minimum_term_frequency=0,
                                 use_non_text_features=True)
    malformed_arguments = (
        {'blash': [3, 1]},     # list entries are not strings
        {3: ['a', 'b']},       # key is not a string
        {'a': {'a', 'b'}},     # value is a set, not a list
        3,                     # not a dict at all
    )
    for malformed in malformed_arguments:
        with self.assertRaises(TypeError):
            scatter_chart.inject_metadata_term_lists(malformed)

    returned = scatter_chart.inject_metadata_term_lists({'a': ['a', 'b']})
    self.assertEqual(type(returned), ScatterChart)

    chart_dict = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metalists'})
    expected_info_keys = {
        'not_category_name',
        'category_name',
        'category_terms',
        'not_category_terms',
        'category_internal_name',
        'not_category_internal_names',
        'extra_category_internal_names',
        'neutral_category_internal_names',
        'categories',
    }
    self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)
def test_get_metadata_freq_df(self):
    """Check the column labels returned by ``get_metadata_freq_df``.

    Called with no argument the columns are expected to be
    ``'<category> freq'``; called with ``''`` they are expected to be the
    bare category names.
    """
    corpus = build_hamlet_jz_corpus_with_meta()

    default_columns = list(corpus.get_metadata_freq_df().columns)
    self.assertEqual(default_columns, ['hamlet freq', 'jay-z/r. kelly freq'])

    bare_columns = list(corpus.get_metadata_freq_df('').columns)
    self.assertEqual(bare_columns, ['hamlet', 'jay-z/r. kelly'])
def test_main(self):
    """Check the list of per-document feature dicts from ``FeatureLister.output``."""
    corpus = build_hamlet_jz_corpus_with_meta()
    lister = FeatureLister(corpus._mX,
                           corpus._metadata_idx_store,
                           corpus.get_num_docs())
    features = lister.output()
    expected_features = [
        {'cat4': 2, 'cat3': 1},
        {'cat4': 2},
        {'cat5': 1, 'cat3': 2},
        {'cat6': 2, 'cat9': 1},
        {'cat4': 2, 'cat3': 1},
        {'cat2': 1, 'cat1': 2},
        {'cat2': 2, 'cat5': 1},
        {'cat4': 1, 'cat3': 2},
    ]
    self.assertEqual(features, expected_features)
def test_inject_term_colors(self):
    """Check that injected term colors show up in the chart dictionary.

    After ``inject_term_colors`` is called, ``to_dict('hamlet')['info']``
    is expected to contain a ``'term_colors'`` entry.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    scatter_chart.inject_term_colors({'t1': '00ffee'})
    j = scatter_chart.to_dict('hamlet')
    self.assertIn('term_colors', j['info'])
def test_get_metadata_doc_count_df(self):
    """Check values, column labels and index of ``get_metadata_doc_count_df``."""
    corpus = build_hamlet_jz_corpus_with_meta()
    expected_counts = [[4, 4]]
    expected_columns = ['hamlet freq', 'jay-z/r. kelly freq']
    expected_index = ['cat1']

    np.testing.assert_array_almost_equal(corpus.get_metadata_doc_count_df(),
                                         expected_counts)
    self.assertEqual(list(corpus.get_metadata_doc_count_df().columns),
                     expected_columns)
    self.assertEqual(list(corpus.get_metadata_doc_count_df().index),
                     expected_index)
def test_inject_term_colors(self):
    """Check that injected term colors show up in the chart dictionary.

    After ``inject_term_colors`` is called, ``to_dict('hamlet')['info']``
    is expected to contain a ``'term_colors'`` entry.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    scatter_chart.inject_term_colors({'t1': '00ffee'})
    j = scatter_chart.to_dict('hamlet')
    self.assertIn('term_colors', j['info'])
def test_get_metadata_doc_count_df(self):
    """Check values, column labels and index of ``get_metadata_doc_count_df``."""
    corpus = build_hamlet_jz_corpus_with_meta()
    expected_counts = [[4, 4]]
    expected_columns = ['hamlet freq', 'jay-z/r. kelly freq']
    expected_index = ['cat1']

    np.testing.assert_array_almost_equal(corpus.get_metadata_doc_count_df(),
                                         expected_counts)
    self.assertEqual(list(corpus.get_metadata_doc_count_df().columns),
                     expected_columns)
    self.assertEqual(list(corpus.get_metadata_doc_count_df().index),
                     expected_index)
def test_extra_features(self):
    """Check ``get_labels_and_texts_and_meta`` once non-text features are enabled.

    The returned dict is compared as a whole: categories, texts, the
    metadata strings passed in, labels, and the per-document ``'extra'``
    feature dicts.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    docs_and_labels = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
    metadata = ['meta%s' % doc_idx for doc_idx in range(corpus.get_num_docs())]
    output = docs_and_labels.get_labels_and_texts_and_meta(metadata)

    expected_texts = [
        "what art thou that usurp'st this time of night,",
        'together with that fair and warlike form',
        'in which the majesty of buried denmark',
        'did sometimes march? by heaven i charge thee, speak!',
        'halt! who goes there?',
        'it is i sire tone from brooklyn.',
        'well, speak up man what is it?',
        'news from the east sire! the best of both worlds has returned!',
    ]
    expected_extra = [
        {'cat3': 1, 'cat4': 2},
        {'cat4': 2},
        {'cat5': 1, 'cat3': 2},
        {'cat9': 1, 'cat6': 2},
        {'cat3': 1, 'cat4': 2},
        {'cat1': 2, 'cat2': 1},
        {'cat5': 1, 'cat2': 2},
        {'cat3': 2, 'cat4': 1},
    ]
    expected_output = {
        'categories': ['hamlet', 'jay-z/r. kelly'],
        'texts': expected_texts,
        'meta': ['meta0', 'meta1', 'meta2', 'meta3',
                 'meta4', 'meta5', 'meta6', 'meta7'],
        'labels': [0, 0, 0, 0, 1, 1, 1, 1],
        'extra': expected_extra,
    }
    self.assertEqual(output, expected_output)
def test_extra_features(self):
    """Check the ``'docs'`` section of ``ScatterChartExplorer.to_dict``.

    With ``use_non_text_features=True`` and explicit metadata, the
    ``'docs'`` entry is expected to hold the labels, categories, texts,
    the metadata passed in, and one ``{'cat1': 2}`` extra-feature dict per
    document.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    j = (ScatterChartExplorer(corpus,
                              minimum_term_frequency=0,
                              use_non_text_features=True)
         .to_dict('hamlet', metadata=meta))
    # Every one of the eight documents is expected to carry the same extras.
    extras = [{'cat1': 2}] * 8
    self.maxDiff = None  # show the complete diff if the comparison fails
    # Convert labels to a plain list before comparing (presumably they come
    # back as an array-like -- verify against to_dict).
    j['docs']['labels'] = list(j['docs']['labels'])
    self.assertEqual(
        j['docs'],
        {
            'labels': [0, 0, 0, 0, 1, 1, 1, 1],
            'categories': ['hamlet', 'jay-z/r. kelly'],
            'extra': extras,
            'meta': ['one', 'two', 'three', 'four',
                     'five', 'six', 'seven', 'eight'],
            'texts': [
                "what art thou that usurp'st this time of night,",
                'together with that fair and warlike form',
                'in which the majesty of buried denmark',
                'did sometimes march? by heaven i charge thee, speak!',
                'halt! who goes there?',
                'it is i sire tone from brooklyn.',
                'well, speak up man what is it?',
                'news from the east sire! the best of both worlds has returned!',
            ],
        })
def test_use_categories_as_metadata_and_replace_terms(self):
    """Check the corpus built by ``use_categories_as_metadata_and_replace_terms``.

    Asserted below:
    * the source corpus's metadata matrix is an 8x1 column of 2s;
    * the new corpus's metadata are the two category names, and its
      metadata matrix has one row per document and one column per name;
    * each metadata column is non-zero exactly on the rows whose category
      name matches that column;
    * the new corpus's term-document matrix equals the source corpus's
      metadata matrix.
    """
    hamlet = build_hamlet_jz_corpus_with_meta()
    meta_hamlet = hamlet.use_categories_as_metadata_and_replace_terms()

    np.testing.assert_array_almost_equal(hamlet.get_metadata_doc_mat().toarray(),
                                         np.array([[2]] * 8))

    category_names = ['hamlet', 'jay-z/r. kelly']
    self.assertEqual(meta_hamlet.get_metadata(), category_names)

    expected_shape = (meta_hamlet.get_num_docs(), len(meta_hamlet.get_metadata()))
    self.assertEqual(meta_hamlet.get_metadata_doc_mat().shape, expected_shape)

    for column, category in enumerate(category_names):
        has_metadata = (meta_hamlet.get_metadata_doc_mat()
                        .todense().T[column].astype(bool).A1)
        in_category = meta_hamlet.get_category_names_by_row() == category
        self.assertTrue(all(has_metadata == in_category))

    np.testing.assert_array_equal(meta_hamlet.get_term_doc_mat().todense(),
                                  hamlet.get_metadata_doc_mat().toarray())
def test_use_categories_as_metadata_and_replace_terms(self):
    """Check the corpus built by ``use_categories_as_metadata_and_replace_terms``.

    Asserted below:
    * the source corpus's metadata matrix is an 8x1 column of 2s;
    * the new corpus's metadata are the two category names, and its
      metadata matrix has one row per document and one column per name;
    * each metadata column is non-zero exactly on the rows whose category
      name matches that column;
    * the new corpus's term-document matrix equals the source corpus's
      metadata matrix.
    """
    hamlet = build_hamlet_jz_corpus_with_meta()
    meta_hamlet = hamlet.use_categories_as_metadata_and_replace_terms()

    np.testing.assert_array_almost_equal(hamlet.get_metadata_doc_mat().toarray(),
                                         np.array([[2]] * 8))

    category_names = ['hamlet', 'jay-z/r. kelly']
    self.assertEqual(meta_hamlet.get_metadata(), category_names)

    expected_shape = (meta_hamlet.get_num_docs(), len(meta_hamlet.get_metadata()))
    self.assertEqual(meta_hamlet.get_metadata_doc_mat().shape, expected_shape)

    for column, category in enumerate(category_names):
        has_metadata = (meta_hamlet.get_metadata_doc_mat()
                        .todense().T[column].astype(bool).A1)
        in_category = meta_hamlet.get_category_names_by_row() == category
        self.assertTrue(all(has_metadata == in_category))

    np.testing.assert_array_equal(meta_hamlet.get_term_doc_mat().todense(),
                                  hamlet.get_metadata_doc_mat().toarray())
def test_main(self):
    """Check the per-document feature dicts from ``FeatureLister.output``.

    Each of the eight documents is expected to yield ``{'cat1': 2}``.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    features = FeatureLister(tdm._mX,
                             tdm._metadata_idx_store,
                             tdm.get_num_docs()).output()
    # Built with a comprehension so the eight expected dicts are distinct objects.
    expected = [{'cat1': 2} for _ in range(8)]
    self.assertEqual(features, expected)
def test_inject_metadata_descriptions(self):
    """Exercise ``ScatterChart.inject_metadata_descriptions``.

    A non-dict argument is expected to raise ``AssertionError``; a valid
    mapping is expected to return the chart itself and to add a
    ``'metadescriptions'`` key to the ``to_dict`` output.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        scatter_chart.inject_metadata_descriptions(3323)
    # NOTE: describing metadata terms that are absent from the corpus
    # ('The following meta data terms are not present: ...') is not
    # exercised by this test.
    returned = scatter_chart.inject_metadata_descriptions(
        {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
    # unittest assertion rather than a bare ``assert``, which ``python -O`` strips.
    self.assertEqual(scatter_chart, returned)
    j = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(j.keys()), {'info', 'data', 'metadescriptions'})
def test_inject_metadata_descriptions(self):
    """Exercise ``ScatterChart.inject_metadata_descriptions``.

    A non-dict argument is expected to raise ``AssertionError``; a valid
    mapping is expected to return the chart itself and to add a
    ``'metadescriptions'`` key to the ``to_dict`` output.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        scatter_chart.inject_metadata_descriptions(3323)
    # NOTE: describing metadata terms that are absent from the corpus
    # ('The following meta data terms are not present: ...') is not
    # exercised by this test.
    returned = scatter_chart.inject_metadata_descriptions(
        {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
    # unittest assertion rather than a bare ``assert``, which ``python -O`` strips.
    self.assertEqual(scatter_chart, returned)
    j = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(j.keys()), {'info', 'data', 'metadescriptions'})
def test_change_categories(self):
    """Exercise ``change_category_names`` on the two-category corpus.

    Passing a list whose length differs from the number of categories is
    expected to raise with a message naming both counts.  Passing two
    names is expected to return a corpus with the new names while the
    source corpus keeps its own.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    with self.assertRaisesRegex(
            Exception,
            r"The number of category names passed \(0\) needs to equal the number of categories in the corpus \(2\)\."
    ):
        corpus.change_category_names([])
    with self.assertRaisesRegex(
            Exception,
            r"The number of category names passed \(1\) needs to equal the number of categories in the corpus \(2\)\."
    ):
        corpus.change_category_names(['a'])
    new_corpus = corpus.change_category_names(['aaa', 'bcd'])
    # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
    self.assertEqual(new_corpus.get_categories(), ['aaa', 'bcd'])
    # The source corpus must still report its original names.
    self.assertEqual(corpus.get_categories(), ['hamlet', 'jay-z/r. kelly'])
def test_to_json_use_non_text_features(self):
    """Check ``ScatterChart.to_dict`` output when plotting non-text features.

    Verifies the top-level keys, the keys of the ``'info'`` section, the
    set of terms in ``'data'``, and that the result can be passed to
    ``json.dumps``.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    # Disabled check: ScatterChart(term_doc_matrix=tdm).to_dict('hamlet')
    # raising NoWordMeetsTermFrequencyRequirementsError.
    chart = ScatterChart(term_doc_matrix=tdm,
                         minimum_term_frequency=0,
                         use_non_text_features=True)
    chart_dict = chart.to_dict('hamlet')

    self.assertEqual(set(chart_dict.keys()), {'info', 'data'})

    expected_info_keys = {
        'not_category_name',
        'category_name',
        'category_terms',
        'not_category_terms',
        'category_internal_name',
    }
    self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)

    plotted_terms = {point['term'] for point in chart_dict['data']}
    self.assertEqual(plotted_terms,
                     {'cat6', 'cat4', 'cat9', 'cat5', 'cat1', 'cat3', 'cat2'})

    # Result is discarded; the call only has to complete without raising.
    json.dumps(chart_dict)
def test_to_json_use_non_text_features(self):
    """Check ``ScatterChart.to_dict`` output when plotting non-text features.

    Verifies the top-level keys, the keys of the ``'info'`` section, and
    that ``'cat1'`` is the only term in ``'data'``.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    # Disabled check: ScatterChart(term_doc_matrix=tdm).to_dict('hamlet')
    # raising NoWordMeetsTermFrequencyRequirementsError.
    chart = ScatterChart(term_doc_matrix=tdm,
                         minimum_term_frequency=0,
                         use_non_text_features=True)
    chart_dict = chart.to_dict('hamlet')

    self.assertEqual(set(chart_dict.keys()), {'info', 'data'})

    expected_info_keys = {
        'not_category_name',
        'category_name',
        'category_terms',
        'not_category_terms',
        'category_internal_name',
        'not_category_internal_names',
        'extra_category_internal_names',
        'neutral_category_internal_names',
        'categories',
    }
    self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)

    plotted_terms = {point['term'] for point in chart_dict['data']}
    self.assertEqual(plotted_terms, {'cat1'})
def test_extra_features(self):
    """Check the ``'docs'`` section of ``ScatterChartExplorer.to_dict``.

    With ``use_non_text_features=True`` and explicit metadata, the
    ``'docs'`` entry is expected to hold the labels, categories, texts,
    the metadata passed in, and one ``{'cat1': 2}`` extra-feature dict per
    document.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    j = (ScatterChartExplorer(corpus,
                              minimum_term_frequency=0,
                              use_non_text_features=True)
         .to_dict('hamlet', metadata=meta))
    # Every one of the eight documents is expected to carry the same extras.
    extras = [{'cat1': 2}] * 8
    self.maxDiff = None  # show the complete diff if the comparison fails
    # Convert labels to a plain list before comparing (presumably they come
    # back as an array-like -- verify against to_dict).
    j['docs']['labels'] = list(j['docs']['labels'])
    self.assertEqual(
        j['docs'],
        {
            'labels': [0, 0, 0, 0, 1, 1, 1, 1],
            'categories': ['hamlet', 'jay-z/r. kelly'],
            'extra': extras,
            'meta': ['one', 'two', 'three', 'four',
                     'five', 'six', 'seven', 'eight'],
            'texts': [
                "what art thou that usurp'st this time of night,",
                'together with that fair and warlike form',
                'in which the majesty of buried denmark',
                'did sometimes march? by heaven i charge thee, speak!',
                'halt! who goes there?',
                'it is i sire tone from brooklyn.',
                'well, speak up man what is it?',
                'news from the east sire! the best of both worlds has returned!',
            ],
        })
def test_extra_features(self):
    """Check ``get_labels_and_texts_and_meta`` once non-text features are enabled.

    The returned dict is compared as a whole: categories, texts, the
    metadata strings passed in, labels, and one ``{'cat1': 2}``
    extra-feature dict per document.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    d = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
    metadata = ['meta%s' % (i) for i in range(corpus.get_num_docs())]
    output = d.get_labels_and_texts_and_meta(metadata)
    # Every one of the eight documents is expected to carry the same extras.
    extra_val = [{'cat1': 2} for _ in range(8)]
    # Convert labels to a plain list before comparing (presumably they come
    # back as an array-like -- verify against DocsAndLabelsFromCorpus).
    output['labels'] = list(output['labels'])
    self.assertEqual(
        output,
        {
            'categories': ['hamlet', 'jay-z/r. kelly'],
            'texts': [
                "what art thou that usurp'st this time of night,",
                'together with that fair and warlike form',
                'in which the majesty of buried denmark',
                'did sometimes march? by heaven i charge thee, speak!',
                'halt! who goes there?',
                'it is i sire tone from brooklyn.',
                'well, speak up man what is it?',
                'news from the east sire! the best of both worlds has returned!',
            ],
            'meta': ['meta0', 'meta1', 'meta2', 'meta3',
                     'meta4', 'meta5', 'meta6', 'meta7'],
            'labels': [0, 0, 0, 0, 1, 1, 1, 1],
            'extra': extra_val,
        })
def test_metadata_in_use(self):
    """Check ``metadata_in_use`` on a corpus without and with metadata.

    The matrix from ``get_hamlet_term_doc_matrix`` is expected to report
    ``False``; the corpus from ``build_hamlet_jz_corpus_with_meta`` is
    expected to report ``True``.
    """
    self.assertFalse(get_hamlet_term_doc_matrix().metadata_in_use())
    self.assertTrue(build_hamlet_jz_corpus_with_meta().metadata_in_use())
def test_metadata_in_use(self):
    """Check ``metadata_in_use`` on a corpus without and with metadata.

    The matrix from ``get_hamlet_term_doc_matrix`` is expected to report
    ``False``; the corpus from ``build_hamlet_jz_corpus_with_meta`` is
    expected to report ``True``.
    """
    self.assertFalse(get_hamlet_term_doc_matrix().metadata_in_use())
    self.assertTrue(build_hamlet_jz_corpus_with_meta().metadata_in_use())
def test_get_metadata_doc_mat(self):
    """Check the shape of ``get_metadata_doc_mat``.

    Rows are expected to equal the number of documents and columns the
    number of rows in ``get_metadata_freq_df``.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    actual_shape = corpus.get_metadata_doc_mat().shape
    expected_shape = (corpus.get_num_docs(), len(corpus.get_metadata_freq_df()))
    np.testing.assert_array_equal(actual_shape, expected_shape)
def test_get_metadata(self):
    """Check that the metadata corpus reports ``['cat1']`` from ``get_metadata``."""
    corpus = build_hamlet_jz_corpus_with_meta()
    metadata_names = corpus.get_metadata()
    self.assertEqual(metadata_names, ['cat1'])
def test_get_metadata_count_mat(self):
    """Check that ``get_metadata_count_mat`` is (almost) equal to ``[[4, 4]]``."""
    count_mat = build_hamlet_jz_corpus_with_meta().get_metadata_count_mat()
    expected_counts = [[4, 4]]
    np.testing.assert_array_almost_equal(count_mat, expected_counts)
def test_get_metadata_freq_df(self):
    """Check the column labels returned by ``get_metadata_freq_df``.

    Called with no argument the columns are expected to be
    ``'<category> freq'``; called with ``''`` they are expected to be the
    bare category names.
    """
    corpus = build_hamlet_jz_corpus_with_meta()

    default_columns = list(corpus.get_metadata_freq_df().columns)
    self.assertEqual(default_columns, ['hamlet freq', 'jay-z/r. kelly freq'])

    bare_columns = list(corpus.get_metadata_freq_df('').columns)
    self.assertEqual(bare_columns, ['hamlet', 'jay-z/r. kelly'])
def test_get_metadata(self):
    """Check that the metadata corpus reports ``['cat1']`` from ``get_metadata``."""
    corpus = build_hamlet_jz_corpus_with_meta()
    metadata_names = corpus.get_metadata()
    self.assertEqual(metadata_names, ['cat1'])
def test_get_ranks_meta(self):
    """Check the name reported by ``ZScores`` configured on the metadata corpus.

    The scorer is given ``OncePerDocFrequencyRanker`` as term ranker and
    ``'hamlet'`` as category; ``get_name`` is expected to return
    ``"Z-Score from Welch's T-Test"``.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    scorer = (ZScores(corpus)
              .set_term_ranker(OncePerDocFrequencyRanker)
              .set_categories('hamlet'))
    # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
    self.assertEqual(scorer.get_name(), "Z-Score from Welch's T-Test")
def test_get_metadata_doc_mat(self):
    """Check the shape of ``get_metadata_doc_mat``.

    Rows are expected to equal the number of documents and columns the
    number of rows in ``get_metadata_freq_df``.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    actual_shape = corpus.get_metadata_doc_mat().shape
    expected_shape = (corpus.get_num_docs(), len(corpus.get_metadata_freq_df()))
    np.testing.assert_array_equal(actual_shape, expected_shape)
def test_get_ranks_meta(self):
    """Check the name reported by ``ZScores`` configured on the metadata corpus.

    The scorer is given ``OncePerDocFrequencyRanker`` as term ranker and
    ``'hamlet'`` as category; ``get_name`` is expected to return
    ``"Z-Score from Welch's T-Test"``.
    """
    corpus = build_hamlet_jz_corpus_with_meta()
    scorer = (ZScores(corpus)
              .set_term_ranker(OncePerDocFrequencyRanker)
              .set_categories('hamlet'))
    # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12.
    self.assertEqual(scorer.get_name(), "Z-Score from Welch's T-Test")
def test_get_metadata_count_mat(self):
    """Check that ``get_metadata_count_mat`` is (almost) equal to ``[[4, 4]]``."""
    count_mat = build_hamlet_jz_corpus_with_meta().get_metadata_count_mat()
    expected_counts = [[4, 4]]
    np.testing.assert_array_almost_equal(count_mat, expected_counts)