Example #1
0
    def test_inject_metadata_term_lists(self):
        """Exercise ``ScatterChart.inject_metadata_term_lists`` validation and output.

        Expects ``TermDocMatrixHasNoMetadataException`` on a chart built from
        ``build_hamlet_jz_term_doc_mat()``, ``TypeError`` for each malformed
        argument on a chart built with ``use_non_text_features=True``, a
        ``ScatterChart`` return value for a well-formed argument, and a
        ``'metalists'`` key in the subsequent ``to_dict('hamlet')`` output.
        """
        plain_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                   minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            plain_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                                  minimum_term_frequency=0,
                                  use_non_text_features=True)
        malformed_arguments = (
            {'blash': [3, 1]},  # list of ints
            {3: ['a', 'b']},  # int key
            {'a': {'a', 'b'}},  # set value
            3,  # not a dict
        )
        for malformed in malformed_arguments:
            with self.assertRaises(TypeError):
                meta_chart.inject_metadata_term_lists(malformed)

        injected = meta_chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(injected), ScatterChart)

        j = meta_chart.to_dict('hamlet')
        self.assertEqual(set(j.keys()), {'info', 'data', 'metalists'})
        self.assertEqual(set(j['info'].keys()),
                         {'not_category_name',
                          'category_name',
                          'category_terms',
                          'not_category_terms',
                          'category_internal_name',
                          'not_category_internal_names',
                          'extra_category_internal_names',
                          'neutral_category_internal_names',
                          'categories'})
    def test_inject_metadata_term_lists(self):
        """Verify error handling and ``to_dict`` keys for ``inject_metadata_term_lists``.

        The first chart is built from ``build_hamlet_jz_term_doc_mat()`` and
        must raise ``TermDocMatrixHasNoMetadataException``.  The second is
        built with ``use_non_text_features=True``; it must raise ``TypeError``
        on four malformed arguments, return a ``ScatterChart`` on a valid one,
        and afterwards expose ``'metalists'`` in ``to_dict('hamlet')``.
        """
        expected_top_level_keys = {'info', 'data', 'metalists'}
        expected_info_keys = {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_terms',
            'category_internal_name',
            'not_category_internal_names',
            'extra_category_internal_names',
            'neutral_category_internal_names',
            'categories',
        }

        no_meta_matrix = build_hamlet_jz_term_doc_mat()
        no_meta_chart = ScatterChart(term_doc_matrix=no_meta_matrix,
                                     minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            no_meta_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                             minimum_term_frequency=0,
                             use_non_text_features=True)
        # Values that are lists of ints.
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists({'blash': [3, 1]})
        # An int key.
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists({3: ['a', 'b']})
        # A set in place of a list.
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists({'a': {'a', 'b'}})
        # An argument that is not a dict.
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists(3)

        returned = chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(returned), ScatterChart)

        chart_dict = chart.to_dict('hamlet')
        self.assertEqual(set(chart_dict.keys()), expected_top_level_keys)
        self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)
Example #3
0
 def test_get_metadata_freq_df(self):
     """Check the column labels of ``get_metadata_freq_df``.

     Called without arguments the columns carry a `` freq`` suffix; called
     with ``''`` the columns are the bare category names.
     """
     corpus = build_hamlet_jz_corpus_with_meta()

     default_columns = list(corpus.get_metadata_freq_df().columns)
     self.assertEqual(default_columns, ['hamlet freq', 'jay-z/r. kelly freq'])

     bare_columns = list(corpus.get_metadata_freq_df('').columns)
     self.assertEqual(bare_columns, ['hamlet', 'jay-z/r. kelly'])
Example #4
0
 def test_main(self):
     """Check ``FeatureLister(...).output()`` against one expected dict per document."""
     corpus = build_hamlet_jz_corpus_with_meta()
     lister = FeatureLister(corpus._mX,
                            corpus._metadata_idx_store,
                            corpus.get_num_docs())
     features = lister.output()
     expected_features = [
         {'cat4': 2, 'cat3': 1},
         {'cat4': 2},
         {'cat5': 1, 'cat3': 2},
         {'cat6': 2, 'cat9': 1},
         {'cat4': 2, 'cat3': 1},
         {'cat2': 1, 'cat1': 2},
         {'cat2': 2, 'cat5': 1},
         {'cat4': 1, 'cat3': 2},
     ]
     self.assertEqual(features, expected_features)
 def test_inject_term_colors(self):
     """Check that injected term colors appear under ``'term_colors'`` in ``to_dict`` info."""
     corpus = build_hamlet_jz_corpus_with_meta()
     # The frequency frame is not inspected; the call is kept so the test
     # still exercises ``get_term_freq_df`` on this corpus.
     _unused_freq_df = corpus.get_term_freq_df()
     chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     chart.inject_term_colors({'t1': '00ffee'})
     chart_info = chart.to_dict('hamlet')['info']
     self.assertIn('term_colors', chart_info)
 def test_get_metadata_doc_count_df(self):
     """Check the values, columns and index of ``get_metadata_doc_count_df``."""
     corpus = build_hamlet_jz_corpus_with_meta()

     counts = corpus.get_metadata_doc_count_df()
     np.testing.assert_array_almost_equal(counts, [[4, 4]])

     column_labels = list(corpus.get_metadata_doc_count_df().columns)
     self.assertEqual(column_labels, ['hamlet freq', 'jay-z/r. kelly freq'])

     index_labels = list(corpus.get_metadata_doc_count_df().index)
     self.assertEqual(index_labels, ['cat1'])
Example #7
0
 def test_inject_term_colors(self):
     """Verify ``inject_term_colors`` adds a ``'term_colors'`` entry to the chart info."""
     tdm = build_hamlet_jz_corpus_with_meta()
     # Result intentionally unused; the call itself is retained.
     _ = tdm.get_term_freq_df()
     colored_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     colored_chart.inject_term_colors({'t1': '00ffee'})
     as_dict = colored_chart.to_dict('hamlet')
     self.assertIn('term_colors', as_dict['info'])
Example #8
0
 def test_get_metadata_doc_count_df(self):
     """Verify ``get_metadata_doc_count_df`` yields one ``cat1`` row of ``[4, 4]``.

     Also checks that the column labels are the category names suffixed
     with `` freq``.
     """
     corpus = build_hamlet_jz_corpus_with_meta()
     expected_columns = ['hamlet freq', 'jay-z/r. kelly freq']
     expected_index = ['cat1']

     np.testing.assert_array_almost_equal(corpus.get_metadata_doc_count_df(),
                                          [[4, 4]])
     actual_columns = list(corpus.get_metadata_doc_count_df().columns)
     self.assertEqual(actual_columns, expected_columns)
     actual_index = list(corpus.get_metadata_doc_count_df().index)
     self.assertEqual(actual_index, expected_index)
Example #9
0
	def test_extra_features(self):
		"""Check ``get_labels_and_texts_and_meta`` output with non-text features enabled.

		Supplies one ``'meta<i>'`` label per document and compares the full
		returned dict (categories, texts, meta, labels, extra) to a literal.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		docs_and_labels = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
		metadata = ['meta%s' % i for i in range(corpus.get_num_docs())]
		result = docs_and_labels.get_labels_and_texts_and_meta(metadata)

		expected_texts = [
			"what art thou that usurp'st this time of night,",
			'together with that fair and warlike form',
			'in which the majesty of buried denmark',
			'did sometimes march? by heaven i charge thee, speak!',
			'halt! who goes there?',
			'it is i sire tone from brooklyn.',
			'well, speak up man what is it?',
			'news from the east sire! the best of both worlds has returned!',
		]
		expected_extra = [
			{'cat3': 1, 'cat4': 2},
			{'cat4': 2},
			{'cat5': 1, 'cat3': 2},
			{'cat9': 1, 'cat6': 2},
			{'cat3': 1, 'cat4': 2},
			{'cat1': 2, 'cat2': 1},
			{'cat5': 1, 'cat2': 2},
			{'cat3': 2, 'cat4': 1},
		]
		expected = {
			'categories': ['hamlet', 'jay-z/r. kelly'],
			'texts': expected_texts,
			'meta': ['meta0', 'meta1', 'meta2', 'meta3', 'meta4', 'meta5', 'meta6', 'meta7'],
			'labels': [0, 0, 0, 0, 1, 1, 1, 1],
			'extra': expected_extra,
		}
		self.assertEqual(result, expected)
 def test_extra_features(self):
     """Check the ``'docs'`` section of ``ScatterChartExplorer.to_dict``.

     Builds the explorer with ``use_non_text_features=True``, passes one
     metadata label per document, and compares ``j['docs']`` (labels,
     categories, extra features, meta and texts) against a literal.
     """
     corpus = build_hamlet_jz_corpus_with_meta()
     meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
     j = (ScatterChartExplorer(corpus,
                               minimum_term_frequency=0,
                               use_non_text_features=True).to_dict(
                                   'hamlet', metadata=meta))
     # Every one of the eight documents is expected to carry the same
     # single extra feature.
     extras = [{'cat1': 2}] * 8
     # Let unittest print the complete diff if the dict comparison fails.
     self.maxDiff = None
     # Convert the labels to a plain list so they compare against a list literal.
     j['docs']['labels'] = list(j['docs']['labels'])
     self.assertEqual(
         j['docs'], {
             'labels': [0, 0, 0, 0, 1, 1, 1, 1],
             'categories': ['hamlet', 'jay-z/r. kelly'],
             'extra':
             extras,
             'meta': [
                 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
                 'eight'
             ],
             'texts': [
                 "what art thou that usurp'st this time of night,",
                 'together with that fair and warlike form',
                 'in which the majesty of buried denmark',
                 'did sometimes march? by heaven i charge thee, speak!',
                 'halt! who goes there?',
                 'it is i sire tone from brooklyn.',
                 'well, speak up man what is it?',
                 'news from the east sire! the best of both worlds has returned!'
             ]
         })
    def test_use_categories_as_metadata_and_replace_terms(self):
        """Check ``use_categories_as_metadata_and_replace_terms`` on the hamlet corpus.

        Asserts that the source corpus still has an 8x1 metadata matrix of
        twos, that the new corpus's metadata are the two category names with
        one indicator column per category, and that the new corpus's
        term-document matrix equals the source's metadata matrix.
        """
        hamlet = build_hamlet_jz_corpus_with_meta()
        meta_hamlet = hamlet.use_categories_as_metadata_and_replace_terms()

        np.testing.assert_array_almost_equal(hamlet.get_metadata_doc_mat().toarray(),
                                             np.array([[2]] * 8))
        self.assertEqual(meta_hamlet.get_metadata(), ['hamlet', 'jay-z/r. kelly'])

        expected_shape = (meta_hamlet.get_num_docs(), len(meta_hamlet.get_metadata()))
        self.assertEqual(meta_hamlet.get_metadata_doc_mat().shape, expected_shape)

        # Column i of the new metadata matrix must flag exactly the rows of category i.
        for column, category in enumerate(('hamlet', 'jay-z/r. kelly')):
            indicator = meta_hamlet.get_metadata_doc_mat().todense().T[column].astype(bool).A1
            in_category = meta_hamlet.get_category_names_by_row() == category
            self.assertTrue(all(indicator == in_category))

        np.testing.assert_array_equal(meta_hamlet.get_term_doc_mat().todense(),
                                      hamlet.get_metadata_doc_mat().toarray())
Example #12
0
    def test_use_categories_as_metadata_and_replace_terms(self):
        """Verify that categories become metadata and old metadata become terms.

        The source corpus's metadata matrix is expected to be eight rows of
        ``[2]``.  The derived corpus must list the two category names as
        metadata, have a (documents x metadata) matrix whose columns match
        category membership by row, and have a term-document matrix equal to
        the source's metadata matrix.
        """
        source = build_hamlet_jz_corpus_with_meta()
        derived = source.use_categories_as_metadata_and_replace_terms()

        source_meta = source.get_metadata_doc_mat().toarray()
        np.testing.assert_array_almost_equal(source_meta, np.array([[2]] * 8))

        self.assertEqual(derived.get_metadata(), ['hamlet', 'jay-z/r. kelly'])
        self.assertEqual(derived.get_metadata_doc_mat().shape,
                         (derived.get_num_docs(), len(derived.get_metadata())))

        hamlet_column = derived.get_metadata_doc_mat().todense().T[0].astype(bool).A1
        hamlet_rows = derived.get_category_names_by_row() == 'hamlet'
        self.assertTrue(all(hamlet_column == hamlet_rows))

        jz_column = derived.get_metadata_doc_mat().todense().T[1].astype(bool).A1
        jz_rows = derived.get_category_names_by_row() == 'jay-z/r. kelly'
        self.assertTrue(all(jz_column == jz_rows))

        derived_terms = derived.get_term_doc_mat().todense()
        np.testing.assert_array_equal(derived_terms,
                                      source.get_metadata_doc_mat().toarray())
	def test_main(self):
		"""Check ``FeatureLister(...).output()`` for the hamlet/jay-z corpus with metadata.

		Each of the eight documents is expected to yield the single feature
		``{'cat1': 2}``.
		"""
		tdm = build_hamlet_jz_corpus_with_meta()
		features = FeatureLister(tdm._mX,
		                         tdm._metadata_idx_store,
		                         tdm.get_num_docs()).output()
		expected = [{'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}]

		self.assertEqual(features,
		                 expected)
 def test_inject_metadata_descriptions(self):
     """Check ``ScatterChart.inject_metadata_descriptions`` validation and output.

     A non-dict argument must raise ``AssertionError``; a valid mapping must
     return a chart equal to the one it was called on, and the following
     ``to_dict('hamlet')`` must contain a ``'metadescriptions'`` key.
     """
     tdm = build_hamlet_jz_corpus_with_meta()
     scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
     with self.assertRaises(AssertionError):
         scatter_chart.inject_metadata_descriptions(3323)
     # TODO: the message checks below are disabled; re-enable once confirmed valid.
     # with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: blah'):
     #     scatter_chart.inject_metadata_descriptions({'blah': 'asjdkflasdjklfsadjk jsdkafsd'})
     # with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: cat2'):
     #     scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd', 'cat2': 'asdf'})
     # Use a TestCase assertion rather than a bare ``assert`` so the check is
     # not stripped under ``python -O`` and reports both operands on failure.
     self.assertEqual(scatter_chart,
                      scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd'}))
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(set(j.keys()), set(['info', 'data', 'metadescriptions']))
Example #15
0
 def test_inject_metadata_descriptions(self):
     """Verify argument checking and ``to_dict`` keys for ``inject_metadata_descriptions``.

     Passing an int must raise ``AssertionError``.  Passing a description for
     ``'cat1'`` must return a chart equal to the receiver, after which
     ``to_dict('hamlet')`` must have exactly the keys ``'info'``, ``'data'``
     and ``'metadescriptions'``.
     """
     tdm = build_hamlet_jz_corpus_with_meta()
     scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
     with self.assertRaises(AssertionError):
         scatter_chart.inject_metadata_descriptions(3323)
     # TODO: disabled checks for the "terms are not present" error message:
     # with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: blah'):
     #     scatter_chart.inject_metadata_descriptions({'blah': 'asjdkflasdjklfsadjk jsdkafsd'})
     # with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: cat2'):
     #     scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd', 'cat2': 'asdf'})
     returned_chart = scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
     # A bare ``assert`` would be removed under ``python -O``; assertEqual is not.
     self.assertEqual(scatter_chart, returned_chart)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(set(j.keys()), set(['info', 'data', 'metadescriptions']))
Example #16
0
    def test_change_categories(self):
        """Check ``change_category_names`` length validation and renaming.

        Lists of the wrong length (0 or 1 names for a 2-category corpus) must
        raise with a message stating both counts.  A correctly sized list
        must yield a corpus reporting the new names while the corpus it was
        called on still reports the old ones.
        """
        corpus = build_hamlet_jz_corpus_with_meta()
        with self.assertRaisesRegex(
                Exception,
                r"The number of category names passed \(0\) needs to equal the number of categories in the corpus \(2\)\."
        ):
            corpus.change_category_names([])
        with self.assertRaisesRegex(
                Exception,
                r"The number of category names passed \(1\) needs to equal the number of categories in the corpus \(2\)\."
        ):
            corpus.change_category_names(['a'])

        new_corpus = corpus.change_category_names(['aaa', 'bcd'])
        # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(new_corpus.get_categories(), ['aaa', 'bcd'])
        # The corpus the rename was called on must keep its category names.
        self.assertEqual(corpus.get_categories(), ['hamlet', 'jay-z/r. kelly'])
Example #17
0
 def test_to_json_use_non_text_features(self):
     """Check ``ScatterChart.to_dict`` output when non-text features are used.

     Asserts the top-level keys, the keys under ``'info'``, the set of terms
     under ``'data'``, and that the result can be passed to ``json.dumps``.
     """
     tdm = build_hamlet_jz_corpus_with_meta()
     # NOTE: a check that ``ScatterChart(term_doc_matrix=tdm).to_dict('hamlet')``
     # raises NoWordMeetsTermFrequencyRequirementsError is disabled here.
     chart = ScatterChart(term_doc_matrix=tdm,
                          minimum_term_frequency=0,
                          use_non_text_features=True)
     j = chart.to_dict('hamlet')

     self.assertEqual(set(j.keys()), {'info', 'data'})

     expected_info_keys = {
         'not_category_name',
         'category_name',
         'category_terms',
         'not_category_terms',
         'category_internal_name',
     }
     self.assertEqual(set(j['info'].keys()), expected_info_keys)

     plotted_terms = set(entry['term'] for entry in j['data'])
     self.assertEqual(plotted_terms,
                      {'cat6', 'cat4', 'cat9', 'cat5', 'cat1', 'cat3', 'cat2'})

     # Raises if the dict contains anything that is not JSON-serializable.
     json.dumps(j)
 def test_to_json_use_non_text_features(self):
     """Verify ``to_dict('hamlet')`` keys and terms with ``use_non_text_features=True``.

     The only term expected under ``'data'`` is ``'cat1'``.
     """
     tdm = build_hamlet_jz_corpus_with_meta()
     # NOTE: a check that ``ScatterChart(term_doc_matrix=tdm).to_dict('hamlet')``
     # raises NoWordMeetsTermFrequencyRequirementsError is disabled here.
     chart = ScatterChart(term_doc_matrix=tdm,
                          minimum_term_frequency=0,
                          use_non_text_features=True)
     j = chart.to_dict('hamlet')

     self.assertEqual(set(j.keys()), {'info', 'data'})

     expected_info_keys = {
         'not_category_name',
         'category_name',
         'category_terms',
         'not_category_terms',
         'category_internal_name',
         'not_category_internal_names',
         'extra_category_internal_names',
         'neutral_category_internal_names',
         'categories',
     }
     self.assertEqual(set(j['info'].keys()), expected_info_keys)

     # A wider term set ('cat4', 'cat9', 'cat5', 'cat0', 'cat3', 'cat2', 'cat1')
     # is recorded as a disabled alternative expectation.
     plotted_terms = set(entry['term'] for entry in j['data'])
     self.assertEqual(plotted_terms, {'cat1'})
	def test_extra_features(self):
		"""Check the ``'docs'`` section of ``ScatterChartExplorer.to_dict``.

		The explorer is built with ``use_non_text_features=True`` and given one
		metadata label per document; ``j['docs']`` is compared against the
		expected labels, categories, extra features, meta and texts.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
		j = (ScatterChartExplorer(corpus,
		                          minimum_term_frequency=0,
		                          use_non_text_features=True)
			.to_dict('hamlet', metadata=meta))
		# Each of the eight documents is expected to carry the same single
		# extra feature.
		extras = [{'cat1': 2}] * 8
		# Let unittest print the complete diff if the dict comparison fails.
		self.maxDiff = None
		# Convert the labels to a plain list so they compare against a list literal.
		j['docs']['labels'] = list(j['docs']['labels'])
		self.assertEqual(j['docs'],
		                 {'labels': [0, 0, 0, 0, 1, 1, 1, 1],
		                  'categories': ['hamlet', 'jay-z/r. kelly'],
		                  'extra': extras,
		                  'meta': ['one',
		                           'two',
		                           'three',
		                           'four',
		                           'five',
		                           'six',
		                           'seven',
		                           'eight'],
		                  'texts': ["what art thou that usurp'st this time of night,",
		                            'together with that fair and warlike form',
		                            'in which the majesty of buried denmark',
		                            'did sometimes march? by heaven i charge thee, speak!',
		                            'halt! who goes there?',
		                            'it is i sire tone from brooklyn.',
		                            'well, speak up man what is it?',
		                            'news from the east sire! the best of both worlds has returned!']}
		                 )
	def test_extra_features(self):
		"""Check ``get_labels_and_texts_and_meta`` output with non-text features enabled.

		Supplies one ``'meta<i>'`` label per document and compares the returned
		dict (categories, texts, meta, labels, extra) against a literal.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		d = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
		metadata = ['meta%s' % (i) for i in range(corpus.get_num_docs())]
		output = d.get_labels_and_texts_and_meta(metadata)
		# Each of the eight documents is expected to carry the same single
		# extra feature.
		extra_val = [{'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}, {'cat1': 2}]
		# Convert the labels to a plain list so they compare against a list literal.
		output['labels'] = list(output['labels'])
		self.assertEqual(output, {'categories': ['hamlet', 'jay-z/r. kelly'],
		                          'texts': ["what art thou that usurp'st this time of night,",
		                                    'together with that fair and warlike form',
		                                    'in which the majesty of buried denmark',
		                                    'did sometimes march? by heaven i charge thee, speak!', 'halt! who goes there?',
		                                    'it is i sire tone from brooklyn.', 'well, speak up man what is it?',
		                                    'news from the east sire! the best of both worlds has returned!'],
		                          'meta': ['meta0', 'meta1', 'meta2', 'meta3', 'meta4', 'meta5', 'meta6', 'meta7'],
		                          'labels': [0, 0, 0, 0, 1, 1, 1, 1],
		                          'extra': extra_val}
		                 )
Example #21
0
 def test_metadata_in_use(self):
     """Check ``metadata_in_use`` is False for the plain matrix and True for the corpus with metadata."""
     without_meta = get_hamlet_term_doc_matrix()
     self.assertFalse(without_meta.metadata_in_use())

     with_meta = build_hamlet_jz_corpus_with_meta()
     self.assertTrue(with_meta.metadata_in_use())
 def test_metadata_in_use(self):
     """Verify ``metadata_in_use`` for a matrix built without and with metadata."""
     plain_matrix = get_hamlet_term_doc_matrix()
     plain_uses_metadata = plain_matrix.metadata_in_use()
     self.assertFalse(plain_uses_metadata)

     meta_corpus = build_hamlet_jz_corpus_with_meta()
     corpus_uses_metadata = meta_corpus.metadata_in_use()
     self.assertTrue(corpus_uses_metadata)
Example #23
0
 def test_get_metadata_doc_mat(self):
     """Check the metadata matrix shape is (number of docs, rows of the metadata frequency frame)."""
     corpus = build_hamlet_jz_corpus_with_meta()
     metadata_matrix = corpus.get_metadata_doc_mat()
     expected_shape = (corpus.get_num_docs(),
                       len(corpus.get_metadata_freq_df()))
     np.testing.assert_array_equal(metadata_matrix.shape, expected_shape)
Example #24
0
 def test_get_metadata(self):
     """Check ``get_metadata`` returns ``['cat1']`` for the hamlet/jay-z corpus with metadata."""
     corpus = build_hamlet_jz_corpus_with_meta()
     metadata_names = corpus.get_metadata()
     self.assertEqual(metadata_names, ['cat1'])
Example #25
0
 def test_get_metadata_count_mat(self):
     """Check ``get_metadata_count_mat`` is (almost) equal to ``[[4, 4]]``."""
     hamlet_meta = build_hamlet_jz_corpus_with_meta()
     count_matrix = hamlet_meta.get_metadata_count_mat()
     np.testing.assert_array_almost_equal(count_matrix, [[4, 4]])
 def test_get_metadata_freq_df(self):
     """Verify ``get_metadata_freq_df`` column names for the default and ``''`` arguments."""
     corpus = build_hamlet_jz_corpus_with_meta()

     with_default_label = corpus.get_metadata_freq_df()
     self.assertEqual(list(with_default_label.columns),
                      ['hamlet freq', 'jay-z/r. kelly freq'])

     with_empty_label = corpus.get_metadata_freq_df('')
     self.assertEqual(list(with_empty_label.columns),
                      ['hamlet', 'jay-z/r. kelly'])
 def test_get_metadata(self):
     """Verify the corpus built with metadata reports a single metadata name, ``'cat1'``."""
     expected_metadata = ['cat1']
     actual_metadata = build_hamlet_jz_corpus_with_meta().get_metadata()
     self.assertEqual(actual_metadata, expected_metadata)
 def test_get_ranks_meta(self):
     """Check ``ZScores.get_name`` after setting a term ranker and category.

     The scorer is configured with ``OncePerDocFrequencyRanker`` and the
     ``'hamlet'`` category before its name is compared.
     """
     corpus = build_hamlet_jz_corpus_with_meta()
     # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(
         ZScores(corpus).set_term_ranker(
             OncePerDocFrequencyRanker).set_categories('hamlet').get_name(),
         "Z-Score from Welch's T-Test")
 def test_get_metadata_doc_mat(self):
     """Verify ``get_metadata_doc_mat`` has one row per document and one column per metadata frequency row."""
     hamlet_meta = build_hamlet_jz_corpus_with_meta()
     actual_shape = hamlet_meta.get_metadata_doc_mat().shape
     num_docs = hamlet_meta.get_num_docs()
     num_metadata = len(hamlet_meta.get_metadata_freq_df())
     np.testing.assert_array_equal(actual_shape, (num_docs, num_metadata))
Example #30
0
	def test_get_ranks_meta(self):
		"""Check the name reported by ``ZScores`` once a ranker and category are set.

		Uses ``OncePerDocFrequencyRanker`` as the term ranker and ``'hamlet'``
		as the category.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		# assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
		self.assertEqual(ZScores(corpus)
						  .set_term_ranker(OncePerDocFrequencyRanker)
						  .set_categories('hamlet').get_name(), "Z-Score from Welch's T-Test")
 def test_get_metadata_count_mat(self):
     """Verify the metadata count matrix of the corpus is (almost) ``[[4, 4]]``."""
     expected_counts = [[4, 4]]
     actual_counts = build_hamlet_jz_corpus_with_meta().get_metadata_count_mat()
     np.testing.assert_array_almost_equal(actual_counts, expected_counts)