Beispiel #1
0
 def test_title_case_names(self):
     """Check category labels are title-cased only when ``title_case_names=True``."""
     # Without the flag, the supplied labels must come back verbatim.
     plain_corpus = build_hamlet_jz_term_doc_mat()
     plain_chart = ScatterChart(term_doc_matrix=plain_corpus, minimum_term_frequency=0)
     plain_info = plain_chart.to_dict('hamlet', 'HAMLET', 'NOT HAMLET')['info']
     self.assertEqual(plain_info['category_name'], 'HAMLET')
     self.assertEqual(plain_info['not_category_name'], 'NOT HAMLET')

     # With the flag, the same labels are expected in title case.
     titled_corpus = build_hamlet_jz_term_doc_mat()
     titled_chart = ScatterChart(term_doc_matrix=titled_corpus, minimum_term_frequency=0)
     titled_info = titled_chart.to_dict(
         'hamlet', 'HAMLET', 'NOT HAMLET', title_case_names=True)['info']
     self.assertEqual(titled_info['category_name'], 'Hamlet')
     self.assertEqual(titled_info['not_category_name'], 'Not Hamlet')
 def test_title_case_names(self):
     """Check the ``title_case_names`` switch of ``ScatterChart.to_dict``.

     Each case builds a fresh term-document matrix and chart, then compares
     the two category labels reported under ``info``.
     """
     cases = (
         # (extra to_dict kwargs, expected category label, expected not-category label)
         ({}, 'HAMLET', 'NOT HAMLET'),
         ({'title_case_names': True}, 'Hamlet', 'Not Hamlet'),
     )
     for extra_kwargs, want_category, want_not_category in cases:
         corpus = build_hamlet_jz_term_doc_mat()
         chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
         info = chart.to_dict('hamlet', 'HAMLET', 'NOT HAMLET', **extra_kwargs)['info']
         self.assertEqual(info['category_name'], want_category)
         self.assertEqual(info['not_category_name'], want_not_category)
 def test_inject_coordinates(self):
     """Check which coordinate pairs ``inject_coordinates`` accepts.

     Empty or mismatched inputs, raw (unnormalised) frequency columns and
     negated columns must raise ``CoordinatesNotRightException``; columns
     divided by their own maximum must be accepted.
     """
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     first_column = freq_df[freq_df.columns[0]]
     # Use the builtin ``float``: the ``np.float`` alias was deprecated in
     # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = first_column.astype(float)
     rejected_pairs = [
         ([], []),
         (first_column, []),
         ([], first_column),
         (x, y),
         (x, y / y.max()),
         (x / x.max(), y),
         (-x / x.max(), -y / y.max()),
         (-x / x.max(), y / y.max()),
         (x / x.max(), -y / y.max()),
     ]
     for bad_x, bad_y in rejected_pairs:
         with self.assertRaises(CoordinatesNotRightException):
             scatter_chart.inject_coordinates(bad_x, bad_y)
     # The max-normalised pair is the only one expected to pass validation.
     scatter_chart.inject_coordinates(x / x.max(), y / y.max())
Beispiel #4
0
	def test_get_p_vals(self):
		"""Check that every p-value returned by ``get_p_vals`` lies in [0, 1]."""
		tdm = build_hamlet_jz_term_doc_mat()
		df = tdm.get_term_freq_df()
		X = df[['hamlet freq', 'jay-z/r. kelly freq']].values
		pvals = LogOddsRatioUninformativeDirichletPrior().get_p_vals(X)
		self.assertGreaterEqual(min(pvals), 0)
		# The upper bound must be checked on the largest value; testing
		# min(pvals) here would leave p-values above 1 undetected.
		self.assertLessEqual(max(pvals), 1)
 def test_inject_coordinates(self):
     """Check ``inject_coordinates`` input validation.

     The first three calls pass empty or mismatched inputs, the next three
     pass at least one raw frequency column, and the following three negate
     at least one max-normalised column; all must raise
     ``CoordinatesNotRightException``. The final call, with both columns
     divided by their maximum, must succeed.
     """
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(freq_df[freq_df.columns[0]], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], freq_df[freq_df.columns[0]])
     # Builtin ``float`` replaces ``np.float``, an alias deprecated in
     # NumPy 1.20 and removed in 1.24 (AttributeError on current releases).
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), y)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), -y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), -y / y.max())
     scatter_chart.inject_coordinates(x / x.max(), y / y.max())
 def test_to_json(self):
     """Check the layout of ``to_dict('hamlet')`` and one sample datum."""
     corpus = build_hamlet_jz_term_doc_mat()
     # Disabled check, kept for reference:
     # with self.assertRaises(NoWordMeetsTermFrequencyRequirementsError):
     #     ScatterChart(term_doc_matrix=corpus).to_dict('hamlet')
     chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     payload = chart.to_dict('hamlet')
     self.assertEqual(set(payload.keys()), {'info', 'data'})
     info_keys = {
         'not_category_name',
         'category_name',
         'category_terms',
         'not_category_terms',
         'category_internal_name',
         'not_category_internal_names',
         'categories',
     }
     self.assertEqual(set(payload['info'].keys()), info_keys)
     # Only the key set, the two per-25k counts and the term are compared
     # below; the remaining values are not asserted.
     expected = {
         'x': 0.0, 'y': 0.42,
         'ox': 0, 'oy': 0.42,
         'term': 'art',
         'cat25k': 758, 'ncat25k': 0,
         'neut25k': 0, 'neut': 0,
         's': 0.5, 'os': 3, 'bg': 3,
     }
     datum = self._get_data_example(payload)
     for count_key in ('cat25k', 'ncat25k'):
         np.testing.assert_almost_equal(
             expected[count_key], datum[count_key], decimal=1)
     self.assertEqual(set(expected.keys()), set(datum.keys()))
     self.assertEqual(expected['term'], datum['term'])
	def test_compact(self):
		"""Check that compacting with ``minimum_pmi=10`` only removes bigrams."""
		full_matrix = build_hamlet_jz_term_doc_mat()
		compacted = PhraseSelector(minimum_pmi=10).compact(full_matrix)
		# A term containing a space is treated as a bigram.
		bigrams_before = [term for term in full_matrix.get_terms() if ' ' in term]
		bigrams_after = [term for term in compacted.get_terms() if ' ' in term]
		self.assertLess(len(bigrams_after), len(bigrams_before))
		# No bigram may appear that was absent from the source matrix.
		self.assertTrue(set(bigrams_after).issubset(set(bigrams_before)))
Beispiel #8
0
    def test_max_terms(self):
        """Check that ``max_terms`` caps the number of emitted data points."""
        tdm = build_hamlet_jz_term_doc_mat()

        # Small caps: exactly ``max_terms`` entries are expected.
        for cap in (2, 10):
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 max_terms=cap)
            payload = chart.to_dict('hamlet')
            self.assertEqual(cap, len(payload['data']))

        # A very large cap, or no cap, with the PMI threshold coefficient
        # set to 0: one entry per row of the term-frequency frame.
        for cap in (10000, None):
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 pmi_threshold_coefficient=0,
                                 max_terms=cap)
            payload = chart.to_dict('hamlet')
            self.assertEqual(len(tdm.get_term_freq_df()), len(payload['data']))
Beispiel #9
0
    def test_inject_metadata_term_lists(self):
        """Check validation and output of ``inject_metadata_term_lists``."""
        plain_corpus = build_hamlet_jz_term_doc_mat()
        plain_chart = ScatterChart(term_doc_matrix=plain_corpus,
                                   minimum_term_frequency=0)
        # The plain term-document matrix is expected to be rejected.
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            plain_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                                  minimum_term_frequency=0,
                                  use_non_text_features=True)
        malformed_inputs = (
            {'blash': [3, 1]},     # list of non-strings
            {3: ['a', 'b']},       # non-string key
            {'a': {'a', 'b'}},     # set instead of list
            3,                     # not a dict at all
        )
        for malformed in malformed_inputs:
            with self.assertRaises(TypeError):
                meta_chart.inject_metadata_term_lists(malformed)

        # A well-formed mapping is accepted and a ScatterChart is returned.
        returned = meta_chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(returned), ScatterChart)

        payload = meta_chart.to_dict('hamlet')
        self.assertEqual(set(payload.keys()), {'info', 'data', 'metalists'})
        expected_info_keys = {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_terms',
            'category_internal_name',
            'not_category_internal_names',
            'extra_category_internal_names',
            'neutral_category_internal_names',
            'categories',
        }
        self.assertEqual(set(payload['info'].keys()), expected_info_keys)
Beispiel #10
0
 def test_compact(self):
     """Check ``PhraseSelector(minimum_pmi=10).compact`` prunes bigrams."""
     source = build_hamlet_jz_term_doc_mat()
     pruned = PhraseSelector(minimum_pmi=10).compact(source)
     # Terms containing a space are counted as bigrams.
     source_bigrams = [term for term in source.get_terms() if ' ' in term]
     pruned_bigrams = [term for term in pruned.get_terms() if ' ' in term]
     self.assertLess(len(pruned_bigrams), len(source_bigrams))
     # Every surviving bigram must already exist in the source matrix.
     leftovers = set(pruned_bigrams) - set(source_bigrams)
     self.assertTrue(leftovers == set())
    def test_max_terms(self):
        """Check the number of data points emitted for several ``max_terms`` values."""
        tdm = build_hamlet_jz_term_doc_mat()

        def emitted_count(**chart_kwargs):
            # Build a chart over the shared matrix and count its data points.
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 **chart_kwargs)
            return len(chart.to_dict('hamlet')['data'])

        self.assertEqual(2, emitted_count(max_terms=2))
        self.assertEqual(10, emitted_count(max_terms=10))

        # With the PMI threshold coefficient at 0, a huge cap or no cap is
        # expected to yield one data point per term-frequency row.
        emitted = emitted_count(pmi_threshold_coefficient=0, max_terms=10000)
        self.assertEqual(len(tdm.get_term_freq_df()), emitted)

        emitted = emitted_count(pmi_threshold_coefficient=0, max_terms=None)
        self.assertEqual(len(tdm.get_term_freq_df()), emitted)
    def test_inject_metadata_term_lists(self):
        """Exercise ``inject_metadata_term_lists`` with bad and good inputs.

        A chart over the plain matrix must raise
        ``TermDocMatrixHasNoMetadataException``. A chart over the corpus with
        metadata must raise ``TypeError`` for malformed arguments, return a
        ``ScatterChart`` for a valid one, and add ``metalists`` to the output.
        """
        no_meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                     minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            no_meta_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                             minimum_term_frequency=0,
                             use_non_text_features=True)
        inject = chart.inject_metadata_term_lists

        with self.assertRaises(TypeError):
            inject({'blash': [3, 1]})
        with self.assertRaises(TypeError):
            inject({3: ['a', 'b']})
        with self.assertRaises(TypeError):
            inject({'a': {'a', 'b'}})
        with self.assertRaises(TypeError):
            inject(3)

        self.assertEqual(type(inject({'a': ['a', 'b']})), ScatterChart)

        result = chart.to_dict('hamlet')
        self.assertEqual(set(result.keys()), {'info', 'data', 'metalists'})
        self.assertEqual(
            set(result['info'].keys()),
            {
                'not_category_name',
                'category_name',
                'category_terms',
                'not_category_terms',
                'category_internal_name',
                'not_category_internal_names',
                'extra_category_internal_names',
                'neutral_category_internal_names',
                'categories',
            },
        )
Beispiel #13
0
 def test_terms_to_include(self):
     """Check that ``terms_to_include`` restricts the output to exactly those terms."""
     corpus = build_hamlet_jz_term_doc_mat()
     wanted_terms = sorted(
         ['both worlds', 'thou', 'the', 'of', 'st', 'returned', 'best'])
     chart = ScatterChart(term_doc_matrix=corpus,
                          minimum_term_frequency=0,
                          terms_to_include=wanted_terms)
     payload = chart.to_dict('hamlet', 'HAMLET', 'NOT HAMLET')
     # Compare in sorted order so the chart's own ordering does not matter.
     emitted_terms = sorted(datum['term'] for datum in payload['data'])
     self.assertEqual(emitted_terms, wanted_terms)
 def test_p_vals(self):
     """Check that a ``term_significance`` scorer adds a ``'p'`` key to the sample datum."""
     corpus = build_hamlet_jz_term_doc_mat()
     scorer = LogOddsRatioUninformativeDirichletPrior()
     chart = ScatterChart(term_doc_matrix=corpus,
                          minimum_term_frequency=0,
                          term_significance=scorer)
     payload = chart.to_dict('hamlet')
     sample = self._get_data_example(payload)
     self.assertIn('p', sample.keys())
Beispiel #15
0
 def test_p_vals(self):
     """Check the sample datum carries a ``'p'`` entry when ``term_significance`` is set."""
     chart = ScatterChart(
         term_doc_matrix=build_hamlet_jz_term_doc_mat(),
         minimum_term_frequency=0,
         term_significance=LogOddsRatioUninformativeDirichletPrior(),
     )
     example = self._get_data_example(chart.to_dict('hamlet'))
     self.assertIn('p', example.keys())
 def test_terms_to_include(self):
     """Check the emitted terms equal the ``terms_to_include`` list."""
     requested = ['both worlds', 'thou', 'the', 'of', 'st', 'returned', 'best']
     requested.sort()
     tdm = build_hamlet_jz_term_doc_mat()
     result = ScatterChart(
         term_doc_matrix=tdm,
         minimum_term_frequency=0,
         terms_to_include=requested,
     ).to_dict('hamlet', 'HAMLET', 'NOT HAMLET')
     # Sort the emitted terms so the comparison is order-independent.
     emitted = [entry['term'] for entry in result['data']]
     emitted.sort()
     self.assertEqual(emitted, requested)
Beispiel #17
0
 def test_inject_coordinates_original(self):
     """Check that ``original_x``/``original_y`` are accepted by ``inject_coordinates``.

     Verifies the key set of the first datum and the ``ox``/``oy`` values
     reported for the term ``'and'``.
     """
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # Builtin ``float`` replaces ``np.float``, which was deprecated in
     # NumPy 1.20 and removed in 1.24.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(j['data'][0].keys(),
                      {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k', 'extra25k', 'extra', 'oy',
                       'term',
                       's', 'bg'})
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
 def test_inject_coordinates_original(self):
     """Inject normalised coordinates together with their original values.

     Asserts the key set of the first emitted datum and that the entry for
     the term ``'and'`` reports ``ox == 0`` and ``oy == 1``.
     """
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # ``np.float`` no longer exists as of NumPy 1.24; the builtin ``float``
     # is the drop-in replacement for ``astype``.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(),
                                      original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     expected_keys = {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox',
                      'neut25k', 'extra25k', 'extra', 'oy', 'term', 's', 'bg'}
     self.assertEqual(j['data'][0].keys(), expected_keys)
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
    def test_max_terms(self):
        """Check the number of data points produced for several ``max_terms`` settings."""
        tdm = build_hamlet_jz_term_doc_mat()
        # Disabled check, kept for reference:
        # with self.assertRaises(NoWordMeetsTermFrequencyRequirementsError):
        #     ScatterChart(term_doc_matrix=tdm).to_dict('hamlet')

        # (max_terms, expected number of data points). 51 is the count this
        # test expects once the cap no longer binds.
        # NOTE(review): the origin of 51 is not visible here; confirm against the fixture.
        scenarios = ((2, 2), (10, 10), (10000, 51), (None, 51))
        for cap, expected_count in scenarios:
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 max_terms=cap)
            payload = chart.to_dict('hamlet')
            self.assertEqual(expected_count, len(payload['data']))
 def test_to_json(self):
     """Check the structure of ``to_dict('hamlet')`` and one sample datum."""
     matrix = build_hamlet_jz_term_doc_mat()
     # Disabled check, kept for reference:
     # with self.assertRaises(NoWordMeetsTermFrequencyRequirementsError):
     #     ScatterChart(term_doc_matrix=matrix).to_dict('hamlet')
     result = ScatterChart(term_doc_matrix=matrix,
                           minimum_term_frequency=0).to_dict('hamlet')
     self.assertEqual(set(result.keys()), {'info', 'data'})
     self.assertEqual(
         set(result['info'].keys()),
         {
             'not_category_name',
             'category_name',
             'category_terms',
             'not_category_terms',
             'category_internal_name',
             'not_category_internal_names',
             'neutral_category_internal_names',
             'extra_category_internal_names',
             'categories',
         },
     )
     # Reference datum: only its key set, the two per-25k counts and the
     # term are asserted below.
     expected = {
         'x': 0.0,
         'y': 0.42,
         'ox': 0,
         'oy': 0.42,
         'term': 'art',
         'cat25k': 758,
         'ncat25k': 0,
         'neut25k': 0,
         'neut': 0,
         'extra25k': 0,
         'extra': 0,
         's': 0.5,
         'os': 3,
         'bg': 3,
     }
     sample = self._get_data_example(result)
     for field in ('cat25k', 'ncat25k'):
         np.testing.assert_almost_equal(expected[field], sample[field], decimal=1)
     self.assertEqual(set(expected.keys()), set(sample.keys()))
     self.assertEqual(expected['term'], sample['term'])