def test_quantile(self):
     # Quantiles of the integer column 1..5, compared to one decimal place.
     sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
     for expected, q in ((3, 0.5), (4, 0.8), (5, 0.9), (5, 0.99)):
         self.assertAlmostEqual(expected, sketch.quantile(q), places=1)
 def test_quantile(self):
     # Each case pairs the expected value with the quantile requested;
     # results are compared to one decimal place.
     cases = [(3, 0.5), (4, 0.8), (5, 0.9), (5, 0.99)]
     summary = XArray([1, 2, 3, 4, 5]).sketch_summary()
     for want, fraction in cases:
         self.assertAlmostEqual(want, summary.quantile(fraction), places=1)
 def test_construct(self):
     # Basic statistics expected for the integer column 1..5.
     sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
     # Exact-valued statistics.
     self.assertEqual(5, sketch.size())
     self.assertEqual(5, sketch.max())
     self.assertEqual(1, sketch.min())
     self.assertEqual(15, sketch.sum())
     self.assertEqual(3, sketch.mean())
     # Floating-point statistics are compared approximately.
     self.assertAlmostEqual(1.4142135623730951, sketch.std())
     self.assertAlmostEqual(2.0, sketch.var())
 def test_construct(self):
     # Build a sketch over 1..5 and check its size, extremes, sum, mean,
     # standard deviation and variance.
     values = XArray([1, 2, 3, 4, 5])
     summary = values.sketch_summary()
     self.assertEqual(5, summary.size())
     self.assertEqual(5, summary.max())
     self.assertEqual(1, summary.min())
     self.assertEqual(15, summary.sum())
     self.assertEqual(3, summary.mean())
     # std and var are floats, so they are compared approximately.
     self.assertAlmostEqual(1.4142135623730951, summary.std())
     self.assertAlmostEqual(2.0, summary.var())
 def test_missing(self):
     # Sketch of an int column whose only element is missing: min, max,
     # var and std are expected to be None, while mean, sum and avg_length
     # are expected to be zero.
     t = XArray([None], dtype=int)
     ss = t.sketch_summary()
     self.assertIsNone(ss.min())
     self.assertIsNone(ss.max())
     self.assertEqual(0, ss.mean())
     self.assertEqual(0.0, ss.sum())
     self.assertIsNone(ss.var())
     self.assertIsNone(ss.std())
     self.assertEqual(0, ss.avg_length())
 def test_missing(self):
     # An int column holding a single missing value. Each statistic is
     # checked exactly once: None for min/max/var/std, zero for
     # mean/sum/avg_length.
     t = XArray([None], dtype=int)
     ss = t.sketch_summary()
     self.assertIsNone(ss.min())
     self.assertIsNone(ss.max())
     self.assertEqual(0, ss.mean())
     self.assertEqual(0.0, ss.sum())
     self.assertIsNone(ss.var())
     self.assertIsNone(ss.std())
     self.assertEqual(0, ss.avg_length())
 def test_tf_idf_str(self):
     # Two string documents. 'test' appears in both and is expected to
     # score 0.0; every other word is expected to share one non-zero score.
     score = 0.4054651081081644
     docs = XArray(['this is a test', 'another test'])
     tf_idf = docs.sketch_summary().tf_idf()
     expected_first = {'this': score, 'a': score, 'is': score, 'test': 0.0}
     expected_second = {'test': 0.0, 'another': score}
     self.assertEqual(expected_first, tf_idf[0])
     self.assertEqual(expected_second, tf_idf[1])
 def test_tf_idf_str(self):
     # Expected per-document scores, in document order.
     nonzero = 0.4054651081081644
     expected = [
         dict.fromkeys(['this', 'a', 'is'], nonzero),
         {'another': nonzero},
     ]
     # 'test' occurs in both documents and is expected to score 0.0.
     for scores in expected:
         scores['test'] = 0.0

     summary = XArray(['this is a test', 'another test']).sketch_summary()
     tf_idf = summary.tf_idf()
     self.assertEqual(expected[0], tf_idf[0])
     self.assertEqual(expected[1], tf_idf[1])
    def tf_idf(self):
        """
        Computes a tf-idf analysis for every document in the collection.

        When the elements of the column are documents in string form, a simple
        splitter is used to turn each one into a list of words.

        When the elements are already lists, the list elements themselves are
        used as the terms.  These are usually strings, but could be numeric.

        Returns
        -------
        out : XArray of dict
            For each document, a dictionary mapping terms to their tf_idf score.

        Raises
        ------
        TypeError
            If the column type is neither ``list`` nor ``str``.
        """
        supported_types = (list, str)
        if self._impl.dtype not in supported_types:
            raise TypeError('Column must be of type "list" or "str".')

        # The implementation object does the work; wrap its result.
        scores_impl = self._impl.tf_idf()
        return XArray(data=[], impl=scores_impl)
Exemple #10
0
 def test_avg_length_float(self):
     # avg_length is expected to be 1 for a column of floats.
     sketch = XArray([1.0, 2.0, 3.0, 4.0, 5.0]).sketch_summary()
     self.assertEqual(1, sketch.avg_length())
Exemple #11
0
 def test_avg_length_int(self):
     # avg_length is expected to be 1 for a column of ints.
     sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
     self.assertEqual(1, sketch.avg_length())
 def test_frequency_count(self):
     # The value 3 appears twice in the column.
     sketch = XArray([1, 2, 3, 4, 5, 3]).sketch_summary()
     self.assertEqual(2, sketch.frequency_count(3))
 def test_avg_length_int(self):
     # Integer elements: the expected average length is 1.
     column = XArray([1, 2, 3, 4, 5])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(1, avg)
Exemple #14
0
 def test_frequency_count(self):
     # 3 is the only repeated value; its expected count is 2.
     column = XArray([1, 2, 3, 4, 5, 3])
     summary = column.sketch_summary()
     count = summary.frequency_count(3)
     self.assertEqual(2, count)
 def test_avg_length_str(self):
     # String lengths are 1..5, so the expected average length is 3.
     sketch = XArray(['a', 'bb', 'ccc', 'dddd', 'eeeee']).sketch_summary()
     self.assertEqual(3, sketch.avg_length())
Exemple #16
0
 def test_avg_length_str(self):
     # Strings of length 1, 2, 3, 4 and 5: the expected average is 3.
     column = XArray(['a', 'bb', 'ccc', 'dddd', 'eeeee'])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(3, avg)
 def test_avg_length_list(self):
     # Lists of length 4 and 2: the expected average length is 3.
     sketch = XArray([[1, 2, 3, 4], [5, 6]]).sketch_summary()
     self.assertEqual(3, sketch.avg_length())
Exemple #18
0
 def test_frequent_items(self):
     # Expected mapping from each value to its number of occurrences.
     expected = {1: 1, 2: 2, 3: 1}
     sketch = XArray([1, 2, 3, 2]).sketch_summary()
     self.assertEqual(expected, sketch.frequent_items())
Exemple #19
0
 def test_num_undefined(self):
     # The column contains exactly one None.
     sketch = XArray([1, 2, 3, 4, 5, None]).sketch_summary()
     self.assertEqual(1, sketch.num_undefined())
 def test_frequent_items(self):
     # 2 occurs twice; 1 and 3 occur once each.
     column = XArray([1, 2, 3, 2])
     summary = column.sketch_summary()
     counts = summary.frequent_items()
     self.assertEqual({1: 1, 2: 2, 3: 1}, counts)
 def test_num_unique(self):
     # All five values in the column are distinct.
     sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
     self.assertEqual(5, sketch.num_unique())
 def test_num_undefined(self):
     # Five integers followed by a single None: one undefined value expected.
     column = XArray([1, 2, 3, 4, 5, None])
     summary = column.sketch_summary()
     missing = summary.num_undefined()
     self.assertEqual(1, missing)
 def test_avg_length_empty(self):
     # avg_length is expected to be 0 for an empty column.
     sketch = XArray([]).sketch_summary()
     self.assertEqual(0, sketch.avg_length())
Exemple #24
0
 def test_avg_length_list(self):
     # One list of four elements and one of two: expected average is 3.
     column = XArray([[1, 2, 3, 4], [5, 6]])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(3, avg)
Exemple #25
0
 def test_avg_length_dict(self):
     # Dicts with 4 and 2 entries: the expected average length is 3.
     sketch = XArray([{1: 1, 2: 2, 3: 3, 4: 4}, {5: 5, 6: 6}]).sketch_summary()
     self.assertEqual(3, sketch.avg_length())
 def test_avg_length_float(self):
     # Float elements: the expected average length is 1.
     column = XArray([1.0, 2.0, 3.0, 4.0, 5.0])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(1, avg)
Exemple #27
0
 def test_avg_length_empty(self):
     # No elements at all: the expected average length is 0.
     column = XArray([])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(0, avg)
 def test_avg_length_dict(self):
     # A four-entry dict and a two-entry dict: expected average is 3.
     column = XArray([{1: 1, 2: 2, 3: 3, 4: 4}, {5: 5, 6: 6}])
     summary = column.sketch_summary()
     avg = summary.avg_length()
     self.assertEqual(3, avg)
Exemple #29
0
 def test_num_unique(self):
     # Five distinct integers: five unique values expected.
     column = XArray([1, 2, 3, 4, 5])
     summary = column.sketch_summary()
     unique = summary.num_unique()
     self.assertEqual(5, unique)
Exemple #30
0
    def __repr__(self):
        """
        Emits a brief summary of all the statistics as a string.

        The method currently returns the placeholder ``"<sketch>"``
        immediately, so the report-building code after the first ``return``
        is unreachable until that placeholder is removed.
        """
        return "<sketch>"     # TODO remove

        # Each entry: [method name on self, display label, 'is exact' flag].
        fields = [
            ['size',           'Length' ,       'Yes'],
            ['min',            'Min' ,          'Yes'],
            ['max',            'Max' ,          'Yes'],
            ['mean',           'Mean' ,         'Yes'],
            ['sum',            'Sum' ,          'Yes'],
            ['var',            'Variance' ,     'Yes'],
            ['std',            'Standard Deviation' , 'Yes'],
            ['num_undefined', '# Missing Values' , 'Yes',],
            ['num_unique',     '# unique values',  'No' ]
        ]

        s = '\n'
        result = []
        for field in fields:
            # Best effort: a statistic whose lookup or call raises is left
            # out.  Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                method_to_call = getattr(self, field[0])
                result.append([field[1], str(method_to_call()), field[2]])
            except Exception:
                pass
        sf = XArray(result).unpack(column_name_prefix="")
        sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
        s += sf.__str__(footer=False)
        s += "\n"

        s += "\nMost frequent items:\n"
        frequent = self.frequent_items()
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        # Sort by count, highest first.
        sorted_freq = sorted(frequent.items(), key=operator.itemgetter(1), reverse=True)
        if not sorted_freq:
            s += " -- All elements appear with less than 0.01% frequency -- \n"
        else:
            sorted_freq = sorted_freq[:10]
            sf = XFrame()
            sf.add_column(XArray(['count']), 'value')
            for elem in sorted_freq:
                sf.add_column(XArray([elem[1]]), str(elem[0]))
            s += sf.__str__(footer=False) + "\n"
        s += "\n"

        try:
            # Probe once before writing the heading, so that nothing is
            # appended when the quantile call raises.
            self.quantile(0)
            s += "Quantiles: \n"
            sf = XFrame()
            for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
                sf.add_column(XArray([self.quantile(q)]), str(int(q * 100)) + '%')
            s += sf.__str__(footer=False) + "\n"
        except Exception:
            pass

        try:
            # Both summaries are fetched before anything is appended, so a
            # failure in either one skips the whole dictionary section.
            t_k = self.dict_key_summary()
            t_v = self.dict_value_summary()
            s += "\n******** Dictionary Element Key Summary ********\n"
            s += t_k.__repr__()
            s += "\n******** Dictionary Element Value Summary ********\n"
            s += t_v.__repr__() + '\n'
        except Exception:
            pass

        try:
            t_k = self.element_summary()
            s += "\n******** Element Summary ********\n"
            s += t_k.__repr__() + '\n'
        except Exception:
            pass

        return s.expandtabs(8)
    def __repr__(self):
        """
        Emits a brief summary of all the statistics as a string.

        For now the placeholder ``"<sketch>"`` is returned immediately; the
        report-building code after that ``return`` does not run until the
        placeholder is removed.
        """
        return "<sketch>"  # TODO remove

        # Rows of [method name on self, display label, 'is exact' flag].
        fields = [['size', 'Length', 'Yes'], ['min', 'Min', 'Yes'],
                  ['max', 'Max', 'Yes'], ['mean', 'Mean', 'Yes'],
                  ['sum', 'Sum', 'Yes'], ['var', 'Variance', 'Yes'],
                  ['std', 'Standard Deviation', 'Yes'],
                  [
                      'num_undefined',
                      '# Missing Values',
                      'Yes',
                  ], ['num_unique', '# unique values', 'No']]

        s = '\n'
        result = []
        for field in fields:
            # A statistic whose lookup or call raises is skipped.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            try:
                method_to_call = getattr(self, field[0])
                result.append([field[1], str(method_to_call()), field[2]])
            except Exception:
                pass
        sf = XArray(result).unpack(column_name_prefix="")
        sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
        s += sf.__str__(footer=False)
        s += "\n"

        s += "\nMost frequent items:\n"
        frequent = self.frequent_items()
        # items() works on Python 2 and 3, unlike iteritems().
        # Highest counts come first.
        sorted_freq = sorted(frequent.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        if not sorted_freq:
            s += " -- All elements appear with less than 0.01% frequency -- \n"
        else:
            sorted_freq = sorted_freq[:10]
            sf = XFrame()
            sf.add_column(XArray(['count']), 'value')
            for elem in sorted_freq:
                sf.add_column(XArray([elem[1]]), str(elem[0]))
            s += sf.__str__(footer=False) + "\n"
        s += "\n"

        try:
            # Probe call: if it raises, the quantile section is skipped
            # before its heading is written.
            self.quantile(0)
            s += "Quantiles: \n"
            sf = XFrame()
            for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
                sf.add_column(XArray([self.quantile(q)]),
                              str(int(q * 100)) + '%')
            s += sf.__str__(footer=False) + "\n"
        except Exception:
            pass

        try:
            # Fetch both summaries first so a failure in either leaves the
            # output without a partial dictionary section.
            t_k = self.dict_key_summary()
            t_v = self.dict_value_summary()
            s += "\n******** Dictionary Element Key Summary ********\n"
            s += t_k.__repr__()
            s += "\n******** Dictionary Element Value Summary ********\n"
            s += t_v.__repr__() + '\n'
        except Exception:
            pass

        try:
            t_k = self.element_summary()
            s += "\n******** Element Summary ********\n"
            s += t_k.__repr__() + '\n'
        except Exception:
            pass

        return s.expandtabs(8)
def eq_list(expected, result):
    """Compare ``XArray(expected)`` with ``result`` using ``==`` and return ``.all()`` of that comparison."""
    comparison = XArray(expected) == result
    return comparison.all()