def test_quantile(self):
    """
    Check sketch quantiles of the integers 1 through 5.

    Each result is compared with its expected value to one decimal place.
    """
    sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
    # (expected value, requested quantile), checked in this order.
    cases = [(3, 0.5), (4, 0.8), (5, 0.9), (5, 0.99)]
    for expected, q in cases:
        self.assertAlmostEqual(expected, sketch.quantile(q), places=1)
def test_construct(self):
    """
    Check the basic statistics reported for the integers 1 through 5.
    """
    sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()

    # Statistics compared for exact equality: (expected, accessor).
    exact_checks = [
        (5, sketch.size),
        (5, sketch.max),
        (1, sketch.min),
        (15, sketch.sum),
        (3, sketch.mean),
    ]
    for expected, statistic in exact_checks:
        self.assertEqual(expected, statistic())

    # Floating point results are compared approximately.
    self.assertAlmostEqual(1.4142135623730951, sketch.std())
    self.assertAlmostEqual(2.0, sketch.var())
def test_missing(self):
    """
    Check the statistics of an int column whose only element is missing.

    min, max, var and std are expected to be None, while mean, sum and
    avg_length are expected to be zero.
    """
    t = XArray([None], dtype=int)
    ss = t.sketch_summary()
    self.assertIsNone(ss.min())
    self.assertIsNone(ss.max())
    self.assertEqual(0, ss.mean())
    self.assertEqual(0.0, ss.sum())
    self.assertIsNone(ss.var())
    self.assertIsNone(ss.std())
    self.assertEqual(0, ss.avg_length())
def test_tf_idf_str(self):
    """
    Check the tf-idf scores computed for two string documents.
    """
    # Expected score of a term that occurs in only one of the two documents.
    single_doc_score = 0.4054651081081644
    documents = ['this is a test', 'another test']
    scores = XArray(documents).sketch_summary().tf_idf()

    expected_first = {
        'this': single_doc_score,
        'a': single_doc_score,
        'is': single_doc_score,
        'test': 0.0,
    }
    expected_second = {'test': 0.0, 'another': single_doc_score}

    self.assertEqual(expected_first, scores[0])
    self.assertEqual(expected_second, scores[1])
def test_tf_idf_str(self):
    """
    Check per-document tf-idf dictionaries for a column of two strings.
    """
    sketch = XArray(['this is a test', 'another test']).sketch_summary()
    result = sketch.tf_idf()

    # One expected term -> score mapping per document, in document order.
    expected = [
        {
            'this': 0.4054651081081644,
            'a': 0.4054651081081644,
            'is': 0.4054651081081644,
            'test': 0.0,
        },
        {
            'test': 0.0,
            'another': 0.4054651081081644,
        },
    ]
    for index, wanted in enumerate(expected):
        self.assertEqual(wanted, result[index])
def tf_idf(self):
    """
    Returns a tf-idf analysis of each document in a collection.

    If the elements in the column are documents in string form, then a
    simple splitter is used to create a list of words.  If the elements
    are already in list form, then the list elements are used as the
    terms.  These are usually strings, but could be numeric instead.

    Returns
    -------
    out : XArray of dict
        For each document, a dictionary mapping terms to their tf_idf score.

    Raises
    ------
    TypeError
        If the column type is neither list nor str.
    """
    supported_types = (list, str)
    if self._impl.dtype in supported_types:
        # The implementation object does the work; wrap its result.
        return XArray(data=[], impl=self._impl.tf_idf())
    raise TypeError('Column must be of type "list" or "str".')
def test_avg_length_float(self):
    """
    Check that avg_length is 1 for a column of floats.
    """
    sketch = XArray([1.0, 2.0, 3.0, 4.0, 5.0]).sketch_summary()
    self.assertEqual(1, sketch.avg_length())
def test_avg_length_int(self):
    """
    Check that avg_length is 1 for a column of ints.
    """
    sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
    self.assertEqual(1, sketch.avg_length())
def test_frequency_count(self):
    """
    Check that the value 3, which occurs twice, has a frequency count of 2.
    """
    values = [1, 2, 3, 4, 5, 3]
    sketch = XArray(values).sketch_summary()
    self.assertEqual(2, sketch.frequency_count(3))
def test_avg_length_str(self):
    """
    Check avg_length for strings of lengths 1 through 5, expected to be 3.
    """
    words = ['a', 'bb', 'ccc', 'dddd', 'eeeee']
    sketch = XArray(words).sketch_summary()
    self.assertEqual(3, sketch.avg_length())
def test_avg_length_list(self):
    """
    Check avg_length for lists of lengths 4 and 2, expected to be 3.
    """
    rows = [[1, 2, 3, 4], [5, 6]]
    sketch = XArray(rows).sketch_summary()
    self.assertEqual(3, sketch.avg_length())
def test_frequent_items(self):
    """
    Check that frequent_items maps each value to its number of occurrences.
    """
    sketch = XArray([1, 2, 3, 2]).sketch_summary()
    expected_counts = {1: 1, 2: 2, 3: 1}
    self.assertEqual(expected_counts, sketch.frequent_items())
def test_num_undefined(self):
    """
    Check that a single None element is counted as one undefined value.
    """
    values = [1, 2, 3, 4, 5, None]
    sketch = XArray(values).sketch_summary()
    self.assertEqual(1, sketch.num_undefined())
def test_num_unique(self):
    """
    Check that five distinct integers give a unique count of 5.
    """
    sketch = XArray([1, 2, 3, 4, 5]).sketch_summary()
    self.assertEqual(5, sketch.num_unique())
def test_avg_length_empty(self):
    """
    Check that avg_length is 0 for an empty column.
    """
    sketch = XArray([]).sketch_summary()
    self.assertEqual(0, sketch.avg_length())
def test_avg_length_dict(self):
    """
    Check avg_length for dicts with 4 and 2 entries, expected to be 3.
    """
    rows = [{1: 1, 2: 2, 3: 3, 4: 4}, {5: 5, 6: 6}]
    sketch = XArray(rows).sketch_summary()
    self.assertEqual(3, sketch.avg_length())
def __repr__(self):
    """
    Return a short, fixed placeholder string for the sketch.

    The detailed report is built by ``_summary_text`` and is not used
    here yet.
    """
    # TODO: return self._summary_text() once the detailed report is enabled.
    return "<sketch>"


def _summary_text(self):
    """
    Build a multi-section text summary of all the statistics.

    The sections are, in order: a table of basic statistics, the ten most
    frequent items, a table of quantiles, dictionary key and value
    summaries, and an element summary.  A statistic or section whose
    computation raises an exception is left out.

    Returns
    -------
    out : str
        The summary text, with tabs expanded to 8 columns.
    """
    # (method name, display label, whether the value is exact)
    fields = [
        ['size', 'Length', 'Yes'],
        ['min', 'Min', 'Yes'],
        ['max', 'Max', 'Yes'],
        ['mean', 'Mean', 'Yes'],
        ['sum', 'Sum', 'Yes'],
        ['var', 'Variance', 'Yes'],
        ['std', 'Standard Deviation', 'Yes'],
        ['num_undefined', '# Missing Values', 'Yes'],
        ['num_unique', '# unique values', 'No'],
    ]

    s = '\n'
    result = []
    for name, label, is_exact in fields:
        try:
            method_to_call = getattr(self, name)
            result.append([label, str(method_to_call()), is_exact])
        except Exception:
            # Best effort: a statistic that cannot be computed is omitted.
            pass
    sf = XArray(result).unpack(column_name_prefix="")
    sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
    s += sf.__str__(footer=False)
    s += "\n"

    s += "\nMost frequent items:\n"
    frequent = self.frequent_items()
    # items() rather than iteritems() so this works on Python 2 and 3.
    sorted_freq = sorted(frequent.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    if not sorted_freq:
        s += " -- All elements appear with less than 0.01% frequency -- \n"
    else:
        sorted_freq = sorted_freq[:10]
        sf = XFrame()
        sf.add_column(XArray(['count']), 'value')
        for elem in sorted_freq:
            sf.add_column(XArray([elem[1]]), str(elem[0]))
        s += sf.__str__(footer=False) + "\n"
    s += "\n"

    try:
        # Probe first so the heading is only added if quantile() works.
        self.quantile(0)
        s += "Quantiles: \n"
        sf = XFrame()
        for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
            sf.add_column(XArray([self.quantile(q)]), str(int(q * 100)) + '%')
        s += sf.__str__(footer=False) + "\n"
    except Exception:
        pass

    try:
        t_k = self.dict_key_summary()
        t_v = self.dict_value_summary()
        s += "\n******** Dictionary Element Key Summary ********\n"
        s += t_k.__repr__()
        s += "\n******** Dictionary Element Value Summary ********\n"
        s += t_v.__repr__() + '\n'
    except Exception:
        pass

    try:
        t_k = self.element_summary()
        s += "\n******** Element Summary ********\n"
        s += t_k.__repr__() + '\n'
    except Exception:
        pass

    return s.expandtabs(8)
def __repr__(self):
    """
    Return a short, fixed placeholder string for the sketch.

    The detailed report is built by ``_summary_text`` and is not used
    here yet.
    """
    # TODO: return self._summary_text() once the detailed report is enabled.
    return "<sketch>"


def _summary_text(self):
    """
    Build a multi-section text summary of all the statistics.

    The sections are, in order: a table of basic statistics, the ten most
    frequent items, a table of quantiles, dictionary key and value
    summaries, and an element summary.  A statistic or section whose
    computation raises an exception is left out.

    Returns
    -------
    out : str
        The summary text, with tabs expanded to 8 columns.
    """
    # (method name, display label, whether the value is exact)
    fields = [
        ['size', 'Length', 'Yes'],
        ['min', 'Min', 'Yes'],
        ['max', 'Max', 'Yes'],
        ['mean', 'Mean', 'Yes'],
        ['sum', 'Sum', 'Yes'],
        ['var', 'Variance', 'Yes'],
        ['std', 'Standard Deviation', 'Yes'],
        ['num_undefined', '# Missing Values', 'Yes'],
        ['num_unique', '# unique values', 'No'],
    ]

    s = '\n'
    result = []
    for name, label, is_exact in fields:
        try:
            method_to_call = getattr(self, name)
            result.append([label, str(method_to_call()), is_exact])
        except Exception:
            # Best effort: a statistic that cannot be computed is omitted.
            pass
    sf = XArray(result).unpack(column_name_prefix="")
    sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
    s += sf.__str__(footer=False)
    s += "\n"

    s += "\nMost frequent items:\n"
    frequent = self.frequent_items()
    # items() rather than iteritems() so this works on Python 2 and 3.
    sorted_freq = sorted(frequent.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    if not sorted_freq:
        s += " -- All elements appear with less than 0.01% frequency -- \n"
    else:
        sorted_freq = sorted_freq[:10]
        sf = XFrame()
        sf.add_column(XArray(['count']), 'value')
        for elem in sorted_freq:
            sf.add_column(XArray([elem[1]]), str(elem[0]))
        s += sf.__str__(footer=False) + "\n"
    s += "\n"

    try:
        # Probe first so the heading is only added if quantile() works.
        self.quantile(0)
        s += "Quantiles: \n"
        sf = XFrame()
        for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
            sf.add_column(XArray([self.quantile(q)]), str(int(q * 100)) + '%')
        s += sf.__str__(footer=False) + "\n"
    except Exception:
        pass

    try:
        t_k = self.dict_key_summary()
        t_v = self.dict_value_summary()
        s += "\n******** Dictionary Element Key Summary ********\n"
        s += t_k.__repr__()
        s += "\n******** Dictionary Element Value Summary ********\n"
        s += t_v.__repr__() + '\n'
    except Exception:
        pass

    try:
        t_k = self.element_summary()
        s += "\n******** Element Summary ********\n"
        s += t_k.__repr__() + '\n'
    except Exception:
        pass

    return s.expandtabs(8)
def eq_list(expected, result):
    """
    Compare an expected sequence with a result.

    ``expected`` is wrapped in an XArray and compared with ``result`` using
    ``==``; the value returned is ``all()`` of that comparison.
    """
    comparison = XArray(expected) == result
    return comparison.all()