Esempio n. 1
0
 def test_background_sketch(self):
     # One single-entry dict per row, keyed by the stringified integers
     # 1..9999.
     rows = [{str(n): 1} for n in range(1, 10000)]
     column = SArray(rows)
     tracked_keys = [str(n) for n in range(100, 200)]
     summary = column.sketch_summary(background=True,
                                     sub_sketch_keys=tracked_keys)
     # cannot check the actual value as it depends on the speed of processing
     summary.sketch_ready()
     requested_keys = [str(n) for n in range(100, 105)]
     sub_sketches = summary.element_sub_sketch(requested_keys)
     self.assertEqual(len(sub_sketches), 5)
Esempio n. 2
0
    def test_str_sketch(self):
        # Ten distinct numeric strings followed by one missing value.
        values = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", None]
        column = SArray(data=values)
        summary = column.sketch_summary()

        # Each numeric statistic is expected to raise for a string column.
        for stat_name in ("min", "max", "sum", "mean", "var", "std"):
            with self.assertRaises(RuntimeError):
                getattr(summary, stat_name)()

        self.assertAlmostEqual(summary.num_unique(), 10, delta=3)
        self.assertEqual(summary.num_undefined(), 1)
        self.assertEqual(summary.size(), len(values))

        with self.assertRaises(RuntimeError):
            summary.quantile(0.5)
        for item in ("1", "2"):
            self.assertEqual(summary.frequency_count(item), 1)
        frequent = summary.frequent_items()
        self.assertEqual(len(frequent), 10)
Esempio n. 3
0
    def test_str_sketch(self):
        # Ten distinct numeric strings followed by one missing value.
        raw_strings = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", None]
        string_column = SArray(data=raw_strings)
        sk = string_column.sketch_summary()

        # Each numeric statistic is expected to raise for a string column.
        numeric_stats = ("min", "max", "sum", "mean", "var", "std")
        for stat in numeric_stats:
            with self.assertRaises(RuntimeError):
                getattr(sk, stat)()

        self.assertAlmostEqual(sk.num_unique(), 10, delta=3)
        self.assertEqual(sk.num_undefined(), 1)
        self.assertEqual(sk.size(), len(raw_strings))

        with self.assertRaises(RuntimeError):
            sk.quantile(0.5)
        for probe in ("1", "2"):
            self.assertEqual(sk.frequency_count(probe), 1)
        top_items = sk.frequent_items()
        self.assertEqual(len(top_items), 10)
Esempio n. 4
0
 def test_background_sketch(self):
     # One single-entry dict per row, keyed by the stringified integers
     # 1..9999.
     records = [{str(k): 1} for k in range(1, 10000)]
     dict_column = SArray(records)
     watched = [str(k) for k in range(100, 200)]
     bg_sketch = dict_column.sketch_summary(background=True,
                                            sub_sketch_keys=watched)
     # cannot check the actual value as it depends on the speed of processing
     bg_sketch.sketch_ready()
     wanted = [str(k) for k in range(100, 105)]
     per_key = bg_sketch.element_sub_sketch(wanted)
     self.assertEqual(len(per_key), 5)
Esempio n. 5
0
    def test_vector_sketch(self):
        # Vectors of varying length, including an empty one and a missing row.
        vector_data = [[], [1, 2], [3], [4, 5, 6, 7], [8, 9, 10], None]
        sa = SArray(data=vector_data)

        sketch = sa.sketch_summary()
        self.__validate_sketch_result(sketch, sa)
        self.__validate_sketch_result(sketch.element_length_summary(),
                                      sa.dropna().item_length())

        # The element summary is validated against a column holding every
        # value of every non-missing vector.
        flattened = list(itertools.chain.from_iterable(list(sa.dropna())))
        self.__validate_sketch_result(sketch.element_summary(),
                                      SArray(flattened))

        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 5)
        self.assertEqual(fi['[1 2]'], 1)
        self.assertEqual(fi['[4 5 6 7]'], 1)

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys=1).element_sub_sketch(1)
        expected = sa.vector_slice(1)
        self.__validate_sketch_result(s, expected)

        # sub sketch with multiple keys
        keys = [1, 3]
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.vector_slice(key)
            self.__validate_sketch_result(s[key], expected)

        # Asking for sub sketches without naming keys is expected to return
        # one entry per key the sketch was built with.
        indexes = range(0, 10)
        s = sa.sketch_summary(sub_sketch_keys=indexes).element_sub_sketch()
        self.assertEqual(len(s), len(indexes))
Esempio n. 6
0
    def test_list_sketch(self):
        # Mixed-content lists: one empty, one duplicated, one missing row.
        rows = [[], [1, 2], [1, 2], ['a', 'a', 'a', 'b'], [1, 1, 2], None]
        column = SArray(rows)
        self.__validate_nested_sketch_result(column)
        summary = column.sketch_summary()

        self.assertEqual(summary.num_unique(), 4)
        per_element = summary.element_summary()
        # The element summary is validated against every list item gathered
        # into one string-typed column.
        present_rows = list(column.dropna())
        all_items = list(itertools.chain.from_iterable(present_rows))
        self.__validate_sketch_result(per_element, SArray(all_items, str))

        frequent = summary.frequent_items()
        self.assertEqual(len(frequent), 4)
        expected_counts = (('[1,2]', 2), ('["a","a","a","b"]', 1))
        for label, count in expected_counts:
            self.assertEqual(frequent[label], count)
Esempio n. 7
0
    def test_list_sketch(self):
        # Mixed-content lists: one empty, one duplicated, one missing row.
        nested = [[], [1, 2], [1, 2], ['a', 'a', 'a', 'b'], [1, 1, 2], None]
        list_column = SArray(nested)
        self.__validate_nested_sketch_result(list_column)
        sk = list_column.sketch_summary()

        self.assertEqual(sk.num_unique(), 4)
        elem_sketch = sk.element_summary()
        # The element summary is validated against every list item gathered
        # into one string-typed column.
        non_missing = list(list_column.dropna())
        flat_items = list(itertools.chain.from_iterable(non_missing))
        flat_column = SArray(flat_items, str)
        self.__validate_sketch_result(elem_sketch, flat_column)

        top = sk.frequent_items()
        self.assertEqual(len(top), 4)
        for text, occurrences in (('[1,2]', 2), ('["a","a","a","b"]', 1)):
            self.assertEqual(top[text], occurrences)
Esempio n. 8
0
    def test_empty_sketch(self):
        # Sketch of an empty column: min/max are NaN, the remaining moments
        # and all counters are zero, and quantiles cannot be computed.
        int_data = []
        sa = SArray(data=int_data)
        sketch = sa.sketch_summary()
        self.assertTrue(math.isnan(sketch.min()))
        self.assertTrue(math.isnan(sketch.max()))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(sketch.sum(), 0)
        self.assertEqual(sketch.mean(), 0)
        self.assertEqual(sketch.var(), 0)
        self.assertEqual(sketch.std(), 0)
        self.assertEqual(sketch.num_unique(), 0)
        self.assertEqual(sketch.num_undefined(), 0)
        self.assertEqual(sketch.size(), 0)
        with self.assertRaises(RuntimeError):
            sketch.quantile(0.5)

        t = sketch.frequent_items()
        self.assertEqual(len(t), 0)
Esempio n. 9
0
    def test_empty_sketch(self):
        # Sketch of an empty column: min/max are NaN, the remaining moments
        # and all counters are zero, and quantiles cannot be computed.
        int_data = []
        sa = SArray(data=int_data)
        sketch = sa.sketch_summary()
        self.assertTrue(math.isnan(sketch.min()))
        self.assertTrue(math.isnan(sketch.max()))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(sketch.sum(), 0)
        self.assertEqual(sketch.mean(), 0)
        self.assertEqual(sketch.var(), 0)
        self.assertEqual(sketch.std(), 0)
        self.assertEqual(sketch.num_unique(), 0)
        self.assertEqual(sketch.num_undefined(), 0)
        self.assertEqual(sketch.size(), 0)
        with self.assertRaises(RuntimeError):
            sketch.quantile(0.5)

        t = sketch.frequent_items()
        self.assertEqual(len(t), 0)
Esempio n. 10
0
    def __validate_sketch_result(self, sketch, sa, delta=1E-7):
        """
        Assert that ``sketch`` agrees with statistics computed from ``sa``.

        For int/float columns the moments, quantiles, frequent items,
        per-item frequency counts and the (approximate) unique count are
        compared; for any other dtype ``quantile`` is expected to raise
        RuntimeError. The missing-value count, size and readiness are
        checked for every dtype.

        sketch -- the sketch summary under test
        sa     -- the SArray the sketch is expected to describe
        delta  -- absolute tolerance for the mean / variance / standard
                  deviation comparisons
        """
        non_missing = list(sa.dropna())
        df = pd.DataFrame(non_missing)
        pds = pd.Series(non_missing)
        if sa.dtype() == int or sa.dtype() == float:
            if len(sa) == 0:
                # Empty numeric column: both extremes are NaN. The second
                # assertion used to re-check min() and never looked at max().
                self.assertTrue(math.isnan(sketch.min()))
                self.assertTrue(math.isnan(sketch.max()))
                self.assertEqual(sketch.sum(), 0.0)
                self.assertEqual(sketch.mean(), 0.0)
                self.assertEqual(sketch.var(), 0.0)
                self.assertEqual(sketch.std(), 0.0)
            else:
                self.assertEqual(sketch.min(), sa.min())
                self.assertEqual(sketch.max(), sa.max())
                self.assertEqual(sketch.sum(), sa.sum())
                self.assertAlmostEqual(sketch.mean(),
                                       sa.dropna().mean(),
                                       delta=delta)
                self.assertAlmostEqual(sketch.var(),
                                       sa.dropna().var(),
                                       delta=delta)
                self.assertAlmostEqual(sketch.std(),
                                       sa.dropna().std(),
                                       delta=delta)
                # The median is compared with a tolerance of 1; the two
                # extreme quantiles must match exactly.
                self.assertAlmostEqual(sketch.quantile(0.5),
                                       df.quantile(0.5)[0],
                                       delta=1)
                self.assertEqual(sketch.quantile(0), df.quantile(0)[0])
                self.assertEqual(sketch.quantile(1), df.quantile(1)[0])

                self.assertEqual(sketch.frequent_items(),
                                 SArray(pds).sketch_summary().frequent_items())
                # Count once instead of recounting for every item.
                value_counts = pds.value_counts()
                for item in value_counts.index:
                    self.assertEqual(sketch.frequency_count(item),
                                     value_counts[item])

                self.assertAlmostEqual(sketch.num_unique(),
                                       len(sa.unique()),
                                       delta=3)
        else:
            with self.assertRaises(RuntimeError):
                sketch.quantile(0.5)

        self.assertEqual(sketch.num_undefined(), sa.num_missing())
        self.assertEqual(sketch.size(), len(sa))
        self.assertEqual(sketch.sketch_ready(), True)
        self.assertEqual(sketch.num_elements_processed(), sketch.size())
Esempio n. 11
0
    def test_dict_sketch_int_value(self):
        # Dicts with integer values: one empty, one duplicated, one missing.
        dict_data = [{}, {'a': 1, 'b': 2}, {'a': 1, 'b': 2},
                     {'a': 3, 'c': 1}, {'a': 1, 'b': 2, 'c': 3}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row yields a sub sketch whose values are
        # all undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
def _combine(task):
    '''
    Task body that merges the result stored for each parameter set into one
    SFrame per declared output, tagging every row with the parameters that
    produced it in a 'parameters' column.
    '''
    # Every declared output starts out as an empty SFrame.
    for name in task.get_outputs():
        task.outputs[name] = _SFrame()

    for params, path in task.params[_COMBINE_PARAMETER_NAME]:
        for name in task.get_outputs():
            try:
                partial = _SFrame(_path_join(path, name))
            except IOError:
                # Nothing could be loaded for this output/parameter pair.
                message = "No output for %s with parameters: %s " % (name, str(params))
                _log.info(message)
            else:
                # Tag the rows with their parameters, then append them to
                # what has been accumulated for this output so far.
                row_count = len(partial)
                partial['parameters'] = _SArray.from_const(params, row_count)
                partial.__materialize__()
                accumulated = task.outputs[name]
                task.outputs[name] = accumulated.append(partial)
Esempio n. 13
0
 def rating_prediction_switch(self, dataset, dataset_switch, model_manager, force):
     """
     Compute and save rating predictions for every folder of ``dataset``.

     A folder is skipped when its rating prediction file already exists and
     ``force`` is falsy. Otherwise the predictions returned by
     ``model_manager.get_predictions`` are combined row by row:

     * when ``self.id == 'best'``, with the 'rating' column of the folder's
       test SFrame through ``self._get_best_prediction``;
     * otherwise, with the model indexes derived from column 'x' of the
       class prediction CSV through ``self._get_switch_prediction``.

     The result is saved as an SArray to the rating prediction file.
     """
     from graphlab.data_structures.sframe import SFrame
     from graphlab.data_structures.sarray import SArray
     import os

     for folder in dataset.folders:
         rating_prediction_file = self._get_rating_prediction_file(dataset_switch, folder)
         class_prediction_file = self._get_class_prediction_file(dataset_switch, folder)

         if os.path.exists(rating_prediction_file) and not force:
             # Single-argument call form: prints the same text on Python 2
             # and is valid syntax on Python 3.
             print("Model " + self.id + " in " + dataset_switch.id + " " + folder.id + " already tested.")
             continue

         cf_predictions = model_manager.get_predictions(dataset, folder)

         if self.id == 'best':
             test_sframe = folder.test_sframe
             target = test_sframe.select_column(key='rating')

             # The bound method is passed directly; the former lambda only
             # forwarded its arguments unchanged.
             rating_predictions = map(self._get_best_prediction,
                                      target, *cf_predictions)
         else:
             sf = SFrame.read_csv(class_prediction_file, header=True, quote_char='"',
                                  column_type_hints=[int, str])
             switch_predictions = sf.select_column(key='x')

             index_switch_predictions = model_manager.get_index_model(switch_predictions)

             rating_predictions = map(self._get_switch_prediction,
                                      index_switch_predictions, *cf_predictions)

         # list() is a no-op copy on Python 2 and materialises the lazy map
         # object on Python 3 before it is handed to SArray.
         rating_array = SArray(list(rating_predictions))
         rating_array.save(filename=rating_prediction_file)
Esempio n. 14
0
 def test_sketch_int(self):
     # The integers 1..10 followed by a single missing value.
     values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]
     column = SArray(data=values)
     summary = column.sketch_summary()
     self.__validate_sketch_result(summary, column)
Esempio n. 15
0
 def test_large_value_sketch(self):
     # One hundred copies of a large integer, validated with a tolerance
     # of 1E-5.
     sa = SArray([1234567890 for i in range(100)])
     sk = sa.sketch_summary()
     # Validate the sketch built above rather than computing a second one.
     self.__validate_sketch_result(sk, sa, 1E-5)
Esempio n. 16
0
 def test_cancelation(self):
     # Start a background sketch over the integers 1..9999 and cancel it
     # right away; nothing is asserted beyond the calls not raising.
     column = SArray(range(1, 10000))
     pending = column.sketch_summary(background=True)
     pending.cancel()
Esempio n. 17
0
 def test_large_value_sketch(self):
     # One hundred copies of a large integer, validated with a tolerance
     # of 1E-5.
     sa = SArray([1234567890 for i in range(100)])
     sk = sa.sketch_summary()
     # Validate the sketch built above rather than computing a second one.
     self.__validate_sketch_result(sk, sa, 1E-5)
Esempio n. 18
0
    def test_dict_sketch_str_value(self):
        # Dict value sketch type should be auto inferred
        dict_data = [{'a': 'b', 'b': 'c'}, {'a': 'b', 'b': 'c'},
                     {'a': 'd', 'b': '4'}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row yields a sub sketch whose values are
        # all undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 19
0
 def test_sketch_int(self):
     # The integers 1..10 followed by a single missing value.
     numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]
     int_column = SArray(data=numbers)
     sk = int_column.sketch_summary()
     self.__validate_sketch_result(sk, int_column)
Esempio n. 20
0
 def test_cancelation(self):
     # Start a background sketch over the integers 1..9999 and cancel it
     # right away; nothing is asserted beyond the calls not raising.
     int_column = SArray(range(1, 10000))
     running = int_column.sketch_summary(background=True)
     running.cancel()
Esempio n. 21
0
    def test_dict_sketch_str_value(self):
        # Dict value sketch type should be auto inferred
        dict_data = [{'a': 'b', 'b': 'c'}, {'a': 'b', 'b': 'c'},
                     {'a': 'd', 'b': '4'}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row yields a sub sketch whose values are
        # all undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 22
0
    def test_dict_sketch_int_value(self):
        # Dicts with integer values: one empty, one duplicated, one missing.
        dict_data = [{}, {'a': 1, 'b': 2}, {'a': 1, 'b': 2},
                     {'a': 3, 'c': 1}, {'a': 1, 'b': 2, 'c': 3}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row yields a sub sketch whose values are
        # all undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 23
0
 def test_sketch_float(self):
     # Mostly floats, with one integer literal and one missing value.
     # NOTE(review): "3, .4" may have been meant as "3.4" -- data kept as is.
     float_values = [1.2, 3, .4, 6.789, None]
     column = SArray(data=float_values)
     summary = column.sketch_summary()
     self.__validate_sketch_result(summary, column)
Esempio n. 24
0
    def __repr__(self):
        """
        Emits a brief summary of all the statistics as a string.

        The text contains, in order: a table of the scalar statistics that
        could be computed, the (at most ten) most frequent items, the
        quantiles when they are available, and nested summaries for
        dictionary keys/values and for container elements when the
        corresponding accessors succeed. Sections whose accessor raises are
        left out.
        """
        # (method name, display label, whether the value is exact)
        fields = [
            ['size', 'Length', 'Yes'],
            ['min', 'Min', 'Yes'],
            ['max', 'Max', 'Yes'],
            ['mean', 'Mean', 'Yes'],
            ['sum', 'Sum', 'Yes'],
            ['var', 'Variance', 'Yes'],
            ['std', 'Standard Deviation', 'Yes'],
            ['num_undefined', '# Missing Values', 'Yes'],
            ['num_unique', '# unique values', 'No'],
        ]

        s = '\n'
        result = []
        for method_name, label, is_exact in fields:
            try:
                method_to_call = getattr(self, method_name)
                result.append([label, str(method_to_call()), is_exact])
            except Exception:
                # Best effort: a statistic that cannot be computed for this
                # sketch is skipped. Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                pass
        sf = SArray(result).unpack(column_name_prefix="")
        sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
        s += sf.__str__(footer=False)
        s += "\n"

        s += "\nMost frequent items:\n"
        frequent = self.frequent_items()
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        sorted_freq = sorted(frequent.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        if len(sorted_freq) == 0:
            s += " -- All elements appear with less than 0.01% frequency -- \n"
        else:
            sorted_freq = sorted_freq[:10]
            sf = SFrame()
            sf.add_column(SArray(['count']), 'value')
            for elem in sorted_freq:
                sf.add_column(SArray([elem[1]]), str(elem[0]))
            s += sf.__str__(footer=False) + "\n"
        s += "\n"

        try:
            # Probe once so the section header is only written when
            # quantiles are available for this sketch.
            self.quantile(0)
            s += "Quantiles: \n"
            sf = SFrame()
            for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
                sf.add_column(SArray([self.quantile(q)]),
                              str(int(q * 100)) + '%')
            s += sf.__str__(footer=False) + "\n"
        except Exception:
            pass

        try:
            t_k = self.dict_key_summary()
            t_v = self.dict_value_summary()
            s += "\n******** Dictionary Element Key Summary ********\n"
            s += t_k.__repr__()
            s += "\n******** Dictionary Element Value Summary ********\n"
            s += t_v.__repr__() + '\n'
        except Exception:
            pass

        try:
            t_k = self.element_summary()
            s += "\n******** Element Summary ********\n"
            s += t_k.__repr__() + '\n'
        except Exception:
            pass

        return s.expandtabs(8)
Esempio n. 25
0
 def test_sketch_float(self):
     # Mostly floats, with one integer literal and one missing value.
     samples = [1.2, 3, .4, 6.789, None]
     float_column = SArray(data=samples)
     sk = float_column.sketch_summary()
     self.__validate_sketch_result(sk, float_column)
Esempio n. 26
0
    def test_vector_sketch(self):
        # Vectors of varying length, including an empty one and a missing row.
        vector_data = [[], [1, 2], [3], [4, 5, 6, 7], [8, 9, 10], None]
        sa = SArray(data=vector_data)

        sketch = sa.sketch_summary()
        self.__validate_sketch_result(sketch, sa)
        self.__validate_sketch_result(sketch.element_length_summary(),
                                      sa.dropna().item_length())

        # The element summary is validated against a column holding every
        # value of every non-missing vector.
        flattened = list(itertools.chain.from_iterable(list(sa.dropna())))
        self.__validate_sketch_result(sketch.element_summary(),
                                      SArray(flattened))

        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 5)
        self.assertEqual(fi['[1 2]'], 1)
        self.assertEqual(fi['[4 5 6 7]'], 1)

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys=1).element_sub_sketch(1)
        expected = sa.vector_slice(1)
        self.__validate_sketch_result(s, expected)

        # sub sketch with multiple keys
        keys = [1, 3]
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts lack.
            # NOTE(review): assumes the returned mapping supports `in`
            # (true for a plain dict) -- confirm.
            self.assertIn(key, s)
            expected = sa.vector_slice(key)
            self.__validate_sketch_result(s[key], expected)

        # Asking for sub sketches without naming keys is expected to return
        # one entry per key the sketch was built with.
        indexes = range(0, 10)
        s = sa.sketch_summary(sub_sketch_keys=indexes).element_sub_sketch()
        self.assertEqual(len(s), len(indexes))
Esempio n. 27
0
    def __repr__(self):
        """
        Emits a brief summary of all the statistics as a string.

        The text contains, in order: a table of the scalar statistics that
        could be computed, the (at most ten) most frequent items, the
        quantiles when they are available, and nested summaries for
        dictionary keys/values and for container elements when the
        corresponding accessors succeed. Sections whose accessor raises are
        left out.
        """
        # (method name, display label, whether the value is exact)
        fields = [
            ['size', 'Length', 'Yes'],
            ['min', 'Min', 'Yes'],
            ['max', 'Max', 'Yes'],
            ['mean', 'Mean', 'Yes'],
            ['sum', 'Sum', 'Yes'],
            ['var', 'Variance', 'Yes'],
            ['std', 'Standard Deviation', 'Yes'],
            ['num_undefined', '# Missing Values', 'Yes'],
            ['num_unique', '# unique values', 'No'],
        ]

        s = '\n'
        result = []
        for method_name, label, is_exact in fields:
            try:
                method_to_call = getattr(self, method_name)
                result.append([label, str(method_to_call()), is_exact])
            except Exception:
                # Best effort: a statistic that cannot be computed for this
                # sketch is skipped. Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                pass
        sf = SArray(result).unpack(column_name_prefix="")
        sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
        s += sf.__str__(footer=False)
        s += "\n"

        s += "\nMost frequent items:\n"
        frequent = self.frequent_items()
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        sorted_freq = sorted(frequent.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        if len(sorted_freq) == 0:
            s += " -- All elements appear with less than 0.01% frequency -- \n"
        else:
            sorted_freq = sorted_freq[:10]
            sf = SFrame()
            sf.add_column(SArray(['count']), 'value')
            for elem in sorted_freq:
                sf.add_column(SArray([elem[1]]), str(elem[0]))
            s += sf.__str__(footer=False) + "\n"
        s += "\n"

        try:
            # Probe once so the section header is only written when
            # quantiles are available for this sketch.
            self.quantile(0)
            s += "Quantiles: \n"
            sf = SFrame()
            for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
                sf.add_column(SArray([self.quantile(q)]),
                              str(int(q * 100)) + '%')
            s += sf.__str__(footer=False) + "\n"
        except Exception:
            pass

        try:
            t_k = self.dict_key_summary()
            t_v = self.dict_value_summary()
            s += "\n******** Dictionary Element Key Summary ********\n"
            s += t_k.__repr__()
            s += "\n******** Dictionary Element Value Summary ********\n"
            s += t_v.__repr__() + '\n'
        except Exception:
            pass

        try:
            t_k = self.element_summary()
            s += "\n******** Element Summary ********\n"
            s += t_k.__repr__() + '\n'
        except Exception:
            pass

        return s.expandtabs(8)