def dict_key_summary(self):
    """
    Build a sketch summary over all dictionary keys of the SArray.

    Only valid for a sketch created from an SArray of dict type. The
    dictionary keys are converted to strings before being summarized.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``dict_key_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
    >>> sa.sketch_summary().dict_key_summary()
    +------------------+-------+----------+
    |       item       | value | is exact |
    +------------------+-------+----------+
    |      Length      |   4   |   Yes    |
    | # Missing Values |   0   |   Yes    |
    | # unique values  |   4   |    No    |
    +------------------+-------+----------+
    Most frequent items:
    +-------+---+------+--------+--------+
    | value | I | love | beauty | nature |
    +-------+---+------+--------+--------+
    | count | 1 |  1   |   1    |   1    |
    +-------+---+------+--------+--------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.dict_key_summary')
    with cython_context():
        key_summary_proxy = self.__proxy__.dict_key_summary()
        return Sketch(_proxy=key_summary_proxy)
def remove_column(self, name):
    """
    Removes the column with the given name from the SFrame.

    On a vertex frame the column is deleted through the graph proxy's
    ``delete_vertex_field``; on an edge frame through ``delete_edge_field``.
    In both cases the graph's proxy is replaced by the proxy that call
    returns. If anything fails, ``__is_dirty__`` is put back to the value
    it had before the call and the exception is re-raised.

    Parameters
    ----------
    name : string
        The name of the column to remove.

    Raises
    ------
    KeyError
        If ``name`` is not in ``self.column_names()``.
    AssertionError
        If ``name`` is ``__id`` on a vertex frame, or ``__src_id`` /
        ``__dst_id`` on an edge frame.
    """
    if name not in self.column_names():
        # Wrap in a 1-tuple so a tuple-valued name cannot break formatting.
        raise KeyError('Cannot find column %s' % (name,))

    # Remember the flag: a failed removal must not clear a dirty state left
    # behind by an earlier mutation.
    was_dirty = getattr(self, '__is_dirty__', False)
    self.__is_dirty__ = True
    try:
        with cython_context():
            if self._is_vertex_frame():
                # Raised explicitly rather than via ``assert`` so the guard
                # still holds under ``python -O``; the type is unchanged.
                if name == '__id':
                    raise AssertionError('Cannot remove \"__id\" column')
                graph_proxy = self.__graph__.__proxy__.delete_vertex_field(name)
                self.__graph__.__proxy__ = graph_proxy
            elif self._is_edge_frame():
                if name == '__src_id':
                    raise AssertionError('Cannot remove \"__src_id\" column')
                if name == '__dst_id':
                    raise AssertionError('Cannot remove \"__dst_id\" column')
                graph_proxy = self.__graph__.__proxy__.delete_edge_field(name)
                self.__graph__.__proxy__ = graph_proxy
    except BaseException:
        # Nothing was changed, so restore the previous flag and propagate.
        self.__is_dirty__ = was_dirty
        raise
def frequency_count(self, element):
    """
    Estimate how many times ``element`` occurs, using the count sketch.

    The element must be of the same type as the input SArray.

    Parameters
    ----------
    element : val
        An element of the same type as the SArray.

    Raises
    ------
    RuntimeError
        Throws an exception if element is of the incorrect type.

    Returns
    -------
    out : int
        An estimate of the number of occurrences of the element.
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.frequency_count')
    with cython_context():
        estimate = self.__proxy__.frequency_count(element)
        return int(estimate)
def quantile(self, quantile_val):
    """
    Estimate the value found at a given quantile between 0.0 and 1.0.

    The quantile is guaranteed to be accurate within 1%: asking for the
    0.55 quantile returns a value between the true 0.54 quantile and the
    true 0.56 quantile. Quantiles are only defined for numeric arrays; an
    exception is thrown for a sketch built on a non-numeric column.

    Parameters
    ----------
    quantile_val : float
        A value between 0.0 and 1.0 inclusive. Values below 0.0 will be
        interpreted as 0.0. Values above 1.0 will be interpreted as 1.0.

    Raises
    ------
    RuntimeError
        If the sarray is a non-numeric type.

    Returns
    -------
    out : float | str
        An estimate of the value at a quantile.
    """
    tracker = _mt._get_metric_tracker()
    # The requested quantile is part of the tracked metric name.
    tracker.track('sketch.quantile.%g' % quantile_val)
    with cython_context():
        estimate = self.__proxy__.get_quantile(quantile_val)
        return estimate
def dict_key_summary(self):
    """
    Build a sketch summary over all dictionary keys of the SArray.

    Only valid for a sketch created from an SArray of dict type. The
    dictionary keys are converted to strings before being summarized.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``dict_key_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
    >>> sa.sketch_summary().dict_key_summary()
    +------------------+-------+----------+
    |       item       | value | is exact |
    +------------------+-------+----------+
    |      Length      |   4   |   Yes    |
    | # Missing Values |   0   |   Yes    |
    | # unique values  |   4   |    No    |
    +------------------+-------+----------+
    Most frequent items:
    +-------+---+------+--------+--------+
    | value | I | love | beauty | nature |
    +-------+---+------+--------+--------+
    | count | 1 |  1   |   1    |   1    |
    +-------+---+------+--------+--------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.dict_key_summary')
    with cython_context():
        key_summary_proxy = self.__proxy__.dict_key_summary()
        return Sketch(_proxy=key_summary_proxy)
def add_column(self, data, name=""):
    """
    Add a column to this SFrame.

    The number of elements in ``data`` must match every other column of
    the SFrame. On a vertex frame the column is added through the graph
    proxy's ``add_vertex_field``; on an edge frame through
    ``add_edge_field``. The graph's proxy is then replaced by the proxy
    that call returns.

    Parameters
    ----------
    data : SArray
        The 'column' of data.

    name : string
        The name of the column. If no name is given, a default name is
        chosen.

    Raises
    ------
    TypeError
        If ``data`` is not an SArray or ``name`` is not a ``str``.
    """
    # Open question carried over: check type for pandas dataframe or SArray?
    if not isinstance(data, SArray):
        raise TypeError("Must give column as SArray")
    if not isinstance(name, str):
        raise TypeError("Invalid column name: must be str")

    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            updated_proxy = self.__graph__.__proxy__.add_vertex_field(
                data.__proxy__, name)
        elif self._is_edge_frame():
            updated_proxy = self.__graph__.__proxy__.add_edge_field(
                data.__proxy__, name)
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = updated_proxy
def remove_column(self, name):
    """
    Removes the column with the given name from the SFrame.

    On a vertex frame the column is deleted through the graph proxy's
    ``delete_vertex_field``; on an edge frame through ``delete_edge_field``.
    In both cases the graph's proxy is replaced by the proxy that call
    returns. If anything fails, ``__is_dirty__`` is put back to the value
    it had before the call and the exception is re-raised.

    Parameters
    ----------
    name : string
        The name of the column to remove.

    Raises
    ------
    KeyError
        If ``name`` is not in ``self.column_names()``.
    AssertionError
        If ``name`` is ``__id`` on a vertex frame, or ``__src_id`` /
        ``__dst_id`` on an edge frame.
    """
    if name not in self.column_names():
        # Wrap in a 1-tuple so a tuple-valued name cannot break formatting.
        raise KeyError('Cannot find column %s' % (name,))

    # Remember the flag: a failed removal must not clear a dirty state left
    # behind by an earlier mutation.
    was_dirty = getattr(self, '__is_dirty__', False)
    self.__is_dirty__ = True
    try:
        with cython_context():
            if self._is_vertex_frame():
                # Raised explicitly rather than via ``assert`` so the guard
                # still holds under ``python -O``; the type is unchanged.
                if name == '__id':
                    raise AssertionError('Cannot remove \"__id\" column')
                graph_proxy = self.__graph__.__proxy__.delete_vertex_field(
                    name)
                self.__graph__.__proxy__ = graph_proxy
            elif self._is_edge_frame():
                if name == '__src_id':
                    raise AssertionError('Cannot remove \"__src_id\" column')
                if name == '__dst_id':
                    raise AssertionError('Cannot remove \"__dst_id\" column')
                graph_proxy = self.__graph__.__proxy__.delete_edge_field(
                    name)
                self.__graph__.__proxy__ = graph_proxy
    except BaseException:
        # Nothing was changed, so restore the previous flag and propagate.
        self.__is_dirty__ = was_dirty
        raise
def add_column(self, data, name=""):
    """
    Add a column to this SFrame.

    The number of elements in ``data`` must match every other column of
    the SFrame. On a vertex frame the column is added through the graph
    proxy's ``add_vertex_field``; on an edge frame through
    ``add_edge_field``. The graph's proxy is then replaced by the proxy
    that call returns.

    Parameters
    ----------
    data : SArray
        The 'column' of data.

    name : string
        The name of the column. If no name is given, a default name is
        chosen.

    Raises
    ------
    TypeError
        If ``data`` is not an SArray or ``name`` is not a ``str``.
    """
    # Open question carried over: check type for pandas dataframe or SArray?
    if not isinstance(data, SArray):
        raise TypeError("Must give column as SArray")
    if not isinstance(name, str):
        raise TypeError("Invalid column name: must be str")

    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            updated_proxy = self.__graph__.__proxy__.add_vertex_field(
                data.__proxy__, name)
        elif self._is_edge_frame():
            updated_proxy = self.__graph__.__proxy__.add_edge_field(
                data.__proxy__, name)
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = updated_proxy
def sketch_ready(self):
    """
    Report whether the sketch has been executed on all the data.

    If the sketch is created with background == False (default), this
    will always return True. Otherwise, this will return False until the
    sketch is ready.
    """
    with cython_context():
        is_ready = self.__proxy__.sketch_ready()
        return is_ready
def num_elements_processed(self):
    """
    Report how many elements the sketch has processed so far.

    If the sketch is created with background == False (default), this
    will always return the length of the input array. Otherwise, this
    will return the number of elements processed so far.
    """
    with cython_context():
        processed = self.__proxy__.num_elements_processed()
        return processed
def size(self):
    """
    Return the size of the input SArray.

    Returns
    -------
    out : int
        The number of elements of the input SArray, as reported by the
        underlying proxy and converted to ``int``.
    """
    with cython_context():
        element_count = self.__proxy__.size()
        return int(element_count)
def num_undefined(self):
    """
    Return the number of undefined elements in the SArray.

    Returns 0 on an empty SArray.

    Returns
    -------
    out : int
        The number of missing values in the SArray.
    """
    with cython_context():
        missing_count = self.__proxy__.num_undefined()
        return int(missing_count)
def cancel(self):
    """
    Cancel a background sketch computation immediately if one is ongoing.

    Does nothing otherwise.

    Examples
    --------
    >>> s = sa.sketch_summary(array, background=True)
    >>> s.cancel()
    """
    with cython_context():
        sketch_proxy = self.__proxy__
        sketch_proxy.cancel()
def _run_toolkit_function(fnname, arguments, args, kwargs):
    """
    Dispatch positional and keyword arguments to a toolkit function.

    The values are bound to the names listed in ``arguments``, translated
    with ``_translate_function_arguments``, and handed to the unity
    server's ``run_toolkit``. The toolkit's result is passed through
    ``_wrap_function_return``; when that yields a dict holding a
    ``'return_value'`` entry, only that entry is returned.

    Parameters
    ----------
    fnname : string
        The toolkit function to run

    arguments : list[string]
        The list of all the arguments the function takes.

    args : list
        The arguments that were passed

    kwargs : dictionary
        The keyword arguments that were passed

    Raises
    ------
    TypeError
        If the number of supplied values differs from ``len(arguments)``,
        or a keyword repeats a name already bound positionally.
    _ToolkitError
        If the toolkit reports failure.
    """
    # The call must supply exactly one value per declared argument.
    received_count = len(args) + len(kwargs)
    expected_count = len(arguments)
    if received_count != expected_count:
        raise TypeError("Expecting " + str(expected_count) +
                        " arguments, got " + str(received_count))

    # Positional values bind to the leading names, in order.
    argument_dict = {}
    for position, value in enumerate(args):
        argument_dict[arguments[position]] = value

    # Keyword values follow; a name may not be bound twice.
    for key, value in kwargs.items():
        if key in argument_dict:
            raise TypeError("Got multiple values for keyword argument '" +
                            key + "'")
        argument_dict[key] = value

    argument_dict = _translate_function_arguments(argument_dict)

    with cython_context():
        ret = _gl.connect.main.get_unity().run_toolkit(fnname, argument_dict)

    # ret[0] is the success flag, ret[1] the error text, ret[2] the payload.
    if ret[0] != True:
        if len(ret[1]) > 0:
            failure_text = ret[1]
        else:
            failure_text = "Toolkit failed with unknown error"
        raise _ToolkitError(failure_text)

    wrapped = _wrap_function_return(ret[2])
    if type(wrapped) == dict and 'return_value' in wrapped:
        return wrapped['return_value']
    return wrapped
def num_unique(self):
    """
    Estimate the number of unique values in the SArray.

    The estimate is based on the Hyperloglog sketch.

    Returns
    -------
    out : int
        An estimate of the number of unique values in the SArray,
        converted to ``int``.
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.num_unique')
    with cython_context():
        unique_estimate = self.__proxy__.num_unique()
        return int(unique_estimate)
def var(self):
    """
    Return the variance of the values in the sarray.

    Returns 0 on an empty array. Throws an exception if called on an
    SArray with non-numeric type.

    Raises
    ------
    RuntimeError
        If the sarray is a non-numeric type.

    Returns
    -------
    out : float
        The variance of all the values. Returns 0 if the SArray is empty.
    """
    with cython_context():
        variance = self.__proxy__.var()
        return variance
def min(self):
    """
    Return the minimum value in the SArray.

    Returns *nan* on an empty array. Throws an exception if called on an
    SArray with non-numeric type.

    Raises
    ------
    RuntimeError
        If the sarray is a non-numeric type.

    Returns
    -------
    out : type of SArray
        Minimum value of SArray. Returns nan if the sarray is empty.
    """
    with cython_context():
        smallest = self.__proxy__.min()
        return smallest
def element_length_summary(self):
    """
    Build a sketch summary over the lengths of the SArray's elements.

    Only valid for a sketch constructed from an SArray of type
    list/array/dict; raises a Runtime exception otherwise.

    Returns
    -------
    out : Sketch
        A new sketch object regarding the element length of the current
        SArray.

    Examples
    --------
    >>> sa = graphlab.SArray([[j for j in range(i)] for i in range(1,1000)])
    >>> sa.sketch_summary().element_length_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |      999      |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |     999.0     |   Yes    |
    |        Mean        |     500.0     |   Yes    |
    |        Sum         |    499500.0   |   Yes    |
    |      Variance      | 83166.6666667 |   Yes    |
    | Standard Deviation | 288.386314978 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |      992      |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+---+---+---+---+---+---+---+---+---+----+
    | value | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
    +-------+---+---+---+---+---+---+---+---+---+----+
    | count | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1  |
    +-------+---+---+---+---+---+---+---+---+---+----+
    Quantiles:
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    |  0% |  1%  |  5%  |  25%  |  50%  |  75%  |  95%  |  99%  |  100% |
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    | 1.0 | 10.0 | 50.0 | 250.0 | 500.0 | 750.0 | 950.0 | 990.0 | 999.0 |
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.element_length_summary')
    with cython_context():
        length_summary_proxy = self.__proxy__.element_length_summary()
        return Sketch(_proxy=length_summary_proxy)
def element_length_summary(self):
    """
    Build a sketch summary over the lengths of the SArray's elements.

    Only valid for a sketch constructed from an SArray of type
    list/array/dict; raises a Runtime exception otherwise.

    Returns
    -------
    out : Sketch
        A new sketch object regarding the element length of the current
        SArray.

    Examples
    --------
    >>> sa = graphlab.SArray([[j for j in range(i)] for i in range(1,1000)])
    >>> sa.sketch_summary().element_length_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |      999      |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |     999.0     |   Yes    |
    |        Mean        |     500.0     |   Yes    |
    |        Sum         |    499500.0   |   Yes    |
    |      Variance      | 83166.6666667 |   Yes    |
    | Standard Deviation | 288.386314978 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |      992      |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+---+---+---+---+---+---+---+---+---+----+
    | value | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
    +-------+---+---+---+---+---+---+---+---+---+----+
    | count | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1  |
    +-------+---+---+---+---+---+---+---+---+---+----+
    Quantiles:
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    |  0% |  1%  |  5%  |  25%  |  50%  |  75%  |  95%  |  99%  |  100% |
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    | 1.0 | 10.0 | 50.0 | 250.0 | 500.0 | 750.0 | 950.0 | 990.0 | 999.0 |
    +-----+------+------+-------+-------+-------+-------+-------+-------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.element_length_summary')
    with cython_context():
        length_summary_proxy = self.__proxy__.element_length_summary()
        return Sketch(_proxy=length_summary_proxy)
def sum(self):
    """
    Return the sum of all the values in the SArray.

    Returns 0 on an empty array. Throws an exception if called on an
    sarray with non-numeric type. Will overflow without warning.

    Raises
    ------
    RuntimeError
        If the sarray is a non-numeric type.

    Returns
    -------
    out : type of SArray
        Sum of all values in SArray. Returns 0 if the SArray is empty.
    """
    with cython_context():
        total = self.__proxy__.sum()
        return total
def dict_value_summary(self):
    """
    Build a sketch summary over all dictionary values of the SArray.

    Only valid for a sketch created from an SArray of dict type. The type
    of the value summary is inferred from the first set of values.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``dict_value_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
    >>> sa.sketch_summary().dict_value_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |       4       |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |      4.0      |   Yes    |
    |        Mean        |      2.5      |   Yes    |
    |        Sum         |      10.0     |   Yes    |
    |      Variance      |      1.25     |   Yes    |
    | Standard Deviation | 1.11803398875 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |       4       |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+-----+-----+-----+-----+
    | value | 1.0 | 2.0 | 3.0 | 4.0 |
    +-------+-----+-----+-----+-----+
    | count |  1  |  1  |  1  |  1  |
    +-------+-----+-----+-----+-----+
    Quantiles:
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 | 4.0  |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.dict_value_summary')
    with cython_context():
        value_summary_proxy = self.__proxy__.dict_value_summary()
        return Sketch(_proxy=value_summary_proxy)
def dict_value_summary(self):
    """
    Build a sketch summary over all dictionary values of the SArray.

    Only valid for a sketch created from an SArray of dict type. The type
    of the value summary is inferred from the first set of values.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``dict_value_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
    >>> sa.sketch_summary().dict_value_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |       4       |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |      4.0      |   Yes    |
    |        Mean        |      2.5      |   Yes    |
    |        Sum         |      10.0     |   Yes    |
    |      Variance      |      1.25     |   Yes    |
    | Standard Deviation | 1.11803398875 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |       4       |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+-----+-----+-----+-----+
    | value | 1.0 | 2.0 | 3.0 | 4.0 |
    +-------+-----+-----+-----+-----+
    | count |  1  |  1  |  1  |  1  |
    +-------+-----+-----+-----+-----+
    Quantiles:
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 | 4.0  |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.dict_value_summary')
    with cython_context():
        value_summary_proxy = self.__proxy__.dict_value_summary()
        return Sketch(_proxy=value_summary_proxy)
def swap_columns(self, column_1, column_2):
    """
    Swap the two columns with the given names.

    On a vertex frame the swap goes through the graph proxy's
    ``swap_vertex_fields``; on an edge frame through ``swap_edge_fields``.
    The graph's proxy is then replaced by the proxy that call returns.

    Parameters
    ----------
    column_1 : string
        Name of column to swap

    column_2 : string
        Name of other column to swap
    """
    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            swapped_proxy = self.__graph__.__proxy__.swap_vertex_fields(
                column_1, column_2)
        elif self._is_edge_frame():
            swapped_proxy = self.__graph__.__proxy__.swap_edge_fields(
                column_1, column_2)
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = swapped_proxy
def element_summary(self):
    """
    Build a sketch summary over all element values of the SArray.

    Only valid for a sketch created from an SArray of list or
    vector(array) type. For an SArray of list type, all list values are
    treated as strings for the sketch summary. For an SArray of vector
    type, the sketch summary is on FLOAT type.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``element_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([[1,2,3], [4,5]])
    >>> sa.sketch_summary().element_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |       5       |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |      5.0      |   Yes    |
    |        Mean        |      3.0      |   Yes    |
    |        Sum         |      15.0     |   Yes    |
    |      Variance      |      2.0      |   Yes    |
    | Standard Deviation | 1.41421356237 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |       5       |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+-----+-----+-----+-----+-----+
    | value | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
    +-------+-----+-----+-----+-----+-----+
    | count |  1  |  1  |  1  |  1  |  1  |
    +-------+-----+-----+-----+-----+-----+
    Quantiles:
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 5.0 | 5.0  |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.element_summary')
    with cython_context():
        element_summary_proxy = self.__proxy__.element_summary()
        return Sketch(_proxy=element_summary_proxy)
def element_summary(self):
    """
    Build a sketch summary over all element values of the SArray.

    Only valid for a sketch created from an SArray of list or
    vector(array) type. For an SArray of list type, all list values are
    treated as strings for the sketch summary. For an SArray of vector
    type, the sketch summary is on FLOAT type.

    Returns
    -------
    out : Sketch
        A new Sketch wrapping the proxy produced by the underlying
        ``element_summary`` call.

    Examples
    --------
    >>> sa = graphlab.SArray([[1,2,3], [4,5]])
    >>> sa.sketch_summary().element_summary()
    +--------------------+---------------+----------+
    |        item        |     value     | is exact |
    +--------------------+---------------+----------+
    |       Length       |       5       |   Yes    |
    |        Min         |      1.0      |   Yes    |
    |        Max         |      5.0      |   Yes    |
    |        Mean        |      3.0      |   Yes    |
    |        Sum         |      15.0     |   Yes    |
    |      Variance      |      2.0      |   Yes    |
    | Standard Deviation | 1.41421356237 |   Yes    |
    |  # Missing Values  |       0       |   Yes    |
    |  # unique values   |       5       |    No    |
    +--------------------+---------------+----------+
    Most frequent items:
    +-------+-----+-----+-----+-----+-----+
    | value | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
    +-------+-----+-----+-----+-----+-----+
    | count |  1  |  1  |  1  |  1  |  1  |
    +-------+-----+-----+-----+-----+-----+
    Quantiles:
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 5.0 | 5.0  |
    +-----+-----+-----+-----+-----+-----+-----+-----+------+
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.element_summary')
    with cython_context():
        element_summary_proxy = self.__proxy__.element_summary()
        return Sketch(_proxy=element_summary_proxy)
def rename(self, names):
    """
    Rename columns according to the ``names`` dict.

    Each key of ``names`` is an existing column name and its value is the
    replacement name. On a vertex frame the rename goes through the graph
    proxy's ``rename_vertex_fields``; on an edge frame through
    ``rename_edge_fields``. The graph's proxy is then replaced by the
    proxy that call returns.

    Parameters
    ----------
    names : dict[string, string]
        Dictionary of [old_name, new_name]

    Raises
    ------
    TypeError
        If ``names`` is not exactly a ``dict``.
    """
    if type(names) is not dict:
        raise TypeError('names must be a dictionary: oldname -> newname')

    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            renamed_proxy = self.__graph__.__proxy__.rename_vertex_fields(
                names.keys(), names.values())
        elif self._is_edge_frame():
            renamed_proxy = self.__graph__.__proxy__.rename_edge_fields(
                names.keys(), names.values())
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = renamed_proxy
def swap_columns(self, column_1, column_2):
    """
    Swap the two columns with the given names.

    On a vertex frame the swap goes through the graph proxy's
    ``swap_vertex_fields``; on an edge frame through ``swap_edge_fields``.
    The graph's proxy is then replaced by the proxy that call returns.

    Parameters
    ----------
    column_1 : string
        Name of column to swap

    column_2 : string
        Name of other column to swap
    """
    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            swapped_proxy = self.__graph__.__proxy__.swap_vertex_fields(
                column_1, column_2)
        elif self._is_edge_frame():
            swapped_proxy = self.__graph__.__proxy__.swap_edge_fields(
                column_1, column_2)
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = swapped_proxy
def frequent_items(self):
    """
    Estimate the most frequent elements in the SArray.

    The estimate is based on the SpaceSaving sketch. It is only
    guaranteed that all elements which appear in more than 0.01% rows of
    the array will appear in the set of returned elements; other elements
    may also appear in the result. The item counts are estimated using
    the CountSketch.

    Missing values are not taken into account when computing frequent
    items.

    If this function returns no elements, it means that all elements
    appear with less than 0.01% occurrence.

    Returns
    -------
    out : dict
        A dictionary mapping items and their estimated occurrence
        frequencies.
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('sketch.frequent_items')
    with cython_context():
        item_counts = self.__proxy__.frequent_items()
        return item_counts
def rename(self, names):
    """
    Rename columns according to the ``names`` dict.

    Each key of ``names`` is an existing column name and its value is the
    replacement name. On a vertex frame the rename goes through the graph
    proxy's ``rename_vertex_fields``; on an edge frame through
    ``rename_edge_fields``. The graph's proxy is then replaced by the
    proxy that call returns.

    Parameters
    ----------
    names : dict[string, string]
        Dictionary of [old_name, new_name]

    Raises
    ------
    TypeError
        If ``names`` is not exactly a ``dict``.
    """
    if type(names) is not dict:
        raise TypeError('names must be a dictionary: oldname -> newname')

    self.__is_dirty__ = True
    with cython_context():
        if self._is_vertex_frame():
            renamed_proxy = self.__graph__.__proxy__.rename_vertex_fields(
                names.keys(), names.values())
        elif self._is_edge_frame():
            renamed_proxy = self.__graph__.__proxy__.rename_edge_fields(
                names.keys(), names.values())
        else:
            # Neither a vertex nor an edge frame: nothing to update.
            return
        self.__graph__.__proxy__ = renamed_proxy
def element_sub_sketch(self, keys=None):
    """
    Returns the sketch summary for the given set of keys.

    This is only applicable for sketch summary created from SArray of
    array or dict type. For dict SArray, the keys are the keys in the
    dict value. For array SArray, the keys are indexes into the array
    value.

    The keys must be passed into the original sketch_summary() call in
    order to be able to be retrieved later.

    Parameters
    ----------
    keys : list of str | str | list of int | int
        The list of dictionary keys or array indexes to get sub sketches
        from. If not given, then retrieve all sub sketches that are
        available. A lone string (or any non-iterable value) is treated
        as a single key.

    Returns
    -------
    A dictionary that maps from the key(index) to the actual sketch
    summary for that key(index). When a single key was passed instead of
    a collection, the Sketch for that key is returned directly.

    Raises
    ------
    ValueError
        If the keys are not all of the same type.
    KeyError
        If a requested key is missing from the returned sub sketches.

    Examples
    --------
    >>> sa = graphlab.SArray([{'a':1, 'b':2}, {'a':4, 'd':1}])
    >>> s = sa.sketch_summary(sub_sketch_keys=['a','b'])
    >>> s.element_sub_sketch(['a'])
    {'a':
     +--------------------+-------+----------+
     |        item        | value | is exact |
     +--------------------+-------+----------+
     |       Length       |   2   |   Yes    |
     |        Min         |  1.0  |   Yes    |
     |        Max         |  4.0  |   Yes    |
     |        Mean        |  2.5  |   Yes    |
     |        Sum         |  5.0  |   Yes    |
     |      Variance      |  2.25 |   Yes    |
     | Standard Deviation |  1.5  |   Yes    |
     |  # Missing Values  |   0   |   Yes    |
     |  # unique values   |   2   |    No    |
     +--------------------+-------+----------+
     Most frequent items:
     +-------+-----+-----+
     | value | 1.0 | 4.0 |
     +-------+-----+-----+
     | count |  1  |  1  |
     +-------+-----+-----+
     Quantiles:
     +-----+-----+-----+-----+-----+-----+-----+-----+------+
     |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
     +-----+-----+-----+-----+-----+-----+-----+-----+------+
     | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 | 4.0  |
     +-----+-----+-----+-----+-----+-----+-----+-----+------+}
    """
    single_val = False
    if keys is None:
        keys = []
    elif isinstance(keys, (str, bytes)) or not hasattr(keys, "__iter__"):
        # A lone key. Strings are tested explicitly because on Python 3
        # ``str`` defines ``__iter__`` and would otherwise be split into
        # single-character keys.
        single_val = True
        keys = [keys]
    else:
        # Materialize so a one-shot iterable is not exhausted by the type
        # scan below before it reaches the proxy.
        keys = list(keys)

    value_types = set(type(key) for key in keys)
    if len(value_types) > 1:
        raise ValueError("All keys should have the same type.")

    _mt._get_metric_tracker().track('sketch.element_sub_sketch')
    with cython_context():
        ret_sketches = self.__proxy__.element_sub_sketch(keys)

        # check return key matches input key
        for key in keys:
            if key not in ret_sketches:
                raise KeyError(
                    "Cannot retrieve element sub sketch for key '" +
                    str(key) +
                    "'. Element sub sketch can only be retrieved when the "
                    "sketch_summary object was created using the "
                    "'sub_sketch_keys' option.")

        ret = {}
        for key in ret_sketches:
            ret[key] = Sketch(_proxy=ret_sketches[key])

        if single_val:
            return ret[keys[0]]
        return ret
def element_sub_sketch(self, keys = None):
    """
    Returns the sketch summary for the given set of keys.

    This is only applicable for sketch summary created from SArray of
    array or dict type. For dict SArray, the keys are the keys in the
    dict value. For array SArray, the keys are indexes into the array
    value.

    The keys must be passed into the original sketch_summary() call in
    order to be able to be retrieved later.

    Parameters
    ----------
    keys : list of str | str | list of int | int
        The list of dictionary keys or array indexes to get sub sketches
        from. If not given, then retrieve all sub sketches that are
        available. A lone string (or any non-iterable value) is treated
        as a single key.

    Returns
    -------
    A dictionary that maps from the key(index) to the actual sketch
    summary for that key(index). When a single key was passed instead of
    a collection, the Sketch for that key is returned directly.

    Raises
    ------
    ValueError
        If the keys are not all of the same type.
    KeyError
        If a requested key is missing from the returned sub sketches.

    Examples
    --------
    >>> sa = graphlab.SArray([{'a':1, 'b':2}, {'a':4, 'd':1}])
    >>> s = sa.sketch_summary(sub_sketch_keys=['a','b'])
    >>> s.element_sub_sketch(['a'])
    {'a':
     +--------------------+-------+----------+
     |        item        | value | is exact |
     +--------------------+-------+----------+
     |       Length       |   2   |   Yes    |
     |        Min         |  1.0  |   Yes    |
     |        Max         |  4.0  |   Yes    |
     |        Mean        |  2.5  |   Yes    |
     |        Sum         |  5.0  |   Yes    |
     |      Variance      |  2.25 |   Yes    |
     | Standard Deviation |  1.5  |   Yes    |
     |  # Missing Values  |   0   |   Yes    |
     |  # unique values   |   2   |    No    |
     +--------------------+-------+----------+
     Most frequent items:
     +-------+-----+-----+
     | value | 1.0 | 4.0 |
     +-------+-----+-----+
     | count |  1  |  1  |
     +-------+-----+-----+
     Quantiles:
     +-----+-----+-----+-----+-----+-----+-----+-----+------+
     |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
     +-----+-----+-----+-----+-----+-----+-----+-----+------+
     | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 | 4.0  |
     +-----+-----+-----+-----+-----+-----+-----+-----+------+}
    """
    single_val = False
    if keys is None:
        keys = []
    elif isinstance(keys, (str, bytes)) or not hasattr(keys, "__iter__"):
        # A lone key. Strings are tested explicitly because on Python 3
        # ``str`` defines ``__iter__`` and would otherwise be split into
        # single-character keys.
        single_val = True
        keys = [keys]
    else:
        # Materialize so a one-shot iterable is not exhausted by the type
        # scan below before it reaches the proxy.
        keys = list(keys)

    value_types = set(type(key) for key in keys)
    if len(value_types) > 1:
        raise ValueError("All keys should have the same type.")

    _mt._get_metric_tracker().track('sketch.element_sub_sketch')
    with cython_context():
        ret_sketches = self.__proxy__.element_sub_sketch(keys)

        # check return key matches input key
        for key in keys:
            if key not in ret_sketches:
                raise KeyError(
                    "Cannot retrieve element sub sketch for key '" +
                    str(key) +
                    "'. Element sub sketch can only be retrieved when the "
                    "sketch_summary object was created using the "
                    "'sub_sketch_keys' option.")

        ret = {}
        for key in ret_sketches:
            ret[key] = Sketch(_proxy=ret_sketches[key])

        if single_val:
            return ret[keys[0]]
        return ret