def dict_key_summary(self):
        """
        Builds a sketch summary over every dictionary key of the SArray.

        Only meaningful when this sketch was created from an SArray of dict
        type. Each key is converted to a string before it is summarized.

        Examples
        --------
        >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
        >>> sa.sketch_summary().dict_key_summary()
        +------------------+-------+----------+
        |       item       | value | is exact |
        +------------------+-------+----------+
        |      Length      |   4   |   Yes    |
        | # Missing Values |   0   |   Yes    |
        | # unique values  |   4   |    No    |
        +------------------+-------+----------+
        Most frequent items:
        +-------+---+------+--------+--------+
        | value | I | love | beauty | nature |
        +-------+---+------+--------+--------+
        | count | 1 |  1   |   1    |   1    |
        +-------+---+------+--------+--------+

        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.dict_key_summary')
        with cython_context():
            key_proxy = self.__proxy__.dict_key_summary()
            return Sketch(_proxy=key_proxy)
Exemple #2
0
    def remove_column(self, name):
        """
        Removes the column with the given name from the SFrame.

        The identifier columns are protected and cannot be removed:
        ``__id`` on a vertex frame, ``__src_id`` and ``__dst_id`` on an edge
        frame.

        Parameters
        ----------
        name : string
            The name of the column to remove.

        Raises
        ------
        KeyError
            If the frame has no column called ``name``.
        AssertionError
            If ``name`` is one of the protected identifier columns.
        """
        if name not in self.column_names():
            raise KeyError('Cannot find column %s' % name)

        # Remember the previous flag so that a failed removal restores it
        # instead of unconditionally clearing a flag that was already set.
        was_dirty = getattr(self, '__is_dirty__', False)
        self.__is_dirty__ = True
        try:
            is_vertex = self._is_vertex_frame()
            is_edge = (not is_vertex) and self._is_edge_frame()
            if is_vertex:
                protected = ('__id',)
            elif is_edge:
                protected = ('__src_id', '__dst_id')
            else:
                protected = ()
            # Raised explicitly (not via ``assert``) so the guard is still
            # enforced when Python runs with -O; the exception type is kept
            # as AssertionError for existing callers.
            if name in protected:
                raise AssertionError('Cannot remove "%s" column' % name)

            with cython_context():
                if is_vertex:
                    graph_proxy = self.__graph__.__proxy__.delete_vertex_field(name)
                    self.__graph__.__proxy__ = graph_proxy
                elif is_edge:
                    graph_proxy = self.__graph__.__proxy__.delete_edge_field(name)
                    self.__graph__.__proxy__ = graph_proxy
        except BaseException:
            # Nothing was removed: roll the flag back, then propagate.
            self.__is_dirty__ = was_dirty
            raise
    def frequency_count(self, element):
        """
        Estimates how many times ``element`` occurs in the sketched SArray.

        The estimate is derived from the count sketch. ``element`` has to be
        of the same type as the input SArray; an element of the incorrect
        type causes an exception.

        Parameters
        ----------
        element : val
            The value to look up, of the same type as the SArray.

        Raises
        ------
        RuntimeError
            If ``element`` is of the incorrect type.

        Returns
        -------
        out : int
            The estimated number of occurrences of ``element``.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.frequency_count')
        with cython_context():
            estimate = self.__proxy__.frequency_count(element)
            return int(estimate)
Exemple #4
0
    def frequency_count(self, element):
        """
        Estimates how many times ``element`` occurs in the sketched SArray.

        The estimate is derived from the count sketch. ``element`` has to be
        of the same type as the input SArray; an element of the incorrect
        type causes an exception.

        Parameters
        ----------
        element : val
            The value to look up, of the same type as the SArray.

        Raises
        ------
        RuntimeError
            If ``element`` is of the incorrect type.

        Returns
        -------
        out : int
            The estimated number of occurrences of ``element``.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.frequency_count')
        with cython_context():
            estimate = self.__proxy__.frequency_count(element)
            return int(estimate)
    def quantile(self, quantile_val):
        """
        Estimates the value found at a given quantile of the sketched SArray.

        The estimate is guaranteed to be accurate within 1%: asking for the
        0.55 quantile yields a value lying between the true 0.54 quantile and
        the true 0.56 quantile. Quantiles are defined for numeric arrays
        only; calling this on a sketch built for a non-numeric column throws
        an exception.

        Parameters
        ----------
        quantile_val : float
            A value between 0.0 and 1.0 inclusive. Values below 0.0 will be
            interpreted as 0.0. Values above 1.0 will be interpreted as 1.0.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : float | str
            An estimate of the value at the requested quantile.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.quantile.%g' % quantile_val)

        with cython_context():
            estimate = self.__proxy__.get_quantile(quantile_val)
            return estimate
Exemple #6
0
    def dict_key_summary(self):
        """
        Builds a sketch summary over every dictionary key of the SArray.

        Only meaningful when this sketch was created from an SArray of dict
        type. Each key is converted to a string before it is summarized.

        Examples
        --------
        >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
        >>> sa.sketch_summary().dict_key_summary()
        +------------------+-------+----------+
        |       item       | value | is exact |
        +------------------+-------+----------+
        |      Length      |   4   |   Yes    |
        | # Missing Values |   0   |   Yes    |
        | # unique values  |   4   |    No    |
        +------------------+-------+----------+
        Most frequent items:
        +-------+---+------+--------+--------+
        | value | I | love | beauty | nature |
        +-------+---+------+--------+--------+
        | count | 1 |  1   |   1    |   1    |
        +-------+---+------+--------+--------+

        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.dict_key_summary')
        with cython_context():
            key_proxy = self.__proxy__.dict_key_summary()
            return Sketch(_proxy=key_proxy)
Exemple #7
0
    def add_column(self, data, name=""):
        """
        Appends ``data`` to this SFrame as a new column.

        The number of elements in ``data`` must match every other column of
        the SFrame.

        Parameters
        ----------
        data : SArray
            The 'column' of data.

        name : string
            The name of the column. If no name is given, a default name is
            chosen.

        Raises
        ------
        TypeError
            If ``data`` is not an SArray or ``name`` is not a str.
        """
        # Validate the column first, then its name (same order as the errors
        # are reported to the caller).
        data_is_sarray = isinstance(data, SArray)
        if not data_is_sarray:
            raise TypeError("Must give column as SArray")
        name_is_str = isinstance(name, str)
        if not name_is_str:
            raise TypeError("Invalid column name: must be str")

        self.__is_dirty__ = True
        with cython_context():
            # The graph proxy is replaced by the one returned from the
            # field-adding call that matches this frame's kind.
            if self._is_vertex_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.add_vertex_field(
                        data.__proxy__, name))
            elif self._is_edge_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.add_edge_field(
                        data.__proxy__, name))
Exemple #8
0
    def quantile(self, quantile_val):
        """
        Estimates the value found at a given quantile of the sketched SArray.

        The estimate is guaranteed to be accurate within 1%: asking for the
        0.55 quantile yields a value lying between the true 0.54 quantile and
        the true 0.56 quantile. Quantiles are defined for numeric arrays
        only; calling this on a sketch built for a non-numeric column throws
        an exception.

        Parameters
        ----------
        quantile_val : float
            A value between 0.0 and 1.0 inclusive. Values below 0.0 will be
            interpreted as 0.0. Values above 1.0 will be interpreted as 1.0.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : float | str
            An estimate of the value at the requested quantile.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.quantile.%g' % quantile_val)

        with cython_context():
            estimate = self.__proxy__.get_quantile(quantile_val)
            return estimate
Exemple #9
0
    def remove_column(self, name):
        """
        Removes the column with the given name from the SFrame.

        The identifier columns are protected and cannot be removed:
        ``__id`` on a vertex frame, ``__src_id`` and ``__dst_id`` on an edge
        frame.

        Parameters
        ----------
        name : string
            The name of the column to remove.

        Raises
        ------
        KeyError
            If the frame has no column called ``name``.
        AssertionError
            If ``name`` is one of the protected identifier columns.
        """
        if name not in self.column_names():
            raise KeyError('Cannot find column %s' % name)

        # Remember the previous flag so that a failed removal restores it
        # instead of unconditionally clearing a flag that was already set.
        was_dirty = getattr(self, '__is_dirty__', False)
        self.__is_dirty__ = True
        try:
            is_vertex = self._is_vertex_frame()
            is_edge = (not is_vertex) and self._is_edge_frame()
            if is_vertex:
                protected = ('__id',)
            elif is_edge:
                protected = ('__src_id', '__dst_id')
            else:
                protected = ()
            # Raised explicitly (not via ``assert``) so the guard is still
            # enforced when Python runs with -O; the exception type is kept
            # as AssertionError for existing callers.
            if name in protected:
                raise AssertionError('Cannot remove "%s" column' % name)

            with cython_context():
                if is_vertex:
                    graph_proxy = self.__graph__.__proxy__.delete_vertex_field(
                        name)
                    self.__graph__.__proxy__ = graph_proxy
                elif is_edge:
                    graph_proxy = self.__graph__.__proxy__.delete_edge_field(
                        name)
                    self.__graph__.__proxy__ = graph_proxy
        except BaseException:
            # Nothing was removed: roll the flag back, then propagate.
            self.__is_dirty__ = was_dirty
            raise
Exemple #10
0
    def add_column(self, data, name=""):
        """
        Appends ``data`` to this SFrame as a new column.

        The number of elements in ``data`` must match every other column of
        the SFrame.

        Parameters
        ----------
        data : SArray
            The 'column' of data.

        name : string
            The name of the column. If no name is given, a default name is
            chosen.

        Raises
        ------
        TypeError
            If ``data`` is not an SArray or ``name`` is not a str.
        """
        # Validate the column first, then its name (same order as the errors
        # are reported to the caller).
        data_is_sarray = isinstance(data, SArray)
        if not data_is_sarray:
            raise TypeError("Must give column as SArray")
        name_is_str = isinstance(name, str)
        if not name_is_str:
            raise TypeError("Invalid column name: must be str")

        self.__is_dirty__ = True
        with cython_context():
            # The graph proxy is replaced by the one returned from the
            # field-adding call that matches this frame's kind.
            if self._is_vertex_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.add_vertex_field(
                        data.__proxy__, name))
            elif self._is_edge_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.add_edge_field(
                        data.__proxy__, name))
Exemple #11
0
 def sketch_ready(self):
     """
     Reports whether the sketch has been executed on all the data.

     A sketch created with background == False (the default) always
     reports True. Otherwise False is returned until the sketch is ready.
     """
     with cython_context():
         is_ready = self.__proxy__.sketch_ready()
         return is_ready
Exemple #12
0
 def num_elements_processed(self):
     """
     Reports how many elements the sketch has processed so far.

     A sketch created with background == False (the default) always
     reports the length of the input array. Otherwise the value is the
     number of elements processed up to now.
     """
     with cython_context():
         processed = self.__proxy__.num_elements_processed()
         return processed
 def sketch_ready(self):
     """
     Reports whether the sketch has been executed on all the data.

     A sketch created with background == False (the default) always
     reports True. Otherwise False is returned until the sketch is ready.
     """
     with cython_context():
         is_ready = self.__proxy__.sketch_ready()
         return is_ready
 def num_elements_processed(self):
     """
     Reports how many elements the sketch has processed so far.

     A sketch created with background == False (the default) always
     reports the length of the input array. Otherwise the value is the
     number of elements processed up to now.
     """
     with cython_context():
         processed = self.__proxy__.num_elements_processed()
         return processed
    def size(self):
        """
        Reports the size of the input SArray.

        Returns
        -------
        out : int
            How many elements the input SArray holds.
        """
        with cython_context():
            element_count = self.__proxy__.size()
            return int(element_count)
Exemple #16
0
    def size(self):
        """
        Reports the size of the input SArray.

        Returns
        -------
        out : int
            How many elements the input SArray holds.
        """
        with cython_context():
            element_count = self.__proxy__.size()
            return int(element_count)
Exemple #17
0
    def num_undefined(self):
        """
        Counts the undefined elements in the SArray; 0 for an empty SArray.

        Returns
        -------
        out : int
            The number of missing values in the SArray.
        """
        with cython_context():
            missing = self.__proxy__.num_undefined()
            return int(missing)
    def cancel(self):
        """
        Immediately cancels a background sketch computation if one is
        ongoing; otherwise does nothing.

        Examples
        --------
        >>> s = sa.sketch_summary(array, background=True)
        >>> s.cancel()
        """
        with cython_context():
            sketch_proxy = self.__proxy__
            sketch_proxy.cancel()
    def num_undefined(self):
        """
        Counts the undefined elements in the SArray; 0 for an empty SArray.

        Returns
        -------
        out : int
            The number of missing values in the SArray.
        """
        with cython_context():
            missing = self.__proxy__.num_undefined()
            return int(missing)
Exemple #20
0
def _run_toolkit_function(fnname, arguments, args, kwargs):
    """
    Dispatches arguments to a toolkit function.

    Positional values are bound to the leading names of ``arguments``; the
    keyword values fill in the rest. The resulting name -> value mapping is
    translated and handed to the toolkit, and the toolkit's result is
    wrapped before being returned.

    Parameters
    ----------
    fnname : string
        The toolkit function to run

    arguments : list[string]
        The list of all the arguments the function takes.

    args : list
        The arguments that were passed

    kwargs : dictionary
        The keyword arguments that were passed

    Returns
    -------
    out
        The wrapped toolkit result. When that result is a dict holding a
        'return_value' entry, only that entry is returned.

    Raises
    ------
    TypeError
        If the number of supplied values differs from ``len(arguments)``, a
        name receives both a positional and a keyword value, or a keyword is
        not one of ``arguments``.
    _ToolkitError
        If the toolkit reports a failure.
    """
    num_args_got = len(args) + len(kwargs)
    num_args_required = len(arguments)
    if num_args_got != num_args_required:
        raise TypeError("Expecting " + str(num_args_required) +
                        " arguments, got " + str(num_args_got))

    # Positional values bind to the leading parameter names, in order. The
    # count check above guarantees len(args) <= len(arguments).
    argument_dict = dict(zip(arguments, args))

    # Now fill with the kwargs.
    for key, value in kwargs.items():
        if key in argument_dict:
            raise TypeError("Got multiple values for keyword argument '" +
                            key + "'")
        # The counts already match, so an unknown name means some declared
        # argument would be left unbound: fail here with a clear message
        # rather than forwarding a malformed call to the toolkit.
        if key not in arguments:
            raise TypeError("Got an unexpected keyword argument '" + key +
                            "'")
        argument_dict[key] = value

    argument_dict = _translate_function_arguments(argument_dict)
    with cython_context():
        ret = _gl.connect.main.get_unity().run_toolkit(fnname, argument_dict)

    # ret is indexed as (success flag, error message, payload).
    # NOTE(review): the flag is deliberately still compared with ``!= True``
    # so that exactly the same values as before count as success.
    if ret[0] != True:
        # An empty (or missing) message falls back to a generic error.
        if ret[1]:
            raise _ToolkitError(ret[1])
        raise _ToolkitError("Toolkit failed with unknown error")

    ret = _wrap_function_return(ret[2])
    if isinstance(ret, dict) and 'return_value' in ret:
        return ret['return_value']
    return ret
Exemple #21
0
    def cancel(self):
        """
        Immediately cancels a background sketch computation if one is
        ongoing; otherwise does nothing.

        Examples
        --------
        >>> s = sa.sketch_summary(array, background=True)
        >>> s.cancel()
        """
        with cython_context():
            sketch_proxy = self.__proxy__
            sketch_proxy.cancel()
def _run_toolkit_function(fnname, arguments, args, kwargs):
    """
    Dispatches arguments to a toolkit function.

    Positional values are bound to the leading names of ``arguments``; the
    keyword values fill in the rest. The resulting name -> value mapping is
    translated and handed to the toolkit, and the toolkit's result is
    wrapped before being returned.

    Parameters
    ----------
    fnname : string
        The toolkit function to run

    arguments : list[string]
        The list of all the arguments the function takes.

    args : list
        The arguments that were passed

    kwargs : dictionary
        The keyword arguments that were passed

    Returns
    -------
    out
        The wrapped toolkit result. When that result is a dict holding a
        'return_value' entry, only that entry is returned.

    Raises
    ------
    TypeError
        If the number of supplied values differs from ``len(arguments)``, a
        name receives both a positional and a keyword value, or a keyword is
        not one of ``arguments``.
    _ToolkitError
        If the toolkit reports a failure.
    """
    num_args_got = len(args) + len(kwargs)
    num_args_required = len(arguments)
    if num_args_got != num_args_required:
        raise TypeError("Expecting " + str(num_args_required) +
                        " arguments, got " + str(num_args_got))

    # Positional values bind to the leading parameter names, in order. The
    # count check above guarantees len(args) <= len(arguments).
    argument_dict = dict(zip(arguments, args))

    # Now fill with the kwargs.
    for key, value in kwargs.items():
        if key in argument_dict:
            raise TypeError("Got multiple values for keyword argument '" +
                            key + "'")
        # The counts already match, so an unknown name means some declared
        # argument would be left unbound: fail here with a clear message
        # rather than forwarding a malformed call to the toolkit.
        if key not in arguments:
            raise TypeError("Got an unexpected keyword argument '" + key +
                            "'")
        argument_dict[key] = value

    argument_dict = _translate_function_arguments(argument_dict)
    with cython_context():
        ret = _gl.connect.main.get_unity().run_toolkit(fnname, argument_dict)

    # ret is indexed as (success flag, error message, payload).
    # NOTE(review): the flag is deliberately still compared with ``!= True``
    # so that exactly the same values as before count as success.
    if ret[0] != True:
        # An empty (or missing) message falls back to a generic error.
        if ret[1]:
            raise _ToolkitError(ret[1])
        raise _ToolkitError("Toolkit failed with unknown error")

    ret = _wrap_function_return(ret[2])
    if isinstance(ret, dict) and 'return_value' in ret:
        return ret['return_value']
    return ret
Exemple #23
0
    def num_unique(self):
        """
        Estimates the number of unique values in the SArray.

        The estimate is sketched, based on the Hyperloglog sketch.

        Returns
        -------
        out : int
            An estimate of the number of unique values in the SArray.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.num_unique')

        with cython_context():
            unique_estimate = self.__proxy__.num_unique()
            return int(unique_estimate)
    def num_unique(self):
        """
        Estimates the number of unique values in the SArray.

        The estimate is sketched, based on the Hyperloglog sketch.

        Returns
        -------
        out : int
            An estimate of the number of unique values in the SArray.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.num_unique')

        with cython_context():
            unique_estimate = self.__proxy__.num_unique()
            return int(unique_estimate)
Exemple #25
0
    def var(self):
        """
        Computes the variance of the values in the sarray.

        An empty array gives 0. Calling this on an SArray with non-numeric
        type throws an exception.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : float
            The variance of all the values, or 0 if the SArray is empty.
        """
        with cython_context():
            variance = self.__proxy__.var()
            return variance
    def var(self):
        """
        Computes the variance of the values in the sarray.

        An empty array gives 0. Calling this on an SArray with non-numeric
        type throws an exception.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : float
            The variance of all the values, or 0 if the SArray is empty.
        """
        with cython_context():
            variance = self.__proxy__.var()
            return variance
    def min(self):
        """
        Finds the minimum value in the SArray.

        An empty array gives *nan*. Calling this on an SArray with
        non-numeric type throws an exception.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray, or nan if the sarray is empty.
        """
        with cython_context():
            smallest = self.__proxy__.min()
            return smallest
Exemple #28
0
    def min(self):
        """
        Finds the minimum value in the SArray.

        An empty array gives *nan*. Calling this on an SArray with
        non-numeric type throws an exception.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Minimum value of SArray, or nan if the sarray is empty.
        """
        with cython_context():
            smallest = self.__proxy__.min()
            return smallest
    def element_length_summary(self):
        """
        Builds a sketch summary of the element lengths of the SArray.

        Only valid for a sketch constructed from an SArray of type
        list/array/dict; a Runtime exception is raised otherwise.

        Examples
        --------
        >>> sa = graphlab.SArray([[j for j in range(i)] for i in range(1,1000)])
        >>> sa.sketch_summary().element_length_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |      999      |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |     999.0     |   Yes    |
        |        Mean        |     500.0     |   Yes    |
        |        Sum         |    499500.0   |   Yes    |
        |      Variance      | 83166.6666667 |   Yes    |
        | Standard Deviation | 288.386314978 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |      992      |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+---+---+---+---+---+---+---+---+---+----+
        | value | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
        +-------+---+---+---+---+---+---+---+---+---+----+
        | count | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1  |
        +-------+---+---+---+---+---+---+---+---+---+----+
        Quantiles:
        +-----+------+------+-------+-------+-------+-------+-------+-------+
        |  0% |  1%  |  5%  |  25%  |  50%  |  75%  |  95%  |  99%  |  100% |
        +-----+------+------+-------+-------+-------+-------+-------+-------+
        | 1.0 | 10.0 | 50.0 | 250.0 | 500.0 | 750.0 | 950.0 | 990.0 | 999.0 |
        +-----+------+------+-------+-------+-------+-------+-------+-------+

        Returns
        -------
        out : Sketch
          A new sketch object regarding the element length of the current
          SArray
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.element_length_summary')

        with cython_context():
            length_proxy = self.__proxy__.element_length_summary()
            return Sketch(_proxy=length_proxy)
Exemple #30
0
    def element_length_summary(self):
        """
        Builds a sketch summary of the element lengths of the SArray.

        Only valid for a sketch constructed from an SArray of type
        list/array/dict; a Runtime exception is raised otherwise.

        Examples
        --------
        >>> sa = graphlab.SArray([[j for j in range(i)] for i in range(1,1000)])
        >>> sa.sketch_summary().element_length_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |      999      |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |     999.0     |   Yes    |
        |        Mean        |     500.0     |   Yes    |
        |        Sum         |    499500.0   |   Yes    |
        |      Variance      | 83166.6666667 |   Yes    |
        | Standard Deviation | 288.386314978 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |      992      |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+---+---+---+---+---+---+---+---+---+----+
        | value | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
        +-------+---+---+---+---+---+---+---+---+---+----+
        | count | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1  |
        +-------+---+---+---+---+---+---+---+---+---+----+
        Quantiles:
        +-----+------+------+-------+-------+-------+-------+-------+-------+
        |  0% |  1%  |  5%  |  25%  |  50%  |  75%  |  95%  |  99%  |  100% |
        +-----+------+------+-------+-------+-------+-------+-------+-------+
        | 1.0 | 10.0 | 50.0 | 250.0 | 500.0 | 750.0 | 950.0 | 990.0 | 999.0 |
        +-----+------+------+-------+-------+-------+-------+-------+-------+

        Returns
        -------
        out : Sketch
          A new sketch object regarding the element length of the current
          SArray
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.element_length_summary')

        with cython_context():
            length_proxy = self.__proxy__.element_length_summary()
            return Sketch(_proxy=length_proxy)
    def sum(self):
        """
        Adds up all the values in the SArray.

        An empty array gives 0. Calling this on an sarray with non-numeric
        type throws an exception. Will overflow without warning.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray, or 0 if the SArray is empty.
        """
        with cython_context():
            total = self.__proxy__.sum()
            return total
Exemple #32
0
    def sum(self):
        """
        Adds up all the values in the SArray.

        An empty array gives 0. Calling this on an sarray with non-numeric
        type throws an exception. Will overflow without warning.

        Raises
        ------
        RuntimeError
            If the sarray is a non-numeric type.

        Returns
        -------
        out : type of SArray
            Sum of all values in SArray, or 0 if the SArray is empty.
        """
        with cython_context():
            total = self.__proxy__.sum()
            return total
Exemple #33
0
    def dict_value_summary(self):
        """
        Builds a sketch summary over every dictionary value of the SArray.

        Only meaningful when this sketch was created from an SArray of dict
        type.

        The type of the value summary is inferred from the first set of
        values.

        Examples
        --------

        >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
        >>> sa.sketch_summary().dict_value_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |       4       |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |      4.0      |   Yes    |
        |        Mean        |      2.5      |   Yes    |
        |        Sum         |      10.0     |   Yes    |
        |      Variance      |      1.25     |   Yes    |
        | Standard Deviation | 1.11803398875 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |       4       |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+-----+-----+-----+-----+
        | value | 1.0 | 2.0 | 3.0 | 4.0 |
        +-------+-----+-----+-----+-----+
        | count |  1  |  1  |  1  |  1  |
        +-------+-----+-----+-----+-----+
        Quantiles:
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 | 4.0  |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+

        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.dict_value_summary')
        with cython_context():
            value_proxy = self.__proxy__.dict_value_summary()
            return Sketch(_proxy=value_proxy)
    def dict_value_summary(self):
        """
        Builds a sketch summary over every dictionary value of the SArray.

        Only meaningful when this sketch was created from an SArray of dict
        type.

        The type of the value summary is inferred from the first set of
        values.

        Examples
        --------

        >>> sa = graphlab.SArray([{'I':1, 'love': 2}, {'nature':3, 'beauty':4}])
        >>> sa.sketch_summary().dict_value_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |       4       |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |      4.0      |   Yes    |
        |        Mean        |      2.5      |   Yes    |
        |        Sum         |      10.0     |   Yes    |
        |      Variance      |      1.25     |   Yes    |
        | Standard Deviation | 1.11803398875 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |       4       |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+-----+-----+-----+-----+
        | value | 1.0 | 2.0 | 3.0 | 4.0 |
        +-------+-----+-----+-----+-----+
        | count |  1  |  1  |  1  |  1  |
        +-------+-----+-----+-----+-----+
        Quantiles:
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 | 4.0  |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+

        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.dict_value_summary')
        with cython_context():
            value_proxy = self.__proxy__.dict_value_summary()
            return Sketch(_proxy=value_proxy)
Exemple #35
0
    def swap_columns(self, column_1, column_2):
        """
        Exchanges the two columns that carry the given names.

        Parameters
        ----------
        column_1 : string
            Name of column to swap

        column_2 : string
            Name of other column to swap
        """
        self.__is_dirty__ = True
        with cython_context():
            # The graph proxy is replaced by the one returned from the
            # field-swapping call that matches this frame's kind.
            if self._is_vertex_frame():
                swapped = self.__graph__.__proxy__.swap_vertex_fields(
                    column_1, column_2)
                self.__graph__.__proxy__ = swapped
            elif self._is_edge_frame():
                swapped = self.__graph__.__proxy__.swap_edge_fields(
                    column_1, column_2)
                self.__graph__.__proxy__ = swapped
    def element_summary(self):
        """
        Builds a sketch summary over all element values of the SArray.

        Only valid for a sketch object created from an SArray of list or
        vector(array) type. For an SArray of list type, all list values are
        treated as string for the sketch summary. For an SArray of vector
        type, the sketch summary is on FLOAT type.

        Examples
        --------
        >>> sa = graphlab.SArray([[1,2,3], [4,5]])
        >>> sa.sketch_summary().element_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |       5       |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |      5.0      |   Yes    |
        |        Mean        |      3.0      |   Yes    |
        |        Sum         |      15.0     |   Yes    |
        |      Variance      |      2.0      |   Yes    |
        | Standard Deviation | 1.41421356237 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |       5       |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+-----+-----+-----+-----+-----+
        | value | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
        +-------+-----+-----+-----+-----+-----+
        | count |  1  |  1  |  1  |  1  |  1  |
        +-------+-----+-----+-----+-----+-----+
        Quantiles:
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 5.0 | 5.0  |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.element_summary')
        with cython_context():
            element_proxy = self.__proxy__.element_summary()
            return Sketch(_proxy=element_proxy)
Exemple #37
0
    def element_summary(self):
        """
        Summarize every element value held by the sketched SArray.

        Only valid when the sketch object was created from an SArray of list
        or vector(array) type. For a list SArray all list values are treated
        as strings; for a vector SArray the summary is on FLOAT type.

        Returns
        -------
        out : Sketch
            A new Sketch built around the proxy produced by the underlying
            element_summary() call.

        Examples
        --------
        >>> sa = graphlab.SArray([[1,2,3], [4,5]])
        >>> sa.sketch_summary().element_summary()
        +--------------------+---------------+----------+
        |        item        |     value     | is exact |
        +--------------------+---------------+----------+
        |       Length       |       5       |   Yes    |
        |        Min         |      1.0      |   Yes    |
        |        Max         |      5.0      |   Yes    |
        |        Mean        |      3.0      |   Yes    |
        |        Sum         |      15.0     |   Yes    |
        |      Variance      |      2.0      |   Yes    |
        | Standard Deviation | 1.41421356237 |   Yes    |
        |  # Missing Values  |       0       |   Yes    |
        |  # unique values   |       5       |    No    |
        +--------------------+---------------+----------+
        Most frequent items:
        +-------+-----+-----+-----+-----+-----+
        | value | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
        +-------+-----+-----+-----+-----+-----+
        | count |  1  |  1  |  1  |  1  |  1  |
        +-------+-----+-----+-----+-----+-----+
        Quantiles:
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 5.0 | 5.0  |
        +-----+-----+-----+-----+-----+-----+-----+-----+------+
        """
        metric_tracker = _mt._get_metric_tracker()
        metric_tracker.track('sketch.element_summary')
        with cython_context():
            summary_proxy = self.__proxy__.element_summary()
            return Sketch(_proxy=summary_proxy)
Exemple #38
0
    def rename(self, names):
        """
        Rename the columns using the 'names' dict.  This changes the names of
        the columns given as the keys and replaces them with the names given as
        the values.

        The frame is marked dirty and the rename is delegated to the proxy of
        the backing graph (vertex fields for a vertex frame, edge fields for
        an edge frame).

        Parameters
        ----------
        names : dict[string, string]
            Dictionary of [old_name, new_name]. Dict subclasses such as
            collections.OrderedDict are accepted as well.

        Raises
        ------
        TypeError
            If 'names' is not a dictionary.
        """
        # isinstance (rather than an exact type comparison) so that dict
        # subclasses are not rejected.
        if not isinstance(names, dict):
            raise TypeError('names must be a dictionary: oldname -> newname')

        self.__is_dirty__ = True
        with cython_context():
            if self._is_vertex_frame():
                rename_fields = self.__graph__.__proxy__.rename_vertex_fields
            elif self._is_edge_frame():
                rename_fields = self.__graph__.__proxy__.rename_edge_fields
            else:
                # Neither a vertex nor an edge frame: nothing to rename.
                return
            # Materialize as lists (what Python 2's keys()/values() returned)
            # so the proxy always receives plain, index-aligned sequences
            # rather than dict views.
            old_names = list(names.keys())
            new_names = list(names.values())
            self.__graph__.__proxy__ = rename_fields(old_names, new_names)
Exemple #39
0
    def swap_columns(self, column_1, column_2):
        """
        Swaps the pair of columns that carry the given names.

        Marks the frame dirty, then asks the backing graph's proxy to swap
        the matching vertex fields (vertex frame) or edge fields (edge frame)
        and stores the proxy returned by that call back on the graph.

        Parameters
        ----------
        column_1 : string
            Name of one column taking part in the swap.

        column_2 : string
            Name of the other column taking part in the swap.
        """
        self.__is_dirty__ = True
        with cython_context():
            if self._is_vertex_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.swap_vertex_fields(
                        column_1, column_2))
            elif self._is_edge_frame():
                self.__graph__.__proxy__ = (
                    self.__graph__.__proxy__.swap_edge_fields(
                        column_1, column_2))
    def frequent_items(self):
        """
        Returns a sketched estimate of the most frequent elements in the
        SArray, based on the SpaceSaving sketch.

        The only guarantee is that every element appearing in more than 0.01%
        of the rows of the array is part of the returned set; other elements
        may show up in the result as well. Item counts are estimated using
        the CountSketch.

        Missing values are not taken into account when computing frequent
        items.

        An empty result means that all elements appear with less than 0.01%
        occurrence.

        Returns
        -------
        out : dict
            A dictionary mapping items and their estimated occurrence frequencies.
        """
        tracker = _mt._get_metric_tracker()
        tracker.track('sketch.frequent_items')

        with cython_context():
            items = self.__proxy__.frequent_items()
            return items
Exemple #41
0
    def frequent_items(self):
        """
        Estimate which elements occur most often in the SArray.

        The estimate is based on the SpaceSaving sketch, and the item counts
        are estimated using the CountSketch. It is only guaranteed that all
        elements which appear in more than 0.01% rows of the array will be
        among the returned elements; other elements may also appear.

        Missing values are not taken into account when computing frequent
        items.

        If no elements are returned, all elements appear with less than 0.01%
        occurrence.

        Returns
        -------
        out : dict
            A dictionary mapping items and their estimated occurrence frequencies.
        """
        metric_tracker = _mt._get_metric_tracker()
        metric_tracker.track('sketch.frequent_items')

        with cython_context():
            estimated_counts = self.__proxy__.frequent_items()
            return estimated_counts
Exemple #42
0
    def rename(self, names):
        """
        Rename the columns using the 'names' dict.  This changes the names of
        the columns given as the keys and replaces them with the names given as
        the values.

        The frame is marked dirty and the actual rename is carried out by the
        proxy of the backing graph: vertex fields are renamed for a vertex
        frame, edge fields for an edge frame.

        Parameters
        ----------
        names : dict[string, string]
            Dictionary of [old_name, new_name]. Any dict subclass (for
            example collections.OrderedDict) is accepted too.

        Raises
        ------
        TypeError
            If 'names' is not a dictionary.
        """
        # Use isinstance so that dict subclasses pass the check; an exact
        # type comparison would reject them.
        if not isinstance(names, dict):
            raise TypeError('names must be a dictionary: oldname -> newname')

        # Hand the proxy plain lists (as Python 2's keys()/values() did)
        # instead of dict views; both are built from the same unmodified
        # dict, so old and new names stay index-aligned.
        old_names = list(names.keys())
        new_names = list(names.values())

        self.__is_dirty__ = True
        with cython_context():
            if self._is_vertex_frame():
                graph_proxy = self.__graph__.__proxy__.rename_vertex_fields(
                    old_names, new_names)
                self.__graph__.__proxy__ = graph_proxy
            elif self._is_edge_frame():
                graph_proxy = self.__graph__.__proxy__.rename_edge_fields(
                    old_names, new_names)
                self.__graph__.__proxy__ = graph_proxy
Exemple #43
0
    def element_sub_sketch(self, keys=None):
        """
        Returns the sketch summary for the given set of keys. This is only
        applicable for sketch summary created from SArray of array or dict type.
        For dict SArray, the keys are the keys in dict value.
        For array SArray, the keys are indexes into the array value.

        The keys must be passed into original sketch_summary() call in order to
        be able to be retrieved later

        Parameters
        ----------
        keys : list of str | str | list of int | int
            The list of dictionary keys or array index to get sub sketch from.
            if not given, then retrieve all sub sketches that are available.
            A single string (or any other non-iterable value) is treated as
            one key.

        Returns
        -------
        A dictionary that maps from the key(index) to the actual sketch summary
        for that key(index). When a single key (not wrapped in a list) is
        given, the sketch summary for that key is returned directly instead.

        Raises
        ------
        ValueError
            If the given keys are not all of the same type.
        KeyError
            If a requested key has no sub sketch available.

        Examples
        --------
        >>> sa = graphlab.SArray([{'a':1, 'b':2}, {'a':4, 'd':1}])
        >>> s = sa.sketch_summary(sub_sketch_keys=['a','b'])
        >>> s.element_sub_sketch(['a'])
        {'a':
         +--------------------+-------+----------+
         |        item        | value | is exact |
         +--------------------+-------+----------+
         |       Length       |   2   |   Yes    |
         |        Min         |  1.0  |   Yes    |
         |        Max         |  4.0  |   Yes    |
         |        Mean        |  2.5  |   Yes    |
         |        Sum         |  5.0  |   Yes    |
         |      Variance      |  2.25 |   Yes    |
         | Standard Deviation |  1.5  |   Yes    |
         |  # Missing Values  |   0   |   Yes    |
         |  # unique values   |   2   |    No    |
         +--------------------+-------+----------+
         Most frequent items:
         +-------+-----+-----+
         | value | 1.0 | 4.0 |
         +-------+-----+-----+
         | count |  1  |  1  |
         +-------+-----+-----+
         Quantiles:
         +-----+-----+-----+-----+-----+-----+-----+-----+------+
         |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
         +-----+-----+-----+-----+-----+-----+-----+-----+------+
         | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 | 4.0  |
         +-----+-----+-----+-----+-----+-----+-----+-----+------+}
        """
        single_val = False
        if keys is None:
            keys = []
        else:
            # A lone string is ONE key, not an iterable of characters. On
            # Python 3 str defines __iter__, so it has to be checked
            # explicitly before the generic iterable test.
            if isinstance(keys, (str, bytes)) or not hasattr(keys, "__iter__"):
                single_val = True
                keys = [keys]
            else:
                # Materialize once so a one-shot iterable is not exhausted by
                # the type check below before it reaches the proxy.
                keys = list(keys)
            if len(set(type(key) for key in keys)) > 1:
                raise ValueError("All keys should have the same type.")

        _mt._get_metric_tracker().track('sketch.element_sub_sketch')
        with cython_context():
            ret_sketches = self.__proxy__.element_sub_sketch(keys)

            # check return key matches input key
            for key in keys:
                if key not in ret_sketches:
                    raise KeyError(
                        "Cannot retrieve element sub sketch for key '" +
                        str(key) +
                        "'. Element sub sketch can only be retrieved when the "
                        "sketch_summary object was created using the "
                        "'sub_sketch_keys' option."
                    )

            # Wrap every returned proxy in a Sketch, keyed as the proxy
            # reported it.
            ret = {}
            for key in ret_sketches:
                ret[key] = Sketch(_proxy=ret_sketches[key])

            if single_val:
                return ret[keys[0]]
            return ret
    def element_sub_sketch(self, keys = None):
        """
        Returns the sketch summary for the given set of keys. This is only
        applicable for sketch summary created from SArray of array or dict type.
        For dict SArray, the keys are the keys in dict value.
        For array SArray, the keys are indexes into the array value.

        The keys must be passed into original sketch_summary() call in order to
        be able to be retrieved later

        Parameters
        ----------
        keys : list of str | str | list of int | int
            The list of dictionary keys or array index to get sub sketch from.
            if not given, then retrieve all sub sketches that are available.
            A bare string (or any non-iterable value) counts as a single key.

        Returns
        -------
        A dictionary that maps from the key(index) to the actual sketch summary
        for that key(index). If a single key was passed on its own (not inside
        a list), only the sketch summary for that key is returned.

        Raises
        ------
        ValueError
            If the given keys do not all share one type.
        KeyError
            If no sub sketch is available for one of the requested keys.

        Examples
        --------
        >>> sa = graphlab.SArray([{'a':1, 'b':2}, {'a':4, 'd':1}])
        >>> s = sa.sketch_summary(sub_sketch_keys=['a','b'])
        >>> s.element_sub_sketch(['a'])
        {'a':
         +--------------------+-------+----------+
         |        item        | value | is exact |
         +--------------------+-------+----------+
         |       Length       |   2   |   Yes    |
         |        Min         |  1.0  |   Yes    |
         |        Max         |  4.0  |   Yes    |
         |        Mean        |  2.5  |   Yes    |
         |        Sum         |  5.0  |   Yes    |
         |      Variance      |  2.25 |   Yes    |
         | Standard Deviation |  1.5  |   Yes    |
         |  # Missing Values  |   0   |   Yes    |
         |  # unique values   |   2   |    No    |
         +--------------------+-------+----------+
         Most frequent items:
         +-------+-----+-----+
         | value | 1.0 | 4.0 |
         +-------+-----+-----+
         | count |  1  |  1  |
         +-------+-----+-----+
         Quantiles:
         +-----+-----+-----+-----+-----+-----+-----+-----+------+
         |  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99% | 100% |
         +-----+-----+-----+-----+-----+-----+-----+-----+------+
         | 1.0 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 4.0 | 4.0 | 4.0  |
         +-----+-----+-----+-----+-----+-----+-----+-----+------+}
        """
        single_val = False
        if keys is None:
            keys = []
        else:
            # Strings are iterable on Python 3 (str has __iter__), yet a bare
            # string is meant as one key, so test for it explicitly first.
            is_scalar_key = (isinstance(keys, (str, bytes))
                             or not hasattr(keys, "__iter__"))
            if is_scalar_key:
                single_val = True
                keys = [keys]
            else:
                # Copy into a list so generators and other one-shot iterables
                # survive the type check and can still be sent to the proxy.
                keys = list(keys)
            value_types = set(type(key) for key in keys)
            if len(value_types) > 1:
                raise ValueError("All keys should have the same type.")

        _mt._get_metric_tracker().track('sketch.element_sub_sketch')
        with cython_context():
            ret_sketches = self.__proxy__.element_sub_sketch(keys)

            # check return key matches input key
            for key in keys:
                if key not in ret_sketches:
                    raise KeyError("Cannot retrieve element sub sketch for key '" + str(key) + "'. Element sub sketch can only be retrieved when the sketch_summary object was created using the 'sub_sketch_keys' option.")

            # Wrap each proxy handed back in a Sketch under the same key.
            ret = {}
            for key in ret_sketches:
                ret[key] = Sketch(_proxy = ret_sketches[key])

            if single_val:
                return ret[keys[0]]
            return ret