Esempio n. 1
0
    def min(self):
        """
        Get minimum numeric value in the RDD.

        Returns None on an empty RDD.

        Raises
        ------
        TypeError
            If the element type is not numeric.
        """
        self._entry()
        # The emptiness check comes first so that an empty RDD yields None
        # regardless of its element type.
        if self._count() == 0:  # action
            return None
        if not is_numeric_type(self.elem_type):
            # The message names this method (it previously said 'sum').
            raise TypeError('min: non numeric type')
        return self._rdd.min()  # action
Esempio n. 2
0
    def mean(self):
        """
        Compute the mean of the values held by the RDD.

        An empty RDD yields None. A non-empty RDD whose element type is
        not numeric raises TypeError.
        """
        self._entry()
        # Emptiness is tested before the element type, so an empty RDD
        # never raises.
        if self._count() == 0:  # action
            return None
        if is_numeric_type(self.elem_type):
            return self._rdd.mean()  # action
        raise TypeError('mean: non numeric type')
Esempio n. 3
0
    def mean(self):
        """
        Average of every value in the RDD.

        Returns None when the RDD has no elements; raises TypeError when
        a non-empty RDD has a non-numeric element type.
        """
        self._entry()
        n_elems = self._count()  # action
        if n_elems == 0:
            return None
        numeric = is_numeric_type(self.elem_type)
        if not numeric:
            raise TypeError('mean: non numeric type')
        result = self._rdd.mean()  # action
        return result
Esempio n. 4
0
    def min(self):
        """
        Get minimum numeric value in the RDD.

        Returns None on an empty RDD.

        Raises
        ------
        TypeError
            If the element type is not numeric.
        """
        self._entry()
        # The emptiness check comes first so that an empty RDD yields None
        # regardless of its element type.
        if self._count() == 0:  # action
            return None
        if not is_numeric_type(self.elem_type):
            # The message names this method (it previously said 'sum').
            raise TypeError('min: non numeric type')
        return self._rdd.min()  # action
Esempio n. 5
0
    def sum(self):
        """
        Sum of all values in the RDD.

        Returns None on an empty RDD. The behavior depends on the element
        type:

        * numeric: the values are added with the RDD's own ``sum()``.
        * ``array.array`` and ``list``: elements are added position by
          position. ``zip`` stops at the shorter operand, so operands of
          different lengths are silently truncated to the shorter one.
        * ``dict``: values are added per key; only keys present in both
          operands of a reduction step are kept.

        For large values, this may overflow without warning.

        Raises
        ------
        TypeError
            If the element type is none of the above (e.g. strings).
        """
        self._entry()
        if self._count() == 0:  # action
            return None

        if is_numeric_type(self.elem_type):
            return self._rdd.sum()  # action

        if self.elem_type is array.array:

            def array_sum(x, y):
                # A typecode mismatch is only reported; the sum is still
                # attempted and the result uses x's typecode.
                if x.typecode != y.typecode:
                    logging.warning('Sum: arrays are not compatible')
                return array.array(x.typecode,
                                   [a + b for a, b in zip(x, y)])

            return self._rdd.reduce(array_sum)  # action

        if self.elem_type is list:

            def list_sum(x, y):
                return [a + b for a, b in zip(x, y)]

            return self._rdd.reduce(list_sum)  # action

        if self.elem_type is dict:

            def dict_sum(x, y):
                # Keys come from the intersection, so both lookups succeed.
                return {k: x[k] + y[k] for k in set(x) & set(y)}

            return self._rdd.reduce(dict_sum)  # action

        raise TypeError('sum: non numeric type')
    def construct_from_xarray(self, xa, sub_sketch_keys=None):
        """
        Initialize this object's summary state from `xa`.

        Stores the dtype of `xa`, the number of non-missing values, a
        sketch type ('numeric', 'date' or 'non-numeric') derived from the
        dtype, the full RDD of `xa`, and the cached RDD of non-missing
        values.

        Raises NotImplementedError if `sub_sketch_keys` is not None.
        """
        self._entry(sub_sketch_keys=sub_sketch_keys)
        if sub_sketch_keys is not None:
            raise NotImplementedError('sub_sketch_keys mode not implemented')

        def is_defined(value):
            return not is_missing(value)

        # these are not going through the xrdd layer -- should they?
        non_missing = xa.to_rdd().filter(is_defined)
        non_missing.cache()
        self.dtype = xa.dtype()
        self.count = non_missing.count()

        # Classify the dtype; the date check only runs for non-numeric types.
        if util.is_numeric_type(self.dtype):
            kind = 'numeric'
        elif util.is_date_type(self.dtype):
            kind = 'date'
        else:
            kind = 'non-numeric'
        self.sketch_type = kind

        # compute others later if needed
        self._rdd = xa.to_rdd()
        self.defined = non_missing
Esempio n. 7
0
    def var(self, ddof):
        """
        Variance of all the values in the RDD.

        Parameters
        ----------
        ddof : int
            Delta degrees of freedom. 0 selects the RDD's ``variance()``,
            any other accepted value selects ``sampleVariance()``.

        Returns None on an empty RDD.

        Raises
        ------
        TypeError
            If the element type is not numeric.
        ValueError
            If `ddof` is negative, greater than 1, or not smaller than
            the number of elements.
        """
        self._entry(ddof=ddof)
        count = self._count()  # action
        if count == 0:
            return None
        if not is_numeric_type(self.elem_type):
            # The message names this method (it previously said 'mean').
            raise TypeError('var: non numeric type')
        if ddof < 0 or ddof > 1 or ddof >= count:
            # The message names this method (it previously said 'std').
            raise ValueError('var: invalid ddof {}'.format(ddof))
        if ddof == 0:
            return self._rdd.variance()  # action
        return self._rdd.sampleVariance()  # action
Esempio n. 8
0
    def var(self, ddof):
        """
        Variance of all the values in the RDD.

        Parameters
        ----------
        ddof : int
            Delta degrees of freedom. 0 selects the RDD's ``variance()``,
            any other accepted value selects ``sampleVariance()``.

        Returns None on an empty RDD.

        Raises
        ------
        TypeError
            If the element type is not numeric.
        ValueError
            If `ddof` is negative, greater than 1, or not smaller than
            the number of elements.
        """
        self._entry(ddof=ddof)
        count = self._count()  # action
        if count == 0:
            return None
        if not is_numeric_type(self.elem_type):
            # The message names this method (it previously said 'mean').
            raise TypeError('var: non numeric type')
        if ddof < 0 or ddof > 1 or ddof >= count:
            # The message names this method (it previously said 'std').
            raise ValueError('var: invalid ddof {}'.format(ddof))
        if ddof == 0:
            return self._rdd.variance()  # action
        return self._rdd.sampleVariance()  # action
Esempio n. 9
0
    def sum(self):
        """
        Sum of all values in the RDD.

        Returns None on an empty RDD. The behavior depends on the element
        type:

        * numeric: the values are added with the RDD's own ``sum()``.
        * ``array.array`` and ``list``: elements are added position by
          position. ``zip`` stops at the shorter operand, so operands of
          different lengths are silently truncated to the shorter one.
        * ``dict``: values are added per key; only keys present in both
          operands of a reduction step are kept.

        For large values, this may overflow without warning.

        Raises
        ------
        TypeError
            If the element type is none of the above (e.g. strings).
        """
        self._entry()
        if self._count() == 0:  # action
            return None

        if is_numeric_type(self.elem_type):
            return self._rdd.sum()  # action

        if self.elem_type is array.array:

            def array_sum(x, y):
                # A typecode mismatch is only reported; the sum is still
                # attempted and the result uses x's typecode.
                if x.typecode != y.typecode:
                    logging.warning('Sum: arrays are not compatible')
                return array.array(x.typecode,
                                   [a + b for a, b in zip(x, y)])

            return self._rdd.reduce(array_sum)  # action

        if self.elem_type is list:

            def list_sum(x, y):
                return [a + b for a, b in zip(x, y)]

            return self._rdd.reduce(list_sum)  # action

        if self.elem_type is dict:

            def dict_sum(x, y):
                # Keys come from the intersection, so both lookups succeed.
                return {k: x[k] + y[k] for k in set(x) & set(y)}

            return self._rdd.reduce(dict_sum)  # action

        raise TypeError('sum: non numeric type')