Example #1
0
def _format_inputs(responses, predictors, weights=None):
    """
    Coerce regression inputs into consistently shaped arrays.

    responses is passed through nice_array with a flat (1D) target shape,
    predictors with the target shape (number of responses, predictors per
    response), and weights -- when supplied -- with the shape of responses.

    Returns the tuple (responses, predictors, weights). weights stays None
    when it was not supplied, except that empty responses short-circuit to
    three empty numpy arrays.
    """

    if not len(responses):
        return numpy.array([]), numpy.array([]), numpy.array([])

    responses = nice_array(responses, shape=numpy.size(responses))

    # Floor division keeps the column count an integer. True division (the
    # Python 3 meaning of "/") would yield a float, which is not a valid
    # array dimension.
    n_predictors = numpy.size(predictors) // len(responses)
    predictors = nice_array(predictors, shape=(len(responses), n_predictors))

    if weights is not None:
        weights = nice_array(weights, shape=numpy.shape(responses))

    return responses, predictors, weights
Example #2
0
    def update(self, values, weights=None, negative_weights=None):
        """
        Fold new observations into the running multivariate statistics.

        Can update one datapoint at a time (in which case values is an array
        and weights must be a scalar), or a set (in which case values are
        rows of a 2D array, and weights is a 1D array).

        If negative_weights are specified, values get multiplied by the sign of
        the corresponding negative_weight, and weights get set to
        abs(negative_weights).

        Raises AssertionError if both weights and negative_weights are given,
        if a weighted object receives no weights, or if an unweighted object
        receives weights.
        """

        # Floor division keeps the row count an integer; true division would
        # hand a float dimension to nice_array on Python 3.
        nrows = numpy.size(values) // self.nvars
        values = arrays.nice_array(values, logger=self.logger,
                                   shape=(nrows, self.nvars))

        if self.weighted:
            if negative_weights is not None:
                if weights is not None:
                    raise AssertionError('Can not specify both weights and negative weights')
                negative_weights = arrays.nice_array(negative_weights, shape=len(values),
                                                     logger=self.logger)
                weights = abs(negative_weights)
                # Flip the sign of every variable in a row whose weight is negative.
                values = values.copy()*numpy.sign(negative_weights)[:, numpy.newaxis]
            elif weights is None:
                raise AssertionError('Weighted statistics object received no weights in update.')
            else:
                weights = arrays.nice_array(weights, shape=len(values), logger=self.logger)
        else:
            if weights is not None:
                raise AssertionError('Unweighted statistics object received weights in update.')
            weights = numpy.ma.ones(len(values))

        for i in range(self.nvars):
            # Per-variable statistics are delegated to the univariate accumulators.
            if self.weighted: self.Sparse[i].update(values[:, i], weights)
            else: self.Sparse[i].update(values[:, i])

            # Pairwise accumulators, filled for j < i only.
            for j in range(i):
                # A row counts for the pair only if both variables and the weight are unmasked.
                valid = ~(values.mask[:, i] | values.mask[:, j] | weights.mask)
                self.count_ij[i, j] += numpy.sum(valid)
                self.sum_ijw[i, j] += numpy.sum(values[:, i] * values[:, j] * weights)
                self.sum_wij[i, j] += numpy.sum(weights[valid])
                self.sum_wwij[i, j] += numpy.sum(weights[valid] ** 2)

        # size counts every row received; count only rows with at least one
        # unmasked variable and an unmasked weight.
        self.size += len(weights)
        valid = numpy.any(~values.mask, axis=1) & (~weights.mask)
        self.count += numpy.sum(valid)
        self.sum_w += numpy.sum(weights[valid])
        self.sum_ww += numpy.sum(weights[valid] ** 2)

        # None means "do not record updates"; any other value enables recording.
        if self.last_update is not None: self.last_update = (values, weights, valid)
        if self.all_update is not None: self.all_update.append((values, weights, valid))
Example #3
0
    def stats(cls, data, weights=None, axis=None, step=1, sliced=None, select=None,
              overlay=None, split=None, buckets=None, group=None,
              labels=None, label_index=None, label_all=None, label_other='Other',
              negative_weights=None, IDs=None,
              datab=None, name=None, formats=None, **opts):
        """
        Calls Class(data).compute(), handling complexities in the form of data.

        data can be two dimensional, and axis can be 0 or 1. In this case,
        a list of statistics-records is returned, in Datab form (unless datab=False).

        overlay:
        run stats only for records selected by this mask.

        split:
        run stats for records selected by this mask, and for the others
        (plus all records if label_all is set). Does not work with axis option.

        buckets:
        run stats for records selected by each of the masks in this list of
        masks (plus all records if label_all is set). Does not work with axis
        option.

        group:
        bucket stats by values in this field. Cannot be combined with
        buckets or split.

        sliced:
        run stats for records selected by this slice.

        select:
        run stats for records selected by this boolean mask.

        step:
        When axis option is specified, clump these many rows together for each row
        stat to be computed. This can optionally be a list of steps, in which case
        each clump can have variable number of rows (the last entry is reused
        once the list is exhausted). Every step must be positive.

        label_all:
        Relevant only when axis or split/buckets option present. If not None,
        compute stats over entire dataset, in addition to for each index of the
        axis or split/buckets, and place results in an entry of output with this
        label.

        label_other:
        Relevant only when buckets option present. If not None, compute stats
        over part of dataset not in any bucket, in addition to for each bucket,
        and place results in an entry of output with this label.

        labels:
        list to use to add labels to each entry of output. Relevant only when
        there are multiple lines of output.

        label_index:
        like labels, except use label_index[::step].

        negative_weights:
        alternative to weights: data gets multiplied by the sign of each
        negative_weight, and abs(negative_weights) is used as the weights.

        name:
        in the header, label the key column with this string.

        datab:
        True forces Datab output, False forces plain results; None picks
        plain results for a single record and Datab for multiple records.

        Remaining keyword options (**opts) are forwarded to the class
        constructor for every computed record.

        Raises AssertionError for unsupported option combinations, and
        ValueError for unsupported axis values, mismatched 1D weights or a
        non-positive step.
        """

        if group is not None:
            # group generates its own buckets, so explicitly passed buckets
            # (or a split mask) would otherwise be silently discarded.
            if buckets is not None or split is not None:
                raise AssertionError('group, buckets and split options not supported together.')
            label_other = None
            labels, buckets = [], []
            for group_name in numpy.unique(group):
                labels.append(group_name)
                buckets.append(group == group_name)
            if name is None: name = 'group'

        if split is not None:
            if buckets is not None:
                raise AssertionError('group, buckets and split options not supported together.')
            # A split is a single bucket whose complement is reported as "other".
            buckets = [split]
            if labels is None:
                labels = ['True']
                label_other = 'False'
            else:
                label_other = labels[1]
                labels = [labels[0]]
            if name is None: name = 'condn'
        elif buckets is not None:
            if labels is None:
                labels = [str(d + 1) for d in range(len(buckets))]
            if name is None: name = 'bucket'

        data = arrays.nice_array(data)
        if weights is not None: weights = arrays.nice_array(weights)
        if negative_weights is not None:
            if weights is not None: raise AssertionError('Can not specify both weights and negative weights')
            # Normalise first so that plain sequences support abs().
            negative_weights = arrays.nice_array(negative_weights)
            weights = abs(negative_weights)
            data = data.copy()*numpy.sign(negative_weights)

        # --- Plain case: no axis and no clumping ---------------------------
        if axis is None and numpy.isscalar(step) and step == 1:
            data, weights, IDs = \
                arrays.select([data, weights, IDs],
                              sliced=sliced, overlay=overlay, select=select)
            if buckets is None:
                results = cls(data, weights=weights, IDs=IDs, **opts).compute()
                if datab is True: return Datab([results], formats=formats)
                else: return results

            if label_all:
                all_labels = [label_all]
                results = [cls.stats(data, weights=weights, IDs=IDs, formats=formats,
                                     **opts)]
            else: all_labels, results = [], []

            # "other" starts as everything and loses each bucket's records in turn.
            if label_other: other = numpy.ones(numpy.shape(data), dtype=bool)
            buckets = arrays.select(buckets,
                                    sliced=sliced, overlay=overlay, select=select)
            all_labels.extend(labels)

            for b in buckets:
                results.append(cls.stats(data, weights=weights, IDs=IDs, overlay=b,
                                         formats=formats, **opts))
                if label_other: other[b] = False
            if label_other:
                all_labels.append(label_other)
                results.append(cls.stats(data, weights=weights, IDs=IDs,
                                         overlay=other, formats=formats, **opts))

            if datab is False: return results
            else: return Datab(results, labels=all_labels, name=name, formats=formats)

        # --- Axis / step case: one record per clump of rows ----------------
        if buckets is not None:
            raise AssertionError('split/buckets option not supported with axis/step option.')

        data, weights, IDs = arrays.select([data, weights, IDs],
                                           sliced=sliced, overlay=overlay, select=select)

        if cls != Multivariate:
            if axis is not None and (axis > 1 or axis < 0 or data.ndim != 2):
                raise ValueError('Got unsupported axis option value that is ' +
                                 'not 0 or 1; or data is not two-dimensional.')
            if axis == 0:
                # Iterate over columns by turning them into rows.
                data = data.transpose()
                if overlay is not None: overlay = overlay.transpose()
                if IDs is not None: IDs = IDs.transpose()
                if weights is not None: weights = weights.transpose()
        elif axis is not None and axis != 0:
            raise ValueError('Axis option value 0 is the only one supported for Multivariate stats.')

        if weights is not None and weights.ndim == 1 and data.ndim == 2:
            if len(weights) != numpy.shape(data)[1]:
                raise ValueError('shape mismatch: 1D weights cannot be broadcast to shape of values')
            sys.stderr.write('stats.stats: Broadcasting 1D weights for 2D values.\n')
            weights = arrays.extend(weights, numpy.shape(data)[0]).T

        if label_all is not None:
            results = [cls(data, weights=weights, IDs=IDs, **opts).compute()]
            all_labels = [label_all]
        else:
            results = []
            all_labels = []

        start_idx = 0
        count = 0
        while start_idx < len(data):
            # A list of steps is consumed in order; its last entry repeats.
            if numpy.isscalar(step): clump = step
            else: clump = step[min(count, len(step)-1)]
            if clump <= 0:
                # A non-positive step would never advance start_idx.
                raise ValueError('step must be positive.')
            end_idx = start_idx + clump

            row_data, row_weights, row_IDs = \
                arrays.select([data, weights, IDs], sliced=(start_idx, end_idx, 1))

            # Forward **opts so each clump is computed with the same options
            # as the label_all record above.
            results.append(cls.stats(row_data, weights=row_weights, IDs=row_IDs, **opts))

            if labels is not None and len(labels): all_labels.append(labels[count])
            elif label_index is not None: all_labels.append(label_index[start_idx] + '-')
            else: all_labels.append(str(start_idx) + '-')

            start_idx = end_idx
            count += 1

        if datab is False: return results
        else: return Datab(results, labels=all_labels, name=name or 'key', formats=formats)
Example #4
0
    def update(self, values, weights=None, IDs=None, negative_weights=None):
        """
        Fold new observations into the running univariate statistics.

        values, [negative_]weights and IDs may either be all arrays or all scalars.

        If negative_weights are specified, values get multiplied by the sign of
        the corresponding negative_weight, and weights get set to
        abs(negative_weights).

        datapoints with either the value or the weight being masked are ignored,
        as are datapoints with weight <= 0. Though these datapoints affect
        the 'size' statistic (but not the 'count' statistic).

        IDs default to consecutive integers continuing from the current size.

        Nothing is returned. When self.last_update is not None, the flattened
        masked values, weights and IDs of this call are stored there instead
        (or three empty lists if no datapoint was usable).

        Raises AssertionError if both weights and negative_weights are given,
        if a weighted object receives no weights, or if an unweighted object
        receives weights.
        """

        values = arrays.nice_array(values, logger=self.logger)
        mask = values.mask.copy()

        if self.weighted:
            if negative_weights is not None:
                if weights is not None: raise AssertionError('Can not specify both weights and negative weights')
                # numpy.abs, unlike the builtin abs, also accepts plain sequences.
                weights = numpy.abs(negative_weights)
                values = values.copy()*numpy.sign(negative_weights)
            elif weights is None:
                raise AssertionError('Weighted statistics object received no weights in update.')
            weights = arrays.nice_array(weights, shape=values.shape, logger=self.logger)
            mask |= weights.mask
            # Following contortion to avoid bogus
            #    "RuntimeWarning: Invalid value encountered in less_equal"
            mask[~mask] = (weights[~mask] <= 0)
            fweights = weights.flatten()
        else:
            if weights is not None:
                raise AssertionError('Unweighted statistics object received weights in update.')
            fweights = numpy.ma.ones(values.size, dtype=float)

        # Values and weights share one mask so every sum below skips the same datapoints.
        fweights.mask = mask.flatten()
        fvalues = values.flatten()
        fvalues.mask = fweights.mask

        if IDs is None: IDs = numpy.arange(fvalues.size, dtype=int) + self.size
        elif not isinstance(IDs, numpy.ndarray): IDs = numpy.array(IDs)

        # size counts every datapoint received, usable or not.
        self.size += fvalues.size
        count = fvalues.count()
        if count == 0:
            if self.last_update is not None: self.last_update = ([], [], [])
            return

        min_index = numpy.ma.argmin(fvalues)
        max_index = numpy.ma.argmax(fvalues)
        if self.count == 0:
            # First usable data: extremes are taken as-is.
            self.statistics['min'] = (fvalues[min_index], IDs.flat[min_index])
            self.statistics['max'] = (fvalues[max_index], IDs.flat[max_index])
        else:
            if fvalues[min_index] < self.statistics['min'][0]:
                self.statistics['min'] = (fvalues[min_index], IDs.flat[min_index])
            if fvalues[max_index] > self.statistics['max'][0]:
                self.statistics['max'] = (fvalues[max_index], IDs.flat[max_index])

        self.count += count
        self.sum_xw += numpy.ma.sum(fvalues * fweights)
        self.sum_xxw += numpy.ma.sum(fvalues * fvalues * fweights)
        self.sum_w += numpy.ma.sum(fweights)
        self.sum_ww += numpy.ma.sum(fweights * fweights)

        if self.last_update is not None:
            self.last_update = (fvalues, fweights, IDs.flat)
Example #5
0
def regress(
    responses,
    predictors,
    weights=None,
    constant=True,
    forecast=False,
    errors=False,
    axis=None,
    step=1,
    sliced=None,
    select=None,
    overlay=None,
    split=None,
    buckets=None,
    group=None,
    labels=None,
    label_index=None,
    label_all="All",
    label_other="Other",
    datab=None,
    names=None,
    name=None,
    formats=None,
):
    """
    Wrapper around Regress(*args, **kwargs).compute(), handling some additional
    options.

    data can be two dimensional, and axis can be 0 or 1. In this case,
    a list of statistics-records is returned, in Datab form.

    split:
    run stats for records selected by this mask, and for the others (plus
    all records if label_all is not None). Does not work with axis option.

    buckets:
    run stats for records selected by each of the masks in this list of
    masks (plus all records if label_all is not None). Does not work with
    axis option.

    group:
    bucket stats by values in this field. Cannot be combined with buckets
    or split.

    sliced, overlay, select:
    run stats for records selected by this slice, overlay or selection.

    step:
    When axis option is specified, clump these many rows together for each row
    stat to be computed. This can optionally be a list of steps, in which case
    each clump can have variable number of rows (the last entry is reused
    once the list is exhausted). Every step must be positive.

    label_all:
    Relevant only when axis or split/buckets option present. If not None,
    compute stats over entire dataset, in addition to for each index of the
    axis or split/buckets, and place results in an entry of output with this
    label.

    label_other:
    Relevant only when buckets option present. If not None, compute stats
    over part of dataset not in any bucket, in addition to for each bucket,
    and place results in an entry of output with this label.

    labels:
    list to use to add labels to each entry of output. Relevant only when
    there are multiple lines of output.

    label_index:
    like labels, except use label_index[::step].

    names:
    in the header, label the predictor columns with these strings.

    name:
    in the header, label the key column with this string.

    datab:
    Return results in datab format rather than as a list, if appropriate.
    Defaults to True unless forecast or errors is requested; when True,
    forecast and errors are switched off.

    formats:
    Passed to Datab to format the output.

    Raises AssertionError for unsupported option combinations, IndexError
    when clumping is requested without a valid axis or two-dimensional data,
    and ValueError for a non-positive step.
    """

    if datab is None:
        datab = not (forecast) and not (errors)
    if datab == True:
        # datab output cannot hold forecasts or errors per datapoint
        forecast = False
        errors = False

    def _build(resp, pred, wts):
        # One place for the Regress construction shared by every code path.
        return Regress(resp, pred, weights=wts, constant=constant, names=names, store_last=True)

    if group is not None:
        # group generates its own buckets, so explicitly passed buckets (or a
        # split mask) would otherwise be silently discarded.
        if buckets is not None or split is not None:
            raise AssertionError("group, buckets and split options not supported together.")
        label_other = None
        labels, buckets = [], []
        for group_name in numpy.unique(group):
            labels.append(group_name)
            buckets.append(group == group_name)
        if name is None:
            name = "group"

    if split is not None:
        if buckets is not None:
            raise AssertionError("group, buckets and split options not supported together.")
        # A split is a single bucket whose complement is reported as "other".
        buckets = [split]
        if labels is None:
            labels = ["True"]
            label_other = "False"
        else:
            label_other = labels[1]
            labels = [labels[0]]
        if name is None:
            name = "condn"
    elif buckets is not None:
        if labels is None:
            labels = [str(d + 1) for d in range(len(buckets))]
        if name is None:
            name = "bucket"

    # Leave absent weights as None instead of wrapping None in an array.
    if weights is not None:
        weights = nice_array(weights)
    responses, predictors, weights = arrays.select(
        [nice_array(responses), nice_array(predictors), weights],
        sliced=sliced,
        overlay=overlay,
        select=select,
    )

    results = []
    if label_all is not None:
        results.append(_build(responses, predictors, weights).compute(forecast=forecast, errors=errors))

    # --- Plain case: no axis and no clumping -------------------------------
    if axis is None and numpy.isscalar(step) and step == 1:
        if buckets is None:
            if not results:
                # label_all=None skipped the whole-dataset fit above, but it is
                # the only result this case has to return.
                results.append(_build(responses, predictors, weights).compute(forecast=forecast, errors=errors))
            if not datab:
                return results[0]
            return Datab(results, formats=formats)

        if label_all is not None:
            results[-1]["label"] = label_all
        if label_other:
            # "other" starts as everything and loses each bucket's records in turn.
            other = numpy.ones(numpy.shape(responses), dtype=bool)
        buckets = arrays.select(buckets, sliced=sliced, overlay=overlay, select=select)

        for b, label in zip(buckets, labels):
            respb, predb, wtb = arrays.select([responses, predictors, weights], select=b)
            results.append(_build(respb, predb, wtb).compute(forecast=forecast, errors=errors))
            results[-1]["label"] = label
            if label_other:
                other[b] = False

        if label_other:
            respb, predb, wtb = arrays.select([responses, predictors, weights], select=other)
            results.append(_build(respb, predb, wtb).compute(forecast=forecast, errors=errors))
            results[-1]["label"] = label_other

        if datab is False:
            return results
        return Datab(results, name=name or "key", formats=formats)

    # --- Axis / step case: one regression per clump of rows ----------------
    if buckets is not None:
        raise AssertionError("split/buckets option not supported with axis/step option.")
    if label_all is not None:
        results[-1]["label"] = label_all

    # axis may still be None here (step != 1 without an axis); comparing None
    # with an int raises TypeError on Python 3, so reject it explicitly.
    if axis is None or axis > 1 or axis < 0 or numpy.ndim(responses) != 2:
        raise IndexError("Got unsupported axis option value that is " + "not 0 or 1; or data is not two-dimensional")

    if axis == 0:
        # Iterate over columns by turning them into rows.
        responses = responses.transpose()
        predictors = predictors.transpose()
        if weights is not None:
            weights = weights.transpose()

    start_idx = 0
    count = 0
    while start_idx < len(responses):
        # A list of steps is consumed in order; its last entry repeats.
        if numpy.isscalar(step):
            clump = step
        else:
            clump = step[min(count, len(step) - 1)]
        if clump <= 0:
            # A non-positive step would never advance start_idx.
            raise ValueError("step must be positive.")
        clump_start = start_idx
        end_idx = clump_start + clump

        row_responses, row_predictors, row_weights = arrays.select(
            [responses, predictors, weights], sliced=(clump_start, end_idx, 1)
        )
        r = _build(row_responses, row_predictors, row_weights)
        start_idx = end_idx
        count += 1
        # Clumps without usable datapoints produce no output row.
        if not r.Multivariate.count:
            continue

        results.append(r.compute(forecast=forecast, errors=errors))
        # Label each row by where its clump starts.
        if labels is not None and len(labels):
            results[-1]["label"] = labels[count - 1]
        elif label_index is not None:
            results[-1]["label"] = label_index[clump_start] + "-"
        else:
            results[-1]["label"] = str(clump_start) + "-"

    if datab is False:
        return results
    return Datab(results, name=name or "key", formats=formats)