def _format_inputs(responses, predictors, weights=None):
    """Coerce regression inputs into consistently shaped arrays.

    ``responses`` is flattened to one dimension, ``predictors`` is reshaped
    to one row per response, and ``weights`` (when given) is shaped like the
    flattened responses.  All coercion is delegated to ``nice_array``.

    Returns a ``(responses, predictors, weights)`` tuple.  For empty
    ``responses`` three empty ``numpy`` arrays are returned (even when
    ``weights`` is None); otherwise ``weights`` stays None if it was None.
    """
    if not len(responses):
        return numpy.array([]), numpy.array([]), numpy.array([])
    responses = nice_array(responses, shape=numpy.size(responses))
    n_rows = len(responses)
    # Floor division: a shape dimension must be an integer, and "/" yields a
    # float on Python 3.
    n_cols = numpy.size(predictors) // n_rows
    predictors = nice_array(predictors, shape=(n_rows, n_cols))
    if weights is not None:
        weights = nice_array(weights, shape=numpy.shape(responses))
    return responses, predictors, weights
def update(self, values, weights=None, negative_weights=None):
    """
    Accumulate one datapoint or a batch of datapoints.

    Can update one datapoint at a time (in which case values is an array
    and weights must be a scalar), or a set (in which case values are rows
    of a 2D array, and weights is a 1D array).

    If negative_weights are specified, values get multiplied by the sign
    of the corresponding negative_weight, and weights get set to
    abs(negative_weights).

    Raises AssertionError when both weights and negative_weights are
    given, when a weighted object receives neither, or when an unweighted
    object receives weights.  Returns None; only the accumulators on self
    are updated.
    """
    # Floor division: the row count is a shape dimension and must be an
    # integer ("/" yields a float on Python 3).
    n_rows = numpy.size(values) // self.nvars
    values = arrays.nice_array(values, logger=self.logger,
                               shape=(n_rows, self.nvars))
    if self.weighted:
        if negative_weights is not None:
            if weights is not None:
                raise AssertionError('Can not specify both weights and negative weights')
            negative_weights = arrays.nice_array(negative_weights, shape=len(values),
                                                 logger=self.logger)
            weights = abs(negative_weights)
            # Flip the sign of every variable in a row whose weight is negative.
            values = values.copy()*numpy.sign(negative_weights)[:, numpy.newaxis]
        elif weights is None:
            raise AssertionError('Weighted statistics object received no weights in update.')
        else:
            weights = arrays.nice_array(weights, shape=len(values), logger=self.logger)
    else:
        if weights is not None:
            raise AssertionError('Unweighted statistics object received weights in update.')
        weights = numpy.ma.ones(len(values))

    for i in range(self.nvars):
        # Per-variable accumulator first, then the cross terms with every
        # earlier variable (only j < i is filled in).
        if self.weighted:
            self.Sparse[i].update(values[:, i], weights)
        else:
            self.Sparse[i].update(values[:, i])
        for j in range(i):
            # A row contributes to pair (i, j) only if both values and the
            # weight are unmasked.
            valid = ~(values.mask[:, i] | values.mask[:, j] | weights.mask)
            self.count_ij[i, j] += numpy.sum(valid)
            self.sum_ijw[i, j] += numpy.sum(values[:, i] * values[:, j] * weights)
            self.sum_wij[i, j] += numpy.sum(weights[valid])
            self.sum_wwij[i, j] += numpy.sum(weights[valid] ** 2)

    # size counts every row seen; count only rows with at least one
    # unmasked value and an unmasked weight.
    self.size += len(weights)
    valid = numpy.any(~values.mask, axis=1) & (~weights.mask)
    self.count += numpy.sum(valid)
    self.sum_w += numpy.sum(weights[valid])
    self.sum_ww += numpy.sum(weights[valid] ** 2)

    # A non-None last_update / all_update means the caller asked for the
    # update inputs to be retained.
    if self.last_update is not None:
        self.last_update = (values, weights, valid)
    if self.all_update is not None:
        self.all_update.append((values, weights, valid))
def stats(cls, data, weights=None, axis=None, step=1, sliced=None, select=None,
          overlay=None, split=None, buckets=None, group=None, labels=None,
          label_index=None, label_all=None, label_other='Other',
          negative_weights=None, IDs=None, datab=None, name=None, formats=None,
          **opts):
    """
    Calls Class(data).compute(), handling complexities in the form of data.

    data can be two dimensional, and axis can be 0 or 1. In this case, a list
    of statistics-records is returned, in Datab form (unless datab=False).

    overlay: run stats only for records selected by this mask.
    split: run stats for records selected by this mask and for the others.
        Does not work with axis option.
    buckets: run stats for records selected by each of the masks in this
        list of masks. Does not work with axis option.
    group: bucket stats by values in this field.
    sliced: run stats for records selected by this slice.
    select: run stats for records selected by this boolean mask.
    step: When axis option is specified, clump these many rows together for
        each row stat to be computed. This can optionally be a list of
        steps, in which case each clump can have variable number of rows
        (the last entry is reused once the list is exhausted).
    label_all: Relevant only when axis or split/buckets option present. If
        not None, compute stats over entire dataset, in addition to for each
        index of the axis or split/buckets, and place results in an entry of
        output with this label.
    label_other: Relevant only when buckets option present. If not None,
        compute stats over part of dataset not in any bucket, in addition to
        for each bucket, and place results in an entry of output with this
        label.
    labels: list to use to add labels to each entry of output. Relevant only
        when there are multiple lines of output.
    label_index: like labels, except the entry at each clump's starting row
        is used.
    negative_weights: signed weights; data is multiplied by their sign and
        their absolute value is used as weights. Can not be combined with
        weights.
    name: in the header, label the key column with this string.
    opts: passed through to the cls constructor.

    Raises AssertionError for unsupported option combinations and ValueError
    for an unsupported axis, a weights/data shape mismatch, or a
    non-positive step.
    """
    if group is not None:
        if buckets is not None and split is not None:
            raise AssertionError('group, buckets and split options not supported together.')
        # One bucket (boolean mask) per distinct group value.
        label_other = None
        labels, buckets = [], []
        for group_name in numpy.unique(group):
            labels.append(group_name)
            buckets.append(group == group_name)
        if name is None:
            name = 'group'

    if split is not None:
        if buckets is not None:
            raise AssertionError('group, buckets and split options not supported together.')
        # A split is a single bucket plus its complement ("other").
        buckets = [split]
        if labels is None:
            labels = ['True']
            label_other = 'False'
        else:
            label_other = labels[1]
            labels = [labels[0]]
        if name is None:
            name = 'condn'
    elif buckets is not None:
        if labels is None:
            labels = [str(d + 1) for d in range(len(buckets))]
        if name is None:
            name = 'bucket'

    data = arrays.nice_array(data)
    if weights is not None:
        weights = arrays.nice_array(weights)
    if negative_weights is not None:
        if weights is not None:
            raise AssertionError('Can not specify both weights and negative weights')
        # Coerce first so plain sequences work with abs() / sign().
        negative_weights = arrays.nice_array(negative_weights)
        weights = abs(negative_weights)
        data = data.copy()*numpy.sign(negative_weights)

    if axis is None and numpy.isscalar(step) and step == 1:
        data, weights, IDs = \
            arrays.select([data, weights, IDs], sliced=sliced, overlay=overlay, select=select)
        if buckets is None:
            # Plain case: a single statistics record.
            results = cls(data, weights=weights, IDs=IDs, **opts).compute()
            if datab is True:
                return Datab([results], formats=formats)
            return results

        if label_all:
            all_labels = [label_all]
            results = [cls.stats(data, weights=weights, IDs=IDs, formats=formats, **opts)]
        else:
            all_labels, results = [], []
        if label_other:
            # Starts all-True; every bucket's members get knocked out below.
            other = numpy.ones(numpy.shape(data), dtype=bool)
        buckets = arrays.select(buckets, sliced=sliced, overlay=overlay, select=select)
        all_labels.extend(labels)
        for b in buckets:
            results.append(cls.stats(data, weights=weights, IDs=IDs, overlay=b,
                                     formats=formats, **opts))
            if label_other:
                other[b] = False
        if label_other:
            all_labels.append(label_other)
            results.append(cls.stats(data, weights=weights, IDs=IDs, overlay=other,
                                     formats=formats, **opts))
        if datab is False:
            return results
        return Datab(results, labels=all_labels, name=name, formats=formats)

    # From here on: the axis / step (clumped rows) path.
    if buckets is not None:
        raise AssertionError('split/buckets option not supported with axis/step option.')
    data, weights, IDs = arrays.select([data, weights, IDs], sliced=sliced,
                                       overlay=overlay, select=select)
    if cls != Multivariate:
        if axis is not None and (axis > 1 or axis < 0 or data.ndim != 2):
            raise ValueError('Got unsupported axis option value that is ' +
                             'not 0 or 1; or data is not two-dimensional.')
        if axis == 0:
            data = data.transpose()
            if overlay is not None:
                overlay = overlay.transpose()
            if IDs is not None:
                IDs = IDs.transpose()
            if weights is not None:
                weights = weights.transpose()
    elif axis is not None and axis != 0:
        raise ValueError('Axis option value 0 is the only one supported for Multivariate stats.')

    if weights is not None and weights.ndim == 1 and data.ndim == 2:
        if len(weights) != numpy.shape(data)[1]:
            raise ValueError('shape mismatch: 1D weights cannot be broadcast to shape of values')
        sys.stderr.write('stats.stats: Broadcasting 1D weights for 2D values.\n')
        weights = arrays.extend(weights, numpy.shape(data)[0]).T

    if label_all is not None:
        results = [cls(data, weights=weights, IDs=IDs, **opts).compute()]
        all_labels = [label_all]
    else:
        results = []
        all_labels = []

    start_idx = 0
    count = 0
    while start_idx < len(data):
        if numpy.isscalar(step):
            end_idx = start_idx + step
        else:
            # Reuse the last listed step once the list runs out.
            end_idx = start_idx + step[min(count, len(step)-1)]
        if end_idx <= start_idx:
            # A non-positive step would never advance and loop forever.
            raise ValueError('step must be positive.')
        row_data, row_weights, row_IDs = \
            arrays.select([data, weights, IDs], sliced=(start_idx, end_idx, 1))
        # Forward opts so each clump is computed with the same constructor
        # options as the label_all row above.
        results.append(cls.stats(row_data, weights=row_weights, IDs=row_IDs, **opts))
        if labels is not None and len(labels):
            all_labels.append(labels[count])
        elif label_index is not None:
            # str() so non-string index entries (e.g. ints) can be used.
            all_labels.append(str(label_index[start_idx]) + '-')
        else:
            all_labels.append(str(start_idx) + '-')
        start_idx = end_idx
        count += 1

    if datab is False:
        return results
    return Datab(results, labels=all_labels, name=name or 'key', formats=formats)
def update(self, values, weights=None, IDs=None, negative_weights=None):
    """
    Accumulate values (and weights) into the running statistics.

    values, [negative_]weights and IDs may either be all arrays or all
    scalars.

    If negative_weights are specified, values get multiplied by the sign
    of the corresponding negative_weight, and weights get set to
    abs(negative_weights).

    Masked datapoints (value or weight) are ignored, as are datapoints
    with weight <= 0. These datapoints still affect the 'size' statistic
    (but not the 'count' statistic).

    IDs default to consecutive integers starting at the current size.

    Nothing is returned. When self.last_update is not None, the flattened
    (values, weights, IDs) of this call are stored there; an update with
    no usable datapoints stores ([], [], []) instead.

    Raises AssertionError when both weights and negative_weights are
    given, when a weighted object receives neither, or when an unweighted
    object receives weights.
    """
    values = arrays.nice_array(values, logger=self.logger)
    mask = values.mask.copy()
    if self.weighted:
        if negative_weights is not None:
            if weights is not None:
                raise AssertionError('Can not specify both weights and negative weights')
            # Coerce first so plain sequences work with abs() / sign(); the
            # weights go through the same coercion just below.
            negative_weights = arrays.nice_array(negative_weights, shape=values.shape,
                                                 logger=self.logger)
            weights = abs(negative_weights)
            values = values.copy()*numpy.sign(negative_weights)
        elif weights is None:
            raise AssertionError('Weighted statistics object received no weights in update.')
        weights = arrays.nice_array(weights, shape=values.shape, logger=self.logger)
        mask |= weights.mask
        # Following contortion to avoid bogus
        # "RuntimeWarning: Invalid value encountered in less_equal"
        mask[~mask] = (weights[~mask] <= 0)
        fweights = weights.flatten()
    else:
        if weights is not None:
            raise AssertionError('Unweighted statistics object received weights in update.')
        fweights = numpy.ma.ones(values.size, dtype=float)

    # Values and weights share the combined mask from here on.
    fweights.mask = mask.flatten()
    fvalues = values.flatten()
    fvalues.mask = fweights.mask

    if IDs is None:
        IDs = numpy.arange(fvalues.size, dtype=int) + self.size
    elif not isinstance(IDs, numpy.ndarray):
        IDs = numpy.array(IDs)

    self.size += fvalues.size
    count = fvalues.count()
    if count == 0:
        # Nothing usable in this update: only size changes.
        if self.last_update is not None:
            self.last_update = ([], [], [])
        return

    min_index = numpy.ma.argmin(fvalues)
    max_index = numpy.ma.argmax(fvalues)
    # min / max are stored as (value, ID) pairs.
    if self.count == 0:
        self.statistics['min'] = (fvalues[min_index], IDs.flat[min_index])
        self.statistics['max'] = (fvalues[max_index], IDs.flat[max_index])
    else:
        if fvalues[min_index] < self.statistics['min'][0]:
            self.statistics['min'] = (fvalues[min_index], IDs.flat[min_index])
        if fvalues[max_index] > self.statistics['max'][0]:
            self.statistics['max'] = (fvalues[max_index], IDs.flat[max_index])

    self.count += count
    self.sum_xw += numpy.ma.sum(fvalues * fweights)
    self.sum_xxw += numpy.ma.sum(fvalues * fvalues * fweights)
    self.sum_w += numpy.ma.sum(fweights)
    self.sum_ww += numpy.ma.sum(fweights * fweights)

    if self.last_update is not None:
        self.last_update = (fvalues, fweights, IDs.flat)
def regress(
    responses,
    predictors,
    weights=None,
    constant=True,
    forecast=False,
    errors=False,
    axis=None,
    step=1,
    sliced=None,
    select=None,
    overlay=None,
    split=None,
    buckets=None,
    group=None,
    labels=None,
    label_index=None,
    label_all="All",
    label_other="Other",
    datab=None,
    names=None,
    name=None,
    formats=None,
):
    """
    Wrapper around Regress(*args, **kwargs).compute(), handling some
    additional options.

    data can be two dimensional, and axis can be 0 or 1. In this case, a list
    of statistics-records is returned, in Datab form.

    split: run stats for records selected by this mask and for the others.
        Does not work with axis option.
    buckets: run stats for records selected by each of the masks in this
        list of masks. Does not work with axis option.
    group: bucket stats by values in this field.
    sliced, overlay, select: run stats for records selected by this slice,
        overlay or selection.
    step: When axis option is specified, clump these many rows together for
        each row stat to be computed. This can optionally be a list of
        steps, in which case each clump can have variable number of rows
        (the last entry is reused once the list is exhausted).
    label_all: Relevant only when axis or split/buckets option present. If
        not None, compute stats over entire dataset, in addition to for each
        index of the axis or split/buckets, and place results in an entry of
        output with this label.
    label_other: Relevant only when buckets option present. If not None,
        compute stats over part of dataset not in any bucket, in addition to
        for each bucket, and place results in an entry of output with this
        label.
    labels: list to use to add labels to each entry of output. Relevant only
        when there are multiple lines of output.
    label_index: like labels, except the entry at each clump's starting row
        is used.
    names: in the header, label the predictor columns with these strings.
    name: in the header, label the key column with this string.
    datab: Return results in datab format rather than as a list, if
        appropriate. When None, defaults to True unless forecast or errors
        is requested.
    formats: passed to Datab when datab format is used.

    Raises AssertionError for unsupported option combinations, IndexError
    when the axis/step path is used with an axis that is not 0 or 1 or with
    responses that are not two-dimensional, and ValueError for a
    non-positive step.
    """
    if datab is None:
        datab = not (forecast) and not (errors)
    if datab == True:  # noqa: E712 -- keeps the original truth test
        # datab output cannot hold forecasts or errors per datapoint
        forecast = False
        errors = False

    if group is not None:
        if buckets is not None and split is not None:
            raise AssertionError("group, buckets and split options not supported together.")
        # One bucket (boolean mask) per distinct group value.
        label_other = None
        labels, buckets = [], []
        for group_name in numpy.unique(group):
            labels.append(group_name)
            buckets.append(group == group_name)
        if name is None:
            name = "group"

    if split is not None:
        if buckets is not None:
            raise AssertionError("group, buckets and split options not supported together.")
        # A split is a single bucket plus its complement ("other").
        buckets = [split]
        if labels is None:
            labels = ["True"]
            label_other = "False"
        else:
            label_other = labels[1]
            labels = [labels[0]]
        if name is None:
            name = "condn"
    elif buckets is not None:
        if labels is None:
            labels = [str(d + 1) for d in range(len(buckets))]
        if name is None:
            name = "bucket"

    responses, predictors, weights = arrays.select(
        [nice_array(responses), nice_array(predictors), nice_array(weights)],
        sliced=sliced,
        overlay=overlay,
        select=select,
    )

    def _fit(resp, pred, wts):
        # Run one regression with the options shared by every fit here.
        return Regress(resp, pred, weights=wts, constant=constant, names=names, store_last=True)

    simple = axis is None and numpy.isscalar(step) and step == 1

    results = []
    # The whole-dataset fit is needed for the label_all row, and is also the
    # sole result of the plain (no buckets, no axis) call even when
    # label_all is None.
    if label_all is not None or (simple and buckets is None):
        results.append(_fit(responses, predictors, weights).compute(forecast=forecast, errors=errors))

    if simple:
        if buckets is None:
            if not datab:
                return results[0]
            return Datab(results, formats=formats)

        if label_all is not None:
            results[-1]["label"] = label_all
        if label_other:
            # Starts all-True; every bucket's members get knocked out below.
            other = numpy.ones(numpy.shape(responses), dtype=bool)
        buckets = arrays.select(buckets, sliced=sliced, overlay=overlay, select=select)
        for b, label in zip(buckets, labels):
            respb, predb, wtb = arrays.select([responses, predictors, weights], select=b)
            results.append(_fit(respb, predb, wtb).compute(forecast=forecast, errors=errors))
            results[-1]["label"] = label
            if label_other:
                other[b] = False
        if label_other:
            respb, predb, wtb = arrays.select([responses, predictors, weights], select=other)
            results.append(_fit(respb, predb, wtb).compute(forecast=forecast, errors=errors))
            results[-1]["label"] = label_other
        if datab is False:
            return results
        return Datab(results, name=name or "key", formats=formats)

    # From here on: the axis / step (clumped rows) path.
    if buckets is not None:
        raise AssertionError("split/buckets option not supported with axis/step option.")
    if label_all is not None:
        results[-1]["label"] = label_all
    # axis=None is rejected explicitly: comparing None with an int raises
    # TypeError on Python 3, whereas Python 2 ordering made this IndexError.
    if axis is None or axis > 1 or axis < 0 or numpy.ndim(responses) != 2:
        raise IndexError("Got unsupported axis option value that is " + "not 0 or 1; or data is not two-dimensional")
    if axis == 0:
        responses = responses.transpose()
        predictors = predictors.transpose()
        if weights is not None:
            weights = weights.transpose()

    start_idx = 0
    count = 0
    while start_idx < len(responses):
        if numpy.isscalar(step):
            end_idx = start_idx + step
        else:
            # Reuse the last listed step once the list runs out.
            end_idx = start_idx + step[min(count, len(step) - 1)]
        if end_idx <= start_idx:
            # A non-positive step would never advance and loop forever.
            raise ValueError("step must be positive.")
        row_responses, row_predictors, row_weights = arrays.select(
            [responses, predictors, weights], sliced=(start_idx, end_idx, 1)
        )
        r = _fit(row_responses, row_predictors, row_weights)
        clump_start, clump_number = start_idx, count
        start_idx = end_idx
        count += 1
        if not r.Multivariate.count:
            # No usable datapoints in this clump: emit no row for it.
            continue
        results.append(r.compute(forecast=forecast, errors=errors))
        if labels is not None and len(labels):
            results[-1]["label"] = labels[clump_number]
        elif label_index is not None:
            # str() so non-string index entries (e.g. ints) can be used.
            results[-1]["label"] = str(label_index[clump_start]) + "-"
        else:
            # Label with the clump's first row, matching the label_index
            # branch (not the already-advanced index).
            results[-1]["label"] = str(clump_start) + "-"

    if datab is False:
        return results
    return Datab(results, name=name or "key", formats=formats)