def _examine_factor_types(factors, factor_states, default_env, data_iter_maker):
    """Scan the data chunk-by-chunk to classify each factor.

    Each factor is classified as numeric, boolean, or categorical by
    evaluating it against successive chunks from `data_iter_maker`.

    Returns a 3-tuple:
      num_column_counts: {factor: column count} for numeric factors
      cat_levels_contrasts: {factor: (levels, contrast-or-None)} for
        categorical factors
      cat_postprocessors: {factor: transform} for factors whose raw values
        need converting to Categorical at build time

    Raises CharltonError for multi-column boolean or categorical factors.
    """
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the first
        # chunk of data. If so, then we shouldn't spend time loading all the
        # rest of the chunks.
        if not examine_needed:
            break
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                # Already categorical: record levels/contrast and move on.
                cat_levels_contrasts[factor] = (value.levels, value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                # NOTE(review): this Series special-case runs after
                # atleast_2d_column_default, so it looks unreachable for
                # plain ndarrays -- presumably it guards a pandas
                # pass-through; verify against atleast_2d_column_default.
                if isinstance(value, Series):
                    column_count = 1
                else:
                    column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                # Fixed message typo: "appears to categorical" -> "appears
                # to be categorical".
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                # Categorical factors must see every chunk to learn the
                # full level set, so they stay in examine_needed.
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
    for factor, processor in cat_postprocessors.iteritems():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts,
            cat_levels_contrasts,
            cat_postprocessors)
def memorize_chunk(self, x):
    """Fold one chunk of data into the running row count and column sums."""
    chunk = atleast_2d_column_default(x)
    self._count += chunk.shape[0]
    # Accumulate in a wide dtype to limit overflow/precision loss;
    # this_total is a vector when the chunk has multiple columns.
    chunk_total = np.sum(chunk, 0, dtype=wide_dtype_for(chunk))
    if self._sum is None:
        self._sum = chunk_total
    else:
        self._sum += chunk_total
def transform(self, x, center=True, rescale=True, ddof=0):
    """Standardize `x` using the memorized running statistics.

    Subtracts the running mean when `center` is true, and divides by the
    running standard deviation (with `ddof` delta degrees of freedom) when
    `rescale` is true. Always returns a fresh array.
    """
    values = atleast_2d_column_default(x)
    # Copy the input, promoting integers to float so the in-place
    # arithmetic below is well-defined:
    target_dtype = float if np.issubdtype(values.dtype, np.integer) else None
    values = np.array(values, dtype=target_dtype)
    if center:
        values -= self.current_mean
    if rescale:
        values /= np.sqrt(self.current_M2 / (self.current_n - ddof))
    return values
def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
    """Update the running per-column mean and M2 (sum of squared
    deviations) with one chunk of data, via Welford's online algorithm.

    center/rescale/ddof are accepted here (matching transform's signature)
    but are not used during memorization.
    """
    x = atleast_2d_column_default(x)
    if self.current_mean is None:
        # First chunk: initialize per-column accumulators in a wide dtype.
        self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
    # XX this can surely be vectorized but I am feeling lazy:
    for i in xrange(x.shape[0]):
        self.current_n += 1
        delta = x[i, :] - self.current_mean
        self.current_mean += delta / self.current_n
        # Uses the *updated* mean in the second factor -- this is the
        # standard Welford update, not a bug.
        self.current_M2 += delta * (x[i, :] - self.current_mean)
def _examine_factor_types(factors, factor_states, default_env, data_iter_maker):
    """Scan the data chunk-by-chunk to classify each factor.

    Each factor is classified as numeric, boolean, or categorical by
    evaluating it against successive chunks from `data_iter_maker`.

    Returns a 3-tuple:
      num_column_counts: {factor: column count} for numeric factors
      cat_levels_contrasts: {factor: (levels, contrast-or-None)} for
        categorical factors
      cat_postprocessors: {factor: transform} for factors whose raw values
        need converting to Categorical at build time

    Raises CharltonError for multi-column boolean or categorical factors.
    """
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the first
        # chunk of data. If so, then we shouldn't spend time loading all the
        # rest of the chunks.
        if not examine_needed:
            break
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                # Already categorical: record levels/contrast and move on.
                cat_levels_contrasts[factor] = (value.levels, value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                # Fixed message typo: "appears to categorical" -> "appears
                # to be categorical".
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                # Categorical factors must see every chunk to learn the
                # full level set, so they stay in examine_needed.
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
    for factor, processor in cat_postprocessors.iteritems():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts,
            cat_levels_contrasts,
            cat_postprocessors)
def eval(self, data):
    """Evaluate this numeric factor against `data`.

    Returns a 2d column-oriented array with the expected number of
    columns; raises CharltonError on a column-count mismatch or
    non-numeric result.
    """
    env = DictStack([data, self._default_env])
    raw = self.factor.eval(self._state, env)
    raw = atleast_2d_column_default(raw)
    _max_allowed_dim(2, raw, self.factor)
    actual_columns = raw.shape[1]
    if actual_columns != self._expected_columns:
        msg = ("when evaluating factor %s, I got %s columns "
               "instead of the %s I was expecting"
               % (self.factor.name(), self._expected_columns,
                  actual_columns))
        raise CharltonError(msg, self.factor)
    if not np.issubdtype(raw.dtype, np.number):
        msg = ("when evaluating numeric factor %s, "
               "I got non-numeric data of type '%s'"
               % (self.factor.name(), raw.dtype))
        raise CharltonError(msg, self.factor)
    return raw
def eval(self, data):
    """Evaluate this numeric factor against `data`.

    Returns a 2d column-oriented array with self._expected_columns
    columns; raises CharltonError if the shape or dtype is wrong.
    """
    result = self.factor.eval(self._state,
                              DictStack([data, self._default_env]))
    result = atleast_2d_column_default(result)
    _max_allowed_dim(2, result, self.factor)
    # Guard 1: column count must match what we memorized earlier.
    if result.shape[1] != self._expected_columns:
        raise CharltonError("when evaluating factor %s, I got %s columns "
                            "instead of the %s I was expecting"
                            % (self.factor.name(),
                               self._expected_columns,
                               result.shape[1]),
                            self.factor)
    # Guard 2: the data must actually be numeric.
    if not np.issubdtype(result.dtype, np.number):
        raise CharltonError("when evaluating numeric factor %s, "
                            "I got non-numeric data of type '%s'"
                            % (self.factor.name(), result.dtype),
                            self.factor)
    return result
def eval(self, data):
    """Evaluate this categoric factor against `data`.

    Applies the memorized postprocessor (if any), verifies the result is
    a Categorical with exactly the expected levels, and returns its
    integer codes as a 2d single-column array.

    Raises CharltonError on a non-Categorical result or a level mismatch.
    """
    result = self.factor.eval(self._state,
                              DictStack([data, self._default_env]))
    if self._postprocessor is not None:
        result = self._postprocessor.transform(result)
    if not isinstance(result, Categorical):
        msg = ("when evaluating categoric factor %s, I got a "
               "result that is not of type Categorical (but rather %s)"
               # result.__class__.__name__ would be better, but not
               # defined for old-style classes:
               % (self.factor.name(), result.__class__))
        raise CharltonError(msg, self.factor)
    if result.levels != self._expected_levels:
        # Fixed an accidental double space ("Categorical  data") caused by
        # the original string-literal concatenation.
        msg = ("when evaluating categoric factor %s, I got Categorical "
               "data with unexpected levels (wanted %s, got %s)"
               % (self.factor.name(), self._expected_levels,
                  result.levels))
        raise CharltonError(msg, self.factor)
    _max_allowed_dim(1, result.int_array, self.factor)
    # For consistency, evaluators *always* return 2d arrays (though in
    # this case it will always have only 1 column):
    return atleast_2d_column_default(result.int_array)
def eval(self, data):
    """Evaluate this categoric factor against `data`.

    Applies the memorized postprocessor (if any), verifies the result is
    a Categorical with exactly the expected levels, and returns its
    integer codes as a 2d single-column array.

    Raises CharltonError on a non-Categorical result or a level mismatch.
    """
    result = self.factor.eval(self._state,
                              DictStack([data, self._default_env]))
    if self._postprocessor is not None:
        result = self._postprocessor.transform(result)
    if not isinstance(result, Categorical):
        msg = ("when evaluating categoric factor %s, I got a "
               "result that is not of type Categorical (but rather %s)"
               # result.__class__.__name__ would be better, but not
               # defined for old-style classes:
               % (self.factor.name(), result.__class__))
        raise CharltonError(msg, self.factor)
    if result.levels != self._expected_levels:
        # Fixed an accidental double space ("Categorical  data") caused by
        # the original string-literal concatenation.
        msg = ("when evaluating categoric factor %s, I got Categorical "
               "data with unexpected levels (wanted %s, got %s)"
               % (self.factor.name(), self._expected_levels,
                  result.levels))
        raise CharltonError(msg, self.factor)
    _max_allowed_dim(1, result.int_array, self.factor)
    # For consistency, evaluators *always* return 2d arrays (though in
    # this case it will always have only 1 column):
    return atleast_2d_column_default(result.int_array)
def test__ColumnBuilder():
    """Exercise _ColumnBuilder with numeric, contrast-coded, multi-column,
    and intercept-only configurations."""
    from charlton.contrasts import ContrastMatrix
    f1, f2, f3 = _MockFactor("f1"), _MockFactor("f2"), _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]),
                              ["[c1]", "[c2]"])
    # Single-column numeric f1 and f3 around a contrast-coded f2:
    builder = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    out = np.empty((3, 2))
    assert builder.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    values = {f1: atleast_2d_column_default([1, 2, 3]),
              f2: atleast_2d_column_default([0, 0, 1]),
              f3: atleast_2d_column_default([7.5, 2, -12])}
    builder.build(values, out)
    expected = [[0, 0.5 * 1 * 7.5],
                [0, 0.5 * 2 * 2],
                [3 * 3 * -12, 0]]
    assert np.allclose(out, expected)
    # Two-column f1: its columns are expanded into the output names.
    builder2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    out2 = np.empty((3, 4))
    values2 = {f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
               f2: atleast_2d_column_default([0, 0, 1]),
               f3: atleast_2d_column_default([7.5, 2, -12])}
    builder2.build(values2, out2)
    assert builder2.column_names() == ["f1[0]:f2[c1]:f3",
                                       "f1[0]:f2[c2]:f3",
                                       "f1[1]:f2[c1]:f3",
                                       "f1[1]:f2[c2]:f3"]
    expected2 = [[0, 0.5 * 1 * 7.5, 0, 0.5 * 2 * 7.5],
                 [0, 0.5 * 3 * 2, 0, 0.5 * 4 * 2],
                 [3 * 5 * -12, 0, 3 * 6 * -12, 0]]
    assert np.allclose(out2, expected2)
    # Check intercept building:
    intercept_builder = _ColumnBuilder([], {}, {})
    assert intercept_builder.column_names() == ["Intercept"]
    out3 = np.empty((3, 1))
    intercept_builder.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]},
                            out3)
    assert np.allclose(out3, 1)
def test__ColumnBuilder():
    """Check _ColumnBuilder column naming and matrix filling, including
    the intercept-only case."""
    from charlton.contrasts import ContrastMatrix
    mock_factors = [_MockFactor(n) for n in ("f1", "f2", "f3")]
    f1, f2, f3 = mock_factors
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]),
                              ["[c1]", "[c2]"])
    # f2 is contrast-coded; f1 and f3 contribute one numeric column each.
    cb = _ColumnBuilder(mock_factors, {f1: 1, f3: 1}, {f2: contrast})
    buf = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    cb.build({f1: atleast_2d_column_default([1, 2, 3]),
              f2: atleast_2d_column_default([0, 0, 1]),
              f3: atleast_2d_column_default([7.5, 2, -12])},
             buf)
    assert np.allclose(buf,
                       [[0, 0.5 * 1 * 7.5],
                        [0, 0.5 * 2 * 2],
                        [3 * 3 * -12, 0]])
    # A two-column f1 doubles the output width and indexes the names.
    cb2 = _ColumnBuilder(mock_factors, {f1: 2, f3: 1}, {f2: contrast})
    buf2 = np.empty((3, 4))
    cb2.build({f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
               f2: atleast_2d_column_default([0, 0, 1]),
               f3: atleast_2d_column_default([7.5, 2, -12])},
              buf2)
    assert cb2.column_names() == ["f1[0]:f2[c1]:f3",
                                  "f1[0]:f2[c2]:f3",
                                  "f1[1]:f2[c1]:f3",
                                  "f1[1]:f2[c2]:f3"]
    assert np.allclose(buf2,
                       [[0, 0.5 * 1 * 7.5, 0, 0.5 * 2 * 7.5],
                        [0, 0.5 * 3 * 2, 0, 0.5 * 4 * 2],
                        [3 * 5 * -12, 0, 3 * 6 * -12, 0]])
    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    buf3 = np.empty((3, 1))
    cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, buf3)
    assert np.allclose(buf3, 1)
def transform(self, x):
    """Center `x` by subtracting the memorized per-column means."""
    mean = self._sum / self._count
    # XX: the mean is held in a wide dtype, so this probably returns very
    # wide floating point data -- perhaps it should be cast back down to
    # the input data's width (well, not if the input data is integer).
    return atleast_2d_column_default(x) - mean