Beispiel #1
0
def _examine_factor_types(factors, factor_states, default_env, data_iter_maker):
    """Work out, by evaluating each factor on the data, how to treat it.

    Every factor is evaluated chunk by chunk (one chunk per item yielded by
    ``data_iter_maker()``) until its kind is known:

    * ``Categorical`` values: levels and contrast are taken from the value.
    * numeric values: the number of columns is recorded.
    * boolean values: a ``_BoolToCat`` postprocessor is attached.
    * anything else: treated as categorical; a ``CategoricalTransform``
      memorizes every chunk so that the levels can be read off at the end.

    The first three kinds are settled by the first chunk in which the factor
    is seen; only the last kind needs to look at all of the data.

    :arg factors: Iterable of factor objects (must be hashable, and provide
      ``eval(state, env)`` and ``name()``).
    :arg factor_states: Mapping from factor to the state passed to its
      ``eval``.
    :arg default_env: Fallback namespace, stacked underneath each data chunk.
    :arg data_iter_maker: Zero-argument callable returning an iterator over
      data chunks.
    :returns: A tuple ``(num_column_counts, cat_levels_contrasts,
      cat_postprocessors)`` of dicts keyed by factor. Contrasts are ``None``
      for factors whose levels come from a postprocessor.
    :raises CharltonError: If a boolean or categorical-looking factor
      evaluates to more than one column.
    """
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                cat_levels_contrasts[factor] = (value.levels,
                                                value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                if isinstance(value, Series):
                    column_count = 1
                else:
                    column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays" % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
        # We might have gathered all the information we need after this
        # chunk of data. Checking here, rather than at the top of the loop,
        # means we stop *before* the iterator is asked to load another chunk
        # that we would only throw away.
        if not examine_needed:
            break
    # .items() rather than .iteritems(): works on both Python 2 and 3.
    for factor, processor in cat_postprocessors.items():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts,
            cat_levels_contrasts,
            cat_postprocessors)
Beispiel #2
0
 def memorize_chunk(self, x):
     """Accumulate the row count and per-column totals of one data chunk.

     ``x`` is coerced with ``atleast_2d_column_default``; its row count is
     added to ``self._count`` and its column sums to ``self._sum``.
     """
     chunk = atleast_2d_column_default(x)
     self._count += chunk.shape[0]
     column_totals = np.sum(chunk, 0, dtype=wide_dtype_for(chunk))
     # Summing along axis 0 keeps one total per column, so multi-column
     # input is handled too. The first chunk just seeds the accumulator.
     if self._sum is not None:
         self._sum += column_totals
     else:
         self._sum = column_totals
Beispiel #3
0
 def memorize_chunk(self, x):
     """Fold one chunk into the running ``_count`` / ``_sum`` statistics.

     The chunk is first passed through ``atleast_2d_column_default``. The
     number of rows is added to ``self._count``, and the column-wise sums
     (computed in the dtype given by ``wide_dtype_for``) to ``self._sum``.
     """
     rows = atleast_2d_column_default(x)
     self._count += rows.shape[0]
     chunk_sum = np.sum(rows, 0, dtype=wide_dtype_for(rows))
     # One entry per column, so this copes with multi-column input.
     if self._sum is None:
         # Nothing accumulated yet: adopt this chunk's totals as-is.
         self._sum = chunk_sum
         return
     self._sum += chunk_sum
Beispiel #4
0
 def transform(self, x, center=True, rescale=True, ddof=0):
     """Return a centered and/or rescaled copy of ``x``.

     :arg x: Data to transform; coerced with ``atleast_2d_column_default``.
     :arg center: If true, subtract ``self.current_mean``.
     :arg rescale: If true, divide by
       ``sqrt(self.current_M2 / (self.current_n - ddof))``.
     :arg ddof: Amount subtracted from ``self.current_n`` in the divisor.
     :returns: A new array; the in-place arithmetic below acts on a copy.
     """
     values = atleast_2d_column_default(x)
     # Integer input is promoted to float so the in-place operations below
     # can store fractional results; any other dtype is copied unchanged
     # (dtype=None lets np.array keep the input's dtype).
     target_dtype = float if np.issubdtype(values.dtype, np.integer) else None
     values = np.array(values, dtype=target_dtype)
     if center:
         values -= self.current_mean
     if rescale:
         scale = np.sqrt(self.current_M2 / (self.current_n - ddof))
         values /= scale
     return values
Beispiel #5
0
 def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
     """Update the running mean and sum of squared deviations with a chunk.

     Rows are folded in one at a time with Welford's online update, so the
     statistics can be accumulated over any number of chunks without
     holding the data in memory.

     :arg x: Chunk of data; coerced with ``atleast_2d_column_default``.
     :arg center, rescale, ddof: Accepted but not used by this method.
     """
     x = atleast_2d_column_default(x)
     if self.current_mean is None:
         # First chunk: allocate one accumulator slot per column.
         accumulator_dtype = wide_dtype_for(x)
         self.current_mean = np.zeros(x.shape[1], dtype=accumulator_dtype)
         self.current_M2 = np.zeros(x.shape[1], dtype=accumulator_dtype)
     # XX this can surely be vectorized but I am feeling lazy:
     # Iterating the rows directly (instead of indexing via xrange, which
     # does not exist on Python 3) behaves the same on Python 2 and 3.
     for row in x:
         self.current_n += 1
         delta = row - self.current_mean
         self.current_mean += delta / self.current_n
         # Uses the *updated* mean; this is what makes the update stable.
         self.current_M2 += delta * (row - self.current_mean)
Beispiel #6
0
 def transform(self, x, center=True, rescale=True, ddof=0):
     """Apply the memorized centering / scaling to ``x``.

     :arg x: Input data, coerced with ``atleast_2d_column_default``.
     :arg center: Subtract ``self.current_mean`` when true.
     :arg rescale: Divide by ``sqrt(current_M2 / (current_n - ddof))`` when
       true.
     :arg ddof: Subtracted from ``self.current_n`` in the divisor.
     :returns: A freshly allocated array holding the transformed values.
     """
     data = atleast_2d_column_default(x)
     is_integer = np.issubdtype(data.dtype, np.integer)
     # Always work on a copy so the in-place updates below never touch the
     # caller's array; integers become floats so fractions survive.
     out = np.array(data, dtype=float) if is_integer else np.array(data)
     if center:
         out -= self.current_mean
     if rescale:
         effective_n = self.current_n - ddof
         out /= np.sqrt(self.current_M2 / effective_n)
     return out
Beispiel #7
0
 def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
     """Fold one chunk of rows into ``current_n``/``current_mean``/``current_M2``.

     Each row is incorporated with Welford's online update, which keeps a
     running mean and a running sum of squared deviations from it.

     :arg x: Chunk of data; coerced with ``atleast_2d_column_default``.
     :arg center, rescale, ddof: Accepted but not used by this method.
     """
     x = atleast_2d_column_default(x)
     if self.current_mean is None:
         # Lazily size the accumulators from the first chunk's column count.
         wide_dtype = wide_dtype_for(x)
         self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype)
         self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype)
     # XX this can surely be vectorized but I am feeling lazy:
     # Walk the rows themselves rather than indices from xrange(), which
     # is Python-2-only; the result is the same on either version.
     for row in x:
         self.current_n += 1
         delta = row - self.current_mean
         self.current_mean += delta / self.current_n
         # The second factor deliberately uses the already-updated mean.
         self.current_M2 += delta * (row - self.current_mean)
Beispiel #8
0
def _examine_factor_types(factors, factor_states, default_env,
                          data_iter_maker):
    """Classify each factor as numeric or categorical by evaluating it.

    Factors are evaluated on successive data chunks (the items yielded by
    ``data_iter_maker()``) until each one's kind is settled:

    * ``Categorical`` values supply their own levels and contrast.
    * numeric values have their column count recorded.
    * boolean values get a ``_BoolToCat`` postprocessor.
    * everything else is treated as categorical: a ``CategoricalTransform``
      memorizes every chunk and its levels are collected at the end.

    Only the last kind requires a pass over all of the data.

    :arg factors: Iterable of hashable factor objects providing
      ``eval(state, env)`` and ``name()``.
    :arg factor_states: Mapping from factor to the state handed to ``eval``.
    :arg default_env: Fallback namespace stacked beneath each data chunk.
    :arg data_iter_maker: Zero-argument callable returning a chunk iterator.
    :returns: ``(num_column_counts, cat_levels_contrasts,
      cat_postprocessors)``, three dicts keyed by factor. The contrast is
      ``None`` for factors whose levels come from a postprocessor.
    :raises CharltonError: If a boolean or categorical-looking factor
      evaluates to more than one column.
    """
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                cat_levels_contrasts[factor] = (value.levels, value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays" % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors" %
                           (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
        # We might have gathered all the information we need after this
        # chunk of data. Testing at the bottom of the loop body lets us stop
        # before the iterator is asked to load a further chunk that would
        # only be discarded.
        if not examine_needed:
            break
    # .items() rather than .iteritems(): works on both Python 2 and 3.
    for factor, processor in cat_postprocessors.items():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts, cat_levels_contrasts, cat_postprocessors)
Beispiel #9
0
 def eval(self, data):
     """Evaluate the numeric factor on ``data`` and validate the result.

     The factor is evaluated with ``data`` stacked on top of the default
     environment, then coerced with ``atleast_2d_column_default``.

     :arg data: Data chunk used as the innermost lookup namespace.
     :returns: The coerced result, with ``self._expected_columns`` columns
       and a numeric dtype.
     :raises CharltonError: If the column count differs from
       ``self._expected_columns`` or the dtype is not numeric.
     """
     result = self.factor.eval(self._state,
                               DictStack([data, self._default_env]))
     result = atleast_2d_column_default(result)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # Order matters here: the observed count fills "I got %s" and the
         # expected count fills "instead of the %s".
         raise CharltonError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (self.factor.name(), result.shape[1],
                                self._expected_columns),
                             self.factor)
     if not np.issubdtype(result.dtype, np.number):
         raise CharltonError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (self.factor.name(), result.dtype),
                             self.factor)
     return result
Beispiel #10
0
 def eval(self, data):
     """Evaluate the numeric factor on ``data`` and check shape and dtype.

     ``data`` is stacked on top of the default environment for the
     evaluation, and the result is coerced with
     ``atleast_2d_column_default``.

     :arg data: Data chunk used as the innermost lookup namespace.
     :returns: The coerced result, with ``self._expected_columns`` columns
       and a numeric dtype.
     :raises CharltonError: If the column count differs from
       ``self._expected_columns`` or the dtype is not numeric.
     """
     result = self.factor.eval(self._state,
                               DictStack([data, self._default_env]))
     result = atleast_2d_column_default(result)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # The actual column count goes with "I got %s"; the expected one
         # with "instead of the %s".
         raise CharltonError(
             "when evaluating factor %s, I got %s columns "
             "instead of the %s I was expecting" %
             (self.factor.name(), result.shape[1], self._expected_columns),
             self.factor)
     if not np.issubdtype(result.dtype, np.number):
         raise CharltonError(
             "when evaluating numeric factor %s, "
             "I got non-numeric data of type '%s'" %
             (self.factor.name(), result.dtype), self.factor)
     return result
Beispiel #11
0
 def eval(self, data):
     """Evaluate the categorical factor on ``data`` and return its codes.

     The raw result is passed through ``self._postprocessor`` when one is
     set. It must then be a ``Categorical`` whose levels equal
     ``self._expected_levels``.

     :arg data: Data chunk used as the innermost lookup namespace.
     :returns: ``result.int_array`` coerced by ``atleast_2d_column_default``.
     :raises CharltonError: If the result is not a ``Categorical`` or its
       levels are not the expected ones.
     """
     namespace = DictStack([data, self._default_env])
     result = self.factor.eval(self._state, namespace)
     postprocessor = self._postprocessor
     if postprocessor is not None:
         result = postprocessor.transform(result)
     factor = self.factor
     if not isinstance(result, Categorical):
         # result.__class__.__name__ would be better, but not
         # defined for old-style classes:
         template = ("when evaluating categoric factor %s, I got a "
                     "result that is not of type Categorical (but rather %s)")
         raise CharltonError(template % (factor.name(), result.__class__),
                             factor)
     if result.levels != self._expected_levels:
         template = ("when evaluating categoric factor %s, I got Categorical "
                     " data with unexpected levels (wanted %s, got %s)")
         details = (factor.name(), self._expected_levels, result.levels)
         raise CharltonError(template % details, factor)
     codes = result.int_array
     _max_allowed_dim(1, codes, factor)
     # For consistency, evaluators *always* return 2d arrays (though in
     # this case it will always have only 1 column):
     return atleast_2d_column_default(codes)
Beispiel #12
0
 def eval(self, data):
     """Return the integer codes of the categorical factor for ``data``.

     After evaluating the factor (and applying ``self._postprocessor`` if
     there is one), the result is checked to be a ``Categorical`` with
     exactly ``self._expected_levels``.

     :arg data: Data chunk used as the innermost lookup namespace.
     :returns: ``result.int_array`` coerced by ``atleast_2d_column_default``.
     :raises CharltonError: If the result is not a ``Categorical`` or its
       levels are not the expected ones.
     """
     evaluated = self.factor.eval(self._state,
                                  DictStack([data, self._default_env]))
     if self._postprocessor is not None:
         evaluated = self._postprocessor.transform(evaluated)

     problem = None
     if not isinstance(evaluated, Categorical):
         # evaluated.__class__.__name__ would be better, but not
         # defined for old-style classes:
         problem = (
             "when evaluating categoric factor %s, I got a "
             "result that is not of type Categorical (but rather %s)"
             % (self.factor.name(), evaluated.__class__))
     elif evaluated.levels != self._expected_levels:
         problem = (
             "when evaluating categoric factor %s, I got Categorical "
             " data with unexpected levels (wanted %s, got %s)"
             % (self.factor.name(), self._expected_levels,
                evaluated.levels))
     if problem is not None:
         raise CharltonError(problem, self.factor)

     _max_allowed_dim(1, evaluated.int_array, self.factor)
     # For consistency, evaluators *always* return 2d arrays (though in
     # this case it will always have only 1 column):
     return atleast_2d_column_default(evaluated.int_array)
Beispiel #13
0
def test__ColumnBuilder():
    """Check _ColumnBuilder's column names and built values.

    Covers a three-factor interaction with single-column numeric factors,
    the same interaction with a two-column numeric factor, and the empty
    term (intercept).
    """
    from charlton.contrasts import ContrastMatrix
    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast_rows = np.array([[0, 0.5], [3, 0]])
    contrast = ContrastMatrix(contrast_rows, ["[c1]", "[c2]"])

    # f1 and f3 numeric with one column each, f2 categorical.
    cb = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    narrow_data = {
        f1: atleast_2d_column_default([1, 2, 3]),
        f2: atleast_2d_column_default([0, 0, 1]),
        f3: atleast_2d_column_default([7.5, 2, -12]),
    }
    cb.build(narrow_data, mat)
    narrow_expected = [
        [0, 0.5 * 1 * 7.5],
        [0, 0.5 * 2 * 2],
        [3 * 3 * -12, 0],
    ]
    assert np.allclose(mat, narrow_expected)

    # Same term, but f1 now contributes two columns.
    cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    wide_data = {
        f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
        f2: atleast_2d_column_default([0, 0, 1]),
        f3: atleast_2d_column_default([7.5, 2, -12]),
    }
    cb2.build(wide_data, mat2)
    wide_names = [
        "f1[0]:f2[c1]:f3",
        "f1[0]:f2[c2]:f3",
        "f1[1]:f2[c1]:f3",
        "f1[1]:f2[c2]:f3",
    ]
    assert cb2.column_names() == wide_names
    wide_expected = [
        [0, 0.5 * 1 * 7.5, 0, 0.5 * 2 * 7.5],
        [0, 0.5 * 3 * 2, 0, 0.5 * 4 * 2],
        [3 * 5 * -12, 0, 3 * 6 * -12, 0],
    ]
    assert np.allclose(mat2, wide_expected)

    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    unused_data = {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}
    cb_intercept.build(unused_data, mat3)
    assert np.allclose(mat3, 1)
Beispiel #14
0
def test__ColumnBuilder():
    """Exercise _ColumnBuilder on interaction terms and on the intercept.

    Verifies both the generated column names and the values written into
    the output matrix for one- and two-column numeric factors.
    """
    from charlton.contrasts import ContrastMatrix
    col = atleast_2d_column_default
    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]),
                              ["[c1]", "[c2]"])

    # Case 1: single-column f1 and f3 interacting with categorical f2.
    cb = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    expected_names = ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    assert cb.column_names() == expected_names
    cb.build({f1: col([1, 2, 3]),
              f2: col([0, 0, 1]),
              f3: col([7.5, 2, -12])},
             mat)
    expected = [[0, 0.5 * 1 * 7.5],
                [0, 0.5 * 2 * 2],
                [3 * 3 * -12, 0]]
    assert np.allclose(mat, expected)

    # Case 2: f1 has two columns, doubling the number of output columns.
    cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    cb2.build({f1: col([[1, 2], [3, 4], [5, 6]]),
               f2: col([0, 0, 1]),
               f3: col([7.5, 2, -12])},
              mat2)
    expected_names2 = ["f1[0]:f2[c1]:f3",
                       "f1[0]:f2[c2]:f3",
                       "f1[1]:f2[c1]:f3",
                       "f1[1]:f2[c2]:f3"]
    assert cb2.column_names() == expected_names2
    expected2 = [[0, 0.5 * 1 * 7.5, 0, 0.5 * 2 * 7.5],
                 [0, 0.5 * 3 * 2, 0, 0.5 * 4 * 2],
                 [3 * 5 * -12, 0, 3 * 6 * -12, 0]]
    assert np.allclose(mat2, expected2)

    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
    assert np.allclose(mat3, 1)
Beispiel #15
0
 def transform(self, x):
     """Return ``x`` (as a 2-d column array) minus the memorized mean.

     The mean is ``self._sum / self._count``.
     """
     # XX: this probably returns very wide floating point data, which is
     # perhaps not what we desire -- should the mean be cast down to the
     # input data's width? (well, not if the input data is integer, but you
     # know what I mean.)
     columns = atleast_2d_column_default(x)
     mean = self._sum / self._count
     return columns - mean
Beispiel #16
0
 def transform(self, x):
     """Center ``x`` by subtracting ``self._sum / self._count``.

     ``x`` is coerced with ``atleast_2d_column_default`` first.
     """
     values = atleast_2d_column_default(x)
     # XX: this probably returns very wide floating point data, which is
     # perhaps not what we desire -- should the mean be cast down to the
     # input data's width? (well, not if the input data is integer, but you
     # know what I mean.)
     memorized_mean = self._sum / self._count
     centered = values - memorized_mean
     return centered