Exemple #1
0
 def eval(self, tree, require_evalexpr=True):
     """Evaluate one node of a parsed formula tree.

     A ``ParseNode`` is dispatched through ``self._evaluators`` using the
     key ``(operator token, number of arguments)``.  A ``str`` leaf is
     handled directly: ``"0"`` and ``"1"`` build the two special-case
     ``IntermediateExpr`` objects below, any other int/float literal is
     rejected, and everything else is wrapped as one ``EvalFactor`` term.

     When ``require_evalexpr`` is true, a result that is not an
     ``IntermediateExpr`` raises ``CharltonError``.
     """
     if not isinstance(tree, str):
         assert isinstance(tree, ParseNode)
         dispatch_key = (tree.op.token, len(tree.args))
         if dispatch_key not in self._evaluators:
             raise CharltonError("I don't know how to evaluate "
                                 "this '%s' operator" % (tree.op.token,),
                                 tree.op)
         outcome = self._evaluators[dispatch_key](self, tree)
     elif tree == "0":
         outcome = IntermediateExpr(False, None, True, [])
     elif tree == "1":
         outcome = IntermediateExpr(True, tree.origin, False, [])
     elif self._is_a(int, tree) or self._is_a(float, tree):
         raise CharltonError("numbers besides '0' and '1' are "
                             "only allowed with **", tree)
     else:
         # Anything else is assumed to be a Python expression.
         only_term = Term([EvalFactor(tree)])
         outcome = IntermediateExpr(False, None, False, [only_term])

     # Callers that accept any result type, and well-typed results, are done.
     if not require_evalexpr or isinstance(outcome, IntermediateExpr):
         return outcome
     if isinstance(outcome, ModelDesc):
         raise CharltonError("~ can only be used once, and "
                             "only at the top level",
                             tree)
     raise CharltonError("custom operator returned an "
                         "object that I don't know how to "
                         "handle", tree)
Exemple #2
0
def _read_python_expr(token_source, c):
    """Consume the tokens of one embedded Python expression.

    Tokens are read from ``token_source`` until, outside of any bracket,
    the next token is an operator known to ``c``, a ``")"``, or the end
    marker.  The consumed tokens are joined with ``pretty_untokenize``.

    Returns a ``StringWithOrigin`` built from the expression text and the
    combined origin of the consumed tokens.

    Raises ``CharltonError`` for an unmatched closing bracket, for a bracket
    still open at the end of input, and for any token that ``_check_token``
    rejects.
    """
    # Iterating a dict yields its keys on every Python version, whereas
    # ``keys() + keys()`` only works where ``keys()`` returns a list.
    end_tokens = set(c.binary_ops).union(c.unary_ops, [")"])
    token_types = []
    tokens = []
    bracket_level = 0
    # Inside brackets everything is consumed; at bracket level zero we stop
    # just before an end token or the end marker.
    while (bracket_level
           or (token_source.peek()[1] not in end_tokens
               and token_source.peek()[0] != tokenize.ENDMARKER)):
        assert bracket_level >= 0
        (token_type, token) = token_source.next()
        _check_token(token_type, token)
        if token in ("(", "[", "{"):
            bracket_level += 1
        if token in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise CharltonError("unmatched close bracket", token)
        if token_type == tokenize.ENDMARKER:
            # The end marker is only consumed while a bracket is still open.
            assert bracket_level > 0
            raise CharltonError(
                "unclosed bracket in embedded Python "
                "expression", _combine_origin_attrs(tokens))
        token_types.append(token_type)
        tokens.append(token)
    text = pretty_untokenize(zip(token_types, tokens))
    return StringWithOrigin(text, _combine_origin_attrs(tokens))
Exemple #3
0
 def from_strings(cls, sequence, levels=None, **kwargs):
     """Build a ``cls`` instance from a sequence of raw labels.

     Every entry of ``sequence`` is replaced by the position of the equal
     item in ``levels``; the resulting integer array and ``levels`` are
     passed to ``cls`` together with ``**kwargs``.

     Parameters:
       sequence: sized iterable of hashable labels.
       levels: ordered labels to code against.  When ``None``, the sorted
         distinct entries of ``sequence`` are used.

     Raises:
       CharltonError: if a level or entry is unhashable, or an entry does
         not appear in ``levels``.
     """
     if levels is None:
         try:
             levels = list(set(sequence))
         except TypeError:
             raise CharltonError("Error converting data to categorical: "
                                 "all items must be hashable")
         levels.sort()
     level_to_int = {}
     for i, level in enumerate(levels):
         try:
             level_to_int[level] = i
         except TypeError:
             raise CharltonError(
                 "Error converting data to categorical: "
                 "all levels must be hashable (and %r isn't)" % (level, ))
     int_array = np.empty(len(sequence), dtype=int)
     for i, entry in enumerate(sequence):
         try:
             int_array[i] = level_to_int[entry]
         except (KeyError, TypeError):
             # A dict lookup reports a missing entry as KeyError and an
             # unhashable one as TypeError; either way the entry matches
             # none of the levels.
             raise CharltonError("Error converting data to categorical: "
                                 "object '%r' does not match any of the "
                                 "expected levels" % (entry, ))
     return cls(int_array, levels, **kwargs)
Exemple #4
0
def _check_token(token_type, token):
    # These are filtered out of our input string, so they should never
    # appear...
    assert token_type not in (tokenize.NL, tokenize.NEWLINE)
    if token_type == tokenize.ERRORTOKEN:
        raise CharltonError(
            "error tokenizing input "
            "(maybe an unclosed string?)", token)
    if token_type == tokenize.COMMENT:
        raise CharltonError("comments are not allowed", token)
Exemple #5
0
def _examine_factor_types(factors, factor_states, default_env,
                          data_iter_maker):
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the first
        # chunk of data. If so, then we shouldn't spend time loading all the
        # rest of the chunks.
        if not examine_needed:
            break
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                cat_levels_contrasts[factor] = (value.levels, value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays" % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors" %
                           (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
    for factor, processor in cat_postprocessors.iteritems():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts, cat_levels_contrasts, cat_postprocessors)
Exemple #6
0
def make_model_matrices(builders, data, dtype=float):
    """Evaluate every distinct evaluator once, then build one matrix per builder.

    Returns a list with the result of ``builder._build(values, dtype)`` for
    each builder, in order.  Raises ``CharltonError`` if two evaluators
    produce a different number of rows.
    """
    values_by_evaluator = {}
    expected_rows = None
    for current in builders:
        # We look at evaluators rather than factors here, because it might
        # happen that we have the same factor twice, but with different
        # memorized state.
        for ev in current._evaluators:
            if ev in values_by_evaluator:
                continue
            computed = ev.eval(data)
            assert computed.ndim == 2
            if expected_rows is None:
                # The first evaluator fixes the row count for all the others.
                expected_rows = computed.shape[0]
            elif expected_rows != computed.shape[0]:
                msg = ("Row mismatch: factor %s had %s rows, when "
                       "previous factors had %s rows" %
                       (ev.factor.name(), computed.shape[0],
                        expected_rows))
                raise CharltonError(msg, ev.factor)
            values_by_evaluator[ev] = computed
    return [b._build(values_by_evaluator, dtype) for b in builders]
Exemple #7
0
 def _code_either(self, intercept, levels):
     """Build a polynomial contrast matrix for ``levels``.

     ``self.scores`` gives one numeric score per level; when it is ``None``
     the scores ``0 .. n-1`` are used.  The columns are named ".Constant",
     ".Linear", ".Quadratic", ".Cubic", then "^4", "^5", ...

     Returns a ``ContrastMatrix``; the constant column is included only
     when ``intercept`` is true.

     Raises ``CharltonError`` if the number of scores differs from the
     number of levels.
     """
     n = len(levels)
     scores = self.scores
     if scores is None:
         scores = np.arange(n)
     # Work in floating point: centering integer scores in place would
     # either fail or truncate.
     scores = np.asarray(scores, dtype=float)
     if len(scores) != n:
         raise CharltonError("number of levels (%s) does not match"
                             " number of scores (%s)" % (n, len(scores)))
     # Strategy: just make a matrix whose columns are naive linear,
     # quadratic, etc., functions of the raw scores, and then use 'qr' to
     # orthogonalize each column against those to its left.
     # Center into a new array so that self.scores is never modified.
     centered = scores - scores.mean()
     raw_poly = centered.reshape((-1, 1))**np.arange(n).reshape((1, -1))
     q, r = np.linalg.qr(raw_poly)
     q *= np.sign(np.diag(r))
     # NOTE(review): axis=1 sums across each row; presumably per-column
     # norms (axis=0) were intended -- confirm before changing.
     q /= np.sqrt(np.sum(q**2, axis=1))
     names = [".Constant", ".Linear", ".Quadratic", ".Cubic"]
     names += ["^%s" % (i, ) for i in range(4, n)]
     names = names[:n]
     if intercept:
         return ContrastMatrix(q, names)
     else:
         # We always include the constant/intercept column as something to
         # orthogonalize against, but we don't always return it:
         return ContrastMatrix(q[:, 1:], names[1:])
Exemple #8
0
def _eval_unary_minus(evaluator, tree):
    """Evaluate unary minus, which is only defined for the literals 0 and 1.

    Returns the ``IntermediateExpr`` built for ``-0`` or ``-1``; raises
    ``CharltonError`` for any other operand.
    """
    operand = tree.args[0]
    if operand == "0":
        return IntermediateExpr(True, tree.origin, False, [])
    if operand == "1":
        return IntermediateExpr(False, None, True, [])
    raise CharltonError("Unary minus can only be applied to 1 or 0", tree)
Exemple #9
0
 def eval(self, data):
     """Evaluate the numeric factor on ``data`` and validate the result.

     The factor is evaluated with ``self._state`` in an environment that
     stacks ``data`` on top of ``self._default_env``; the value is then
     passed through ``atleast_2d_column_default``.

     Returns the resulting array.

     Raises ``CharltonError`` if the column count differs from
     ``self._expected_columns``, if the dtype is not numeric, or if
     ``_max_allowed_dim`` rejects the value.
     """
     result = self.factor.eval(self._state,
                               DictStack([data, self._default_env]))
     result = atleast_2d_column_default(result)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # Argument order matters: the actual count fills the "I got" slot
         # and the expected count fills the "I was expecting" slot.
         raise CharltonError(
             "when evaluating factor %s, I got %s columns "
             "instead of the %s I was expecting" %
             (self.factor.name(), result.shape[1], self._expected_columns),
             self.factor)
     if not np.issubdtype(result.dtype, np.number):
         raise CharltonError(
             "when evaluating numeric factor %s, "
             "I got non-numeric data of type '%s'" %
             (self.factor.name(), result.dtype), self.factor)
     return result
Exemple #10
0
 def transform(self, data, levels=None, **kwargs):
     """Convert ``data`` to a ``Categorical``.

     Raw data is coded with ``Categorical.from_strings`` against ``levels``,
     falling back to ``self._levels_tuple`` when ``levels`` is ``None``.
     Data that is already a ``Categorical`` is re-wrapped with its own
     integer array and levels; asking for different levels raises
     ``CharltonError``.
     """
     if not isinstance(data, Categorical):
         chosen_levels = self._levels_tuple if levels is None else levels
         return Categorical.from_strings(data, levels=chosen_levels,
                                         **kwargs)
     if levels is not None and data.levels != levels:
         raise CharltonError("changing levels of categorical data "
                             "not supported yet")
     return Categorical(data.int_array, data.levels, **kwargs)
Exemple #11
0
 def transform(self, data):
     """Wrap boolean ``data`` in a ``Categorical`` with levels False/True.

     Raises ``CharltonError`` if the data does not have a boolean dtype,
     or if ``_max_allowed_dim`` rejects it.
     """
     bool_array = np.asarray(data)
     _max_allowed_dim(1, bool_array, self.factor)
     # issubdtype(int, bool) is true! So the dtype kind code is checked
     # instead.
     if bool_array.dtype.kind == "b":
         return Categorical(bool_array, levels=[False, True])
     raise CharltonError(
         "factor %s, which I thought was boolean, "
         "gave non-boolean data of dtype %s" %
         (self.factor.name(), bool_array.dtype), self.factor)
Exemple #12
0
def _read_op_context(token_source, c):
    token_type, token = token_source.next()
    assert token_type != tokenize.ENDMARKER
    if token == ")":
        while c.op_stack and c.op_stack[-1].token != "(":
            _run_op(c)
        if not c.op_stack:
            raise CharltonError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].token == "("
        c.op_stack.pop()
        return False
    elif token in c.binary_ops:
        op = c.binary_ops[token].with_origin(token.origin)
        while (c.op_stack and op.precedence <= c.op_stack[-1].precedence):
            _run_op(c)
        c.op_stack.append(op)
        return True
    else:
        raise CharltonError("expected an operator", token)
    assert False
Exemple #13
0
def replace_bare_funcalls(code, replacer):
    """Rewrite bare name references in ``code`` using ``replacer``.

    ``replacer`` is called with every token flagged ``bare_ref`` by
    ``annotated_tokens``; when it returns something different, that value
    replaces the token.  Returns the re-assembled code.

    Raises ``CharltonError`` if a replaced name is not flagged
    ``bare_funcall``.
    """
    rewritten = []
    for token_type, token, props in annotated_tokens(code):
        emitted = token
        if props["bare_ref"]:
            candidate = replacer(token)
            if candidate != token:
                if not props["bare_funcall"]:
                    raise CharltonError(
                        "magic functions like '%s' can only be called, "
                        "not otherwise referenced" % (token, ),
                        token.origin)
                emitted = candidate
        rewritten.append((token_type, emitted))
    return pretty_untokenize(rewritten)
Exemple #14
0
 def eval(self, data):
     """Evaluate the categorical factor on ``data`` and validate the result.

     The factor's value is passed through ``self._postprocessor`` when one
     is set, and must then be a ``Categorical`` whose levels equal
     ``self._expected_levels``.

     Returns the integer codes after ``atleast_2d_column_default``.

     Raises ``CharltonError`` for a non-Categorical result or unexpected
     levels, or if ``_max_allowed_dim`` rejects the codes.
     """
     outcome = self.factor.eval(self._state,
                                DictStack([data, self._default_env]))
     postprocessor = self._postprocessor
     if postprocessor is not None:
         outcome = postprocessor.transform(outcome)
     if not isinstance(outcome, Categorical):
         # outcome.__class__.__name__ would be better, but it is not
         # defined for old-style classes.
         raise CharltonError(
             "when evaluating categoric factor %s, I got a "
             "result that is not of type Categorical (but rather %s)"
             % (self.factor.name(), outcome.__class__),
             self.factor)
     if outcome.levels != self._expected_levels:
         raise CharltonError(
             "when evaluating categoric factor %s, I got Categorical "
             " data with unexpected levels (wanted %s, got %s)" %
             (self.factor.name(), self._expected_levels, outcome.levels),
             self.factor)
     codes = outcome.int_array
     _max_allowed_dim(1, codes, self.factor)
     # For consistency, evaluators *always* return 2d arrays (though in
     # this case it will always have only 1 column):
     return atleast_2d_column_default(codes)
Exemple #15
0
def _read_noun_context(token_source, c):
    token_type, token = token_source.next()
    if token == "(":
        c.op_stack.append(_open_paren.with_origin(token.origin))
        return True
    elif token in c.unary_ops:
        c.op_stack.append(c.unary_ops[token].with_origin(token.origin))
        return True
    elif token == ")" or token in c.binary_ops:
        raise CharltonError("expected a noun, not '%s'" % (token, ), token)
    elif token_type == tokenize.ENDMARKER:
        assert c.op_stack
        raise CharltonError("expected a noun, but the formula ended instead",
                            c.op_stack[-1])
    elif token_type == tokenize.NUMBER:
        c.noun_stack.append(token)
        return False
    else:
        token_source.push_back(token_type, token)
        c.noun_stack.append(_read_python_expr(token_source, c))
        return False
    assert False
Exemple #16
0
def parse(code, extra_operators=None):
    """Parse formula ``code`` into a tree whose root is a ``~`` node.

    Parameters:
      code: the formula text.  Newlines are treated as spaces, and an empty
        formula is read as ``"~ 1"``.
      extra_operators: optional sequence of additional operator objects;
        each needs ``token``, ``arity`` (1 or 2) and a non-negative
        ``precedence``.

    Returns the parsed tree.  If the formula's top-level node is not a
    ``~`` node, the result is wrapped in a unary ``~`` node.

    Raises:
      ValueError: for an operator with negative precedence or with an
        arity other than 1 or 2.
      CharltonError: for malformed formulas (e.g. an unmatched "(").
    """
    # A None default avoids sharing one mutable list between calls.
    if extra_operators is None:
        extra_operators = []
    code = code.replace("\n", " ").strip()
    if not code:
        code = "~ 1"
    token_source = TokenSource(code)

    for extra_operator in extra_operators:
        if extra_operator.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")

    all_op_list = _default_ops + list(extra_operators)
    unary_ops = {}
    binary_ops = {}
    for op in all_op_list:
        if op.arity == 1:
            unary_ops[op.token] = op
        elif op.arity == 2:
            binary_ops[op.token] = op
        else:
            raise ValueError("operators must be unary or binary")

    c = _ParseContext(unary_ops, binary_ops)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm

    want_noun = True
    while True:
        if want_noun:
            want_noun = _read_noun_context(token_source, c)
        else:
            # The formula may only end where an operator could appear.
            if token_source.peek()[0] == tokenize.ENDMARKER:
                break
            want_noun = _read_op_context(token_source, c)

    # Run whatever operators are still pending; a leftover "(" was never
    # closed.
    while c.op_stack:
        if c.op_stack[-1].token == "(":
            raise CharltonError("Unmatched '('", c.op_stack[-1])
        _run_op(c)

    assert len(c.noun_stack) == 1
    tree = c.noun_stack.pop()
    if not isinstance(tree, ParseNode) or tree.op.token != "~":
        tree = ParseNode(unary_ops["~"], [tree], tree.origin)
    return tree
Exemple #17
0
def _eval_binary_power(evaluator, tree):
    """Evaluate ``expr ** n``.

    The left operand is evaluated and repeatedly combined with itself via
    ``_interaction``; the terms of every intermediate result are collected
    into the returned ``IntermediateExpr``.

    Raises ``CharltonError`` if the exponent is not a positive integer, or
    if ``_check_interactable`` rejects the left operand.
    """
    left_expr = evaluator.eval(tree.args[0])
    _check_interactable(left_expr)
    power = -1
    try:
        power = int(tree.args[1])
    except (ValueError, TypeError):
        # Keep the -1 sentinel so the check below reports the bad exponent.
        pass
    if power < 1:
        raise CharltonError("'**' requires a positive integer", tree.args[1])
    all_terms = left_expr.terms
    big_expr = left_expr
    # Small optimization: (a + b)**100 is just the same as (a + b)**2.
    power = min(len(left_expr.terms), power)
    # range() exists on every Python version; xrange() is Python 2 only.
    for _ in range(1, power):
        big_expr = _interaction(left_expr, big_expr)
        all_terms = all_terms + big_expr.terms
    return IntermediateExpr(False, None, False, all_terms)
Exemple #18
0
 def index(self, column_specifier):
     """Take anything (raw indices, term names, column names...) and return
     something that can be used as an index into the model matrix
     ndarray.

     Integer and boolean specifiers are returned unchanged as arrays.  Any
     other specifier is treated as one or more names, each looked up in
     ``self.term_to_columns``, then ``self.term_name_to_columns``, then
     ``self.column_name_to_column``; the matching column numbers are
     returned as a list.

     Raises ``CharltonError`` for a name found in none of the three tables.
     """
     column_specifier = np.atleast_1d(column_specifier)
     # np.integer covers every integer width, signed or unsigned; testing
     # against the builtin ``int`` only matches NumPy's default integer.
     if np.issubdtype(column_specifier.dtype, np.integer):
         return column_specifier
     if column_specifier.dtype.kind == "b":
         return column_specifier
     columns = []
     for name in column_specifier:
         if name in self.term_to_columns:
             columns.extend(range(*self.term_to_columns[name]))
         elif name in self.term_name_to_columns:
             columns.extend(range(*self.term_name_to_columns[name]))
         elif name in self.column_name_to_column:
             columns.append(self.column_name_to_column[name])
         else:
             raise CharltonError("unknown column specifier '%s'" % (name, ))
     return columns
Exemple #19
0
    def memorize_passes_needed(self, state, stateful_transforms):
        """Rewrite ``self.code`` for stateful transforms and plan the passes.

        Every bare call to a name found in ``stateful_transforms`` is
        redirected to the ``transform`` method of a freshly created,
        uniquely named transform object.  The following keys of ``state``
        are filled in:

          * ``"transforms"``: generated object name -> transform object;
          * ``"eval_code"``: the rewritten code;
          * ``"memorize_code"``: object name -> code for the matching
            ``memorize_chunk`` call;
          * ``"pass_bins"``: list of sets of object names, ordered so that
            each object's memorize code refers only to objects in earlier
            bins.

        Returns the number of passes, i.e. ``len(state["pass_bins"])``.

        Raises ``CharltonError`` if ``self.code`` already refers to one of
        the generated names.
        """
        # 'stateful_transforms' is a dict {name: transform_factory}, where
        # transform_factory is just a zero-arg callable that makes the given
        # sort of transform (probably just the class itself).
        # 'state' is just an empty dict which we can do whatever we want with,
        # and that will be passed back to later memorize functions
        state["transforms"] = {}

        # example code: == "2 * center(x)"
        # A one-element list serves as a counter that the nested function
        # can update.
        i = [0]

        def new_name_maker(token):
            if token in stateful_transforms:
                obj_name = "_charlton_stobj%s__%s__" % (i[0], token)
                i[0] += 1
                state["transforms"][obj_name] = stateful_transforms[token]()
                return obj_name + ".transform"
            else:
                return token

        # example eval_code: == "2 * _charlton_stobj0__center__.transform(x)"
        eval_code = replace_bare_funcalls(self.code, new_name_maker)
        state["eval_code"] = eval_code
        # paranoia: verify that none of our new names appeared anywhere in the
        # original code
        if has_bare_variable_reference(state["transforms"], self.code):
            # Find out which generated names clash, so that the message can
            # name them.
            clashing = sorted(
                name for name in state["transforms"]
                if has_bare_variable_reference(set([name]), self.code))
            # NOTE(review): no origin is attached, because the offending
            # token is not available in this scope -- attach one if the
            # caller can supply it.
            raise CharltonError(
                "names of this form are reserved for "
                "internal use (%s)" % (", ".join(clashing), ))
        # Pull out all the '_charlton_stobj0__center__.transform(x)' pieces
        # to make '_charlton_stobj0__center__.memorize_chunk(x)' pieces
        state["memorize_code"] = {}
        for obj_name in state["transforms"]:
            transform_calls = capture_obj_method_calls(obj_name, eval_code)
            assert len(transform_calls) == 1
            transform_call = transform_calls[0]
            transform_call_name, transform_call_code = transform_call
            assert transform_call_name == obj_name + ".transform"
            assert transform_call_code.startswith(transform_call_name + "(")
            # Keep the argument list, swapping only the method name.
            memorize_code = (obj_name + ".memorize_chunk" +
                             transform_call_code[len(transform_call_name):])
            state["memorize_code"][obj_name] = memorize_code
        # Then sort the codes into bins, so that every item in bin number i
        # depends only on items in bin (i-1) or less. (By 'depends', we mean
        # that in something like:
        #   spline(center(x))
        # we have to first run:
        #    center.memorize_chunk(x)
        # then
        #    center.memorize_finish(x)
        # and only then can we run:
        #    spline.memorize_chunk(center.transform(x))
        # Since all of our objects have unique names, figuring out who
        # depends on who is pretty easy -- we just check whether the
        # memorization code for spline:
        #    spline.memorize_chunk(center.transform(x))
        # mentions the variable 'center' (which in the example, of course, it
        # does).
        pass_bins = []
        unsorted = set(state["transforms"])
        while unsorted:
            pass_bin = set()
            for obj_name in unsorted:
                other_objs = unsorted.difference([obj_name])
                memorize_code = state["memorize_code"][obj_name]
                if not has_bare_variable_reference(other_objs, memorize_code):
                    pass_bin.add(obj_name)
            # Every round must resolve at least one object, or the loop
            # would never end.
            assert pass_bin
            unsorted.difference_update(pass_bin)
            pass_bins.append(pass_bin)
        state["pass_bins"] = pass_bins

        return len(pass_bins)
Exemple #20
0
def _max_allowed_dim(dim, arr, factor):
    if arr.ndim > dim:
        msg = ("factor '%s' evaluates to an %s-dimensional array; I only "
               "handle arrays with dimension <= %s" %
               (factor.name(), arr.ndim, dim))
        raise CharltonError(msg, factor)
Exemple #21
0
def _check_interactable(expr):
    if expr.intercept:
        raise CharltonError("intercept term cannot interact with "
                            "anything else", expr.intercept_origin)