Ejemplo n.º 1
 # Evaluate one node of a parsed formula -- either a string leaf or a
 # ParseNode -- and return the value produced for it.
 #
 # When require_evalexpr is true, a result that is not an IntermediateExpr
 # is rejected with a CharltonError (a ModelDesc result gets its own
 # message about '~').
 #
 # NOTE(review): this listing looks truncated -- several `else:` lines and
 # the trailing arguments of some multi-line calls appear to be missing, so
 # the block does not parse as shown. Restore them from the upstream source
 # before relying on it.
 def eval(self, tree, require_evalexpr=True):
     result = None
     if isinstance(tree, str):
         # String leaves: "0" and "1" map to fixed IntermediateExpr values;
         # any other int/float literal is an error.
         if tree == "0":
             result = IntermediateExpr(False, None, True, [])
         elif tree == "1":
             result = IntermediateExpr(True, tree.origin, False, [])
         elif self._is_a(int, tree) or self._is_a(float, tree):
             raise CharltonError("numbers besides '0' and '1' are "
                                 "only allowed with **", tree)
             # Guess it's a Python expression
             result = IntermediateExpr(False, None, False,
         assert isinstance(tree, ParseNode)
         # Operators are dispatched on (token, arity) through self._evaluators.
         key = (tree.op.token, len(tree.args))
         if key not in self._evaluators:
             raise CharltonError("I don't know how to evaluate "
                                 "this '%s' operator" % (tree.op.token,),
         result = self._evaluators[key](self, tree)
     # Optionally insist that the evaluator produced an IntermediateExpr.
     if require_evalexpr and not isinstance(result, IntermediateExpr):
         if isinstance(result, ModelDesc):
             raise CharltonError("~ can only be used once, and "
                                 "only at the top level",
             raise CharltonError("custom operator returned an "
                                 "object that I don't know how to "
                                 "handle", tree)
     return result
Ejemplo n.º 2
def _read_python_expr(token_source, c):
    """Consume the tokens of one embedded Python expression.

    Tokens are pulled from ``token_source`` until, at bracket depth zero,
    the next token is one of the operators known to ``c`` (unary or
    binary), a ")" or the end marker. Every consumed token is validated
    with _check_token and recorded.

    Returns a StringWithOrigin built from the recorded tokens.

    Raises CharltonError for an unmatched close bracket, for a bracket
    still open at the end of input, and for any token _check_token rejects.
    """
    # Build the stop set by iterating the dicts instead of concatenating
    # ``keys()`` results, which only works when keys() returns a list.
    end_tokens = set(c.binary_ops)
    end_tokens.update(c.unary_ops)
    end_tokens.add(")")
    token_types = []
    tokens = []
    bracket_level = 0
    while (bracket_level
           or (token_source.peek()[1] not in end_tokens
               and token_source.peek()[0] != tokenize.ENDMARKER)):
        assert bracket_level >= 0
        (token_type, token) = token_source.next()
        _check_token(token_type, token)
        if token in ("(", "[", "{"):
            bracket_level += 1
        if token in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise CharltonError("unmatched close bracket", token)
        if token_type == tokenize.ENDMARKER:
            # We only get here while inside brackets: at depth zero the loop
            # condition stops before the end marker is consumed.
            assert bracket_level > 0
            raise CharltonError(
                "unclosed bracket in embedded Python "
                "expression", _combine_origin_attrs(tokens))
        # Record the token; without this the expression text built below
        # would always be empty.
        token_types.append(token_type)
        tokens.append(token)
    text = pretty_untokenize(zip(token_types, tokens))
    return StringWithOrigin(text, _combine_origin_attrs(tokens))
Ejemplo n.º 3
 def from_strings(cls, sequence, levels=None, **kwargs):
     """Build an instance of ``cls`` from a sequence of hashable items.

     sequence -- the raw data; each entry is mapped to the position of the
         matching entry in ``levels``.
     levels -- the allowed values, in coding order. When None, the distinct
         items of ``sequence`` are used (their order is whatever ``set``
         iteration yields).
     kwargs -- passed through unchanged to ``cls``.

     Returns ``cls(int_array, levels, **kwargs)`` where ``int_array`` is an
     integer numpy array with one code per entry of ``sequence``.

     Raises CharltonError if an item or level is unhashable, or if an entry
     of ``sequence`` is not among ``levels``.
     """
     if levels is None:
         try:
             levels = list(set(sequence))
         except TypeError:
             raise CharltonError("Error converting data to categorical: "
                                 "all items must be hashable")
     level_to_int = {}
     for i, level in enumerate(levels):
         try:
             level_to_int[level] = i
         except TypeError:
             raise CharltonError(
                 "Error converting data to categorical: "
                 "all levels must be hashable (and %r isn't)" % (level, ))
     int_array = np.empty(len(sequence), dtype=int)
     for i, entry in enumerate(sequence):
         try:
             int_array[i] = level_to_int[entry]
         except KeyError:
             # A failed dict lookup raises KeyError (not ValueError), so this
             # is the exception that signals an entry outside ``levels``.
             raise CharltonError("Error converting data to categorical: "
                                 "object '%r' does not match any of the "
                                 "expected levels" % (entry, ))
     return cls(int_array, levels, **kwargs)
Ejemplo n.º 4
def _check_token(token_type, token):
    """Reject token kinds that are not permitted in the input.

    Raises CharltonError, passing ``token`` along, for tokenizer error
    tokens and for comments. Returns None for every other token.
    """
    # Line-break tokens are filtered out of our input string, so meeting one
    # here means something upstream went wrong.
    line_break_kinds = (tokenize.NL, tokenize.NEWLINE)
    assert token_type not in line_break_kinds
    forbidden = (
        (tokenize.ERRORTOKEN,
         "error tokenizing input (maybe an unclosed string?)"),
        (tokenize.COMMENT, "comments are not allowed"),
    )
    for kind, complaint in forbidden:
        if token_type == kind:
            raise CharltonError(complaint, token)
Ejemplo n.º 5
# Inspect the values each factor evaluates to and classify the factors.
#
# Returns a 3-tuple of dicts keyed by factor:
#   num_column_counts    -- column count for factors with a numeric dtype
#   cat_levels_contrasts -- (levels, contrast) for categorical factors
#   cat_postprocessors   -- the postprocessor object attached to a factor
#
# NOTE(review): this listing looks truncated. The signature is cut off after
# `default_env,` (the loop below uses `data_iter_maker`, presumably a further
# parameter -- confirm), the `if not examine_needed:` has no body, an `else:`
# before the second column-count check seems to be missing, and `processor`
# is fetched but never used. The block does not parse as shown; restore it
# from the upstream source.
def _examine_factor_types(factors, factor_states, default_env,
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the first
        # chunk of data. If so, then we shouldn't spend time loading all the
        # rest of the chunks.
        if not examine_needed:
        # Iterate over a copy of the set of factors still to be examined.
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            # Values that already are Categorical carry their own levels
            # and contrast.
            if isinstance(value, Categorical):
                cat_levels_contrasts[factor] = (value.levels, value.contrast)
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays" % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors" %
                           (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
    # Factors handled by a postprocessor take their levels from it and get
    # no explicit contrast.
    for factor, processor in cat_postprocessors.iteritems():
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts, cat_levels_contrasts, cat_postprocessors)
Ejemplo n.º 6
def make_model_matrices(builders, data, dtype=float):
    """Evaluate every builder's factors on ``data`` and build the matrices.

    builders -- objects exposing ``_evaluators`` (an iterable of hashable
        evaluator objects) and ``_build(evaluator_to_values, dtype)``.
    data -- passed unchanged to each evaluator's ``eval``.
    dtype -- forwarded to each builder's ``_build``.

    Each distinct evaluator is evaluated once, even when shared between
    builders, and must return a 2-d value. Returns a list with one
    ``_build`` result per builder, in the order of ``builders``.

    Raises CharltonError if two evaluated values disagree on their number
    of rows.
    """
    evaluator_to_values = {}
    num_rows = None
    for builder in builders:
        # We look at evaluators rather than factors here, because it might
        # happen that we have the same factor twice, but with different
        # memorized state.
        for evaluator in builder._evaluators:
            if evaluator in evaluator_to_values:
                continue
            value = evaluator.eval(data)
            assert value.ndim == 2
            if num_rows is None:
                # The first value seen fixes the expected row count.
                num_rows = value.shape[0]
            elif num_rows != value.shape[0]:
                msg = ("Row mismatch: factor %s had %s rows, when "
                       "previous factors had %s rows" %
                       (evaluator.factor.name(), value.shape[0], num_rows))
                raise CharltonError(msg, evaluator.factor)
            evaluator_to_values[evaluator] = value
    matrices = []
    for builder in builders:
        matrices.append(builder._build(evaluator_to_values, dtype))
    return matrices
Ejemplo n.º 7
 def _code_either(self, intercept, levels):
     """Build a polynomial contrast matrix for ``levels``.

     intercept -- when true, the constant column is included in the result;
         otherwise it is dropped.
     levels -- the factor levels; only their count is used here.

     The scores come from ``self.scores`` (or 0..n-1 when that is None) and
     must number exactly ``len(levels)``.

     Returns a ContrastMatrix holding the orthogonalized columns and their
     names. Raises CharltonError when the number of scores does not match
     the number of levels.
     """
     n = len(levels)
     scores = self.scores
     if scores is None:
         scores = np.arange(n)
     # Take a float copy: centring below is done in place, which must neither
     # modify self.scores nor try to store fractional means in an int array.
     scores = np.array(scores, dtype=float)
     if len(scores) != n:
         raise CharltonError("number of levels (%s) does not match"
                             " number of scores (%s)" % (n, len(scores)))
     # Strategy: just make a matrix whose columns are naive linear,
     # quadratic, etc., functions of the raw scores, and then use 'qr' to
     # orthogonalize each column against those to its left.
     scores -= scores.mean()
     raw_poly = scores.reshape((-1, 1))**np.arange(n).reshape((1, -1))
     q, r = np.linalg.qr(raw_poly)
     # Flip the sign of every column whose diagonal entry in R is negative.
     q *= np.sign(np.diag(r))
     q /= np.sqrt(np.sum(q**2, axis=1))
     names = [".Constant", ".Linear", ".Quadratic", ".Cubic"]
     names += ["^%s" % (i, ) for i in range(4, n)]
     names = names[:n]
     if intercept:
         return ContrastMatrix(q, names)
     else:
         # We always include the constant/intercept column as something to
         # orthogonalize against, but we don't always return it:
         return ContrastMatrix(q[:, 1:], names[1:])
Ejemplo n.º 8
def _eval_unary_minus(evaluator, tree):
    """Evaluate a unary minus node, which is only valid on "0" or "1".

    evaluator -- unused here; part of the common evaluator signature.
    tree -- the parse node; ``tree.args[0]`` is the operand.

    Returns IntermediateExpr(True, tree.origin, False, []) for "-0" and
    IntermediateExpr(False, None, True, []) for "-1".
    NOTE(review): presumably these are the "intercept present" and
    "intercept removed" forms respectively -- confirm against
    IntermediateExpr.

    Raises CharltonError for any other operand.
    """
    operand = tree.args[0]
    if operand == "0":
        return IntermediateExpr(True, tree.origin, False, [])
    elif operand == "1":
        return IntermediateExpr(False, None, True, [])
    else:
        # Without this branch, invalid operands silently produced None.
        raise CharltonError("Unary minus can only be applied to 1 or 0", tree)
Ejemplo n.º 9
 def eval(self, data):
     """Evaluate this numeric factor on ``data`` and validate the result.

     The factor is evaluated with ``self._state`` in a DictStack of
     ``[data, self._default_env]``; the value is then passed through
     atleast_2d_column_default and checked to have at most 2 dimensions.

     Returns the resulting array.

     Raises CharltonError if the number of columns differs from
     ``self._expected_columns`` or if the dtype is not numeric.
     """
     result = self.factor.eval(self._state,
                               DictStack([data, self._default_env]))
     result = atleast_2d_column_default(result)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # Argument order follows the message: the column count we got comes
         # first, the count we expected second.
         raise CharltonError(
             "when evaluating factor %s, I got %s columns "
             "instead of the %s I was expecting" %
             (self.factor.name(), result.shape[1], self._expected_columns),
             self.factor)
     if not np.issubdtype(result.dtype, np.number):
         raise CharltonError(
             "when evaluating numeric factor %s, "
             "I got non-numeric data of type '%s'" %
             (self.factor.name(), result.dtype), self.factor)
     return result
Ejemplo n.º 10
 def transform(self, data, levels=None, **kwargs):
     """Return ``data`` as a Categorical.

     Data that is not already Categorical is converted with
     Categorical.from_strings, using ``levels`` or, when that is None,
     ``self._levels_tuple``. Data that already is Categorical is re-wrapped
     with its own int_array and levels; asking for different levels raises
     CharltonError. Extra keyword arguments go to the Categorical
     constructor / from_strings.
     """
     if not isinstance(data, Categorical):
         wanted = self._levels_tuple if levels is None else levels
         return Categorical.from_strings(data, levels=wanted, **kwargs)
     if levels is not None and data.levels != levels:
         raise CharltonError("changing levels of categorical data "
                             "not supported yet")
     return Categorical(data.int_array, data.levels, **kwargs)
Ejemplo n.º 11
 def transform(self, data):
     """Wrap boolean ``data`` in a Categorical with levels [False, True].

     The data is converted with np.asarray and may have at most one
     dimension. Raises CharltonError if its dtype is not boolean.
     """
     values = np.asarray(data)
     _max_allowed_dim(1, values, self.factor)
     # issubdtype(int, bool) is true, so test the dtype kind code instead.
     if values.dtype.kind == "b":
         return Categorical(values, levels=[False, True])
     complaint = ("factor %s, which I thought was boolean, "
                  "gave non-boolean data of dtype %s"
                  % (self.factor.name(), values.dtype))
     raise CharltonError(complaint, self.factor)
Ejemplo n.º 12
# Read one token in "operator expected" position and update the parse
# context `c`.
#
# As far as the visible code shows: a ")" must find a "(" on c.op_stack
# (otherwise CharltonError) and returns False; a binary operator is looked
# up in c.binary_ops, tagged with the token's origin, and returns True.
# The caller (parse) stores the return value in its `want_noun` flag.
#
# NOTE(review): this listing looks truncated -- both `while` loops have no
# body, the created `op` is never pushed anywhere, and an `else:` before
# the final raise seems to be missing, so the block does not parse as
# shown. Restore it from the upstream source.
def _read_op_context(token_source, c):
    token_type, token = token_source.next()
    assert token_type != tokenize.ENDMARKER
    if token == ")":
        while c.op_stack and c.op_stack[-1].token != "(":
        if not c.op_stack:
            raise CharltonError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].token == "("
        return False
    elif token in c.binary_ops:
        op = c.binary_ops[token].with_origin(token.origin)
        while (c.op_stack and op.precedence <= c.op_stack[-1].precedence):
        return True
        raise CharltonError("expected an operator", token)
    assert False
Ejemplo n.º 13
def replace_bare_funcalls(code, replacer):
    """Rewrite bare function names in ``code`` using ``replacer``.

    Every token flagged as a bare reference by annotated_tokens is passed
    to ``replacer``; when the result differs from the token, it replaces
    the token in the output. Such a replacement is only allowed where the
    name is being called: otherwise CharltonError is raised.

    Returns the re-assembled source text from pretty_untokenize.
    """
    def substitute(text, info):
        # Tokens that are not bare references pass through untouched.
        if not info["bare_ref"]:
            return text
        candidate = replacer(text)
        changed = candidate != text
        if not changed:
            return text
        if not info["bare_funcall"]:
            complaint = ("magic functions like '%s' can only be called, "
                         "not otherwise referenced" % (text, ))
            raise CharltonError(complaint, text.origin)
        return candidate

    rewritten = [(kind, substitute(text, info))
                 for (kind, text, info) in annotated_tokens(code)]
    return pretty_untokenize(rewritten)
Ejemplo n.º 14
 def eval(self, data):
     """Evaluate this categorical factor on ``data`` and validate the result.

     The factor is evaluated with ``self._state`` in a DictStack of
     ``[data, self._default_env]`` and, if a postprocessor is set, passed
     through its ``transform``. The outcome must be a Categorical whose
     levels equal ``self._expected_levels`` and whose int_array has at most
     one dimension; otherwise CharltonError is raised.

     Returns the int_array passed through atleast_2d_column_default.
     """
     outcome = self.factor.eval(self._state,
                                DictStack([data, self._default_env]))
     post = self._postprocessor
     if post is not None:
         outcome = post.transform(outcome)
     if not isinstance(outcome, Categorical):
         # outcome.__class__.__name__ would read better, but it is not
         # defined for old-style classes.
         details = (self.factor.name(), outcome.__class__)
         complaint = (
             "when evaluating categoric factor %s, I got a "
             "result that is not of type Categorical (but rather %s)"
             % details)
         raise CharltonError(complaint, self.factor)
     if outcome.levels != self._expected_levels:
         details = (self.factor.name(), self._expected_levels,
                    outcome.levels)
         complaint = ("when evaluating categoric factor %s, I got Categorical "
                      " data with unexpected levels (wanted %s, got %s)" %
                      details)
         raise CharltonError(complaint, self.factor)
     codes = outcome.int_array
     _max_allowed_dim(1, codes, self.factor)
     # For consistency, evaluators always hand back the output of
     # atleast_2d_column_default, even for this single column of codes.
     return atleast_2d_column_default(codes)
Ejemplo n.º 15
# Read one token in "noun expected" position and update the parse
# context `c`.
#
# As far as the visible code shows: "(" and unary operators return True,
# a ")" or a binary operator is an error, the end marker is an error
# ("formula ended"), a NUMBER returns False, and the trailing lines push
# the token back and read an embedded Python expression onto c.noun_stack
# before returning False. The caller (parse) stores the return value in
# its `want_noun` flag.
#
# NOTE(review): this listing looks truncated -- the "(" / unary / NUMBER
# branches push nothing onto any stack, the end-marker raise is cut off
# mid-call, and an `else:` before the push_back seems to be missing, so
# the block does not parse as shown. Restore it from the upstream source.
def _read_noun_context(token_source, c):
    token_type, token = token_source.next()
    if token == "(":
        return True
    elif token in c.unary_ops:
        return True
    elif token == ")" or token in c.binary_ops:
        raise CharltonError("expected a noun, not '%s'" % (token, ), token)
    elif token_type == tokenize.ENDMARKER:
        assert c.op_stack
        raise CharltonError("expected a noun, but the formula ended instead",
    elif token_type == tokenize.NUMBER:
        return False
        token_source.push_back(token_type, token)
        c.noun_stack.append(_read_python_expr(token_source, c))
        return False
    assert False
Ejemplo n.º 16
# Parse a formula string into a tree whose root is a "~" ParseNode.
#
# code            -- the formula text; newlines are turned into spaces and
#                    an empty formula is treated as "~ 1".
# extra_operators -- additional operator objects; each must have
#                    precedence >= 0 and arity 1 or 2 (ValueError otherwise).
#
# Returns the single node left on the noun stack, wrapped in a unary "~"
# node when its own operator is not "~". Raises CharltonError for an
# unmatched "(".
#
# NOTE(review): this listing looks truncated -- an `else:` before the
# second ValueError, the `else:`/`break` of the main loop and the bodies
# that pop c.op_stack appear to be missing, so the block does not parse as
# shown. It also uses the Python 2 `raise E, "msg"` form.
def parse(code, extra_operators=[]):
    code = code.replace("\n", " ").strip()
    if not code:
        code = "~ 1"
    token_source = TokenSource(code)

    for extra_operator in extra_operators:
        if extra_operator.precedence < 0:
            raise ValueError, "all operators must have precedence >= 0"

    # Split the default and extra operators into lookup tables by arity.
    all_op_list = _default_ops + extra_operators
    unary_ops = {}
    binary_ops = {}
    for op in all_op_list:
        if op.arity == 1:
            unary_ops[op.token] = op
        elif op.arity == 2:
            binary_ops[op.token] = op
            raise ValueError, "operators must be unary or binary"

    c = _ParseContext(unary_ops, binary_ops)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm

    # want_noun alternates between the two reader functions below.
    want_noun = True
    while True:
        if want_noun:
            want_noun = _read_noun_context(token_source, c)
            if token_source.peek()[0] == tokenize.ENDMARKER:
            want_noun = _read_op_context(token_source, c)

    # Any "(" still on the operator stack at this point was never closed.
    while c.op_stack:
        if c.op_stack[-1].token == "(":
            raise CharltonError("Unmatched '('", c.op_stack[-1])

    assert len(c.noun_stack) == 1
    tree = c.noun_stack.pop()
    if not isinstance(tree, ParseNode) or tree.op.token != "~":
        tree = ParseNode(unary_ops["~"], [tree], tree.origin)
    return tree
Ejemplo n.º 17
def _eval_binary_power(evaluator, tree):
    """Evaluate ``left ** n`` for a positive integer ``n``.

    evaluator -- used to evaluate the left operand (``tree.args[0]``).
    tree -- the parse node; ``tree.args[1]`` must convert to an int >= 1.

    Returns an IntermediateExpr(False, None, False, terms) whose terms are
    the left operand's terms followed by those of each successive
    _interaction of the left operand with the running product.

    Raises CharltonError if the exponent is not a positive integer.
    """
    left_expr = evaluator.eval(tree.args[0])
    power = -1
    try:
        power = int(tree.args[1])
    except (ValueError, TypeError):
        # Not convertible to an integer: keep the -1 sentinel so the check
        # below reports the problem.
        pass
    if power < 1:
        raise CharltonError("'**' requires a positive integer", tree.args[1])
    all_terms = left_expr.terms
    big_expr = left_expr
    # Small optimization: (a + b)**100 is just the same as (a + b)**2.
    power = min(len(left_expr.terms), power)
    for i in range(1, power):
        big_expr = _interaction(left_expr, big_expr)
        all_terms = all_terms + big_expr.terms
    return IntermediateExpr(False, None, False, all_terms)
Ejemplo n.º 18
 def index(self, column_specifier):
     """Take anything (raw indices, term names, column names...) and return
     something that can be used as an index into the model matrix.

     column_specifier -- a scalar or sequence. Integer and boolean
         specifiers are returned as numpy arrays unchanged. Anything else
         is looked up, item by item, in ``self.term_to_columns``, then
         ``self.term_name_to_columns`` (both map to arguments for
         ``range``), then ``self.column_name_to_column`` (maps to a single
         column).

     Returns a numpy array (integer/boolean input) or a list of column
     numbers. Raises CharltonError for an item found in none of the
     mappings.
     """
     column_specifier = np.atleast_1d(column_specifier)
     # np.integer matches every integer width, not just the default int.
     if np.issubdtype(column_specifier.dtype, np.integer):
         return column_specifier
     if column_specifier.dtype.kind == "b":
         return column_specifier
     columns = []
     for name in column_specifier:
         if name in self.term_to_columns:
             columns += range(*self.term_to_columns[name])
         elif name in self.term_name_to_columns:
             columns += range(*self.term_name_to_columns[name])
         elif name in self.column_name_to_column:
             columns.append(self.column_name_to_column[name])
         else:
             raise CharltonError("unknown column specifier '%s'" % (name, ))
     return columns
Ejemplo n.º 19
    # Prepare `state` for the memorization passes of this factor's code and
    # return how many passes are needed (len(pass_bins)).
    #
    # Entries written into `state`: "transforms" (object name -> transform
    # instance), "eval_code" (self.code with stateful calls renamed),
    # "memorize_code" (object name -> code string) and "pass_bins".
    #
    # NOTE(review): this listing looks truncated -- an `else:` before
    # `return token` in new_name_maker, the tail of the `memorize_code = (`
    # expression, and the body that fills `pass_bin` / `pass_bins` and
    # shrinks `unsorted` appear to be missing, so the block does not parse
    # as shown. Also, the "reserved for internal use" error refers to
    # `token`, which is not defined in this scope -- confirm against the
    # upstream source.
    def memorize_passes_needed(self, state, stateful_transforms):
        # 'stateful_transforms' is a dict {name: transform_factory}, where
        # transform_factory is just a zero-arg callable that makes the given
        # sort of transform (probably just the class itself).
        # 'state' is just an empty dict which we can do whatever we want with,
        # and that will be passed back to later memorize functions
        state["transforms"] = {}

        # example code: == "2 * center(x)"
        # One-element list so the nested function can update the counter.
        i = [0]

        def new_name_maker(token):
            if token in stateful_transforms:
                obj_name = "_charlton_stobj%s__%s__" % (i[0], token)
                i[0] += 1
                state["transforms"][obj_name] = stateful_transforms[token]()
                return obj_name + ".transform"
                return token

        # example eval_code: == "2 * _charlton_stobj0__center__.transform(x)"
        eval_code = replace_bare_funcalls(self.code, new_name_maker)
        state["eval_code"] = eval_code
        # paranoia: verify that none of our new names appeared anywhere in the
        # original code
        if has_bare_variable_reference(state["transforms"], self.code):
            raise CharltonError(
                "names of this form are reserved for "
                "internal use (%s)" % (token, ), token.origin)
        # Pull out all the '_charlton_stobj0__center__.transform(x)' pieces
        # to make '_charlton_stobj0__center__.memorize_chunk(x)' pieces
        state["memorize_code"] = {}
        for obj_name in state["transforms"]:
            transform_calls = capture_obj_method_calls(obj_name, eval_code)
            assert len(transform_calls) == 1
            transform_call = transform_calls[0]
            transform_call_name, transform_call_code = transform_call
            assert transform_call_name == obj_name + ".transform"
            assert transform_call_code.startswith(transform_call_name + "(")
            memorize_code = (obj_name + ".memorize_chunk" +
            state["memorize_code"][obj_name] = memorize_code
        # Then sort the codes into bins, so that every item in bin number i
        # depends only on items in bin (i-1) or less. (By 'depends', we mean
        # that in something like:
        #   spline(center(x))
        # we have to first run:
        #    center.memorize_chunk(x)
        # then
        #    center.memorize_finish(x)
        # and only then can we run:
        #    spline.memorize_chunk(center.transform(x))
        # Since all of our objects have unique names, figuring out who
        # depends on who is pretty easy -- we just check whether the
        # memorization code for spline:
        #    spline.memorize_chunk(center.transform(x))
        # mentions the variable 'center' (which in the example, of course, it
        # does).
        pass_bins = []
        unsorted = set(state["transforms"])
        while unsorted:
            pass_bin = set()
            for obj_name in unsorted:
                other_objs = unsorted.difference([obj_name])
                memorize_code = state["memorize_code"][obj_name]
                if not has_bare_variable_reference(other_objs, memorize_code):
            assert pass_bin
        state["pass_bins"] = pass_bins

        return len(pass_bins)
Ejemplo n.º 20
def _max_allowed_dim(dim, arr, factor):
    """Raise CharltonError if ``arr`` has more than ``dim`` dimensions.

    ``factor`` supplies the name used in the message and is passed to the
    error as its second argument. Returns None when the array is acceptable.
    """
    actual = arr.ndim
    if actual <= dim:
        return
    complaint = ("factor '%s' evaluates to an %s-dimensional array; I only "
                 "handle arrays with dimension <= %s"
                 % (factor.name(), arr.ndim, dim))
    raise CharltonError(complaint, factor)
Ejemplo n.º 21
def _check_interactable(expr):
    """Raise CharltonError if ``expr`` has its ``intercept`` flag set.

    The error carries ``expr.intercept_origin``. Returns None otherwise.
    """
    has_intercept = expr.intercept
    if not has_intercept:
        return
    complaint = "intercept term cannot interact with anything else"
    raise CharltonError(complaint, expr.intercept_origin)