def eval(self, tree, require_evalexpr=True):
    result = None
    if isinstance(tree, str):
        if tree == "0":
            result = IntermediateExpr(False, None, True, [])
        elif tree == "1":
            result = IntermediateExpr(True, tree.origin, False, [])
        elif self._is_a(int, tree) or self._is_a(float, tree):
            raise CharltonError("numbers besides '0' and '1' are "
                                "only allowed with **", tree)
        else:
            # Guess it's a Python expression
            result = IntermediateExpr(False, None, False,
                                      [Term([EvalFactor(tree)])])
    else:
        assert isinstance(tree, ParseNode)
        key = (tree.op.token, len(tree.args))
        if key not in self._evaluators:
            raise CharltonError("I don't know how to evaluate "
                                "this '%s' operator" % (tree.op.token,),
                                tree.op)
        result = self._evaluators[key](self, tree)
    if require_evalexpr and not isinstance(result, IntermediateExpr):
        if isinstance(result, ModelDesc):
            raise CharltonError("~ can only be used once, and "
                                "only at the top level", tree)
        else:
            raise CharltonError("custom operator returned an "
                                "object that I don't know how to "
                                "handle", tree)
    return result

def _read_python_expr(token_source, c):
    end_tokens = set(c.binary_ops.keys() + c.unary_ops.keys() + [")"])
    token_types = []
    tokens = []
    bracket_level = 0
    while (bracket_level
           or (token_source.peek()[1] not in end_tokens
               and token_source.peek()[0] != tokenize.ENDMARKER)):
        assert bracket_level >= 0
        (token_type, token) = token_source.next()
        _check_token(token_type, token)
        if token in ("(", "[", "{"):
            bracket_level += 1
        if token in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise CharltonError("unmatched close bracket", token)
        if token_type == tokenize.ENDMARKER:
            assert bracket_level > 0
            raise CharltonError("unclosed bracket in embedded Python "
                                "expression",
                                _combine_origin_attrs(tokens))
        token_types.append(token_type)
        tokens.append(token)
    text = pretty_untokenize(zip(token_types, tokens))
    return StringWithOrigin(text, _combine_origin_attrs(tokens))

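# Example behavior (a sketch): while reading the tail of
# "np.log(x + 1) + y", the loop above consumes "np.log(x + 1)" as one
# noun -- the '+' inside the parentheses does not terminate the
# expression because bracket_level is nonzero, but the top-level '+'
# before 'y' does.
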
def from_strings(cls, sequence, levels=None, **kwargs):
    if levels is None:
        try:
            levels = list(set(sequence))
        except TypeError:
            raise CharltonError("Error converting data to categorical: "
                                "all items must be hashable")
        levels.sort()
    level_to_int = {}
    for i, level in enumerate(levels):
        try:
            level_to_int[level] = i
        except TypeError:
            raise CharltonError("Error converting data to categorical: "
                                "all levels must be hashable (and %r "
                                "isn't)" % (level,))
    int_array = np.empty(len(sequence), dtype=int)
    for i, entry in enumerate(sequence):
        try:
            int_array[i] = level_to_int[entry]
        except KeyError:
            # a missing dict key raises KeyError, not ValueError
            raise CharltonError("Error converting data to categorical: "
                                "object %r does not match any of the "
                                "expected levels" % (entry,))
    return cls(int_array, levels, **kwargs)

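# Illustrative usage (a sketch, not from the original docs; exact reprs
# and the stored type of 'levels' may differ):
#
#     >>> c = Categorical.from_strings(["a", "c", "a"])
#     >>> c.levels            # levels are sorted when inferred
#     ['a', 'c']
#     >>> c.int_array         # each entry is an index into levels
#     array([0, 1, 0])
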
def _check_token(token_type, token):
    # These are filtered out of our input string, so they should never
    # appear...
    assert token_type not in (tokenize.NL, tokenize.NEWLINE)
    if token_type == tokenize.ERRORTOKEN:
        raise CharltonError("error tokenizing input "
                            "(maybe an unclosed string?)",
                            token)
    if token_type == tokenize.COMMENT:
        raise CharltonError("comments are not allowed", token)

def _examine_factor_types(factors, factor_states, default_env,
                          data_iter_maker):
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the
        # first chunk of data. If so, then we shouldn't spend time loading
        # all the rest of the chunks.
        if not examine_needed:
            break
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor],
                                DictStack([data, default_env]))
            if isinstance(value, Categorical):
                cat_levels_contrasts[factor] = (value.levels,
                                                value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical and has "
                           "%s columns; I can only handle single-column "
                           "categorical factors"
                           % (factor.name(), value.shape[1]))
                    raise CharltonError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
    for factor, processor in cat_postprocessors.iteritems():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    return (num_column_counts, cat_levels_contrasts, cat_postprocessors)

def make_model_matrices(builders, data, dtype=float):
    evaluator_to_values = {}
    num_rows = None
    for builder in builders:
        # We look at evaluators rather than factors here, because it might
        # happen that we have the same factor twice, but with different
        # memorized state.
        for evaluator in builder._evaluators:
            if evaluator not in evaluator_to_values:
                value = evaluator.eval(data)
                assert value.ndim == 2
                if num_rows is None:
                    num_rows = value.shape[0]
                else:
                    if num_rows != value.shape[0]:
                        msg = ("Row mismatch: factor %s had %s rows, when "
                               "previous factors had %s rows"
                               % (evaluator.factor.name(), value.shape[0],
                                  num_rows))
                        raise CharltonError(msg, evaluator.factor)
                evaluator_to_values[evaluator] = value
    matrices = []
    for builder in builders:
        matrices.append(builder._build(evaluator_to_values, dtype))
    return matrices

def _code_either(self, intercept, levels):
    n = len(levels)
    scores = self.scores
    if scores is None:
        scores = np.arange(n)
    # Use floats, so that the in-place centering below doesn't truncate
    # integer scores (and doesn't mutate a caller-supplied array):
    scores = np.asarray(scores, dtype=float)
    if len(scores) != n:
        raise CharltonError("number of levels (%s) does not match"
                            " number of scores (%s)" % (n, len(scores)))
    # Strategy: just make a matrix whose columns are naive linear,
    # quadratic, etc., functions of the raw scores, and then use 'qr' to
    # orthogonalize each column against those to its left.
    scores -= scores.mean()
    raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))
    q, r = np.linalg.qr(raw_poly)
    q *= np.sign(np.diag(r))
    q /= np.sqrt(np.sum(q ** 2, axis=1))
    names = [".Constant", ".Linear", ".Quadratic", ".Cubic"]
    names += ["^%s" % (i,) for i in xrange(4, n)]
    names = names[:n]
    if intercept:
        return ContrastMatrix(q, names)
    else:
        # We always include the constant/intercept column as something to
        # orthogonalize against, but we don't always return it:
        return ContrastMatrix(q[:, 1:], names[1:])

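# Sanity check for the orthogonal polynomial coding above (a sketch: the
# 'Poly' class name and the ContrastMatrix '.matrix' attribute are
# assumptions here, suggested by 'self.scores'; values shown rounded, and
# they match R's contr.poly(3)):
#
#     >>> Poly()._code_either(False, ["lo", "mid", "hi"]).matrix
#     array([[-0.7071,  0.4082],
#            [ 0.    , -0.8165],
#            [ 0.7071,  0.4082]])
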
def _eval_unary_minus(evaluator, tree):
    if tree.args[0] == "0":
        return IntermediateExpr(True, tree.origin, False, [])
    elif tree.args[0] == "1":
        return IntermediateExpr(False, None, True, [])
    else:
        raise CharltonError("Unary minus can only be applied to 1 or 0",
                            tree)

def eval(self, data):
    result = self.factor.eval(self._state,
                              DictStack([data, self._default_env]))
    result = atleast_2d_column_default(result)
    _max_allowed_dim(2, result, self.factor)
    if result.shape[1] != self._expected_columns:
        raise CharltonError("when evaluating factor %s, I got %s columns "
                            "instead of the %s I was expecting"
                            % (self.factor.name(), result.shape[1],
                               self._expected_columns),
                            self.factor)
    if not np.issubdtype(result.dtype, np.number):
        raise CharltonError("when evaluating numeric factor %s, "
                            "I got non-numeric data of type '%s'"
                            % (self.factor.name(), result.dtype),
                            self.factor)
    return result

def transform(self, data, levels=None, **kwargs):
    if isinstance(data, Categorical):
        if levels is not None and data.levels != levels:
            raise CharltonError("changing levels of categorical data "
                                "not supported yet")
        return Categorical(data.int_array, data.levels, **kwargs)
    if levels is None:
        levels = self._levels_tuple
    return Categorical.from_strings(data, levels=levels, **kwargs)

def transform(self, data):
    data = np.asarray(data)
    _max_allowed_dim(1, data, self.factor)
    # issubdtype(int, bool) is true! So we can't use it:
    if data.dtype.kind != "b":
        raise CharltonError("factor %s, which I thought was boolean, "
                            "gave non-boolean data of dtype %s"
                            % (self.factor.name(), data.dtype),
                            self.factor)
    return Categorical(data, levels=[False, True])

def _read_op_context(token_source, c):
    token_type, token = token_source.next()
    assert token_type != tokenize.ENDMARKER
    if token == ")":
        while c.op_stack and c.op_stack[-1].token != "(":
            _run_op(c)
        if not c.op_stack:
            raise CharltonError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].token == "("
        c.op_stack.pop()
        return False
    elif token in c.binary_ops:
        op = c.binary_ops[token].with_origin(token.origin)
        while (c.op_stack
               and op.precedence <= c.op_stack[-1].precedence):
            _run_op(c)
        c.op_stack.append(op)
        return True
    else:
        raise CharltonError("expected an operator", token)
    assert False

def replace_bare_funcalls(code, replacer):
    tokens = []
    for (token_type, token, props) in annotated_tokens(code):
        if props["bare_ref"]:
            replacement = replacer(token)
            if replacement != token:
                if not props["bare_funcall"]:
                    msg = ("magic functions like '%s' can only be called, "
                           "not otherwise referenced" % (token,))
                    raise CharltonError(msg, token.origin)
                token = replacement
        tokens.append((token_type, token))
    return pretty_untokenize(tokens)

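# Illustrative behavior (a sketch):
#
#     >>> mapping = {"center": "c2"}
#     >>> replace_bare_funcalls("center(x) + x",
#     ...                       lambda name: mapping.get(name, name))
#     'c2(x) + x'
#
# Only function-call positions may be rewritten; if the replacer tries to
# rename a bare reference (e.g. the 'center' in "center + x"), a
# CharltonError is raised instead.
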
def eval(self, data):
    result = self.factor.eval(self._state,
                              DictStack([data, self._default_env]))
    if self._postprocessor is not None:
        result = self._postprocessor.transform(result)
    if not isinstance(result, Categorical):
        msg = ("when evaluating categoric factor %s, I got a "
               "result that is not of type Categorical (but rather %s)"
               # result.__class__.__name__ would be better, but not
               # defined for old-style classes:
               % (self.factor.name(), result.__class__))
        raise CharltonError(msg, self.factor)
    if result.levels != self._expected_levels:
        msg = ("when evaluating categoric factor %s, I got Categorical "
               "data with unexpected levels (wanted %s, got %s)"
               % (self.factor.name(), self._expected_levels,
                  result.levels))
        raise CharltonError(msg, self.factor)
    _max_allowed_dim(1, result.int_array, self.factor)
    # For consistency, evaluators *always* return 2d arrays (though in
    # this case it will always have only 1 column):
    return atleast_2d_column_default(result.int_array)

def _read_noun_context(token_source, c):
    token_type, token = token_source.next()
    if token == "(":
        c.op_stack.append(_open_paren.with_origin(token.origin))
        return True
    elif token in c.unary_ops:
        c.op_stack.append(c.unary_ops[token].with_origin(token.origin))
        return True
    elif token == ")" or token in c.binary_ops:
        raise CharltonError("expected a noun, not '%s'" % (token,), token)
    elif token_type == tokenize.ENDMARKER:
        assert c.op_stack
        raise CharltonError("expected a noun, but the formula ended "
                            "instead", c.op_stack[-1])
    elif token_type == tokenize.NUMBER:
        c.noun_stack.append(token)
        return False
    else:
        token_source.push_back(token_type, token)
        c.noun_stack.append(_read_python_expr(token_source, c))
        return False
    assert False

def parse(code, extra_operators=[]):
    code = code.replace("\n", " ").strip()
    if not code:
        code = "~ 1"
    token_source = TokenSource(code)
    for extra_operator in extra_operators:
        if extra_operator.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")
    all_op_list = _default_ops + extra_operators
    unary_ops = {}
    binary_ops = {}
    for op in all_op_list:
        if op.arity == 1:
            unary_ops[op.token] = op
        elif op.arity == 2:
            binary_ops[op.token] = op
        else:
            raise ValueError("operators must be unary or binary")
    c = _ParseContext(unary_ops, binary_ops)
    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm
    want_noun = True
    while True:
        if want_noun:
            want_noun = _read_noun_context(token_source, c)
        else:
            if token_source.peek()[0] == tokenize.ENDMARKER:
                break
            want_noun = _read_op_context(token_source, c)
    while c.op_stack:
        if c.op_stack[-1].token == "(":
            raise CharltonError("Unmatched '('", c.op_stack[-1])
        _run_op(c)
    assert len(c.noun_stack) == 1
    tree = c.noun_stack.pop()
    if not isinstance(tree, ParseNode) or tree.op.token != "~":
        tree = ParseNode(unary_ops["~"], [tree], tree.origin)
    return tree

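# Example of the resulting tree (a sketch; reprs are illustrative):
#
#     >>> tree = parse("y ~ a + b")
#     >>> tree.op.token
#     '~'
#     >>> [arg.op.token for arg in tree.args if isinstance(arg, ParseNode)]
#     ['+']
#
# An empty formula parses as "~ 1", and a formula with no "~" gets one
# wrapped around it, so the top-level node is always a "~".
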
def _eval_binary_power(evaluator, tree):
    left_expr = evaluator.eval(tree.args[0])
    _check_interactable(left_expr)
    power = -1
    try:
        power = int(tree.args[1])
    except (ValueError, TypeError):
        pass
    if power < 1:
        raise CharltonError("'**' requires a positive integer",
                            tree.args[1])
    all_terms = left_expr.terms
    big_expr = left_expr
    # Small optimization: (a + b)**100 is just the same as (a + b)**2.
    power = min(len(left_expr.terms), power)
    for i in xrange(1, power):
        big_expr = _interaction(left_expr, big_expr)
        all_terms = all_terms + big_expr.terms
    return IntermediateExpr(False, None, False, all_terms)

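# For reference, the expansion this implements (a sketch):
#
#     (a + b)**2  ->  a + b + a:b
#
# i.e., all interactions of the left expression with itself up to the
# given power. Powers beyond the number of terms add nothing new, hence
# the min() above.
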
def index(self, column_specifier):
    """Take anything (raw indices, term names, column names...) and
    return something that can be used as an index into the model matrix
    ndarray."""
    column_specifier = np.atleast_1d(column_specifier)
    if np.issubdtype(column_specifier.dtype, int):
        return column_specifier
    if column_specifier.dtype.kind == "b":
        return column_specifier
    columns = []
    for name in column_specifier:
        if name in self.term_to_columns:
            columns += range(*self.term_to_columns[name])
        elif name in self.term_name_to_columns:
            columns += range(*self.term_name_to_columns[name])
        elif name in self.column_name_to_column:
            columns.append(self.column_name_to_column[name])
        else:
            raise CharltonError("unknown column specifier '%s'"
                                % (name,))
    return columns

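# Illustrative use of index() (a sketch; 'info' and the specific
# terms/columns are hypothetical):
#
#     >>> info.index(np.array([0, 2]))  # raw integer indices pass through
#     array([0, 2])
#     >>> info.index(["a:b"])           # term name -> its column range
#     [1, 2]
#     >>> info.index(["a[T.2]"])        # single column name
#     [3]
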
def memorize_passes_needed(self, state, stateful_transforms):
    # 'stateful_transforms' is a dict {name: transform_factory}, where
    # transform_factory is just a zero-arg callable that makes the given
    # sort of transform (probably just the class itself).
    # 'state' is just an empty dict which we can do whatever we want with,
    # and that will be passed back to later memorize functions.
    state["transforms"] = {}

    # example code: == "2 * center(x)"
    i = [0]
    def new_name_maker(token):
        if token in stateful_transforms:
            obj_name = "_charlton_stobj%s__%s__" % (i[0], token)
            i[0] += 1
            state["transforms"][obj_name] = stateful_transforms[token]()
            return obj_name + ".transform"
        else:
            return token
    # example eval_code: == "2 * _charlton_stobj0__center__.transform(x)"
    eval_code = replace_bare_funcalls(self.code, new_name_maker)
    state["eval_code"] = eval_code
    # paranoia: verify that none of our new names appeared anywhere in the
    # original code
    for obj_name in state["transforms"]:
        if has_bare_variable_reference([obj_name], self.code):
            raise CharltonError("names of this form are reserved for "
                                "internal use (%s)" % (obj_name,),
                                self.code)
    # Pull out all the '_charlton_stobj0__center__.transform(x)' pieces
    # to make '_charlton_stobj0__center__.memorize_chunk(x)' pieces
    state["memorize_code"] = {}
    for obj_name in state["transforms"]:
        transform_calls = capture_obj_method_calls(obj_name, eval_code)
        assert len(transform_calls) == 1
        transform_call = transform_calls[0]
        transform_call_name, transform_call_code = transform_call
        assert transform_call_name == obj_name + ".transform"
        assert transform_call_code.startswith(transform_call_name + "(")
        memorize_code = (obj_name
                         + ".memorize_chunk"
                         + transform_call_code[len(transform_call_name):])
        state["memorize_code"][obj_name] = memorize_code
    # Then sort the codes into bins, so that every item in bin number i
    # depends only on items in bin (i-1) or less. (By 'depends', we mean
    # that in something like:
    #     spline(center(x))
    # we have to first run:
    #     center.memorize_chunk(x)
    # then
    #     center.memorize_finish(x)
    # and only then can we run:
    #     spline.memorize_chunk(center.transform(x))
    # Since all of our objects have unique names, figuring out who
    # depends on who is pretty easy -- we just check whether the
    # memorization code for spline:
    #     spline.memorize_chunk(center.transform(x))
    # mentions the variable 'center' (which in the example, of course, it
    # does).
    pass_bins = []
    unsorted = set(state["transforms"])
    while unsorted:
        pass_bin = set()
        for obj_name in unsorted:
            other_objs = unsorted.difference([obj_name])
            memorize_code = state["memorize_code"][obj_name]
            if not has_bare_variable_reference(other_objs, memorize_code):
                pass_bin.add(obj_name)
        assert pass_bin
        unsorted.difference_update(pass_bin)
        pass_bins.append(pass_bin)
    state["pass_bins"] = pass_bins
    return len(pass_bins)

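# Worked example of the pass-binning above (a sketch; names are
# approximate): for "scale(center(x)) + center(y)" with stateful
# transforms 'center' and 'scale', the memorize codes come out roughly as
#
#     _charlton_stobj0__scale__.memorize_chunk(
#         _charlton_stobj1__center__.transform(x))
#     _charlton_stobj1__center__.memorize_chunk(x)
#     _charlton_stobj2__center__.memorize_chunk(y)
#
# so pass bin 0 holds both center objects (their memorize code mentions
# no other stateful object), pass bin 1 holds the scale object, and
# memorize_passes_needed() returns 2.
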
def _max_allowed_dim(dim, arr, factor):
    if arr.ndim > dim:
        msg = ("factor '%s' evaluates to an %s-dimensional array; I only "
               "handle arrays with dimension <= %s"
               % (factor.name(), arr.ndim, dim))
        raise CharltonError(msg, factor)

def _check_interactable(expr):
    if expr.intercept:
        raise CharltonError("intercept term cannot interact with "
                            "anything else", expr.intercept_origin)