def test_term_new_data_numeric():
    """Check that a fitted ``Variable`` can evaluate a new numeric data frame.

    The term is built from the parsed expression ``"x"``, given its type and
    data from a first frame, and then asked to evaluate a second frame.
    """
    training_frame = pd.DataFrame({"x": [10, 10, 10]})

    # Scan and parse the single-name expression, then wrap it in a Variable.
    tokens = Scanner("x").scan(False)
    parsed = Parser(tokens).parse()
    term = Variable(parsed.name.lexeme, parsed.level)
    term.set_type(training_frame)
    term.set_data()

    matches_training = term.value == [10, 10, 10]
    assert matches_training.all()

    # Evaluating a different frame must yield that frame's values.
    new_frame = pd.DataFrame({"x": [1, 2, 3]})
    evaluated = term.eval_new_data(new_frame)
    assert (evaluated.T == [1, 2, 3]).all()
def test_term_new_data_categoric():
    """Check ``Variable.eval_new_data`` on a column of string levels.

    Two encodings are exercised: the one produced by
    ``set_data(spans_intercept=True)`` and the one produced by the default
    ``set_data()``. In both cases the term must encode new data using the
    levels seen in the original frame, and raise ``ValueError`` when the new
    data contains a level that was not in the original frame.
    """
    unseen_level_error = "The levels D in 'x' are not present in the original data set."

    # Full rank encoding
    data = pd.DataFrame({"x": ["A", "B", "C"]})
    var_expr = Parser(Scanner("x").scan(False)).parse()
    var_term = Variable(var_expr.name.lexeme, var_expr.level)
    var_term.set_type(data)
    var_term.set_data(spans_intercept=True)
    assert (np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) == var_term.value).all()

    data = pd.DataFrame({"x": ["B", "C"]})
    assert (var_term.eval_new_data(data) == np.array([[0, 1, 0], [0, 0, 1]])).all()

    # It remembers it saw "A", "B", and "C", but not "D".
    # So when you pass a new level, it raises a ValueError.
    # The frame is built before entering ``pytest.raises`` so that only
    # ``eval_new_data`` can produce the expected error.
    data = pd.DataFrame({"x": ["B", "C", "D"]})
    with pytest.raises(ValueError, match=unseen_level_error):
        var_term.eval_new_data(data)

    # The same with reduced encoding
    data = pd.DataFrame({"x": ["A", "B", "C"]})
    var_expr = Parser(Scanner("x").scan(False)).parse()
    var_term = Variable(var_expr.name.lexeme, var_expr.level)
    var_term.set_type(data)
    var_term.set_data()
    assert (np.array([[0, 0], [1, 0], [0, 1]]) == var_term.value).all()

    data = pd.DataFrame({"x": ["A", "C"]})
    assert (var_term.eval_new_data(data) == np.array([[0, 0], [0, 1]])).all()

    # It remembers it saw "A", "B", and "C", but not "D".
    # So when you pass a new level, it raises a ValueError.
    data = pd.DataFrame({"x": ["B", "C", "D"]})
    with pytest.raises(ValueError, match=unseen_level_error):
        var_term.eval_new_data(data)
def test_variable_set_data_errors():
    """Check that ``Variable.set_data`` raises for a variable that is not ready.

    Three situations are covered, all on a ``Variable`` on which ``set_type``
    was never called: ``kind`` not assigned by the test, ``kind`` set to the
    arbitrary string ``"hello"``, and ``kind`` set to ``"categoric"``.
    """
    x = Variable("x")

    # ``kind`` has not been assigned by the test at this point.
    with pytest.raises(ValueError):
        x.set_data(True)

    # Attribute assignments are kept outside the ``pytest.raises`` blocks so
    # that only ``set_data`` can satisfy the expected exception.
    x.kind = "hello"
    with pytest.raises(ValueError):
        x.set_data(True)

    # NOTE(review): the concrete exception type raised here is not visible
    # from this test, so the broad ``Exception`` is kept; narrow it once the
    # type is confirmed.
    x.kind = "categoric"
    with pytest.raises(Exception):
        x.set_data(True)