def test_unsupported_numpy_dtype_unsupported(self):
    # A complex-valued numpy column must be rejected as an unsupported dtype.
    #
    # We can't check if a numpy dtype == 'category'.
    # https://github.com/pandas-dev/pandas/issues/16697
    complex_values = np.array([1, 2, 3]).astype("complex")  # not supported
    frame = pd.DataFrame({"A": complex_values})
    self.assertRaisesRegex(
        ValueError, "unsupported dtype", validate_dataframe, frame
    )
def test_null_is_not_a_category(self):
    # pd.CategoricalDtype means storing nulls as -1. That -1 must not be
    # considered when counting used categories: only 'a' is used here, so
    # 'b' has to be reported as unused.
    dtype = pd.CategoricalDtype(["a", "b"])
    frame = pd.DataFrame({"foo": ["a", None]}, dtype=dtype)
    with self.assertRaisesRegex(ValueError, "unused category 'b'"):
        validate_dataframe(frame)
def test_datetime64tz_unsupported(self):
    # We don't support datetimes with time zone data ... yet
    tz_aware = pd.to_datetime("2019-04-23T12:34:00-0500")
    frame = pd.DataFrame({"A": pd.Series([tz_aware])})
    self.assertRaisesRegex(
        ValueError, "unsupported dtype", validate_dataframe, frame
    )
def test_unsupported_dtype(self):
    # Interval is a type we never plan on supporting.
    intervals = pd.Series([pd.Interval(0, 1)], dtype="interval")
    frame = pd.DataFrame({"A": intervals})
    self.assertRaisesRegex(
        ValueError, "unsupported dtype", validate_dataframe, frame
    )
def test_nullable_int_unsupported(self):
    # We don't support nullable integer columns ... yet
    nullable_ints = pd.Series([1, np.nan], dtype=pd.Int64Dtype())
    frame = pd.DataFrame({"A": nullable_ints})
    self.assertRaisesRegex(
        ValueError, "unsupported dtype", validate_dataframe, frame
    )
def test_infinity_not_supported(self):
    # Division by zero makes 'A': [1, -inf, +inf, nan]. The expected error
    # names the -inf at row 1.
    num = pd.Series([1, -2, 3, np.nan])
    denom = pd.Series([1, 0, 0, 1])
    dataframe = pd.DataFrame({"A": num / denom})
    with self.assertRaisesRegex(
        ValueError,
        # Raw string: "\(" and "\)" are regex escapes, not valid Python
        # string escapes, so a plain literal triggers an invalid-escape
        # warning (and is slated to become an error).
        (
            r"invalid value -inf in column 'A', row 1 "
            r"\(infinity is not supported\)"
        ),
    ):
        validate_dataframe(dataframe)
def test_index(self):
    # A row slice keeps the original labels, so the sliced frame no longer
    # has the default index.
    sliced = pd.DataFrame({"A": [1, 2]})[1:]
    self.assertRaisesRegex(
        ValueError,
        "must use the default RangeIndex",
        validate_dataframe,
        sliced,
    )
def test_numpy_dtype(self):
    # Numpy dtypes should be treated just like pandas dtypes: validation
    # must complete without raising.
    values = np.array([1, 2, 3])
    validate_dataframe(pd.DataFrame({"A": values}))
def test_empty_colname(self):
    # A column whose name is the empty string must be rejected.
    frame = pd.DataFrame({"": [1], "B": [2]})
    self.assertRaisesRegex(
        ValueError, "empty column name", validate_dataframe, frame
    )
def test_unique_colnames(self):
    # Build a valid two-column frame, then rename so both columns are "A".
    frame = pd.DataFrame({"A": [1], "B": [2]})
    frame.columns = ["A", "A"]
    self.assertRaisesRegex(
        ValueError, "duplicate column name", validate_dataframe, frame
    )
def test_unused_categories(self):
    # Category 'b' is declared but never appears in the data.
    dtype = pd.CategoricalDtype(["a", "b"])
    frame = pd.DataFrame({"foo": ["a", "a"]}, dtype=dtype)
    with self.assertRaisesRegex(ValueError, "unused category 'b'"):
        validate_dataframe(frame)
def test_non_str_categories(self):
    # Categories mix a str ("a") with an int (1); only str is acceptable.
    frame = pd.DataFrame({"foo": ["a", 1]}, dtype="category")
    with self.assertRaisesRegex(ValueError, "must all be str"):
        validate_dataframe(frame)
def test_empty_categories_with_wrong_dtype(self):
    # An all-null float column converted to category: there are no used
    # categories, and the expected error complains about dtype=object.
    all_null = pd.DataFrame({"foo": [np.nan]}, dtype=float)
    frame = all_null.astype("category")
    with self.assertRaisesRegex(ValueError, "must have dtype=object"):
        validate_dataframe(frame)
def test_non_str_objects(self):
    # An object column holding a str and an int must be rejected.
    frame = pd.DataFrame({"foo": ["a", 1]})
    with self.assertRaisesRegex(ValueError, "must all be str"):
        validate_dataframe(frame)
def test_colnames_all_str(self):
    # df.columns is object, but not all are str
    frame = pd.DataFrame({"A": [1], 2: [2]})
    with self.assertRaisesRegex(ValueError, "column names"):
        validate_dataframe(frame)
def test_colnames_dtype_object(self):
    # df.columns is numeric
    frame = pd.DataFrame({1: [1]})
    with self.assertRaisesRegex(ValueError, "column names"):
        validate_dataframe(frame)
def test_empty_categories(self):
    # A zero-row categorical column must validate without raising.
    empty_frame = pd.DataFrame({"A": []}, dtype="category")
    validate_dataframe(empty_frame)
def eval_process(code, table):
    """
    Run the "process" function defined by `code`; return (retval, log).

    `retval` is the `pd.DataFrame` returned by `process(table)` when it
    passes `validate_dataframe()`, or the `str` that `process(table)`
    returned; in every other case it is a `str` error message.

    stdout, stderr, exception tracebacks, and error messages will all be
    written to log. (The UX is: log is displayed as a monospaced console to
    the user -- presumably the person who wrote the code.)

    If there's an Exception `err`, a message built from `str(err)` will be
    returned as the retval.

    This method relies on `cjwkernel.kernel` for sandboxing. The process()
    function can access anything the module can access.

    This should never raise an exception. (TODO handle out-of-memory.)
    Exceptions would email _us_; but in this case, we want the _user_ to see
    all error messages.
    """
    log = io.StringIO()
    eval_globals = {"pd": pd, "np": np, "math": math}

    def ret(retval):
        """
        Usage: `return ret(whatever)`
        """
        # String results are messages for the user: echo them to the log.
        if isinstance(retval, str):
            log.write(retval)
        return (retval, log.getvalue())

    try:
        compiled_code = compile(code, "your code", "exec")
    except SyntaxError as err:
        # Some SyntaxErrors carry no location (`lineno is None`); "%d" would
        # raise TypeError on those, so fall back to the bare message.
        if err.lineno is None:
            return ret(str(err))
        return ret("Line %d: %s" % (err.lineno, err))
    except ValueError:
        # Apparently this is another thing that compile() can raise
        return ret("Your code contains null bytes")

    # Override sys.stdout and sys.stderr ... but only in the context of
    # `process()`. After `process()`, the module needs its original values
    # again so it can send a Thrift object over stdout and log errors (which
    # should never happen) to stderr.
    #
    # This function's sandbox isn't perfect, but we aren't protecting anything
    # dangerous. Writing to the _original_ `sys.stdout` and `sys.stderr` can at
    # worst cause a single `ModuleExitedError`, which would email us. That's
    # the security risk: an email to us.
    with _patch_log("stdout", log):
        with _patch_log("stderr", log):
            try:
                exec(compiled_code, eval_globals)  # raise any exception

                if "process" not in eval_globals:
                    return ret('Please define a "process(table)" function')
                process = eval_globals["process"]
                if len(signature(process).parameters) != 1:
                    return ret(
                        "Please make your process(table) function accept exactly 1 argument"
                    )

                retval = process(table)  # raise any exception
            except Exception:
                # An error in the code or in process()
                etype, value, tb = sys.exc_info()
                tb = tb.tb_next  # omit this method from the stack trace
                traceback.print_exception(etype, value, tb)
                if tb is None:
                    # The exception was raised directly by a callable with
                    # no Python frame of its own (e.g. `process = abs`), so
                    # there is no line number to report.
                    return ret(f"{etype.__name__}: {value}")
                return ret(f"Line {tb.tb_lineno}: {etype.__name__}: {value}")

    if isinstance(retval, pd.DataFrame):
        try:
            validate_dataframe(retval)  # raise ValueError
        except ValueError as err:
            return ret(
                "Unhandled DataFrame: %s. Please return a different DataFrame."
                % str(err)
            )
        return ret(retval)
    elif isinstance(retval, str):
        return ret(retval)
    else:
        return ret(
            "Please make process(table) return a pd.DataFrame. "
            "(Yours returned a %s.)" % type(retval).__name__
        )