def test_unsupported_numpy_dtype_unsupported(self):
     # A numpy dtype can't be compared against 'category', so complex must
     # be rejected through the generic unsupported-dtype path.
     # https://github.com/pandas-dev/pandas/issues/16697
     complex_values = np.array([1, 2, 3]).astype("complex")  # we don't support complex
     df = pd.DataFrame({"A": complex_values})
     with self.assertRaisesRegex(ValueError, "unsupported dtype"):
         validate_dataframe(df)
 def test_null_is_not_a_category(self):
     # pd.CategoricalDtype stores nulls as code -1. That -1 must not count
     # as a use of category 'b', so 'b' is still reported as unused.
     df = pd.DataFrame(
         {"foo": ["a", None]}, dtype=pd.CategoricalDtype(["a", "b"])
     )
     with self.assertRaisesRegex(ValueError, "unused category 'b'"):
         validate_dataframe(df)
 def test_datetime64tz_unsupported(self):
     # We don't support datetimes with time zone data ... yet
     tz_series = pd.Series([pd.to_datetime("2019-04-23T12:34:00-0500")])
     df = pd.DataFrame({"A": tz_series})
     with self.assertRaisesRegex(ValueError, "unsupported dtype"):
         validate_dataframe(df)
 def test_unsupported_dtype(self):
     # Interval is a type we never plan on supporting.
     interval_series = pd.Series([pd.Interval(0, 1)], dtype="interval")
     df = pd.DataFrame({"A": interval_series})
     with self.assertRaisesRegex(ValueError, "unsupported dtype"):
         validate_dataframe(df)
 def test_nullable_int_unsupported(self):
     # We don't support nullable integer columns ... yet
     nullable_ints = pd.Series([1, np.nan], dtype=pd.Int64Dtype())
     df = pd.DataFrame({"A": nullable_ints})
     with self.assertRaisesRegex(ValueError, "unsupported dtype"):
         validate_dataframe(df)
 def test_infinity_not_supported(self):
     # Make 'A': [1, -inf, +inf, nan] via division: -2/0 -> -inf, 3/0 -> +inf.
     num = pd.Series([1, -2, 3, np.nan])
     denom = pd.Series([1, 0, 0, 1])
     dataframe = pd.DataFrame({"A": num / denom})
     with self.assertRaisesRegex(
             ValueError,
             # Raw strings: `\(` in a plain string literal is an invalid
             # escape sequence (DeprecationWarning, and a SyntaxWarning
             # since Python 3.12). The regex itself is unchanged.
             (r"invalid value -inf in column 'A', row 1 "
              r"\(infinity is not supported\)"),
     ):
         validate_dataframe(dataframe)
 def test_index(self):
     # Slicing yields a non-default index, which must be rejected.
     sliced = pd.DataFrame({"A": [1, 2]})[1:]
     with self.assertRaisesRegex(ValueError,
                                 "must use the default RangeIndex"):
         validate_dataframe(sliced)
 def test_numpy_dtype(self):
     # Numpy dtypes should be treated just like pandas dtypes.
     numpy_backed = pd.DataFrame({"A": np.array([1, 2, 3])})
     validate_dataframe(numpy_backed)
 def test_empty_colname(self):
     # A zero-length column name must be rejected.
     df = pd.DataFrame({"": [1], "B": [2]})
     with self.assertRaisesRegex(ValueError, "empty column name"):
         validate_dataframe(df)
 def test_unique_colnames(self):
     # Assign duplicate names after construction -- a dict literal can't
     # hold two identical keys.
     df = pd.DataFrame({"A": [1], "B": [2]})
     df.columns = ["A", "A"]
     with self.assertRaisesRegex(ValueError, "duplicate column name"):
         validate_dataframe(df)
 def test_unused_categories(self):
     # Category 'b' is declared but never appears in the data.
     df = pd.DataFrame(
         {"foo": ["a", "a"]}, dtype=pd.CategoricalDtype(["a", "b"])
     )
     with self.assertRaisesRegex(ValueError, "unused category 'b'"):
         validate_dataframe(df)
 def test_non_str_categories(self):
     # Mixed str/int values yield non-str categories, which are rejected.
     df = pd.DataFrame({"foo": ["a", 1]}, dtype="category")
     with self.assertRaisesRegex(ValueError, "must all be str"):
         validate_dataframe(df)
 def test_empty_categories_with_wrong_dtype(self):
     # An all-NaN float column cast to category produces float-typed
     # (non-object) categories, which must be rejected.
     df = pd.DataFrame({"foo": [np.nan]}, dtype=float).astype("category")
     with self.assertRaisesRegex(ValueError, "must have dtype=object"):
         validate_dataframe(df)
 def test_non_str_objects(self):
     # An object column mixing str and int must be rejected.
     df = pd.DataFrame({"foo": ["a", 1]})
     with self.assertRaisesRegex(ValueError, "must all be str"):
         validate_dataframe(df)
 def test_colnames_all_str(self):
     # df.columns is object, but not all names are str.
     df = pd.DataFrame({"A": [1], 2: [2]})
     with self.assertRaisesRegex(ValueError, "column names"):
         validate_dataframe(df)
 def test_colnames_dtype_object(self):
     # df.columns is numeric, not object.
     df = pd.DataFrame({1: [1]})
     with self.assertRaisesRegex(ValueError, "column names"):
         validate_dataframe(df)
 def test_empty_categories(self):
     # An empty categorical column is valid: no categories, none unused.
     empty_categorical = pd.DataFrame({"A": []}, dtype="category")
     validate_dataframe(empty_categorical)
# Example 18
def eval_process(code, table):
    """
    Runs `code`'s "process" method; returns (retval, log).

    stdout, stderr, exception tracebacks, and error messages will all be
    written to log. (The UX is: log is displayed as a monospaced console to the
    user -- presumably the person who wrote the code.)

    If there's an Exception `err`, `str(err)` will be returned as the retval.

    This method relies on `cjwkernel.kernel` for sandboxing. The process()
    function can access to anything the module can access.

    This should never raise an exception. (TODO handle out-of-memory.)
    Exceptions would email _us_; but in this case, we want the _user_ to see
    all error messages.
    """
    # Collects stdout/stderr output, tracebacks and error messages; its full
    # contents are returned as the second tuple element.
    log = io.StringIO()
    # Names exposed to the user's code. (exec() also supplies the default
    # __builtins__ since the dict has no "__builtins__" key.)
    eval_globals = {"pd": pd, "np": np, "math": math}

    def ret(retval):
        """
        Usage: `return ret(whatever)`
        """
        # A str retval is an error message: mirror it into the log so the
        # user sees it in the console, then return it alongside the log.
        if isinstance(retval, str):
            log.write(retval)
        return (retval, log.getvalue())

    try:
        # "your code" is the filename shown in tracebacks the user sees.
        compiled_code = compile(code, "your code", "exec")
    except SyntaxError as err:
        return ret("Line %d: %s" % (err.lineno, err))
    except ValueError:
        # Apparently this is another thing that compile() can raise
        return ret("Your code contains null bytes")

    # Override sys.stdout and sys.stderr ... but only in the context of
    # `process()`. After `process()`, the module needs its original values
    # again so it can send a Thrift object over stdout and log errors (which
    # should never happen) to stderr.
    #
    # This function's sandbox isn't perfect, but we aren't protecting anything
    # dangerous. Writing to the _original_ `sys.stdout` and `sys.stderr` can at
    # worst cause a single `ModuleExitedError`, which would email us. That's
    # the security risk: an email to us.
    with _patch_log("stdout", log):
        with _patch_log("stderr", log):
            try:
                exec(compiled_code, eval_globals)  # raise any exception

                # The user's code must define a top-level process() taking
                # exactly one argument.
                if "process" not in eval_globals:
                    return ret('Please define a "process(table)" function')
                process = eval_globals["process"]
                if len(signature(process).parameters) != 1:
                    return ret(
                        "Please make your process(table) function accept exactly 1 argument"
                    )

                retval = process(table)  # raise any exception
            except Exception:
                # An error in the code or in process()
                etype, value, tb = sys.exc_info()
                tb = tb.tb_next  # omit this method from the stack trace
                # stderr is still patched here, so print_exception() writes
                # the traceback into `log` for the user to see.
                traceback.print_exception(etype, value, tb)
                return ret(f"Line {tb.tb_lineno}: {etype.__name__}: {value}")

    # Only a valid DataFrame counts as success; anything else turns into an
    # error-message string via ret().
    if isinstance(retval, pd.DataFrame):
        try:
            validate_dataframe(retval)  # raise ValueError
        except ValueError as err:
            return ret(
                "Unhandled DataFrame: %s. Please return a different DataFrame."
                % str(err))
        return ret(retval)
    elif isinstance(retval, str):
        # The user returned a string: treat it as an error message.
        return ret(retval)
    else:
        return ret("Please make process(table) return a pd.DataFrame. "
                   "(Yours returned a %s.)" % type(retval).__name__)