Beispiel #1
0
def check_meta(x, meta, funcname=None):
    """Check that the dask metadata matches the result.
    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to be
        more helpful to users.
    """

    if not isinstance(meta, (cudf.Series, cudf.Index, cudf.DataFrame)):
        raise TypeError(
            "Expected partition to be DataFrame, Series, or "
            "Index of cudf, got `%s`" % type(meta).__name__
        )

    if type(x) != type(meta):
        errmsg = "Expected partition of type `%s` but got " "`%s`" % (
            type(meta).__name__,
            type(x).__name__,
        )
    elif isinstance(meta, cudf.DataFrame):

        extra_cols = set(x.columns) ^ set(meta.columns)
        if extra_cols:
            errmsg = "extra columns"
        else:
            bad = [
                (col, x[col].dtype, meta[col].dtype)
                for col in x.columns
                if not series_type_eq(x[col], meta[col])
            ]

            if not bad:
                return x
            errmsg = "Partition type: `%s`\n%s" % (
                type(meta).__name__,
                asciitable(["Column", "Found", "Expected"], bad),
            )
    else:
        if series_type_eq(x, meta):
            return x

        errmsg = "Partition type: `%s`\n%s" % (
            type(meta).__name__,
            asciitable(["", "dtype"], [("Found", x.dtype), ("Expected", meta.dtype)]),
        )

    raise ValueError(
        "Metadata mismatch found%s.\n\n"
        "%s" % ((" in `%s`" % funcname if funcname else ""), errmsg)
    )
Beispiel #2
0
def test_asciitable():
    res = asciitable(['fruit', 'color'],
                     [('apple', 'red'), ('banana', 'yellow'),
                      ('tomato', 'red'), ('pear', 'green')])
    assert res == ('+--------+--------+\n'
                   '| fruit  | color  |\n'
                   '+--------+--------+\n'
                   '| apple  | red    |\n'
                   '| banana | yellow |\n'
                   '| tomato | red    |\n'
                   '| pear   | green  |\n'
                   '+--------+--------+')
Beispiel #3
0
def test_asciitable():
    res = asciitable(
        ["fruit", "color"],
        [("apple", "red"), ("banana", "yellow"), ("tomato", "red"),
         ("pear", "green")],
    )
    assert res == ("+--------+--------+\n"
                   "| fruit  | color  |\n"
                   "+--------+--------+\n"
                   "| apple  | red    |\n"
                   "| banana | yellow |\n"
                   "| tomato | red    |\n"
                   "| pear   | green  |\n"
                   "+--------+--------+")
Beispiel #4
0
def test_asciitable():
    res = asciitable(['fruit', 'color'],
                     [('apple', 'red'),
                      ('banana', 'yellow'),
                      ('tomato', 'red'),
                      ('pear', 'green')])
    assert res == ('+--------+--------+\n'
                   '| fruit  | color  |\n'
                   '+--------+--------+\n'
                   '| apple  | red    |\n'
                   '| banana | yellow |\n'
                   '| tomato | red    |\n'
                   '| pear   | green  |\n'
                   '+--------+--------+')
Beispiel #5
0
def coerce_dtypes(df, dtypes):
    """Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = "\n".join(f"- {c}\n  {e!r}"
                           for c, e in sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(["Column", "Found", "Expected"], bad_dtypes)
        dtype_kw = "dtype={%s}" % ",\n       ".join(
            f"{k!r}: '{v}'" for (k, v, _) in bad_dtypes)

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table,
                              exceptions=exceptions,
                              dtype_kw=dtype_kw,
                              extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = "\n".join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ("-" * 61)
        msg = "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n%s" % (
            rule.join(filter(None, [dtype_msg, date_msg])))
        raise ValueError(msg)
Beispiel #6
0
def check_meta(x, meta, funcname=None, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to be
        more helpful to users.
    numeric_equal : bool, optionl
        If True, integer and floating dtypes compare equal. This is useful due
        to panda's implicit conversion of integer to floating upon encountering
        missingness, which is hard to infer statically.
    """
    eq_types = {"i", "f", "u"} if numeric_equal else set()

    def equal_dtypes(a, b):
        if is_categorical_dtype(a) != is_categorical_dtype(b):
            return False
        if isinstance(a, str) and a == "-" or isinstance(b, str) and b == "-":
            return False
        if is_categorical_dtype(a) and is_categorical_dtype(b):
            if UNKNOWN_CATEGORIES in a.categories or UNKNOWN_CATEGORIES in b.categories:
                return True
            return a == b
        return (a.kind in eq_types and b.kind in eq_types) or is_dtype_equal(
            a, b)

    if not (is_dataframe_like(meta) or is_series_like(meta)
            or is_index_like(meta)) or is_dask_collection(meta):
        raise TypeError("Expected partition to be DataFrame, Series, or "
                        "Index, got `%s`" % typename(type(meta)))

    # Notice, we use .__class__ as opposed to type() in order to support
    # object proxies see <https://github.com/dask/dask/pull/6981>
    if x.__class__ != meta.__class__:
        errmsg = "Expected partition of type `{}` but got `{}`".format(
            typename(type(meta)),
            typename(type(x)),
        )
    elif is_dataframe_like(meta):
        dtypes = pd.concat([x.dtypes, meta.dtypes], axis=1, sort=True)
        bad_dtypes = [(repr(col), a, b)
                      for col, a, b in dtypes.fillna("-").itertuples()
                      if not equal_dtypes(a, b)]
        if bad_dtypes:
            errmsg = "Partition type: `{}`\n{}".format(
                typename(type(meta)),
                asciitable(["Column", "Found", "Expected"], bad_dtypes),
            )
        else:
            check_matching_columns(meta, x)
            return x
    else:
        if equal_dtypes(x.dtype, meta.dtype):
            return x
        errmsg = "Partition type: `{}`\n{}".format(
            typename(type(meta)),
            asciitable(["", "dtype"], [("Found", x.dtype),
                                       ("Expected", meta.dtype)]),
        )

    raise ValueError("Metadata mismatch found%s.\n\n"
                     "%s" %
                     ((" in `%s`" % funcname if funcname else ""), errmsg))