Exemple #1
0
def df_timestamp(npartitions):
    """Return a dask DataFrame holding one UTC-aware datetime column ``a``.

    The pandas frame is built from the integers 0..9 with the dtype
    ``DatetimeTZDtype(tz='UTC')`` applied at construction time, then handed
    to ``dd.from_pandas`` with the requested number of partitions.
    """
    utc_dtype = pd.DatetimeTZDtype(tz='UTC')
    frame = pd.DataFrame({'a': list(range(10))}, dtype=utc_dtype)
    return dd.from_pandas(frame, npartitions=npartitions)
Exemple #2
0
def test_pandas_extension_types():
    """Validate one sample series per pandas extension dtype with SeriesSchema.

    Each case is ``(dtype, sample series, extra SeriesSchema kwargs or None)``;
    validation must hand back a ``pd.Series`` for every case.
    """
    # pylint: disable=no-member
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None,
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            None,
        ),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            None,
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100))
            .where(lambda s: s < 5, other=np.nan)
            .astype("Sparse[float]"),
            # The sparse sample contains NaN, so the schema is made nullable.
            {"nullable": True},
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]
    for pandas_dtype, sample, extra_kwargs in cases:
        if extra_kwargs is None:
            extra_kwargs = {}
        schema = SeriesSchema(pandas_dtype=pandas_dtype, **extra_kwargs)
        validated = schema.validate(sample)
        assert isinstance(validated, pd.Series)
Exemple #3
0
    def __post_init__(self):
        """Derive the concrete dtype from ``self.unit`` and ``self.tz``.

        Without a timezone the type is the naive ``datetime64[ns]`` numpy
        dtype; with one it is a ``pd.DatetimeTZDtype``. Attributes are written
        through ``object.__setattr__`` (presumably because the enclosing
        dataclass is frozen -- confirm against the class definition).
        """
        if self.tz is not None:
            resolved = pd.DatetimeTZDtype(self.unit, self.tz)
            # DatetimeTZDtype has already turned tz into a tzinfo object;
            # store that normalised value back on the instance.
            object.__setattr__(self, "tz", resolved.tz)
        else:
            resolved = np.dtype("datetime64[ns]")

        object.__setattr__(self, "type", resolved)
def test_timezones_fixed_format_series_nonempty(setup_path, tz_aware_fixture):
    """Round-trip a one-element tz-aware Series through the store (GH 20594)."""
    # GH 20594
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)

    with ensure_clean_store(setup_path) as store:
        original = Series([0], dtype=tz_dtype)
        store["s"] = original
        roundtripped = store["s"]
        # What comes back must equal what was written.
        tm.assert_series_equal(roundtripped, original)
Exemple #5
0
def test_timezones_fixed_format_frame_empty(setup_path, tz_aware_fixture):
    """Round-trip an empty tz-aware single-column frame through the store."""
    # GH 20594
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)

    with ensure_clean_store(setup_path) as store:
        empty_column = Series(dtype=tz_dtype)
        original = DataFrame({"A": empty_column})
        store["df"] = original
        roundtripped = store["df"]
        # The frame read back must equal the one that was stored.
        tm.assert_frame_equal(roundtripped, original)
Exemple #6
0
def df_timestamp(npartitions):
    """Return a dask DataFrame with a UTC-aware column ``a`` and two label columns.

    Column ``a`` starts as the integers 0..9 and is cast to
    ``DatetimeTZDtype(tz='UTC')``; ``b`` and ``c`` hold single-character
    strings. The frame is split into ``npartitions`` partitions.
    """
    columns = {
        'a': list(range(10)),
        'b': list('wwwwwxxxxx'),
        'c': list('yyyzzzyyzz'),
    }
    frame = pd.DataFrame(columns)
    utc_dtype = pd.DatetimeTZDtype(tz='UTC')
    frame["a"] = frame.a.astype(utc_dtype)
    return dd.from_pandas(frame, npartitions=npartitions)
def test_meta_nonempty_scalar():
    """Check ``meta_nonempty`` on a numpy scalar, a Timestamp and a tz dtype."""
    # A numpy float scalar yields a numpy float scalar.
    float_meta = meta_nonempty(np.float64(1.0))
    assert isinstance(float_meta, np.float64)

    # A Timestamp is handed back as the very same object.
    stamp = pd.Timestamp(2000, 1, 1)
    stamp_meta = meta_nonempty(stamp)
    assert stamp_meta is stamp

    # DatetimeTZDtype
    tz_dtype = pd.DatetimeTZDtype(tz="UTC")
    tz_meta = meta_nonempty(tz_dtype)
    assert tz_meta == pd.Timestamp(1, tz=tz_dtype.tz, unit=tz_dtype.unit)
def test_timezones_fixed_format_empty(setup_path, tz_aware_fixture, frame_or_series):
    """Round-trip an empty tz-aware Series (or its frame form) through the store."""
    # GH 20594
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)

    empty = Series(dtype=tz_dtype, name="A")
    # The fixture decides whether the object under test is a Series or a frame.
    obj = empty.to_frame() if frame_or_series is DataFrame else empty

    with ensure_clean_store(setup_path) as store:
        store["obj"] = obj
        roundtripped = store["obj"]
        tm.assert_equal(roundtripped, obj)
Exemple #9
0
    def _extract_schema(self, df):
        """Build a list of ``bigquery.SchemaField`` from a DataFrame's dtypes.

        One field is produced per distinct column name, in column order.

        Args:
            df: DataFrame whose ``dtypes`` are translated.

        Returns:
            list of ``bigquery.SchemaField``.

        Raises:
            KeyError: if a column's dtype has no BigQuery mapping below.
        """
        # dtype -> BigQuery type name. Built once per call instead of
        # constructing a full set of SchemaField objects for every column.
        # NOTE(review): both datetime dtypes map to 'DATE', which would drop
        # any time-of-day component -- confirm this is intended.
        bq_type_names = {
            np.dtype('object'): 'STRING',
            np.dtype('int32'): 'INTEGER',
            np.dtype('int64'): 'INTEGER',
            np.dtype('float64'): 'FLOAT',
            np.dtype('datetime64[ns]'): 'DATE',
            pd.DatetimeTZDtype(tz='UTC'): 'DATE',
        }

        schema = []
        # dict() collapses duplicate column names (last dtype wins).
        for column_name, dtype in dict(df.dtypes).items():
            try:
                bq_type = bq_type_names[dtype]
            except KeyError:
                # Same exception type as a bare lookup, but say which column
                # and dtype could not be translated.
                raise KeyError(
                    f"No BigQuery type mapping for column {column_name!r} "
                    f"with dtype {dtype!r}"
                ) from None
            schema.append(bigquery.SchemaField(column_name, bq_type))

        return schema
Exemple #10
0
def df_time() -> pd.DataFrame:
    """Return a three-row frame with one column per time-like dtype.

    Columns:
        date: tz-naive timestamps taken at call time.
        datetz: timestamps taken at call time, labelled as UTC.
        timedelta: one-nanosecond timedeltas.
        period: daily periods with ordinals 0, 1, 2.
    """
    return pd.DataFrame({
        "date": [pd.Timestamp("now") for _ in range(3)],
        # ``astype`` from a tz-naive to a tz-aware dtype raises TypeError on
        # pandas >= 2.0; ``tz_localize`` attaches the UTC zone to the same
        # wall-clock values, which is what the old cast did.
        "datetz": pd.Series(
            [pd.Timestamp("now") for _ in range(3)]
        ).dt.tz_localize("UTC"),
        "timedelta": [pd.Timedelta(1) for _ in range(3)],
        # The ``freq`` keyword of PeriodArray is deprecated; the frequency is
        # carried by the dtype instead.
        "period": pd.arrays.PeriodArray([0, 1, 2], dtype=pd.PeriodDtype("D")),
    })
Exemple #11
0
def sql_to_python_type(sql_type: str) -> type:
    """Turn an SQL type into a dataframe dtype.

    Parametrised SQL types are recognised by prefix; anything else is looked
    up verbatim in ``_SQL_TO_PYTHON_FRAMES``.

    Args:
        sql_type: SQL type name, e.g. ``"VARCHAR(10)"`` or ``"DECIMAL(10, 2)"``.

    Returns:
        The pandas/numpy dtype (or numpy scalar type) used for that SQL type.

    Raises:
        NotImplementedError: if the SQL type has no known mapping.
    """
    if sql_type.startswith(("CHAR(", "VARCHAR(")):
        return pd.StringDtype()
    if sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")
    if sql_type.startswith(("TIMESTAMP(", "TIME(")):
        return np.dtype("<M8[ns]")
    if sql_type.startswith("TIMESTAMP_WITH_LOCAL_TIME_ZONE("):
        # Everything is converted to UTC
        # So far, this did not break
        return pd.DatetimeTZDtype(unit="ns", tz="UTC")
    if sql_type.startswith("DECIMAL("):
        # We use np.float64 always, even though we might
        # be able to use a smaller type
        return np.float64

    try:
        return _SQL_TO_PYTHON_FRAMES[sql_type]
    except KeyError as err:  # pragma: no cover
        # Keep the failed lookup attached as the explicit cause.
        raise NotImplementedError(
            f"The SQL type {sql_type} is not implemented (yet)") from err
]
# numpy scalar types for datetime-like data.
datetime_dtypes = [np.datetime64, np.timedelta64]
# String data is held as plain Python objects.
string_dtypes = [object]
# pandas extension dtypes; if any of these attributes is missing from the
# installed pandas, the whole list falls back to empty.
try:
    extension_dtypes = [
        pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype,
        pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype,
        pd.CategoricalDtype,
        pd.IntervalDtype,
        pd.DatetimeTZDtype(unit="ns", tz="UTC"),
        pd.PeriodDtype(freq="D"),
    ]
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
    """Seed NumPy's global random generator with a fixed value.

    Any positional or keyword arguments are accepted and ignored.
    """
    # Benchmark files import this function so the seed is reset before each
    # benchmark function runs; see
    # https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    fixed_seed = 1234
    np.random.seed(fixed_seed)


class BaseIO:
    """
Exemple #13
0
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 (
     pd.DatetimeIndex(["2000", "2001"]),
     None,
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 (
     ["2000", "2001"],
     np.dtype("datetime64[ns]"),
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 # Datetime (tz-aware)
 (
     ["2000", "2001"],
     pd.DatetimeTZDtype(tz="CET"),
     DatetimeArray._from_sequence(["2000", "2001"],
                                  dtype=pd.DatetimeTZDtype(tz="CET")),
 ),
 # Timedelta
 (
     ["1H", "2H"],
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 (
     pd.TimedeltaIndex(["1H", "2H"]),
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 (
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DatetimeIndex,
    Index,
)
import pandas._testing as tm

# Datetime-like dtypes used to parametrize the tests in this module.
# Order matters: the period dtype has to stay the LAST entry, because it is
# looked up as ``dtlike_dtypes[-1]``.
dtlike_dtypes = [
    np.dtype("timedelta64[ns]"),
    np.dtype("datetime64[ns]"),
    pd.DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"),
    pd.PeriodDtype(freq="ns"),
]


# Run for every (left dtype, right dtype) pair drawn from dtlike_dtypes.
@pytest.mark.parametrize("ldtype", dtlike_dtypes)
@pytest.mark.parametrize("rdtype", dtlike_dtypes)
def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype):
    """Build indexes of datetime-like dtypes from the same repeated values."""

    # Three hourly steps expressed in nanoseconds, repeated twice so the
    # resulting index contains duplicates.
    vals = np.tile(3600 * 10**9 * np.arange(3), 2)

    def construct(dtype):
        # The last entry of dtlike_dtypes is the period dtype, which needs a
        # detour through DatetimeIndex.
        if dtype is dtlike_dtypes[-1]:
            # PeriodArray will try to cast ints to strings
            return DatetimeIndex(vals).astype(dtype)
        return Index(vals, dtype=dtype)

    left = construct(ldtype)
Exemple #15
0
        # datetime
        ([pd.Timestamp('2000', ), pd.Timestamp('2001')
          ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
        ([datetime.datetime(2000, 1, 1),
          datetime.datetime(2001, 1, 1)
          ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
        (np.array([1, 2], dtype='M8[ns]'),
         pd.arrays.DatetimeArray(np.array([1, 2], dtype='M8[ns]'))),
        (np.array([1, 2], dtype='M8[us]'),
         pd.arrays.DatetimeArray(np.array([1000, 2000], dtype='M8[ns]'))),

        # datetimetz
        ([pd.Timestamp('2000', tz='CET'),
          pd.Timestamp('2001', tz='CET')],
         pd.arrays.DatetimeArray._from_sequence(
             ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz='CET'))),
        ([
            datetime.datetime(2000, 1, 1, tzinfo=cet),
            datetime.datetime(2001, 1, 1, tzinfo=cet)
        ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'], tz=cet)),

        # timedelta
        ([pd.Timedelta('1H'), pd.Timedelta('2H')
          ], pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),
        (np.array([1, 2], dtype='m8[ns]'),
         pd.arrays.TimedeltaArray(np.array([1, 2], dtype='m8[ns]'))),
        (np.array([1, 2], dtype='m8[us]'),
         pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype='m8[ns]'))),
    ])
def test_array_inference(data, expected):
    result = pd.array(data)
Exemple #16
0
    (np.array([1, 2], dtype='datetime64[ns]'), None,
     pd.arrays.DatetimeArray._from_sequence(
         np.array([1, 2], dtype='datetime64[ns]'))),

    (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'),
     pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),

    (pd.DatetimeIndex(['2000', '2001']), None,
     pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),

    (['2000', '2001'], np.dtype('datetime64[ns]'),
     pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),

    # Datetime (tz-aware)
    (['2000', '2001'], pd.DatetimeTZDtype(tz="CET"),
     pd.arrays.DatetimeArray._from_sequence(
         ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz="CET"))),

    # Timedelta
    (['1H', '2H'], np.dtype('timedelta64[ns]'),
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),

    (pd.TimedeltaIndex(['1H', '2H']), np.dtype('timedelta64[ns]'),
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),

    (pd.TimedeltaIndex(['1H', '2H']), None,
     pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])),

    # Category
    (['a', 'b'], 'category', pd.Categorical(['a', 'b'])),
Exemple #17
0
def test_python_to_sql():
    """Check the SQL type name produced for a few Python/pandas dtypes."""
    int_sql = python_to_sql_type(np.dtype("int32"))
    assert str(int_sql) == "INTEGER"

    naive_datetime_sql = python_to_sql_type(np.dtype(">M8[ns]"))
    assert str(naive_datetime_sql) == "TIMESTAMP"

    utc_dtype = pd.DatetimeTZDtype(unit="ns", tz="UTC")
    aware_datetime_sql = python_to_sql_type(utc_dtype)
    assert str(aware_datetime_sql) == "TIMESTAMP_WITH_LOCAL_TIME_ZONE"
Exemple #18
0
    for checks in [
            pa.Check.le(check_arg),
            pa.Check.ge(check_arg),
            pa.Check.eq(check_arg),
            pa.Check.isin([check_arg]),
    ]:
        column_schema = pa.Column("datetime",
                                  checks=checks,
                                  name="test_datetime")
        column_schema(data.draw(column_schema.strategy()))


@pytest.mark.parametrize(
    "dtype",
    (
        pd.DatetimeTZDtype(tz="UTC"),
        pd.DatetimeTZDtype(tz="dateutil/US/Central"),
    ),
)
@pytest.mark.parametrize(
    "check_arg",
    [
        pd.Timestamp("2006-01-01", tz="CET"),
        pd.Timestamp("2006-01-01", tz="UTC"),
    ],
)
@hypothesis.given(st.data())
@hypothesis.settings(
    suppress_health_check=[hypothesis.HealthCheck.too_slow], )
def test_datetime_tz_example(dtype, check_arg, data) -> None:
    """Test Column schema example method generate examples of
Exemple #19
0
class TestSeriesConvertDtypes:
    """Table-driven tests for ``Series.convert_dtypes``."""

    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested.  Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, ),
                    ):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False, )):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, ), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (False, ), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                    (
                        (False, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1),
                                         pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Check the dtype ``convert_dtypes`` produces for one flag combination.

        ``params`` is one of the 16 four-flag combinations; ``answerdict``
        gives the expected dtype for every combination for this ``data``.
        """
        # Build the input with an explicit dtype only when one was supplied.
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        # Expand each compact key (a tuple of four option-tuples) into every
        # concrete flag combination it stands for, all sharing one answer.
        answers = {
            k: a
            for (kk, a) in answerdict.items() for k in product(*kk)
        }

        # The four flags are passed positionally.
        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        # Overwrite every non-missing value of the result with NaN.
        ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)
Exemple #20
0
    'plant_id_eia': 'uint16',
    'unitid': 'category',
    'gross_load_mw': 'float32',
    'steam_load_1000_lbs': 'float32',
    'so2_mass_lbs': 'float32',
    'so2_mass_measurement_code': 'category',
    'nox_rate_lbs_mmbtu': 'float32',
    'nox_rate_measurement_code': 'category',
    'nox_mass_lbs': 'float32',
    'nox_mass_measurement_code': 'category',
    'co2_mass_tons': 'float32',
    'co2_mass_measurement_code': 'category',
    'heat_content_mmbtu': 'float32',
    'facility_id': 'category',
    'unit_id_epa': 'category',
    'operating_datetime_utc': pd.DatetimeTZDtype(tz="UTC"),
    'operating_time_hours': 'float32'
}


def parse_command_line(argv):
    """
    Parse command line arguments. See the -h option.

    :param argv: arguments on the command line must include caller file name.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-q',
        '--quiet',
        break
    except (ImportError, TypeError, ValueError):
        pass

# numpy scalar types grouped by kind.
numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
                  np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]
# ``np.object`` was only an alias of the builtin ``object``; the alias was
# deprecated in NumPy 1.20 and removed in 1.24, so use the builtin directly.
string_dtypes = [object]
# pandas extension dtypes; if any of these attributes is missing from the
# installed pandas, the whole list falls back to empty.
try:
    extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
                        pd.Int32Dtype, pd.Int64Dtype,
                        pd.UInt8Dtype, pd.UInt16Dtype,
                        pd.UInt32Dtype, pd.UInt64Dtype,
                        pd.CategoricalDtype,
                        pd.IntervalDtype,
                        pd.DatetimeTZDtype('ns', 'UTC'),
                        pd.PeriodDtype('D')]
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
    """Reset NumPy's global random seed to a fixed value.

    All arguments are accepted and ignored.
    """
    # Each benchmark file imports this function so that the seed is set
    # before every benchmark function; see
    # http://asv.readthedocs.io/en/latest/writing_benchmarks.html
    fixed_seed = 1234
    np.random.seed(fixed_seed)


class BaseIO(object):
    """
    Base class for IO benchmarks
def test_make_meta():
    """Exercise ``make_meta`` over each kind of input handled below.

    Sections, in order: pandas frame/series/index, dask object, dict, list,
    tuple, custom iterable, explicit index, categoricals, scalars,
    Timestamp, DatetimeTZDtype and dtype expressions.
    """
    # Reference frame: int, object and float columns on a non-default index.
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": list("abc"),
        "c": [1.0, 2.0, 3.0]
    },
                      index=[10, 20, 30])

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # List
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # Iterable
    class CustomMetadata(Iterable):
        """Custom class iterator returning pandas types."""
        # ``max`` is accepted but never read.
        def __init__(self, max=0):
            self.types = [("a", "i8"), ("c", "f8"), ("b", "O")]

        def __iter__(self):
            # Restart from the first (name, dtype) pair on every iteration.
            self.n = 0
            return self

        def __next__(self):
            if self.n < len(self.types):
                ret = self.types[self.n]
                self.n += 1
                return ret
            else:
                raise StopIteration

    meta = make_meta(CustomMetadata())
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # With index
    idx = pd.Index([1, 2], name="foo")
    meta = make_meta(
        {
            "a": "i8",
            "b": "i4"
        },
        index=idx,
    )
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    meta = make_meta(("a", "i8"), index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x, parent_meta=df)
    assert meta is x

    # DatetimeTZDtype
    x = pd.DatetimeTZDtype(tz="UTC")
    meta = make_meta(x)
    assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit)

    # Dtype expressions
    meta = make_meta("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)
    # ``None`` is not a valid meta specification.
    assert pytest.raises(TypeError, lambda: make_meta(None))
Exemple #23
0
def test_make_meta():
    """Exercise ``make_meta`` over each kind of input handled below.

    Sections, in order: pandas frame/series/index, dask object, dict,
    iterable, tuple, explicit index, categoricals, scalars, Timestamp,
    DatetimeTZDtype and dtype expressions.
    """
    # Reference frame: int, object and float columns on a non-default index.
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": list("abc"),
        "c": [1.0, 2.0, 3.0]
    },
                      index=[10, 20, 30])

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    # ``pd.Int64Index`` no longer exists in pandas 2.0+; a plain ``pd.Index``
    # of ints is used instead, and the index class and its int64 dtype are
    # checked separately.
    idx = pd.Index([1, 2], name="foo")
    meta = make_meta(
        {
            "a": "i8",
            "b": "i4"
        },
        index=idx,
    )
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0
    meta = make_meta(("a", "i8"), index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x, parent_meta=df)
    assert meta is x

    # DatetimeTZDtype
    x = pd.DatetimeTZDtype(tz="UTC")
    meta = make_meta(x)
    assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit)

    # Dtype expressions
    meta = make_meta("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)
    # ``None`` is not a valid meta specification.
    assert pytest.raises(TypeError, lambda: make_meta(None))
Exemple #24
0
# Each mapping below pairs a dtype specification (key) with a pandas dtype
# alias or dtype object (value).
# NOTE(review): presumably these feed parametrized dtype tests -- confirm
# against the code that consumes them.
object_dtypes = {
    object: "object",
    np.object_: "object",
}

category_dtypes = {
    pa.Category: "category",
    pa.Category(["A", "B"], ordered=True): pd.CategoricalDtype(
        ["A", "B"], ordered=True
    ),
    pd.CategoricalDtype(["A", "B"], ordered=True): pd.CategoricalDtype(
        ["A", "B"], ordered=True
    ),
}

timestamp_dtypes = {
    datetime.datetime: "datetime64[ns]",
    np.datetime64: "datetime64[ns]",
    pa.Timestamp: "datetime64[ns]",
    pd.DatetimeTZDtype(tz="CET"): "datetime64[ns, CET]",
    pandas_engine.DateTime: "datetime64[ns]",
    pandas_engine.DateTime(unit="ns", tz="CET"):
    "datetime64[ns, CET]",  # type: ignore
}

# Timedelta specifications mapped to the pandas dtype alias.
# Each key is listed exactly once: a repeated key in a dict literal is
# silently collapsed into one entry, so a duplicate adds nothing.
timedelta_dtypes = {
    datetime.timedelta: "timedelta64",
    np.timedelta64: "timedelta64",
    pd.Timedelta: "timedelta64",
    pa.Timedelta: "timedelta64",
}

# Daily period dtype mapped to its pandas string alias.
period_dtypes = {
    pd.PeriodDtype(freq="D"): "period[D]",
}
# Series.astype does not accept a string alias for SparseDtype.
class TestSeriesConvertDtypes:
    """Tests for ``convert_dtypes`` on Series and DataFrame objects."""

    # Layout of ``answerdict``:
    #   * every key is a 4-tuple of tuples, one inner tuple per positional
    #     argument passed to ``convert_dtypes`` -- in order: infer_objects,
    #     convert_string, convert_integer, convert_boolean;
    #   * each inner tuple lists the flag values (True and/or False) that
    #     share the same expected result;
    #   * the value is the dtype expected for all of those flag combinations.
    # The test expands every key with ``product`` so that all 16 possible
    # flag combinations are covered without spelling each one out, since
    # many combinations expect the same answer.
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, ),
                    ):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False, )):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (  # GH32117
                ["h", "i", 1],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, ), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, ), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                [1, 2.5],
                object,
                {
                    ((True, ), (True, False), (True, False), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                    (
                        (False, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1),
                                         pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    # Crossed with the table above: every one of the 16 True/False flag
    # 4-tuples is run against every (data, maindtype, answerdict) row.
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Check the dtype produced by ``convert_dtypes`` for one flag combination.

        Parameters
        ----------
        data
            Values used to build the input Series.
        maindtype
            dtype passed to the Series constructor; when ``None`` the
            constructor is called without a ``dtype`` argument.
        params
            4-tuple of booleans forwarded positionally to ``convert_dtypes``.
        answerdict
            Compact mapping from groups of flag combinations to the expected
            dtype (see the comment above the parametrization).

        The test also verifies that the converted Series is independent of
        the input: modifying the result must leave the original untouched.
        """
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        # Flatten the compact answerdict: each grouped key expands to every
        # concrete 4-tuple of flags it stands for, all mapped to one dtype.
        answers = {
            k: a
            for (kk, a) in answerdict.items() for k in product(*kk)
        }

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        # Overwrite every non-missing value of the result with NaN.  For an
        # interval dtype with an integer/unsigned subtype this assignment is
        # expected to be rejected with a ValueError instead.
        if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                ns[ns.notna()] = np.nan
        else:
            ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)

    def test_convert_string_dtype(self):
        """A frame built with ``dtype="string"`` must equal its ``convert_dtypes()`` result."""
        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
        # that are already string dtype
        df = pd.DataFrame({
            "A": ["a", "b", pd.NA],
            "B": ["ä", "ö", "ü"]
        },
                          dtype="string")
        result = df.convert_dtypes()
        tm.assert_frame_equal(df, result)

    def test_convert_bool_dtype(self):
        """A frame whose column comes from ``pd.array([True])`` must equal its ``convert_dtypes()`` result."""
        # GH32287
        df = pd.DataFrame({"A": pd.array([True])})
        tm.assert_frame_equal(df, df.convert_dtypes())
 def astype_datetime(self):
     """Cast the ``"float"`` column of ``self.df`` to a US/Pacific tz-aware dtype.

     The converted result is discarded; nothing is returned or stored.
     """
     float_column = self.df["float"]
     pacific_dtype = pd.DatetimeTZDtype(tz="US/Pacific")
     float_column.astype(pacific_dtype)
Exemple #27
0
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 (
     pd.DatetimeIndex(["2000", "2001"]),
     None,
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 (
     ["2000", "2001"],
     np.dtype("datetime64[ns]"),
     DatetimeArray._from_sequence(["2000", "2001"]),
 ),
 # Datetime (tz-aware)
 (
     ["2000", "2001"],
     pd.DatetimeTZDtype(tz="CET"),
     DatetimeArray._from_sequence(
         ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
     ),
 ),
 # Timedelta
 (
     ["1H", "2H"],
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 (
     pd.TimedeltaIndex(["1H", "2H"]),
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
         np.dtype("object"),
     },
 ),
 (
     [1, 2.5],
     object,
     "Float64",
     {
         ("convert_floating", False): np.dtype("float"),
         ("infer_objects", False): np.dtype("object"),
     },
 ),
 (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
 (
     pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
     pd.DatetimeTZDtype(tz="UTC"),
     pd.DatetimeTZDtype(tz="UTC"),
     {},
 ),
 (
     pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
     "datetime64[ns]",
     np.dtype("datetime64[ns]"),
     {},
 ),
 (
     pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
     object,
     np.dtype("datetime64[ns]"),
     {
         ("infer_objects", False): np.dtype("object")