Example #1
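These snippets come from Woodwork's DataTable test suite and rely on imports, fixtures, and helpers defined elsewhere in that suite. The block below is a rough sketch of what they assume; the exact module paths depend on the Woodwork release the tests target, and the names in the trailing comment are test-suite utilities rather than public Woodwork API.

import numpy as np
import pandas as pd
import pytest

import woodwork as ww
from woodwork import DataColumn, DataTable, deserialize
from woodwork.exceptions import ColumnNameMismatchWarning
from woodwork.logical_types import (
    Boolean, Categorical, CountryCode, Datetime, Double, EmailAddress,
    Filepath, FullName, Integer, IPAddress, LatLong, NaturalLanguage,
    Ordinal, PhoneNumber, SubRegionCode, URL, ZIPCode
)

# dd (dask.dataframe) and ks (databricks.koalas) are optional backends and are
# None when the corresponding library is not installed.
# to_pandas, make_public, xfail_tmp_disappears, S3_URL, TEST_S3_URL, and the
# sample_df / latlong_df fixtures are test-suite helpers, not Woodwork API.
# Note that URL is also used below as a test constant pointing at a hosted
# CSV; that constant comes from the serialization tests, not the URL logical
# type imported here.
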
def test_setitem_new_column(sample_df):
    dt = DataTable(sample_df)
    new_series = pd.Series([1, 2, 3])
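    # Expected dtype differs by backend: Koalas columns stay plain int64,
    # while pandas columns get the nullable Int64 extension dtype.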
    if ks and isinstance(sample_df, ks.DataFrame):
        dtype = 'int64'
        new_series = ks.Series(new_series)
    else:
        dtype = 'Int64'

    new_col = DataColumn(new_series, use_standard_tags=False)
    assert new_col.name is None

    dt['test_col2'] = new_col
    updated_df = dt.to_dataframe()
    assert 'test_col2' in dt.columns
    assert dt['test_col2'].logical_type == Integer
    assert dt['test_col2'].semantic_tags == set()
    assert dt['test_col2'].name == 'test_col2'
    assert dt['test_col2']._series.name == 'test_col2'
    assert 'test_col2' in updated_df.columns
    assert updated_df['test_col2'].dtype == dtype

    # Standard tags and no logical type
    new_series = pd.Series(['new', 'column', 'inserted'], name='test_col')
    if ks and isinstance(sample_df, ks.DataFrame):
        dtype = 'object'
        new_series = ks.Series(new_series)
    else:
        dtype = 'category'
    new_col = DataColumn(new_series, use_standard_tags=True)
    dt['test_col'] = new_col
    updated_df = dt.to_dataframe()
    assert 'test_col' in dt.columns
    assert dt['test_col'].logical_type == Categorical
    assert dt['test_col'].semantic_tags == {'category'}
    assert dt['test_col'].name == 'test_col'
    assert dt['test_col']._series.name == 'test_col'
    assert 'test_col' in updated_df.columns
    assert updated_df['test_col'].dtype == dtype

    # Add with logical type and semantic tag
    new_series = pd.Series([1, 2, 3])
    if ks and isinstance(sample_df, ks.DataFrame):
        new_series = ks.Series(new_series)
    new_col = DataColumn(new_series,
                         logical_type=Double,
                         use_standard_tags=False,
                         semantic_tags={'test_tag'})
    dt['test_col3'] = new_col
    updated_df = dt.to_dataframe()
    assert 'test_col3' in dt.columns
    assert dt['test_col3'].logical_type == Double
    assert dt['test_col3'].semantic_tags == {'test_tag'}
    assert dt['test_col3'].name == 'test_col3'
    assert dt['test_col3']._series.name == 'test_col3'
    assert 'test_col3' in updated_df.columns
    assert updated_df['test_col3'].dtype == 'float'
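
Most of these tests take a sample_df fixture, which in the real suite is parametrized to yield pandas, Dask, and Koalas versions of the same frame. Judging from the columns the tests reference ('id', 'full_name', 'email', 'phone_number', 'age', 'signup_date'), a minimal pandas-only stand-in might look like the following; the values are illustrative, not the ones in the actual fixture:

@pytest.fixture
def sample_df():
    # Illustrative stand-in for the suite's parametrized fixture.
    return pd.DataFrame({
        'id': [0, 1, 2, 3],
        'full_name': ['John Doe', 'Jane Doe', 'James Brown', 'Paige Turner'],
        'email': ['john@example.com', None, 'james@example.com', 'paige@example.com'],
        'phone_number': ['555-555-5555', '555-555-5556', '555-555-5557', '555-555-5558'],
        'age': [25, 33, 33, 57],
        'signup_date': pd.to_datetime(['2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04']),
    })
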
def test_sets_category_dtype_on_init():
    column_name = 'test_series'
    series_list = [
        pd.Series(['a', 'b', 'c'], name=column_name),
        pd.Series(['a', None, 'c'], name=column_name),
        pd.Series(['a', np.nan, 'c'], name=column_name),
        pd.Series(['a', pd.NA, 'c'], name=column_name),
        pd.Series(['a', pd.NaT, 'c'], name=column_name),
    ]

    logical_types = [
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    ]

    for series in series_list:
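        # Cast to object first so DataTable has to convert the series to the
        # logical type's backing dtype during init.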
        series = series.astype('object')
        for logical_type in logical_types:
            ltypes = {
                column_name: logical_type,
            }
            dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
            assert dt.columns[column_name].logical_type == logical_type
            assert dt.columns[column_name].dtype == logical_type.pandas_dtype
            assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype
def test_sets_string_dtype_on_init():
    column_name = 'test_series'
    series_list = [
        pd.Series(['a', 'b', 'c'], name=column_name),
        pd.Series(['a', None, 'c'], name=column_name),
        pd.Series(['a', np.nan, 'c'], name=column_name),
        pd.Series(['a', pd.NA, 'c'], name=column_name),
    ]

    logical_types = [
        Filepath,
        FullName,
        IPAddress,
        NaturalLanguage,
        PhoneNumber,
        URL,
    ]

    for series in series_list:
        series = series.astype('object')
        for logical_type in logical_types:
            ltypes = {
                column_name: logical_type,
            }
            dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
            assert dt.columns[column_name].logical_type == logical_type
            assert dt.columns[column_name].dtype == logical_type.pandas_dtype
            assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype
Example #4
def test_to_csv(sample_df, tmpdir):
    dt = DataTable(sample_df,
                   name='test_data',
                   index='id',
                   semantic_tags={'id': 'tag1'},
                   logical_types={'age': Ordinal(order=[25, 33, 57])},
                   column_descriptions={
                       'signup_date': 'original signup date',
                       'age': 'age of the user'
                   },
                   column_metadata={
                       'id': {
                           'is_sorted': True
                       },
                       'age': {
                           'interesting_values': [33, 57]
                       }
                   })

    dt.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    _dt = deserialize.read_datatable(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=_dt.index, sort_index=True),
        to_pandas(_dt.to_dataframe(), index=_dt.index, sort_index=True))
    assert dt == _dt
Example #5
def test_deserialize_s3_csv(sample_df_pandas):
    dt = DataTable(sample_df_pandas, index='id')
    _dt = deserialize.read_datatable(S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert dt == _dt
def test_sets_object_dtype_on_update(latlong_df):
    for column_name in latlong_df.columns:
        ltypes = {column_name: NaturalLanguage}
        dt = DataTable(latlong_df.loc[:, [column_name]], logical_types=ltypes)
        dt = dt.set_types(logical_types={column_name: LatLong})
        assert dt.columns[column_name].logical_type == LatLong
        assert dt.columns[column_name].dtype == LatLong.pandas_dtype
        assert dt.to_dataframe()[column_name].dtype == LatLong.pandas_dtype
def test_set_index(sample_df):
    # Test setting index with set_index()
    dt = DataTable(sample_df)
    new_dt = dt.set_index('id')
    assert new_dt is not dt
    assert new_dt.index == 'id'
    assert dt.index is None
    assert new_dt.columns['id'].semantic_tags == {'index'}
    non_index_cols = [
        col for col in new_dt.columns.values() if col.name != 'id'
    ]
    assert all(['index' not in col.semantic_tags for col in non_index_cols])
    # Test changing index with set_index()
    new_dt2 = new_dt.set_index('full_name')
    assert new_dt.index == 'id'
    assert new_dt2.columns['full_name'].semantic_tags == {'index'}
    non_index_cols = [
        col for col in new_dt2.columns.values() if col.name != 'full_name'
    ]
    assert all(['index' not in col.semantic_tags for col in non_index_cols])

    # Test setting index using setter
    dt = DataTable(sample_df)
    dt.index = 'id'
    assert dt.index == 'id'
    assert 'index' in dt.columns['id'].semantic_tags
    non_index_cols = [col for col in dt.columns.values() if col.name != 'id']
    assert all(['index' not in col.semantic_tags for col in non_index_cols])
    # Test changing index with setter
    dt.index = 'full_name'
    assert 'index' in dt.columns['full_name'].semantic_tags
    non_index_cols = [
        col for col in dt.columns.values() if col.name != 'full_name'
    ]
    assert all(['index' not in col.semantic_tags for col in non_index_cols])

    # Test changing index also changes underlying DataFrame - pandas only
    if isinstance(sample_df, pd.DataFrame):
        dt = DataTable(sample_df)
        dt.index = 'id'
        assert (dt.to_dataframe().index == [0, 1, 2, 3]).all()
        assert (dt._dataframe.index == [0, 1, 2, 3]).all()
        dt.index = 'full_name'
        assert (dt.to_dataframe().index == dt.to_dataframe()['full_name']).all()
        assert (dt._dataframe.index == dt.to_dataframe()['full_name']).all()
Example #8
def test_to_parquet(sample_df, tmpdir):
    dt = DataTable(sample_df, index='id')
    dt.to_parquet(str(tmpdir))
    _dt = deserialize.read_datatable(str(tmpdir))
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
        to_pandas(_dt.to_dataframe(), index=_dt.index, sort_index=True))
    assert dt == _dt
Example #9
def test_deserialize_url_csv_anon(sample_df_pandas):
    dt = DataTable(sample_df_pandas, index='id')
    _dt = deserialize.read_datatable(URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert dt == _dt
Example #10
def test_setitem_overwrite_column(sample_df):
    dt = DataTable(sample_df,
                   index='id',
                   time_index='signup_date',
                   use_standard_tags=True)

    # Change to column no change in types
    original_col = dt['age']
    new_series = pd.Series([1, 2, 3])
    if ks and isinstance(sample_df, ks.DataFrame):
        dtype = 'int64'
        new_series = ks.Series(new_series)
    else:
        dtype = 'Int64'
    overwrite_col = DataColumn(new_series, use_standard_tags=True)
    dt['age'] = overwrite_col
    updated_df = dt.to_dataframe()

    assert 'age' in dt.columns
    assert dt['age'].logical_type == original_col.logical_type
    assert dt['age'].semantic_tags == original_col.semantic_tags
    assert 'age' in updated_df.columns
    assert updated_df['age'].dtype == dtype
    assert original_col.to_series() is not dt['age'].to_series()

    # Change dtype, logical types, and tags with conflicting use_standard_tags
    original_col = dt['full_name']
    new_series = pd.Series([True, False, False])
    if ks and isinstance(sample_df, ks.DataFrame):
        new_series = ks.Series(new_series)
        dtype = 'bool'
    else:
        dtype = 'boolean'
    overwrite_col = DataColumn(new_series.astype(dtype),
                               use_standard_tags=False,
                               semantic_tags='test_tag')
    dt['full_name'] = overwrite_col
    updated_df = dt.to_dataframe()

    assert 'full_name' in dt.columns
    assert dt['full_name'].logical_type == Boolean
    assert dt['full_name'].semantic_tags == {'test_tag'}
    assert 'full_name' in updated_df.columns
    assert updated_df['full_name'].dtype == dtype
    assert original_col.to_series() is not dt['full_name'].to_series()
Example #11
def test_datatable_getitem_list_input(sample_df):
    # Test regular columns
    dt = DataTable(sample_df, time_index='signup_date', index='id', name='dt_name')
    df = dt.to_dataframe()
    columns = ['age', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]).reset_index(drop=True), to_pandas(new_dt.to_dataframe()))
    assert all(new_dt.to_dataframe().columns == ['age', 'full_name'])
    assert set(new_dt.columns.keys()) == {'age', 'full_name'}
    assert new_dt.index is None
    assert new_dt.time_index is None

    # Test with index
    columns = ['id', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]), to_pandas(new_dt.to_dataframe()))
    assert all(new_dt.to_dataframe().columns == ['id', 'full_name'])
    assert set(new_dt.columns.keys()) == {'id', 'full_name'}
    assert new_dt.index == 'id'
    assert new_dt.time_index is None

    # Test with time_index
    columns = ['id', 'signup_date', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]), to_pandas(new_dt.to_dataframe()), check_index_type=False)
    assert all(new_dt.to_dataframe().columns == ['id', 'signup_date', 'full_name'])
    assert set(new_dt.columns.keys()) == {'id', 'signup_date', 'full_name'}
    assert new_dt.index == 'id'

    # Test with empty list selector
    columns = []
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    assert to_pandas(new_dt.to_dataframe()).empty
    assert set(new_dt.columns.keys()) == set()
    assert new_dt.index is None
    assert new_dt.time_index is None

    # Test that reversed column order reverses resulting column order
    columns = list(reversed(list(dt.columns.keys())))
    new_dt = dt[columns]

    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    assert all(df.columns[::-1] == new_dt.to_dataframe().columns)
    assert all(dt.types.index[::-1] == new_dt.types.index)
    assert all(new_dt.to_dataframe().columns == new_dt.types.index)
    assert set(new_dt.columns.keys()) == set(dt.columns.keys())
    assert new_dt.index == 'id'
    assert new_dt.time_index == 'signup_date'
def test_sets_object_dtype_on_init(latlong_df):
    for column_name in latlong_df.columns:
        ltypes = {
            column_name: LatLong,
        }
        dt = DataTable(latlong_df.loc[:, [column_name]], logical_types=ltypes)
        assert dt.columns[column_name].logical_type == LatLong
        assert dt.columns[column_name].dtype == LatLong.pandas_dtype
        assert dt.to_dataframe()[column_name].dtype == LatLong.pandas_dtype
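
The latlong_df fixture used here and in the parquet/pickle tests holds coordinate pairs in the various input formats the LatLong logical type accepts, one format per column. A pandas-only sketch with made-up column names and values:

@pytest.fixture
def latlong_df():
    # Each column encodes the same pairs in a different accepted format.
    return pd.DataFrame({
        'tuples': [(1, 2), (3, 4)],
        'lists': [[1, 2], [3, 4]],
        'strings': ['(1, 2)', '(3, 4)'],
    })
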
Example #13
def test_underlying_index_on_update(sample_df):
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail(
            'Setting underlying index is not supported with Dask input')
    if ks and isinstance(sample_df, ks.DataFrame):
        pytest.xfail(
            'Setting underlying index is not supported with Koalas input')

    dt = DataTable(sample_df.copy(), index='id')

    dt.update_dataframe(sample_df.tail(2))
    assert (dt._dataframe.index == [2, 3]).all()
    assert dt._dataframe.index.name is None
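    # The pd.Int64Index checks below assume an older pandas release;
    # Int64Index was deprecated in pandas 1.4 and removed in 2.0.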
    assert type(dt._dataframe.index) == pd.Int64Index
    assert type(dt.to_dataframe().index) == pd.Int64Index

    actual = dt.iloc[[0, 1]]
    assert type(actual._dataframe.index) == pd.Index
    assert type(actual.to_dataframe().index) == pd.Index

    actual = dt.select(dt.index)
    assert type(actual._dataframe.index) == pd.Int64Index
    assert type(actual.to_dataframe().index) == pd.Int64Index

    actual = dt[['age']]
    assert type(actual._dataframe.index) == pd.Int64Index
    assert type(actual.to_dataframe().index) == pd.Int64Index

    actual = dt.drop(dt.index)
    assert type(actual._dataframe.index) == pd.RangeIndex
    assert type(actual.to_dataframe().index) == pd.RangeIndex

    actual = dt.reset_semantic_tags(retain_index_tags=False)
    assert type(actual._dataframe.index) == pd.RangeIndex
    assert type(actual.to_dataframe().index) == pd.RangeIndex

    actual = dt.set_types(retain_index_tags=False,
                          semantic_tags={'id': 'numeric'})
    assert type(actual._dataframe.index) == pd.RangeIndex
    assert type(actual.to_dataframe().index) == pd.RangeIndex

    dt.pop(dt.index)
    assert type(dt._dataframe.index) == pd.RangeIndex
    assert type(dt.to_dataframe().index) == pd.RangeIndex
Example #14
def test_serialize_s3_pickle_anon(sample_df_pandas, s3_client, s3_bucket):
    pandas_dt = DataTable(sample_df_pandas)
    pandas_dt.to_pickle(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    _dt = deserialize.read_datatable(TEST_S3_URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(pandas_dt.to_dataframe(), index=pandas_dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert pandas_dt == _dt
def test_underlying_index_no_index(sample_df):
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail(
            'Setting underlying index is not supported with Dask input')
    if ks and isinstance(sample_df, ks.DataFrame):
        pytest.xfail(
            'Setting underlying index is not supported with Koalas input')

    assert type(sample_df.index) == pd.RangeIndex
    dt = DataTable(sample_df.copy())
    assert type(dt._dataframe.index) == pd.RangeIndex
    assert type(dt.to_dataframe().index) == pd.RangeIndex

    sample_df = sample_df.sort_values('full_name')
    assert type(sample_df.index) == pd.Int64Index
    dt = DataTable(sample_df)

    assert type(dt._dataframe.index) == pd.RangeIndex
    assert type(dt.to_dataframe().index) == pd.RangeIndex
def test_datatable_init(sample_df):
    dt = DataTable(sample_df)
    df = dt.to_dataframe()

    assert dt.name is None
    assert dt.index is None
    assert dt.time_index is None

    assert set(dt.columns.keys()) == set(sample_df.columns)
    assert df is sample_df
    pd.testing.assert_frame_equal(to_pandas(df), to_pandas(sample_df))
Example #17
def test_to_parquet_with_latlong(latlong_df, tmpdir):
    dt = DataTable(
        latlong_df,
        logical_types={col: 'LatLong' for col in latlong_df.columns})
    dt.to_parquet(str(tmpdir))
    _dt = deserialize.read_datatable(str(tmpdir))

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
        to_pandas(_dt.to_dataframe(), index=_dt.index, sort_index=True))
    assert dt == _dt
Example #18
def test_serialize_s3_parquet_anon(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)

    dt = DataTable(sample_df)
    dt.to_parquet(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    _dt = deserialize.read_datatable(TEST_S3_URL, profile_name=False)

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert dt == _dt
def test_sets_float64_dtype_on_update():
    column_name = 'test_series'
    series = pd.Series([0, 1, 0], name=column_name)
    series = series.astype('object')
    ltypes = {
        column_name: Integer,
    }
    dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
    dt = dt.set_types(logical_types={column_name: Double})
    assert dt.columns[column_name].logical_type == Double
    assert dt.columns[column_name].dtype == Double.pandas_dtype
    assert dt.to_dataframe()[column_name].dtype == Double.pandas_dtype
Example #20
def test_setitem_different_name(sample_df):
    dt = DataTable(sample_df)

    new_series = pd.Series([1, 2, 3, 4], name='wrong')
    if ks and isinstance(sample_df, ks.DataFrame):
        new_series = ks.Series(new_series)

    warning = 'Name mismatch between wrong and id. DataColumn and underlying series name are now id'
    with pytest.warns(ColumnNameMismatchWarning, match=warning):
        dt['id'] = DataColumn(new_series,
                              use_standard_tags=False)

    assert dt['id'].name == 'id'
    assert dt['id'].to_series().name == 'id'
    assert dt.to_dataframe()['id'].name == 'id'
    assert 'wrong' not in dt.columns

    new_series2 = pd.Series([1, 2, 3, 4], name='wrong2')
    if ks and isinstance(sample_df, ks.DataFrame):
        new_series2 = ks.Series(new_series2)

    warning = 'Name mismatch between wrong2 and new_col. DataColumn and underlying series name are now new_col'
    with pytest.warns(ColumnNameMismatchWarning, match=warning):
        dt['new_col'] = DataColumn(new_series2,
                                   use_standard_tags=False)

    assert dt['new_col'].name == 'new_col'
    assert dt['new_col'].to_series().name == 'new_col'
    assert dt.to_dataframe()['new_col'].name == 'new_col'
    assert 'wrong2' not in dt.columns

    warning = 'Name mismatch between wrong and col_with_name. DataColumn and underlying series name are now col_with_name'
    with pytest.warns(ColumnNameMismatchWarning, match=warning):
        dt['col_with_name'] = DataColumn(new_series,
                                         use_standard_tags=False, name='wrong')
    assert dt['col_with_name'].name == 'col_with_name'
    assert dt['col_with_name'].to_series().name == 'col_with_name'
    assert dt.to_dataframe()['col_with_name'].name == 'col_with_name'
    assert 'wrong' not in dt.columns
def test_sets_datetime_dtype_on_update():
    column_name = 'test_series'
    series = pd.Series(['2020-01-01', '2020-01-02', '2020-01-03'],
                       name=column_name)
    series = series.astype('object')
    ltypes = {
        column_name: NaturalLanguage,
    }
    dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
    dt = dt.set_types(logical_types={column_name: Datetime})
    assert dt.columns[column_name].logical_type == Datetime
    assert dt.columns[column_name].dtype == Datetime.pandas_dtype
    assert dt.to_dataframe()[column_name].dtype == Datetime.pandas_dtype
Example #22
def test_to_pickle(sample_df, tmpdir):
    dt = DataTable(sample_df)
    if not isinstance(sample_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            dt.to_pickle(str(tmpdir))
    else:
        dt.to_pickle(str(tmpdir))
        _dt = deserialize.read_datatable(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(dt.to_dataframe(), index=dt.index),
            to_pandas(_dt.to_dataframe(), index=_dt.index))
        assert dt == _dt
Example #23
def test_s3_test_profile(sample_df, s3_client, s3_bucket, setup_test_profile):
    xfail_tmp_disappears(sample_df)
    dt = DataTable(sample_df)
    dt.to_csv(TEST_S3_URL,
              encoding='utf-8',
              engine='python',
              profile_name='test')
    make_public(s3_client, s3_bucket)
    _dt = deserialize.read_datatable(TEST_S3_URL, profile_name='test')

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert dt == _dt
def test_select_ltypes_no_match_and_all(sample_df):
    dt = DataTable(sample_df)
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    assert len(dt.select(ZIPCode).columns) == 0
    assert len(dt.select(['ZIPCode', PhoneNumber]).columns) == 1
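    # ww.type_system.registered_types lists every registered LogicalType, so
    # selecting all of them should keep every column.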
    all_types = ww.type_system.registered_types
    dt_all_types = dt.select(all_types)
    assert len(dt_all_types.columns) == len(dt.columns)
    assert len(dt_all_types.to_dataframe().columns) == len(dt.to_dataframe().columns)
def test_sets_int64_dtype_on_update():
    column_name = 'test_series'
    series = pd.Series([1.0, 2.0, 1.0], name=column_name)
    series = series.astype('object')
    logical_types = [Integer]

    for logical_type in logical_types:
        ltypes = {
            column_name: Double,
        }
        dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
        dt = dt.set_types(logical_types={column_name: logical_type})
        assert dt.columns[column_name].logical_type == logical_type
        assert dt.columns[column_name].dtype == logical_type.pandas_dtype
        assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype
Example #27
def test_to_pickle_with_latlong(latlong_df, tmpdir):
    dt = DataTable(
        latlong_df,
        logical_types={col: 'LatLong' for col in latlong_df.columns})
    if not isinstance(latlong_df, pd.DataFrame):
        msg = 'DataFrame type not compatible with pickle serialization. Please serialize to another format.'
        with pytest.raises(ValueError, match=msg):
            dt.to_pickle(str(tmpdir))
    else:
        dt.to_pickle(str(tmpdir))
        _dt = deserialize.read_datatable(str(tmpdir))

        pd.testing.assert_frame_equal(
            to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
            to_pandas(_dt.to_dataframe(), index=_dt.index, sort_index=True))
        assert dt == _dt
Example #28
def test_to_csv_S3(sample_df, s3_client, s3_bucket):
    xfail_tmp_disappears(sample_df)

    dt = DataTable(sample_df,
                   name='test_data',
                   index='id',
                   semantic_tags={'id': 'tag1'},
                   logical_types={'age': Ordinal(order=[25, 33, 57])})
    dt.to_csv(TEST_S3_URL, encoding='utf-8', engine='python')
    make_public(s3_client, s3_bucket)

    _dt = deserialize.read_datatable(TEST_S3_URL)

    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(_dt.to_dataframe(), index=_dt.index))
    assert dt == _dt
def test_sets_float64_dtype_on_init():
    column_name = 'test_series'
    series_list = [
        pd.Series([1.1, 2, 3], name=column_name),
        pd.Series([1.1, None, 3], name=column_name),
        pd.Series([1.1, np.nan, 3], name=column_name),
    ]

    logical_type = Double
    for series in series_list:
        series = series.astype('object')
        ltypes = {
            column_name: logical_type,
        }
        dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
        assert dt.columns[column_name].logical_type == logical_type
        assert dt.columns[column_name].dtype == logical_type.pandas_dtype
        assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype
def test_sets_boolean_dtype_on_init():
    column_name = 'test_series'
    series_list = [
        pd.Series([True, False, True], name=column_name),
        pd.Series([True, None, True], name=column_name),
        pd.Series([True, np.nan, True], name=column_name),
        pd.Series([True, pd.NA, True], name=column_name),
    ]

    logical_type = Boolean
    for series in series_list:
        series = series.astype('object')
        ltypes = {
            column_name: logical_type,
        }
        dt = DataTable(pd.DataFrame(series), logical_types=ltypes)
        assert dt.columns[column_name].logical_type == logical_type
        assert dt.columns[column_name].dtype == logical_type.pandas_dtype
        assert dt.to_dataframe()[column_name].dtype == logical_type.pandas_dtype