Ejemplo n.º 1
0
def test_datacolumn_init_with_extension_array():
    """Check that DataColumn can be built directly from pandas extension arrays."""
    # Scenario 1: categorical extension array, no name supplied.
    expected_categories = pd.Series([1, 2, 3], dtype='category')
    categorical_array = pd.Categorical([1, 2, 3])

    unnamed_col = DataColumn(categorical_array)
    result = unnamed_col.to_series()
    assert result.equals(expected_categories)
    assert result.name is None
    assert unnamed_col.name is None
    assert unnamed_col.dtype == 'category'
    assert unnamed_col.logical_type == Categorical

    # Scenario 2: nullable integer extension array with an explicit name.
    # The mask marks the third entry as missing, so the expected series
    # carries None in that position.
    expected_ints = pd.Series([1, 2, None, 4], dtype='Int64')
    integer_array = pd.arrays.IntegerArray(
        np.array([1, 2, 3, 4], dtype="int64"),
        mask=np.array([False, False, True, False]),
    )

    named_col = DataColumn(integer_array, name='extension')
    result = named_col.to_series()
    assert result.equals(expected_ints)
    assert result.name == 'extension'
    assert named_col.name == 'extension'

    # Scenario 3: the same integer array, but with the logical type
    # overridden to NaturalLanguage; the data is expected back as strings.
    expected_strings = pd.Series([1, 2, None, 4], dtype='string')

    natural_language_col = DataColumn(integer_array, logical_type='NaturalLanguage')
    result = natural_language_col.to_series()
    assert result.equals(expected_strings)
    assert natural_language_col.logical_type == NaturalLanguage
    assert natural_language_col.dtype == 'string'
Ejemplo n.º 2
0
def test_datacolumn_inity_with_falsy_name(sample_series):
    """Check that a falsy name (``0``) passed to DataColumn is still applied."""
    zero_name = 0
    expected_message = 'Name mismatch between sample_series and 0. DataColumn and underlying series name are now 0'

    # The constructor is expected to warn about the name mismatch; it is
    # given a copy of the fixture series rather than the fixture itself.
    with pytest.warns(ColumnNameMismatchWarning, match=expected_message):
        column = DataColumn(sample_series.copy(), name=zero_name)

    # Both the column and the series it wraps must report the falsy name.
    assert column.name == zero_name
    underlying = column.to_series()
    assert underlying.name == zero_name
Ejemplo n.º 3
0
def test_shape(sample_series):
    """Check that DataColumn.shape matches the shape of the series it wraps."""
    column = DataColumn(sample_series)
    column_shape = column.shape
    underlying_shape = column.to_series().shape

    is_dask_input = dd and isinstance(sample_series, dd.Series)
    if is_dask_input:
        # For Dask input the first shape entry is computed before comparing.
        column_shape = (column_shape[0].compute(),)
        underlying_shape = (underlying_shape[0].compute(),)

    assert column_shape == (4,)
    assert column_shape == underlying_shape
Ejemplo n.º 4
0
def test_datacolumn_init(sample_series):
    """Check the data, name, logical type and tags of a freshly built DataColumn."""
    column = DataColumn(sample_series, use_standard_tags=False)

    is_koalas_input = ks and isinstance(sample_series, ks.Series)
    if not is_koalas_input:
        # Koalas doesn't support category dtype, so the expected series is
        # only converted for the other series types.
        sample_series = sample_series.astype('category')

    actual = to_pandas(column.to_series())
    expected = to_pandas(sample_series)
    pd.testing.assert_series_equal(actual, expected)

    assert column.name == sample_series.name
    assert column.logical_type == Categorical
    # Standard tags were disabled, so no semantic tags are expected.
    assert column.semantic_tags == set()
Ejemplo n.º 5
0
def test_datacolumn_init_with_name(sample_series, sample_datetime_series):
    """Check how DataColumn picks its name from the series and the ``name`` argument."""
    series_name = 'sample_series'
    override_name = 'changed_name'

    # No explicit name: the column reports the series' own name.
    default_named_col = DataColumn(sample_series)
    assert default_named_col.name == series_name
    assert default_named_col.to_series().name == series_name

    # Explicit name that differs from the series name: a warning is expected
    # and both the column and its series report the new name.
    expected_message = 'Name mismatch between sample_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=expected_message):
        renamed_col = DataColumn(sample_series, name=override_name)
    assert renamed_col.name == override_name
    assert renamed_col.to_series().name == override_name

    # Same check against the datetime fixture series.
    expected_message = 'Name mismatch between sample_datetime_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=expected_message):
        renamed_datetime_col = DataColumn(sample_datetime_series, name=override_name)
    assert renamed_datetime_col.name == override_name
    assert renamed_datetime_col.to_series().name == override_name
Ejemplo n.º 6
0
def test_latlong_formatting(latlongs):
    """Check that every series in ``latlongs`` yields the same LatLong column."""
    # The expected data depends on the series library of the fixture.
    expected_series = pd.Series([(1, 2), (3, 4)])
    if ks and isinstance(latlongs[0], ks.Series):
        expected_series = ks.Series([[1, 2], [3, 4]])
    elif dd and isinstance(latlongs[0], dd.Series):
        expected_series = dd.from_pandas(expected_series, npartitions=2)

    reference_col = DataColumn(expected_series, logical_type='LatLong', name='test_series')

    for candidate in latlongs:
        candidate_col = DataColumn(candidate, logical_type='LatLong', name='test_series')

        # Compare the underlying data first, then the columns themselves.
        actual = to_pandas(candidate_col.to_series())
        expected = to_pandas(expected_series)
        pd.testing.assert_series_equal(actual, expected)

        assert candidate_col == reference_col
Ejemplo n.º 7
0
def test_datacolumn_equality(sample_series, sample_datetime_series):
    """Check DataColumn equality across parameters, logical types and data."""
    # --- Different parameters to DataColumn ---
    baseline_col = DataColumn(sample_series, logical_type='Categorical')
    equivalent_col = DataColumn(sample_series, logical_type=Categorical)
    tagged_col = DataColumn(sample_series, logical_type=Categorical, semantic_tags={'test'})
    other_name_col = DataColumn(sample_datetime_series, logical_type=Categorical)
    other_dtype_col = DataColumn(sample_series, logical_type=NaturalLanguage)
    described_col = DataColumn(sample_series, logical_type='Categorical', description='description')
    with_metadata_col = DataColumn(sample_series, logical_type='Categorical', metadata={'interesting_values': ['a', 'b']})

    # A logical type given as a string or as a class is expected to compare equal.
    assert baseline_col == equivalent_col
    # Any other difference is expected to break equality.
    differing_cols = (
        tagged_col,
        other_name_col,
        other_dtype_col,
        described_col,
        with_metadata_col,
    )
    for differing_col in differing_cols:
        assert baseline_col != differing_col

    # --- Same logical type, different logical type parameters ---
    first_order = Ordinal(order=['a', 'b', 'c'])
    second_order = Ordinal(order=['b', 'a', 'c'])
    first_ordinal_col = DataColumn(sample_series, logical_type=first_order)
    second_ordinal_col = DataColumn(sample_series, logical_type=second_order)

    assert baseline_col != first_ordinal_col
    assert first_ordinal_col != second_ordinal_col
    assert first_ordinal_col == first_ordinal_col

    formatted_datetime = Datetime(datetime_format='%Y-%m%d')
    formatted_col = DataColumn(sample_datetime_series, logical_type=formatted_datetime)
    explicit_none_col = DataColumn(sample_datetime_series, logical_type=Datetime(datetime_format=None))
    instantiated_col = DataColumn(sample_datetime_series, logical_type=Datetime())
    class_col = DataColumn(sample_datetime_series, logical_type=Datetime)

    # The Datetime class and a Datetime() instance are expected to differ,
    # while Datetime() and Datetime(datetime_format=None) are expected to match.
    assert class_col != instantiated_col
    assert instantiated_col != formatted_col
    assert instantiated_col == explicit_none_col

    # --- Different underlying series ---
    natural_language_col = DataColumn(sample_series, logical_type='NaturalLanguage')
    replaced_series = sample_series.copy().replace(to_replace='a', value='test')
    replaced_col = DataColumn(replaced_series, logical_type='NaturalLanguage')

    # We only check underlying data for equality with pandas dataframes
    if not isinstance(natural_language_col.to_series(), pd.Series):
        assert natural_language_col == replaced_col
    else:
        assert natural_language_col != replaced_col
Ejemplo n.º 8
0
def test_to_series(sample_series):
    """Check that to_series() hands back the series stored on the column."""
    column = DataColumn(sample_series)
    returned = column.to_series()

    # Identity check: the returned object must be the stored one, not a copy.
    assert column._series is returned
    # Value check on the pandas representations of both.
    pd.testing.assert_series_equal(to_pandas(returned), to_pandas(column._series))