Example #1
def test_standardScale_1_attr_all_nan():
    """Check StandardScaler when the only selected column is entirely NaN.

    The test walks through the option/shape life cycle (the output shape is
    only predicted when both options and an input shape are present) and then
    compares the executed result with a NaN-aware standardisation computed
    with numpy.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': [np.nan, np.nan, np.nan, np.nan, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = StandardScaler()
    # Neither options nor input shape are set yet
    assert op.getOutputShape() is None
    op.setOptions(attributes={1: None})
    # Options alone are not enough to predict the shape
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s

    # NOTE(review): the step that drops the input shape was missing, which
    # made the two surrounding assertions contradict each other.
    # `removeInputShape` is assumed to be the counterpart of `addInputShape`
    # -- confirm against the operation API.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)
    # Mean and std are loop invariants; both are NaN for an all-NaN column
    col2Mean = np.nanmean(d['col2'])
    col2Std = np.nanstd(d['col2'])
    expected = {
        # NOTE(review): col1 is not selected for scaling, so it is expected
        # unchanged; this entry was absent although sibling tests compare
        # every non-index column -- confirm against Frame.to_dict().
        'col1': d['col1'],
        'col2': [(x - col2Mean) / col2Std for x in d['col2']],
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(),
                                   4)) == nan_to_None(roundValues(expected, 4))
    # The executed frame must keep the shape of the input
    assert g.shape == s
Example #2
def test_minMaxScale():
    """Check MinMaxScaler with a different target range for each column.

    Attribute 0 is rescaled to (-1, 1) and attribute 1 to (2, 4). The test
    verifies shape prediction across the option/shape life cycle, the scaled
    values (NaN entries stay NaN), that the input frame is not modified, and
    that the options are returned as they were set.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    # Options alone are not enough to predict the shape
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s

    # NOTE(review): the step that drops the input shape was missing, which
    # made the two surrounding assertions contradict each other.
    # `removeInputShape` is assumed to be the counterpart of `addInputShape`
    # -- confirm against the operation API.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    def rescaled(values, low, high):
        # NaN-aware bounds: the built-in min()/max() only give the right
        # answer on lists containing NaN when the NaN happens not to come
        # first, so they must not be relied upon here.
        vMin, vMax = np.nanmin(values), np.nanmax(values)
        return [(x - vMin) / (vMax - vMin) * (high - low) + low
                for x in values]

    g = op.execute(f)
    expected = {
        'col1': rescaled(d['col1'], -1, 1),
        'col2': rescaled(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(),
                                   4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    # The operation must not scale the input frame in place
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {
                'range': (-1, 1)
            },
            1: {
                'range': (2, 4)
            }
        }
    }
Example #3
def test_fillnan_ffill():
    """Check FillNan in forward-fill mode on string, ordinal and date columns.

    Verifies that the options are stored as given, that the predicted and
    actual output shapes match the expected one, and that every missing value
    is replaced by the last valid value above it.
    """
    e = {'col1': [np.nan, 2, np.nan, 4, 10],
         'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
         'col3': ['q', '2', 'c', np.nan, np.nan],
         # NOTE(review): the dtype argument was missing from the truncated
         # literal; a datetime dtype is needed for the Types.Datetime column
         # in the expected shape and for the strftime() call below -- confirm.
         'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                           dtype='datetime64[ns]')}
    g = data.Frame(e)

    g = g.setIndex('col1')

    op = FillNan()
    assert op.getOutputShape() is None
    op.addInputShape(g.shape, 0)
    op.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert op.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill'
    }

    s = Shape()
    s.colNames = ['col3', 'col2', 'date']
    s.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    s.index = ['col1']
    s.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == s

    h = op.execute(g)
    assert h.shape == s

    assert mapDate(roundValues(nan_to_None(h.to_dict()), decimals=3)) == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        # Both missing dates directly follow the first row, so forward fill
        # propagates that first date into them
        'date': [t.strftime(format='%Y-%m-%d') if not pd.isna(t) else '1988-05-09'
                 for t in e['date']]
    }
Example #4
def test_standardScale():
    """Check StandardScaler on two numeric columns, one of which has NaN.

    Verifies shape prediction across the option/shape life cycle, the
    standardised values against a NaN-aware numpy computation, that the input
    frame is not modified, and that the options are returned as they were set.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = StandardScaler()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: None, 1: None})
    # Options alone are not enough to predict the shape
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s

    # NOTE(review): the step that drops the input shape was missing, which
    # made the two surrounding assertions contradict each other.
    # `removeInputShape` is assumed to be the counterpart of `addInputShape`
    # -- confirm against the operation API.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    def standardised(values):
        # Mean and std ignore NaN entries; NaN inputs stay NaN in the output
        mean, std = np.nanmean(values), np.nanstd(values)
        return [(x - mean) / std for x in values]

    g = op.execute(f)
    expected = {
        'col1': standardised(d['col1']),
        'col2': standardised(d['col2']),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(),
                                   4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    # The operation must not scale the input frame in place
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {'attributes': {0: None, 1: None}}
Example #5
def test_fillnan_byVal_date_num():
    """Check FillNan in value mode on a datetime and a numeric column.

    First verifies that a set of options containing invalid fill values is
    rejected with OptionValidationError, then that valid fill values are
    stored, that the output shape is as expected, and that only the selected
    columns (2 and 3) have their missing values replaced.
    """
    e = {'col1': [np.nan, 2, np.nan, 4, 10],
         'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
         'col3': ['q', '2', 'c', np.nan, np.nan],
         # NOTE(review): the dtype argument was missing from the truncated
         # literal; a datetime dtype is needed for the Types.Datetime column
         # in the expected shape and for the strftime() call below -- confirm.
         'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                           dtype='datetime64[ns]'),
         'col4': [np.nan, 2, np.nan, 4, 10]}
    g = data.Frame(e)

    g = g.setIndex('col1')

    op = FillNan()
    assert op.getOutputShape() is None
    op.addInputShape(g.shape, 0)
    # 'march' cannot fill the numeric column 3; the first "wrong" marker is
    # kept from the original -- presumably a fill value is not valid for one
    # of the non numeric/date columns (TODO confirm which)
    with pytest.raises(OptionValidationError):
        op.setOptions(selected={0: {'fill': 'pol'}, 1: {'fill': '23'},  # wrong
                                2: {'fill': '1966-04-02 00:00:30'},
                                3: {'fill': 'march'}},  # wrong
                      fillMode='value')

    # fillMode='value' matches the options asserted right below
    op.setOptions(selected={2: {'fill': '1966-04-02 00:00:30'},
                            3: {'fill': '0.9'}},
                  fillMode='value')

    assert op.getOptions() == {
        'selected': {2: {'fill': '1966-04-02 00:00:30'},
                     3: {'fill': '0.9'}},
        'fillMode': 'value'
    }

    s = Shape()
    s.colNames = ['col3', 'col2', 'date', 'col4']
    s.colTypes = [Types.String, Types.Ordinal, Types.Datetime, Types.Numeric]
    s.index = ['col1']
    s.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == s

    h = op.execute(g)
    assert h.shape == s

    assert mapDate(roundValues(nan_to_None(h.to_dict()), decimals=3)) == {
        # col3 and col2 were not selected, so their missing values remain
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': [t.strftime(format='%Y-%m-%d') if not pd.isna(t) else '1966-04-02'
                 for t in e['date']],
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0]
    }
Example #6
def test_str_toNumeric():
    """Check ToNumeric on a categorical column and a string column.

    Verifies that the predicted output shape marks the selected columns
    (0 and 2) as numeric, that the prediction is unavailable while the input
    shape or the options are missing, and that the executed frame holds the
    converted values while keeping the index description unchanged.
    """
    d = {
        'col1': pd.Categorical([3, 0, 5, 6, 0]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90']
    }

    # Disabled datetime column, kept for reference:
    # 'cold': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    #                   dtype='datetime64[ns]')}
    f = Frame(d)

    op = ToNumeric()
    op.addInputShape(f.shape, pos=0)
    op.setOptions(attributes={0: dict(), 2: dict()}, errors='raise')

    # Predict output shape (named so that it does not shadow the `os` module)
    expectedColumns = f.shape.columnsDict
    expectedColumns['col1'] = Types.Numeric
    expectedColumns['col3'] = Types.Numeric
    assert op.getOutputShape().columnsDict == expectedColumns

    # Removing options/input_shape causes None to be returned
    # NOTE(review): the two removal calls below were missing, so the `is None`
    # assertions contradicted the one above. `removeInputShape` and
    # `unsetOptions` are assumed to be the counterparts of `addInputShape`
    # and `setOptions` -- confirm against the operation API.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, pos=0)
    op.unsetOptions()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: dict(), 2: dict()}, errors='coerce')
    assert op.getOutputShape().columnsDict == expectedColumns  # Re-adding everything

    g = op.execute(f)
    gd = {
        'col1': [3.0, 0.0, 5.0, 6.00, 0.0],
        'col2': [3., 4., 5., 6., 0.0],
        'col3': [123.0, 2.0, 0.43, 4.0, 90.0]
    }
    assert roundValues(g.to_dict(), 3) == gd
    assert g.shape.columnsDict == expectedColumns
    assert g.shape.indexDict == f.shape.indexDict