Exemple #1
0
def test_to_array_method():
    """A one-column matrix converts to a flat array; a wider one raises."""
    single_column = autom8.create_matrix([[1], [2], [3], [4]])
    two_columns = autom8.create_matrix([[1, 2], [3, 4], [5, 6]])

    flattened = single_column.to_array()
    assert np.array_equal(flattened, np.array([1, 2, 3, 4]))

    # A matrix with more than one column must be rejected by to_array().
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        two_columns.to_array()
    excinfo.match('Expected.*one column')
def test_columns_with_numbers_as_strings():
    """String-encoded numbers are cleaned into numeric values.

    Entries such as '1.1', '$4' and '6%' are expected to come out of
    ``clean_dataset`` as plain numbers, and the recorded steps must give the
    expected result when replayed on a second matrix.
    """
    rows = [
        ['A', 'B', 'C'],
        ['1.1', '$4', 7],
        ['2.2', '$5', 8],
        ['3.3', '6%', 9],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 2

    expected_cleaned = [[1.1, 4, 7], [2.2, 5, 8], [3.3, 6, 9]]
    assert ctx.matrix.tolist() == expected_cleaned

    # Replay the recorded steps against rows the context has not seen.
    fresh_rows = [['A', 'B', 'C'], [1, '2%', 'foo'], ['3', 4.0, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [[1, 2, 'foo'], [3, 4, 'bar']]
    assert replay.matrix.columns[0].dtype == int
    assert replay.matrix.columns[1].dtype == float
def test_column_of_ints_and_floats():
    """Mixed int/float columns containing None are cleaned to floats.

    The expected output pairs each numeric column with a boolean column that
    is False exactly where the original value was None (the None itself is
    replaced by 0.0). The same must hold when the steps are replayed.
    """
    rows = [
        ['A', 'B'],
        [1, 3.3],
        [2.2, 4],
        [None, None],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 4
    assert len(acc.warnings) == 2

    expected_cleaned = [
        [1.0, True, 3.3, True],
        [2.2, True, 4.0, True],
        [0.0, False, 0.0, False],
    ]
    assert ctx.matrix.tolist() == expected_cleaned

    # Replay the recorded steps on rows with None in different positions.
    fresh_rows = [['A', 'B'], [None, 10], [20.0, None], [30, 40]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)

    expected_replayed = [
        [0.0, False, 10.0, True],
        [20.0, True, 0.0, False],
        [30.0, True, 40.0, True],
    ]
    assert replay.matrix.tolist() == expected_replayed

    # Columns 0 and 2 are the numeric ones; 1 and 3 are the boolean flags.
    assert replay.matrix.columns[0].dtype == float
    assert replay.matrix.columns[2].dtype == float
Exemple #4
0
def test_tolist_method():
    """``tolist`` returns the rows that the matrix was built from."""
    labelled_rows = [['hi', True], ['bye', False]]
    numeric_rows = [[1, 2.0], [3, 4.0], [5, 6.0]]

    labelled = autom8.create_matrix(
        dataset=[['hi', True], ['bye', False]],
        column_names=['msg', 'flag'],
        column_roles=['textual', 'encoded'],
    )
    numeric = autom8.create_matrix([[1, 2.0], [3, 4.0], [5, 6.0]])

    assert labelled.tolist() == labelled_rows
    assert numeric.tolist() == numeric_rows
Exemple #5
0
def test_len_method():
    """``len(matrix)`` reports the number of rows, not columns."""
    three_rows = [
        ['hi', 1, True],
        ['so', 2, True],
        ['bye', 3, False],
    ]
    seven_rows = [[1], [2], [3], [4], [5], [6], [7]]

    wide = autom8.create_matrix(three_rows)
    tall = autom8.create_matrix(seven_rows)

    assert len(wide) == 3
    assert len(tall) == 7
Exemple #6
0
def test_drop_columns_by_index():
    """Dropping columns by index removes them from the matrix in place."""
    square = autom8.create_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    wide = autom8.create_matrix(
        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
    )

    square.drop_columns_by_index([0, 2])
    wide.drop_columns_by_index([1, 2])

    assert len(square.columns) == 1
    assert len(wide.columns) == 2

    # Only the middle column of the square matrix should survive.
    survivor = square.columns[0]
    assert np.array_equal(survivor.values, np.array([2, 5, 8]))
    assert square.tolist() == [[2], [5], [8]]
    assert wide.tolist() == [[1, 4], [5, 8], [9, 12]]
def test_mixed_up_columns_with_strings_and_numbers():
    """Columns mixing strings and numbers are split into two columns each.

    The expected formulas show a 'number' and a 'string' column per original
    column; the same split must result when the steps are replayed.
    """
    rows = [
        ['A', 'B'],
        [True, 'foo'],
        [1.1, 30],
        [20, 4.4],
        ['bar', False],
        ['', 'baz'],
        [50, 'fiz'],
        [None, True],
    ]
    expected_formulas = [
        ['number', 'A'],
        ['string', 'A'],
        ['number', 'B'],
        ['string', 'B'],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(acc.warnings) == 0

    expected_cleaned = [
        [1.0, '', 0.0, 'foo'],
        [1.1, '', 30.0, ''],
        [20.0, '', 4.4, ''],
        [0.0, 'bar', 0.0, ''],
        [0.0, '', 0.0, 'baz'],
        [50.0, '', 0.0, 'fiz'],
        [0.0, '', 1.0, ''],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas

    # Replay the recorded steps on a fresh set of mixed rows.
    fresh_rows = [['A', 'B'], [False, 'buz'], ['zim', 10], [2, None]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)

    expected_replayed = [
        [0.0, '', 0.0, 'buz'],
        [0.0, 'zim', 10.0, ''],
        [2.0, '', 0.0, ''],
    ]
    assert replay.matrix.tolist() == expected_replayed
    assert replay.matrix.formulas == expected_formulas
def test_columns_with_some_empty_strings():
    """Empty strings in otherwise typed columns are treated as missing.

    Each original column is expected to be followed by an 'is-defined'
    column that is False where the source held an empty string.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        ['', 2.2, 30],
        [False, '', 40],
        [False, 3.3, ''],
        ['', 4.4, ''],
    ]
    expected_formulas = [
        'A',
        ['is-defined', 'A'],
        'B',
        ['is-defined', 'B'],
        'C',
        ['is-defined', 'C'],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(acc.warnings) == 3

    expected_cleaned = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas

    # Replay the recorded steps on rows with different gaps.
    fresh_rows = [['A', 'B', 'C'], ['', 5.5, ''], [True, '', 50]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)

    expected_replayed = [
        [False, False, 5.5, True, 0, False],
        [True, True, 0.0, False, 50, True],
    ]
    assert replay.matrix.tolist() == expected_replayed
    assert replay.matrix.formulas == expected_formulas
Exemple #9
0
def test_creating_simple_matrix_with_names_and_roles():
    """Explicit column names and roles end up on the created columns."""
    acc = autom8.Accumulator()
    matrix = autom8.create_matrix(
        dataset=[['hi', True], ['bye', False]],
        column_names=['msg', 'flag'],
        column_roles=['textual', 'encoded'],
        receiver=acc,
    )

    msg_column, flag_column = matrix.columns
    expected_msgs = np.array(['hi', 'bye'], dtype=object)
    expected_flags = np.array([True, False])

    assert np.array_equal(msg_column.values, expected_msgs)
    assert np.array_equal(flag_column.values, expected_flags)

    assert msg_column.name == 'msg'
    assert flag_column.name == 'flag'

    assert msg_column.role == 'textual'
    assert flag_column.role == 'encoded'

    # Columns taken straight from the dataset are flagged as original.
    assert msg_column.is_original
    assert flag_column.is_original

    assert len(acc.warnings) == 0
Exemple #10
0
def test_column_with_some_blank_strings():
    """Whitespace-only strings are cleaned the same way as empty strings."""
    # Same layout as the empty-string scenario, but most of the gaps are
    # whitespace-only strings (spaces, tabs, CR/LF) instead of ''.
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        [' ', 2.2, 30],
        [False, '\t', 40],
        [False, 3.3, ' \t \r \n\t'],
        ['', 4.4, '    '],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    expected_cleaned = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    expected_formulas = [
        'A',
        ['is-defined', 'A'],
        'B',
        ['is-defined', 'B'],
        'C',
        ['is-defined', 'C'],
    ]
    assert ctx.matrix.tolist() == expected_cleaned
    assert ctx.matrix.formulas == expected_formulas
Exemple #11
0
def test_creating_simple_matrix_from_list():
    """A plain list of rows yields default-named, role-less columns."""
    acc = autom8.Accumulator()
    matrix = autom8.create_matrix(
        [['hi', 1, True], ['bye', 2, False]],
        receiver=acc,
    )

    # Unpacking also checks that exactly three columns were created.
    first, second, third = matrix.columns
    columns = (first, second, third)

    expected_values = (
        np.array(['hi', 'bye'], dtype=object),
        np.array([1, 2]),
        np.array([True, False]),
    )
    for column, expected in zip(columns, expected_values):
        assert np.array_equal(column.values, expected)

    # With no names supplied, the columns are expected to be A, B, C.
    for column, expected_name in zip(columns, ('A', 'B', 'C')):
        assert column.name == expected_name

    for column in columns:
        assert column.role is None

    for column in columns:
        assert column.is_original

    assert len(acc.warnings) == 0
Exemple #12
0
def test_is_recording_property():
    """``is_recording`` is true for a created context, false for playback."""
    matrix = autom8.create_matrix([[1, 2]])
    recording = autom8.create_context(matrix)
    replaying = PlaybackContext(matrix, autom8.Accumulator())

    assert recording.is_recording
    assert not replaying.is_recording

    # Both kinds of context must expose a receiver attribute.
    assert hasattr(recording, 'receiver')
    assert hasattr(replaying, 'receiver')
Exemple #13
0
def test_append_column():
    """An appended column carries its name, role and values; not original."""
    matrix = autom8.create_matrix([[1], [2], [3], [4]])
    new_values = np.array([2, 4, 6, 8])
    matrix.append_column(new_values, 'foo', 'encoded')

    # Unpacking checks that the matrix now holds exactly two columns.
    _existing, appended = matrix.columns

    assert appended.name == 'foo'
    assert appended.role == 'encoded'
    assert not appended.is_original
    assert np.array_equal(appended.values, np.array([2, 4, 6, 8]))
    # Guard against the appended column aliasing the original column's data.
    assert not np.array_equal(appended.values, np.array([1, 2, 3, 4]))
Exemple #14
0
def test_select_columns_by_name_with_superset():
    """Selecting a subset of names returns those columns in request order."""
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    matrix = autom8.create_matrix(rows, column_names=['A', 'B', 'C', 'D'])

    # 'C' is requested before 'B', so it must come first in the result.
    selected = matrix.select_columns_by_name(['C', 'B'])

    expected_rows = [[True, 1], [True, 2], [False, 3]]
    assert selected.tolist() == expected_rows
Exemple #15
0
def test_columns_with_numbers_with_commas():
    """Numeric strings with thousands separators are cleaned into numbers."""
    rows = [['A'], ['1,100.0'], ['2,200'], ['3,300'], ['50']]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 1

    expected_cleaned = [[1100], [2200], [3300], [50]]
    assert ctx.matrix.tolist() == expected_cleaned
Exemple #16
0
def test_duplicate_column_names():
    """Repeated column names produce exactly one uniqueness warning."""
    acc = autom8.Accumulator()
    # The matrix itself is not inspected; only the emitted warning matters.
    autom8.create_matrix(
        dataset=[[1, 2, 3]],
        column_names=['A', 'B', 'A'],
        receiver=acc,
    )

    assert len(acc.warnings) == 1
    first_warning = acc.warnings[0]
    assert 'Column names are not unique' in first_warning
Exemple #17
0
def test_select_columns_by_name_with_invalid_arguments():
    """Requesting a column name that does not exist raises an error."""
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    matrix = autom8.create_matrix(rows, column_names=['A', 'B', 'C', 'D'])

    # 'Z' is not among the column names, so the selection must fail.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        matrix.select_columns_by_name(['C', 'Z'])
    excinfo.match('Expected column names')
Exemple #18
0
def test_planner_decorator():
    """``drop_duplicate_columns`` runs on a created context but is rejected
    for a PlaybackContext."""
    matrix = autom8.create_matrix([[1, 1], [2, 2]])
    recording = autom8.create_context(matrix)
    replaying = PlaybackContext(matrix, autom8.Accumulator())

    # Calling it with the created context must succeed without raising.
    autom8.drop_duplicate_columns(recording)

    # Calling it with a playback context must be refused.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.drop_duplicate_columns(replaying)
    excinfo.match('Expected.*RecordingContext')
Exemple #19
0
def test_column_dtype_property():
    """Each column's ``dtype`` matches the kind of values it was built from."""
    rows = [
        ['hi', 10, 1.1, True, None],
        ['so', 20, 2.2, True, None],
        ['bye', 30, 3.3, False, None],
    ]
    matrix = autom8.create_matrix(rows)

    # Unpacking also checks that exactly five columns were created.
    text, ints, floats, flags, nones = matrix.columns

    # NOTE(review): 'int64' may be platform-dependent if the column simply
    # takes NumPy's default integer type -- confirm against create_matrix.
    expected_codes = ('O', 'int64', 'float64', 'bool', 'O')
    for column, code in zip((text, ints, floats, flags, nones),
                            expected_codes):
        assert column.dtype == np.dtype(code)
Exemple #20
0
def test_primitives_with_object_dtype():
    """Object-dtype columns of primitives are cleaned back to real dtypes.

    The columns are forced to ``object`` dtype before cleaning; afterwards
    they are expected to be bool, float and int, both on the cleaned matrix
    and on replayed data.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 2],
        [False, 3.1, 4],
        [True, 5.1, 6],
    ]
    expected_dtypes = (bool, float, int)

    acc = autom8.Accumulator()
    source = autom8.create_matrix(_add_labels(rows), receiver=acc)
    # Deliberately discard the inferred dtypes so the cleaner must redo them.
    for column in source.columns:
        column.values = column.values.astype(object)

    ctx = autom8.create_context(source, receiver=acc)
    autom8.clean_dataset(ctx)

    cleaned_dtypes = [column.dtype for column in ctx.matrix.columns]
    for position, expected in enumerate(expected_dtypes):
        assert cleaned_dtypes[position] == expected

    # First replay: values of the "wrong" primitive type in each column.
    coercible_rows = [
        ['A', 'B', 'C'],
        [1, 2, 3.0],
        [0, 4, 5.0],
        [1, False, 6.9],
    ]
    replay = PlaybackContext(
        autom8.create_matrix(coercible_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)
    assert replay.matrix.tolist() == [
        [True, 2.0, 3],
        [False, 4.0, 5],
        [True, 0.0, 6],
    ]

    replayed_dtypes = [column.dtype for column in replay.matrix.columns]
    for position, expected in enumerate(expected_dtypes):
        assert replayed_dtypes[position] == expected

    # Second replay: strings, None and an unexpected tuple value.
    awkward_rows = [['A', 'B', 'C'], ['1', '2', None], ['', None, ()]]
    replay = PlaybackContext(
        autom8.create_matrix(awkward_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)

    # nan never compares equal to itself, so compare the repr instead.
    actual_repr = repr(replay.matrix.tolist())
    assert actual_repr == ("[[True, 2.0, 0], [False, nan, 0]]")
Exemple #21
0
def test_column_with_all_none():
    """A column holding only None is dropped with a warning.

    The same column is expected to be dropped again when the steps are
    replayed, even though the replayed data has values in it.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, None, 2],
        [False, None, 4],
        [True, None, 6],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 1
    assert 'Dropping column' in acc.warnings[0]
    # Column B is gone; A and C remain untouched.
    assert ctx.matrix.tolist() == [[True, 2], [False, 4], [True, 6]]

    fresh_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)
    assert replay.matrix.tolist() == [[1, 'foo'], [3, 'bar']]
Exemple #22
0
def test_matrix_with_unexpected_value():
    """A column of unsupported objects is dropped with a warning.

    Column C holds a tuple, a dict and a bare object; it is expected to be
    removed during cleaning and again when the steps are replayed.
    """
    rows = [
        ['A', 'B', 'C'],
        [1, 2, ()],
        [3, 4, {}],
        [5, 6, object()],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 1
    warning = acc.warnings[0]
    assert 'Dropping column' in warning
    assert 'contain booleans, numbers' in warning
    assert ctx.matrix.tolist() == [[1, 2], [3, 4], [5, 6]]

    fresh_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)
    assert replay.matrix.tolist() == [[1, 2], [3, 4]]
Exemple #23
0
def test_creating_a_matrix_with_list_of_roles():
    """A list of roles is applied to the columns position by position."""
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    roles = ['textual', 'categorical', 'encoded', 'numerical']
    matrix = autom8.create_matrix(rows, column_roles=roles)

    # Compare against fresh literals rather than the list handed to autom8.
    expected_roles = ('textual', 'categorical', 'encoded', 'numerical')
    for position, expected in enumerate(expected_roles):
        assert matrix.columns[position].role == expected
Exemple #24
0
def test_copy_method():
    """``copy`` yields new matrix, column and value objects with equal
    contents."""
    mixed = autom8.create_matrix([
        ['hi', 1.1, True],
        ['so', 2.2, True],
        ['bye', 3.3, False],
    ])
    floats = autom8.create_matrix([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]])
    mixed_copy = mixed.copy()
    floats_copy = floats.copy()

    assert mixed is not mixed_copy
    assert floats is not floats_copy

    assert len(mixed.columns) == len(mixed_copy.columns)
    assert len(floats.columns) == len(floats_copy.columns)

    originals = mixed.columns + floats.columns
    duplicates = mixed_copy.columns + floats_copy.columns
    for original, duplicate in zip(originals, duplicates):
        # Neither the column nor its value array may be shared.
        assert original is not duplicate
        assert original.values is not duplicate.values
        # The metadata and data must nevertheless be equal.
        assert original.name == duplicate.name
        assert original.role == duplicate.role
        assert original.is_original == duplicate.is_original
        assert np.array_equal(original.values, duplicate.values)
Exemple #25
0
def test_column_of_all_strings_and_none_values():
    """None inside a string column is replaced by an empty string.

    Only one step is expected, and the replayed output shows that the None
    in column B is left as it is.
    """
    rows = [
        ['A', 'B'],
        ['1', 2],
        ['foo', 4],
        [None, 0],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 1
    assert ctx.matrix.tolist() == [['1', 2], ['foo', 4], ['', 0]]

    fresh_rows = [['A', 'B'], [None, 'bar'], ['baz', None]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=acc),
        receiver=acc,
    )
    playback(ctx.steps, replay)
    assert replay.matrix.tolist() == [['', 'bar'], ['baz', None]]
Exemple #26
0
def test_extra_columns_warning_message():
    """Ragged datasets lose their extra columns and emit one warning.

    Checks both the resulting column count and the exact warning text,
    including the singular/plural wording of "column(s)".
    """
    a1 = autom8.Accumulator()
    a2 = autom8.Accumulator()
    m1 = autom8.create_matrix([[1, 2], [1, 2, 3]], receiver=a1)
    m2 = autom8.create_matrix([[1], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4]],
                              receiver=a2)

    # These used the `assert expr, message` form, which passes for any
    # non-empty column list; compare the count explicitly instead.
    assert len(m1.columns) == 2
    assert a1.warnings == [
        'Dropped 1 extra column from dataset.'
        ' Keeping first 2 columns.'
        ' To avoid this behavior, ensure that each row in the dataset has'
        ' the same number of columns.'
    ]

    assert len(m2.columns) == 1
    assert a2.warnings == [
        'Dropped 3 extra columns from dataset.'
        ' Keeping first 1 column.'
        ' To avoid this behavior, ensure that each row in the dataset has'
        ' the same number of columns.'
    ]
Exemple #27
0
def test_creating_simple_matrix_from_numpy_array():
    """A 2-D NumPy array is split into one matrix column per array column."""
    acc = autom8.Accumulator()
    source = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    matrix = autom8.create_matrix(source, receiver=acc)

    # Unpacking also checks that exactly three columns were created.
    first, second, third = matrix.columns
    expected_columns = (
        np.array([1, 4, 7, 10], dtype=object),
        np.array([2, 5, 8, 11]),
        np.array([3, 6, 9, 12]),
    )

    for column, expected in zip((first, second, third), expected_columns):
        assert np.array_equal(column.values, expected)
Exemple #28
0
def test_creating_a_matrix_with_map_of_roles():
    """Roles may be given as a mapping keyed by column name or index.

    An unknown column name, or a key that is neither a name nor an index,
    is expected to raise ``Autom8Exception``.
    """
    rows = [
        ['hi', 1, True, 10.5],
        ['so', 2, True, 15.5],
        ['bye', 3, False, 20.5],
    ]
    names = ['A', 'B', 'C', 'D']

    # The mapping mixes name keys with an integer index key (1 -> column B).
    matrix = autom8.create_matrix(
        rows,
        column_names=names,
        column_roles={
            'D': 'numerical',
            'C': 'encoded',
            1: 'categorical',
            'A': 'textual',
        },
    )
    expected_roles = ('textual', 'categorical', 'encoded', 'numerical')
    for position, expected in enumerate(expected_roles):
        assert matrix.columns[position].role == expected

    # A name that matches no column is rejected.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.create_matrix(
            rows,
            column_names=['A', 'B', 'C', 'D'],
            column_roles={'Z': 'numerical'},
        )
    excinfo.match('Expected column')

    # So is a key that is an arbitrary object.
    with pytest.raises(autom8.Autom8Exception) as excinfo:
        autom8.create_matrix(
            rows,
            column_names=['A', 'B', 'C', 'D'],
            column_roles={object(): 'numerical'},
        )
    excinfo.match('Expected valid column')
Exemple #29
0
def test_empty_dataset_warning_message():
    """Datasets made only of empty rows warn with the dropped-row count,
    using singular or plural wording as appropriate."""
    receivers = [autom8.Accumulator() for _ in range(3)]

    # Feed one, two and three empty rows to their own accumulators.
    for row_count, receiver in enumerate(receivers, start=1):
        empty_rows = [[] for _ in range(row_count)]
        autom8.create_matrix(empty_rows, receiver=receiver)

    one, two, three = receivers
    assert one.warnings == ['Dropped 1 empty row from dataset.']
    assert two.warnings == ['Dropped 2 empty rows from dataset.']
    assert three.warnings == ['Dropped 3 empty rows from dataset.']
Exemple #30
0
def test_column_of_all_strings():
    """A column of strings that are not all numeric is left unchanged.

    Column A mixes numeric-looking strings with 'n'; no steps and no
    warnings are expected, and the data must come back as given.
    """
    rows = [
        ['A', 'B'],
        ['1', 2],
        ['3', 4],
        ['n', 0],
    ]

    acc = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=acc),
        receiver=acc,
    )
    autom8.clean_dataset(ctx)

    assert len(acc.warnings) == 0
    assert len(ctx.steps) == 0

    unchanged_rows = [['1', 2], ['3', 4], ['n', 0]]
    assert ctx.matrix.tolist() == unchanged_rows