예제 #1
0
def test_column_with_some_blank_strings():
    """Check cleaning of columns whose missing cells are whitespace-only.

    Variant of the empty-string cleaning test in which most of the empty
    strings are swapped for strings made up solely of whitespace.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        [' ', 2.2, 30],
        [False, '\t', 40],
        [False, 3.3, ' \t \r \n\t'],
        ['', 4.4, '    '],
    ]

    # Each source column is expected to be followed by an "is-defined" flag.
    expected_values = [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    expected_formulas = [
        'A',
        ['is-defined', 'A'],
        'B',
        ['is-defined', 'B'],
        'C',
        ['is-defined', 'C'],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert ctx.matrix.tolist() == expected_values
    assert ctx.matrix.formulas == expected_formulas
예제 #2
0
def test_column_of_ints_and_floats():
    """Check cleaning and playback for columns mixing ints, floats and None."""
    rows = [
        ['A', 'B'],
        [1, 3.3],
        [2.2, 4],
        [None, None],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 4
    assert len(receiver.warnings) == 2

    # Values become floats, and each column gains a presence flag.
    cleaned = ctx.matrix.tolist()
    assert cleaned == [
        [1.0, True, 3.3, True],
        [2.2, True, 4.0, True],
        [0.0, False, 0.0, False],
    ]

    # Replay the recorded steps against a fresh matrix.
    fresh_rows = [['A', 'B'], [None, 10], [20.0, None], [30, 40]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [
        [0.0, False, 10.0, True],
        [20.0, True, 0.0, False],
        [30.0, True, 40.0, True],
    ]

    # Columns 0 and 2 hold the numeric values; both must be float typed.
    replayed_columns = replay.matrix.columns
    assert replayed_columns[0].dtype == float
    assert replayed_columns[2].dtype == float
예제 #3
0
def test_columns_with_numbers_as_strings():
    """Check that numeric strings (plain, '$'-prefixed, '%'-suffixed) are parsed."""
    rows = [
        ['A', 'B', 'C'],
        ['1.1', '$4', 7],
        ['2.2', '$5', 8],
        ['3.3', '6%', 9],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 2

    cleaned = ctx.matrix.tolist()
    assert cleaned == [[1.1, 4, 7], [2.2, 5, 8], [3.3, 6, 9]]

    # Replay the recorded steps on new data; column C holds strings this time.
    fresh_rows = [['A', 'B', 'C'], [1, '2%', 'foo'], ['3', 4.0, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [[1, 2, 'foo'], [3, 4, 'bar']]

    replayed_columns = replay.matrix.columns
    assert replayed_columns[0].dtype == int
    assert replayed_columns[1].dtype == float
예제 #4
0
def test_is_recording_property():
    """Check `is_recording` and `receiver` on both kinds of context."""
    matrix = autom8.create_matrix([[1, 2]])
    recording_ctx = autom8.create_context(matrix)
    playback_ctx = PlaybackContext(matrix, autom8.Accumulator())

    # Only the context built by create_context reports that it is recording.
    assert recording_ctx.is_recording
    assert not playback_ctx.is_recording

    # Both kinds of context expose a receiver attribute.
    for context in (recording_ctx, playback_ctx):
        assert hasattr(context, 'receiver')
예제 #5
0
def test_columns_with_numbers_with_commas():
    """Check that comma-grouped numeric strings are cleaned into numbers."""
    rows = [
        ['A'],
        ['1,100.0'],
        ['2,200'],
        ['3,300'],
        ['50'],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    # One recorded step and no warnings are expected.
    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 1

    cleaned = ctx.matrix.tolist()
    assert cleaned == [[1100], [2200], [3300], [50]]
예제 #6
0
def _create_context(features, roles):
    """Build an autom8 context from feature rows plus a dummy label column.

    Args:
        features: Rows of feature values; each row must be a list, because a
            label cell is appended with list concatenation.
        roles: List of column roles for the feature columns. A trailing
            'numerical' role is appended for the label column.

    Returns:
        The object returned by ``autom8.create_context``.
    """
    # create_context requires a label column, so append a constant 0 to every
    # row. New row lists are built; the caller's rows are not modified.
    dataset = [row + [0] for row in features]

    return autom8.create_context(
        dataset=dataset,
        column_names=_column_names(dataset),
        column_roles=roles + ['numerical'],
    )
예제 #7
0
def test_planner_decorator():
    """Check that drop_duplicate_columns rejects a PlaybackContext."""
    matrix = autom8.create_matrix([[1, 1], [2, 2]])
    recording_ctx = autom8.create_context(matrix)
    playback_ctx = PlaybackContext(matrix, autom8.Accumulator())

    # The context built by create_context is accepted without an exception.
    autom8.drop_duplicate_columns(recording_ctx)

    # A playback context must be refused with a descriptive error.
    with pytest.raises(autom8.Autom8Exception) as raised:
        autom8.drop_duplicate_columns(playback_ctx)
    raised.match('Expected.*RecordingContext')
예제 #8
0
def test_mixed_up_columns_with_strings_and_numbers():
    """Check that columns mixing strings and numbers are split in two.

    After cleaning, each source column is expected to be replaced by a
    'number' column and a 'string' column, both when recording and when the
    recorded steps are played back on new data.
    """
    rows = [
        ['A', 'B'],
        [True, 'foo'],
        [1.1, 30],
        [20, 4.4],
        ['bar', False],
        ['', 'baz'],
        [50, 'fiz'],
        [None, True],
    ]

    # The same formulas are expected from the recording and from the playback.
    split_formulas = [
        ['number', 'A'],
        ['string', 'A'],
        ['number', 'B'],
        ['string', 'B'],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(receiver.warnings) == 0

    cleaned = ctx.matrix.tolist()
    assert cleaned == [
        [1.0, '', 0.0, 'foo'],
        [1.1, '', 30.0, ''],
        [20.0, '', 4.4, ''],
        [0.0, 'bar', 0.0, ''],
        [0.0, '', 0.0, 'baz'],
        [50.0, '', 0.0, 'fiz'],
        [0.0, '', 1.0, ''],
    ]
    assert ctx.matrix.formulas == split_formulas

    # Replay the recorded steps against a fresh matrix.
    fresh_rows = [['A', 'B'], [False, 'buz'], ['zim', 10], [2, None]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [
        [0.0, '', 0.0, 'buz'],
        [0.0, 'zim', 10.0, ''],
        [2.0, '', 0.0, ''],
    ]
    assert replay.matrix.formulas == split_formulas
예제 #9
0
def test_clean_numeric_labels():
    """Check label cleaning when the last column mixes strings, ints and blanks."""
    rows = [
        ['A', 'B', 'C'],
        [1, 2, '3'],
        [3, 4, '4'],
        [5, 6, 5],
        [7, 8, None],
        [9, 9, ''],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(rows, receiver=receiver)

    # Exactly one warning is expected from building the context.
    assert len(receiver.warnings) == 1

    # Numeric strings become numbers; None and '' both come back as 0.
    original_labels = ctx.labels.original.tolist()
    assert original_labels == [3, 4, 5, 0, 0]
예제 #10
0
def test_columns_with_some_empty_strings():
    """Check cleaning and playback for columns with empty-string cells.

    Each source column is expected to be followed by an 'is-defined' flag
    column, both when recording and when the steps are played back.
    """
    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 20],
        ['', 2.2, 30],
        [False, '', 40],
        [False, 3.3, ''],
        ['', 4.4, ''],
    ]

    # The same formulas are expected from the recording and from the playback.
    flagged_formulas = [
        'A',
        ['is-defined', 'A'],
        'B',
        ['is-defined', 'B'],
        'C',
        ['is-defined', 'C'],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(ctx.steps) == 6
    assert len(receiver.warnings) == 3

    cleaned = ctx.matrix.tolist()
    assert cleaned == [
        [True, True, 1.1, True, 20, True],
        [False, False, 2.2, True, 30, True],
        [False, True, 0.0, False, 40, True],
        [False, True, 3.3, True, 0, False],
        [False, False, 4.4, True, 0, False],
    ]
    assert ctx.matrix.formulas == flagged_formulas

    # Replay the recorded steps against a fresh matrix.
    fresh_rows = [['A', 'B', 'C'], ['', 5.5, ''], [True, '', 50]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [
        [False, False, 5.5, True, 0, False],
        [True, True, 0.0, False, 50, True],
    ]
    assert replay.matrix.formulas == flagged_formulas
예제 #11
0
def test_column_of_all_strings():
    """Check that a column made up entirely of strings is left untouched."""
    rows = [
        ['A', 'B'],
        ['1', 2],
        ['3', 4],
        ['n', 0],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    # No warnings and no recorded steps are expected.
    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 0

    cleaned = ctx.matrix.tolist()
    assert cleaned == [['1', 2], ['3', 4], ['n', 0]]
예제 #12
0
def test_evaluate_pipeline():
    """Fit a linear regression through a context and check the candidate.

    The label of every row is the sum of its two inputs. The test checks the
    reported train/test r2 scores and predictions, then runs the candidate's
    pipeline on unseen rows.
    """
    acc = autom8.Accumulator()
    inputs = [
        [1, 2],
        [3, 4],
        [5, 6],
        [7, 8],
        [9, 10],
        [11, 12],
        [13, 14],
        [15, 16],
    ]
    # Append the label (sum of the two inputs) to each row.
    dataset = [i + [i[0] + i[1]] for i in inputs]
    ctx = autom8.create_context(dataset, receiver=acc)

    # For now, just hack in the test_indices that we want.
    ctx.test_indices = [2, 5]

    autom8.add_column_of_ones(ctx)
    ctx << sklearn.linear_model.LinearRegression()
    assert len(acc.candidates) == 1

    candidate = acc.candidates[0]

    # The r2 score is a computed float, so compare with a tolerance rather
    # than exact equality; rounding in the fit must not fail the test.
    assert np.isclose(candidate.train.metrics['r2_score'], 1.0)
    assert np.isclose(candidate.test.metrics['r2_score'], 1.0)

    # Training rows are every row except indices 2 and 5.
    assert np.allclose(
        candidate.train.predictions,
        np.array([1 + 2, 3 + 4, 7 + 8, 9 + 10, 13 + 14, 15 + 16]),
    )

    # Test rows are exactly indices 2 and 5.
    assert np.allclose(
        candidate.test.predictions,
        np.array([5 + 6, 11 + 12]),
    )

    # Try using the pipeline to make some predictions.
    result = candidate.pipeline.run([[17, 18], [19, 20], [21, 22]],
                                    receiver=acc)

    assert np.allclose(result.predictions,
                       np.array([17 + 18, 19 + 20, 21 + 22]))
    assert result.probabilities is None
    assert not acc.warnings
예제 #13
0
def test_sandbox():
    """Check that changes made inside ctx.sandbox() are undone on exit."""
    features = [
        [1, 5, True, 9],
        [2, 6, False, 10],
        [3, 7, False, 11],
        [4, 8, True, 12],
    ]
    labels = [10, 20, 30, 40]

    ctx = autom8.create_context(
        dataset=[row + [label] for row, label in zip(features, labels)],
        column_names=['A', 'B', 'C', 'D', 'E'],
        column_roles=['numerical'] * 2 + ['encoded'] + ['numerical'] * 2,
    )
    autom8.add_column_of_ones(ctx)

    # Expected state outside the sandbox: four features plus a column of ones.
    base_width = 4 + 1
    base_rows = [row + [1] for row in features]

    assert len(ctx.steps) == 1
    assert len(ctx.matrix.columns) == base_width
    assert ctx.matrix.tolist() == base_rows

    with ctx.sandbox():
        autom8.multiply_columns(ctx)

        # Three product columns are expected: A*B, A*D and B*D.
        product_rows = [
            [a, b, c, d, 1, a * b, a * d, b * d]
            for a, b, c, d in features
        ]
        assert len(ctx.steps) == 2
        assert len(ctx.matrix.columns) == base_width + 3
        assert ctx.matrix.tolist() == product_rows

    # Now check that the context has been restored to its previous state.
    assert len(ctx.steps) == 1
    assert len(ctx.matrix.columns) == base_width
    assert ctx.matrix.tolist() == base_rows
예제 #14
0
def test_primitives_with_object_dtype():
    """Check that object-dtype columns of primitives get concrete dtypes.

    Every column is cast to object before cleaning. The cleaned columns are
    expected to end up as bool, float and int, and the recorded steps are
    expected to coerce new data in the same way on playback.
    """
    def assert_bool_float_int(matrix):
        # The first three columns must be typed bool, float and int, in order.
        kinds = [column.dtype for column in matrix.columns]
        assert kinds[0] == bool
        assert kinds[1] == float
        assert kinds[2] == int

    rows = [
        ['A', 'B', 'C'],
        [True, 1.1, 2],
        [False, 3.1, 4],
        [True, 5.1, 6],
    ]

    receiver = autom8.Accumulator()
    matrix = autom8.create_matrix(_add_labels(rows), receiver=receiver)

    # Discard the inferred dtypes so that cleaning has to recover them.
    for column in matrix.columns:
        column.values = column.values.astype(object)

    ctx = autom8.create_context(matrix, receiver=receiver)
    autom8.clean_dataset(ctx)
    assert_bool_float_int(ctx.matrix)

    # First playback: values that can be coerced to the recorded dtypes.
    coercible_rows = [
        ['A', 'B', 'C'],
        [1, 2, 3.0],
        [0, 4, 5.0],
        [1, False, 6.9],
    ]
    replay = PlaybackContext(
        autom8.create_matrix(coercible_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    assert replay.matrix.tolist() == [
        [True, 2.0, 3],
        [False, 4.0, 5],
        [True, 0.0, 6],
    ]
    assert_bool_float_int(replay.matrix)

    # Second playback: strings, None and an empty tuple.
    awkward_rows = [['A', 'B', 'C'], ['1', '2', None], ['', None, ()]]
    replay = PlaybackContext(
        autom8.create_matrix(awkward_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    # Compare via repr so that the nan entry needs no special handling.
    rendered = repr(replay.matrix.tolist())
    assert rendered == "[[True, 2.0, 0], [False, nan, 0]]"
예제 #15
0
def test_column_with_all_none():
    """Check that a column holding only None is dropped, with one warning."""
    rows = [
        ['A', 'B', 'C'],
        [True, None, 2],
        [False, None, 4],
        [True, None, 6],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 1
    assert 'Dropping column' in receiver.warnings[0]

    # Column B is gone from the cleaned matrix.
    cleaned = ctx.matrix.tolist()
    assert cleaned == [[True, 2], [False, 4], [True, 6]]

    # Playback drops the same column from new data.
    fresh_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [[1, 'foo'], [3, 'bar']]
예제 #16
0
def test_matrix_with_unexpected_value():
    """Check that a column of unsupported values is dropped, with one warning."""
    rows = [
        ['A', 'B', 'C'],
        [1, 2, ()],
        [3, 4, {}],
        [5, 6, object()],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    assert len(receiver.warnings) == 1
    warning = receiver.warnings[0]
    assert 'Dropping column' in warning
    assert 'contain booleans, numbers' in warning

    # Column C is gone from the cleaned matrix.
    cleaned = ctx.matrix.tolist()
    assert cleaned == [[1, 2], [3, 4], [5, 6]]

    # Playback drops column C even though the new values are plain strings.
    fresh_rows = [['A', 'B', 'C'], [1, 2, 'foo'], [3, 4, 'bar']]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [[1, 2], [3, 4]]
예제 #17
0
def test_column_of_all_strings_and_none_values():
    """Check that None in a string column is replaced by an empty string."""
    rows = [
        ['A', 'B'],
        ['1', 2],
        ['foo', 4],
        [None, 0],
    ]

    receiver = autom8.Accumulator()
    ctx = autom8.create_context(
        autom8.create_matrix(_add_labels(rows), receiver=receiver),
        receiver=receiver,
    )
    autom8.clean_dataset(ctx)

    # One recorded step and no warnings are expected.
    assert len(receiver.warnings) == 0
    assert len(ctx.steps) == 1

    cleaned = ctx.matrix.tolist()
    assert cleaned == [['1', 2], ['foo', 4], ['', 0]]

    # On playback only column A is rewritten; the None in column B stays.
    fresh_rows = [['A', 'B'], [None, 'bar'], ['baz', None]]
    replay = PlaybackContext(
        autom8.create_matrix(fresh_rows, receiver=receiver),
        receiver=receiver,
    )
    playback(ctx.steps, replay)

    replayed = replay.matrix.tolist()
    assert replayed == [['', 'bar'], ['baz', None]]
예제 #18
0
def test_training_and_testing_data():
    """Check that testing_data/training_data split rows by ctx.test_indices."""
    matrix = autom8.create_matrix([
        [1, 5, True, 9, 10],
        [2, 6, False, 10, 20],
        [3, 7, False, 11, 30],
        [4, 8, True, 12, 40],
    ])
    ctx = autom8.create_context(matrix)

    # For now, just hack in the test_indices that we want.
    ctx.test_indices = [1, 3]

    test_matrix, test_labels = ctx.testing_data()
    train_matrix, train_labels = ctx.training_data()

    # The last column supplies the labels: rows 1 and 3 are test rows,
    # rows 0 and 2 are training rows.
    assert test_labels.tolist() == [20, 40]
    assert train_labels.tolist() == [10, 30]

    expected_test_rows = [
        [2, 6, False, 10],
        [4, 8, True, 12],
    ]
    expected_train_rows = [
        [1, 5, True, 9],
        [3, 7, False, 11],
    ]
    assert test_matrix.tolist() == expected_test_rows
    assert train_matrix.tolist() == expected_train_rows
예제 #19
0
def test_invalid_contexts():
    """Check that malformed create_context calls raise Autom8Exception.

    Every case pairs the arguments of one call with a regular expression
    that the resulting error message must match.
    """
    def two_columns():
        # Build a fresh dataset per case so that no list is shared.
        return [['A', 'B'], [1, 2]]

    cases = [
        # (positional args, keyword args, expected message pattern)
        (([],), {}, 'Expected.*dataset'),
        (([[1], [2], [3]],), {}, 'Expected.*dataset'),
        (
            (),
            {'dataset': two_columns(), 'target_column': 'C'},
            'Expected.*target_column',
        ),
        (
            (),
            {'dataset': two_columns(), 'target_column': object()},
            'Expected.*target_column',
        ),
        (
            (),
            {'dataset': two_columns(), 'target_column': 10},
            'Expected.*target_column',
        ),
        (
            (),
            {'dataset': two_columns(), 'problem_type': 'classify'},
            'Expected.*problem_type',
        ),
        (
            (),
            {'dataset': two_columns(), 'test_ratio': 1.2},
            'Expected.*test_ratio',
        ),
    ]

    for args, kwargs, pattern in cases:
        with pytest.raises(autom8.Autom8Exception) as raised:
            autom8.create_context(*args, **kwargs)
        raised.match(pattern)