def test_init_ignore_default():
    """Check construction with an explicit level dict and a default limit.

    The encoder is given a per-column level dict together with
    ``max_levels_default``; the assertions verify that the numeric columns
    and the level dict are stored exactly as they were passed in.
    """
    level_limits = {'foo': 3, 'bar': 7}
    numeric_columns = ['buzz_numeric']

    encoder = OneHotEncoder(level_limits, numeric_columns,
                            max_levels_default=999)

    # Compare against fresh literals rather than the objects handed in.
    assert encoder.numeric_cols == ['buzz_numeric']
    assert encoder.categorical_n_levels_dict == {'foo': 3, 'bar': 7}
def get_encoder(pipeline_params, write=True, read_from_file=False):
    """Return an encoder, either loaded from disk or built from streamed data.

    Args:
        pipeline_params: forwarded to ``stream_data`` to obtain the rows the
            encoder is built from.
        write: when true, a freshly built encoder is saved to the encoder
            file (``file_names['encoder']``).
        read_from_file: when true and the encoder file exists, the encoder
            is loaded from that file and no data is streamed.

    Returns:
        The loaded or newly built ``OneHotEncoder``.
    """
    encoder_file = file_names['encoder']

    use_cached = os.path.exists(encoder_file) and read_from_file
    if not use_cached:
        print('Building encoder')
        encoder = get_encoder_from_stream(stream_data(pipeline_params))
        if write:
            print('Writing encoder to: %s' % encoder_file)
            encoder.save(encoder_file)
        return encoder

    print('Reading encoder from : %s' % encoder_file)
    # Start from an empty encoder; its state is filled in from the file.
    encoder = OneHotEncoder([], [])
    encoder.load_from_file(encoder_file)
    return encoder
def get_encoder_from_stream(stream):
    """Create a ``OneHotEncoder`` for the fixed column set and load *stream*.

    Args:
        stream: data passed unchanged to ``load_from_data_stream``.

    Returns:
        The ``OneHotEncoder`` after ``load_from_data_stream`` has been called.
    """
    numeric_columns = ['perishable', 'days_til_end_of_data', 'dayoff']

    # Categorical column -> level count handed to the encoder constructor.
    level_limits = {
        'item_nbr': 10000000000,
        'year': 50,
        'month': 13,
        'day': 370,
        'class': 600,
        'family': 100,
        'dayofweek': 10,
    }

    encoder = OneHotEncoder(level_limits, numeric_columns)
    encoder.load_from_data_stream(stream)
    return encoder
def test_load_from_data():
    """Loading a data stream populates the encoder's internal state.

    ``encoder``, ``decoder`` and ``one_hot_encoder_dicts`` must be ``None``
    before ``load_from_data_stream`` is called and not ``None`` afterwards.
    """
    columns = ('animal', 'color', 'weight')
    rows = [
        ('cat', 'blue', 1.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'blue', 2.0),
        ('cat', 'blue', 0.0),
        ('cat', 'blue', 99.9),
    ]
    data = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(['animal', 'color'], ['weight'],
                            max_levels_default=100)
    state_attributes = ('encoder', 'decoder', 'one_hot_encoder_dicts')

    for attribute in state_attributes:
        assert getattr(encoder, attribute) is None

    encoder.load_from_data_stream(data)

    for attribute in state_attributes:
        assert getattr(encoder, attribute) is not None
def test_load_from_data_encodes_data():
    """Encoding row by row yields one output per input, of a different width.

    After loading the data, every row is encoded with ``encode_row``; the
    result must have as many rows as the input, and the first encoded row
    must differ in length from the first input row.
    """
    columns = ('animal', 'color', 'weight')
    rows = [
        ('cat', 'blue', 1.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'blue', 2.0),
        ('cat', 'blue', 0.0),
        ('cat', 'blue', 99.9),
    ]
    data = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(['animal', 'color'], ['weight'],
                            max_levels_default=100)
    encoder.load_from_data_stream(data)

    encoded_data = []
    for row in data:
        encoded_data.append(encoder.encode_row(row))

    assert len(encoded_data) == len(data)
    assert len(encoded_data[0]) != len(data[0])
def test_inversion_more_complicated():
    """Encode/decode round trip with two categorical and two numeric columns.

    Decoding the encoded data must give back the input rows, and encoding
    the decoded rows again must reproduce the first encoding.
    """
    columns = ('animal', 'color', 'weight', 'height')
    rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),  # height deliberately an int here
    ]
    data = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(['animal', 'color'], ['weight', 'height'],
                            max_levels_default=100)
    encoder.load_from_data_stream(data)

    # Forward pass, then back again.
    encoded_data = encoder.encode_data(data)
    data_decoded = encoder.decode_data(encoded_data)
    assert data_decoded == data

    # Encoding the decoded rows must be stable.
    data_recoded = encoder.encode_data(data_decoded)
    assert data_recoded == encoded_data
def test_inversion():
    """Encode/decode round trip with two categorical and one numeric column.

    Decoding the encoded data must give back the input rows, and encoding
    the decoded rows again must reproduce the first encoding.
    """
    columns = ('animal', 'color', 'weight')
    rows = [
        ('cat', 'blue', 6.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'magenta', 2.0),
        ('mouse', 'purple', 0.0),
        ('mouse', 'black', 99.9),
    ]
    data = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(['animal', 'color'], ['weight'],
                            max_levels_default=100)
    encoder.load_from_data_stream(data)

    # Forward pass, then back again.
    encoded_data = encoder.encode_data(data)
    data_decoded = encoder.decode_data(encoded_data)
    assert data_decoded == data

    # Encoding the decoded rows must be stable.
    data_recoded = encoder.encode_data(data_decoded)
    assert data_recoded == encoded_data
def test_init():
    """Check that the constructor stores its two arguments as given."""
    level_limits = {'foo': 3, 'bar': 7}
    numeric_columns = ['buzz_numeric']

    encoder = OneHotEncoder(level_limits, numeric_columns)

    # Compare against fresh literals rather than the objects handed in.
    assert encoder.numeric_cols == ['buzz_numeric']
    assert encoder.categorical_n_levels_dict == {'foo': 3, 'bar': 7}
def test_inversion_more_complicated_with_max_levels_diff():
    """Round trip when each categorical column has its own level count.

    The encoder is built with ``{'animal': 2, 'color': 1}``. The expected
    decoded rows keep only 'cat'/'mouse' for animal and 'blue' for color;
    every other level comes back as 'UNKNOWN_CATEGORICAL_LEVEL'. The
    'extra_junk' key on the first input row is absent from the expected
    output.
    """
    unknown = 'UNKNOWN_CATEGORICAL_LEVEL'

    input_columns = ('animal', 'color', 'weight', 'height')
    input_rows = [
        ('cat', 'blue', 6.0, 88.9),
        ('cat', 'red', 3.0, 44.9),
        ('dog', 'yellow', 5.5, 2.5),
        ('fish', 'blue', 7.0, 3233.2),
        ('cat', 'magenta', 2.0, 666.6),
        ('mouse', 'red', 0.0, 55.5),
        ('mouse', 'blah', 99.9, 33),
    ]
    data = [dict(zip(input_columns, row)) for row in input_rows]
    # Only the first row carries a column the encoder was never told about.
    data[0]['extra_junk'] = 'blah'

    expected_columns = ('height', 'weight', 'animal', 'color')
    expected_rows = [
        (88.9, 6.0, 'cat', 'blue'),
        (44.9, 3.0, 'cat', unknown),
        (2.5, 5.5, unknown, unknown),
        (3233.2, 7.0, unknown, 'blue'),
        (666.6, 2.0, 'cat', unknown),
        (55.5, 0.0, 'mouse', unknown),
        (33, 99.9, 'mouse', unknown),
    ]
    expected = [dict(zip(expected_columns, row)) for row in expected_rows]

    encoder = OneHotEncoder({'animal': 2, 'color': 1}, ['weight', 'height'])
    encoder.load_from_data_stream(data)

    data_decoded = encoder.decode_data(encoder.encode_data(data))

    assert data_decoded == expected
def test_init_empty():
    """Smoke test: constructing an encoder with no columns must not raise."""
    no_categorical_columns = []
    no_numeric_columns = []
    OneHotEncoder(no_categorical_columns, no_numeric_columns)
def test_save_load():
    """An encoder saved to disk and loaded back encodes data identically.

    The encoder is loaded from sample data, saved, and then restored into a
    fresh empty ``OneHotEncoder``; both instances must produce the same
    encoding for the sample rows.

    The save file lives inside a ``TemporaryDirectory`` so that the path is
    private to this test and everything written there is removed afterwards.
    (Taking ``NamedTemporaryFile().name`` drops the file object at once,
    which deletes the file and leaves only a reusable name that is never
    cleaned up once the encoder writes to it.)
    """
    # Local imports keep this test self-contained.
    import os
    from tempfile import TemporaryDirectory

    encoder = OneHotEncoder(['animal', 'color'], ['weight'],
                            max_levels_default=100)

    data = [{
        'animal': 'cat',
        'color': 'blue',
        'weight': 6.0
    }, {
        'animal': 'cat',
        'color': 'red',
        'weight': 3.0
    }, {
        'animal': 'dog',
        'color': 'yellow',
        'weight': 5.5
    }, {
        'animal': 'fish',
        'color': 'blue',
        'weight': 7.0
    }, {
        'animal': 'cat',
        'color': 'magenta',
        'weight': 2.0
    }, {
        'animal': 'mouse',
        'color': 'purple',
        'weight': 0.0
    }, {
        'animal': 'mouse',
        'color': 'black',
        'weight': 99.9
    }]

    encoder.load_from_data_stream(data)

    encoded_data = encoder.encode_data(data)

    with TemporaryDirectory() as tmp_dir:
        filename = os.path.join(tmp_dir, 'encoder')

        encoder.save(filename)

        encoder_from_file = OneHotEncoder([], [])
        encoder_from_file.load_from_file(filename)

        # Encode while the file still exists, in case loading is lazy.
        encoded_data_from_file = encoder_from_file.encode_data(data)

    assert encoded_data == encoded_data_from_file
def test_load_from_data_encodes_data_correctly():
    """Pin the exact one-hot vectors produced for a small sample data set.

    Checks the overall shape first, then the first, second and last encoded
    rows individually, and finally the complete encoded matrix.
    """
    columns = ('animal', 'color', 'weight')
    rows = [
        ('cat', 'blue', 6.0),
        ('cat', 'red', 3.0),
        ('dog', 'yellow', 5.5),
        ('fish', 'blue', 7.0),
        ('cat', 'magenta', 2.0),
        ('mouse', 'purple', 0.0),
        ('mouse', 'black', 99.9),
    ]
    data = [dict(zip(columns, row)) for row in rows]

    encoder = OneHotEncoder(['animal', 'color'], ['weight'],
                            max_levels_default=100)
    encoder.load_from_data_stream(data)

    encoded_data = list(map(encoder.encode_row, data))
    assert len(encoded_data) == len(data)
    assert len(encoded_data[0]) != len(data[0])

    # Column layout pinned by the expected vectors below:
    #   index 0      weight (numeric columns come first)
    #   index 1-4    animal: cat, mouse, dog, fish
    #                (most frequent first; dog/fish tie broken alphabetically)
    #   index 5-10   color: blue, black, magenta, purple, red, yellow
    #                (blue is most frequent; the rest tie and sort
    #                alphabetically)
    expected_total = [
        [6.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],   # cat, blue
        [3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],   # cat, red
        [5.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],   # dog, yellow
        [7.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],   # fish, blue
        [2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],   # cat, magenta
        [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],   # mouse, purple
        [99.9, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],  # mouse, black
    ]

    # Spot-check individual rows first so a failure points at a single row.
    first_row = encoded_data[0]
    assert first_row == expected_total[0]

    second_row = encoded_data[1]
    assert second_row == expected_total[1]

    last_row = encoded_data[-1]
    assert last_row == expected_total[-1]

    assert encoded_data == expected_total