Beispiel #1
0
def test_get_column_statistics():
    """Check that the statistics dict has one of the two accepted key sets."""
    base_keys = {
        'missing_values_count',
        'format',
        'format_specific_statistics',
        'type',
    }
    # The accepted alternative is the base keys plus the two uniques entries.
    extended_keys = base_keys | {'uniques_count', 'uniques_stats'}

    samples = (
        [float('NaN'), '1', '2.0', float('NaN')],
        ['a', 'cd', 'etx', 'a', '22', '22', 'n'],
    )
    for values in samples:
        stats_values = stats.get_column_statistics(pandas.Series(values))
        assert set(stats_values.keys()) in (base_keys, extended_keys)
Beispiel #2
0
def test_get_column_statistics_check_missings():
    """Check 'missing_values_count' for a column with NaNs and one without."""
    cases = (
        ([float('NaN'), '1', '2.0', float('NaN')], 2),
        (['0', '1', '2.0', '-1'], 0),
    )
    for values, expected_missing in cases:
        stats_values = stats.get_column_statistics(pandas.Series(values))
        assert stats_values['missing_values_count'] == expected_missing
Beispiel #3
0
def test_merge_text_column_partitions_stats():
    """Merge the statistics of two column slices and check the length bounds."""
    values = [float('NaN'), 'a', '2.0', 'abc', 'ddddd', '2']
    column = pandas.Series(values, dtype=str)

    # Statistics are computed per slice (first three rows, then the rest).
    partitions_stats = [
        stats.get_column_statistics(column[:3]),
        stats.get_column_statistics(column[3:]),
    ]
    merged_text_stats = text_column.merge_text_column_partitions_stats(
        partitions_stats)

    expected = {'min_len': 1, 'max_len': 5}
    assert merged_text_stats == expected
Beispiel #4
0
def test_merge_column_partitions_stats():
    """Merge per-partition statistics for a numerical and a character column."""

    def merged_stats_for(values):
        # Build a str-typed column, split it after the third row and merge
        # the statistics computed for each part.
        column = pandas.Series(values, dtype=str)
        head_stats = stats.get_column_statistics(column[:3])
        tail_stats = stats.get_column_statistics(column[3:])
        return merge.merge_column_partitions_stats([head_stats, tail_stats])

    # --- column holding numeric strings ---
    merged_stats = merged_stats_for(
        [float('NaN'), '1', '2.0', '3', float('NaN'), '2', float('NaN')])
    # 'uniques_stats' is a Series, so it is compared separately below.
    uniques_stats = merged_stats.pop('uniques_stats')

    expected_numerical = {
        'format': DataFormats.numerical,
        'missing_values_count': 3,
        'type': DataTypes.categorical,
        'uniques_count': 3,
        'format_specific_statistics': {
            'sum': 8.0,
            'min': 1.0,
            'max': 3,
        },
    }
    assert merged_stats == expected_numerical

    expected_uniques = pandas.Series(
        [1.0, 2.0, 1.0],
        index=pandas.Index([1.0, 2.0, 3.0]),
    )
    assert all(uniques_stats.sort_index() == expected_uniques.sort_index())

    # --- column holding text values ---
    merged_stats = merged_stats_for(
        [float('NaN'), float('NaN'), float('NaN'),
         'a', '2.0', 'abc', 'ddddd', '2'])
    uniques_stats = merged_stats.pop('uniques_stats')

    expected_character = {
        'format': DataFormats.character,
        'missing_values_count': 3,
        'type': DataTypes.categorical,
        'uniques_count': 5,
        'format_specific_statistics': {
            'min_len': 1,
            'max_len': 5,
        },
    }
    assert merged_stats == expected_character

    expected_uniques = pandas.Series(
        [1.0, 1.0, 1.0, 1.0, 1.0],
        index=pandas.Index(['2', '2.0', 'a', 'abc', 'ddddd']),
    )
    assert all(uniques_stats.sort_index() == expected_uniques.sort_index())
Beispiel #5
0
def test_merge_numerical_column_partitions_stats():
    """Merge numerical statistics of two column slices; check min/max/sum."""
    values = [float('NaN'), '1', '2.0', '3', '2', float('NaN')]
    column = pandas.Series(values, dtype=str)

    # One statistics dict per slice: rows 0-2 and rows 3 onwards.
    partitions_stats = [
        stats.get_column_statistics(part)
        for part in (column[:3], column[3:])
    ]
    merged_numerical_stats = (
        numerical_column.merge_numerical_column_partitions_stats(
            partitions_stats))

    expected = {'min': 1, 'max': 3, 'sum': 8.0}
    assert merged_numerical_stats == expected
Beispiel #6
0
def test_merge_column_missing_values_count():
    """Check the merged missing-values count over two column slices."""
    values = [float('NaN'), '1', '2.0', '3', float('NaN'), '2', float('NaN')]
    column = pandas.Series(values, dtype=str)

    head_stats = stats.get_column_statistics(column[:3])
    tail_stats = stats.get_column_statistics(column[3:])

    merged_missing = merge.merge_column_missing_values_count(
        [head_stats, tail_stats])
    assert merged_missing == 3
Beispiel #7
0
def test_get_column_statistics_check_format():
    """Check the 'format' reported for integer, decimal and mixed columns."""
    cases = (
        (['0', '1', '2', '-1'], DataFormats.numerical),
        (['0.8', '1', '2.0', '0'], DataFormats.numerical),
        (['s', '1', '2.0', float('NaN')], DataFormats.character),
    )
    for values, expected_format in cases:
        stats_values = stats.get_column_statistics(pandas.Series(values))
        # Identity check, as in the original assertions.
        assert stats_values['format'] is expected_format
Beispiel #8
0
def test_merge_text_column_min_len_max_len():
    """Merge the text length bounds of two column slices."""
    values = ['a', '2.0', 'abc', 'ddddd', '2', float('NaN')]
    column = pandas.Series(values, dtype=str)

    head_stats = stats.get_column_statistics(column[:3])
    tail_stats = stats.get_column_statistics(column[3:])
    # Only the format-specific part of each statistics dict is merged.
    format_stats = [
        head_stats['format_specific_statistics'],
        tail_stats['format_specific_statistics'],
    ]

    merged_min_len, merged_max_len = (
        text_column.merge_text_column_min_len_max_len(format_stats))
    assert merged_min_len == 1
    assert merged_max_len == 5
def test_merge_column_uniques_count():
    """Merge the uniques information of two independently built columns."""
    partitions = (
        [float('NaN'), '1', '2.0', float('NaN'), '3'],
        [float('NaN'), '2', '3', float('NaN')],
    )
    partitions_stats = [
        stats.get_column_statistics(pandas.Series(values, dtype=str))
        for values in partitions
    ]

    uniques_data = categorical_column.merge_column_uniques_count(
        partitions_stats)
    uniques_count = uniques_data['uniques_count']
    uniques_stats = uniques_data['uniques_stats']

    assert uniques_count == 3

    expected_uniques = pandas.Series(
        [1.0, 2.0, 2.0], index=[1.0, 2.0, 3.0])
    assert all(uniques_stats.sort_index() == expected_uniques.sort_index())
Beispiel #10
0
def test_get_column_statistics_numeric_specific_statistics_sum():
    """Check the 'sum' statistic reported for a column of numeric strings.

    The sum is compared with a tolerance instead of ``==``: an exact match
    against ``numpy.float32(2.8)`` only holds when the implementation returns
    that precise float32 bit pattern, so the check would break on a different
    accumulation order or a float64 result even though the sum is correct.
    """
    column = pandas.Series(['0.8', '1', '2', '-1'])
    stats_values = stats.get_column_statistics(column)
    total = stats_values['format_specific_statistics']['sum']
    # Default numpy.isclose tolerances cover float32 rounding of 2.8.
    assert numpy.isclose(total, 2.8)
Beispiel #11
0
def test_get_column_statistics_numeric_specific_statistics_min_max():
    """Check the 'min' and 'max' statistics of a column of numeric strings."""
    values = ['0.8', '1', '2', '-1']
    stats_values = stats.get_column_statistics(pandas.Series(values))

    expected_bounds = (('min', -1), ('max', 2))
    for key, expected in expected_bounds:
        assert stats_values['format_specific_statistics'][key] == expected
Beispiel #12
0
def test_get_column_statistics_detect_categorical():
    """Check type and uniques count for a short column with repeated values."""
    values = ['a', 'cd', 'etx', 'a', '22', '22', 'n']
    stats_values = stats.get_column_statistics(pandas.Series(values))

    assert stats_values['type'] is DataTypes.categorical
    # The sample holds five distinct strings: a, cd, etx, 22, n.
    assert stats_values['uniques_count'] == 5
Beispiel #13
0
def test_get_column_statistics_detect_continuous():
    """Check that a column of 1002 distinct values is typed as continuous."""
    # The integers 0..1001 rendered as strings, so every value is unique.
    distinct_values = numpy.arange(1002).astype(str)
    stats_values = stats.get_column_statistics(pandas.Series(distinct_values))
    assert stats_values['type'] is DataTypes.continuous
Beispiel #14
0
def test_get_column_statistics_text_specific_statistics_min_len_max_len():
    """Check 'min_len' and 'max_len' for a text column containing a NaN."""
    values = ['a', float('NaN'), 'etx09', 'a', '22', '22', 'n']
    stats_values = stats.get_column_statistics(pandas.Series(values))

    expected_lengths = (('min_len', 1), ('max_len', 5))
    for key, expected in expected_lengths:
        assert stats_values['format_specific_statistics'][key] == expected