Example #1
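These snippets are reproduced without their module-level imports. The examples below rely on roughly the following (stats, DataFormats, DataTypes, and cloudsml are project-local modules whose import paths are not part of the excerpt):

import json
import random
import string

import dask
import dask.dataframe
import numpy
import pandas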
def test_extract_dataframe_info_nan_column(npartitions):
    # A column containing only NaN values is reported as numerical/categorical
    # and yields no format-specific statistics.
    size = 1000
    pandas_df = pandas.DataFrame({'A': numpy.tile(float('NaN'), 3 * size)})
    dask_df = dask.dataframe.from_pandas(pandas_df, npartitions=npartitions)
    dask_df_info = stats.extract_dataframe_info(dask_df)
    column_statistic = dask_df_info['per_column_statistic']['A']
    assert column_statistic['format'] is DataFormats.numerical
    assert column_statistic['type'] is DataTypes.categorical
    assert column_statistic['format_specific_statistics'] == {}
Example #2
def test_extract_dataframe_info_nan_partition():
    # Two all-NaN partitions surround a partition of stringified floats; the
    # column should still be recognised as numerical/continuous.
    size = 1000
    part_one = pandas.DataFrame({'A': numpy.tile(float('NaN'), size)})
    part_two = pandas.DataFrame({'A': numpy.linspace(0, size, size)}).astype(str)
    part_three = pandas.DataFrame({'A': numpy.tile(float('NaN'), size)})
    delayed = [dask.delayed(part_one), dask.delayed(part_two), dask.delayed(part_three)]
    dask_df = dask.dataframe.from_delayed(delayed)
    dask_df_info = stats.extract_dataframe_info(dask_df)
    column_statistic = dask_df_info['per_column_statistic']['A']
    assert column_statistic['format'] is DataFormats.numerical
    assert column_statistic['type'] is DataTypes.continuous
    assert column_statistic['missing_values_count'] == 2000
    assert column_statistic['format_specific_statistics'] == {
            'min': 0.0,
            'max': 1000.0,
            'sum': 500000.0
        }
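The expected values follow directly from the partitions: numpy.linspace(0, 1000, 1000) spans 0.0 to 1000.0 and sums to 1000 * (0 + 1000) / 2 = 500000.0, while the two all-NaN partitions account for the 2000 missing values.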
Example #3
def test_extract_dataframe_info(boston_dataset, npartitions):
    # The dask implementation should produce the same per-column statistics as
    # the pandas implementation run on the same data.
    pandas_df = pandas.read_csv('file://' + boston_dataset['path'], dtype=str)
    pandas_df_info = stats.extract_pandas_dataframe_info(pandas_df)
    dask_df = dask.dataframe.from_pandas(pandas_df, npartitions=npartitions)
    dask_df_info = stats.extract_dataframe_info(dask_df)
    pandas_df_per_column_statistic = pandas_df_info['per_column_statistic']
    dask_df_per_column_statistic = dask_df_info['per_column_statistic']

    # The 'uniques_stats' objects are popped and compared separately as plain
    # dicts; the 'sum' statistic is excluded from the comparison.
    pandas_df_uniques_stats = {}
    for column_name, column_stats in pandas_df_per_column_statistic.items():
        pandas_df_uniques_stats[column_name] = \
                column_stats.pop('uniques_stats').to_dict()
        del column_stats['format_specific_statistics']['sum']
    dask_df_uniques_stats = {}
    for column_name, column_stats in dask_df_per_column_statistic.items():
        dask_df_uniques_stats[column_name] = \
                column_stats.pop('uniques_stats').to_dict()
        del column_stats['format_specific_statistics']['sum']
    assert pandas_df_per_column_statistic == dask_df_per_column_statistic
    assert pandas_df_uniques_stats == dask_df_uniques_stats
Example #4
    def wrapper(*args, **kwargs):
        dataset = func(*args, **kwargs)

        # Round-trip the extracted info through JSON so the fixture holds
        # plain built-in types.
        raw_dataframe = dataset['raw_dataframe']
        dataset['dataframe_info'] = json.loads(
            json.dumps(stats.extract_dataframe_info(raw_dataframe)))
        if 'feature_column_labels' not in dataset:
            dataset['feature_column_labels'] = raw_dataframe.columns.drop(
                dataset['target_column_label'])

        if 'MSE_baseline' not in dataset:
            # Baseline mean squared error: the variance of the target column.
            y = raw_dataframe[dataset['target_column_label']]
            dataset['MSE_baseline'] = ((y - y.mean())**2).mean().compute()

        per_column_statistic = dataset['dataframe_info'][
            'per_column_statistic']
        # Assign random (but sorted) unique integer ids to the columns and
        # build the column descriptors from the per-column statistics.
        column_ids = sorted(
            random.sample(range(100000), len(raw_dataframe.columns)))
        dataset['columns_info'] = {
            column_id: cloudsml.models.BaseDataTransformationColumn(
                id=column_id,
                name=column_name,
                statistics=per_column_statistic[column_name],
                data_type=per_column_statistic[column_name]['type'],
                data_format=per_column_statistic[column_name]['format'])
            for column_id, column_name in zip(
                column_ids, dataset['dataframe_info']['columns'])
        }
        dataset['columns_info_by_name'] = {
            column.name: column
            for column in dataset['columns_info'].values()
        }
        # Rename the dataframe columns from their names to the generated ids.
        dataframe = raw_dataframe.rename(columns={
            column.name: column.id
            for column in dataset['columns_info'].values()
        })
        dataset['dataframe'] = dataframe
        dataset['target_column_id'] = dataset['columns_info_by_name'][
            dataset['target_column_label']].id
        dataset['feature_column_ids'] = dataset['dataframe'].columns.drop(
            dataset['target_column_id']).values.tolist()
        return dataset
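The snippet above is only the inner wrapper; the enclosing decorator is not part of the excerpt. A minimal sketch of how it could be wired up, assuming a hypothetical decorator name (with_dataframe_info) and functools.wraps, neither of which appears in the source:

import functools

def with_dataframe_info(func):
    # Hypothetical enclosing decorator for the wrapper shown in Example #4.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        dataset = func(*args, **kwargs)
        # ... body as shown above ...
        return dataset

    return wrapper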
Example #5
def test_extract_dataframe_info_continuous():
    # Statistics computed on a single-partition dask dataframe should match
    # those computed on the same data split across two partitions.
    loc = 40
    scale = 60
    size = 1600

    numpy.random.seed(0)
    part_one_samples = numpy.random.normal(loc=loc, scale=scale, size=int(size / 2)).astype(str)
    numpy.random.seed(10)
    part_two_samples = numpy.random.normal(loc=loc, scale=scale, size=int(size / 2)).astype(str)

    categorical_samples = numpy.random.choice(
            numpy.array(['a', 'a', 'c', 'dce', 'B', '1', '2pq']),
            size=size
        )
    part_one_categorical_samples = categorical_samples[:int(size / 2)]
    part_two_categorical_samples = categorical_samples[int(size / 2):]

    alphabet = numpy.array(list(string.ascii_lowercase))
    text_continuous_samples = numpy.array(
            [
                ''.join(numpy.random.choice(alphabet, size=8).tolist())
                for _ in range(0, size)
            ]
        )
    part_one_text_continuous_samples = text_continuous_samples[:int(size / 2)]
    part_two_text_continuous_samples = text_continuous_samples[int(size / 2):]

    part_one = pandas.DataFrame(
            {
                'A': part_one_samples,
                'B': part_one_categorical_samples,
                'C': part_one_text_continuous_samples
            },
            dtype=str
        )
    part_two = pandas.DataFrame(
            {
                'A': part_two_samples,
                'B': part_two_categorical_samples,
                'C': part_two_text_continuous_samples
            },
            dtype=str
        )

    # Single-partition dask dataframe over the concatenated data.
    pandas_df = pandas.concat([part_one, part_two])
    dask_df = dask.dataframe.from_pandas(pandas_df, npartitions=1)
    dask_df_info = stats.extract_dataframe_info(dask_df)
    one_part_dask_df_per_column_statistic = dask_df_info['per_column_statistic']

    # The same data split across two delayed partitions.
    delayed = [dask.delayed(part_one), dask.delayed(part_two)]
    dask_df = dask.dataframe.from_delayed(delayed)
    dask_df_info = stats.extract_dataframe_info(dask_df)
    many_part_dask_df_per_column_statistic = dask_df_info['per_column_statistic']

    assert (
            many_part_dask_df_per_column_statistic['A']['binning_stats'] ==
            one_part_dask_df_per_column_statistic['A']['binning_stats']
        )
    assert (
            many_part_dask_df_per_column_statistic['B']['uniques_count'] ==
            one_part_dask_df_per_column_statistic['B']['uniques_count']
        )
    assert many_part_dask_df_per_column_statistic['C']['format'] == DataFormats.character
    assert many_part_dask_df_per_column_statistic['C']['type'] == DataTypes.continuous