def get_data_normalize_and_alias(options, connection, name):
    """
    Retrieve data from SQL, normalize the data and resolve aliasing.

    If the table's role is of the form ``alias##other_table``, the data is
    read from ``other_table`` but normalized using the configuration of
    ``name`` (so an aliased table can reuse another table's data with its
    own display configuration).

    Args:
        options: reporting options; carries the per-table configuration
        connection: an open database connection
        name: the name of the table to read

    Returns:
        tuple ``(data, types, type_categories, alias)`` where ``alias`` is
        the name of the aliased table, or ``None`` when the table is not
        an alias
    """
    table_name_roles = dict(get_tables_name_and_role(connection.cursor()))
    role = table_name_roles.get(name)
    assert role, f'table={name} doesn\'t have a role!'

    alias = None
    source_table = name
    if 'alias##' in role:
        splits = role.split('##')
        # fixed typo in the original message ("Expeced" -> "Expected")
        assert len(splits) == 2, 'alias is not well formed. Expected ``alias##aliasname``'
        alias = splits[1]
        # in an aliased table, read the data from the aliased table but keep
        # using ``name`` to pick up the correct config from the ``options``
        source_table = alias

    data, types, type_categories = normalize_data(
        options, get_table_data(connection, source_table), table_name=name)
    return data, types, type_categories, alias
def test_data_normalization_column_expansion(self):
    """Multi-dimensional numpy arrays can be expanded as single columns to facilitate analysis."""
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {'numpy_arrays_int_5': np.random.randint(0, 10, [3, 5])}
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)
    data = get_table_data(cursor, table_name)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    normalized_data, types, type_categories = normalize_data(options, data, table_name)

    # the (3, 5) array must be split into exactly 5 scalar columns
    assert len(normalized_data) == 5
    for column_id in range(5):
        assert f'numpy_arrays_int_5_{column_id}' in normalized_data
    assert type_categories['numpy_arrays_int_5_4'] == DataCategory.DiscreteOrdered
def update(self, options, connection, name, tabs):
    """
    Refresh the given tabs when the monitored table has changed.

    Args:
        options: reporting options; ``options.config`` may contain a
            per-table ``data/keep_last_n_rows`` entry to limit how many
            rows are kept
        connection: an open SQLite connection
        name: the name of the table being monitored
        tabs: sequence of tabs; each is updated via ``tab.update_data``
    """
    try:
        table_changed = self.table_changed()
        if table_changed:
            # different number of rows, data was changed!
            # discard `0`, the table creation is not part of a
            # transaction, the table is being populated
            number_of_rows = get_table_number_of_rows(connection, name)
            if number_of_rows > 0:
                data, types, type_categories = normalize_data(
                    options, get_table_data(connection, name), table_name=name)

                keep_last_n_rows = safe_lookup(options.config, name, 'data', 'keep_last_n_rows')
                if keep_last_n_rows is not None:
                    # BUG FIX: the loop variable was previously named ``name``,
                    # shadowing the table-name parameter; ``tab.update_data``
                    # below was then called with the last COLUMN name instead
                    # of the table name
                    data_trimmed = collections.OrderedDict()
                    for column_name, values in data.items():
                        data_trimmed[column_name] = values[-keep_last_n_rows:]
                    data = data_trimmed

                for tab in tabs:
                    tab.update_data(options, name, data, types, type_categories)
    except sqlite3.OperationalError as e:
        # fixed typo in the original message ("Excpetion" -> "Exception")
        logger.warning(
            f'TabsDynamicData={self} could not be updated. Exception={e}. '
            f'``database is locked`` can be ignored if another process is '
            f'currently populating the database')
def test_data_normalization_different_numpy_shapes_2d(self):
    """Make sure we can load differently shaped data samples."""
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'values': [
            np.zeros([1, 1, 64, 64]),
            np.zeros([1, 1, 64, 64]),
            np.zeros([1, 1, 65, 65]),
        ],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)
    data = get_table_data(cursor, table_name)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    normalized_data, types, type_categories = normalize_data(options, data, table_name)

    assert len(normalized_data['values']) == 3
    # each sample was exported as an image file, referenced by its index
    for sample_id in range(3):
        assert f'{sample_id}.png' in normalized_data['values'][sample_id]
def test_table_data_samples(self):
    """Render the sample tabs (tabular + scatter) and verify their widget structure."""
    # make sure we can render the samples tabs with tabular and scatter data
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    # a batch mixing continuous, discrete, high-dimensional and string columns
    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
        'column_to_be_removed': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    assert len(tabs) == 2  # must have the `tabular` and `scatter` tabs

    #
    # Check the tabular tab
    #
    assert len(tabs[0].ui.child.children) == 2  # must have a DataTable, Div
    table = tabs[0].ui.child.children[0]
    assert isinstance(table, DataTable)
    div = tabs[0].ui.child.children[1]
    assert isinstance(tabs[0].ui.child.children[1], Div)  # DIV to configure extra CSS
    # for the table (text rotation)
    assert div.visible is False  # the CSS div is a hidden helper, never shown
    assert len(table.source.column_names) == len(batch)

    #
    # Check the scatter tab
    #
    head = tabs[1].ui.child.children
    assert len(head[0].children) >= 7  # all the tools
    figures = head[1].children
    assert len(figures) == 1  # should have a single figure
    assert isinstance(figures[0].children[0], Figure)
    assert len(figures[0].children[0].renderers) > 0  # if not, failure!
def test_table_data_samples__default_config__discrete_discrete(self):
    """
    Make sure the per-table ``default`` configuration pre-selects the
    scatter axes and the color widget.
    """
    # make sure we can configure defaults
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_integers',
                'Scatter Y Axis': 'strings',
                'Binning X Axis': 'value_integers',
                'Color by': 'value_continuous',
                'Display with': 'Dot'
            }
        }
    }

    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)

    #
    # Check the scatter tab: the widgets must reflect the configured defaults
    #
    head = tabs[1].ui.child.children
    assert len(head) >= 2  # Tools, Scatter 1+2+3
    assert head[0].children[0].value == 'value_integers'
    assert head[0].children[1].value == 'strings'
    assert head[0].children[2].value == 'value_continuous'
    # removed leftover debug statement: print('DONE')
def test_table_data_samples__scatter_and_discrete_configs(self):
    """Exercise the scatter tab with the three X/Y continuous-discrete axis combinations."""
    # make sure we can configure defaults
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    #
    # X: continuous, Y: discrete
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_continuous',
                'Scatter Y Axis': 'value_integers',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, row(Fig, Figure-color-legend)
    assert len(head[1].children) == 2  # Fig, Figure-color-legend
    assert len(head[1].children[0].children[0].renderers) > 0

    #
    # Y: continuous, X: discrete
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_integers',
                'Scatter Y Axis': 'value_continuous',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    # NOTE: ``d``/``types``/``type_categories`` are reused from the first
    # normalization; only the display configuration changes between variants
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, row(Fig, Figure-color-legend)
    assert len(head[1].children[0].children[0].renderers) > 0

    #
    # Y: continuous, X: continuous
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_continuous',
                'Scatter Y Axis': 'value_continuous',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, Fig, Figure-color-legend
    assert len(head[1].children[0].children[0].renderers) > 0
def test_data_normalization(self):
    """End-to-end normalization: export a batch to the DB, read it back, check types and categories."""
    # publish data the same way the application would:
    # export samples to a database, then read the data
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'constant': 0,
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_10': np.random.randint(0, 10, [3, 10]),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
        'split': 'train',
        'column_to_be_removed': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    # keep 2 of the 3 rows
    subsampling_factor = 2 / 3
    data = get_table_data(cursor, table_name)
    options = trw.reporting.create_default_reporting_options()
    options.config = {
        'table_name': {
            'data': {
                'subsampling_factor': subsampling_factor,
                'remove_columns': ['column_to_be_removed']
            }
        }
    }
    # arrays with < 3 columns are expanded to scalar columns; larger ones stay blobs
    options.data.unpack_numpy_arrays_with_less_than_x_columns = 3
    options.db_root = os.path.join(tmp_folder, 'test.db')

    # must have sub-sampled the data
    normalized_data, types, type_categories = normalize_data(
        options, data, table_name)
    assert len(normalized_data['constant']) == 2, 'subsampling failed!'

    # must have exactly the batch keys
    # (+1 accounts for the removed `column_to_be_removed` column)
    assert len(batch) == len(normalized_data) + 1
    for key in normalized_data.keys():
        assert key in batch
    assert 'column_to_be_removed' not in normalized_data.keys()

    # for BLOB_IMAGE, must have appended the `appname` (folder of the database name)
    appname = os.path.basename(os.path.dirname(options.db_root))
    assert appname in normalized_data['images'][0]

    # images must be served from a static directory
    assert 'static' in normalized_data['images'][0]

    assert type_categories['images'] == DataCategory.Other
    assert type_categories['numpy_arrays'] == DataCategory.Other  # too many dimensions
    assert type_categories['numpy_arrays_int_10'] == DataCategory.Other
    assert type_categories['numpy_arrays_int_1'] == DataCategory.DiscreteOrdered
    assert type_categories['value_continuous'] == DataCategory.Continuous
    assert type_categories['value_integers'] == DataCategory.DiscreteOrdered
    assert type_categories['strings'] == DataCategory.DiscreteUnordered
    assert type_categories['split'] == DataCategory.DiscreteUnordered

    assert types['numpy_arrays'] == 'BLOB_NUMPY'
    assert types['images'] == 'BLOB_IMAGE_PNG'
    assert types['numpy_arrays_int_10'] == 'BLOB_NUMPY'