def get_data_normalize_and_alias(options, connection, name):
    """
    Retrieve data from SQL, normalize the data and resolve aliasing.

    If the table's role is of the form ``alias##other_table``, the data is
    read from ``other_table`` but normalized using the configuration of
    ``name`` (so an aliased table can reuse another table's data with its
    own display configuration).

    Args:
        options: reporting options; carries the per-table configuration
        connection: an open database connection
        name: the name of the table to read

    Returns:
        tuple ``(data, types, type_categories, alias)`` where ``alias`` is
        the name of the aliased table, or ``None`` when the table is not
        an alias
    """
    table_name_roles = dict(get_tables_name_and_role(connection.cursor()))
    role = table_name_roles.get(name)
    assert role, f'table={name} doesn\'t have a role!'

    alias = None
    source_table = name
    if 'alias##' in role:
        splits = role.split('##')
        # fixed typo in the original message ("Expeced" -> "Expected")
        assert len(splits) == 2, 'alias is not well formed. Expected ``alias##aliasname``'
        alias = splits[1]
        # in an aliased table, read the data from the aliased table but keep
        # using ``name`` to pick up the correct config from the ``options``
        source_table = alias

    data, types, type_categories = normalize_data(
        options, get_table_data(connection, source_table), table_name=name)
    return data, types, type_categories, alias
def test_data_normalization_column_expansion(self):
    """Multi-dimensional numpy arrays can be expanded as single columns to facilitate analysis."""
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {'numpy_arrays_int_5': np.random.randint(0, 10, [3, 5])}
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)
    data = get_table_data(cursor, table_name)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    normalized_data, types, type_categories = normalize_data(options, data, table_name)

    # the (3, 5) array must be split into exactly 5 scalar columns
    assert len(normalized_data) == 5
    for column_id in range(5):
        assert f'numpy_arrays_int_5_{column_id}' in normalized_data
    assert type_categories['numpy_arrays_int_5_4'] == DataCategory.DiscreteOrdered
def update(self, options, connection, name, tabs):
    """
    Refresh the given tabs when the monitored table has changed.

    Args:
        options: reporting options; ``options.config`` may contain a
            per-table ``data/keep_last_n_rows`` entry to limit how many
            rows are kept
        connection: an open SQLite connection
        name: the name of the table being monitored
        tabs: sequence of tabs; each is updated via ``tab.update_data``
    """
    try:
        table_changed = self.table_changed()
        if table_changed:
            # different number of rows, data was changed!
            # discard `0`, the table creation is not part of a
            # transaction, the table is being populated
            number_of_rows = get_table_number_of_rows(connection, name)
            if number_of_rows > 0:
                data, types, type_categories = normalize_data(
                    options, get_table_data(connection, name), table_name=name)

                keep_last_n_rows = safe_lookup(options.config, name, 'data', 'keep_last_n_rows')
                if keep_last_n_rows is not None:
                    # BUG FIX: the loop variable was previously named ``name``,
                    # shadowing the table-name parameter; ``tab.update_data``
                    # below was then called with the last COLUMN name instead
                    # of the table name
                    data_trimmed = collections.OrderedDict()
                    for column_name, values in data.items():
                        data_trimmed[column_name] = values[-keep_last_n_rows:]
                    data = data_trimmed

                for tab in tabs:
                    tab.update_data(options, name, data, types, type_categories)
    except sqlite3.OperationalError as e:
        # fixed typo in the original message ("Excpetion" -> "Exception")
        logger.warning(
            f'TabsDynamicData={self} could not be updated. Exception={e}. '
            f'``database is locked`` can be ignored if another process is '
            f'currently populating the database')
def test_data_normalization_different_numpy_shapes_2d(self):
    """Make sure we can load differently shaped data samples."""
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'values': [
            np.zeros([1, 1, 64, 64]),
            np.zeros([1, 1, 64, 64]),
            np.zeros([1, 1, 65, 65]),
        ],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)
    data = get_table_data(cursor, table_name)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    normalized_data, types, type_categories = normalize_data(options, data, table_name)

    assert len(normalized_data['values']) == 3
    # each sample was exported as an image file, referenced by its index
    for sample_id in range(3):
        assert f'{sample_id}.png' in normalized_data['values'][sample_id]
def test_table_data_samples(self):
    """Render the sample tabs (tabular + scatter) and verify their widget structure."""
    # make sure we can render the samples tabs with tabular and scatter data
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    # a batch mixing continuous, discrete, high-dimensional and string columns
    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
        'column_to_be_removed': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    options = trw.reporting.create_default_reporting_options()
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    assert len(tabs) == 2  # must have the `tabular` and `scatter` tabs

    #
    # Check the tabular tab
    #
    assert len(tabs[0].ui.child.children) == 2  # must have a DataTable, Div
    table = tabs[0].ui.child.children[0]
    assert isinstance(table, DataTable)
    div = tabs[0].ui.child.children[1]
    assert isinstance(tabs[0].ui.child.children[1], Div)  # DIV to configure extra CSS
    # for the table (text rotation)
    assert div.visible is False  # the CSS div is a hidden helper, never shown
    assert len(table.source.column_names) == len(batch)

    #
    # Check the scatter tab
    #
    head = tabs[1].ui.child.children
    assert len(head[0].children) >= 7  # all the tools
    figures = head[1].children
    assert len(figures) == 1  # should have a single figure
    assert isinstance(figures[0].children[0], Figure)
    assert len(figures[0].children[0].renderers) > 0  # if not, failure!
def test_table_data_samples__default_config__discrete_discrete(self):
    """
    Make sure the per-table ``default`` configuration pre-selects the
    scatter axes and the color widget.
    """
    # make sure we can configure defaults
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_integers',
                'Scatter Y Axis': 'strings',
                'Binning X Axis': 'value_integers',
                'Color by': 'value_continuous',
                'Display with': 'Dot'
            }
        }
    }

    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)

    #
    # Check the scatter tab: the widgets must reflect the configured defaults
    #
    head = tabs[1].ui.child.children
    assert len(head) >= 2  # Tools, Scatter 1+2+3
    assert head[0].children[0].value == 'value_integers'
    assert head[0].children[1].value == 'strings'
    assert head[0].children[2].value == 'value_continuous'
    # removed leftover debug statement: print('DONE')
def test_table_data_samples__scatter_and_discrete_configs(self):
    """Exercise the scatter tab with the three X/Y continuous-discrete axis combinations."""
    # make sure we can configure defaults
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    #
    # X: continuous, Y: discrete
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_continuous',
                'Scatter Y Axis': 'value_integers',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    d, types, type_categories = normalize_data(options, batch, table_name)
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, row(Fig, Figure-color-legend)
    assert len(head[1].children) == 2  # Fig, Figure-color-legend
    assert len(head[1].children[0].children[0].renderers) > 0

    #
    # Y: continuous, X: discrete
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_integers',
                'Scatter Y Axis': 'value_continuous',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    # NOTE: ``d``/``types``/``type_categories`` are reused from the first
    # normalization; only the display configuration changes between variants
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, row(Fig, Figure-color-legend)
    assert len(head[1].children[0].children[0].renderers) > 0

    #
    # Y: continuous, X: continuous
    #
    config = {
        'table_name': {
            'default': {
                'Scatter X Axis': 'value_continuous',
                'Scatter Y Axis': 'value_continuous',
                'Color by': 'strings',
                'Display with': 'Dot'
            }
        }
    }
    options = trw.reporting.create_default_reporting_options(config=config)
    options.db_root = os.path.join(tmp_folder, 'test.db')
    tabs = process_data_samples(options, 'table_name', 'table_role', d, types, type_categories)
    head = tabs[1].ui.child.children
    assert len(head) == 2  # Tools, Fig, Figure-color-legend
    assert len(head[1].children[0].children[0].renderers) > 0
def test_data_normalization(self):
    """End-to-end normalization: export a batch to the DB, read it back, check types and categories."""
    # publish data the same way the application would:
    # export samples to a database, then read the data
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()

    batch = {
        'constant': 0,
        'value_continuous': [1.1, 2.2, 3.3],
        'value_integers': [1, 2, 3],
        'numpy_arrays': np.random.randn(3, 2000),
        'numpy_arrays_int_10': np.random.randint(0, 10, [3, 10]),
        'numpy_arrays_int_1': np.random.randint(0, 10, [3]),
        'images': np.random.randint(0, 255, [3, 3, 128, 128]),
        'strings': ['p1', 'p2', 'p3'],
        'split': 'train',
        'column_to_be_removed': ['p1', 'p2', 'p3'],
    }
    table_name = 'table_name'
    tmp_folder, _ = make_table(cursor, table_name, 'table_role', batch)

    # keep 2 of the 3 rows
    subsampling_factor = 2 / 3
    data = get_table_data(cursor, table_name)
    options = trw.reporting.create_default_reporting_options()
    options.config = {
        'table_name': {
            'data': {
                'subsampling_factor': subsampling_factor,
                'remove_columns': ['column_to_be_removed']
            }
        }
    }
    # arrays with < 3 columns are expanded to scalar columns; larger ones stay blobs
    options.data.unpack_numpy_arrays_with_less_than_x_columns = 3
    options.db_root = os.path.join(tmp_folder, 'test.db')

    # must have sub-sampled the data
    normalized_data, types, type_categories = normalize_data(
        options, data, table_name)
    assert len(normalized_data['constant']) == 2, 'subsampling failed!'

    # must have exactly the batch keys
    # (+1 accounts for the removed `column_to_be_removed` column)
    assert len(batch) == len(normalized_data) + 1
    for key in normalized_data.keys():
        assert key in batch
    assert 'column_to_be_removed' not in normalized_data.keys()

    # for BLOB_IMAGE, must have appended the `appname` (folder of the database name)
    appname = os.path.basename(os.path.dirname(options.db_root))
    assert appname in normalized_data['images'][0]

    # images must be served from a static directory
    assert 'static' in normalized_data['images'][0]

    assert type_categories['images'] == DataCategory.Other
    assert type_categories['numpy_arrays'] == DataCategory.Other  # too many dimensions
    assert type_categories['numpy_arrays_int_10'] == DataCategory.Other
    assert type_categories['numpy_arrays_int_1'] == DataCategory.DiscreteOrdered
    assert type_categories['value_continuous'] == DataCategory.Continuous
    assert type_categories['value_integers'] == DataCategory.DiscreteOrdered
    assert type_categories['strings'] == DataCategory.DiscreteUnordered
    assert type_categories['split'] == DataCategory.DiscreteUnordered

    assert types['numpy_arrays'] == 'BLOB_NUMPY'
    assert types['images'] == 'BLOB_IMAGE_PNG'
    assert types['numpy_arrays_int_10'] == 'BLOB_NUMPY'