def error_check_denest(schema, key_properties, records):
    """Denest into table batches and assert every batch is error-free.

    Returns the denested table batches so callers can make further assertions.
    """
    batches = denest.to_table_batches(schema, key_properties, records)
    for batch in batches:
        assert errors(batch) == []
    return batches
def test__schema__objects_add_fields():
    """A nested object contributes flattened path-tuple columns to its parent table."""
    schema = {
        'properties': {
            'a': {'type': 'integer'},
            'b': {
                'type': 'object',
                'properties': {
                    'c': {'type': 'string'},
                    'd': {'type': 'boolean'}
                }
            }
        }
    }
    batches = denest.to_table_batches(schema, ['a'], [])
    assert len(batches) == 1
    flattened = batches[0]['streamed_schema']['schema']['properties']
    # Nested object fields are keyed by their full path.
    assert ('b', 'c') in flattened
    assert ('b', 'd') in flattened
    for batch in batches:
        assert errors(batch) == []
def test__schema__arrays_add_tables():
    """An array-typed property is split off into its own child table."""
    schema = {
        'properties': {
            'a': {'type': 'integer'},
            'b': {
                'type': 'array',
                'items': {
                    'properties': {
                        'c': {'type': 'string'},
                        'd': {'type': 'boolean'}
                    }
                }
            }
        }
    }
    batches = denest.to_table_batches(schema, ['a'], [])
    # Root table plus one child table for the array.
    assert len(batches) == 2
    for batch in batches:
        assert errors(batch) == []
def test_empty():
    """An empty schema denests to a single root table with no records or keys."""
    batches = denest.to_table_batches({}, [], [])
    assert len(batches) == 1
    root = batches[0]
    assert root['records'] == []
    assert root['streamed_schema']['key_properties'] == []
    for batch in batches:
        assert errors(batch) == []
def test__records__nested__root_empty():
    """The root table of NESTED_SCHEMA carries no scalar fields, so its rows are empty."""
    batches = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    root = _get_table_batch_with_path(batches, ())
    assert root['streamed_schema']['schema']['properties'] == {}
    rows = root['records']
    assert len(rows) == 5
    assert all(row == {} for row in rows)
def test__records__nested__tables():
    """NESTED_SCHEMA denests into exactly three tables: the root plus two array paths."""
    batches = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    print('denested:', batches)
    expected_paths = {(), ('a', 'b'), ('a', 'b', 'c', 'e')}
    assert len(batches) == 3
    for batch in batches:
        assert batch['streamed_schema']['path'] in expected_paths
        assert errors(batch) == []
def test__schema__nested_objects_add_fields():
    """Randomized: arbitrarily nested objects flatten onto a single table.

    The prints are intentional — pytest only shows stdout on failure, and the
    random schema is needed to reproduce a failing case.
    """
    for _ in range(100):
        case = random_object_schema()
        batches = denest.to_table_batches(case['schema'], [], [])

        print('r:', case)
        print()
        print('denested:', batches)

        # Objects never spawn child tables...
        assert len(batches) == 1
        # ...and the nested field appears keyed by its full path.
        assert tuple(case['path']) \
            in batches[0]['streamed_schema']['schema']['properties']

        for batch in batches:
            assert errors(batch) == []
def test__records__nested__child_table__a_b():
    """The ('a', 'b') child table flattens ('c', 'd') into an integer column."""
    batches = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    child = _get_table_batch_with_path(batches, ('a', 'b'))

    props = child['streamed_schema']['schema']['properties']
    assert props[('c', 'd')] == {'type': ['integer']}

    rows = child['records']
    assert len(rows) == 7
    for row in rows:
        # Empty rows have no ('c', 'd') entry — skip rather than KeyError.
        if not row:
            continue
        typed_value = row[('c', 'd')]
        assert typed_value[0] == 'integer'
        # type(...) is int (not isinstance) so that bools would be rejected.
        assert type(typed_value[1]) is int
def test__records__nested__child_table__a_b_c_e():
    """The deepest child table carries the ('f',) string and ('g',) boolean columns."""
    batches = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    child = _get_table_batch_with_path(batches, ('a', 'b', 'c', 'e'))

    props = child['streamed_schema']['schema']['properties']
    assert props[('f',)] == {'type': ['string']}
    assert props[('g',)] == {'type': ['boolean']}

    rows = child['records']
    assert len(rows) == 2
    for row in rows:
        assert row[('f',)][0] == 'string'
        assert type(row[('f',)][1]) is str
        assert row[('g',)][0] == 'boolean'
        assert type(row[('g',)][1]) is bool
def test__schema__nested_arrays_add_tables():
    """Randomized: each level of array nesting produces exactly one table.

    The prints are intentional — pytest only shows stdout on failure, and the
    random schema is needed to reproduce a failing case.
    """
    for _ in range(100):
        case = random_array_schema()
        batches = denest.to_table_batches(case['schema'], [], [])

        print('r:', case)
        print()
        print('denested:', batches)

        # One table per array level, plus the root table.
        assert len(case['path']) + 1 == len(batches)

        for batch in batches:
            assert errors(batch) == []

        known_paths = {batch['streamed_schema']['path'] for batch in batches}
        tables_checked = 0

        # Every prefix of the nesting path — including the empty root path —
        # must correspond to one of the denested tables.
        for depth in range(len(case['path']) + 1):
            prefix = case['path'][:depth]
            print('looking for a table with path:', prefix)
            assert tuple(prefix) in known_paths
            print('...table found')
            tables_checked += 1

        ## Assert that we looked for every table path
        assert tables_checked == len(batches)

        print('PASSED')
        print()
def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata):
    """
    Write all `table_batch`s associated with the given `schema` and `records` to remote.

    :param connection: remote connection, type left to be determined by implementing class
    :param root_table_name: string
    :param schema: SingerStreamSchema
    :param key_properties: [string, ...]
    :param records: [{...}, ...]
    :param metadata: additional metadata needed by implementing class
    :return: {'records_persisted': int, 'rows_persisted': int}
    """
    # Time the whole batch and count every row persisted across all of its tables.
    with self._set_timer_tags(metrics.job_timer(), 'batch', (root_table_name,)):
        with self._set_counter_tags(metrics.record_counter(None),
                                    'batch_rows_persisted',
                                    (root_table_name,)) as batch_counter:
            self.LOGGER.info('Writing batch with {} records for `{}` with `key_properties`: `{}`'.format(
                len(records),
                root_table_name,
                key_properties
            ))

            # Split the stream into one batch per (possibly nested) table.
            for table_batch in denest.to_table_batches(schema, key_properties, records):
                # Make the table path absolute by prefixing the root table name.
                table_batch['streamed_schema']['path'] = (root_table_name,) + \
                                                         table_batch['streamed_schema']['path']

                # Per-table timer/counter; tagged with the remote table name once known.
                with self._set_timer_tags(metrics.job_timer(),
                                          'table',
                                          table_batch['streamed_schema']['path']) as table_batch_timer:
                    with self._set_counter_tags(metrics.record_counter(None),
                                                'table_rows_persisted',
                                                table_batch['streamed_schema']['path']) as table_batch_counter:
                        self.LOGGER.info('Writing table batch schema for `{}`...'.format(
                            table_batch['streamed_schema']['path']
                        ))

                        # Create/upgrade the remote table so it can accept this batch's schema.
                        remote_schema = self.upsert_table_helper(connection,
                                                                 table_batch['streamed_schema'],
                                                                 metadata)

                        # The remote table name is only known after the upsert, so the
                        # metrics tags are filled in here rather than at context entry.
                        self._set_metrics_tags__table(table_batch_timer, remote_schema['name'])
                        self._set_metrics_tags__table(table_batch_counter, remote_schema['name'])

                        self.LOGGER.info('Writing table batch with {} rows for `{}`...'.format(
                            len(table_batch['records']),
                            table_batch['streamed_schema']['path']
                        ))

                        batch_rows_persisted = self.write_table_batch(
                            connection,
                            {'remote_schema': remote_schema,
                             'records': self._serialize_table_records(remote_schema,
                                                                      table_batch['streamed_schema'],
                                                                      table_batch['records'])},
                            metadata)

                        table_batch_counter.increment(batch_rows_persisted)
                        batch_counter.increment(batch_rows_persisted)

            # NOTE(review): returns while `batch_counter` is still open so that
            # `batch_counter.value` reflects the accumulated row count.
            return {
                'records_persisted': len(records),
                'rows_persisted': batch_counter.value
            }
def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata):
    """
    Write all `table_batch`s associated with the given `schema` and `records` to remote.

    :param connection: remote connection, type left to be determined by implementing class
    :param root_table_name: string
    :param schema: SingerStreamSchema
    :param key_properties: [string, ...]
    :param records: [{...}, ...]
    :param metadata: additional metadata needed by implementing class
    :return: {'records_persisted': int, 'rows_persisted': int}
    """
    # monotonic() is immune to wall-clock adjustments, so durations stay correct.
    batch__timing_start = time.monotonic()

    self.LOGGER.info(
        'Writing batch with {} records for `{}` with `key_properties`: `{}`'
        .format(len(records), root_table_name, key_properties))

    rows_persisted = 0
    # Split the stream into one batch per (possibly nested) table.
    for table_batch in denest.to_table_batches(schema, key_properties, records):
        # Make the table path absolute by prefixing the root table name.
        table_batch['streamed_schema']['path'] = (
            root_table_name,) + table_batch['streamed_schema']['path']

        # Phase 1: make sure the remote table exists and matches this batch's schema.
        table_batch__schema__timing_start = time.monotonic()
        self.LOGGER.info('Writing table batch schema for `{}`'.format(
            table_batch['streamed_schema']['path']))
        remote_schema = self.upsert_table_helper(
            connection, table_batch['streamed_schema'], metadata)
        self.LOGGER.info(
            'Table batch schema written in {} millis for `{}`'.format(
                _duration_millis(table_batch__schema__timing_start),
                table_batch['streamed_schema']['path']))

        # Phase 2: serialize and write the batch's rows, timed separately.
        table_batch__records__timing_start = time.monotonic()
        self.LOGGER.info(
            'Writing table batch with {} rows for `{}`'.format(
                len(table_batch['records']),
                table_batch['streamed_schema']['path']))
        batch_rows_persisted = self.write_table_batch(
            connection,
            {
                'remote_schema': remote_schema,
                'records': self._serialize_table_records(
                    remote_schema, table_batch['streamed_schema'],
                    table_batch['records'])
            },
            metadata)
        self.LOGGER.info(
            'Table batch with {} rows wrote {} rows in {} millis for {}'.
            format(len(table_batch['records']), batch_rows_persisted,
                   _duration_millis(table_batch__records__timing_start),
                   table_batch['streamed_schema']['path']))

        # Rows persisted can differ from rows submitted (e.g. dedupe upstream),
        # so accumulate what the writer reports rather than len(records).
        rows_persisted += batch_rows_persisted

    self.LOGGER.info(
        'Batch with {} records wrote {} rows in {} millis for `{}`'.format(
            len(records), rows_persisted,
            _duration_millis(batch__timing_start), root_table_name))

    return {
        'records_persisted': len(records),
        'rows_persisted': rows_persisted
    }