def error_check_denest(schema, key_properties, records):
    denested = denest.to_table_batches(schema, key_properties, records)

    for table_batch in denested:
        assert [] == errors(table_batch)

    return denested
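
# Note: `denest` and the `errors` validation helper used throughout these
# tests are imported from / defined elsewhere in the test module. Each table
# batch returned by `denest.to_table_batches` has the following shape,
# inferred from the assertions below (the field values are illustrative):
#
#     {'streamed_schema': {'path': ('a', 'b'),     # tuple of nested keys
#                          'key_properties': ['a'],
#                          'schema': {'properties': {
#                              ('c', 'd'): {'type': ['integer']}}}},
#      'records': [{('c', 'd'): ('integer', 42)},  # (type-name, value) pairs
#                  ...]}
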
def test__schema__objects_add_fields():
    denested = denest.to_table_batches(
        {
            'properties': {
                'a': {
                    'type': 'integer'
                },
                'b': {
                    'type': 'object',
                    'properties': {
                        'c': {
                            'type': 'string'
                        },
                        'd': {
                            'type': 'boolean'
                        }
                    }
                }
            }
        }, ['a'], [])

    assert 1 == len(denested)
    assert ('b', 'c') in denested[0]['streamed_schema']['schema']['properties']
    assert ('b', 'd') in denested[0]['streamed_schema']['schema']['properties']

    for table_batch in denested:
        assert [] == errors(table_batch)

def test__schema__arrays_add_tables():
    denested = denest.to_table_batches(
        {
            'properties': {
                'a': {
                    'type': 'integer'
                },
                'b': {
                    'type': 'array',
                    'items': {
                        'properties': {
                            'c': {
                                'type': 'string'
                            },
                            'd': {
                                'type': 'boolean'
                            }
                        }
                    }
                }
            }
        }, ['a'], [])
    assert 2 == len(denested)
    for table_batch in denested:
        assert [] == errors(table_batch)

def test_empty():
    denested = denest.to_table_batches({}, [], [])
    assert 1 == len(denested)
    assert [] == denested[0]['records']
    assert [] == denested[0]['streamed_schema']['key_properties']

    for table_batch in denested:
        assert [] == errors(table_batch)
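
# `NESTED_SCHEMA` and `NESTED_RECORDS`, used by the tests below, are fixtures
# defined elsewhere in the module. Judging from the paths and properties the
# tests assert on, the schema's shape is roughly the following (an
# illustrative reconstruction, not the exact fixture):
#
#     NESTED_SCHEMA = {
#         'properties': {
#             'a': {'type': 'object', 'properties': {
#                 'b': {'type': 'array', 'items': {'properties': {
#                     'c': {'type': 'object', 'properties': {
#                         'd': {'type': 'integer'},
#                         'e': {'type': 'array', 'items': {'properties': {
#                             'f': {'type': 'string'},
#                             'g': {'type': 'boolean'}}}}}}}}}}}}}
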
def test__records__nested__root_empty():
    denested = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    table_batch = _get_table_batch_with_path(denested, tuple())

    assert {} == table_batch['streamed_schema']['schema']['properties']

    assert 5 == len(table_batch['records'])

    for record in table_batch['records']:
        assert {} == record
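
# A minimal sketch of the `_get_table_batch_with_path` helper used above (its
# real definition lives elsewhere in this module): select the batch whose
# denested path matches the requested one.
def _get_table_batch_with_path(denested, path):
    for table_batch in denested:
        if table_batch['streamed_schema']['path'] == path:
            return table_batch
    return None
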
def test__records__nested__tables():
    denested = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)

    print('denested:', denested)

    assert 3 == len(denested)

    for table_batch in denested:
        assert table_batch['streamed_schema']['path'] in \
               {tuple(),
                ('a', 'b'),
                ('a', 'b', 'c', 'e')}
        assert [] == errors(table_batch)

def test__schema__nested_objects_add_fields():
    for _ in range(0, 100):
        r = random_object_schema()
        denested = denest.to_table_batches(r['schema'], [], [])

        print('r:', r)
        print()
        print('denested:', denested)

        assert 1 == len(denested)
        assert tuple(
            r['path']
        ) in denested[0]['streamed_schema']['schema']['properties']

        for table_batch in denested:
            assert [] == errors(table_batch)
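
# A hypothetical sketch of the `random_object_schema` helper used above (its
# real definition lives elsewhere): build a randomly deep chain of
# single-property objects and return it with the path to the innermost leaf,
# so the flattened schema should contain exactly that tuple-keyed property.
import random

def random_object_schema(max_depth=5):
    depth = random.randint(1, max_depth)
    path = ['p{}'.format(i) for i in range(depth)]
    schema = {'type': 'integer'}  # the innermost leaf field
    for name in reversed(path):   # wrap objects from the inside out
        schema = {'type': 'object', 'properties': {name: schema}}
    return {'schema': schema, 'path': path}
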
def test__records__nested__child_table__a_b():
    denested = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    table_batch = _get_table_batch_with_path(denested, ('a', 'b'))

    assert {
        'type': ['integer']
    } == table_batch['streamed_schema']['schema']['properties'][('c', 'd')]

    assert 7 == len(table_batch['records'])

    for record in table_batch['records']:
        # Don't try to access key "('c', 'd')" if record is empty
        if record == {}:
            continue
        assert 'integer' == record[('c', 'd')][0]
        assert int == type(record[('c', 'd')][1])

def test__records__nested__child_table__a_b_c_e():
    denested = denest.to_table_batches(NESTED_SCHEMA, [], NESTED_RECORDS)
    table_batch = _get_table_batch_with_path(denested, ('a', 'b', 'c', 'e'))

    assert {
        'type': ['string']
    } == table_batch['streamed_schema']['schema']['properties'][('f', )]
    assert {
        'type': ['boolean']
    } == table_batch['streamed_schema']['schema']['properties'][('g', )]

    assert 2 == len(table_batch['records'])

    for record in table_batch['records']:
        assert 'string' == record[('f', )][0]
        assert str == type(record[('f', )][1])

        assert 'boolean' == record[('g', )][0]
        assert bool == type(record[('g', )][1])

# Example 10
def test__schema__nested_arrays_add_tables():
    for _ in range(0, 100):
        r = random_array_schema()
        denested = denest.to_table_batches(r['schema'], [], [])

        print('r:', r)
        print()
        print('denested:', denested)

        assert len(r['path']) + 1 == len(denested)

        for table_batch in denested:
            assert [] == errors(table_batch)

        table_path_accum = []
        tables_checked = 0
        while True:
            found_table = False

            print('looking for a table with path:', table_path_accum)

            for table_batch in denested:
                if tuple(table_path_accum
                         ) == table_batch['streamed_schema']['path']:
                    found_table = True
                    break

            assert found_table
            print('...table found')

            tables_checked += 1

            if len(table_path_accum) == len(r['path']):
                break

            table_path_accum.append(r['path'][len(table_path_accum)])

        # Assert that we looked for every table path
        assert tables_checked == len(denested)

        print('PASSED')
        print()
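
# A hypothetical sketch of the `random_array_schema` helper used above (its
# real definition lives elsewhere): build a randomly deep chain of
# arrays-of-objects and return it with the property path to the innermost
# array. Each array level denests into one more child table, which is what
# the `len(r['path']) + 1 == len(denested)` assertion relies on.
import random

def random_array_schema(max_depth=4):
    depth = random.randint(1, max_depth)
    path = ['p{}'.format(i) for i in range(depth)]
    # innermost array: its items carry a simple scalar field
    schema = {'type': 'array',
              'items': {'properties': {'leaf': {'type': 'integer'}}}}
    for name in reversed(path[1:]):  # wrap arrays-of-objects inside out
        schema = {'type': 'array',
                  'items': {'properties': {name: schema}}}
    return {'schema': {'properties': {path[0]: schema}}, 'path': path}
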

# Example 11
    def write_batch_helper(self, connection, root_table_name, schema, key_properties, records, metadata):
        """
        Write all `table_batch`s associated with the given `schema` and `records` to remote.

        :param connection: remote connection, type left to be determined by implementing class
        :param root_table_name: string
        :param schema: SingerStreamSchema
        :param key_properties: [string, ...]
        :param records: [{...}, ...]
        :param metadata: additional metadata needed by implementing class
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        with self._set_timer_tags(metrics.job_timer(),
                                  'batch',
                                  (root_table_name,)):
            with self._set_counter_tags(metrics.record_counter(None),
                                        'batch_rows_persisted',
                                        (root_table_name,)) as batch_counter:
                self.LOGGER.info('Writing batch with {} records for `{}` with `key_properties`: `{}`'.format(
                    len(records),
                    root_table_name,
                    key_properties
                ))

                for table_batch in denest.to_table_batches(schema, key_properties, records):
                    table_batch['streamed_schema']['path'] = (root_table_name,) + \
                                                             table_batch['streamed_schema']['path']

                    with self._set_timer_tags(metrics.job_timer(),
                                              'table',
                                              table_batch['streamed_schema']['path']) as table_batch_timer:
                        with self._set_counter_tags(metrics.record_counter(None),
                                                    'table_rows_persisted',
                                                    table_batch['streamed_schema']['path']) as table_batch_counter:
                            self.LOGGER.info('Writing table batch schema for `{}`...'.format(
                                table_batch['streamed_schema']['path']
                            ))

                            remote_schema = self.upsert_table_helper(connection,
                                                                     table_batch['streamed_schema'],
                                                                     metadata)

                            self._set_metrics_tags__table(table_batch_timer, remote_schema['name'])
                            self._set_metrics_tags__table(table_batch_counter, remote_schema['name'])

                            self.LOGGER.info('Writing table batch with {} rows for `{}`...'.format(
                                len(table_batch['records']),
                                table_batch['streamed_schema']['path']
                            ))

                            batch_rows_persisted = self.write_table_batch(
                                connection,
                                {'remote_schema': remote_schema,
                                 'records': self._serialize_table_records(remote_schema,
                                                                          table_batch['streamed_schema'],
                                                                          table_batch['records'])},
                                metadata)

                            table_batch_counter.increment(batch_rows_persisted)
                            batch_counter.increment(batch_rows_persisted)

                return {
                    'records_persisted': len(records),
                    'rows_persisted': batch_counter.value
                }
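
    # A second variant of `write_batch_helper`, from another implementation:
    # it reports per-table timing through log messages (`time.monotonic` plus
    # a `_duration_millis` helper) instead of metrics timers and counters.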
    def write_batch_helper(self, connection, root_table_name, schema,
                           key_properties, records, metadata):
        """
        Write all `table_batch`s associated with the given `schema` and `records` to remote.

        :param connection: remote connection, type left to be determined by implementing class
        :param root_table_name: string
        :param schema: SingerStreamSchema
        :param key_properties: [string, ...]
        :param records: [{...}, ...]
        :param metadata: additional metadata needed by implementing class
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        batch__timing_start = time.monotonic()

        self.LOGGER.info(
            'Writing batch with {} records for `{}` with `key_properties`: `{}`'
            .format(len(records), root_table_name, key_properties))

        rows_persisted = 0
        for table_batch in denest.to_table_batches(schema, key_properties,
                                                   records):
            table_batch['streamed_schema']['path'] = (
                root_table_name, ) + table_batch['streamed_schema']['path']

            table_batch__schema__timing_start = time.monotonic()

            self.LOGGER.info('Writing table batch schema for `{}`'.format(
                table_batch['streamed_schema']['path']))

            remote_schema = self.upsert_table_helper(
                connection, table_batch['streamed_schema'], metadata)

            self.LOGGER.info(
                'Table batch schema written in {} millis for `{}`'.format(
                    _duration_millis(table_batch__schema__timing_start),
                    table_batch['streamed_schema']['path']))

            table_batch__records__timing_start = time.monotonic()

            self.LOGGER.info(
                'Writing table batch with {} rows for `{}`'.format(
                    len(table_batch['records']),
                    table_batch['streamed_schema']['path']))

            batch_rows_persisted = self.write_table_batch(
                connection,
                {'remote_schema': remote_schema,
                 'records': self._serialize_table_records(
                     remote_schema,
                     table_batch['streamed_schema'],
                     table_batch['records'])},
                metadata)

            self.LOGGER.info(
                'Table batch with {} rows wrote {} rows in {} millis for {}'.format(
                    len(table_batch['records']),
                    batch_rows_persisted,
                    _duration_millis(table_batch__records__timing_start),
                    table_batch['streamed_schema']['path']))

            rows_persisted += batch_rows_persisted

        self.LOGGER.info(
            'Batch with {} records wrote {} rows in {} millis for `{}`'.format(
                len(records), rows_persisted,
                _duration_millis(batch__timing_start), root_table_name))

        return {
            'records_persisted': len(records),
            'rows_persisted': rows_persisted
        }
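
# `_duration_millis` is referenced above but not shown in this snippet; a
# plausible sketch, assuming the start mark comes from `time.monotonic()`:
import time

def _duration_millis(start):
    """Elapsed wall-clock time in whole milliseconds since `start`."""
    return int((time.monotonic() - start) * 1000)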