Example 1
    def setUp(self):
        self.config = test_utils.get_test_config()
        bigquery = DbSync(self.config)

        # Drop target schema
        if self.config['default_target_schema']:
            client = bigquery.open_connection()
            client.delete_dataset(
                self.config['default_target_schema'],
                delete_contents=True,
                not_found_ok=True)
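
A note on the query() helper used throughout these examples: its implementation is not shown in the snippets. Below is a minimal sketch of what such a helper could look like, assuming DbSync exposes a google.cloud.bigquery client via open_connection() as in the setUp above; the project's actual helper may differ.

def query(bigquery_dbsync, sql):
    # Hypothetical helper: run a SQL statement through the DbSync connection
    # and return the rows as plain dicts, which is the shape the assertions
    # in these tests compare against.
    client = bigquery_dbsync.open_connection()  # google.cloud.bigquery.Client
    query_job = client.query(sql)
    return [dict(row.items()) for row in query_job.result()]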
Example 2
    def test_nested_schema_flattening(self):
        """Loading nested JSON objects with flattening and not not flattening"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

        # Turning on data flattening
        self.config['data_flattening_max_level'] = 10

        # Load the tap output with flattening enabled
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        flattened_table = query(bigquery,
            "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

        # Should be flattened columns
        self.assertEqual(
            flattened_table,
            [{
                'c_pk': 1,
                'c_array': '[1, 2, 3]',
                'c_object': None,
                'c_object_with_props__key_1': 'value_1',
                'c_nested_object__nested_prop_1': 'nested_value_1',
                'c_nested_object__nested_prop_2': 'nested_value_2',
                'c_nested_object__nested_prop_3__multi_nested_prop_1': 'multi_value_1',
                'c_nested_object__nested_prop_3__multi_nested_prop_2': 'multi_value_2',
            }])
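
The flattened column names asserted above follow a double-underscore path convention (for example c_nested_object__nested_prop_3__multi_nested_prop_1). The sketch below only illustrates that naming scheme with a recursive walk over nested dicts bounded by data_flattening_max_level; it is not the target's actual flattening code, and details such as array and empty-object handling may differ.

def flatten_record(record, parent_key='', sep='__', max_level=10, level=0):
    # Illustrative flattening: nested dict keys are joined with '__',
    # leaf values are kept as-is.
    items = {}
    for key, value in record.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict) and value and level < max_level:
            items.update(flatten_record(value, new_key, sep, max_level, level + 1))
        else:
            items[new_key] = value
    return items

# flatten_record({'c_nested_object': {'nested_prop_1': 'nested_value_1'}})
# -> {'c_nested_object__nested_prop_1': 'nested_value_1'}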
Example 3
    def test_nested_schema_unflattening(self):
        """Loading nested JSON objects with no props without flattening"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

        # Load with default settings - Flattening disabled
        self.persist_lines(tap_lines)

        # Get loaded rows from tables - Transform JSON to string at query time
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        unflattened_table = query(bigquery, """
            SELECT c_pk
                  , c_array c_array
                  , c_object c_object
                  , c_object c_object_with_props
                  , c_nested_object c_nested_object
              FROM {}.test_table_nested_schema
             ORDER BY c_pk""".format(target_schema))

        # Should be valid nested JSON strings
        self.assertEqual(
            unflattened_table,
            [{
                'c_pk': 1,
                'c_array': '[1, 2, 3]',
                'c_object': '{"key_1": "value_1"}',
                'c_object_with_props': '{"key_1": "value_1"}',
                'c_nested_object': {'nested_prop_1': 'nested_value_1',
                                    'nested_prop_2': 'nested_value_2',
                                    'nested_prop_3': {'multi_nested_prop_1': 'multi_value_1',
                                                      'multi_nested_prop_2': 'multi_value_2'}},
            }])
Example 4
    def setUp(self):
        self.config = get_test_config()
        bigquery = DbSync(self.config)

        # Drop target schema
        if self.config['default_target_schema']:
            bigquery.client.delete_dataset(
                self.config['default_target_schema'],
                delete_contents=True,
                not_found_ok=True)
Example 5
    def test_table_with_no_pk(self):
        """Tests table with a primary key gets clustered on those fields"""
        tap_lines = test_utils.get_test_tap_lines('table_with_no_pk.json')
        self.config['primary_key_required'] = False
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_no_pk ORDER BY c_id".format(target_schema))
        self.assertEqual(len(table), 2)
Example 6
    def assert_logical_streams_are_in_bigquery_and_are_empty(self):
        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_one = query(bigquery, "SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
        table_two = query(bigquery, "SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
        table_three = query(bigquery, "SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
        table_four = query(bigquery, "SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))

        self.assertEqual(table_one, [])
        self.assertEqual(table_two, [])
        self.assertEqual(table_three, [])
        self.assertEqual(table_four, [])
Example 7
    def test_table_with_pk_multi_column_removed(self):
        """Test table with a pk with multiple columns gets clustered by those and removing the pk doesnt cause errors"""
        tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster.json')
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
        cluster_columns = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster_multi' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table
        # ----------------------------------------------------------------------
        expected_table = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        expected_cluster_columns = [
            {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
            {'clustering_ordinal_position': 2, 'column_name': 'c_varchar'}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
        self.assertEqual(cluster_columns, expected_cluster_columns)

        # ----------------------------------------------------------------------
        # Remove the primary key and check if clustering stayed unchanged
        # ----------------------------------------------------------------------
        self.config['primary_key_required'] = False
        tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster_changed.json')
        self.persist_lines(tap_lines)

        table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
        cluster_columns_changed = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster_multi' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        expected_table_changed = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
        self.assertEqual(cluster_columns_changed, expected_cluster_columns)
Example 8
    def test_column_name_change(self):
        """Tests correct renaming of bigquery columns after source change"""
        tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
        tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
            'messages-with-three-streams-modified-column.json')

        # Load with default settings
        self.persist_lines(tap_lines_before_column_name_change)
        self.persist_lines(tap_lines_after_column_name_change)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
        table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
        table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

        # Table one should have no changes
        self.assertEqual(
            table_one,
            [{'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}])

        # Table two should have versioned column
        self.assertEqual(
            table_two,
            [
                {'c_int': 1, 'c_pk': 1,
                 'c_varchar': '1', 'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc), 'c_date__st': None},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
                 'c_date': datetime.datetime(2019, 2, 10, 2, tzinfo=timezone.utc), 'c_date__st': '2019-02-12 02:00:00'},
                {'c_int': 3, 'c_pk': 3, 'c_varchar': '2', 'c_date': None, 'c_date__st': '2019-02-15 02:00:00'}
            ]
        )

        # Table three should have renamed columns
        self.assertEqual(
            table_three,
            [
                {'c_int': 1, 'c_pk': 1, 'c_time': datetime.time(4, 0), 'c_varchar': '1', 'c_time_renamed': None},
                {'c_int': 2, 'c_pk': 2, 'c_time': datetime.time(7, 15), 'c_varchar': '2', 'c_time_renamed': None},
                {'c_int': 3, 'c_pk': 3, 'c_time': datetime.time(23, 0, 3), 'c_varchar': '3',
                 'c_time_renamed': datetime.time(8, 15)},
                {'c_int': 4, 'c_pk': 4, 'c_time': None, 'c_varchar': '4', 'c_time_renamed': datetime.time(23, 0, 3)}
            ])
Example 9
    def test_table_with_pk_adds_clustering(self):
        """Tests table with a primary key gets clustered on those fields"""
        tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster.json')
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
        cluster_columns = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table
        # ----------------------------------------------------------------------
        expected_table = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        expected_cluster_columns = [
            {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
        self.assertEqual(cluster_columns, expected_cluster_columns)

        # ----------------------------------------------------------------------
        # Change the primary key and check if clustering stayed unchanged
        # ----------------------------------------------------------------------
        tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster_changed.json')
        self.persist_lines(tap_lines)

        table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
        cluster_columns_changed = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        expected_table_changed = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': 'c', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': 'c', 'c_date': datetime.datetime(2022, 5, 15, 5, 0, 0, tzinfo=timezone.utc)}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
        self.assertEqual(cluster_columns_changed, expected_cluster_columns)
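
The two clustering tests above read clustering_ordinal_position from INFORMATION_SCHEMA.COLUMNS to verify that the primary-key columns were used as clustering fields. As a hedged sketch, clustering fields can be set at table creation time through the BigQuery client library, roughly as below; this is not the target's actual table-creation code.

from google.cloud import bigquery

def create_clustered_table(client, dataset, table_name, schema, cluster_fields):
    # Sketch: create a table clustered on the given fields (typically the
    # stream's primary-key columns), assuming a google.cloud.bigquery.Client.
    table = bigquery.Table('{}.{}.{}'.format(client.project, dataset, table_name),
                           schema=schema)
    table.clustering_fields = cluster_fields  # e.g. ['c_pk', 'c_varchar']
    return client.create_table(table, exists_ok=True)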
Example 10
    def test_non_db_friendly_columns(self):
        """Loading non-db friendly columns like, camelcase, minus signs, etc."""
        tap_lines = test_utils.get_test_tap_lines('messages-with-non-db-friendly-columns.json')

        # Load with default settings
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_non_db_friendly_columns = query(bigquery,
            "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk".format(target_schema))

        self.assertEqual(
            table_non_db_friendly_columns,
            [
                {'c_pk': 1, 'camelcasecolumn': 'Dummy row 1', 'minus_column': 'Dummy row 1'},
                {'c_pk': 2, 'camelcasecolumn': 'Dummy row 2', 'minus_column': 'Dummy row 2'},
                {'c_pk': 3, 'camelcasecolumn': 'Dummy row 3', 'minus_column': 'Dummy row 3'},
                {'c_pk': 4, 'camelcasecolumn': 'Dummy row 4', 'minus_column': 'Dummy row 4'},
                {'c_pk': 5, 'camelcasecolumn': 'Dummy row 5', 'minus_column': 'Dummy row 5'},
            ])
Example 11
    def test_loading_unicode_characters(self):
        """Loading unicode encoded characters"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-unicode-characters.json')

        # Load with default settings
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_unicode = query(bigquery, "SELECT * FROM {}.test_table_unicode ORDER BY c_int".format(target_schema))

        self.assertEqual(
            table_unicode,
            [
                {'c_int': 1, 'c_pk': 1, 'c_varchar': 'Hello world, Καλημέρα κόσμε, コンニチハ'},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': 'Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.'},
                {'c_int': 3, 'c_pk': 3,
                 'c_varchar': 'Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по'},
                {'c_int': 4, 'c_pk': 4, 'c_varchar': 'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช'},
                {'c_int': 5, 'c_pk': 5, 'c_varchar': 'Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط'},
                {'c_int': 6, 'c_pk': 6, 'c_varchar': 'Special Characters: [",\'!@£$%^&*()]'}
            ])
Example 12
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception(
                "Line is missing required key 'type': {}".format(line))

        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            adjust_timestamps_in_record(o['record'], schemas[stream])

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n'multipleOf' validations that allows long precisions are not supported (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                    raise ex

            primary_key_string = stream_to_sync[
                stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][
                    primary_key_string] = add_metadata_values_to_record(
                        o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count,
                                              stream_to_sync, config, state,
                                              flushed_state)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-bigquery config.json
            #  or
            #  2) Use fastsync [postgres-to-bigquery, mysql-to-bigquery, etc.]
            if config.get('primary_key_required', True) and len(
                    o['key_properties']) == 0:
                logger.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream"
                    .format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(
                    config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            try:
                stream_to_sync[stream].create_schema_if_not_exists()
                stream_to_sync[stream].sync_table()
            except Exception as e:
                logger.error("""
                    Cannot sync table structure in BigQuery schema: {} .
                """.format(stream_to_sync[stream].schema_name))
                raise e

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(
                o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count,
                                      stream_to_sync, config, state,
                                      flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
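
persist_lines above drives its behaviour from the config dict. The following is an illustrative config carrying the keys referenced in these examples; the values are examples only, not documented defaults.

config = {
    'default_target_schema': 'my_dataset',  # BigQuery dataset the tables are loaded into
    'batch_size_rows': 100000,              # per-stream flush threshold
    'flush_all_streams': False,             # if True, a flush covers every stream
    'add_metadata_columns': False,          # add metadata columns to each record
    'hard_delete': False,                   # physically delete rows flagged as deleted
    'primary_key_required': True,           # reject streams that have no primary key
}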
Example 13
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    row_count = {}
    flush_timestamp = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)
    default_hard_delete = config.get('hard_delete', DEFAULT_HARD_DELETE)
    hard_delete_mapping = config.get('hard_delete_mapping', {})
    batch_wait_limit_seconds = config.get('batch_wait_limit_seconds', None)

    # Loop over lines from stdin
    for line in lines:
        # Check to see if any streams should be flushed based on time.
        # This assumes that each individual record takes a negligible period
        # of time to be processed.
        streams_to_flush_timestamp = set()
        if batch_wait_limit_seconds:
            streams_to_flush_timestamp = {
                stream for stream, timestamp in flush_timestamp.items()
                if datetime.utcnow() >= timestamp + timedelta(seconds=batch_wait_limit_seconds)
            }

        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            LOGGER.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))

        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            stream_utils.adjust_timestamps_in_record(o['record'], schemas[stream])

            # Validate record
            if config.get('validate_records'):
                try:
                    validators[stream].validate(stream_utils.float_to_decimal(o['record']))
                except Exception as ex:
                    if type(ex).__name__ == "InvalidOperation":
                        raise InvalidValidationOperationException(
                            f"Data validation failed and cannot load to destination. RECORD: {o['record']}\n"
                            "multipleOf validations that allows long precisions are not supported (i.e. with 15 digits"
                            "or more) Try removing 'multipleOf' methods from JSON schema.")
                    raise RecordValidationException(f"Record does not pass schema validation. RECORD: {o['record']}")

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or hard_delete_mapping.get(stream, default_hard_delete):
                records_to_load[stream][primary_key_string] = stream_utils.add_metadata_values_to_record(o)
            else:
                records_to_load[stream][primary_key_string] = o['record']

            flush = False
            if row_count[stream] >= batch_size_rows:
                flush = True
                LOGGER.info("Flush triggered by batch_size_rows (%s) reached in %s",
                             batch_size_rows, stream)
            elif streams_to_flush_timestamp:
                flush = True
                LOGGER.info("Flush triggered by batch_wait_limit_seconds (%s)",
                            batch_wait_limit_seconds)

            if flush:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = list(streams_to_flush_timestamp | {stream})

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state, flushed_timestamps = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                    filter_streams=filter_streams)

                flush_timestamp.update(flushed_timestamps)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = stream_utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = list(streams_to_flush_timestamp | {stream})

                flushed_state, flushed_timestamps = flush_streams(
                    records_to_load, row_count, stream_to_sync, config, state, flushed_state, filter_streams=filter_streams
                )

                flush_timestamp.update(flushed_timestamps)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-bigquery config.json
            #  or
            #  2) Use fastsync [postgres-to-bigquery, mysql-to-bigquery, etc.]
            if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
                LOGGER.critical("Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or hard_delete_mapping.get(stream, default_hard_delete):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            try:
                stream_to_sync[stream].create_schema_if_not_exists()
                stream_to_sync[stream].sync_table()
            except Exception as e:
                LOGGER.error("""
                    Cannot sync table structure in BigQuery schema: {} .
                """.format(
                    stream_to_sync[stream].schema_name))
                raise e

            records_to_load[stream] = {}
            row_count[stream] = 0
            total_row_count[stream] = 0
            flush_timestamp[stream] = datetime.utcnow()

        elif t == 'ACTIVATE_VERSION':
            stream = o['stream']
            version = o['version']

            if hard_delete_mapping.get(stream, default_hard_delete):
                if stream in stream_to_sync:
                    LOGGER.debug('ACTIVATE_VERSION message, clearing records with versions other than {}'.format(version))
                    stream_to_sync[stream].activate_table_version(stream, version)
                else:
                    LOGGER.warning('ACTIVATE_VERSION message, unknown stream {}'.format(stream))
            else:
                LOGGER.debug('ACTIVATE_VERSION message - ignoring because hard_delete is not set')

        elif t == 'STATE':
            LOGGER.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}"
                            .format(o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state, _ = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
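
A hedged usage sketch of the message sequence persist_lines expects: a SCHEMA message (with key_properties) must arrive before RECORD messages for its stream, and STATE messages update the bookmark that gets emitted after a flush. The stream name and schema below are illustrative only.

import json

messages = [
    {'type': 'SCHEMA', 'stream': 'example_stream',
     'schema': {'properties': {'c_pk': {'type': ['integer']},
                               'c_varchar': {'type': ['null', 'string']}}},
     'key_properties': ['c_pk']},
    {'type': 'RECORD', 'stream': 'example_stream',
     'record': {'c_pk': 1, 'c_varchar': '1'}},
    {'type': 'STATE', 'value': {'bookmarks': {'example_stream': {'replication_key_value': 1}}}},
]

# persist_lines(config, [json.dumps(m) for m in messages])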
Example 14
    def assert_three_streams_are_into_bigquery(self, should_metadata_columns_exist=False,
                                                should_hard_deleted_rows=False):
        """
        This is a helper assertion that checks if every data from the message-with-three-streams.json
        file is available in BigQuery tables correctly.
        Useful to check different loading methods (unencrypted, Client-Side encryption, gzip, etc.)
        without duplicating assertions
        """
        bigquery = DbSync(self.config)
        default_target_schema = self.config.get('default_target_schema', '')
        schema_mapping = self.config.get('schema_mapping', {})

        # Identify target schema name
        target_schema = None
        if default_target_schema is not None and default_target_schema.strip():
            target_schema = default_target_schema
        elif schema_mapping:
            target_schema = "tap_mysql_test"

        # Get loaded rows from tables
        table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
        table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
        table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table_one
        # ----------------------------------------------------------------------
        expected_table_one = [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}
        ]

        self.assertEqual(
            self.remove_metadata_columns_from_rows(table_one), expected_table_one)

        # ----------------------------------------------------------------------
        # Check rows in table_two
        # ----------------------------------------------------------------------
        expected_table_two = []
        if not should_hard_deleted_rows:
            expected_table_two = [
                {'c_int': 1, 'c_pk': 1, 'c_varchar': '1', 'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc)},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 10, 2, 0, 0, tzinfo=timezone.utc)}
            ]
        else:
            expected_table_two = [
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 10, 2, 0, 0, tzinfo=timezone.utc)}
            ]

        self.assertEqual(
            self.remove_metadata_columns_from_rows(table_two), expected_table_two)

        # ----------------------------------------------------------------------
        # Check rows in table_three
        # ----------------------------------------------------------------------
        expected_table_three = []
        if not should_hard_deleted_rows:
            expected_table_three = [
                {'c_int': 1, 'c_pk': 1, 'c_varchar': '1', 'c_time': datetime.time(4, 0, 0)},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_time': datetime.time(7, 15, 0)},
                {'c_int': 3, 'c_pk': 3, 'c_varchar': '3', 'c_time': datetime.time(23, 0, 3)}
            ]
        else:
            expected_table_three = [
                {'c_int': 1, 'c_pk': 1, 'c_varchar': '1', 'c_time': datetime.time(4, 0, 0)},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_time': datetime.time(7, 15, 0)}
            ]

        self.assertEqual(
            self.remove_metadata_columns_from_rows(table_three), expected_table_three)

        # ----------------------------------------------------------------------
        # Check if metadata columns exist or not
        # ----------------------------------------------------------------------
        if should_metadata_columns_exist:
            self.assert_metadata_columns_exist(table_one)
            self.assert_metadata_columns_exist(table_two)
            self.assert_metadata_columns_exist(table_three)
        else:
            self.assert_metadata_columns_not_exist(table_one)
            self.assert_metadata_columns_not_exist(table_two)
            self.assert_metadata_columns_not_exist(table_three)
Example 15
    def assert_logical_streams_are_in_bigquery(self, should_metadata_columns_exist=False):
        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_one = query(bigquery, "SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
        table_two = query(bigquery, "SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
        table_three = query(bigquery, "SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
        table_four = query(bigquery, "SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table_one
        # ----------------------------------------------------------------------
        expected_table_one = [
            {'cid': 1, 'cvarchar': "inserted row", 'cvarchar2': None},
            {'cid': 2, 'cvarchar': 'inserted row', "cvarchar2": "inserted row"},
            {'cid': 3, 'cvarchar': "inserted row", 'cvarchar2': "inserted row"},
            {'cid': 4, 'cvarchar': "inserted row", 'cvarchar2': "inserted row"}
        ]

        # ----------------------------------------------------------------------
        # Check rows in table_two
        # ----------------------------------------------------------------------
        expected_table_two = [
            {'cid': 1, 'cvarchar': "updated row"},
            {'cid': 2, 'cvarchar': 'updated row'},
            {'cid': 3, 'cvarchar': "updated row"},
            {'cid': 5, 'cvarchar': "updated row"},
            {'cid': 7, 'cvarchar': "updated row"},
            {'cid': 8, 'cvarchar': 'updated row'},
            {'cid': 9, 'cvarchar': "updated row"},
            {'cid': 10, 'cvarchar': 'updated row'}
        ]

        # ----------------------------------------------------------------------
        # Check rows in table_three
        # ----------------------------------------------------------------------
        expected_table_three = [
            {'cid': 1, 'cvarchar': "updated row"},
            {'cid': 2, 'cvarchar': 'updated row'},
            {'cid': 3, 'cvarchar': "updated row"},
        ]

        # ----------------------------------------------------------------------
        # Check rows in table_four
        # ----------------------------------------------------------------------
        expected_table_four = [
            {'cid': 1, 'ctimentz': None, 'ctimetz': None},
            {'cid': 2, 'ctimentz': datetime.time(23, 0, 15), 'ctimetz': datetime.time(23, 0, 15)},
            {'cid': 3, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(12, 0, 15)},
            {'cid': 4, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(9, 0, 15)},
            {'cid': 5, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(15, 0, 15)},
            {'cid': 6, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(0, 0)},
            {'cid': 8, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(1, 0)},
            {'cid': 9, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(0, 0)}
        ]

        if should_metadata_columns_exist:
            self.assertEqual(self.remove_metadata_columns_from_rows(table_one), expected_table_one)
            self.assertEqual(self.remove_metadata_columns_from_rows(table_two), expected_table_two)
            self.assertEqual(self.remove_metadata_columns_from_rows(table_three), expected_table_three)
            self.assertEqual(table_four, expected_table_four)
        else:
            self.assertEqual(table_one, expected_table_one)
            self.assertEqual(table_two, expected_table_two)
            self.assertEqual(table_three, expected_table_three)
            self.assertEqual(table_four, expected_table_four)