def setUp(self):
    self.config = test_utils.get_test_config()
    bigquery = DbSync(self.config)

    # Drop target schema
    if self.config['default_target_schema']:
        client = bigquery.open_connection()
        client.delete_dataset(
            self.config['default_target_schema'],
            delete_contents=True,
            not_found_ok=True)
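# Minimal sketch of the query() helper these tests rely on. Assumed signature:
# query(bigquery: DbSync, sql: str) -> list[dict], and that DbSync.open_connection()
# returns a google.cloud.bigquery.Client; the real helper in the test suite may differ.
def query(bigquery, sql):
    client = bigquery.open_connection()
    # Run the query synchronously and materialise each row as a plain dict
    return [dict(row.items()) for row in client.query(sql).result()]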
def test_nested_schema_flattening(self):
    """Loading nested JSON objects with flattening enabled"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

    # Turning on data flattening
    self.config['data_flattening_max_level'] = 10

    # Load with data flattening enabled
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    flattened_table = query(bigquery,
                            "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

    # Should be flattened columns
    self.assertEqual(
        flattened_table,
        [{
            'c_pk': 1,
            'c_array': '[1, 2, 3]',
            'c_object': None,
            'c_object_with_props__key_1': 'value_1',
            'c_nested_object__nested_prop_1': 'nested_value_1',
            'c_nested_object__nested_prop_2': 'nested_value_2',
            'c_nested_object__nested_prop_3__multi_nested_prop_1': 'multi_value_1',
            'c_nested_object__nested_prop_3__multi_nested_prop_2': 'multi_value_2',
        }])
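# Illustrative sketch only: how nested objects map to the double-underscore column
# names asserted above. The target's real flattening lives in its db_sync helpers and
# honours data_flattening_max_level; this is just the idea, not that implementation.
def flatten_record(record, parent_key='', sep='__', max_level=10, level=0):
    items = {}
    for key, value in record.items():
        new_key = f'{parent_key}{sep}{key}' if parent_key else key
        if isinstance(value, dict) and level < max_level:
            # Recurse into nested objects, joining key paths with '__'
            items.update(flatten_record(value, new_key, sep, max_level, level + 1))
        else:
            items[new_key] = value
    return items

# Example: flatten_record({'c_nested_object': {'nested_prop_1': 'x'}})
#          -> {'c_nested_object__nested_prop_1': 'x'}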
def test_nested_schema_unflattening(self):
    """Loading nested JSON objects without flattening"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

    # Load with default settings - flattening disabled
    self.persist_lines(tap_lines)

    # Get loaded rows from tables - Transform JSON to string at query time
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    unflattened_table = query(bigquery, """
        SELECT c_pk
             , c_array c_array
             , c_object c_object
             , c_object c_object_with_props
             , c_nested_object c_nested_object
          FROM {}.test_table_nested_schema
         ORDER BY c_pk""".format(target_schema))

    # Should be valid nested JSON strings
    self.assertEqual(
        unflattened_table,
        [{
            'c_pk': 1,
            'c_array': '[1, 2, 3]',
            'c_object': '{"key_1": "value_1"}',
            'c_object_with_props': '{"key_1": "value_1"}',
            'c_nested_object': {'nested_prop_1': 'nested_value_1',
                                'nested_prop_2': 'nested_value_2',
                                'nested_prop_3': {'multi_nested_prop_1': 'multi_value_1',
                                                  'multi_nested_prop_2': 'multi_value_2'}},
        }])
def setUp(self):
    self.config = get_test_config()
    bigquery = DbSync(self.config)

    # Drop target schema
    if self.config['default_target_schema']:
        bigquery.client.delete_dataset(
            self.config['default_target_schema'],
            delete_contents=True,
            not_found_ok=True)
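# Hedged sketch of what get_test_config() might return. The environment variable
# names below are assumptions for illustration, not necessarily the project's
# actual ones; the real helper reads the integration-test credentials.
import os

def get_test_config():
    return {
        'project_id': os.environ.get('TARGET_BIGQUERY_PROJECT'),            # assumed env var name
        'default_target_schema': os.environ.get('TARGET_BIGQUERY_SCHEMA'),  # assumed env var name
    }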
def test_table_with_no_pk(self):
    """Tests loading a table that has no primary key"""
    tap_lines = test_utils.get_test_tap_lines('table_with_no_pk.json')
    self.config['primary_key_required'] = False
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_no_pk ORDER BY c_id".format(target_schema))
    self.assertEqual(len(table), 2)
def assert_logical_streams_are_in_bigquery_and_are_empty(self):
    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_one = query(bigquery, "SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
    table_two = query(bigquery, "SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
    table_three = query(bigquery, "SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
    table_four = query(bigquery,
                       "SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata "
                       "WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))

    self.assertEqual(table_one, [])
    self.assertEqual(table_two, [])
    self.assertEqual(table_three, [])
    self.assertEqual(table_four, [])
def test_table_with_pk_multi_column_removed(self):
    """Tests that a table with a multi-column primary key gets clustered on those columns
    and that removing the primary key doesn't cause errors"""
    tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster.json')
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
    cluster_columns = query(bigquery,
                            "SELECT clustering_ordinal_position, column_name "
                            "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                            "WHERE table_name = 'test_table_cluster_multi' "
                            "AND clustering_ordinal_position > 0 "
                            "ORDER BY 1".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table
    # ----------------------------------------------------------------------
    expected_table = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    expected_cluster_columns = [
        {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
        {'clustering_ordinal_position': 2, 'column_name': 'c_varchar'}
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
    self.assertEqual(cluster_columns, expected_cluster_columns)

    # ----------------------------------------------------------------------
    # Remove the primary key and check that clustering stayed unchanged
    # ----------------------------------------------------------------------
    self.config['primary_key_required'] = False
    tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster_changed.json')
    self.persist_lines(tap_lines)

    table_changed = query(bigquery,
                          "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
    cluster_columns_changed = query(bigquery,
                                    "SELECT clustering_ordinal_position, column_name "
                                    "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                                    "WHERE table_name = 'test_table_cluster_multi' "
                                    "AND clustering_ordinal_position > 0 "
                                    "ORDER BY 1".format(target_schema))

    expected_table_changed = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)

    # Clustering should remain on the original primary key columns
    self.assertEqual(cluster_columns_changed, expected_cluster_columns)
def test_column_name_change(self):
    """Tests correct renaming of BigQuery columns after a source column change"""
    tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
    tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
        'messages-with-three-streams-modified-column.json')

    # Load with default settings
    self.persist_lines(tap_lines_before_column_name_change)
    self.persist_lines(tap_lines_after_column_name_change)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
    table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
    table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

    # Table one should have no changes
    self.assertEqual(
        table_one,
        [{'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}])

    # Table two should have a versioned column
    self.assertEqual(
        table_two,
        [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1',
             'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc), 'c_date__st': None},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
             'c_date': datetime.datetime(2019, 2, 10, 2, tzinfo=timezone.utc), 'c_date__st': '2019-02-12 02:00:00'},
            {'c_int': 3, 'c_pk': 3, 'c_varchar': '2',
             'c_date': None, 'c_date__st': '2019-02-15 02:00:00'}
        ]
    )

    # Table three should have renamed columns
    self.assertEqual(
        table_three,
        [
            {'c_int': 1, 'c_pk': 1, 'c_time': datetime.time(4, 0), 'c_varchar': '1', 'c_time_renamed': None},
            {'c_int': 2, 'c_pk': 2, 'c_time': datetime.time(7, 15), 'c_varchar': '2', 'c_time_renamed': None},
            {'c_int': 3, 'c_pk': 3, 'c_time': datetime.time(23, 0, 3), 'c_varchar': '3',
             'c_time_renamed': datetime.time(8, 15)},
            {'c_int': 4, 'c_pk': 4, 'c_time': None, 'c_varchar': '4',
             'c_time_renamed': datetime.time(23, 0, 3)}
        ])
def test_table_with_pk_adds_clustering(self):
    """Tests table with a primary key gets clustered on those fields"""
    tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster.json')
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
    cluster_columns = query(bigquery,
                            "SELECT clustering_ordinal_position, column_name "
                            "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                            "WHERE table_name = 'test_table_cluster' "
                            "AND clustering_ordinal_position > 0 "
                            "ORDER BY 1".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table
    # ----------------------------------------------------------------------
    expected_table = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    expected_cluster_columns = [
        {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
    self.assertEqual(cluster_columns, expected_cluster_columns)

    # ----------------------------------------------------------------------
    # Change the primary key and check if clustering stayed unchanged
    # ----------------------------------------------------------------------
    tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster_changed.json')
    self.persist_lines(tap_lines)

    table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
    cluster_columns_changed = query(bigquery,
                                    "SELECT clustering_ordinal_position, column_name "
                                    "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                                    "WHERE table_name = 'test_table_cluster' "
                                    "AND clustering_ordinal_position > 0 "
                                    "ORDER BY 1".format(target_schema))

    expected_table_changed = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': 'c',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': 'c',
         'c_date': datetime.datetime(2022, 5, 15, 5, 0, 0, tzinfo=timezone.utc)}
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
    self.assertEqual(cluster_columns_changed, expected_cluster_columns)
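# Hedged sketch (not the target's actual implementation): how clustering on the
# primary-key columns could be applied with google-cloud-bigquery when a table is
# first created. DbSync's real table-creation logic may differ.
from google.cloud import bigquery as gcp_bigquery

def create_clustered_table(client, dataset, table_name, schema, primary_keys):
    table = gcp_bigquery.Table(f'{client.project}.{dataset}.{table_name}', schema=schema)
    # BigQuery allows at most four clustering columns
    table.clustering_fields = primary_keys[:4]
    return client.create_table(table, exists_ok=True)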
def test_non_db_friendly_columns(self):
    """Loading non-db friendly columns like camelcase, minus signs, etc."""
    tap_lines = test_utils.get_test_tap_lines('messages-with-non-db-friendly-columns.json')

    # Load with default settings
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_non_db_friendly_columns = query(
        bigquery,
        "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk".format(target_schema))

    self.assertEqual(
        table_non_db_friendly_columns,
        [
            {'c_pk': 1, 'camelcasecolumn': 'Dummy row 1', 'minus_column': 'Dummy row 1'},
            {'c_pk': 2, 'camelcasecolumn': 'Dummy row 2', 'minus_column': 'Dummy row 2'},
            {'c_pk': 3, 'camelcasecolumn': 'Dummy row 3', 'minus_column': 'Dummy row 3'},
            {'c_pk': 4, 'camelcasecolumn': 'Dummy row 4', 'minus_column': 'Dummy row 4'},
            {'c_pk': 5, 'camelcasecolumn': 'Dummy row 5', 'minus_column': 'Dummy row 5'},
        ])
def test_loading_unicode_characters(self):
    """Loading unicode encoded characters"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-unicode-characters.json')

    # Load with default settings
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_unicode = query(bigquery, "SELECT * FROM {}.test_table_unicode ORDER BY c_int".format(target_schema))

    self.assertEqual(
        table_unicode,
        [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': 'Hello world, Καλημέρα κόσμε, コンニチハ'},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': 'Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.'},
            {'c_int': 3, 'c_pk': 3,
             'c_varchar': 'Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по'},
            {'c_int': 4, 'c_pk': 4, 'c_varchar': 'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช'},
            {'c_int': 5, 'c_pk': 5, 'c_varchar': 'Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط'},
            {'c_int': 6, 'c_pk': 6, 'c_varchar': 'Special Characters: [",\'!@£$%^&*()]'}
        ])
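# Minimal sketch of the remove_metadata_columns_from_rows() helper used by the
# clustering and three-stream assertions. It assumes Singer metadata columns are
# prefixed with '_sdc_', which may not match the project's exact implementation.
def remove_metadata_columns_from_rows(self, rows):
    return [
        {column: value for column, value in row.items() if not column.startswith('_sdc_')}
        for row in rows
    ]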
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))
        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            adjust_timestamps_in_record(o['record'], schemas[stream])

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allow long precisions are not supported "
                        "(i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                raise ex

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][primary_key_string] = add_metadata_values_to_record(
                    o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            # 1) Set ` 'primary_key_required': false ` in the target-bigquery config.json
            # or
            # 2) Use fastsync [postgres-to-bigquery, mysql-to-bigquery, etc.]
            if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
                logger.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            try:
                stream_to_sync[stream].create_schema_if_not_exists()
                stream_to_sync[stream].sync_table()
            except Exception as e:
                logger.error("""
                    Cannot sync table structure in BigQuery schema: {} .
                """.format(stream_to_sync[stream].schema_name))
                raise e

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
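# Hedged sketch of how persist_lines() is typically driven from a Singer target's
# entry point. The argument parsing and the project's actual main() may differ;
# this only illustrates that taps write messages to stdout and the target reads
# them from stdin.
import argparse
import io
import json
import sys

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='Config file', required=True)
    args = parser.parse_args()

    with open(args.config) as config_file:
        config = json.load(config_file)

    # Read Singer messages line by line from stdin
    singer_messages = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    persist_lines(config, singer_messages)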
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    row_count = {}
    flush_timestamp = {}
    stream_to_sync = {}
    total_row_count = {}
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)
    default_hard_delete = config.get('hard_delete', DEFAULT_HARD_DELETE)
    hard_delete_mapping = config.get('hard_delete_mapping', {})
    batch_wait_limit_seconds = config.get('batch_wait_limit_seconds', None)

    # Loop over lines from stdin
    for line in lines:
        # Check to see if any streams should be flushed based on time.
        # This assumes that each individual record takes a negligible period
        # of time to be processed.
        streams_to_flush_timestamp = set()
        if batch_wait_limit_seconds:
            streams_to_flush_timestamp = {
                stream for stream, timestamp in flush_timestamp.items()
                if datetime.utcnow() >= timestamp + timedelta(seconds=batch_wait_limit_seconds)
            }

        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            LOGGER.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))
        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            stream_utils.adjust_timestamps_in_record(o['record'], schemas[stream])

            # Validate record
            if config.get('validate_records'):
                try:
                    validators[stream].validate(stream_utils.float_to_decimal(o['record']))
                except Exception as ex:
                    if type(ex).__name__ == "InvalidOperation":
                        raise InvalidValidationOperationException(
                            f"Data validation failed and cannot load to destination. RECORD: {o['record']}\n"
                            "'multipleOf' validations that allow long precisions are not supported "
                            "(i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.")
                    raise RecordValidationException(
                        f"Record does not pass schema validation. RECORD: {o['record']}")

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or hard_delete_mapping.get(stream, default_hard_delete):
                records_to_load[stream][primary_key_string] = stream_utils.add_metadata_values_to_record(o)
            else:
                records_to_load[stream][primary_key_string] = o['record']

            flush = False
            if row_count[stream] >= batch_size_rows:
                flush = True
                LOGGER.info("Flush triggered by batch_size_rows (%s) reached in %s",
                            batch_size_rows, stream)
            elif streams_to_flush_timestamp:
                flush = True
                LOGGER.info("Flush triggered by batch_wait_limit_seconds (%s)",
                            batch_wait_limit_seconds)

            if flush:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = list(streams_to_flush_timestamp | {stream})

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state, flushed_timestamps = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                    filter_streams=filter_streams)
                flush_timestamp.update(flushed_timestamps)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = stream_utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(schemas[stream], format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = list(streams_to_flush_timestamp | {stream})

                flushed_state, flushed_timestamps = flush_streams(
                    records_to_load,
                    row_count,
                    stream_to_sync,
                    config,
                    state,
                    flushed_state,
                    filter_streams=filter_streams)
                flush_timestamp.update(flushed_timestamps)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            # 1) Set ` 'primary_key_required': false ` in the target-bigquery config.json
            # or
            # 2) Use fastsync [postgres-to-bigquery, mysql-to-bigquery, etc.]
            if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
                LOGGER.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or hard_delete_mapping.get(stream, default_hard_delete):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            try:
                stream_to_sync[stream].create_schema_if_not_exists()
                stream_to_sync[stream].sync_table()
            except Exception as e:
                LOGGER.error("""
                    Cannot sync table structure in BigQuery schema: {} .
                """.format(stream_to_sync[stream].schema_name))
                raise e

            records_to_load[stream] = {}
            row_count[stream] = 0
            total_row_count[stream] = 0
            flush_timestamp[stream] = datetime.utcnow()

        elif t == 'ACTIVATE_VERSION':
            stream = o['stream']
            version = o['version']

            if hard_delete_mapping.get(stream, default_hard_delete):
                if stream in stream_to_sync:
                    LOGGER.debug('ACTIVATE_VERSION message, clearing records with versions other than {}'
                                 .format(version))
                    stream_to_sync[stream].activate_table_version(stream, version)
                else:
                    LOGGER.warning('ACTIVATE_VERSION message, unknown stream {}'.format(stream))
            else:
                LOGGER.debug('ACTIVATE_VERSION message - ignoring because hard_delete is not set')

        elif t == 'STATE':
            LOGGER.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state, _ = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
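# Illustrative config snippet showing the options read by the newer persist_lines()
# above (batch_size_rows, batch_wait_limit_seconds, hard_delete, hard_delete_mapping,
# flush_all_streams, validate_records). Keys such as project_id and the example
# values are assumptions for illustration only.
EXAMPLE_CONFIG = {
    'project_id': 'my-gcp-project',                  # assumed key required by DbSync
    'default_target_schema': 'my_dataset',
    'batch_size_rows': 100000,                       # flush a stream after this many rows
    'batch_wait_limit_seconds': 300,                 # also flush streams that waited this long
    'hard_delete': False,                            # default for streams not in the mapping
    'hard_delete_mapping': {'public-orders': True},  # per-stream hard delete override
    'flush_all_streams': False,
    'add_metadata_columns': True,
    'primary_key_required': True,
    'validate_records': False,
}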
def assert_three_streams_are_into_bigquery(self,
                                           should_metadata_columns_exist=False,
                                           should_hard_deleted_rows=False):
    """
    This is a helper assertion that checks that every piece of data from the
    messages-with-three-streams.json file is loaded into BigQuery correctly.
    Useful to check different loading methods (unencrypted, Client-Side
    encryption, gzip, etc.) without duplicating assertions
    """
    bigquery = DbSync(self.config)
    default_target_schema = self.config.get('default_target_schema', '')
    schema_mapping = self.config.get('schema_mapping', {})

    # Identify target schema name
    target_schema = None
    if default_target_schema is not None and default_target_schema.strip():
        target_schema = default_target_schema
    elif schema_mapping:
        target_schema = "tap_mysql_test"

    # Get loaded rows from tables
    table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
    table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
    table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table_one
    # ----------------------------------------------------------------------
    expected_table_one = [
        {'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}
    ]

    self.assertEqual(
        self.remove_metadata_columns_from_rows(table_one), expected_table_one)

    # ----------------------------------------------------------------------
    # Check rows in table_two
    # ----------------------------------------------------------------------
    expected_table_two = []
    if not should_hard_deleted_rows:
        expected_table_two = [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1',
             'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc)},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
             'c_date': datetime.datetime(2019, 2, 10, 2, 0, 0, tzinfo=timezone.utc)}
        ]
    else:
        expected_table_two = [
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
             'c_date': datetime.datetime(2019, 2, 10, 2, 0, 0, tzinfo=timezone.utc)}
        ]

    self.assertEqual(
        self.remove_metadata_columns_from_rows(table_two), expected_table_two)

    # ----------------------------------------------------------------------
    # Check rows in table_three
    # ----------------------------------------------------------------------
    expected_table_three = []
    if not should_hard_deleted_rows:
        expected_table_three = [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1', 'c_time': datetime.time(4, 0, 0)},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_time': datetime.time(7, 15, 0)},
            {'c_int': 3, 'c_pk': 3, 'c_varchar': '3', 'c_time': datetime.time(23, 0, 3)}
        ]
    else:
        expected_table_three = [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1', 'c_time': datetime.time(4, 0, 0)},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2', 'c_time': datetime.time(7, 15, 0)}
        ]

    self.assertEqual(
        self.remove_metadata_columns_from_rows(table_three), expected_table_three)

    # ----------------------------------------------------------------------
    # Check if metadata columns exist or not
    # ----------------------------------------------------------------------
    if should_metadata_columns_exist:
        self.assert_metadata_columns_exist(table_one)
        self.assert_metadata_columns_exist(table_two)
        self.assert_metadata_columns_exist(table_three)
    else:
        self.assert_metadata_columns_not_exist(table_one)
        self.assert_metadata_columns_not_exist(table_two)
        self.assert_metadata_columns_not_exist(table_three)
def assert_logical_streams_are_in_bigquery(self, should_metadata_columns_exist=False):
    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_one = query(bigquery, "SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
    table_two = query(bigquery, "SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
    table_three = query(bigquery, "SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
    table_four = query(bigquery,
                       "SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata "
                       "WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table_one
    # ----------------------------------------------------------------------
    expected_table_one = [
        {'cid': 1, 'cvarchar': 'inserted row', 'cvarchar2': None},
        {'cid': 2, 'cvarchar': 'inserted row', 'cvarchar2': 'inserted row'},
        {'cid': 3, 'cvarchar': 'inserted row', 'cvarchar2': 'inserted row'},
        {'cid': 4, 'cvarchar': 'inserted row', 'cvarchar2': 'inserted row'}
    ]

    # ----------------------------------------------------------------------
    # Check rows in table_two
    # ----------------------------------------------------------------------
    expected_table_two = [
        {'cid': 1, 'cvarchar': 'updated row'},
        {'cid': 2, 'cvarchar': 'updated row'},
        {'cid': 3, 'cvarchar': 'updated row'},
        {'cid': 5, 'cvarchar': 'updated row'},
        {'cid': 7, 'cvarchar': 'updated row'},
        {'cid': 8, 'cvarchar': 'updated row'},
        {'cid': 9, 'cvarchar': 'updated row'},
        {'cid': 10, 'cvarchar': 'updated row'}
    ]

    # ----------------------------------------------------------------------
    # Check rows in table_three
    # ----------------------------------------------------------------------
    expected_table_three = [
        {'cid': 1, 'cvarchar': 'updated row'},
        {'cid': 2, 'cvarchar': 'updated row'},
        {'cid': 3, 'cvarchar': 'updated row'},
    ]

    # ----------------------------------------------------------------------
    # Check rows in table_four
    # ----------------------------------------------------------------------
    expected_table_four = [
        {'cid': 1, 'ctimentz': None, 'ctimetz': None},
        {'cid': 2, 'ctimentz': datetime.time(23, 0, 15), 'ctimetz': datetime.time(23, 0, 15)},
        {'cid': 3, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(12, 0, 15)},
        {'cid': 4, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(9, 0, 15)},
        {'cid': 5, 'ctimentz': datetime.time(12, 0, 15), 'ctimetz': datetime.time(15, 0, 15)},
        {'cid': 6, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(0, 0)},
        {'cid': 8, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(1, 0)},
        {'cid': 9, 'ctimentz': datetime.time(0, 0), 'ctimetz': datetime.time(0, 0)}
    ]

    if should_metadata_columns_exist:
        self.assertEqual(self.remove_metadata_columns_from_rows(table_one), expected_table_one)
        self.assertEqual(self.remove_metadata_columns_from_rows(table_two), expected_table_two)
        self.assertEqual(self.remove_metadata_columns_from_rows(table_three), expected_table_three)
        self.assertEqual(table_four, expected_table_four)
    else:
        self.assertEqual(table_one, expected_table_one)
        self.assertEqual(table_two, expected_table_two)
        self.assertEqual(table_three, expected_table_three)
        self.assertEqual(table_four, expected_table_four)
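# Hedged sketches of the metadata-column assertions referenced in
# assert_three_streams_are_into_bigquery(). The assumed column names
# (_sdc_extracted_at, _sdc_batched_at, _sdc_deleted_at) follow the common Singer
# convention and may differ from the project's actual helpers.
def assert_metadata_columns_exist(self, rows):
    for row in rows:
        self.assertTrue('_sdc_extracted_at' in row)
        self.assertTrue('_sdc_batched_at' in row)
        self.assertTrue('_sdc_deleted_at' in row)

def assert_metadata_columns_not_exist(self, rows):
    for row in rows:
        self.assertFalse('_sdc_extracted_at' in row)
        self.assertFalse('_sdc_batched_at' in row)
        self.assertFalse('_sdc_deleted_at' in row)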