def get_definitions(service: str) -> dict:
    service_account_obj = service_account.Credentials.from_service_account_file(
        os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])

    # Fetch the Fernet key for this service from Cloud Storage
    storage = StorageAdapter(service_account_obj)
    storage.get_client()
    key = storage.get_file_as_string(bucket="mysql_sync_keys", file=service)
    if key is False:
        raise RuntimeError(storage.errors)
    crypto = Fernet(key)

    # Look up the service row in the mysql_sync config table
    bqa = BigQueryAdapter(service_account_obj)
    bqa.get_client()
    query = f"SELECT * FROM mysql_sync.data_sources WHERE service = '{service}'"
    result = bqa.query(query)
    if result is False:
        raise RuntimeError(bqa.errors)

    # Decrypt the MySQL connection values before returning them
    definitions = {}
    for item in result:
        definitions['service'] = item['service']
        definitions['data_set'] = item['data_set']
        definitions['database'] = item['database']
        definitions['host'] = crypto.decrypt(
            bytes(item['host'], encoding='utf-8'))
        definitions['user'] = crypto.decrypt(
            bytes(item['user'], encoding='utf-8'))
        definitions['password'] = crypto.decrypt(
            bytes(item['password'], encoding='utf-8'))
    return definitions
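# Minimal usage sketch (not part of the original module). It assumes the
# MYSQL_BIG_QUERY_GOOGLE_AUTH environment variable points at a service-account
# JSON file, and that a Fernet key and a matching mysql_sync.data_sources row
# already exist for the hypothetical service name "orders". Note that the
# decrypted connection values come back as bytes (Fernet.decrypt returns bytes).
if __name__ == '__main__':
    defs = get_definitions('orders')
    print(defs['service'], defs['data_set'], defs['database'])
    print(defs['host'].decode('utf-8'))  # host/user/password are bytes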
def test_create_table_runtime_error(mock_service_account):
    bq = BigQueryAdapter(mock_service_account)
    schema = [
        bigquery.SchemaField("column1", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("column2", "INTEGER", mode="REQUIRED"),
    ]
    with pytest.raises(RuntimeError):
        bq.create_table(schema)
def set_last_run(self, service_name: str):
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    query = f"UPDATE mysql_sync.data_sources SET `last_run` = CURRENT_TIMESTAMP WHERE `service` = '{service_name}'"
    result = big_query_client.query(query)
    if not result:
        self.sd_logger.error(big_query_client.errors, {
            'class': 'PrefectProduceSchemaJobs',
            'method': 'set_last_run'
        })
    return result
def create_data_set(self, big_query_client: BigQueryAdapter) -> bool:
    # return True if dataset already exists
    if big_query_client.check_dataset(self.definitions['data_set']):
        return True
    big_query_client.set_data_set_ref(self.definitions['data_set'])
    result = big_query_client.create_data_set()
    if result:
        self.sd_logger.info(
            {'message': f"Created {self.definitions['data_set']} Data Set"},
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_data_set'})
    else:
        self.sd_logger.warning(
            big_query_client.errors,
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_data_set'})
    return result
def create_config_data_set(self, big_query_client: BigQueryAdapter) -> bool:
    # return True if dataset already exists
    if big_query_client.check_dataset('mysql_sync'):
        return True
    big_query_client.set_data_set_ref('mysql_sync')
    result = big_query_client.create_data_set()
    if result:
        self.sd_logger.info({'message': "Created mysql_sync Data Set"}, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_data_set'
        })
    else:
        self.sd_logger.warning(big_query_client.errors, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_data_set'
        })
    return result
def __init__(self, service: str, **kwargs):
    self.service_account = service_account.Credentials.from_service_account_file(
        os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
    self.definitions = service_helpers.get_definitions(service)
    self.sd_logger = StackDriverAdapter(self.service_account)
    self.sd_logger.get_client()
    self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
    self.pub_sub_client = PubSubAdapter(self.service_account)
    self.pub_sub_client.get_subscriber()
    self.pub_sub_client.set_subscription(
        f"{self.definitions['service']}-etl-schema")
    self.big_query_client = BigQueryAdapter(self.service_account)
    self.big_query_client.get_client()
    self.big_query_client.set_data_set_ref(self.definitions['data_set'])
    super().__init__(**kwargs)
def get_service_to_produce(self) -> str:
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    query = 'SELECT * from mysql_sync.data_sources order by `last_run` limit 1'
    result = big_query_client.query(query)
    if not result:
        self.sd_logger.error(big_query_client.errors, {
            'class': 'PrefectProduceSchemaJobs',
            'method': 'get_service_to_produce'
        })
        raise RuntimeError(big_query_client.errors)
    for item in result:
        if hasattr(item, 'service'):
            return item.service
        else:
            raise RuntimeError(
                'Service was not found in mysql_sync.data_sources')
def write_to_big_query(self, data: pd.DataFrame) -> bool:
    bq = BigQueryAdapter(self.service_account)
    bq.get_client()
    bq.set_data_set_ref(self.definitions['data_set'])
    bq.set_table_ref('sync_tracking_table')
    result = bq.upload_data_frame(data, 'replace')
    if result:
        self.sd_logger.info(
            {"Tracking table populated": data.count(axis=0).to_dict()}, {
                'class': 'PopulateTrackingTable',
                'method': 'write_to_big_query'
            })
    else:
        self.sd_logger.error(bq.errors, {
            'class': 'PopulateTrackingTable',
            'method': 'write_to_big_query'
        })
    return result
def save_record(self):
    record = {
        'service': [self.service],
        'data_set': [self.data_set],
        'host': [self.host],
        'user': [self.user],
        'password': [self.password],
        'database': [self.database]
    }
    df = pd.DataFrame(record)
    big_query = BigQueryAdapter(self.service_account)
    big_query.get_client()
    big_query.set_data_set_ref('mysql_sync')
    big_query.set_table_ref('data_sources')
    result = big_query.upload_data_frame(df)
    if result is False:
        self.sd_logger.error(big_query.errors, {
            'class': 'AddService',
            'method': 'save_record'
        })
def __init__(self, service: str, **kwargs):
    self.table = None
    self.watched_column = None
    self.primary_id = None
    self.sample_size = 1000
    self.service_account = service_account.Credentials.from_service_account_file(
        os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
    self.definitions = service_helpers.get_definitions(service)
    self.sd_logger = StackDriverAdapter(self.service_account)
    self.sd_logger.get_client()
    self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
    self.pub_sub_client = PubSubAdapter(self.service_account)
    self.pub_sub_client.get_subscriber()
    self.pub_sub_client.set_subscription(
        f"{self.definitions['service']}-etl")
    self.big_query_client = BigQueryAdapter(self.service_account)
    self.big_query_client.get_client()
    self.big_query_client.set_data_set_ref(self.definitions['data_set'])
    self.my_sql_client = MySqlAdapter()
    super().__init__(**kwargs)
def get_items_to_be_queued(self, big_query_client: BigQueryAdapter):
    query = [
        "SELECT `table`, `watched`, `primary_id`",
        f"FROM `{self.definitions['data_set']}`.`sync_tracking_table`",
        "WHERE `synchronize` is true",
        "ORDER BY `table` ASC"
    ]
    result = big_query_client.query(' '.join(query))
    if not result:
        self.sd_logger.error(big_query_client.errors, {
            'class': 'PrefectProduceSchemaJobs',
            'method': 'get_items_to_be_queued'
        })
    return result
def create_config_table(self, big_query_client: BigQueryAdapter):
    big_query_client.set_data_set_ref('mysql_sync')
    big_query_client.set_table_ref('data_sources')
    # return true if table already exists
    if big_query_client.check_table():
        return True
    schema = [
        bigquery.SchemaField('service', 'STRING', description="Service Name"),
        bigquery.SchemaField('data_set', 'STRING', description="Big Query Data Set"),
        bigquery.SchemaField('host', 'STRING', description="MySQL host connection"),
        bigquery.SchemaField('user', 'STRING', description="MySQL connection user"),
        bigquery.SchemaField('password', 'STRING', description="MySQL connection password"),
        bigquery.SchemaField('database', 'STRING', description="MySQL Database"),
        bigquery.SchemaField('last_run', 'TIMESTAMP', description="Last produce date"),
    ]
    result = big_query_client.create_table(schema)
    if result:
        self.sd_logger.info({'message': "Created Config table"}, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_config_table'
        })
    else:
        self.sd_logger.warning(big_query_client.errors, {
            'class': 'PrefectInstallBigQuery',
            'method': 'create_config_table'
        })
    return result
def create_tracking_table(self, big_query_client: BigQueryAdapter):
    big_query_client.set_data_set_ref(self.definitions['data_set'])
    big_query_client.set_table_ref('sync_tracking_table')
    # return true if table already exists
    if big_query_client.check_table():
        return True
    schema = [
        bigquery.SchemaField(
            'table', 'STRING',
            description="Tracked Table Name"),
        bigquery.SchemaField(
            'watched', 'STRING',
            description="Column to watch to minimize the number of records loaded per sync"),
        bigquery.SchemaField(
            'primary_id', 'STRING',
            description="Primary Id Column(s)"),
        bigquery.SchemaField(
            'synchronize', 'BOOLEAN',
            description="Flag to Synchronize the table")
    ]
    result = big_query_client.create_table(schema)
    if result:
        self.sd_logger.info(
            {'message': "Created tracking table"},
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'})
    else:
        self.sd_logger.warning(
            big_query_client.errors,
            {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'})
    return result
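# Illustrative sketch only (not from the original repo): the row shape that
# sync_tracking_table is expected to hold, matching the schema created above
# and the DataFrame upload in write_to_big_query. The values "users",
# "updated_at", and "id" are made-up placeholders; primary_id may hold several
# comma-separated key columns.
import pandas as pd  # already used elsewhere in this repo


def example_tracking_rows() -> pd.DataFrame:
    return pd.DataFrame({
        'table': ['users'],           # MySQL table to synchronize
        'watched': ['updated_at'],    # column watched to pick up new/changed rows
        'primary_id': ['id'],         # primary key column(s), comma separated
        'synchronize': [True],        # only rows flagged True are queued for sync
    })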
class Data(Task):
    def __init__(self, service: str, **kwargs):
        self.chunk_size = 250000
        self.table = None
        self.watched_column = None
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-data")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def check_message(self, message):
        if 'table' in message:
            self.table = message['table']
        else:
            error_message = 'Table was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            error_message = 'Watched was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

    def get_schema_from_big_query(self) -> bool:
        self.big_query_client.set_table_ref(self.table)
        table_check = self.big_query_client.check_table()
        if table_check:
            return self.big_query_client.get_schema()
        return table_check

    def last_updated_data(self) -> Union[str, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value['last_updated']:
                    return value['last_updated']
                else:
                    # return None to allow all records to be pulled at the start
                    return None
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'last_updated_data',
                    'table': self.table
                })
            return result

    def get_number_of_records_to_import(self, last_updated) -> Union[int, bool]:
        result = self.my_sql_client.count_items_to_sync(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated)
        if self.my_sql_client.errors:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'get_number_of_records_to_import',
                    'table': self.table
                })
        return result

    def query_mysql_for_records(self, last_updated_date: str, limit: int,
                                offset: int) -> Union[list, bool]:
        results = self.my_sql_client.get_records(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated_date,
            limit=limit,
            offset=offset)
        if not results:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'query_mysql_for_records',
                    'table': self.table
                })
        return results

    def load_mysql_data_into_data_frame(
            self, data: list, schema: dict) -> Union[DataFrame, bool]:
        if len(data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(data[0])
                }, {
                    'class': 'Data',
                    'method': 'load_mysql_data_into_data_frame',
                    'table': self.table
                })
            return False
        df = pd.DataFrame.from_records(data, columns=schema.keys())
        del data
        return df

    def transform_data_frame_to_match_big_query_schema(
            self, data_frame: DataFrame,
            schema: dict) -> Union[DataFrame, bool]:
        try:
            df = service_helpers.data_frame_to_schema(data_frame, schema)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Data',
                'method': 'transform_data_frame_to_match_big_query_schema',
                'table': self.table
            })
            return False
        return df

    def append_data_frame_to_big_query(self, data_frame: DataFrame):
        result = self.big_query_client.upload_data_frame(data_frame)
        if result:
            message = f"table:{self.table} | Records written: {data_frame.shape[0]}"
            self.sd_logger.info({'message': message}, {
                'class': 'Data',
                'method': 'append_data_frame_to_big_query',
                'table': self.table
            })
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'append_data_frame_to_big_query',
                    'table': self.table
                })
        return result

    def write_df_to_storage(self, df: DataFrame) -> bool:
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.datetime.utcnow()
        location = f'error/csv/{self.table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC'
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=location,
            string=df.to_csv(),
            encoding='text/csv')
        if not result:
            self.sd_logger.error(
                storage_client.errors, {
                    'class': 'Data',
                    'method': 'write_df_to_storage',
                    'table': self.table
                })
            return False
        self.sd_logger.info({'message': f"Failed CSV added to {location}"}, {
            'class': 'Data',
            'method': 'write_df_to_storage',
            'table': self.table
        })
        return result
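# Hedged sketch (not part of the original repo) of how the Data task's methods
# could be chained for a single Pub/Sub message. The message payload and the
# orchestration below are assumptions; convert_schema_to_dict is borrowed from
# how the Audit task turns a BigQuery schema into the dict these methods expect.
def sync_one_table_example(data_task: Data, message: dict) -> bool:
    data_task.check_message(message)  # raises if 'table'/'watched' are missing
    bq_schema = data_task.get_schema_from_big_query()
    if not bq_schema:
        return False
    schema = service_helpers.convert_schema_to_dict(bq_schema)
    last_updated = data_task.last_updated_data()
    total = data_task.get_number_of_records_to_import(last_updated)
    offset = 0
    while offset < (total or 0):
        rows = data_task.query_mysql_for_records(
            last_updated, data_task.chunk_size, offset)
        if not rows:
            break
        df = data_task.load_mysql_data_into_data_frame(rows, schema)
        if df is False:
            return False
        df = data_task.transform_data_frame_to_match_big_query_schema(df, schema)
        if df is False:
            return False
        if not data_task.append_data_frame_to_big_query(df):
            data_task.write_df_to_storage(df)  # park the failed chunk as CSV
            return False
        offset += data_task.chunk_size
    return True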
class Schema(Task):
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-schema")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        super().__init__(**kwargs)

    def get_mysql_schema(self, table: str) -> Union[list, bool]:
        mysql_client = MySqlAdapter(self.definitions['service'])
        columns = mysql_client.mysql_table_definition(table)
        if not columns:
            self.sd_logger.error(mysql_client.errors, {
                'class': 'Schema',
                'method': 'get_mysql_schema',
                'table': table
            })
            return False
        return columns

    @staticmethod
    def organized_mysql_schema(schema: list) -> list:
        organized_column_data = []
        for column in schema:
            organized_column_data.append(
                service_helpers.label_mysql_table_definitions(column))
        return organized_column_data

    @staticmethod
    def convert_mysql_to_big_query_schema(schema: list) -> list:
        return service_helpers.generate_bq_schema_from_mysql(schema)

    def store_mysql_schema(self, schema: list, table: str) -> bool:
        encoded_schema = json.dumps(schema)
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        date_time_obj = datetime.utcnow()
        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=f'schema/{table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC',
            string=encoded_schema)
        if not result:
            self.sd_logger.error(storage_client.errors, {
                'class': 'Schema',
                'method': 'store_mysql_schema',
                'table': table
            })
            return False
        return result

    def check_table_exists(self, table):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.check_table()

    def get_current_schema(self, table: str):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.get_schema()

    @staticmethod
    def compare_schema(new_schema, current_schema) -> bool:
        # first check is total number of items
        if len(new_schema) != len(current_schema):
            return False
        # compare column names and types
        schema_matches = True
        for x in range(len(new_schema)):
            exists_in_current = False
            if current_schema[x].name == new_schema[x].name:
                if current_schema[x].field_type == new_schema[x].field_type:
                    exists_in_current = True
            else:
                exists_in_current = True
            if not exists_in_current:
                schema_matches = False
        return schema_matches

    def create_table(self, table: str, schema: list) -> bool:
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.create_table(schema=schema,
                                                    overwrite=True)
        if result:
            self.sd_logger.info({'message': f"Table {table} Created"}, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        return result

    @staticmethod
    def acknowledge_message(message):
        message.ack()

    def copy_bq_table(self, table: str):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref
        date_time_obj = datetime.utcnow()
        destination_str = f'{table}_{date_time_obj.strftime("%m_%d_%Y")}'
        self.big_query_client.set_table_ref(destination_str)
        destination_table_ref = self.big_query_client.table_ref
        result = self.big_query_client.copy_table(
            copy_table=copy_table_ref,
            destination_table=destination_table_ref)
        if result:
            self.sd_logger.warning(
                {'message': f"Table {table} copied to {destination_str}"}, {
                    'class': 'Schema',
                    'method': 'copy_bq_table',
                    'table': table
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'copy_bq_table',
                'table': table
            })
        return destination_str

    def backup_table_to_storage(self, table):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref
        date_time_obj = datetime.utcnow()
        destination = f'gs://{self.definitions["service"]}-etl/data/{table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC_*.avro'
        result = self.big_query_client.export_table_to_storage(
            table=copy_table_ref, destination=destination)
        if result:
            self.sd_logger.info(
                {'message': f"Table {table} exported to {destination}"}, {
                    'class': 'Schema',
                    'method': 'backup_table_to_storage'
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'backup_table_to_storage'
            })
        return result

    def delete_table(self, table):
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.delete_table()
        if result:
            self.sd_logger.info({'message': f"Table {table} deleted"}, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        return result
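# Hedged example only (not from the original repo): one way the Schema task's
# methods could be combined to refresh a single table's BigQuery schema from
# MySQL. The ordering of the backup/copy/delete steps is an assumption; the
# real orchestration may differ.
def refresh_table_schema_example(schema_task: Schema, table: str) -> bool:
    raw = schema_task.get_mysql_schema(table)
    if not raw:
        return False
    organized = schema_task.organized_mysql_schema(raw)
    bq_schema = schema_task.convert_mysql_to_big_query_schema(organized)
    schema_task.store_mysql_schema(organized, table)  # keep a copy in Storage
    if schema_task.check_table_exists(table):
        current = schema_task.get_current_schema(table)
        if schema_task.compare_schema(bq_schema, current):
            return True  # schemas already match, nothing to do
        # schema changed: preserve the existing data before recreating the table
        schema_task.backup_table_to_storage(table)
        schema_task.copy_bq_table(table)
        schema_task.delete_table(table)
    return schema_task.create_table(table, bq_schema)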
def test_check_dependencies_client(mock_service_account):
    bq = BigQueryAdapter(mock_service_account)
    with pytest.raises(RuntimeError):
        bq.check_dependencies()
def test_get_client_success(mock_service_account):
    bq = BigQueryAdapter(mock_service_account)
    bq.get_client()
    assert isinstance(bq.client, google.cloud.bigquery.client.Client)
def mock_configured_client(mock_service_account):
    bq = BigQueryAdapter(mock_service_account)
    bq.get_client()
    return bq
def create_big_query_client(self) -> BigQueryAdapter:
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    big_query_client.set_data_set_ref(self.definitions['data_set'])
    big_query_client.set_table_ref('sync_tracking_table')
    return big_query_client
def create_client(self) -> BigQueryAdapter:
    big_query_client = BigQueryAdapter(self.service_account)
    big_query_client.get_client()
    return big_query_client
class Audit(Task):
    def __init__(self, service: str, **kwargs):
        self.table = None
        self.watched_column = None
        self.primary_id = None
        self.sample_size = 1000
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")
        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl")
        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        self.my_sql_client = MySqlAdapter()
        super().__init__(**kwargs)

    def extract_message_values(self, message: dict) -> bool:
        if self.table is None:
            if 'table' in message:
                self.table = message['table']
            else:
                self.sd_logger.error(
                    {
                        'error': 'Key: table was not found in message',
                        'data': message
                    }, {
                        'class': 'Audit',
                        'method': 'extract_message_values'
                    })
                return False
        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            self.sd_logger.error(
                {
                    'error': 'Key: watched was not found in message',
                    'data': message
                }, {
                    'class': 'Audit',
                    'method': 'extract_message_values',
                    "table": self.table
                })
            return False
        if 'primary_id' in message:
            self.primary_id = message['primary_id']
        else:
            self.sd_logger.error(
                {
                    'error': 'Key: primary_id was not found in message',
                    'data': message
                }, {
                    'class': 'Audit',
                    'method': 'extract_message_values',
                    "table": self.table
                })
            return False
        return True

    def big_query_last_updated_data(self) -> Union[datetime.datetime, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value.last_updated:
                    return value.last_updated
                else:
                    # use string of the start of unix time as the default time
                    return False
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Audit',
                    'method': 'big_query_last_updated_data',
                    'table': self.table
                })
            return result

    def get_total_distinct_mysql(
            self, last_updated: datetime.datetime) -> Union[int, bool]:
        if last_updated:
            records = self.my_sql_client.count_distinct(
                table=self.table,
                index=self.primary_id,
                watched_column=self.watched_column,
                last_updated=last_updated)
        else:
            records = self.my_sql_client.count_distinct(table=self.table,
                                                        index=self.primary_id)
        if records is False:
            self.sd_logger.error({'error': self.my_sql_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_mysql',
                "table": self.table
            })
        return records

    def get_total_distinct_big_query(self) -> int:
        query = [
            "SELECT COUNT(*) as total_count",
            "FROM (",
            f"select DISTINCT {self.primary_id}",
            f"FROM {self.definitions['data_set']}.{self.table}",
            ")"
        ]
        result = self.big_query_client.query(' '.join(query))
        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_big_query',
                "table": self.table
            })
            return False
        for item in result:
            return item.total_count

    def audit_from_totals(self, total_mysql: int,
                          total_big_query: int) -> bool:
        # check if the totals are the same
        if total_mysql == total_big_query:
            return True
        # the BigQuery total will usually be lower than the MySQL total,
        # so allow the audit to pass when it is within 1% of the MySQL count
        passing_total = int(round((total_mysql * .99), 0))
        if total_big_query >= passing_total:
            return True
        self.sd_logger.info(
            {
                'message': f"Audit totals did not match: {self.table}",
                "data": {
                    "MySQL Total": total_mysql,
                    "Big Query Total": total_big_query,
                    "passing total": passing_total
                }
            }, {
                'class': 'Audit',
                'method': 'audit_from_totals',
                "table": self.table
            })
        return False

    def get_random_mysql(self, last_run: int = None) -> Union[list, bool]:
        result = self.my_sql_client.get_random_records(
            table=self.table,
            limit=self.sample_size,
            index=self.primary_id,
            watched=self.watched_column,
            last_updated=last_run)
        if result is not False:
            return result
        self.sd_logger.error({'error': self.my_sql_client.errors}, {
            'class': 'Audit',
            'method': 'get_random_mysql',
            "table": self.table
        })
        return False

    def get_big_query_records_from_sample(self, sample: list) -> DataFrame:
        primary_ids = self.primary_id.split(',')
        or_statements = []
        # get the same items from the mysql random query in big query
        for item in sample:
            pk_statements = []
            for key in primary_ids:
                if type(item[key]) is int:
                    pk_statements.append(f"`{key}` = {item[key]}")
                else:
                    pk_statements.append(f"`{key}` = '{item[key]}'")
            if type(item[self.watched_column]) is int:
                watched_condition = f"AND `{self.watched_column}` = {item[self.watched_column]}"
            else:
                if item[self.watched_column] is None:
                    watched_condition = f"AND `{self.watched_column}` IS NULL"
                else:
                    watched_condition = f"AND `{self.watched_column}` = '{item[self.watched_column]}'"
            or_statements.append(
                f"({' AND '.join(pk_statements)} {watched_condition})")
        query = [
            "SELECT *",
            f"FROM {self.definitions['data_set']}.{self.table}",
            f"WHERE {' OR '.join(or_statements)}",
            "ORDER BY"
        ]
        order_by = []
        for key in primary_ids:
            order_by.append(f"{key} ASC")
        query.append(', '.join(order_by))
        result = self.big_query_client.query(' '.join(query))
        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_big_query_records_from_sample',
                "table": self.table
            })
            return False
        output = result.to_dataframe()
        return output

    def convert_mysql_to_data_frame(
            self, mysql_data: list) -> Union[DataFrame, bool]:
        self.big_query_client.set_table_ref(self.table)
        bq_schema = self.big_query_client.get_schema()
        schema = service_helpers.convert_schema_to_dict(bq_schema)
        if len(mysql_data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(mysql_data[0])
                }, {
                    'class': 'Audit',
                    'method': 'convert_mysql_to_data_frame',
                    'table': self.table
                })
            return False
        data_frame = pd.DataFrame.from_records(mysql_data,
                                               columns=schema.keys())
        try:
            df = service_helpers.data_frame_to_schema(data_frame,
                                                      schema,
                                                      utc=True)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Audit',
                'method': 'convert_mysql_to_data_frame',
                'table': self.table
            })
            return False
        return df

    def audit_from_sample(self, mysql_data: DataFrame,
                          big_query_data: DataFrame) -> bool:
        # test if the data frames are equal - this will fail in most cases
        # due to the poor data quality from the sources
        result = mysql_data.equals(big_query_data)
        if result:
            return True
        # check if the number of returned rows is the same; this check is
        # useful because of the strictness of the queries that generate
        # these DataFrames
        if mysql_data.shape[0] == big_query_data.shape[0]:
            return True
        self.sd_logger.info(
            {
                'message': f"Audit Failure: {self.table}",
                'DataFrame.equals': result,
                'mysql.shape': mysql_data.shape[0],
                'big_query.shape': big_query_data.shape[0]
            }, {
                'class': 'Audit',
                'method': 'audit_from_sample',
                'table': self.table
            })
        return False
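# Hedged sketch, not part of the original module: the two-stage audit that the
# methods above suggest - compare distinct counts first, then spot-check a
# random sample. The message payload keys mirror extract_message_values; the
# orchestration itself is an assumption.
def run_audit_example(audit: Audit, message: dict) -> bool:
    if not audit.extract_message_values(message):
        return False
    last_updated = audit.big_query_last_updated_data()
    total_mysql = audit.get_total_distinct_mysql(last_updated)
    total_big_query = audit.get_total_distinct_big_query()
    if total_mysql is False or total_big_query is False:
        return False
    if not audit.audit_from_totals(total_mysql, total_big_query):
        return False
    # totals agree (within 1%); now compare a random sample row by row
    sample = audit.get_random_mysql(last_updated)
    if sample is False:
        return False
    mysql_df = audit.convert_mysql_to_data_frame(sample)
    big_query_df = audit.get_big_query_records_from_sample(sample)
    if mysql_df is False or big_query_df is False:
        return False
    return audit.audit_from_sample(mysql_df, big_query_df)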