Example #1
 def __init__(self, **kwargs):
     self.service_account = service_account.Credentials.from_service_account_file(
         os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
     self.sd_logger = StackDriverAdapter(self.service_account)
     self.sd_logger.get_client()
     self.sd_logger.create_logger("service-add-pub-sub")
     super().__init__(**kwargs)
Example #2
 def __init__(self, service: str, **kwargs):
     self.service_account = service_account.Credentials.from_service_account_file(
         os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH']
     )
     self.definitions = service_helpers.get_definitions(service)
     self.sd_logger = StackDriverAdapter(self.service_account)
     self.sd_logger.get_client()
     self.sd_logger.create_logger("service-add-big-query")
     super().__init__(**kwargs)
Example #3
class PrefectInstallStorage(Task):
    def __init__(self, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("storage-install-mysql-sync")
        super().__init__(**kwargs)

    def create_config_bucket(self):
        client = StorageAdapter(self.service_account)
        client.get_client()
        result = client.make_bucket('mysql_sync_keys')
        if result:
            self.sd_logger.info({'message': 'Created mysql_sync_keys Bucket'},
                                {
                                    'class': 'PrefectInstallStorage',
                                    'method': 'create_config_bucket'
                                })
        else:
            self.sd_logger.warning(client.errors, {
                'class': 'PrefectInstallStorage',
                'method': 'create_config_bucket'
            })
        return result
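A minimal usage sketch for Example #3, assuming PrefectInstallStorage is importable from the project and MYSQL_BIG_QUERY_GOOGLE_AUTH is set (the wiring below is not part of the original snippet):

task = PrefectInstallStorage()
if not task.create_config_bucket():
    # make_bucket() failed; the adapter's errors were already logged to Stackdriver
    raise RuntimeError('mysql_sync_keys bucket was not created')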
Example #4
class PrefectAddServiceStorage(Task):
    def __init__(self, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("service-add-storage")
        super().__init__(**kwargs)

    def create_service_bucket(self, service: str):
        client = StorageAdapter(self.service_account)
        client.get_client()
        result = client.make_bucket(f"{service}-sync")
        if result:
            self.sd_logger.info({'message': f'Created {service} Bucket'}, {
                'class': 'PrefectAddServiceStorage',
                'method': 'create_service_bucket'
            })
        else:
            self.sd_logger.warning(
                client.errors, {
                    'class': 'PrefectAddServiceStorage',
                    'method': 'create_service_bucket'
                })
        return result
Example #5
    def __init__(self):
        self.service = None
        self.data_set = None
        self.host = None
        self.user = None
        self.password = None
        self.database = None

        self.key = None

        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("add-service")
Example #6
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")

        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-schema")

        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        super().__init__(**kwargs)
Example #7
    def __init__(self, service: str, **kwargs):
        self.table = None
        self.watched_column = None
        self.primary_id = None
        self.sample_size = 1000
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")

        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl")

        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])

        self.my_sql_client = MySqlAdapter()
        super().__init__(**kwargs)
Example #8
 def create_logger(self):
     self.sd_logger = StackDriverAdapter(self.service_account)
     self.sd_logger.get_client()
     self.sd_logger.create_logger(f"{self.service}-consumer-sync")
Example #9
class PrefectConsumer(Task):
    def __init__(self):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.consumer_type = os.environ['MYSQL_BIG_QUERY_CONSUMER_TYPE']
        self.definitions = None
        self.service = None
        self.sd_logger = None
        self.next_queue = None

    def subscribe_to_queue(self, callback):
        pub_sub_client = PubSubAdapter(self.service_account)
        pub_sub_client.get_subscriber()
        pub_sub_client.set_subscription(
            f"mysql-bq-sync-{self.consumer_type}-jobs")

        pub_sub_client.subscribe(callback, max_messages=10)

    @staticmethod
    def decode_message(message) -> dict:
        data_string = bytes.decode(message.data)
        return json.loads(data_string)

    def get_service(self, message: dict):
        self.service = message['service']

    def get_definitions(self):
        self.definitions = service_helpers.get_definitions(self.service)

    def create_logger(self):
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.service}-consumer-sync")

    def run_consume_job(self, message):
        if self.consumer_type == 'schema':
            schema.flow_runner(message)
        elif self.consumer_type == 'data':
            data.flow_runner(message)
        elif self.consumer_type == 'audit':
            results = audit.flow_runner(message)
            if results is False:
                # audit failed: flag the job so the next pass flushes the table
                message['flush'] = True
            else:
                # audit passed: no follow-up job needs to be queued
                return False
        return message

    def set_next_queue(self):
        self.next_queue = PubSubAdapter(self.service_account)
        self.next_queue.get_publisher()
        if self.consumer_type == 'schema':
            self.next_queue.set_topic("mysql-bq-sync-data-jobs")
        elif self.consumer_type == 'data':
            self.next_queue.set_topic("mysql-bq-sync-audit-jobs")
        elif self.consumer_type == 'audit':
            self.next_queue.set_topic("mysql-bq-sync-schema-jobs")

    def install_next_job(self, next_job_message: dict) -> bool:
        message_string = json.dumps(next_job_message)
        results = self.next_queue.publish_message(message_string)
        if results is False:
            self.sd_logger.error(self.next_queue.errors, {
                'class': 'PrefectConsumer',
                'method': 'install_next_job'
            })
        return results
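The PrefectConsumer methods above compose into a subscribe-and-forward loop. A hedged sketch of that wiring, using only calls shown in the snippet (the callback name and the ack placement are assumptions):

consumer = PrefectConsumer()
consumer.set_next_queue()

def on_message(message):
    decoded = consumer.decode_message(message)    # Pub/Sub bytes -> dict
    consumer.get_service(decoded)                 # sets consumer.service
    consumer.get_definitions()
    consumer.create_logger()
    next_job = consumer.run_consume_job(decoded)  # False means nothing to forward
    if next_job:
        consumer.install_next_job(next_job)
    message.ack()

consumer.subscribe_to_queue(on_message)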
Example #10
class PrefectAddServiceBigQuery(Task):
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH']
        )
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("service-add-big-query")
        super().__init__(**kwargs)

    def create_client(self) -> BigQueryAdapter:
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        return big_query_client

    def create_data_set(self, big_query_client: BigQueryAdapter) -> bool:
        # return True if dataset already exists
        if big_query_client.check_dataset(self.definitions['data_set']):
            return True
        big_query_client.set_data_set_ref(self.definitions['data_set'])
        result = big_query_client.create_data_set()
        if result:
            self.sd_logger.info(
                {'message': f"Created {self.definitions['data_set']} Data Set"},
                {'class': 'PrefectAddServiceBigQuery', 'method': 'create_data_set'})
        else:
            self.sd_logger.warning(
                big_query_client.errors,
                {'class': 'PrefectAddServiceBigQuery', 'method': 'create_data_set'}
            )
        return result

    def create_tracking_table(self, big_query_client: BigQueryAdapter):
        big_query_client.set_data_set_ref(self.definitions['data_set'])
        big_query_client.set_table_ref('sync_tracking_table')
        # return true if table already exists
        if big_query_client.check_table():
            return True
        schema = [
            bigquery.SchemaField(
                'table',
                'STRING',
                description="Tracked Table Name"
            ),
            bigquery.SchemaField(
                'watched',
                'STRING',
                description="Column to watch to minimize the number of records loaded per sync"
            ),
            bigquery.SchemaField(
                'primary_id',
                'STRING',
                description="Primary Id Column(s)"
            ),
            bigquery.SchemaField(
                'synchronize',
                'BOOLEAN',
                description="Flag to Synchronize the table"
            )
        ]
        result = big_query_client.create_table(schema)
        if result:
            self.sd_logger.info(
                {'message': "Created tracking table"},
                {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'}
            )
        else:
            self.sd_logger.warning(
                big_query_client.errors,
                {'class': 'PrefectAddServiceBigQuery', 'method': 'create_tracking_table'}
            )
        return result
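A plausible onboarding sequence for the three methods above; the ordering is an assumption and 'billing' is a placeholder service name:

task = PrefectAddServiceBigQuery(service='billing')
client = task.create_client()
if task.create_data_set(client):
    task.create_tracking_table(client)  # returns True early if the table exists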
Example #11
class PrefectAddServicePubSub(Task):
    def __init__(self, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("service-add-pub-sub")
        super().__init__(**kwargs)

    def create_client(self):
        pub_sub_client = PubSubAdapter(self.service_account)
        pub_sub_client.get_publisher()
        return pub_sub_client

    def create_topic(self, pub_sub_client: PubSubAdapter, topic: str) -> bool:
        pub_sub_client.set_topic(topic)
        result = pub_sub_client.create_topic()
        if result:
            self.sd_logger.info({'message': f"Created {topic} Topic"}, {
                'class': 'PrefectAddServicePubSub',
                'method': 'create_topic'
            })
        else:
            self.sd_logger.warning(pub_sub_client.errors, {
                'class': 'PrefectAddServicePubSub',
                'method': 'create_topic'
            })
        return result

    def create_subscription(self, pub_sub_client: PubSubAdapter, topic: str,
                            subscription: str) -> bool:
        pub_sub_client.set_topic(topic)
        pub_sub_client.set_subscription(subscription)
        pub_sub_client.get_subscriber()
        result = pub_sub_client.create_subscription()
        if result:
            self.sd_logger.info({'message': f"Created {subscription} Subscription"}, {
                'class': 'PrefectAddServicePubSub',
                'method': 'create_subscription'
            })
        else:
            self.sd_logger.warning(
                pub_sub_client.errors, {
                    'class': 'PrefectAddServicePubSub',
                    'method': 'create_subscription'
                })
        return result
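A plausible call sequence for Example #11; the topic and subscription names are placeholders modeled on the other snippets:

task = PrefectAddServicePubSub()
client = task.create_client()
if task.create_topic(client, 'billing-etl-schema'):
    task.create_subscription(client, 'billing-etl-schema',
                             subscription='billing-etl-schema')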
Example #12
class PrefectProduceSchemaJobs(Task):
    def __init__(self, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = None
        self.sd_logger = None
        self.service = None
        super().__init__(**kwargs)

    def get_definitions(self, service_name: str):
        self.definitions = service_helpers.get_definitions(service_name)

    def create_logger(self):
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(
            f"{self.definitions['service']}-producer-sync")

    def create_pub_sub_client(self) -> PubSubAdapter:
        pub_sub_client = PubSubAdapter(self.service_account)
        pub_sub_client.get_publisher()
        pub_sub_client.set_topic("mysql-bq-sync-schema-jobs")
        return pub_sub_client

    def create_big_query_client(self) -> BigQueryAdapter:
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        big_query_client.set_data_set_ref(self.definitions['data_set'])
        big_query_client.set_table_ref('sync_tracking_table')
        return big_query_client

    def get_service_to_produce(self) -> str:
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        query = 'SELECT * from mysql_sync.data_sources order by `last_run` limit 1'
        result = big_query_client.query(query)

        if not result:
            self.sd_logger.error(
                big_query_client.errors, {
                    'class': 'PrefectProduceSchemaJobs',
                    'method': 'get_service_to_produce'
                })
            raise RuntimeError(big_query_client.errors)

        for item in result:
            if hasattr(item, 'service'):
                return item.service
            else:
                raise RuntimeError(
                    'Service was not found in mysql_sync.data_sources')

    def get_items_to_be_queued(self, big_query_client: BigQueryAdapter):
        query = [
            "SELECT `table`, `watched`, `primary_id`",
            f"FROM `{self.definitions['data_set']}`.`sync_tracking_table`",
            "WHERE `synchronize` is true", "ORDER BY `table` ASC"
        ]

        result = big_query_client.query(' '.join(query))
        if not result:
            self.sd_logger.error(
                big_query_client.errors, {
                    'class': 'PrefectProduceSchemaJobs',
                    'method': 'get_items_to_be_queued'
                })
        return result

    def send_items_to_queue(self, service_name: str,
                            items: google.cloud.bigquery.table.RowIterator,
                            pub_sub: PubSubAdapter):
        success = 0
        for item in items:
            message = {
                'service': service_name,
                'table': item['table'],
                'watched': item['watched'],
                'primary_id': item['primary_id'],
                "flush": False
            }
            message_string = json.dumps(message)
            result = pub_sub.publish_message(message_string)
            if not result:
                self.sd_logger.error(
                    pub_sub.errors, {
                        'class': 'PrefectProduceSchemaJobs',
                        'method': 'send_items_to_queue'
                    })
            else:
                success += 1
        self.sd_logger.info({'message': f'{success} Schema Jobs Queued'}, {
            'class': 'PrefectProduceSchemaJobs',
            'method': 'send_items_to_queue'
        })
        return success

    def set_last_run(self, service_name: str):
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        query = f"UPDATE mysql_sync.data_sources SET `last_run` = CURRENT_TIMESTAMP where `service`= '{service_name}'"
        result = big_query_client.query(query)

        if not result:
            self.sd_logger.error(big_query_client.errors, {
                'class': 'PrefectProduceSchemaJobs',
                'method': 'set_last_run'
            })
        return result
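The producer's methods suggest the following end-to-end run; the ordering is inferred, not shown in the snippet (note that sd_logger only exists once create_logger() has run):

producer = PrefectProduceSchemaJobs()
service_name = producer.get_service_to_produce()
producer.get_definitions(service_name)
producer.create_logger()
pub_sub = producer.create_pub_sub_client()
big_query = producer.create_big_query_client()
items = producer.get_items_to_be_queued(big_query)
if items:
    producer.send_items_to_queue(service_name, items, pub_sub)
    producer.set_last_run(service_name)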
Example #13
class PrefectInstallBigQuery(Task):
    def __init__(self, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("install-big-query")
        super().__init__(**kwargs)

    def create_client(self) -> BigQueryAdapter:
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        return big_query_client

    def create_config_data_set(self,
                               big_query_client: BigQueryAdapter) -> bool:
        # return True if dataset already exists
        if big_query_client.check_dataset('mysql_sync'):
            return True
        big_query_client.set_data_set_ref('mysql_sync')
        result = big_query_client.create_data_set()
        if result:
            self.sd_logger.info({'message': "Created mysql_sync Data Set"}, {
                'class': 'PrefectInstallBigQuery',
                'method': 'create_config_data_set'
            })
        else:
            self.sd_logger.warning(big_query_client.errors, {
                'class': 'PrefectInstallBigQuery',
                'method': 'create_config_data_set'
            })
        return result

    def create_config_table(self, big_query_client: BigQueryAdapter):
        big_query_client.set_data_set_ref('mysql_sync')
        big_query_client.set_table_ref('data_sources')
        # return true if table already exists
        if big_query_client.check_table():
            return True

        schema = [
            bigquery.SchemaField('service',
                                 'STRING',
                                 description="Service Name"),
            bigquery.SchemaField('data_set',
                                 'STRING',
                                 description="Big Query Data Set"),
            bigquery.SchemaField('host',
                                 'STRING',
                                 description="MySQL host connection"),
            bigquery.SchemaField('user',
                                 'STRING',
                                 description="MySQL connection user"),
            bigquery.SchemaField('password',
                                 'STRING',
                                 description="MySQL connection password"),
            bigquery.SchemaField('database',
                                 'STRING',
                                 description="MySQL Database"),
            bigquery.SchemaField('last_run',
                                 'TIMESTAMP',
                                 description="Last produce date"),
        ]
        result = big_query_client.create_table(schema)
        if result:
            self.sd_logger.info({'message': "Created Config table"}, {
                'class': 'PrefectInstallBigQuery',
                'method': 'create_config_table'
            })
        else:
            self.sd_logger.warning(big_query_client.errors, {
                'class': 'PrefectInstallBigQuery',
                'method': 'create_config_table'
            })
        return result
Example #14
def mock_service(mock_service_account):
    return StackDriverAdapter(mock_service_account)
Example #15
class Schema(Task):
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")

        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-schema")

        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])
        super().__init__(**kwargs)

    def get_mysql_schema(self, table: str) -> Union[list, bool]:
        mysql_client = MySqlAdapter(self.definitions['service'])
        columns = mysql_client.mysql_table_definition(table)
        if not columns:
            self.sd_logger.error(mysql_client.errors, {
                'class': 'Schema',
                'method': 'get_mysql_schema',
                'table': table
            })
            return False
        return columns

    @staticmethod
    def organized_mysql_schema(schema: list) -> list:
        organized_column_data = []
        for column in schema:
            organized_column_data.append(
                service_helpers.label_mysql_table_definitions(column))
        return organized_column_data

    @staticmethod
    def convert_mysql_to_big_query_schema(schema: list) -> list:
        return service_helpers.generate_bq_schema_from_mysql(schema)

    def store_mysql_schema(self, schema: list, table: str) -> bool:
        encoded_schema = json.dumps(schema)
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()

        date_time_obj = datetime.utcnow()

        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=f'schema/{table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC',
            string=encoded_schema)
        if not result:
            self.sd_logger.error(storage_client.errors, {
                'class': 'Schema',
                'method': 'store_mysql_schema',
                'table': table
            })
            return False

        return result

    def check_table_exists(self, table):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.check_table()

    def get_current_schema(self, table: str):
        self.big_query_client.set_table_ref(table)
        return self.big_query_client.get_schema()

    @staticmethod
    def compare_schema(new_schema, current_schema) -> bool:
        # first check the total number of columns
        if len(new_schema) != len(current_schema):
            return False
        # then compare column names and types position by position
        for new_field, current_field in zip(new_schema, current_schema):
            if new_field.name != current_field.name:
                return False
            if new_field.field_type != current_field.field_type:
                return False
        return True

    def create_table(self, table: str, schema: list) -> bool:
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.create_table(schema=schema,
                                                    overwrite=True)
        if result:
            self.sd_logger.info({'message': f"Table {table} Created"}, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'create_table',
                'table': table
            })
        return result

    @staticmethod
    def acknowledge_message(message):
        message.ack()

    def copy_bq_table(self, table: str):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref

        date_time_obj = datetime.utcnow()
        destination_str = f'{table}_{date_time_obj.strftime("%m_%d_%Y")}'
        self.big_query_client.set_table_ref(destination_str)
        destination_table_ref = self.big_query_client.table_ref

        result = self.big_query_client.copy_table(
            copy_table=copy_table_ref, destination_table=destination_table_ref)
        if result:
            self.sd_logger.warning(
                {'message': f"Table {table} copied to {destination_str}"}, {
                    'class': 'Schema',
                    'method': 'copy_bq_table',
                    'table': table
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'copy_bq_table',
                'table': table
            })
        return destination_str

    def backup_table_to_storage(self, table):
        self.big_query_client.set_table_ref(table)
        copy_table_ref = self.big_query_client.table_ref

        date_time_obj = datetime.utcnow()
        destination = f'gs://{self.definitions["service"]}-etl/data/{table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC_*.avro'
        result = self.big_query_client.export_table_to_storage(
            table=copy_table_ref, destination=destination)
        if result:
            self.sd_logger.info(
                {'message': f"Table {table} exported to {destination}"}, {
                    'class': 'Schema',
                    'method': 'backup_table_to_storage'
                })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'backup_table_to_storage'
            })
        return result

    def delete_table(self, table):
        self.big_query_client.set_table_ref(table)
        result = self.big_query_client.delete_table()
        if result:
            self.sd_logger.info({'message': f"Table {table} deleted"}, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        else:
            self.sd_logger.error(self.big_query_client.errors, {
                'class': 'Schema',
                'method': 'delete_table'
            })
        return result
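The Schema task reads as an idempotent migration pass: pull the MySQL definition, convert it, and rebuild the BigQuery table (after copying and backing it up) only when the schemas diverge. A hedged sketch of one pass; the flow runner itself is not shown in the snippet and the service/table names are placeholders:

task = Schema(service='billing')
table = 'orders'
raw = task.get_mysql_schema(table)
if raw:
    organized = task.organized_mysql_schema(raw)
    bq_schema = task.convert_mysql_to_big_query_schema(organized)
    task.store_mysql_schema(organized, table)
    if not task.check_table_exists(table):
        task.create_table(table, bq_schema)
    elif not task.compare_schema(bq_schema, task.get_current_schema(table)):
        task.copy_bq_table(table)            # keep an in-dataset copy
        task.backup_table_to_storage(table)  # and an Avro export in Storage
        task.create_table(table, bq_schema)  # create_table overwrites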
Example #16
class Audit(Task):
    def __init__(self, service: str, **kwargs):
        self.table = None
        self.watched_column = None
        self.primary_id = None
        self.sample_size = 1000
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")

        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl")

        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])

        self.my_sql_client = MySqlAdapter()
        super().__init__(**kwargs)

    def extract_message_values(self, message: dict) -> bool:
        if self.table is None:
            if 'table' in message:
                self.table = message['table']
            else:
                self.sd_logger.error(
                    {
                        'error': 'Key: table was not found in message',
                        'data': message
                    }, {
                        'class': 'Audit',
                        'method': 'extract_message_values'
                    })
                return False

            if 'watched' in message:
                self.watched_column = message['watched']
            else:
                self.sd_logger.error(
                    {
                        'error': 'Key: watched was not found in message',
                        'data': message
                    }, {
                        'class': 'Audit',
                        'method': 'extract_message_values',
                        "table": self.table
                    })
                return False

            if 'primary_id' in message:
                self.primary_id = message['primary_id']
            else:
                self.sd_logger.error(
                    {
                        'error': 'Key: primary_id was not found in message',
                        'data': message
                    }, {
                        'class': 'Audit',
                        'method': 'extract_message_values',
                        "table": self.table
                    })
                return False
        return True

    def big_query_last_updated_data(self) -> Union[datetime.datetime, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value.last_updated:
                    return value.last_updated
                else:
                    # no timestamp yet; return False so the caller falls back to a full sync
                    return False
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Audit',
                    'method': 'big_query_last_updated_data',
                    'table': self.table
                })
        return result

    def get_total_distinct_mysql(
            self, last_updated: datetime.datetime) -> Union[int, bool]:
        if last_updated:
            records = self.my_sql_client.count_distinct(
                table=self.table,
                index=self.primary_id,
                watched_column=self.watched_column,
                last_updated=last_updated)
        else:
            records = self.my_sql_client.count_distinct(table=self.table,
                                                        index=self.primary_id)
        if records is False:
            self.sd_logger.error({'error': self.my_sql_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_mysql',
                "table": self.table
            })
        return records

    def get_total_distinct_big_query(self) -> int:
        query = [
            "SELECT COUNT(*) as total_count", "FROM (",
            f"select DISTINCT {self.primary_id}",
            f"FROM {self.definitions['data_set']}.{self.table}", ")"
        ]

        result = self.big_query_client.query(' '.join(query))

        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_total_distinct_big_query',
                "table": self.table
            })
            return False

        for item in result:
            return item.total_count

    def audit_from_totals(self, total_mysql: int,
                          total_big_query: int) -> bool:
        # check if the totals are the same
        if total_mysql == total_big_query:
            return True
        # the BigQuery total usually trails the MySQL total slightly,
        # so allow it to be within 1% of the MySQL count
        passing_total = int(round((total_mysql * .99), 0))
        if total_big_query >= passing_total:
            return True

        self.sd_logger.info(
            {
                'message': f"Audit totals did not match: {self.table}",
                "data": {
                    "MySQL Total": total_mysql,
                    "Big Query Total": total_big_query,
                    "passing total": passing_total
                }
            }, {
                'class': 'Audit',
                'method': 'audit_from_totals',
                "table": self.table
            })
        return False

    def get_random_mysql(self, last_run: int = None) -> Union[list, bool]:
        result = self.my_sql_client.get_random_records(
            table=self.table,
            limit=self.sample_size,
            index=self.primary_id,
            watched=self.watched_column,
            last_updated=last_run)
        if result is not False:
            return result

        self.sd_logger.error({'error': self.my_sql_client.errors}, {
            'class': 'Audit',
            'method': 'get_random_mysql',
            "table": self.table
        })
        return False

    def get_big_query_records_from_sample(self, sample: list) -> DataFrame:
        primary_ids = self.primary_id.split(',')
        or_statements = []
        # get the same items from the mysql random query in big query
        for item in sample:
            pk_statements = []
            for key in primary_ids:
                if type(item[key]) is int:
                    pk_statements.append(f"`{key}` = {item[key]}")
                else:
                    pk_statements.append(f"`{key}` = '{item[key]}'")
            if type(item[self.watched_column]) is int:
                watched_condition = f"AND `{self.watched_column}` = {item[self.watched_column]}"
            else:
                if item[self.watched_column] is None:
                    watched_condition = f"AND `{self.watched_column}` IS NULL"
                else:
                    watched_condition = f"AND `{self.watched_column}` = '{item[self.watched_column]}'"
            or_statements.append(
                f"({' AND '.join(pk_statements)} {watched_condition})")
        query = [
            "SELECT *", f"FROM {self.definitions['data_set']}.{self.table}",
            f"WHERE {' OR '.join(or_statements)}", "ORDER BY"
        ]
        order_by = []
        for key in primary_ids:
            order_by.append(f"{key} ASC")
        query.append(', '.join(order_by))

        result = self.big_query_client.query(' '.join(query))

        if not result:
            self.sd_logger.error({'error': self.big_query_client.errors}, {
                'class': 'Audit',
                'method': 'get_big_query_records_from_sample',
                "table": self.table
            })
            return False
        output = result.to_dataframe()
        return output

    def convert_mysql_to_data_frame(
            self, mysql_data: list) -> Union[DataFrame, bool]:
        self.big_query_client.set_table_ref(self.table)
        bq_schema = self.big_query_client.get_schema()
        schema = service_helpers.convert_schema_to_dict(bq_schema)

        if len(mysql_data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(mysql_data[0])
                }, {
                    'class': 'Audit',
                    'method': 'convert_mysql_to_data_frame',
                    'table': self.table
                })
            return False
        data_frame = pd.DataFrame.from_records(mysql_data,
                                               columns=schema.keys())

        try:
            df = service_helpers.data_frame_to_schema(data_frame,
                                                      schema,
                                                      utc=True)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Audit',
                'method': 'convert_mysql_to_data_frame',
                'table': self.table
            })
            return False
        return df

    def audit_from_sample(self, mysql_data: DataFrame,
                          big_query_data: DataFrame) -> bool:
        # test if the data frames are equal - this will fail in most cases due to poor data quality from the sources
        result = mysql_data.equals(big_query_data)
        if result:
            return True
        # check if the number of returned rows are the same
        # this check is useful because of the strictness of the queries that generate these DataFrames
        if mysql_data.shape[0] == big_query_data.shape[0]:
            return True

        self.sd_logger.info(
            {
                'message': f"Audit Failure: {self.table}",
                'DataFrame.equals': result,
                'mysql.shape': mysql_data.shape[0],
                'big_query.shape': big_query_data.shape[0]
            }, {
                'class': 'Audit',
                'method': 'audit_from_sample',
                'table': self.table
            })
        return False
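The Audit task's methods chain into a two-stage check: compare distinct counts first, then spot-check a random sample row by row. A hedged sketch of that sequence (the message values are placeholders):

audit_task = Audit(service='billing')
message = {'table': 'orders', 'watched': 'updated_at', 'primary_id': 'id'}
if audit_task.extract_message_values(message):
    last_updated = audit_task.big_query_last_updated_data()
    total_mysql = audit_task.get_total_distinct_mysql(last_updated)
    total_bq = audit_task.get_total_distinct_big_query()
    if audit_task.audit_from_totals(total_mysql, total_bq):
        sample = audit_task.get_random_mysql()
        bq_rows = audit_task.get_big_query_records_from_sample(sample)
        mysql_rows = audit_task.convert_mysql_to_data_frame(sample)
        audit_task.audit_from_sample(mysql_rows, bq_rows)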
Example #17
class Data(Task):
    def __init__(self, service: str, **kwargs):
        self.chunk_size = 250000
        self.table = None
        self.watched_column = None
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger(f"{self.definitions['service']}-etl")

        self.pub_sub_client = PubSubAdapter(self.service_account)
        self.pub_sub_client.get_subscriber()
        self.pub_sub_client.set_subscription(
            f"{self.definitions['service']}-etl-data")

        self.big_query_client = BigQueryAdapter(self.service_account)
        self.big_query_client.get_client()
        self.big_query_client.set_data_set_ref(self.definitions['data_set'])

        self.my_sql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def check_message(self, message):
        if 'table' in message:
            self.table = message['table']
        else:
            error_message = 'Table was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

        if 'watched' in message:
            self.watched_column = message['watched']
        else:
            error_message = 'Watched was not included in the message'
            self.sd_logger.error({'error': error_message}, {
                'class': 'Data',
                'method': 'check_message',
                'table': self.table
            })
            raise RuntimeError(error_message)

    def get_schema_from_big_query(self) -> bool:
        self.big_query_client.set_table_ref(self.table)
        table_check = self.big_query_client.check_table()
        if table_check:
            return self.big_query_client.get_schema()
        return table_check

    def last_updated_data(self) -> Union[str, bool]:
        query = f"SELECT MAX({self.watched_column}) as last_updated FROM {self.definitions['data_set']}.{self.table}"
        result = self.big_query_client.query(query)
        if result:
            for value in result:
                if value['last_updated']:
                    return value['last_updated']
                else:
                    # return None so that all records are pulled on the first sync
                    return None
        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'last_updated_data',
                    'table': self.table
                })
        return result

    def get_number_of_records_to_import(self,
                                        last_updated) -> Union[int, bool]:
        result = self.my_sql_client.count_items_to_sync(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated)
        if self.my_sql_client.errors:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'get_number_of_records_to_import',
                    'table': self.table
                })
        return result

    def query_mysql_for_records(self, last_updated_date: str, limit: int,
                                offset: int) -> Union[list, bool]:
        results = self.my_sql_client.get_records(
            table=self.table,
            watched_column=self.watched_column,
            last_run=last_updated_date,
            limit=limit,
            offset=offset)
        if not results:
            self.sd_logger.critical(
                self.my_sql_client.errors, {
                    'class': 'Data',
                    'method': 'query_mysql_for_records',
                    'table': self.table
                })
        return results

    def load_mysql_data_into_data_frame(
            self, data: list, schema: dict) -> Union[DataFrame, bool]:
        if len(data[0]) != len(schema.keys()):
            self.sd_logger.critical(
                {
                    'message': "Schema and data length mismatch",
                    'schema_length': len(schema.keys()),
                    'data_length': len(data[0])
                }, {
                    'class': 'Data',
                    'method': 'load_mysql_data_into_data_frame',
                    'table': self.table
                })
            return False
        df = pd.DataFrame.from_records(data, columns=schema.keys())
        del data
        return df

    def transform_data_frame_to_match_big_query_schema(
            self, data_frame: DataFrame,
            schema: dict) -> Union[DataFrame, bool]:
        try:
            df = service_helpers.data_frame_to_schema(data_frame, schema)
        except ValueError as e:
            self.sd_logger.critical({'message': 'Error: {}'.format(e)}, {
                'class': 'Data',
                'method': 'transform_data_frame_to_match_big_query_schema',
                'table': self.table
            })
            return False
        return df

    def append_data_frame_to_big_query(self, data_frame: DataFrame):
        result = self.big_query_client.upload_data_frame(data_frame)
        if result:
            message = f"table:{self.table} | Records written: {data_frame.shape[0]}"
            self.sd_logger.info({'message': message}, {
                'class': 'Data',
                'method': 'append_data_frame_to_big_query',
                'table': self.table
            })

        else:
            self.sd_logger.critical(
                self.big_query_client.errors, {
                    'class': 'Data',
                    'method': 'append_data_frame_to_big_query',
                    'table': self.table
                })
        return result

    def write_df_to_storage(self, df: DataFrame) -> bool:
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()

        date_time_obj = datetime.datetime.utcnow()

        location = f'error/csv/{self.table}/{date_time_obj.strftime("%m-%d-%Y_%H:%M:%S")}_UTC'

        result = storage_client.write_string(
            bucket=self.definitions['service'] + '-etl',
            destination=location,
            string=df.to_csv(),
            encoding='text/csv')
        if not result:
            self.sd_logger.error(
                storage_client.errors, {
                    'class': 'Data',
                    'method': 'write_df_to_storage',
                    'table': self.table
                })
            return False

        self.sd_logger.info({'message': f"Failed CSV added to {location}"}, {
            'class': 'Data',
            'method': 'write_df_to_storage',
            'table': self.table
        })

        return result
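Data's methods outline a chunked incremental load: find the BigQuery high-water mark, count the MySQL rows past it, then pull, reshape, and append chunk_size records at a time. A hedged sketch of one pass (placeholder names; convert_schema_to_dict mirrors its use in Example #16):

data_task = Data(service='billing')
data_task.check_message({'table': 'orders', 'watched': 'updated_at'})
bq_schema = data_task.get_schema_from_big_query()
schema = service_helpers.convert_schema_to_dict(bq_schema)
last_updated = data_task.last_updated_data()
total = data_task.get_number_of_records_to_import(last_updated)
for offset in range(0, total or 0, data_task.chunk_size):
    rows = data_task.query_mysql_for_records(last_updated,
                                             data_task.chunk_size, offset)
    df = data_task.load_mysql_data_into_data_frame(rows, schema)
    if df is False:
        continue
    df = data_task.transform_data_frame_to_match_big_query_schema(df, schema)
    if df is False:
        continue
    if not data_task.append_data_frame_to_big_query(df):
        data_task.write_df_to_storage(df)  # park the failed chunk as CSV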
Example #18
 def create_logger(self):
     self.sd_logger = StackDriverAdapter(self.service_account)
     self.sd_logger.get_client()
     self.sd_logger.create_logger(
         f"{self.definitions['service']}-producer-sync")
Example #19
class AddService(Task):
    def __init__(self):
        self.service = None
        self.data_set = None
        self.host = None
        self.user = None
        self.password = None
        self.database = None

        self.key = None

        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])

        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("add-service")

    def generate_key(self):
        self.key = Fernet.generate_key()

    def set_attribute(self, attribute: str, value: str, encode: bool = False):
        if encode:
            f = Fernet(self.key)
            encoded = f.encrypt(bytes(value, encoding='utf-8'))
            value = str(encoded, 'utf-8')

        setattr(self, attribute, value)

    def save_key(self):
        storage_client = StorageAdapter(self.service_account)
        storage_client.get_client()
        result = storage_client.write_string(bucket='mysql_sync_keys',
                                             destination=f"{self.service}",
                                             string=self.key)

        if result is False:
            raise RuntimeError(storage_client.errors)

    def save_record(self):
        record = {
            'service': [self.service],
            'data_set': [self.data_set],
            'host': [self.host],
            'user': [self.user],
            'password': [self.password],
            'database': [self.database]
        }
        df = pd.DataFrame(record)
        big_query = BigQueryAdapter(self.service_account)
        big_query.get_client()
        big_query.set_data_set_ref('mysql_sync')
        big_query.set_table_ref('data_sources')
        result = big_query.upload_data_frame(df)
        if result is False:
            self.sd_logger.error(big_query.errors, {
                'class': 'AddService',
                'method': 'save_record'
            })

    def storage(self):
        return storage_add.storage_flow_runner(self.service)

    def big_query(self):
        return big_query_add.big_query_flow_runner(self.service)

    def populate_tracking(self):
        return populate.populate_tracking_table_flow_runner(self.service)

    def run_set_up(self):
        service = input('Service Name: ')
        service = re.sub(r"[^\w]|_", '-', service)
        self.set_attribute('service', service)

        data_set = input('Data Set: ')
        data_set = re.sub(r"[^\w]|-", '_', data_set)
        self.set_attribute('data_set', data_set)

        self.set_attribute('host', input('MySQL Host Connection: '), True)
        self.set_attribute('user', input('MySQL Connection User: '), True)
        self.set_attribute('password', input('MySQL Connection Password: '), True)
        self.set_attribute('database', input('MySQL Database: '))
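set_attribute encrypts credential values with Fernet before save_record writes them, and save_key stores the key that later decrypts them. A round-trip sketch against the real cryptography.fernet API:

from cryptography.fernet import Fernet

key = Fernet.generate_key()           # what generate_key() puts in self.key
f = Fernet(key)
token = f.encrypt(b'mysql-password')  # what set_attribute(..., encode=True) stores
assert f.decrypt(token) == b'mysql-password'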
Example #20
class PopulateTrackingTable(Task):
    def __init__(self, service: str, **kwargs):
        self.service_account = service_account.Credentials.from_service_account_file(
            os.environ['MYSQL_BIG_QUERY_GOOGLE_AUTH'])
        self.definitions = service_helpers.get_definitions(service)
        self.sd_logger = StackDriverAdapter(self.service_account)
        self.sd_logger.get_client()
        self.sd_logger.create_logger("service-add-populate-tracking")
        self.mysql_client = MySqlAdapter(service)
        super().__init__(**kwargs)

    def create_client(self) -> BigQueryAdapter:
        big_query_client = BigQueryAdapter(self.service_account)
        big_query_client.get_client()
        return big_query_client

    def define_tables(self) -> Union[list, bool]:
        table_definitions = self.mysql_client.mysql_table_definition()
        if not isinstance(table_definitions, list):
            self.sd_logger.error(self.mysql_client.errors, {
                'class': 'PopulateTrackingTable',
                'method': 'define_tables'
            })
            return False
        return table_definitions

    @staticmethod
    def organize_tables(columns: list) -> dict:
        tables = {}
        for column in columns:
            if not column['table'] in tables:
                tables[column['table']] = []
            tables[column['table']].append(
                service_helpers.label_mysql_table_definitions(column))
        return tables

    @staticmethod
    def format_for_big_query(tables: dict) -> list:
        output = []
        for table, data in tables.items():
            item = {
                'table': table,
                'watched': service_helpers.guess_watched(data),
                'primary_id': service_helpers.guess_pk(data),
            }

            if item['watched'] is None or item['primary_id'] is None:
                item['synchronize'] = False
            else:
                item['synchronize'] = True
            output.append(item)
        return output

    @staticmethod
    def convert_to_data_frame(data: list) -> pd.DataFrame:
        df = pd.DataFrame(data=data)
        return df

    def write_to_big_query(self, data: pd.DataFrame) -> bool:
        bq = BigQueryAdapter(self.service_account)
        bq.get_client()
        bq.set_data_set_ref(self.definitions['data_set'])
        bq.set_table_ref('sync_tracking_table')
        result = bq.upload_data_frame(data, 'replace')
        if result:
            self.sd_logger.info(
                {"Tracking table populated": data.count(axis=0).to_dict()}, {
                    'class': 'PopulateTrackingTable',
                    'method': 'write_to_big_query'
                })
        else:
            self.sd_logger.error(bq.errors, {
                'class': 'PopulateTrackingTable',
                'method': 'write_to_big_query'
            })
        return result
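A plausible end-to-end run of the tracking-table population; the ordering is inferred from the method names and 'billing' is a placeholder:

task = PopulateTrackingTable(service='billing')
columns = task.define_tables()
if columns:
    tables = task.organize_tables(columns)
    rows = task.format_for_big_query(tables)
    df = task.convert_to_data_frame(rows)
    task.write_to_big_query(df)  # replaces sync_tracking_table contents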