def __init__(self, kwargs=None):
    self.settings = kwargs
    excludes = listwrap(self.settings.exclude)
    self.settings.exclude = set(e for e in excludes if len(split_field(e)) == 1)
    self.settings.exclude_columns = set(
        p for e in excludes for p in [tuple(split_field(e))] if len(p) > 1
    )
    self.settings.exclude_path = list(
        map(split_field, listwrap(self.settings.exclude_path))
    )
    self.settings.show_foreign_keys = coalesce(
        self.settings.show_foreign_keys, True
    )
    self.name_relations = unwrap(coalesce(self.settings.name_relations, {}))
    self.all_nested_paths = None
    self.nested_path_to_join = None
    self.columns = None

    with Explanation("scan database", debug=DEBUG):
        self.db = MySQL(**kwargs.database)
        self.settings.database.schema = self.db.settings.schema
        with self.db.transaction():
            self._scan_database()
    if not self.settings.database.schema:
        Log.error("you must provide a `database.schema`")
def test_extract_alert(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, then copy the JSON in the test failure into
    the test_extract_alerts.json file, then you may use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    # TESTS ARE RUN WITH CURRENT TIMESTAMPS
    doc.created = now
    doc.last_updated = now
    for d in doc.details:
        d.created = now
        d.last_updated = now
        d.series_signature.last_updated = now

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
def get_all_signatures(db_config, sql):
    """
    RETURN ALL SIGNATURES FROM PERFHERDER DATABASE
    """
    db = MySQL(db_config)
    with db:
        return db.query(sql)
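A hedged usage sketch for get_all_signatures: the db_config object and the exact query below are illustrative placeholders, not taken from the repository.

signatures = get_all_signatures(
    db_config,
    SQL("SELECT id, signature_hash, suite, test FROM performance_signature WHERE framework_id = 1"),
)
for sig in signatures:
    print(sig.id, sig.suite, sig.test)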
def test_make_failure_class(failure_class, extract_job_settings):
    # TEST I CAN MAKE AN OBJECT IN THE DATABASE
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT * from failure_classification"))

    # verify the failure classification object is the one we expect
    assert result[0].name == "not classified"
def test_make_repository(test_repository, extract_job_settings):
    # TEST EXISTING FIXTURE MAKES AN OBJECT IN THE DATABASE
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT * from repository"))

    # verify the repository object is the one we expect
    assert result[0].id == test_repository.id
    assert result[0].tc_root_url == test_repository.tc_root_url
def run_compare(self, config, id_sql, expected):
    db = MySQL(**config.database)
    extractor = MySqlSnowflakeExtractor(kwargs=config)
    sql = extractor.get_sql(SQL(id_sql))

    result = []
    with db.transaction():
        cursor = db.query(sql, stream=True, row_tuples=True)
        cursor = list(cursor)
        extractor.construct_docs(cursor, result.append, False)

    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")
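A sketch of how a test might invoke run_compare; the class name ExtractTestCase, the fixture self.config, EXPECTED_DOCS, and the literal id are all placeholders. The two-way assertEqual above presumably exists because the fuzzy comparison used by these tests treats its second argument as a pattern, so asserting both directions approximates strict equality.

class TestExtract(ExtractTestCase):  # placeholder for whichever TestCase subclass defines run_compare
    def test_single_fact_row(self):
        self.run_compare(
            config=self.config,          # settings with .database and extractor options
            id_sql="SELECT 42 AS id",    # selects the fact-table id(s) to extract
            expected=EXPECTED_DOCS,      # documents captured from a known-good run
        )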
def test_django_cannot_encode_datetime(extract_job_settings):
    """
    DJANGO DOES NOT ENCODE THE DATETIME PROPERLY
    """
    epoch = Date(Date.EPOCH).datetime
    get_ids = SQL(
        str(
            (
                Job.objects.filter(
                    Q(last_modified__gt=epoch) | (Q(last_modified=epoch) & Q(id__gt=0))
                )
                .annotate()
                .values("id")
                .order_by("last_modified", "id")[:2000]
            ).query
        )
    )
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception):
        with source.transaction():
            list(source.query(get_ids, stream=True, row_tuples=True))
def test_django_cannot_encode_datetime_strings(extract_job_settings):
    """
    DJANGO/MYSQL DATETIME MATH WORKS WHEN STRINGS
    """
    epoch_string = Date.EPOCH.format()
    sql_query = SQL(
        str(
            (
                Job.objects.filter(
                    Q(last_modified__gt=epoch_string)
                    | (Q(last_modified=epoch_string) & Q(id__gt=0))
                )
                .annotate()
                .values("id")
                .order_by("last_modified", "id")[:2000]
            ).query
        )
    )
    source = MySQL(extract_job_settings.source.database)

    with pytest.raises(Exception):
        with source.transaction():
            list(source.query(sql_query, stream=True, row_tuples=True))
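An illustrative aside (not from the repository): str(queryset.query) asks Django to substitute parameters into the SQL text without going through the MySQL driver's escaping, which is why statements built this way can fail once handed to source.query(). Printing the compiled text makes the rendering of the last_modified literal visible; Job here is the same model the tests above import.

import datetime

from django.db.models import Q

epoch = datetime.datetime(1970, 1, 1)
qs = Job.objects.filter(Q(last_modified__gt=epoch)).values("id")[:10]
print(str(qs.query))  # the datetime literal is substituted naively, typically without quoting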
def test_extract_job(complex_job, extract_job_settings):
    """
    If you find this test failing, then copy the JSON in the test failure into
    the test_extract_job.json file, then you may use the diff to review the changes.
    """
    with MySQL(extract_job_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_job_settings.source) as extractor:
            sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))
            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    doc = first(acc)
    doc.guid = first(JOB).guid  # NEW EACH TIME
    job_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for fl in jx.drill(doc, "job_log.failure_line"):
        fl.job_guid = job_guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
def get_signature(db_config, signature_id):
    db = MySQL(db_config)
    with db:
        return first(
            db.query(f"""
                SELECT
                    t1.id,
                    t1.signature_hash,
                    t1.suite,
                    t1.test,
                    UNIX_TIMESTAMP(t1.last_updated) AS last_updated,
                    t1.lower_is_better,
                    t1.has_subtests,
                    t1.alert_threshold,
                    t1.fore_window,
                    t1.max_back_window,
                    t1.min_back_window,
                    t1.should_alert,
                    t1.extra_options,
                    t1.alert_change_type,
                    t1.measurement_unit,
                    t1.application,
                    t1.suite_public_name,
                    t1.test_public_name,
                    t1.tags,
                    t3.option_collection_hash AS `option_collection.hash`,
                    t1.framework_id,
                    t4.name AS framework,
                    t5.platform AS platform,
                    t6.name AS `repository`
                FROM
                    performance_signature t1
                LEFT JOIN performance_signature AS t2 ON t2.id = t1.parent_signature_id
                LEFT JOIN option_collection AS t3 ON t3.id = t1.option_collection_id
                LEFT JOIN performance_framework AS t4 ON t4.id = t1.framework_id
                LEFT JOIN machine_platform AS t5 ON t5.id = t1.platform_id
                LEFT JOIN repository AS t6 ON t6.id = t1.repository_id
                WHERE
                    t1.id in {quote_list(listwrap(signature_id))}
                ORDER BY
                    t1.last_updated DESC
            """))
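A hedged usage sketch: get_signature accepts a single id or a list (listwrap/quote_list handle both), but because of the first() call it returns only one row, the most recently updated. The config and ids below are placeholders.

sig = get_signature(db_config, 12345)
print(sig.suite, sig.test, sig.platform, sig.repository)

newest = get_signature(db_config, [12345, 12346])  # batch lookup still yields a single row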
def test_extract_job(complex_job, extract_job_settings, now):
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid
    doc.last_modified = complex_job.last_modified

    assertAlmostEqual(
        acc, JOB, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
def __init__(
    self,
    host,
    port=3306,
    username=None,
    password=None,
    debug=False,
    schema=None,
    preamble=None,
    readonly=False,
    kwargs=None,
):
    from jx_mysql.mysql import MySQL

    self.settings = kwargs
    self._db = MySQL(kwargs)
def test_extract_job(complex_job, extract_job_settings, now):
    """
    If you find this test failing, then copy the JSON in the test failure into
    the test_extract_job.json file, then you may use the diff to review the changes.
    """
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
def setUpClass(cls):
    Log.start(settings.debug)
    with Timer("setup database"):
        try:
            with MySQL(schema=None, kwargs=settings.database) as db:
                db.query("drop database testing")
        except Exception as e:
            if "Can't drop database " in e:
                pass
            else:
                Log.warning("problem removing db", cause=e)
        execute_file("tests/resources/database.sql", schema=None, kwargs=settings.database)
def env_setup():
    # These values are not directly accessed during testing, but the code requires that they be present.
    os.environ["NEW_RELIC_APP_NAME"] = "testing"
    os.environ["BIGQUERY_PRIVATE_KEY_ID"] = "1"
    os.environ["BIGQUERY_PRIVATE_KEY"] = "1"

    # THE DOCKER ENV IS DIFFERENT FROM THE DEV ENVIRONMENT
    attempt = [
        "mysql://[email protected]:3306/test_treeherder",
        "mysql://root@mysql:3306/test_treeherder",
    ]
    for a in attempt:
        try:
            MySQL(host=a)
            os.environ["DATABASE_URL"] = a
        except Exception:
            pass
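A sketch of the same probing idea as a helper that stops at the first reachable server; note the loop above does not break, so when several candidates connect the last one wins. The helper name is hypothetical.

def pick_database_url(candidates):
    """Return the first candidate URL that accepts a connection, else None (sketch only)."""
    for candidate in candidates:
        try:
            MySQL(host=candidate)  # raises when the server is unreachable
            return candidate
        except Exception:
            continue
    return None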
def test_extract_alert(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, then copy the JSON in the test failure into
    the test_extract_alerts.json file, then you may use the diff to review the changes.
    """
    with MySQL(extract_alert_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_alert_settings.source) as extractor:
            sql = extractor.get_sql(
                SQL("SELECT " + text(test_perf_alert_summary.id) + " as id")
            )
            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
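When the expected documents legitimately change, one way to regenerate the file the docstring refers to is simply to dump what the extractor produced; the output path below is an assumption, and value2json is the serializer already used by the extract scripts later in this collection.

with open("tests/test_extract_alerts.json", "w") as f:  # assumed location of the expectation file
    f.write(value2json(acc))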
def run(self, force=False, restart=False, merge=False): # SETUP LOGGING settings = startup.read_settings(filename=CONFIG_FILE) constants.set(settings.constants) Log.start(settings.debug) if not settings.extractor.app_name: Log.error("Expecting an extractor.app_name in config file") # SETUP DESTINATION destination = bigquery.Dataset( dataset=settings.extractor.app_name, kwargs=settings.destination ).get_or_create_table(settings.destination) try: if merge: with Timer("merge shards"): destination.merge_shards() # RECOVER LAST SQL STATE redis = Redis() state = redis.get(settings.extractor.key) if restart or not state: state = (0, 0) redis.set(settings.extractor.key, value2json(state).encode("utf8")) else: state = json2value(state.decode("utf8")) last_modified, alert_id = state last_modified = parse(last_modified) # SCAN SCHEMA, GENERATE EXTRACTION SQL extractor = MySqlSnowflakeExtractor(settings.source) canonical_sql = extractor.get_sql(SQL("SELECT 0")) # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN old_sql = redis.get(settings.extractor.sql) if old_sql and old_sql.decode("utf8") != canonical_sql.sql: if force: Log.warning("Schema has changed") else: Log.error("Schema has changed") redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8")) # SETUP SOURCE source = MySQL(settings.source.database) while True: Log.note( "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}", last_modified=last_modified, alert_id=alert_id, ) last_year = ( Date.today() - YEAR + DAY ) # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY # SELECT # s.od # FROM # treeherder.performance_alert_summary s # LEFT JOIN # treeherder.performance_alert a ON s.id=a.summary_id # WHERE # s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified}) # GROUP BY # s.id # ORDER BY # s.id # LIMIT # {settings.extractor.chunk_size} get_ids = SQL( str( ( PerformanceAlertSummary.objects.filter( Q(created__gt=last_year.datetime) & ( Q(last_updated__gt=last_modified.datetime) | Q(alerts__last_updated__gt=last_modified.datetime) ) ) .annotate() .values("id") .order_by("id")[: settings.extractor.chunk_size] ).query ) ) sql = extractor.get_sql(get_ids) # PULL FROM source, AND PUSH TO destination acc = [] with source.transaction(): cursor = source.query(sql, stream=True, row_tuples=True) extractor.construct_docs(cursor, acc.append, False) if not acc: break destination.extend(acc) # RECORD THE STATE last_doc = acc[-1] last_modified, alert_id = last_doc.created, last_doc.id redis.set( settings.extractor.key, value2json((last_modified, alert_id)).encode("utf8"), ) if len(acc) < settings.extractor.chunk_size: break except Exception as e: Log.warning("problem with extraction", cause=e) Log.note("done alert extraction") try: with Timer("merge shards"): destination.merge_shards() except Exception as e: Log.warning("problem with merge", cause=e) Log.note("done alert merge") Log.stop()
def test_make_job(complex_job, extract_job_settings):
    source = MySQL(extract_job_settings.source.database)
    with source.transaction():
        result = source.query(SQL("SELECT count(1) as num from job_detail"))

    assert result[0].num == 4
def get_dataum(db_config, signature_id, since, limit):
    db = MySQL(db_config)
    with db:
        return db.query(
            SQL(f"""
                SELECT
                    d.id,
                    d.value,
                    t3.id AS `job.id`,
                    t3.guid AS `job.guid`,
                    p.revision AS `push.revision`,
                    UNIX_TIMESTAMP(p.time) AS `push.time`,
                    r.name AS `push.repository`,
                    s.created AS `summary.created`,
                    s.status AS `summary.status`,
                    s.bug_number AS `summary.bug_number`,
                    s.manually_created AS `summary.manually_created`,
                    s.prev_push_id AS `summary.prev_push`,
                    s.issue_tracker_id AS `summary.issue_tracker`,
                    s.notes AS `summary.notes`,
                    s.first_triaged AS `summary.first_triaged`,
                    s.last_updated AS `summary.last_updated`,
                    s.bug_updated AS `summary.bug_updated`,
                    a.id AS `alert.id`,
                    a.`is_regression` AS `alert.isregression`,
                    a.`status` AS `alert.status`,
                    a.`amount_pct` AS `alert.amount_pct`,
                    a.`amount_abs` AS `alert.amount_abs`,
                    a.`prev_value` AS `alert.prev_value`,
                    a.`new_value` AS `alert.new_value`,
                    a.`t_value` AS `alert.t_value`,
                    a.`manually_created` AS `alert.manually_created`,
                    a.`classifier_id` AS `alert.classifier_id`,
                    a.`starred` AS `alert.starred`,
                    a.`created` AS `alert.created`,
                    a.`first_triaged` AS `alert.first_triaged`,
                    a.`last_updated` AS `alert.last_updated`
                FROM
                    performance_datum AS d
                JOIN performance_signature sig ON sig.id = d.signature_id
                LEFT JOIN job AS t3 ON t3.id = d.job_id
                LEFT JOIN push AS p ON p.id = d.push_id
                LEFT JOIN repository AS r ON r.id = p.repository_id
                LEFT JOIN performance_alert_summary s ON s.repository_id = p.repository_id AND s.push_id = p.id
                LEFT JOIN performance_alert a ON a.summary_id = s.id AND a.series_signature_id = d.signature_id AND a.manually_created = 0
                WHERE
                    p.time > {quote_value(since)}
                    AND d.signature_id in {quote_list(listwrap(signature_id))}
                ORDER BY
                    p.time DESC
                LIMIT {quote_value(limit + 1)}
            """))
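A hedged usage sketch for get_dataum; the config, id, and date format are assumptions (since passes through quote_value), and the query asks for limit + 1 rows, presumably so the caller can detect that more data points remain.

rows = get_dataum(db_config, signature_id=12345, since="2021-01-01", limit=1000)
if len(rows) > 1000:
    print("more data points remain beyond this window")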
def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = 916850000
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        perf_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note("Extracting perfs for perf.id={{perf_id}}", perf_id=perf_id)

            # get_ids = sql_query(
            #     {
            #         "from": "performance_datum",
            #         "select": ["id"],
            #         "where": {"gt": {"id": perf_id}},
            #         "sort": ["id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(
                str(
                    (
                        PerformanceDatum.objects.filter(id__gt=perf_id)
                        .values("id")
                        .order_by("id")[: settings.extractor.chunk_size]
                    ).query
                )
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # TODO: Remove me July 2021
            # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
            # ADD job.submit_time FOR PARTITIONING
            for a in acc:
                if not a.job.submit_time:
                    a.job.submit_time = a.push_timestamp
                a.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            perf_id = last_doc.id
            redis.set(settings.extractor.key, value2json(perf_id).encode("utf8"))

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done perf extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done perf merge")
def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = Date(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )
            last_year = Date.today() - YEAR + DAY  # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY

            get_ids = SQL(
                "SELECT s.id "
                + "\nFROM treeherder.performance_alert_summary s"
                + "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                + "\nWHERE s.created>" + quote_value(last_year).sql
                + " AND (s.last_updated > " + quote_value(last_modified).sql
                + "\nOR a.last_updated > " + quote_value(last_modified).sql
                + ")"
                + "\nGROUP BY s.id"
                + "\nORDER BY s.id"
                + "\nLIMIT " + quote_value(settings.extractor.chunk_size).sql
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()
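Each extract variant guards against silent schema drift by fingerprinting the generated snowflake query. A reduced sketch of just that guard, isolated for clarity; the function name and the Redis key argument are placeholders, and Log.error raises, so a changed schema stops the run unless force is set.

def check_schema_unchanged(redis, redis_key, extractor, force=False):
    canonical_sql = extractor.get_sql(SQL("SELECT 0")).sql  # text of the full generated query
    old_sql = redis.get(redis_key)
    if old_sql and old_sql.decode("utf8") != canonical_sql:
        if force:
            Log.warning("Schema has changed")
        else:
            Log.error("Schema has changed")  # raises, aborting the extraction
    redis.set(redis_key, canonical_sql.encode("utf8"))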
def run(self, force=False, restart=False, merge=False): # SETUP LOGGING settings = startup.read_settings(filename=CONFIG_FILE) constants.set(settings.constants) Log.start(settings.debug) if not settings.extractor.app_name: Log.error("Expecting an extractor.app_name in config file") # SETUP DESTINATION destination = bigquery.Dataset( dataset=settings.extractor.app_name, kwargs=settings.destination ).get_or_create_table(settings.destination) try: if merge: with Timer("merge shards"): destination.merge_shards() # RECOVER LAST SQL STATE redis = Redis() state = redis.get(settings.extractor.key) if restart or not state: state = (0, 0) redis.set(settings.extractor.key, value2json(state).encode("utf8")) else: state = json2value(state.decode("utf8")) last_modified, job_id = state # SCAN SCHEMA, GENERATE EXTRACTION SQL extractor = MySqlSnowflakeExtractor(settings.source) canonical_sql = extractor.get_sql(SQL("SELECT 0")) # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN old_sql = redis.get(settings.extractor.sql) if old_sql and old_sql.decode("utf8") != canonical_sql.sql: if force: Log.warning("Schema has changed") else: Log.error("Schema has changed") redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8")) # SETUP SOURCE source = MySQL(settings.source.database) while True: Log.note( "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}", last_modified=last_modified, job_id=job_id, ) # Example: job.id ==283890114 # get_ids = ConcatSQL( # (SQL_SELECT, sql_alias(quote_value(283890114), "id")) # ) # get_ids = sql_query( # { # "from": "job", # "select": ["id"], # "where": { # "or": [ # {"gt": {"last_modified": parse(last_modified)}}, # { # "and": [ # {"eq": {"last_modified": parse(last_modified)}}, # {"gt": {"id": job_id}}, # ] # }, # ] # }, # "sort": ["last_modified", "id"], # "limit": settings.extractor.chunk_size, # } # ) get_ids = SQL(str( ( Job.objects.filter( Q(last_modified__gt=parse(last_modified).datetime) | ( Q(last_modified=parse(last_modified).datetime) & Q(id__gt=job_id) ) ) .annotate() .values("id") .order_by("last_modified", "id")[ : settings.extractor.chunk_size ] ).query )) sql = extractor.get_sql(get_ids) # PULL FROM source, AND PUSH TO destination acc = [] with source.transaction(): cursor = source.query(sql, stream=True, row_tuples=True) extractor.construct_docs(cursor, acc.append, False) if not acc: break destination.extend(acc) # RECORD THE STATE last_doc = acc[-1] last_modified, job_id = last_doc.last_modified, last_doc.id redis.set( settings.extractor.key, value2json((last_modified, job_id)).encode("utf8"), ) if len(acc) < settings.extractor.chunk_size: break except Exception as e: Log.warning("problem with extraction", cause=e) Log.note("done job extraction") try: with Timer("merge shards"): destination.merge_shards() except Exception as e: Log.warning("problem with merge", cause=e) Log.note("done job merge")
def extract(self, settings, force, restart, start, merge): if not settings.extractor.app_name: Log.error("Expecting an extractor.app_name in config file") # SETUP DESTINATION destination = bigquery.Dataset( dataset=settings.extractor.app_name, kwargs=settings.destination).get_or_create_table( settings.destination) try: if merge: with Timer("merge shards"): destination.merge_shards() # RECOVER LAST SQL STATE redis = Redis.from_url(REDIS_URL) state = redis.get(settings.extractor.key) if start: state = start, 0 elif restart or not state: state = (0, 0) redis.set(settings.extractor.key, value2json(state).encode("utf8")) else: state = json2value(state.decode("utf8")) last_modified, job_id = state # SCAN SCHEMA, GENERATE EXTRACTION SQL extractor = MySqlSnowflakeExtractor(settings.source) canonical_sql = extractor.get_sql(SQL("SELECT 0")) # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN old_sql = redis.get(settings.extractor.sql) if old_sql and old_sql.decode("utf8") != canonical_sql.sql: if force: Log.warning("Schema has changed") else: Log.error("Schema has changed") redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8")) # SETUP SOURCE source = MySQL(settings.source.database) while True: Log.note( "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}", last_modified=last_modified, job_id=job_id, ) # Example: job.id ==283890114 # get_ids = ConcatSQL( # (SQL_SELECT, sql_alias(quote_value(283890114), "id")) # ) get_ids = sql_query({ "from": "job", "select": ["id"], "where": { "or": [ { "gt": { "last_modified": Date(last_modified) } }, { "and": [ { "eq": { "last_modified": Date(last_modified) } }, { "gt": { "id": job_id } }, ] }, ] }, "sort": ["last_modified", "id"], "limit": settings.extractor.chunk_size, }) sql = extractor.get_sql(get_ids) # PULL FROM source, AND PUSH TO destination acc = [] with source.transaction(): cursor = source.query(sql, stream=True, row_tuples=True) extractor.construct_docs(cursor, acc.append, False) if not acc: break # SOME LIMITS PLACES ON STRING SIZE for fl in jx.drill(acc, "job_log.failure_line"): fl.message = strings.limit(fl.message, 10000) for r in acc: r.etl.timestamp = Date.now() destination.extend(acc) # RECORD THE STATE last_doc = acc[-1] last_modified, job_id = last_doc.last_modified, last_doc.id redis.set( settings.extractor.key, value2json((last_modified, job_id)).encode("utf8"), ) if len(acc) < settings.extractor.chunk_size: break except Exception as e: Log.warning("problem with extraction", cause=e) Log.note("done job extraction") try: with Timer("merge shards"): destination.merge_shards() except Exception as e: Log.warning("problem with merge", cause=e) Log.note("done job merge")
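The job and alert extractors resume from a (last_modified, id) checkpoint kept in Redis. A reduced sketch of just that load/save cycle, using the same redis-py and mo-json calls as the code above; the function names are hypothetical.

def load_state(redis, key):
    """Return (last_modified, id), or (0, 0) when no checkpoint exists (sketch only)."""
    raw = redis.get(key)
    if not raw:
        return 0, 0
    last_modified, doc_id = json2value(raw.decode("utf8"))
    return last_modified, doc_id


def save_state(redis, key, last_doc):
    redis.set(key, value2json((last_doc.last_modified, last_doc.id)).encode("utf8"))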
class MySqlSnowflakeExtractor(object): @override def __init__(self, kwargs=None): self.settings = kwargs excludes = listwrap(self.settings.exclude) self.settings.exclude = set(e for e in excludes if len(split_field(e)) == 1) self.settings.exclude_columns = set(p for e in excludes for p in [tuple(split_field(e))] if len(p) > 1) self.settings.exclude_path = list( map(split_field, listwrap(self.settings.exclude_path))) self.settings.show_foreign_keys = coalesce( self.settings.show_foreign_keys, True) self.name_relations = unwrap(coalesce(self.settings.name_relations, {})) self.all_nested_paths = None self.nested_path_to_join = None self.columns = None with Explanation("scan database", debug=DEBUG): self.db = MySQL(**kwargs.database) self.settings.database.schema = self.db.settings.schema with self.db.transaction(): self._scan_database() if not self.settings.database.schema: Log.error("you must provide a `database.schema`") def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def close(self): self.db.close() def get_sql(self, get_ids): sql = self._compose_sql(get_ids) # ORDERING sort = [] ordering = [] for ci, c in enumerate(self.columns): if c.sort: sort.append(quote_column(c.column_alias) + SQL_IS_NOT_NULL) sort.append(quote_column(c.column_alias)) ordering.append(ci) union_all_sql = SQL_UNION_ALL.join(sql) union_all_sql = ConcatSQL( SQL_SELECT, SQL_STAR, SQL_FROM, sql_alias(sql_iso(union_all_sql), "a"), SQL_ORDERBY, sql_list(sort), ) if DEBUG: Log.note("{{sql}}", sql=union_all_sql) return union_all_sql def path_not_allowed(self, path): return path != "." and any( path_in_path(e, p) for p in [split_field(path)] for e in self.settings.exclude_path) def _scan_database(self): # GET ALL RELATIONS raw_relations = self.db.query( """ SELECT table_schema, table_name, referenced_table_schema, referenced_table_name, referenced_column_name, constraint_name, column_name, ordinal_position FROM information_schema.key_column_usage WHERE referenced_column_name IS NOT NULL """, param=self.settings.database, ) if not raw_relations: Log.error("No relations in the database") for r in self.settings.add_relations: try: lhs, rhs = map(strings.trim, r.split("->")) lhs = lhs.split(".") if len(lhs) == 2: lhs = [self.settings.database.schema] + lhs rhs = rhs.split(".") if len(rhs) == 2: rhs = [self.settings.database.schema] + rhs to_add = Data( ordinal_position=1, # CAN ONLY HANDLE 1-COLUMN RELATIONS table_schema=lhs[0], table_name=lhs[1], column_name=lhs[2], referenced_table_schema=rhs[0], referenced_table_name=rhs[1], referenced_column_name=rhs[2], ) # CHECK IF EXISTING if jx.filter(raw_relations, {"eq": to_add}): Log.note("Relation {{relation}} already exists", relation=r) continue to_add.constraint_name = Random.hex(20) raw_relations.append(to_add) except Exception as e: Log.error("Could not parse {{line|quote}}", line=r, cause=e) relations = jx.select( raw_relations, [ { "name": "constraint.name", "value": "constraint_name" }, { "name": "table.schema", "value": "table_schema" }, { "name": "table.name", "value": "table_name" }, { "name": "column.name", "value": "column_name" }, { "name": "referenced.table.schema", "value": "referenced_table_schema" }, { "name": "referenced.table.name", "value": "referenced_table_name" }, { "name": "referenced.column.name", "value": "referenced_column_name" }, { "name": "ordinal_position", "value": "ordinal_position" }, ], ) # GET ALL TABLES raw_tables = self.db.query(""" SELECT t.table_schema, t.table_name, c.constraint_name, 
c.constraint_type, k.column_name, k.ordinal_position FROM information_schema.tables t LEFT JOIN information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY') LEFT JOIN information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema ORDER BY t.table_schema, t.table_name, c.constraint_name, k.ordinal_position, k.column_name """) # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING tables = UniqueIndex(keys=["name", "schema"]) for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]): c = wrap(list(c)) best_index = Null is_referenced = False is_primary = False for g, w in jx.groupby(c, "constraint_name"): if not g.constraint_name: continue w = list(w) ref = False for r in relations: if (r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name): ref = True is_prime = w[0].constraint_type == "PRIMARY" reasons_this_one_is_better = [ best_index == None, # WE DO NOT HAVE A CANDIDATE YET is_prime and not is_primary, # PRIMARY KEYS ARE GOOD TO HAVE is_primary == is_prime and ref and not is_referenced, # REFERENCED UNIQUE TUPLES ARE GOOD TOO is_primary == is_prime and ref == is_referenced and len(w) < len(best_index), # THE SHORTER THE TUPLE, THE BETTER ] if any(reasons_this_one_is_better): is_primary = is_prime is_referenced = ref best_index = w tables.add({ "name": t.table_name, "schema": t.table_schema, "id": [b.column_name for b in best_index], }) fact_table = tables[self.settings.fact_table, self.settings.database.schema] ids_table = { "alias": "t0", "name": "__ids__", "schema": fact_table.schema, "id": fact_table.id, } relations.extend( wrap({ "constraint": { "name": "__link_ids_to_fact_table__" }, "table": ids_table, "column": { "name": c }, "referenced": { "table": fact_table, "column": { "name": c } }, "ordinal_position": i, }) for i, c in enumerate(fact_table.id)) tables.add(ids_table) # GET ALL COLUMNS raw_columns = self.db.query(""" SELECT column_name, table_schema, table_name, ordinal_position, data_type FROM information_schema.columns """) reference_only_tables = [ r.split(".")[0] for r in self.settings.reference_only if len(r.split(".")) == 2 ] reference_all_tables = [ r.split(".")[0] for r in self.settings.reference_only if len(r.split(".")) == 1 ] foreign_column_table_schema_triples = {(r.column.name, r.table.name, r.table.schema) for r in relations} referenced_column_table_schema_triples = {( r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema, ) for r in relations} related_column_table_schema_triples = ( foreign_column_table_schema_triples | referenced_column_table_schema_triples) columns = UniqueIndex(["column.name", "table.name", "table.schema"]) for c in raw_columns: if c.table_name in reference_only_tables: if c.table_name + "." 
+ c.column_name in self.settings.reference_only: include = True reference = True foreign = False elif c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = False foreign = False else: include = False reference = False foreign = False elif c.table_name in reference_all_tables: # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED if c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = True foreign = False elif ( c.column_name, c.table_name, c.table_schema, ) in foreign_column_table_schema_triples: include = False reference = False foreign = True else: include = True reference = False foreign = False elif c.column_name in tables[(c.table_name, c.table_schema)].id: include = self.settings.show_foreign_keys reference = False foreign = False elif ( c.column_name, c.table_name, c.table_schema, ) in foreign_column_table_schema_triples: include = False reference = False foreign = True elif ( c.column_name, c.table_name, c.table_schema, ) in referenced_column_table_schema_triples: include = self.settings.show_foreign_keys reference = False foreign = False else: include = True reference = False foreign = False rel = { "column": { "name": c.column_name, "type": c.data_type }, "table": { "name": c.table_name, "schema": c.table_schema }, "ordinal_position": c.ordinal_position, "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id, "include": include, # TRUE IF THIS COLUMN IS OUTPUTTED "reference": reference, # TRUE IF THIS COLUMN REPRESENTS THE ROW "foreign": foreign, # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW } columns.add(rel) # ITERATE OVER ALL PATHS todo = FlatList() output_columns = FlatList() nested_path_to_join = {} all_nested_paths = [["."]] def follow_paths(position, path, nested_path, done_relations, no_nested_docs): if position.name in self.settings.exclude: return if self.path_not_allowed(path): return if DEBUG: Log.note("Trace {{path}}", path=path) if position.name != "__ids__": # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR WHEN IF IT FAILS) self.db.query( ConcatSQL( SQL_SELECT, SQL_STAR, SQL_FROM, quote_column(position.schema, position.name), SQL_LIMIT, SQL_ONE, )) if position.name in reference_all_tables: no_nested_docs = True if position.name in reference_only_tables: return curr_join_list = copy(nested_path_to_join[nested_path[0]]) ############################################################################### # INNER OBJECTS ############################################################################### referenced_tables = list( sort_using_key( jx.groupby( jx.filter( relations, { "eq": { "table.name": position.name, "table.schema": position.schema, } }, ), "constraint.name", ), key=lambda p: first(p[1]).column.name, )) for g, constraint_columns in referenced_tables: g = unwrap(g) constraint_columns = deepcopy(constraint_columns) if g["constraint.name"] in done_relations: continue if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude): continue done_relations.add(g["constraint.name"]) many_to_one_joins = nested_path_to_join[nested_path[0]] index = len(many_to_one_joins) alias = "t" + text(index) for c in constraint_columns: c.referenced.table.alias = alias c.table = position many_to_one_joins.append({ "join_columns": constraint_columns, "path": path, "nested_path": nested_path, }) # HANDLE THE COMMON *id SUFFIX name = [] for cname, tname in zip( constraint_columns.column.name, 
constraint_columns.referenced.table.name, ): if cname.startswith(tname): name.append(tname) elif cname.endswith("_id"): name.append(cname[:-3]) else: name.append(cname) relation_string = many_to_one_string(constraint_columns[0]) step = "/".join(name) if len(constraint_columns) == 1: step = self.name_relations.get(relation_string, step) referenced_column_path = concat_field(path, step) if self.path_not_allowed(referenced_column_path): continue if referenced_column_path in reference_only_tables: continue col_pointer_name = relative_field(referenced_column_path, nested_path[0]) for col in columns: if (col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema): col_full_name = concat_field( col_pointer_name, literal_field(col.column.name)) if (col.is_id and len(nested_path) == 1 and col.table.name == fact_table.name and col.table.schema == fact_table.schema): # ALWAYS SHOW THE ID OF THE FACT c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": True, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name, }) elif col.column.name == constraint_columns[ 0].column.name: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.is_id: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.reference: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name, # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED }) elif col.include: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": False, "path": referenced_column_path, "nested_path": nested_path, "put": col_full_name, }) if position.name in reference_only_tables: continue todo.append( Data( position=copy(constraint_columns[0].referenced.table), path=referenced_column_path, nested_path=nested_path, done_relations=copy(done_relations), no_nested_docs=no_nested_docs, )) ############################################################################### # NESTED OBJECTS ############################################################################### if not no_nested_docs: nesting_tables = list( sort_using_key( jx.groupby( jx.filter( relations, { "eq": { "referenced.table.name": position.name, "referenced.table.schema": position.schema, } }, ), "constraint.name", ), key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0], )) for g, constraint_columns in nesting_tables: g = unwrap(g) constraint_columns = deepcopy(constraint_columns) if g["constraint.name"] in done_relations: continue done_relations.add(g["constraint.name"]) many_table = set(constraint_columns.table.name) if not (many_table - self.settings.exclude): continue relation_string = one_to_many_string(constraint_columns[0]) step = 
"/".join(many_table) if len(constraint_columns) == 1: step = self.name_relations.get(relation_string, step) referenced_column_path = concat_field(path, step) if self.path_not_allowed(referenced_column_path): continue new_nested_path = [referenced_column_path] + nested_path all_nested_paths.append(new_nested_path) if referenced_column_path in nested_path_to_join: Log.error( "{{path}} already exists, try adding entry to name_relations", path=referenced_column_path, ) one_to_many_joins = nested_path_to_join[ referenced_column_path] = copy(curr_join_list) index = len(one_to_many_joins) alias = "t" + text(index) for c in constraint_columns: c.table.alias = alias c.referenced.table = position one_to_many_joins.append( set_default( {}, g, { "children": True, "join_columns": constraint_columns, "path": path, "nested_path": nested_path, }, )) for col in columns: if (col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema): col_full_name = join_field( split_field(referenced_column_path) [len(split_field(new_nested_path[0])):] + [literal_field(col.column.name)]) if col.column.name == constraint_columns[ 0].column.name: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) elif col.is_id: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if self.settings.show_foreign_keys else None, }) else: c_index = len(output_columns) output_columns.append({ "table_alias": alias, "column_alias": "c" + text(c_index), "column": col, "sort": col.is_id, "path": referenced_column_path, "nested_path": new_nested_path, "put": col_full_name if col.include else None, }) todo.append( Data( position=constraint_columns[0].table, path=referenced_column_path, nested_path=new_nested_path, done_relations=copy(done_relations), no_nested_docs=no_nested_docs, )) path = "." 
nested_path = [path] nested_path_to_join["."] = [{ "path": path, "join_columns": [{ "referenced": { "table": ids_table } }], "nested_path": nested_path, }] todo.append( Data( position=ids_table, path=path, nested_path=nested_path, done_relations=set(), no_nested_docs=False, )) while todo: item = todo.pop(0) follow_paths(**item) self.all_nested_paths = all_nested_paths self.nested_path_to_join = nested_path_to_join self.columns = output_columns def _compose_sql(self, get_ids): """ :param get_ids: SQL to get the ids, and used to select the documents returned :return: """ if not isinstance(get_ids, SQL): Log.error("Expecting SQL to get some primary ids") sql = [] for nested_path in self.all_nested_paths: # MAKE THE REQUIRED JOINS sql_joins = [] for i, curr_join in enumerate( self.nested_path_to_join[nested_path[0]]): curr_join = wrap(curr_join) rel = curr_join.join_columns[0] if i == 0: sql_joins.append( ConcatSQL( SQL_FROM, sql_alias(sql_iso(get_ids), rel.referenced.table.alias), )) elif curr_join.children: full_name = quote_column(rel.table.schema, rel.table.name) sql_joins.append( ConcatSQL( SQL_INNER_JOIN, sql_alias(full_name, rel.table.alias), SQL_ON, SQL_AND.join( ConcatSQL( quote_column(rel.table.alias, const_col.column.name), SQL_EQ, quote_column( rel.referenced.table.alias, const_col.referenced.column.name, ), ) for const_col in curr_join.join_columns), )) else: full_name = quote_column(rel.referenced.table.schema, rel.referenced.table.name) sql_joins.append( ConcatSQL( SQL_LEFT_JOIN, sql_alias(full_name, rel.referenced.table.alias), SQL_ON, SQL_AND.join( ConcatSQL( quote_column( rel.referenced.table.alias, const_col.referenced.column.name, ), SQL_EQ, quote_column(rel.table.alias, const_col.column.name), ) for const_col in curr_join.join_columns), )) # ONLY SELECT WHAT WE NEED, NULL THE REST selects = [] not_null_column_seen = False for c in self.columns: if ( c.column.table.name, c.column.column.name, ) in self.settings.exclude_columns: selects.append(sql_alias(SQL_NULL, c.column_alias)) elif c.nested_path[0] == nested_path[0]: s = sql_alias( quote_column(c.table_alias, c.column.column.name), c.column_alias, ) selects.append(s) not_null_column_seen = True elif startswith_field(nested_path[0], c.path): # PARENT ID REFERENCES if c.column.is_id: s = sql_alias( quote_column(c.table_alias, c.column.column.name), c.column_alias, ) selects.append(s) not_null_column_seen = True else: selects.append(sql_alias(SQL_NULL, c.column_alias)) else: selects.append(sql_alias(SQL_NULL, c.column_alias)) if not_null_column_seen: sql.append(SQL_SELECT + sql_list(selects) + SQL("").join(sql_joins)) return sql def construct_docs(self, cursor, append, please_stop): """ :param cursor: ITERATOR OF RECORD TUPLES :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT :return: (count, first, next, next_key) number of documents added the first document in the batch the first document of the next batch """ null_values = set(self.settings.null_values) | {None} doc_count = 0 columns = tuple(wrap(c) for c in self.columns) with Timer("Downloading from MySQL", verbose=DEBUG): curr_doc = Null row_count = 0 if DEBUG: cursor = list(cursor) Log.note("{{data|json|limit(1000)}}", data=cursor) for row in cursor: row_count += 1 if please_stop: Log.error("Got `please_stop` signal") nested_path = [] next_object = Data() for c, value in zip(columns, row): # columns ARE IN ORDER, FROM FACT ['.'] TO EVER-DEEPER-NESTED if value in null_values: # EVERY COLUMN THAT'S NOT NEEDED IS None continue if len(nested_path) < 
len(c.nested_path): # EACH COLUMN IS DEEPER THAN THE NEXT # THESE WILL BE THE id COLUMNS, WHICH ARE ALWAYS INCLUDED AND BEFORE ALL OTHER VALUES nested_path = unwrap(c.nested_path) next_object = Data() next_object[c.put] = value if len(nested_path) == 1: # TOP LEVEL DOCUMENT, EMIT THE curr_doc AND ADVANCE if curr_doc: append(curr_doc["id"]) doc_count += 1 curr_doc = next_object continue # LET'S PLACE next_object AT THE CORRECT NESTED LEVEL children = [curr_doc] for parent_path, path in jx.pairs(reversed(nested_path)): relative_path = relative_field(path, parent_path) try: parent = children[-1] children = unwrap(parent[relative_path]) if not children: children = parent[relative_path] = [] except Exception as e: Log.error( "Document construction error: path={{path}}\nsteps={{steps}}\ndoc={{curr_doc}}\nnext={{next_object}}", path=path, steps=nested_path, curr_doc=curr_doc, next_object=next_object, cause=e) children.append(next_object) # DEAL WITH LAST RECORD if curr_doc: append(curr_doc["id"]) doc_count += 1 Log.note( "{{doc_count}} documents ({{row_count}} db records)", doc_count=doc_count, row_count=row_count, )
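A compact driver showing the extractor's calling convention, the same one the tests above use: generate the snowflake SQL for the chosen fact-table ids, stream the rows, and let construct_docs emit one assembled document per id through the append callback. config and DOC_ID are placeholders for real settings and a fact-table id.

extractor = MySqlSnowflakeExtractor(kwargs=config)
source = MySQL(**config.database)

docs = []
with source.transaction():
    cursor = source.query(
        extractor.get_sql(SQL("SELECT " + text(DOC_ID) + " as id")),
        stream=True,
        row_tuples=True,  # construct_docs expects plain row tuples
    )
    extractor.construct_docs(cursor, docs.append, False)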