def test_extract_alert(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, copy the JSON from the test failure into
    the test_extract_alerts.json file, then use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    # TESTS ARE RUN WITH CURRENT TIMESTAMPS
    doc.created = now
    doc.last_updated = now
    for d in doc.details:
        d.created = now
        d.last_updated = now
        d.series_signature.last_updated = now

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603

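# A minimal sketch of the fixture-update step described in the docstring above,
# assuming the expected documents live in test_extract_alerts.json beside the
# tests; the helper name and default path are illustrative only, while
# value2json is the serializer already used elsewhere in this codebase.
from pathlib import Path

from mo_json import value2json


def update_expected_json(acc, path="test_extract_alerts.json"):
    # OVERWRITE THE FIXTURE WITH THE FRESHLY EXTRACTED DOCUMENTS,
    # THEN REVIEW THE CHANGES WITH `git diff`
    Path(path).write_text(value2json(acc))
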
def test_extract_alert_sql(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, replace the contents of
    test_extract_alerts.sql with the contents of the `sql` variable below.
    You can then review the resulting diff.
    """
    p = test_perf_alert
    s2 = PerformanceAlertSummary.objects.create(
        id=2,
        repository=test_perf_alert_summary.repository,
        prev_push_id=3,
        push_id=4,
        created=datetime.datetime.now(),
        framework=test_perf_alert_summary.framework,
        manually_created=False,
    )

    # set related summary with downstream status, make sure that works
    # p = PerformanceAlert.objects.get(id=1)
    p.status = PerformanceAlert.DOWNSTREAM
    p.related_summary = s2
    p.save()

    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(SQL("SELECT 0"))
    assert "".join(sql.sql.split()) == "".join(EXTRACT_ALERT_SQL.split())

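# A minimal sketch of the .sql fixture-update step described in the docstrings
# above and below: overwrite the fixture with the freshly generated SQL, then
# review the diff. The helper name and default path are illustrative only;
# sql.sql is the attribute the assertions already compare against.
from pathlib import Path


def update_expected_sql(sql, path="test_extract_alerts.sql"):
    # OVERWRITE THE FIXTURE, THEN REVIEW THE CHANGES WITH `git diff`
    Path(path).write_text(sql.sql)
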
def test_extract_job_sql(extract_job_settings, transactional_db):
    """
    VERIFY SQL OVER DATABASE

    If you find this test failing, replace the contents of
    test_extract_job.sql with the contents of the `sql` variable below.
    You can then review the resulting diff.
    """
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT 0"))
    assert "".join(sql.sql.split()) == "".join(EXTRACT_JOB_SQL.split())

def run_compare(self, config, id_sql, expected):
    db = MySQL(**config.database)
    extractor = MySqlSnowflakeExtractor(kwargs=config)
    sql = extractor.get_sql(SQL(id_sql))

    result = []
    with db.transaction():
        cursor = db.query(sql, stream=True, row_tuples=True)
        cursor = list(cursor)
        extractor.construct_docs(cursor, result.append, False)

    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")

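# A hedged usage sketch for run_compare above; the settings attribute, the id
# query, and EXPECTED_DOCS are hypothetical stand-ins for real fixtures.
def test_compare_one_row(self):
    self.run_compare(
        self.settings.source,  # assumed config carrying a .database mapping
        "SELECT 42 AS id",     # any query producing the ids to extract
        EXPECTED_DOCS,         # hypothetical expected-documents fixture
    )
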
def test_extract_job(complex_job, extract_job_settings, now):
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid
    doc.last_modified = complex_job.last_modified

    assertAlmostEqual(
        acc, JOB, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603

def test_extract_job(complex_job, extract_job_settings, now):
    """
    If you find this test failing, copy the JSON from the test failure into
    the test_extract_job.json file, then use the diff to review the changes.
    """
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )

def run(self, force=False, restart=False, merge=False):
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            # get_ids = sql_query(
            #     {
            #         "from": "job",
            #         "select": ["id"],
            #         "where": {
            #             "or": [
            #                 {"gt": {"last_modified": parse(last_modified)}},
            #                 {
            #                     "and": [
            #                         {"eq": {"last_modified": parse(last_modified)}},
            #                         {"gt": {"id": job_id}},
            #                     ]
            #                 },
            #             ]
            #         },
            #         "sort": ["last_modified", "id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(str(
                (
                    Job.objects.filter(
                        Q(last_modified__gt=parse(last_modified).datetime)
                        | (
                            Q(last_modified=parse(last_modified).datetime)
                            & Q(id__gt=job_id)
                        )
                    )
                    .annotate()
                    .values("id")
                    .order_by("last_modified", "id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")

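# For reference, a hedged sketch of the keyset-pagination query the ORM filter
# above is built to produce; table and column names follow the code, while the
# bracketed values are placeholders:
#
#     SELECT id
#     FROM job
#     WHERE last_modified > {last_modified}
#        OR (last_modified = {last_modified} AND id > {job_id})
#     ORDER BY last_modified, id
#     LIMIT {chunk_size}
#
# Paging on the (last_modified, id) pair instead of OFFSET keeps each chunk
# cheap and makes the loop safely resumable from the recorded state.
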
def extract(self, settings, force, restart, start, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if start:
            state = start, 0
        elif restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            get_ids = sql_query({
                "from": "job",
                "select": ["id"],
                "where": {
                    "or": [
                        {"gt": {"last_modified": Date(last_modified)}},
                        {
                            "and": [
                                {"eq": {"last_modified": Date(last_modified)}},
                                {"gt": {"id": job_id}},
                            ]
                        },
                    ]
                },
                "sort": ["last_modified", "id"],
                "limit": settings.extractor.chunk_size,
            })
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # SOME LIMITS ARE PLACED ON STRING SIZE
            for fl in jx.drill(acc, "job_log.failure_line"):
                fl.message = strings.limit(fl.message, 10000)
            for r in acc:
                r.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")

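# A minimal sketch of the checkpoint convention used by the loop above,
# assuming the same Redis key and UTF-8 JSON encoding; the helper names are
# illustrative, and value2json/json2value are the codecs already in use here.
def save_state(redis, key, last_modified, job_id):
    # STATE IS A (last_modified, id) PAIR, JSON-ENCODED AS UTF-8
    redis.set(key, value2json((last_modified, job_id)).encode("utf8"))


def load_state(redis, key, default=(0, 0)):
    # RETURN THE LAST RECORDED PAIR, OR default ON FIRST RUN / RESTART
    state = redis.get(key)
    return json2value(state.decode("utf8")) if state else default
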
def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = Date(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )

            # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY
            last_year = Date.today() - YEAR + DAY
            get_ids = SQL(
                "SELECT s.id"
                + "\nFROM treeherder.performance_alert_summary s"
                + "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                + "\nWHERE s.created>" + quote_value(last_year).sql
                + " AND (s.last_updated > " + quote_value(last_modified).sql
                + "\nOR a.last_updated > " + quote_value(last_modified).sql + ")"
                + "\nGROUP BY s.id"
                + "\nORDER BY s.id"
                + "\nLIMIT " + quote_value(settings.extractor.chunk_size).sql
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()

def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = 916850000
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        perf_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note("Extracting perfs for perf.id={{perf_id}}", perf_id=perf_id)

            # get_ids = sql_query(
            #     {
            #         "from": "performance_datum",
            #         "select": ["id"],
            #         "where": {"gt": {"id": perf_id}},
            #         "sort": ["id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(str(
                (
                    PerformanceDatum.objects.filter(id__gt=perf_id)
                    .values("id")
                    .order_by("id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # TODO: Remove me July 2021
            # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
            # ADD job.submit_time FOR PARTITIONING
            for a in acc:
                if not a.job.submit_time:
                    a.job.submit_time = a.push_timestamp
                a.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            perf_id = last_doc.id
            redis.set(settings.extractor.key, value2json(perf_id).encode("utf8"))

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done perf extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done perf merge")

def run(self, force=False, restart=False, merge=False):
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))

        last_modified, alert_id = state
        last_modified = parse(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )

            # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY
            last_year = Date.today() - YEAR + DAY

            # THE ORM QUERY BELOW GENERATES THE EQUIVALENT OF:
            #
            # SELECT
            #     s.id
            # FROM
            #     treeherder.performance_alert_summary s
            # LEFT JOIN
            #     treeherder.performance_alert a ON s.id=a.summary_id
            # WHERE
            #     s.created>{last_year} AND
            #     (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
            # GROUP BY
            #     s.id
            # ORDER BY
            #     s.id
            # LIMIT
            #     {settings.extractor.chunk_size}
            get_ids = SQL(str(
                (
                    PerformanceAlertSummary.objects.filter(
                        Q(created__gt=last_year.datetime)
                        & (
                            Q(last_updated__gt=last_modified.datetime)
                            | Q(alerts__last_updated__gt=last_modified.datetime)
                        )
                    )
                    .annotate()
                    .values("id")
                    .order_by("id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()