def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(config.destination).get_or_create_table(
        config.destination
    )

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    prev_done = self.get_state()
    if prev_done and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
    self.set_state()
def __init__(self, config):
    self.config = config = wrap(config)
    config.range.min = Date(config.range.min)
    config.range.max = Date(config.range.max)
    config.start = Date(config.start)
    config.interval = Duration(config.interval)
    config.branches = listwrap(config.branches)
    self.destination = bigquery.Dataset(config.destination).get_or_create_table(
        config.destination
    )

    # CALCULATE THE PREVIOUS RUN
    mozci_version = self.version("mozci")
    self.etl_config_table = jx_sqlite.Container(config.config_db).get_or_create_facts(
        "etl-range"
    )
    done_result = wrap(self.etl_config_table.query()).data
    prev_done = done_result[0]
    if len(done_result) and prev_done.mozci_version == mozci_version:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(prev_done.min, config.start, "today-2day")),
            max=Date(coalesce(prev_done.max, config.start, "today-2day")),
        )
    else:
        self.done = Data(
            mozci_version=mozci_version,
            min=Date(coalesce(config.start, "today-2day")),
            max=Date(coalesce(config.start, "today-2day")),
        )
    self.etl_config_table.add(self.done)
def main():
    since = Date.today() - Duration(SCATTER_RANGE)

    if config.database.host not in listwrap(config.analysis.expected_database_host):
        Log.error(
            "Expecting database to be one of {{expected}}",
            expected=config.analysis.expected_database_host,
        )
    if not config.analysis.interesting:
        Log.alert(
            "Expecting config file to have `analysis.interesting` with a json expression. All series are included."
        )

    # SETUP DESTINATION
    deviant_summary = bigquery.Dataset(config.deviant_summary).get_or_create_table(
        read_only=True, kwargs=config.deviant_summary
    )

    if config.args.id:
        # EXIT EARLY AFTER WE GOT THE SPECIFIC IDS
        if len(config.args.id) < 4:
            step_detector.SHOW_CHARTS = True
        for signature_hash in config.args.id:
            process(
                signature_hash,
                since=since,
                source=config.database,
                deviant_summary=deviant_summary,
                show=True,
            )
        return

    # DOWNLOAD
    if config.args.download:
        # GET INTERESTING SERIES
        where_clause = BQLang[jx_expression(config.analysis.interesting)].to_bq(
            deviant_summary.schema
        )
        # GET ALL KNOWN SERIES
        docs = list(deviant_summary.sql_query(f"""
            SELECT * EXCEPT (_rank, values)
            FROM (
                SELECT *, row_number() over (partition by id order by last_updated desc) as _rank
                FROM {quote_column(deviant_summary.full_name)}
            ) a
            WHERE _rank=1 and {sql_iso(where_clause)}
            LIMIT {quote_value(DOWNLOAD_LIMIT)}
        """))
        if len(docs) == DOWNLOAD_LIMIT:
            Log.warning("Not all signatures downloaded")
        File(config.args.download).write(list2tab(docs, separator=","))

    # DEVIANT
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.deviant,
        show_old=False,
        show_distribution=True,
    )

    # MODAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort="overall_dev_score",
        limit=config.args.modal,
        where={"eq": {"overall_dev_status": "MODAL"}},
        show_distribution=True,
    )

    # OUTLIERS
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "overall_dev_score", "sort": "desc"},
        limit=config.args.outliers,
        where={"eq": {"overall_dev_status": "OUTLIERS"}},
        show_distribution=True,
    )

    # SKEWED
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.skewed,
        where={"eq": {"overall_dev_status": "SKEWED"}},
        show_distribution=True,
    )

    # OK
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "overall_dev_score"}, "sort": "desc"},
        limit=config.args.ok,
        where={"eq": {"overall_dev_status": "OK"}},
        show_distribution=True,
    )

    # NOISE
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "relative_noise"}, "sort": "desc"},
        where={"gte": {"num_pushes": 30}},
        limit=config.args.noise,
    )

    # EXTRA
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_extra_diff"}, "sort": "desc"},
        where={"lte": {"num_new_segments": 7}},
        limit=config.args.extra,
    )

    # MISSING
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": {"abs": "max_missing_diff"}, "sort": "desc"},
        where={"lte": {"num_old_segments": 6}},
        limit=config.args.missing,
    )

    # PATHOLOGICAL
    show_sorted(
        config=config,
        since=since,
        source=config.database,
        deviant_summary=deviant_summary,
        sort={"value": "num_segments", "sort": "desc"},
        limit=config.args.pathological,
    )
def run(self, force=False, restart=False, merge=False):
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))
        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            # get_ids = sql_query(
            #     {
            #         "from": "job",
            #         "select": ["id"],
            #         "where": {
            #             "or": [
            #                 {"gt": {"last_modified": parse(last_modified)}},
            #                 {
            #                     "and": [
            #                         {"eq": {"last_modified": parse(last_modified)}},
            #                         {"gt": {"id": job_id}},
            #                     ]
            #                 },
            #             ]
            #         },
            #         "sort": ["last_modified", "id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(str(
                (
                    Job.objects.filter(
                        Q(last_modified__gt=parse(last_modified).datetime)
                        | (
                            Q(last_modified=parse(last_modified).datetime)
                            & Q(id__gt=job_id)
                        )
                    )
                    .annotate()
                    .values("id")
                    .order_by("last_modified", "id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")
def extract(self, settings, force, restart, start, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if start:
            state = start, 0
        elif restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))
        last_modified, job_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                last_modified=last_modified,
                job_id=job_id,
            )

            # Example: job.id == 283890114
            # get_ids = ConcatSQL(
            #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
            # )
            get_ids = sql_query({
                "from": "job",
                "select": ["id"],
                "where": {
                    "or": [
                        {"gt": {"last_modified": Date(last_modified)}},
                        {
                            "and": [
                                {"eq": {"last_modified": Date(last_modified)}},
                                {"gt": {"id": job_id}},
                            ]
                        },
                    ]
                },
                "sort": ["last_modified", "id"],
                "limit": settings.extractor.chunk_size,
            })
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # SOME LIMITS PLACED ON STRING SIZE
            for fl in jx.drill(acc, "job_log.failure_line"):
                fl.message = strings.limit(fl.message, 10000)
            for r in acc:
                r.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, job_id = last_doc.last_modified, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, job_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done job extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done job merge")
def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))
        last_modified, alert_id = state
        last_modified = Date(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )

            # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY
            last_year = Date.today() - YEAR + DAY
            get_ids = SQL(
                "SELECT s.id "
                + "\nFROM treeherder.performance_alert_summary s"
                + "\nLEFT JOIN treeherder.performance_alert a ON s.id=a.summary_id"
                + "\nWHERE s.created>" + quote_value(last_year).sql
                + " AND (s.last_updated > " + quote_value(last_modified).sql
                + "\nOR a.last_updated > " + quote_value(last_modified).sql + ")"
                + "\nGROUP BY s.id"
                + "\nORDER BY s.id"
                + "\nLIMIT " + quote_value(settings.extractor.chunk_size).sql
            )
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()
def extract(self, settings, force, restart, merge):
    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis.from_url(REDIS_URL)
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = 916850000
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))
        perf_id = state

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note("Extracting perfs for perf.id={{perf_id}}", perf_id=perf_id)

            # get_ids = sql_query(
            #     {
            #         "from": "performance_datum",
            #         "select": ["id"],
            #         "where": {"gt": {"id": perf_id}},
            #         "sort": ["id"],
            #         "limit": settings.extractor.chunk_size,
            #     }
            # )
            get_ids = SQL(str(
                (
                    PerformanceDatum.objects.filter(id__gt=perf_id)
                    .values("id")
                    .order_by("id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break

            # TODO: Remove me July 2021
            # OLD PERF RECORDS HAVE NO CORRESPONDING JOB
            # ADD job.submit_time FOR PARTITIONING
            for a in acc:
                if not a.job.submit_time:
                    a.job.submit_time = a.push_timestamp
                a.etl.timestamp = Date.now()
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            perf_id = last_doc.id
            redis.set(settings.extractor.key, value2json(perf_id).encode("utf8"))

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done perf extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done perf merge")
def run(self, force=False, restart=False, merge=False):
    # SETUP LOGGING
    settings = startup.read_settings(filename=CONFIG_FILE)
    constants.set(settings.constants)
    Log.start(settings.debug)

    if not settings.extractor.app_name:
        Log.error("Expecting an extractor.app_name in config file")

    # SETUP DESTINATION
    destination = bigquery.Dataset(
        dataset=settings.extractor.app_name, kwargs=settings.destination
    ).get_or_create_table(settings.destination)

    try:
        if merge:
            with Timer("merge shards"):
                destination.merge_shards()

        # RECOVER LAST SQL STATE
        redis = Redis()
        state = redis.get(settings.extractor.key)

        if restart or not state:
            state = (0, 0)
            redis.set(settings.extractor.key, value2json(state).encode("utf8"))
        else:
            state = json2value(state.decode("utf8"))
        last_modified, alert_id = state
        last_modified = parse(last_modified)

        # SCAN SCHEMA, GENERATE EXTRACTION SQL
        extractor = MySqlSnowflakeExtractor(settings.source)
        canonical_sql = extractor.get_sql(SQL("SELECT 0"))

        # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
        old_sql = redis.get(settings.extractor.sql)
        if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
            if force:
                Log.warning("Schema has changed")
            else:
                Log.error("Schema has changed")
        redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

        # SETUP SOURCE
        source = MySQL(settings.source.database)

        while True:
            Log.note(
                "Extracting alerts for last_modified={{last_modified|datetime|quote}}, alert.id={{alert_id}}",
                last_modified=last_modified,
                alert_id=alert_id,
            )

            # ONLY YOUNG RECORDS CAN GO INTO BIGQUERY
            last_year = Date.today() - YEAR + DAY

            # EQUIVALENT SQL:
            # SELECT
            #     s.id
            # FROM
            #     treeherder.performance_alert_summary s
            # LEFT JOIN
            #     treeherder.performance_alert a ON s.id=a.summary_id
            # WHERE
            #     s.created>{last_year} AND (s.last_updated>{last_modified} OR a.last_updated>{last_modified})
            # GROUP BY
            #     s.id
            # ORDER BY
            #     s.id
            # LIMIT
            #     {settings.extractor.chunk_size}
            get_ids = SQL(str(
                (
                    PerformanceAlertSummary.objects.filter(
                        Q(created__gt=last_year.datetime)
                        & (
                            Q(last_updated__gt=last_modified.datetime)
                            | Q(alerts__last_updated__gt=last_modified.datetime)
                        )
                    )
                    .annotate()
                    .values("id")
                    .order_by("id")[: settings.extractor.chunk_size]
                ).query
            ))
            sql = extractor.get_sql(get_ids)

            # PULL FROM source, AND PUSH TO destination
            acc = []
            with source.transaction():
                cursor = source.query(sql, stream=True, row_tuples=True)
                extractor.construct_docs(cursor, acc.append, False)
            if not acc:
                break
            destination.extend(acc)

            # RECORD THE STATE
            last_doc = acc[-1]
            last_modified, alert_id = last_doc.created, last_doc.id
            redis.set(
                settings.extractor.key,
                value2json((last_modified, alert_id)).encode("utf8"),
            )

            if len(acc) < settings.extractor.chunk_size:
                break

    except Exception as e:
        Log.warning("problem with extraction", cause=e)

    Log.note("done alert extraction")

    try:
        with Timer("merge shards"):
            destination.merge_shards()
    except Exception as e:
        Log.warning("problem with merge", cause=e)

    Log.note("done alert merge")
    Log.stop()