def generate() -> Generator[PushResult, None, None]:
    num_cached = 0

    for push in tqdm(pushes):
        key = cache_key(push)

        if push in cache and cache[push] is not None:
            num_cached += 1
            cached = cache[push]
            if cached:
                value, mozci_version = cached
                yield value
        else:
            logger.info(f"Analyzing {push.rev} at the {granularity} level...")

            try:
                if granularity == "label":
                    runnables = push.task_labels
                elif granularity == "group":
                    runnables = push.group_summaries.keys()

                value = (
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(granularity)),
                    list(push.get_likely_regressions(granularity)),
                )
                adr.config.cache.put(
                    key,
                    (value, MOZCI_VERSION),
                    adr.config["cache"]["retention"],
                )
                yield value
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")
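# The generator above closes over several names from its enclosing scope. A
# minimal sketch of those assumed definitions follows; the exact shapes of
# `PushResult` and `cache_key` are reconstructed for illustration and may
# differ from the real module.
from typing import List, Tuple

# One result per push: its revisions, the runnables that ran, and the
# possible/likely regressions detected for it.
PushResult = Tuple[List[str], List[str], List[str], List[str]]

# Failed analyses are cached for 10 days (retention is in minutes) so
# frequent re-runs don't keep retrying the same broken pushes.
MISSING_CACHE_RETENTION = 10 * 24 * 60


def cache_key(push, granularity: str = "label") -> str:
    # One cache entry per (granularity, revision) pair; in the original the
    # granularity is captured from the enclosing scope rather than passed in.
    return f"push_data.{granularity}.{push.rev}"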
def generate(
    futures: List[concurrent.futures.Future],
) -> Generator[PushResult, None, None]:
    nonlocal reretrieve
    num_cached = 0
    num_pushes = len(pushes)

    for _ in tqdm(range(num_pushes)):
        push = pushes.pop(0)
        cached = futures.pop(0).result()
        semaphore.release()

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
        # run.
        if cached:
            value, mozci_version = cached

            # Regenerate results which were generated with an older version of mozci.
            if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                cached = None
                reretrieve -= 1
            # Regenerate results which don't contain the fix revision.
            elif len(value) != 5:
                cached = None

        if cached:
            num_cached += 1
            value, mozci_version = cached
            assert len(value) == 5
            yield value
        else:
            logger.info(f"Analyzing {push.rev} at the {granularity} level...")

            key = cache_key(push)

            try:
                if granularity == "label":
                    runnables = push.task_labels
                elif granularity == "group":
                    runnables = push.group_summaries.keys()
                elif granularity == "config_group":
                    runnables = push.config_group_summaries.keys()

                value = (
                    tuple(push.revs),
                    push.backedoutby or push.bustage_fixed_by,
                    tuple(runnables),
                    tuple(push.get_possible_regressions(granularity)),
                    tuple(push.get_likely_regressions(granularity)),
                )
                mozci.config.cache.put(
                    key,
                    (value, MOZCI_VERSION),
                    mozci.config["cache"]["retention"],
                )
                assert len(value) == 5
                yield value
            except mozci.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
            except Exception:
                traceback.print_exc()

    logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
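# This variant pops `pushes` and `futures` from the enclosing scope and
# releases a `semaphore` slot per consumed result. A minimal sketch of that
# assumed setup (mirroring the executor-based variant further below), with a
# bounded semaphore keeping at most 256 cache reads in flight ahead of the
# consumer:
import concurrent.futures
import threading

semaphore = threading.BoundedSemaphore(256)


def retrieve_from_cache(push):
    # Block until the consumer frees a slot, then read this push's cached
    # result (None if it was never analyzed).
    semaphore.acquire()
    return mozci.config.cache.get(cache_key(push))


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(retrieve_from_cache, push) for push in pushes]
    for value in generate(futures):
        ...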
def generate_push_data(self, runnable):
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(
        TRAINING_MONTHS[runnable] / 2
    )

    pushes = mozci.push.make_push_objects(
        from_date=f"today-{from_months}month",
        to_date="today-3day",
        branch="autoland",
    )

    start_time = time.monotonic()

    num_cached = 0

    push_data = []

    def cache_key(push):
        return f"push_data.{runnable}.{push.rev}"

    # XXX: Some of the old pushes were stored without the mozci version, we
    # need to handle that until all have the version stored alongside them.
    for push in pushes:
        key = cache_key(push)
        cached = adr.config.cache.get(key)
        if not cached or isinstance(cached, tuple):
            continue

        adr.config.cache.forever(key, (cached, 0))

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    to_regenerate = set()
    for push in pushes[::-1]:
        cached = adr.config.cache.get(cache_key(push))
        if not cached:
            continue

        value, mozci_version = cached
        if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
            to_regenerate.add(value[0][0])

    for push in tqdm(pushes):
        key = cache_key(push)

        if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
            num_cached += 1
            cached = adr.config.cache.get(key)
            if cached:
                value, mozci_version = cached
                push_data.append(value)
        else:
            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.forever(key, (value, MOZCI_VERSION))
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        if time.monotonic() - start_time >= 10800:
            self.upload_adr_cache()
            start_time = time.monotonic()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
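# `zstd_compress` is not shown in this snippet; a plausible sketch, assuming
# it simply streams the JSON file through a zstandard compressor into a
# sibling `.zst` file:
import zstandard


def zstd_compress(path: str) -> None:
    cctx = zstandard.ZstdCompressor()
    with open(path, "rb") as input_f, open(f"{path}.zst", "wb") as output_f:
        # copy_stream compresses in chunks, so large push_data files don't
        # need to fit in memory.
        cctx.copy_stream(input_f, output_f)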
def generate(
    futures: List[concurrent.futures.Future],
) -> Generator[PushResult, None, None]:
    num_cached = 0
    num_pushes = len(pushes)

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
    # run.
    to_regenerate = int(os.environ.get("OLD_RESULTS_TO_REGENERATE", 0))

    for _ in tqdm(range(num_pushes)):
        push = pushes.pop(0)
        cached = futures.pop(0).result()
        semaphore.release()

        if cached and to_regenerate > 0:
            value, mozci_version = cached

            # Regenerate results which were generated when we were not cleaning
            # up WPT groups.
            if granularity == "group" and any(
                runnable.startswith("/") for runnable in value[1]
            ):
                cached = None
                to_regenerate -= 1
            # Regenerate results which were generated when we didn't get a correct
            # configuration for test-verify tasks.
            elif granularity == "config_group" and any(
                "test-verify" in runnable[0] for runnable in value[1]
            ):
                cached = None
                to_regenerate -= 1
            # Regenerate results which were generated with an older version of mozci.
            elif mozci_version != MOZCI_VERSION:
                cached = None
                to_regenerate -= 1

        if cached:
            num_cached += 1
            value, mozci_version = cached
            yield value
        else:
            logger.info(f"Analyzing {push.rev} at the {granularity} level...")

            key = cache_key(push)

            try:
                if granularity == "label":
                    runnables = push.task_labels
                elif granularity == "group":
                    runnables = push.group_summaries.keys()
                elif granularity == "config_group":
                    runnables = push.config_group_summaries.keys()

                value = (
                    push.revs,
                    tuple(runnables),
                    tuple(push.get_possible_regressions(granularity)),
                    tuple(push.get_likely_regressions(granularity)),
                )
                adr.config.cache.put(
                    key,
                    (value, MOZCI_VERSION),
                    adr.config["cache"]["retention"],
                )
                yield value
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
            except Exception:
                traceback.print_exc()

    logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
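# The regeneration budget is taken from the environment, so individual runs
# can opt in to refreshing stale cached results without a code change, e.g.
# (hypothetical invocation):
#
#   OLD_RESULTS_TO_REGENERATE=1000 python retriever.py ...
#
# With the default of 0, cached results are reused as-is.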
def generate_push_data(self, runnable):
    def upload_adr_cache():
        cache_path = os.path.splitext(ADR_CACHE_DB)[0]
        assert os.path.abspath(
            adr.config["cache"]["stores"]["file"]["path"]
        ) == os.path.abspath(cache_path)

        with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
            tar.add(cache_path)

        db.upload(ADR_CACHE_DB)

    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(
        TRAINING_MONTHS[runnable] / 2
    )

    pushes = mozci.push.make_push_objects(
        from_date=f"today-{from_months}month",
        to_date="today-3day",
        branch="autoland",
    )

    start_time = time.monotonic()

    num_cached = 0

    push_data = []

    for push in tqdm(pushes):
        key = f"push_data.{runnable}.{push.rev}"

        logger.info(f"Analyzing {push.rev} at the {runnable} level...")

        if adr.config.cache.has(key):
            num_cached += 1
            cached = adr.config.cache.get(key)
            if cached:
                # XXX: We have to support items in the cache that were added
                # before the mozci version was stored. We can drop the if
                # when all items have been switched over.
                value = cached[0] if isinstance(cached, tuple) else cached
                push_data.append(value)
        else:
            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.forever(key, (value, MOZCI_VERSION))
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        if time.monotonic() - start_time >= 3600:
            upload_adr_cache()
            start_time = time.monotonic()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    upload_adr_cache()

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
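# `open_tar_zst` is not defined in this snippet; a plausible sketch, assuming
# it yields a tarfile whose bytes are streamed through a zstandard compressor
# straight to disk:
import contextlib
import tarfile
import zstandard


@contextlib.contextmanager
def open_tar_zst(path: str):
    cctx = zstandard.ZstdCompressor()
    with open(path, "wb") as f:
        with cctx.stream_writer(f) as compressor:
            # "w|" streams tar members without seeking, which is required
            # when writing through a compressor.
            with tarfile.open(mode="w|", fileobj=compressor) as tar:
                yield tar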
def generate_push_data(self, runnable):
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(
        TRAINING_MONTHS[runnable] / 2
    )

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    num_cached = 0

    push_data = []

    def cache_key(push):
        return f"push_data.{runnable}.{push.rev}"

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    to_regenerate = set()
    # Regeneration is currently disabled:
    # for push in pushes[::-1]:
    #     cached = adr.config.cache.get(cache_key(push))
    #     if not cached:
    #         continue
    #
    #     value, mozci_version = cached
    #     if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
    #         to_regenerate.add(value[0][0])

    def periodically_upload_adr_cache():
        start_time = time.monotonic()
        while not upload_thread_stop.is_set():
            if time.monotonic() - start_time >= 10800:
                self.upload_adr_cache()
                start_time = time.monotonic()

            upload_thread_stop.wait(timeout=7)

    upload_thread = threading.Thread(target=periodically_upload_adr_cache)
    upload_thread_stop = threading.Event()
    upload_thread.start()

    s3_store = adr.util.cache_stores.S3Store(
        {
            "bucket": "communitytc-bugbug",
            "prefix": "data/adr_cache/",
        }
    )

    s3_store.set_serializer(CompressedPickleSerializer())

    for push in tqdm(pushes):
        key = cache_key(push)

        if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
            num_cached += 1
            cached = adr.config.cache.get(key)
            if cached:
                s3_store.put(key, cached, adr.config["cache"]["retention"])
                value, mozci_version = cached
                push_data.append(value)
        else:
            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.put(
                    key, (value, MOZCI_VERSION), adr.config["cache"]["retention"]
                )
                s3_store.put(
                    key, (value, MOZCI_VERSION), adr.config["cache"]["retention"]
                )
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

    upload_thread_stop.set()
    upload_thread.join()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
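# `CompressedPickleSerializer` is also not shown; a plausible sketch, assuming
# the serializer interface implied by `set_serializer` is a serialize /
# deserialize pair (the method names are assumptions):
import pickle
import zstandard


class CompressedPickleSerializer:
    def serialize(self, value) -> bytes:
        # Pickle first, then compress: cached push results are repetitive
        # enough that zstd shrinks them considerably before the S3 upload.
        return zstandard.ZstdCompressor().compress(pickle.dumps(value))

    def deserialize(self, data: bytes):
        return pickle.loads(zstandard.ZstdDecompressor().decompress(data))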
def generate_push_data(self, runnable):
    def upload_adr_cache():
        cache_path = os.path.splitext(ADR_CACHE_DB)[0]
        assert os.path.abspath(
            adr.config["cache"]["stores"]["file"]["path"]
        ) == os.path.abspath(cache_path)

        with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
            tar.add(cache_path)

        db.upload(ADR_CACHE_DB)

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(
        TRAINING_MONTHS[runnable] / 2
    )

    pushes = mozci.push.make_push_objects(
        from_date=f"today-{from_months}month",
        to_date="today-3day",
        branch="autoland",
    )

    start_time = time.monotonic()

    num_cached = 0

    push_data = []

    for push in tqdm(pushes):
        key = f"push_data.{runnable}.{push.rev}"

        logger.info(f"Analyzing {push.rev} at the {runnable} level...")

        if adr.config.cache.has(key):
            num_cached += 1
            push_data.append(adr.config.cache.get(key))
        else:
            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.forever(key, value)
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
            except Exception:
                traceback.print_exc()

        if time.monotonic() - start_time >= 3600:
            upload_adr_cache()
            start_time = time.monotonic()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    upload_adr_cache()

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
def generate(
    progress_bar: tqdm,
    pushes: list[mozci.push.Push],
    futures: list[concurrent.futures.Future],
) -> Generator[PushResult, None, None]:
    nonlocal reretrieve
    num_cached = 0
    num_pushes = len(pushes)
    num_errors = 0

    for push, future in zip(pushes, futures):
        cached = future.result()

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
        # run.
        if cached:
            value, mozci_version = cached

            # Regenerate results which were generated with an older version of mozci.
            if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                cached = None
                reretrieve -= 1

        if cached:
            num_cached += 1
            value, mozci_version = cached
            # len("ERROR") == 5, so the "ERROR" sentinel stored below passes
            # this assert too.
            assert len(value) == 5
            if value != "ERROR":
                yield value
            else:
                num_errors += 1
        else:
            logger.info(f"Analyzing {push.rev} at the {granularity} level...")

            key = cache_key(push)

            try:
                if granularity == "label":
                    runnables = push.label_summaries.keys()
                elif granularity == "group":
                    runnables = push.group_summaries.keys()
                elif granularity == "config_group":
                    runnables = push.config_group_summaries.keys()

                value = (
                    tuple(push.revs),
                    push.backedoutby or push.bustage_fixed_by,
                    tuple(runnables),
                    tuple(push.get_possible_regressions(granularity)),
                    tuple(push.get_likely_regressions(granularity)),
                )
                mozci.config.cache.put(
                    key,
                    (value, MOZCI_VERSION),
                    mozci.config["cache"]["retention"],
                )
                assert len(value) == 5
                yield value
            except mozci.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
            except Exception:
                num_errors += 1
                traceback.print_exc()
                mozci.config.cache.put(
                    key,
                    ("ERROR", MOZCI_VERSION),
                    mozci.config["cache"]["retention"],
                )

        progress_bar.update(1)

    logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
    logger.info(f"There were errors in {num_errors} pushes")
def generate(executor) -> Generator[PushResult, None, None]:
    num_cached = 0
    num_pushes = len(pushes)

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    to_regenerate = 1000

    # Bound the number of cache reads that can run ahead of the consumer, so
    # at most 256 results are held in memory at once.
    semaphore = threading.BoundedSemaphore(256)

    def retrieve_from_cache(push):
        semaphore.acquire()
        return adr.config.cache.get(cache_key(push))

    futures = tuple(executor.submit(retrieve_from_cache, push) for push in pushes)

    for push, future in zip(tqdm(pushes), futures):
        exc = future.exception()
        if exc is not None:
            logger.info(f"Exception {exc} while getting {push.rev}")
            for f in futures:
                f.cancel()

        # If the future failed, result() re-raises the exception here, after
        # the pending futures have been canceled.
        cached = future.result()
        semaphore.release()

        if cached and to_regenerate > 0:
            value, mozci_version = cached

            # Regenerate results which were generated when we were not cleaning
            # up WPT groups.
            if any(runnable.startswith("/") for runnable in value[1]):
                cached = None
                to_regenerate -= 1
            # Regenerate results which were generated with an older version of mozci.
            # elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
            #     cached = None
            #     to_regenerate -= 1

        if cached is not None:
            # An empty tuple marks a push we previously failed to analyze;
            # count it as cached, but don't yield it.
            num_cached += 1
            if cached:
                value, mozci_version = cached
                yield value
        else:
            logger.info(f"Analyzing {push.rev} at the {granularity} level...")

            key = cache_key(push)

            try:
                if granularity == "label":
                    runnables = push.task_labels
                elif granularity == "group":
                    runnables = push.group_summaries.keys()
                elif granularity == "config_group":
                    runnables = push.config_group_summaries.keys()

                value = (
                    push.revs,
                    tuple(runnables),
                    tuple(push.get_possible_regressions(granularity)),
                    tuple(push.get_likely_regressions(granularity)),
                )
                adr.config.cache.put(
                    key,
                    (value, MOZCI_VERSION),
                    adr.config["cache"]["retention"],
                )
                yield value
            except adr.errors.MissingDataError:
                logger.warning(
                    f"Tasks for push {push.rev} can't be found on ActiveData"
                )
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

    logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
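# A hypothetical driver for this variant: the caller owns the thread pool and
# drains the generator. Retrieval concurrency is bounded inside generate() by
# the 256-slot semaphore, not by the pool size.
with concurrent.futures.ThreadPoolExecutor() as executor:
    push_data = list(generate(executor))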