def run_sweep_and_sleep(sweep_id: str = None):
    """
    Like run_sweep, but actually sleeps for the suggested amount of time before quitting.

    This is used to internalize the management of the period between consecutive
    sweep runs. It's a crude way of spacing out the sweep runs. An alternative
    would be to turn the runner back into a Celery task and use Celery's timed
    delay API for recursive self-scheduling.
    """
    delay_next_sweep_start_by = run_sweep(sweep_id=sweep_id)

    _measurement_name_base = __name__ + '.run_sweep_and_sleep.'  # <- function name. Adjust if changed
    _measurement_tags = {'sweep_id': sweep_id}
    Measure.gauge(_measurement_name_base + 'delay_next_sweep_start_by', tags=_measurement_tags)(
        int(delay_next_sweep_start_by)
    )

    logger.info(
        f"Done with main sweep run. Waiting for {delay_next_sweep_start_by} seconds before quitting"
    )
    time.sleep(delay_next_sweep_start_by)
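# Sketch of the alternative mentioned in the docstring above: re-enqueue the
# runner as a Celery task with a countdown instead of blocking on time.sleep().
# Illustrative only; `_sketch_celery_app` and the task name are assumptions,
# not part of this project.
from celery import Celery

_sketch_celery_app = Celery('sweep_runner_sketch')


@_sketch_celery_app.task(name='sweep_runner_sketch.run_sweep_task')
def run_sweep_task_sketch(sweep_id: str = None):
    # run_sweep returns the suggested delay before the next run should start
    delay_next_sweep_start_by = run_sweep(sweep_id=sweep_id)
    # Recursive self-scheduling: enqueue the next run after the suggested delay
    run_sweep_task_sketch.apply_async(
        kwargs={'sweep_id': sweep_id},
        countdown=delay_next_sweep_start_by,
    )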
def _send_measurement_task_runtime(job_scope: JobScope, bucket: int):
    _measurement_base_name = f'{__name__}.report_tasks_outcome'
    _measurement_tags = {
        'ad_account_id': job_scope.ad_account_id,
        'sweep_id': job_scope.sweep_id,
        'report_type': job_scope.report_type,
        'report_variant': job_scope.report_variant,
        'bucket': bucket,
        'job_type': job_scope.job_type,
    }

    if job_scope.datapoint_count and job_scope.datapoint_count > 0:
        Measure.counter(f'{_measurement_base_name}.data_points', tags=_measurement_tags).increment(
            job_scope.datapoint_count
        )
        Measure.histogram(f'{_measurement_base_name}.data_points', tags=_measurement_tags)(
            job_scope.datapoint_count
        )

    Measure.gauge(f'{_measurement_base_name}.running_time', tags=_measurement_tags)(job_scope.running_time)
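# Hypothetical call site for the helper above, showing how `bucket` might be
# supplied; the bucketing thresholds here are illustrative and not taken from
# this project.
def _report_task_done_sketch(job_scope: JobScope):
    running_time = job_scope.running_time or 0
    # Coarse runtime buckets (assumed thresholds, for illustration only)
    if running_time < 60:
        bucket = 1
    elif running_time < 600:
        bucket = 2
    else:
        bucket = 3
    _send_measurement_task_runtime(job_scope, bucket)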
def ooze_task(self, task: CeleryTask, job_scope: JobScope, job_context: JobContext, score: int):
    """Blocking task oozing function."""
    if OOZER_ENABLE_LEARNING and self.should_review_oozer_rate:
        pulse = self.sweep_status_tracker.get_pulse()
        old_rate = self.oozing_rate
        logger.warning(
            f'Completed {self._tasks_since_review} tasks in {self.secs_since_oozer_rate_review} seconds'
        )
        self.oozing_rate = self.calculate_rate(old_rate, pulse)
        self._rate_review_time = self.current_time()
        self._tasks_since_review = 0
        logger.warning(f'Updated oozing rate from {old_rate:.2f} to {self.oozing_rate:.2f}')
        Measure.gauge(f'{__name__}.oozing_rate', tags={'sweep_id': self.sweep_id})(self.oozing_rate)

    if self._tasks_since_review > self.expected_tasks_since_oozer_rate_review:
        gevent.sleep(self.wait_interval)

    self._ooze_task(task, job_scope, job_context, score)
    self._tasks_since_review += 1
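# `calculate_rate` is not shown in this module. Below is a minimal sketch of one
# way a pulse-driven adjustment could work; the pulse attribute names and the
# tuning factors are assumptions, not the project's actual logic.
def _calculate_rate_sketch(old_rate: float, pulse) -> float:
    if getattr(pulse, 'Throttling', 0) > 0.2:
        # A large share of throttled tasks: back off substantially
        return max(old_rate * 0.75, 1.0)
    if getattr(pulse, 'Success', 0) > 0.9:
        # Mostly healthy pulse: probe a slightly higher rate
        return old_rate * 1.1
    # Otherwise hold the current rate
    return old_rate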
def build_sweep(sweep_id: str):
    from sweep_builder.init_tokens import init_tokens
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_base

    try:
        _measurement_name_base = __name__ + '.' + build_sweep.__name__ + '.'
        _measurement_tags = {'sweep_id': sweep_id}

        # In the jobs persister we purposefully avoid persisting
        # anything besides the Job ID. This means that things like tokens
        # and other data on *Claim are lost.
        # As long as we are doing that, we need to leave tokens somewhere
        # for workers to pick up.
        logger.info(f"#{sweep_id} Prepositioning platform tokens")
        init_tokens(sweep_id)

        logger.info(f"#{sweep_id} Starting sweep building")

        # task_group = TaskGroup()
        delayed_tasks = []

        cnt = 0
        with Measure.counter(_measurement_name_base + 'outer_loop', tags=_measurement_tags) as cntr:
            for reality_claim in iter_reality_base():
                # What we get here are Scope and AdAccount objects.
                # Children of AdAccount reality claims are to be processed
                # in separate Celery tasks. But we still have jobs
                # associated with Scope objects, so we
                # need to rate and store those jobs before chipping off
                # a separate task for each AdAccount.
                if reality_claim.entity_type == Entity.AdAccount:
                    # child_task_id = task_group.generate_task_id()
                    # task_group.report_task_active(child_task_id)
                    delayed_tasks.append(
                        # We are using a Celery chord to process AdAccounts in parallel.
                        # For very, very large (hundreds of thousands) numbers of AdAccounts,
                        # chord management will be super memory-expensive,
                        # as the chord timer/controller will be looking at the entire list on
                        # each tick.
                        # In that case, it's probably better to switch to
                        # a callback per handler + a mutex/counter somewhere.
                        build_sweep_slice_per_ad_account_task.si(
                            sweep_id,
                            reality_claim,
                            # task_id=child_task_id
                        )
                    )
                elif reality_claim.entity_type == Entity.Page:
                    delayed_tasks.append(
                        build_sweep_slice_per_page.si(sweep_id, reality_claim)
                    )
                else:
                    cnt = 1
                    _step = 1000
                    for _ in iter_pipeline(sweep_id, [reality_claim]):
                        cnt += 1
                        if cnt % _step == 0:
                            cntr += _step
                            logger.info(f'#{sweep_id}-root: Queueing up #{cnt}')

            # Because the counter above communicates only increments of _step,
            # we need to report the remainder --- the amount under _step.
            cntr += cnt % _step

        logger.info(f"#{sweep_id}-root: Queued up a total of {cnt} tasks")

        # Here we fan out the actual work to Celery workers
        # and wait for all tasks to finish before returning.
        group_result = group(delayed_tasks).delay()

        # In case the workers crash, go away (scaling) or are otherwise
        # non-responsive, the following would wait indefinitely.
        # Since that's not desirable and the total sweep build time is minutes at
        # maximum, we add a reasonable timeout.
        # Because we are not joining on the results, but periodically
        # asking "are you done yet?", we can exit if this threshold is busted and
        # let the next run recover from the situation.
        should_be_done_by = time.time() + (60 * 20)

        Measure.gauge(f'{_measurement_name_base}per_account_sweep.total', tags=_measurement_tags)(
            len(group_result.results)
        )

        # Monitor the progress. Although this obviously can be achieved with
        # group_result.join(), we need to "see" into the task group progress.
        with Measure.gauge(
            f'{_measurement_name_base}per_account_sweep.done', tags=_measurement_tags
        ) as measure_done:
            while True:
                done_counter = 0
                for result in group_result.results:
                    logger.debug(f'{result}: {result.state}')
                    if result.ready():
                        done_counter += 1

                logger.debug(f"TOTAL: {done_counter}/{len(group_result.results)}")
                logger.debug("=" * 20)

                logger.debug("Checking group result")
                measure_done(done_counter)
                if group_result.ready():
                    logger.debug(f"#{sweep_id}-root: Sweep build complete")
                    break

                # Important. If we don't sleep, the native join in the Celery context
                # switches all the time and we end up with 100% CPU, eventually somehow
                # deadlocking the process. 5 seconds is kind of an arbitrary number, but
                # does what we need and the impact of a (potential) delay is absolutely
                # minimal.
                time.sleep(5)

                # The last line of defense. Workers did not finish in the time we
                # expected; no point waiting, kill it.
                if time.time() > should_be_done_by:
                    Measure.gauge(
                        f'{_measurement_name_base}per_account_sweep.early_exits',
                        tags=_measurement_tags
                    )(1)
                    logger.warning("Exiting incomplete sweep build, it's taking too long")
                    return

        logger.info("Waiting on results join")
        if group_result.supports_native_join:
            group_result.join_native()
        else:
            # Eager mode does not support native join.
            group_result.join()

        # # Alternative to Celery's native group_result.join():
        # # our manual task tracking code + join()
        # task_group.join()

        logger.info("Join complete, sweep build ended")
    except Exception as ex:
        ErrorInspector.inspect(ex, None, {'sweep_id': sweep_id})
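# The comments in build_sweep mention an alternative to chord/group bookkeeping
# for very large AdAccount counts: a callback per handler plus a shared counter.
# A minimal sketch of that idea, assuming a Redis client is available;
# `redis_client` and the key naming are hypothetical.
def _report_sweep_slice_done_sketch(redis_client, sweep_id: str) -> None:
    # Each per-AdAccount task calls this once it finishes its slice
    redis_client.incr(f'sweep-build:{sweep_id}:done')


def _wait_for_sweep_build_sketch(redis_client, sweep_id: str, expected: int, timeout: float = 60 * 20) -> bool:
    # The root task polls a single counter instead of iterating over
    # hundreds of thousands of AsyncResult objects on every tick
    should_be_done_by = time.time() + timeout
    while time.time() < should_be_done_by:
        done = int(redis_client.get(f'sweep-build:{sweep_id}:done') or 0)
        if done >= expected:
            return True
        time.sleep(5)
    return False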