def feedback_entity_task(entity_data: Dict[str, Any], entity_type: str):
    """
    Feed collected entity data back into the system by updating the data store.

    :param entity_data: The entity we're feeding back to the system
    :param entity_type: Type of the entity, a string representation
    """
    try:
        feedback_entity(entity_data, entity_type)
    except (PutError, UpdateError) as ex:
        if ErrorInspector.is_dynamo_throughput_error(ex):
            Measure.counter(
                feedback_entity_task.__name__ + '.throughput_exceptions',
                tags={
                    'entity_type': entity_type,
                    'ad_account_id': determine_ad_account_id(entity_data, entity_type),
                },
            ).increment()
            logger.info(str(ex))
        else:
            raise
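
# ErrorInspector.is_dynamo_throughput_error is not shown in this section. A
# minimal sketch of what such a check might look like, assuming PynamoDB-style
# exceptions whose `cause` wraps a botocore ClientError (the attribute layout
# is an assumption, not the actual implementation):
def is_dynamo_throughput_error_sketch(ex: Exception) -> bool:
    # PynamoDB exceptions expose the underlying botocore error as `cause`;
    # the AWS error code lives in its `response` payload.
    cause = getattr(ex, 'cause', None)
    response = getattr(cause, 'response', None) or {}
    return response.get('Error', {}).get('Code') == 'ProvisionedThroughputExceededException'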
def _fetch_job_report(job_id: str) -> Optional[JobReport]:
    """Retrieve job report from job report table (cached)."""
    try:
        report = JobReport.get(job_id)
        if report.fails_in_row and report.fails_in_row >= PERMANENTLY_FAILING_JOB_THRESHOLD:
            Measure.counter('permanently_failing_job').increment()
            logger.warning(
                f'[permanently-failing-job] Job with id {job_id} failed {report.fails_in_row}'
                f' times in a row.'
            )
        return report
    except DoesNotExist:
        return None
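
# The "(cached)" note in the docstring implies the lookup is memoized somewhere
# above this function. A minimal sketch of one way to get that behavior,
# assuming functools.lru_cache is acceptable for the table's consistency needs
# (the wrapper name is hypothetical):
from functools import lru_cache

@lru_cache(maxsize=4096)
def _fetch_job_report_cached(job_id: str):
    # lru_cache keys on job_id, so repeated lookups within one sweep
    # hit memory instead of the job report table.
    return _fetch_job_report(job_id)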
def build_sweep_slice_per_page(sweep_id: str, page_reality_claim: RealityClaim, task_id: str = None):
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_per_page_claim

    cnt = 0
    try:
        with TaskGroup.task_context(task_id):
            _measurement_name_base = __name__ + '.' + build_sweep_slice_per_page.__name__ + '.'
            _measurement_tags = {'sweep_id': sweep_id, 'page_id': page_reality_claim.entity_id}

            reality_claims_iter = itertools.chain(
                [page_reality_claim],
                iter_reality_per_page_claim(page_reality_claim),
            )

            _step = 1000
            _before_fetch = time.time()
            for claim in iter_pipeline(sweep_id, reality_claims_iter):
                Measure.timing(
                    _measurement_name_base + 'next_persisted',
                    tags={'entity_type': claim.entity_type, **_measurement_tags},
                    sample_rate=0.01,
                )((time.time() - _before_fetch) * 1000)
                cnt += 1
                if cnt % _step == 0:
                    logger.info(f'#{sweep_id}-#{page_reality_claim.entity_id}: Queueing up #{cnt}')
                _before_fetch = time.time()

            logger.info(f"#{sweep_id}-#{page_reality_claim.entity_id}: Queued up a total of {cnt} tasks")
    except Exception as ex:
        ErrorInspector.inspect(ex, page_reality_claim.ad_account_id, {'sweep_id': sweep_id})

    return cnt
def iter_entities_per_page_id(
    page_id: str, fields: List[str] = None, page_entity_types: List[str] = None
) -> Generator[Dict[str, Any], None, None]:
    if not page_entity_types:
        page_entity_models = page_entity_type_model_map.values()
    else:
        page_entity_models = [page_entity_type_model_map[entity_type] for entity_type in page_entity_types]

    _step = 1000
    for EntityModel in page_entity_models:
        cnt = 0
        with Measure.counter(
            __name__ + '.entities_per_page_id',
            tags={'ad_account_id': page_id, 'entity_type': EntityModel.entity_type},
        ) as cntr:
            for record in EntityModel.query(page_id, filter_condition=(EntityModel.is_accessible != False)):
                cnt += 1
                record_dict = record.to_dict(fields=fields, skip_null=True)
                # This is unfortunate, but we need to change page_id to ad_account_id
                record_dict['ad_account_id'] = record_dict['page_id']
                del record_dict['page_id']
                yield record_dict
                if cnt % _step == 0:
                    cntr += _step
            # Report the remainder under _step that the loop above did not flush.
            if cnt % _step:
                cntr += cnt % _step
def iter_entities_per_ad_account_id(
    ad_account_id: str, fields: List[str] = None, entity_types: List[str] = None
) -> Generator[Dict[str, Any], None, None]:
    # Occasionally it's important for callers to pass a value through without
    # overriding it, so we must accept some placeholder state for entity_models.
    # Here we treat explicit None or an empty list as "use the default list".
    if not entity_types:
        # All types are returned
        entity_models = entity_type_model_map.values()
    else:
        # Intentionally leaving this logic brittle:
        # this function is linked to types "statically"
        # and is not expected to hide misses in the map.
        entity_models = [entity_type_model_map[entity_type] for entity_type in entity_types]

    _step = 1000
    for EntityModel in entity_models:
        cnt = 0
        with Measure.counter(
            __name__ + '.entities_per_ad_account_id',
            tags={'ad_account_id': ad_account_id, 'entity_type': EntityModel.entity_type},
        ) as cntr:
            for record in EntityModel.query(ad_account_id, filter_condition=(EntityModel.is_accessible != False)):
                cnt += 1
                yield record.to_dict(fields=fields, skip_null=True)
                if cnt % _step == 0:
                    cntr += _step
            # Report the remainder under _step that the loop above did not flush.
            if cnt % _step:
                cntr += cnt % _step
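
# Both iterators above flush the counter in increments of _step so that
# metrics traffic scales with thousands of records rather than with every
# record. A minimal sketch of the counter contract that the
# `with ... as cntr` / `cntr += n` usage assumes (the real Measure.counter
# implementation is not shown; the flush target below is a stand-in):
class BufferedCounterSketch:
    """Counter usable as `with Measure.counter(...) as cntr: cntr += n`."""

    def __init__(self, name: str, tags: dict = None):
        self.name = name
        self.tags = tags or {}
        self.total = 0

    def __enter__(self) -> 'BufferedCounterSketch':
        return self

    def __iadd__(self, n: int) -> 'BufferedCounterSketch':
        self.total += n
        # A real implementation would ship the increment `n` to the
        # metrics backend here.
        return self

    def __exit__(self, *exc) -> None:
        # Nothing buffered to flush in this sketch.
        pass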
def run_sweep_and_sleep(sweep_id: str = None):
    """
    Like run_sweep, but actually sleeps for the suggested amount of time before quitting.

    This is used to internalize the management of the period between
    consecutive sweep runs. It's a crude way of spacing out sweep runs.
    An alternative would be to turn the runner back into a Celery task and
    use Celery's timed delay API for recursive self-scheduling.
    """
    delay_next_sweep_start_by = run_sweep(sweep_id=sweep_id)

    _measurement_name_base = __name__ + '.run_sweep_and_sleep.'  # <- function name. adjust if changed
    _measurement_tags = {'sweep_id': sweep_id}
    Measure.gauge(_measurement_name_base + 'delay_next_sweep_start_by',
                  tags=_measurement_tags)(int(delay_next_sweep_start_by))

    logger.info(f"Done with main sweep run. Waiting for {delay_next_sweep_start_by} seconds before quitting")
    time.sleep(delay_next_sweep_start_by)
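
# The alternative the docstring mentions could look roughly like this: a
# Celery task that reschedules itself with a countdown instead of sleeping
# in-process, freeing the worker in the meantime. The app wiring and task
# name below are hypothetical:
from celery import Celery

app = Celery('sweep_runner')  # hypothetical app; real broker config omitted

@app.task
def run_sweep_task(sweep_id: str = None):
    delay_next_sweep_start_by = run_sweep(sweep_id=sweep_id)
    # Recursive self-scheduling: enqueue the next run with a countdown
    # instead of blocking the process with time.sleep().
    run_sweep_task.apply_async(kwargs={'sweep_id': sweep_id}, countdown=delay_next_sweep_start_by)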
def _ooze_task(self, task: CeleryTask, job_scope: JobScope, job_context: JobContext, score: int):
    """Non-blocking task oozing function."""
    task.delay(job_scope, job_context)
    self.oozed_count += 1
    if self.oozed_count % OOZING_COUNTER_STEP == 0:
        self.counter += OOZING_COUNTER_STEP
    Measure.histogram(
        f'{__name__}.job_scores',
        tags={
            'sweep_id': self.sweep_id,
            'ad_account_id': job_scope.ad_account_id,
            'report_type': job_scope.report_type,
            'report_variant': job_scope.report_variant,
            'job_type': job_scope.job_type,
        },
    )(score)
def generate_scorable(claim: ExpectationClaim) -> Generator[ScorableClaim, None, None]:
    """Select job signature for single expectation claim."""
    last_report = _fetch_job_report(claim.job_id)

    if claim.ad_account_id and claim.entity_type == Entity.AdAccount:
        refresh_if_older_than = AccountCache.get_refresh_if_older_than(claim.ad_account_id)
        _reset = (
            refresh_if_older_than
            and last_report
            and last_report.last_success_dt
            and last_report.last_success_dt < refresh_if_older_than
        )
        if _reset:
            last_report = None

    _prefer_breakdown = (
        TASK_BREAKDOWN_ENABLED
        and claim.is_divisible
        and last_report
        and prefer_job_breakdown(last_report)
    )
    if not _prefer_breakdown:
        yield ScorableClaim(
            claim.entity_id,
            claim.entity_type,
            claim.report_type,
            claim.report_variant,
            claim.job_signature,
            last_report,
            ad_account_id=claim.ad_account_id,
            timezone=claim.timezone,
            range_start=claim.range_start,
        )
        return

    logger.warning(f'Performing task breakdown for job_id: {claim.job_id}')
    Measure.increment(
        f'{__name__}.{generate_scorable.__name__}.task_broken_down',
        tags={'ad_account_id': claim.ad_account_id, 'entity_type': claim.entity_type},
    )(1)

    # Break down into smaller jobs recursively
    for child_claim in generate_child_claims(claim):
        yield from generate_scorable(child_claim)
def iter_expectations(reality_claims_iter: Iterable[RealityClaim]) -> Generator[ExpectationClaim, None, None]:
    """
    Converts an instance of RealityClaim object (claiming that certain
    entities exist and providing some metadata about their existence) into
    one or more ExpectationClaim objects that express our expectations about
    what report types (for what dates) we expect to see.
    """
    histogram_counter = defaultdict(int)
    for claim in reality_claims_iter:
        jobs_generators = entity_expectation_generator_map.get(claim.entity_type, [])
        for jobs_generator in jobs_generators:
            for expectation_claim in jobs_generator(claim):
                yield expectation_claim
                job_type = detect_job_type(expectation_claim.report_type, expectation_claim.entity_type)
                histogram_counter[(claim.ad_account_id, claim.entity_type, job_type)] += 1

    for (ad_account_id, entity_type, job_type), count in histogram_counter.items():
        Measure.histogram(
            f'{__name__}.{iter_expectations.__name__}.expectations_per_reality_claim',
            tags={'ad_account_id': ad_account_id, 'entity_type': entity_type, 'job_type': job_type},
        )(count)
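
# For orientation, the dispatch map consulted above pairs each entity type
# with a list of job-generator callables, each taking a RealityClaim and
# yielding ExpectationClaims. A toy sketch of the shape; the generator names
# and keys below are placeholders, not the real registrations:
entity_expectation_generator_map_sketch = {
    'AdAccount': [],  # e.g. [ad_account_entities_jobs, ad_account_insights_jobs]
    'Page': [],       # e.g. [page_entities_jobs]
}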
def iter_scorable(claims: Iterable[ExpectationClaim]) -> Generator[ScorableClaim, None, None]:
    """Select signature for each expectation claim based on job history."""
    histogram_counter = defaultdict(int)
    for claim in claims:
        for scorable_claim in generate_scorable(claim):
            job_type = detect_job_type(claim.report_type, claim.entity_type)
            histogram_counter[(claim.ad_account_id, claim.entity_type, job_type)] += 1
            yield scorable_claim

    for (ad_account_id, entity_type, job_type), count in histogram_counter.items():
        Measure.histogram(
            f'{__name__}.{iter_scorable.__name__}.scorable_claims_per_expectation_claim',
            tags={'ad_account_id': ad_account_id, 'entity_type': entity_type, 'job_type': job_type},
        )(count)
def iter_prioritized(claims: Iterable[ScorableClaim]) -> Generator[PrioritizationClaim, None, None]:
    """Assign score for each claim."""
    _measurement_name_base = f'{__name__}.{iter_prioritized.__name__}'

    _before_next_expectation = time.time()
    for claim in claims:
        _measurement_tags = {'entity_type': claim.entity_type, 'ad_account_id': claim.ad_account_id}

        Measure.timing(f'{_measurement_name_base}.next_expected',
                       tags=_measurement_tags,
                       sample_rate=0.01)((time.time() - _before_next_expectation) * 1000)

        try:
            score = ScoreCalculator.assign_score(claim)
            with Measure.timer(f'{_measurement_name_base}.yield_result', tags=_measurement_tags):
                yield PrioritizationClaim(
                    claim.entity_id,
                    claim.entity_type,
                    claim.report_type,
                    claim.job_signature,
                    score,
                    ad_account_id=claim.ad_account_id,
                    timezone=claim.timezone,
                    range_start=claim.range_start,
                )
        except ScoringException as e:
            ErrorInspector.inspect(e, claim.ad_account_id, {'job_id': claim.job_id})

        _before_next_expectation = time.time()
def ooze_task(self, task: CeleryTask, job_scope: JobScope, job_context: JobContext, score: int):
    """Blocking task oozing function."""
    if OOZER_ENABLE_LEARNING and self.should_review_oozer_rate:
        pulse = self.sweep_status_tracker.get_pulse()
        old_rate = self.oozing_rate
        logger.warning(f'Completed {self._tasks_since_review} tasks in {self.secs_since_oozer_rate_review} seconds')
        self.oozing_rate = self.calculate_rate(old_rate, pulse)
        self._rate_review_time = self.current_time()
        self._tasks_since_review = 0
        logger.warning(f'Updated oozing rate from {old_rate:.2f} to {self.oozing_rate:.2f}')
        Measure.gauge(f'{__name__}.oozing_rate', tags={'sweep_id': self.sweep_id})(self.oozing_rate)

    if self._tasks_since_review > self.expected_tasks_since_oozer_rate_review:
        gevent.sleep(self.wait_interval)

    self._ooze_task(task, job_scope, job_context, score)
    self._tasks_since_review += 1
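
# calculate_rate is not shown in this section. One plausible shape, purely
# illustrative and not the actual implementation, is an AIMD-style controller
# that backs off when the pulse shows heavy throttling and probes upward
# otherwise (the throttling_share input is a stand-in for the pulse object):
def calculate_rate_sketch(old_rate: float, throttling_share: float) -> float:
    """Illustrative AIMD controller; not the real calculate_rate."""
    if throttling_share > 0.1:
        # Multiplicative decrease: too many throttled tasks last window.
        return max(1.0, old_rate * 0.5)
    # Additive increase: probe upward gently while the pulse looks healthy.
    return old_rate + 1.0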
def __init__(
    self,
    sweep_id: str,
    sweep_status_tracker: SweepStatusTracker,
    oozed_total: int,
    stop_waiting_time: float,
    *,
    wait_interval: int = 1,
):
    self.sweep_id = sweep_id
    self.sweep_status_tracker = sweep_status_tracker
    self.oozed_total = oozed_total
    self.stop_waiting_time = stop_waiting_time
    self.wait_interval = wait_interval
    self.counter = Measure.counter(f'{__name__}.done', tags={'sweep_id': sweep_id})
    self._last_total = 0
def assign_score(cls, claim: ScorableClaim) -> float:
    """Calculate score for a given claim."""
    if claim.report_type in ReportType.MUST_RUN_EVERY_SWEEP:
        return MUST_RUN_SCORE

    timer = Measure.timer(
        f'{__name__}.assign_score',
        tags={'entity_type': claim.entity_type, 'ad_account_id': claim.ad_account_id},
        sample_rate=0.01,
    )
    with timer:
        hist_ratio = cls.historical_ratio(claim)
        score_skew_ratio = cls.skew_ratio(claim)
        account_skew = cls.account_skew(claim)
        combined_ratio = hist_ratio * score_skew_ratio * account_skew
        return int(MUST_RUN_SCORE * combined_ratio)
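
# Since the three ratios multiply, any single factor near zero suppresses the
# whole score. A worked example with hypothetical ratios; MUST_RUN_SCORE's
# real value is not shown in this section, so 1,000,000 below is assumed
# purely for illustration:
EXAMPLE_MUST_RUN_SCORE = 1_000_000  # assumed value, for illustration only

# A fairly stale report (0.9) with mild skew (0.8) on a busy account (0.5):
example_score = int(EXAMPLE_MUST_RUN_SCORE * 0.9 * 0.8 * 0.5)
assert example_score == 360_000  # well below a must-run job's score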
def _report_metrics(self, interval: int):
    """Regularly report pulse metrics for previous minute to Datadog."""
    name_map = {
        FailureBucket.Success: 'success',
        FailureBucket.Other: 'other',
        FailureBucket.Throttling: 'throttling',
        FailureBucket.UserThrottling: 'user_throttling',
        FailureBucket.ApplicationThrottling: 'application_throttling',
        FailureBucket.AdAccountThrottling: 'adaccount_throttling',
        FailureBucket.TooLarge: 'too_large',
        FailureBucket.WorkingOnIt: 'working_on_it',
        FailureBucket.InaccessibleObject: 'inaccessible_object',
    }
    while True:
        gevent.sleep(interval)
        prev_minute = self.now_in_minutes() - 1
        pulse_values = {int(k): int(v) for k, v in self.redis.hgetall(self._gen_key(prev_minute)).items()}

        total = 0
        for bucket, name in name_map.items():
            value = pulse_values.get(bucket, 0)
            # Exclude non-terminal state from total
            if bucket != FailureBucket.WorkingOnIt:
                total += value
            Measure.histogram(f'{__name__}.pulse_stats',
                              tags={'sweep_id': self.sweep_id, 'bucket': name})(value)

        if total:
            Measure.histogram(f'{__name__}.pulse_stats',
                              tags={'sweep_id': self.sweep_id, 'bucket': 'total'})(total)

        in_progress = self._get_in_progress_count()
        Measure.histogram(f'{__name__}.pulse_stats',
                          tags={'sweep_id': self.sweep_id, 'bucket': 'in_progress'})(in_progress)
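
# The reader above expects a per-minute Redis hash mapping failure bucket to
# count. The writer side is not shown here; a minimal sketch of a compatible
# recorder, where the key layout stands in for _gen_key and is an assumption:
import time
import redis

def record_task_outcome_sketch(client: redis.Redis, sweep_id: str, bucket: int) -> None:
    # One hash per (sweep, minute); fields are FailureBucket values.
    minute = int(time.time() // 60)
    key = f'{sweep_id}:pulse:{minute}'  # assumed layout of _gen_key(minute)
    client.hincrby(key, bucket, 1)
    client.expire(key, 3600)  # retain an hour of pulse history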
def __init__(
    self,
    sweep_id: str,
    sweep_status_tracker: SweepStatusTracker,
    pulse_review_interval: int,
    stop_oozing_time: float,
    *,
    wait_interval: int = 1,
):
    self.sweep_id = sweep_id
    self.sweep_status_tracker = sweep_status_tracker
    self.pulse_review_interval = pulse_review_interval
    self.stop_oozing_time = stop_oozing_time
    self.wait_interval = wait_interval
    self.oozed_count = 0
    self.oozing_rate = OOZER_START_RATE
    self.counter = Measure.counter(f'{__name__}.oozed', tags={'sweep_id': sweep_id})
    self._rate_review_time = self._pulse_review_time = round(time.time()) - 1
    self._tasks_since_review = 0
def _send_measurement_task_runtime(job_scope: JobScope, bucket: int):
    _measurement_base_name = f'{__name__}.report_tasks_outcome'
    _measurement_tags = {
        'ad_account_id': job_scope.ad_account_id,
        'sweep_id': job_scope.sweep_id,
        'report_type': job_scope.report_type,
        'report_variant': job_scope.report_variant,
        'bucket': bucket,
        'job_type': job_scope.job_type,
    }
    if job_scope.datapoint_count and job_scope.datapoint_count > 0:
        Measure.counter(f'{_measurement_base_name}.data_points',
                        tags=_measurement_tags).increment(job_scope.datapoint_count)
        Measure.histogram(f'{_measurement_base_name}.data_points',
                          tags=_measurement_tags)(job_scope.datapoint_count)

    Measure.gauge(f'{_measurement_base_name}.running_time',
                  tags=_measurement_tags)(job_scope.running_time)
def iter_persist_prioritized(
    sweep_id: str, prioritized_iter: Iterable[PrioritizationClaim]
) -> Generator[PrioritizationClaim, None, None]:
    """Persist prioritized jobs and pass-through context objects for inspection."""
    AccountCache.reset()

    with SortedJobsQueue(sweep_id).JobsWriter() as add_to_queue:
        _measurement_name_base = f'{__name__}.{iter_persist_prioritized.__name__}'

        _before_next_prioritized = time.time()
        for prioritization_claim in prioritized_iter:
            job_type = detect_job_type(prioritization_claim.report_type, prioritization_claim.entity_type)
            _measurement_tags = {
                'entity_type': prioritization_claim.entity_type,
                'report_type': prioritization_claim.report_type,
                'ad_account_id': prioritization_claim.ad_account_id,
                'job_type': job_type,
                'sweep_id': sweep_id,
            }

            Measure.timing(f'{_measurement_name_base}.next_prioritized',
                           tags=_measurement_tags,
                           sample_rate=0.01)((time.time() - _before_next_prioritized) * 1000)

            score = prioritization_claim.score
            if not should_persist(score):
                logger.debug(f'Not persisting job {prioritization_claim.job_id} due to low score: {score}')
                continue

            # Following are JobScope attributes we don't store on JobID,
            # so we need to store them separately.
            # See JobScope object for exact attr names.
            # At this point the persister forms the auxiliary data blob
            # for saving on Data Flower. We don't have to do that here:
            # it can be pre-computed and placed on the JobSignature.
            # TODO: contemplate moving auxiliary data formation to the
            # place where JobSignatures are generated and use that
            # data for Data Flower (as it was originally intended,
            # but not implemented because saving each job's data
            # individually to Data Flower was too slow).
            # So, here you would unpack **job_kwargs that you get from
            # prioritization_claim.score_job_pairs. ... Until then:
            extra_data = {}
            if prioritization_claim.timezone:
                extra_data['ad_account_timezone_name'] = prioritization_claim.timezone

            with Measure.timer(f'{_measurement_name_base}.add_to_queue', tags=_measurement_tags):
                if prioritization_claim.report_age_in_days is not None:
                    Measure.histogram(f'{_measurement_name_base}.report_age',
                                      tags=_measurement_tags)(prioritization_claim.report_age_in_days)
                add_to_queue(prioritization_claim.job_id, score, **extra_data)

            # This time includes the time the consumer of this generator wastes
            # between reads from us. Good way to measure how quickly we are
            # consumed (what pauses we have between each consumption).
            with Measure.timer(f'{_measurement_name_base}.yield_result', tags=_measurement_tags):
                yield prioritization_claim

            _before_next_prioritized = time.time()
def build_sweep(sweep_id: str):
    from sweep_builder.init_tokens import init_tokens
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_base

    try:
        _measurement_name_base = __name__ + '.' + build_sweep.__name__ + '.'
        _measurement_tags = {'sweep_id': sweep_id}

        # In the jobs persister we purposefully avoid persisting
        # anything besides the Job ID. This means that things like tokens
        # and other data on *Claim are lost.
        # As long as we are doing that, we need to leave tokens somewhere
        # for workers to pick up.
        logger.info(f"#{sweep_id} Prepositioning platform tokens")
        init_tokens(sweep_id)

        logger.info(f"#{sweep_id} Starting sweep building")

        # task_group = TaskGroup()
        delayed_tasks = []

        cnt = 0
        with Measure.counter(_measurement_name_base + 'outer_loop', tags=_measurement_tags) as cntr:
            for reality_claim in iter_reality_base():
                # What we get here are Scope and AdAccount objects.
                # Children of AdAccount reality claims are to be processed
                # in separate Celery tasks. But we still have jobs
                # associated with Scope objects, so we need to rate and
                # store those jobs before chipping off a separate task
                # for each AdAccount.
                if reality_claim.entity_type == Entity.AdAccount:
                    # child_task_id = task_group.generate_task_id()
                    # task_group.report_task_active(child_task_id)
                    delayed_tasks.append(
                        # We are using a Celery chord to process AdAccounts in parallel.
                        # For very, very large (hundreds of thousands) numbers of AdAccounts,
                        # chord management will be super memory expensive, as the chord
                        # timer/controller will be looking at the entire list on each tick.
                        # In that case, probably better to switch to a callback per
                        # handler + mutex/counter somewhere.
                        build_sweep_slice_per_ad_account_task.si(
                            sweep_id,
                            reality_claim,
                            # task_id=child_task_id
                        )
                    )
                elif reality_claim.entity_type == Entity.Page:
                    delayed_tasks.append(build_sweep_slice_per_page.si(sweep_id, reality_claim))
                else:
                    cnt = 1
                    _step = 1000
                    for _ in iter_pipeline(sweep_id, [reality_claim]):
                        cnt += 1
                        if cnt % _step == 0:
                            cntr += _step
                            logger.info(f'#{sweep_id}-root: Queueing up #{cnt}')
            # Because the counter above communicates only increments of _step,
            # we need to report the remainder --- the amount under _step.
            cntr += cnt % _step

        logger.info(f"#{sweep_id}-root: Queued up a total of {cnt} tasks")

        # Here we fan out the actual work to Celery workers
        # and wait for all tasks to finish before returning.
        group_result = group(delayed_tasks).delay()

        # In case the workers crash, go away (scaling), or are otherwise
        # non-responsive, the following would wait indefinitely.
        # Since that's not desirable and the total sweep build time is
        # minutes at maximum, we add a reasonable timeout.
        # Because we are not joining on the results, but actually periodically
        # asking "you done yet?", we can exit if this threshold is busted and
        # let the next run recover from the situation.
        should_be_done_by = time.time() + (60 * 20)

        Measure.gauge(f'{_measurement_name_base}per_account_sweep.total',
                      tags=_measurement_tags)(len(group_result.results))

        # Monitor the progress. Although this obviously can be achieved with
        # group_result.join(), we need to "see" into the task group progress.
        with Measure.gauge(f'{_measurement_name_base}per_account_sweep.done',
                           tags=_measurement_tags) as measure_done:
            while True:
                done_counter = 0
                for result in group_result.results:
                    logger.debug(f'{result}: {result.state}')
                    if result.ready():
                        done_counter += 1

                logger.debug(f"TOTAL: {done_counter}/{len(group_result.results)}")
                logger.debug("=" * 20)

                logger.debug("Checking group result")
                measure_done(done_counter)

                if group_result.ready():
                    logger.debug(f"#{sweep_id}-root: Sweep build complete")
                    break

                # Important. If we don't sleep, the native join in the Celery
                # context switches all the time and we end up with 100% CPU,
                # eventually somehow deadlocking the process. 5 seconds is kind
                # of an arbitrary number, but it does what we need and the
                # impact of a (potential) delay is absolutely minimal.
                time.sleep(5)

                # The last line of defense. Workers did not finish in the time
                # we expected; no point waiting, kill it.
                if time.time() > should_be_done_by:
                    Measure.gauge(f'{_measurement_name_base}per_account_sweep.early_exits',
                                  tags=_measurement_tags)(1)
                    logger.warning("Exiting incomplete sweep build, it's taking too long")
                    return

        logger.info("Waiting on results join")
        if group_result.supports_native_join:
            group_result.join_native()
        else:
            # Eager mode does not support native join.
            group_result.join()

        # # Alternative to Celery's native group_result.join():
        # # our manual task tracking code + join()
        # task_group.join()
        logger.info("Join complete, sweep build ended")
    except Exception as ex:
        ErrorInspector.inspect(ex, None, {'sweep_id': sweep_id})
def send_measurement_error(error_type: str, ad_account_id: str):
    Measure.counter(
        __name__ + '.errors',
        tags={'error_type': error_type, 'ad_account_id': ad_account_id},
    ).increment()