def feedback_entity_task(entity_data: Dict[str, Any], entity_type: str):
    """
    Feed collected entity information back by updating the data store.

    ``PutError`` / ``UpdateError`` exceptions that
    ``ErrorInspector.is_dynamo_throughput_error`` classifies as throughput
    errors are counted and logged at info level, and the task then returns
    normally. Every other ``PutError`` / ``UpdateError`` is re-raised.

    :param entity_data: The entity we're feeding back to the system
    :param entity_type: Type of the entity, a string representation
    """
    try:
        feedback_entity(entity_data, entity_type)
    except (PutError, UpdateError) as ex:
        # Anything that is not a throughput problem is a genuine failure.
        if not ErrorInspector.is_dynamo_throughput_error(ex):
            raise

        metric_name = feedback_entity_task.__name__ + '.throughput_exceptions'
        metric_tags = {
            'entity_type': entity_type,
            'ad_account_id': determine_ad_account_id(entity_data, entity_type),
        }
        Measure.counter(metric_name, tags=metric_tags).increment()
        logger.info(str(ex))
def _fetch_job_report(job_id: str) -> Optional[JobReport]:
    """Look up the job report stored for *job_id*.

    When the report's ``fails_in_row`` has reached
    ``PERMANENTLY_FAILING_JOB_THRESHOLD``, a ``permanently_failing_job``
    counter is incremented and a warning is logged; the report is still
    returned.

    :param job_id: Identifier passed to ``JobReport.get``
    :return: The report, or ``None`` when ``DoesNotExist`` is raised
    """
    try:
        job_report = JobReport.get(job_id)
        failure_streak = job_report.fails_in_row
        # A missing / zero streak is falsy and never counts as failing.
        if failure_streak and failure_streak >= PERMANENTLY_FAILING_JOB_THRESHOLD:
            Measure.counter('permanently_failing_job').increment()
            logger.warning(
                f'[permanently-failing-job] Job with id {job_id} failed {failure_streak}'
                f' times in a row.'
            )
        return job_report
    except DoesNotExist:
        return None
def iter_entities_per_page_id(
    page_id: str, fields: List[str] = None, page_entity_types: List[str] = None
) -> Generator[Dict[str, Any], None, None]:
    """Yield entity records stored under *page_id* as dicts.

    Each yielded dict has its ``page_id`` key renamed to ``ad_account_id``.
    While iterating, a per-model counter is advanced in batches of 1000
    records, with the remainder reported once the model is exhausted.

    NOTE(review): the rename requires ``to_dict`` to emit a ``page_id`` key;
    presumably a restrictive *fields* list that omits it would raise
    ``KeyError`` here -- confirm against ``to_dict``.

    :param page_id: Key passed to each model's ``query``
    :param fields: Forwarded to ``to_dict(fields=...)``
    :param page_entity_types: Keys of ``page_entity_type_model_map`` to
        iterate; ``None`` or an empty list selects every model in the map.
        An unknown key raises ``KeyError``.
    """
    if page_entity_types:
        models = [page_entity_type_model_map[type_name] for type_name in page_entity_types]
    else:
        models = page_entity_type_model_map.values()

    report_every = 1000
    metric_name = __name__ + '.entities_per_page_id'

    for model in models:
        seen = 0
        metric_tags = {'ad_account_id': page_id, 'entity_type': model.entity_type}
        with Measure.counter(metric_name, tags=metric_tags) as reported:
            # Keeps every record whose is_accessible is not explicitly False.
            records = model.query(page_id, filter_condition=(model.is_accessible != False))  # noqa: E712
            for seen, record in enumerate(records, start=1):
                payload = record.to_dict(fields=fields, skip_null=True)
                # this is unfortunate, but we need to change page_id to ad_account_id
                payload['ad_account_id'] = payload.pop('page_id')
                yield payload
                if seen % report_every == 0:
                    reported += report_every

            # Report whatever did not fill a complete batch.
            leftover = seen % report_every
            if leftover:
                reported += leftover
def iter_entities_per_ad_account_id(
    ad_account_id: str, fields: List[str] = None, entity_types: List[str] = None
) -> Generator[Dict[str, Any], None, None]:
    """Yield entity records stored under *ad_account_id* as dicts.

    While iterating, a per-model counter is advanced in batches of 1000
    records, with the remainder reported once the model is exhausted.

    :param ad_account_id: Key passed to each model's ``query``
    :param fields: Forwarded to ``to_dict(fields=...)``
    :param entity_types: Keys of ``entity_type_model_map`` to iterate. An
        explicit ``None`` or an empty list both mean "use the default list",
        i.e. every model in the map.
    """
    if entity_types:
        # Intentionally brittle: this function is tied to the known types
        # "statically" and must not hide misses in the map, so an unknown
        # name raises KeyError.
        selected_models = [entity_type_model_map[type_name] for type_name in entity_types]
    else:
        # All types are returned
        selected_models = entity_type_model_map.values()

    report_every = 1000
    metric_name = __name__ + '.entities_per_ad_account_id'

    for model in selected_models:
        seen = 0
        metric_tags = {'ad_account_id': ad_account_id, 'entity_type': model.entity_type}
        with Measure.counter(metric_name, tags=metric_tags) as reported:
            # Keeps every record whose is_accessible is not explicitly False.
            records = model.query(ad_account_id, filter_condition=(model.is_accessible != False))  # noqa: E712
            for seen, record in enumerate(records, start=1):
                yield record.to_dict(fields=fields, skip_null=True)
                if seen % report_every == 0:
                    reported += report_every

            # Report whatever did not fill a complete batch.
            leftover = seen % report_every
            if leftover:
                reported += leftover
def _send_measurement_task_runtime(job_scope: JobScope, bucket: int):
    """Report data-point and running-time metrics for *job_scope*.

    All metrics live under ``<module>.report_tasks_outcome`` and share one
    tag set built from the job scope plus *bucket*. The data-point counter
    and histogram are only emitted for a positive ``datapoint_count``; the
    running-time gauge is always emitted.

    :param job_scope: Source of the tag values, ``datapoint_count`` and
        ``running_time``
    :param bucket: Value of the ``bucket`` tag
    """
    base_name = f'{__name__}.report_tasks_outcome'
    tags = {
        'ad_account_id': job_scope.ad_account_id,
        'sweep_id': job_scope.sweep_id,
        'report_type': job_scope.report_type,
        'report_variant': job_scope.report_variant,
        'bucket': bucket,
        'job_type': job_scope.job_type,
    }

    data_points = job_scope.datapoint_count
    # None and 0 are both falsy, so they skip the data-point metrics.
    if data_points and data_points > 0:
        data_points_metric = f'{base_name}.data_points'
        Measure.counter(data_points_metric, tags=tags).increment(data_points)
        Measure.histogram(data_points_metric, tags=tags)(data_points)

    report_running_time = Measure.gauge(f'{base_name}.running_time', tags=tags)
    report_running_time(job_scope.running_time)
def __init__(
    self,
    sweep_id: str,
    sweep_status_tracker: SweepStatusTracker,
    oozed_total: int,
    stop_waiting_time: float,
    *,
    wait_interval: int = 1,
):
    """Store the sweep parameters and create the ``done`` counter.

    :param sweep_id: Sweep identifier; also used as the ``sweep_id`` metric tag
    :param sweep_status_tracker: Tracker instance kept on the object
    :param oozed_total: Stored as-is (presumably the number of tasks that
        were oozed -- confirm with the caller)
    :param stop_waiting_time: Stored as-is (presumably a timestamp after
        which waiting should stop -- confirm with the caller)
    :param wait_interval: Keyword-only, defaults to 1; units are not visible
        from this method
    """
    # Values handed in by the caller.
    self.sweep_id = sweep_id
    self.sweep_status_tracker = sweep_status_tracker
    self.oozed_total = oozed_total
    self.stop_waiting_time = stop_waiting_time
    self.wait_interval = wait_interval

    # Internal bookkeeping.
    done_metric_name = f'{__name__}.done'
    self.counter = Measure.counter(done_metric_name, tags={'sweep_id': sweep_id})
    self._last_total = 0
def __init__(
    self,
    sweep_id: str,
    sweep_status_tracker: SweepStatusTracker,
    pulse_review_interval: int,
    stop_oozing_time: float,
    *,
    wait_interval: int = 1,
):
    """Store the sweep parameters and reset the oozing state.

    The oozing rate starts at ``OOZER_START_RATE`` and the oozed count at
    zero. Both review timestamps start at the current time, rounded to whole
    seconds, minus one.

    :param sweep_id: Sweep identifier; also used as the ``sweep_id`` metric tag
    :param sweep_status_tracker: Tracker instance kept on the object
    :param pulse_review_interval: Stored as-is; units are not visible from
        this method
    :param stop_oozing_time: Stored as-is (presumably a timestamp after
        which oozing should stop -- confirm with the caller)
    :param wait_interval: Keyword-only, defaults to 1; units are not visible
        from this method
    """
    # Values handed in by the caller.
    self.sweep_id = sweep_id
    self.sweep_status_tracker = sweep_status_tracker
    self.pulse_review_interval = pulse_review_interval
    self.stop_oozing_time = stop_oozing_time
    self.wait_interval = wait_interval

    # Initial oozing state.
    self.oozed_count = 0
    self.oozing_rate = OOZER_START_RATE

    oozed_metric_name = f'{__name__}.oozed'
    self.counter = Measure.counter(oozed_metric_name, tags={'sweep_id': sweep_id})

    # Both review clocks share the same starting point.
    review_start = round(time.time()) - 1
    self._rate_review_time = review_start
    self._pulse_review_time = review_start
    self._tasks_since_review = 0
def send_measurement_error(error_type: str, ad_account_id: str):
    """Increment this module's ``.errors`` counter once.

    The ``error_type`` / ``ad_account_id`` dict is handed to
    ``Measure.counter`` as its second positional argument.

    :param error_type: Value stored under the ``error_type`` key
    :param ad_account_id: Value stored under the ``ad_account_id`` key
    """
    error_details = {'error_type': error_type, 'ad_account_id': ad_account_id}
    error_counter = Measure.counter(__name__ + '.errors', error_details)
    error_counter.increment()
def build_sweep(sweep_id: str):
    """Build the sweep *sweep_id* and wait for its fanned-out slice tasks.

    Steps, in order:

    1. ``init_tokens(sweep_id)`` prepositions platform tokens for workers.
    2. Every claim from ``iter_reality_base()`` is dispatched by entity type:
       AdAccount and Page claims become immutable task signatures (``.si``)
       collected in ``delayed_tasks``; any other claim is pushed through
       ``iter_pipeline`` right here.
    3. The collected signatures are started as one group, and the group is
       polled every 5 seconds until it is ready. If 20 minutes pass after
       dispatch, the function returns early without joining.
    4. Once ready, the group results are joined.

    Any exception raised along the way is handed to
    ``ErrorInspector.inspect``; whether that call re-raises is not visible
    from this function.

    :param sweep_id: Identifier of the sweep; used in log lines, metric tags
        and passed to every task and pipeline call
    """
    # Imported at call time rather than at module level (presumably to avoid
    # an import cycle or import-time cost -- confirm).
    from sweep_builder.init_tokens import init_tokens
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_base

    try:
        _measurement_name_base = __name__ + '.' + build_sweep.__name__ + '.'
        _measurement_tags = {'sweep_id': sweep_id}

        # In the jobs persister we purposefully avoid persisting
        # anything besides the Job ID. This means that things like tokens
        # and other data on *Claim is lost.
        # As long as we are doing that, we need to leave tokens somewhere
        # for workers to pick up.
        logger.info(f"#{sweep_id} Prepositioning platform tokens")
        init_tokens(sweep_id)

        logger.info(f"#{sweep_id} Starting sweep building")

        # task_group = TaskGroup()
        delayed_tasks = []

        cnt = 0
        with Measure.counter(_measurement_name_base + 'outer_loop', tags=_measurement_tags) as cntr:
            for reality_claim in iter_reality_base():
                # what we get here are Scope and AdAccount objects.
                # Children of AdAccount reality claims are to be processed
                # in separate Celery tasks. But we still have jobs
                # associated with Scopes objects, so
                # need to rate and store the jobs before chipping off
                # a separate task for each of AdAccounts.
                if reality_claim.entity_type == Entity.AdAccount:

                    # child_task_id = task_group.generate_task_id()
                    # task_group.report_task_active(child_task_id)

                    delayed_tasks.append(
                        # we are using Celery chord to process AdAccounts in parallel
                        # for very very large (hundreds of thousands) numbers of AdAccounts,
                        # chord management will be super memory expensive,
                        # as chord timer/controller will be looking at entire list on
                        # each tick.
                        # In that case, probably better to switch to
                        # a callback per handler + mutex/counter somewhere
                        build_sweep_slice_per_ad_account_task.si(
                            sweep_id,
                            reality_claim,
                            # task_id=child_task_id
                        ))
                elif reality_claim.entity_type == Entity.Page:
                    delayed_tasks.append(
                        build_sweep_slice_per_page.si(sweep_id, reality_claim))
                else:
                    # NOTE(review): cnt restarts at 1 for every claim handled
                    # in this branch, so it ends one above the number of
                    # items iterated and the "total" logged after the loop
                    # appears to cover only the last such claim -- confirm
                    # whether this is intended.
                    cnt = 1
                    _step = 1000
                    for _ in iter_pipeline(sweep_id, [reality_claim]):
                        cnt += 1
                        if cnt % _step == 0:
                            cntr += _step
                            logger.info(
                                f'#{sweep_id}-root: Queueing up #{cnt}')

                    # because above counter communicates only increments of _step,
                    # we need to report remainder --- amount under _step
                    cntr += cnt % _step

        logger.info(f"#{sweep_id}-root: Queued up a total of {cnt} tasks")

        # # here we fan out actual work to celery workers
        # # and wait for all tasks to finish before returning
        group_result = group(delayed_tasks).delay()

        # In case the workers crash, go-away (scaling) or are otherwise
        # non-responsive, the following would wait indefinitely.
        # Since that's not desirable and the total sweep build time is minutes at
        # maximum, we add a reasonable timeout
        # Because we are not joining on the results, but actually periodically
        # looking for "you done yet?", we can exit if this threshold is busted, and
        # let the next run recover from the situation
        # Deadline: 20 minutes from the moment the group was dispatched.
        should_be_done_by = time.time() + (60 * 20)

        # Record how many slice tasks were dispatched in total.
        Measure.gauge(f'{_measurement_name_base}per_account_sweep.total',
                      tags=_measurement_tags)(len(group_result.results))

        # Monitor the progress. Although this obviously can be achieved with
        # group_result.join(), we need to "see" into the task group progress
        with Measure.gauge(f'{_measurement_name_base}per_account_sweep.done',
                           tags=_measurement_tags) as measure_done:
            while True:
                # Recount finished tasks from scratch on every poll.
                done_counter = 0
                for result in group_result.results:
                    logger.debug(f'{result}: {result.state}')
                    if result.ready():
                        done_counter += 1

                logger.debug(
                    f"TOTAL: {done_counter}/{len(group_result.results)}")
                logger.debug("=" * 20)

                logger.debug("Checking group result")

                measure_done(done_counter)
                if group_result.ready():
                    logger.debug(f"#{sweep_id}-root: Sweep build complete")
                    break

                # Important. If we don't sleep, the native join in celery context
                # switches all the time and we end up with 100% cpu, eventually somehow
                # deadlocking the process. 5 seconds is kind of an arbitrary number, but
                # does what we need and the impact of a (potential) delay is absolutely
                # minimal
                time.sleep(5)

                # The last line of defense. Workers did not finish in time we
                # expected, no point waiting, kill it.
                if time.time() > should_be_done_by:
                    Measure.gauge(
                        f'{_measurement_name_base}per_account_sweep.early_exits',
                        tags=_measurement_tags)(1)
                    logger.warning(
                        "Exiting incomplete sweep build, it's taking too long")
                    # Leaves without joining; the join below is skipped.
                    return

        logger.info("Waiting on results join")
        if group_result.supports_native_join:
            group_result.join_native()
        else:
            # Eager mode does not support native join.
            group_result.join()

        # # alternative to Celery's native group_result.join()
        # # our manual task tracking code + join()
        # task_group.join()
        logger.info("Join complete, sweep build ended")
    except Exception as ex:
        # Top-level boundary: hand the failure, tagged with the sweep id,
        # to the shared inspector.
        ErrorInspector.inspect(ex, None, {'sweep_id': sweep_id})