import itertools
import logging
import time
from typing import Generator, Iterable, Optional

# Project-internal names used below (RealityClaim, ScorableClaim,
# PrioritizationClaim, TaskGroup, Measure, ErrorInspector, ScoreCalculator,
# ScoringException, SortedJobsQueue, AccountCache, detect_job_type,
# should_persist) are assumed to be imported from elsewhere in this package.

logger = logging.getLogger(__name__)


def build_sweep_slice_per_page(
    sweep_id: str, page_reality_claim: RealityClaim, task_id: Optional[str] = None
) -> int:
    """Run the sweep-builder pipeline for a single page and queue up its jobs."""
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_per_page_claim

    cnt = 0
    try:
        with TaskGroup.task_context(task_id):
            _measurement_name_base = __name__ + '.' + build_sweep_slice_per_page.__name__ + '.'
            _measurement_tags = {'sweep_id': sweep_id, 'page_id': page_reality_claim.entity_id}

            reality_claims_iter = itertools.chain(
                [page_reality_claim],
                iter_reality_per_page_claim(page_reality_claim),
            )

            _step = 1000
            _before_fetch = time.time()
            for claim in iter_pipeline(sweep_id, reality_claims_iter):
                # Time elapsed since the previous claim was persisted.
                Measure.timing(
                    _measurement_name_base + 'next_persisted',
                    tags={'entity_type': claim.entity_type, **_measurement_tags},
                    sample_rate=0.01,
                )((time.time() - _before_fetch) * 1000)
                cnt += 1
                if cnt % _step == 0:
                    logger.info(f'#{sweep_id}-#{page_reality_claim.entity_id}: Queueing up #{cnt}')
                _before_fetch = time.time()

            logger.info(f'#{sweep_id}-#{page_reality_claim.entity_id}: Queued up a total of {cnt} tasks')
    except Exception as ex:
        ErrorInspector.inspect(ex, page_reality_claim.ad_account_id, {'sweep_id': sweep_id})

    return cnt
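# Note on the Measure.timing(...)(...) idiom above: it assumes a statsd-style
# metrics client where Measure.timing(metric, tags=..., sample_rate=...)
# returns a callable that records a duration in milliseconds. A minimal,
# purely illustrative stand-in (metrics_backend and its record_timing method
# are hypothetical names, not this project's actual client):
#
#     class Measure:
#         @staticmethod
#         def timing(metric_name, tags=None, sample_rate=1.0):
#             def record(value_ms):
#                 metrics_backend.record_timing(metric_name, value_ms, tags, sample_rate)
#             return record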
def iter_prioritized(claims: Iterable[ScorableClaim]) -> Generator[PrioritizationClaim, None, None]:
    """Assign a score to each scorable claim and yield it as a PrioritizationClaim."""
    _measurement_name_base = f'{__name__}.{iter_prioritized.__name__}'

    _before_next_expectation = time.time()
    for claim in claims:
        _measurement_tags = {'entity_type': claim.entity_type, 'ad_account_id': claim.ad_account_id}

        # Time elapsed waiting for the upstream iterator to produce this claim.
        Measure.timing(
            f'{_measurement_name_base}.next_expected',
            tags=_measurement_tags,
            sample_rate=0.01,
        )((time.time() - _before_next_expectation) * 1000)

        try:
            score = ScoreCalculator.assign_score(claim)
            with Measure.timer(f'{_measurement_name_base}.yield_result', tags=_measurement_tags):
                yield PrioritizationClaim(
                    claim.entity_id,
                    claim.entity_type,
                    claim.report_type,
                    claim.job_signature,
                    score,
                    ad_account_id=claim.ad_account_id,
                    timezone=claim.timezone,
                    range_start=claim.range_start,
                )
        except ScoringException as e:
            # A failed score drops only this claim; the sweep continues.
            ErrorInspector.inspect(e, claim.ad_account_id, {'job_id': claim.job_id})

        _before_next_expectation = time.time()
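# Usage sketch (illustrative; scorable_claims is a hypothetical iterable of
# ScorableClaim objects): scoring is lazy, happening only as the consumer
# pulls from the generator, and a ScoringException skips just the offending
# claim instead of aborting the whole sweep.
#
#     for prioritization_claim in iter_prioritized(scorable_claims):
#         print(prioritization_claim.job_id, prioritization_claim.score)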
def iter_persist_prioritized(
    sweep_id: str, prioritized_iter: Iterable[PrioritizationClaim]
) -> Generator[PrioritizationClaim, None, None]:
    """Persist prioritized jobs and pass through context objects for inspection."""
    AccountCache.reset()

    with SortedJobsQueue(sweep_id).JobsWriter() as add_to_queue:
        _measurement_name_base = f'{__name__}.{iter_persist_prioritized.__name__}'

        _before_next_prioritized = time.time()
        for prioritization_claim in prioritized_iter:
            job_type = detect_job_type(prioritization_claim.report_type, prioritization_claim.entity_type)
            _measurement_tags = {
                'entity_type': prioritization_claim.entity_type,
                'report_type': prioritization_claim.report_type,
                'ad_account_id': prioritization_claim.ad_account_id,
                'job_type': job_type,
                'sweep_id': sweep_id,
            }

            # Time elapsed waiting for the prioritizer to produce this claim.
            Measure.timing(
                f'{_measurement_name_base}.next_prioritized',
                tags=_measurement_tags,
                sample_rate=0.01,
            )((time.time() - _before_next_prioritized) * 1000)

            score = prioritization_claim.score
            if not should_persist(score):
                logger.debug(f'Not persisting job {prioritization_claim.job_id} due to low score: {score}')
                continue

            # The following are JobScope attributes we don't store on the JobID,
            # so we need to store them separately.
            # See the JobScope object for exact attribute names.
            # At this point the persister forms the auxiliary data blob for
            # saving on Data Flower. We don't have to do that here:
            # it can be pre-computed and placed on the JobSignature.
            # TODO: contemplate moving auxiliary data formation to the place
            # where JobSignatures are generated and use that data for
            # Data Flower (as it was originally intended, but not implemented
            # because saving each job's data individually to Data Flower was
            # too slow). So, here you would unpack **job_kwargs that you get
            # from prioritization_claim.score_job_pairs ... Until then:
            extra_data = {}
            if prioritization_claim.timezone:
                extra_data['ad_account_timezone_name'] = prioritization_claim.timezone

            with Measure.timer(f'{_measurement_name_base}.add_to_queue', tags=_measurement_tags):
                if prioritization_claim.report_age_in_days is not None:
                    Measure.histogram(
                        f'{_measurement_name_base}.report_age',
                        tags=_measurement_tags,
                    )(prioritization_claim.report_age_in_days)
                add_to_queue(prioritization_claim.job_id, score, **extra_data)

            # This time includes the time the consumer of this generator spends
            # between reads from us. It's a good way to measure how quickly we
            # are consumed (what pauses we have between each consumption).
            with Measure.timer(f'{_measurement_name_base}.yield_result', tags=_measurement_tags):
                yield prioritization_claim

            _before_next_prioritized = time.time()
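# Putting the stages together (a plausible shape for iter_pipeline, which
# build_sweep_slice_per_page imports above; iter_scorable is a stand-in name
# for the stage that turns reality claims into ScorableClaim objects, and the
# real stage names may differ):
#
#     def iter_pipeline(sweep_id, reality_claims_iter):
#         yield from iter_persist_prioritized(
#             sweep_id,
#             iter_prioritized(iter_scorable(reality_claims_iter)),
#         )
#
# Because every stage is a generator, the sweep streams end to end and no
# intermediate list of claims is ever materialized.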