# Example 1 (scrape artifact marker)
def build_sweep_slice_per_page(sweep_id: str,
                               page_reality_claim: RealityClaim,
                               task_id: "str | None" = None):
    """Run the sweep-builder pipeline for a single page and queue its tasks.

    Chains the page-level claim with every reality claim derived from that
    page, feeds the combined stream through the pipeline, and emits sampled
    timing metrics for each persisted claim.

    :param sweep_id: Identifier of the sweep being built.
    :param page_reality_claim: Root claim for the page; per-entity claims
        are derived from it via ``iter_reality_per_page_claim``.
    :param task_id: Task id used to scope the task context; defaults to
        None.  # NOTE(review): presumably TaskGroup.task_context accepts None — confirm
    :return: Number of claims queued (may be a partial count if an error
        was swallowed by the boundary handler below).
    """
    # Local imports — presumably to break a circular import at module load;
    # keep them function-scoped.
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_per_page_claim

    cnt = 0
    try:
        with TaskGroup.task_context(task_id):

            _measurement_name_base = __name__ + '.' + build_sweep_slice_per_page.__name__ + '.'
            _measurement_tags = {
                'sweep_id': sweep_id,
                'page_id': page_reality_claim.entity_id
            }

            # Process the page claim itself first, then everything under it.
            reality_claims_iter = itertools.chain(
                [page_reality_claim],
                iter_reality_per_page_claim(page_reality_claim))

            _step = 1000  # progress-log interval
            _before_fetch = time.time()
            for claim in iter_pipeline(sweep_id, reality_claims_iter):
                # Wall-clock time spent producing this persisted claim,
                # in milliseconds, sampled at 1%.
                Measure.timing(
                    _measurement_name_base + 'next_persisted',
                    tags={
                        'entity_type': claim.entity_type,
                        **_measurement_tags
                    },
                    sample_rate=0.01,
                )((time.time() - _before_fetch) * 1000)
                cnt += 1

                if cnt % _step == 0:
                    logger.info(
                        f'#{sweep_id}-#{page_reality_claim.entity_id}: Queueing up #{cnt}'
                    )

                _before_fetch = time.time()

            logger.info(
                f"#{sweep_id}-#{page_reality_claim.entity_id}: Queued up a total of {cnt} tasks"
            )
    except Exception as ex:
        # Boundary handler: record the failure and return the partial count
        # instead of letting one bad page kill the whole sweep.
        ErrorInspector.inspect(ex, page_reality_claim.ad_account_id,
                               {'sweep_id': sweep_id})

    return cnt
def iter_prioritized(
    claims: Iterable[ScorableClaim]
) -> Generator[PrioritizationClaim, None, None]:
    """Yield a ``PrioritizationClaim`` (claim data + score) per scorable claim.

    Claims whose scoring raises ``ScoringException`` are reported to the
    error inspector and skipped; all others are yielded with their score.
    """
    measurement_base = f'{__name__}.{iter_prioritized.__name__}'

    last_done_ts = time.time()

    for scorable in claims:
        tags = {
            'entity_type': scorable.entity_type,
            'ad_account_id': scorable.ad_account_id
        }

        # How long since the previous claim was fully handled (ms, 1% sampled).
        elapsed_ms = (time.time() - last_done_ts) * 1000
        Measure.timing(f'{measurement_base}.next_expected',
                       tags=tags,
                       sample_rate=0.01)(elapsed_ms)

        try:
            score = ScoreCalculator.assign_score(scorable)
            # Time spent blocked on the consumer taking our yielded value.
            with Measure.timer(f'{measurement_base}.yield_result',
                               tags=tags):
                yield PrioritizationClaim(
                    scorable.entity_id,
                    scorable.entity_type,
                    scorable.report_type,
                    scorable.job_signature,
                    score,
                    ad_account_id=scorable.ad_account_id,
                    timezone=scorable.timezone,
                    range_start=scorable.range_start,
                )
        except ScoringException as scoring_error:
            # Report and move on — a single unscorable claim must not stop
            # the whole stream.
            ErrorInspector.inspect(scoring_error, scorable.ad_account_id,
                                   {'job_id': scorable.job_id})

        last_done_ts = time.time()
# Example 3 (scrape artifact marker)
def iter_persist_prioritized(
    sweep_id: str, prioritized_iter: Iterable[PrioritizationClaim]
) -> Generator[PrioritizationClaim, None, None]:
    """Persist prioritized jobs and pass-through context objects for inspection.

    For each incoming claim: measure inter-arrival time, drop claims whose
    score is below the persistence threshold, write the job id + score (plus
    auxiliary timezone data) to the sorted jobs queue, and yield the claim
    unchanged so downstream consumers can inspect it.
    """

    # Clear any per-account cache left over from a previous sweep run.
    AccountCache.reset()

    # JobsWriter is a context manager yielding a callable that enqueues
    # (job_id, score, **extra) records for this sweep.
    with SortedJobsQueue(sweep_id).JobsWriter() as add_to_queue:

        _measurement_name_base = f'{__name__}.{iter_persist_prioritized.__name__}'

        _before_next_prioritized = time.time()
        for prioritization_claim in prioritized_iter:
            job_type = detect_job_type(prioritization_claim.report_type,
                                       prioritization_claim.entity_type)
            _measurement_tags = {
                'entity_type': prioritization_claim.entity_type,
                'report_type': prioritization_claim.report_type,
                'ad_account_id': prioritization_claim.ad_account_id,
                'job_type': job_type,
                'sweep_id': sweep_id,
            }

            # Time (ms) since we finished handling the previous claim,
            # sampled at 1%. Includes upstream generator latency.
            Measure.timing(f'{_measurement_name_base}.next_prioritized',
                           tags=_measurement_tags,
                           sample_rate=0.01)(
                               (time.time() - _before_next_prioritized) * 1000)

            score = prioritization_claim.score
            # Low-score claims are neither persisted nor yielded.
            if not should_persist(score):
                logger.debug(
                    f'Not persisting job {prioritization_claim.job_id} due to low score: {score}'
                )
                continue

            # Following are JobScope attributes we don't store on JobID
            # so we need to store them separately.
            # See JobScope object for exact attr names.
            # At this point persister forms the
            # auxiliary data blob for saving on Data Flower.
            # We don't have to do that here.
            # It can be pre-computed and placed on the JobSignature
            # TODO: contemplate moving auxiliary data formation to
            #       place where JobSignatures are generated and use that
            #       data for Data Flower (as it was originally intended
            #       but not implemented because saving each job's data
            #       individually to Data Flower was too slow)
            # So, here you would unpack
            # **job_kwargs
            # that you get from prioritization_claim.score_job_pairs
            # ... Until then:
            extra_data = {}
            if prioritization_claim.timezone:
                extra_data[
                    'ad_account_timezone_name'] = prioritization_claim.timezone

            with Measure.timer(f'{_measurement_name_base}.add_to_queue',
                               tags=_measurement_tags):
                # Report age histogram only when the claim carries one.
                if prioritization_claim.report_age_in_days is not None:
                    Measure.histogram(
                        f'{_measurement_name_base}.report_age',
                        tags=_measurement_tags)(
                            prioritization_claim.report_age_in_days)
                add_to_queue(prioritization_claim.job_id, score, **extra_data)

            # This time includes the time consumer of this generator wastes
            # between reads from us. Good way to measure how quickly we are
            # consumed (what pauses we have between each consumption)
            with Measure.timer(f'{_measurement_name_base}.yield_result',
                               tags=_measurement_tags):
                yield prioritization_claim

            _before_next_prioritized = time.time()