Example #1
 def test_cover_global_job_types(self):
     for report_type in (
         ReportType.sync_expectations,
         ReportType.sync_status,
         ReportType.import_accounts,
         ReportType.import_pages,
     ):
         with self.subTest(f'Report type = "{report_type}"'):
             assert (
                 detect_job_type(report_type) == JobType.GLOBAL
             ), f'Report type "{report_type}" should be known global job type'
Example #2
 def test_cover_unknown_job_types(self):
     unknown_report_type = 'dummy-type'
     unknown_report_variant = 'dummy-variant'
     test_cases = [
         (unknown_report_type, None),
         (None, unknown_report_variant),
         (unknown_report_type, unknown_report_variant),
         (None, None),
     ]
     for report_type, report_variant in test_cases:
         with self.subTest(f'Report type = "{report_type}"; Report variant = "{report_variant}"'):
             assert (
                 detect_job_type(report_type, report_variant) == JobType.UNKNOWN
             ), f'Report type "{report_type}" and report variant "{report_variant}" should be unknown job type'
Example #3
def iter_expectations(reality_claims_iter: Iterable[RealityClaim]) -> Generator[ExpectationClaim, None, None]:
    """
    Converts each RealityClaim (an object asserting that certain
    entities exist, with some metadata about their existence) into
    one or more ExpectationClaim objects expressing which report
    types (and for which dates) we expect to see for those entities.
    """
    histogram_counter = defaultdict(int)
    for claim in reality_claims_iter:
        jobs_generators = entity_expectation_generator_map.get(claim.entity_type, [])
        for jobs_generator in jobs_generators:
            for expectation_claim in jobs_generator(claim):
                yield expectation_claim
                job_type = detect_job_type(expectation_claim.report_type, expectation_claim.entity_type)
                histogram_counter[(claim.ad_account_id, claim.entity_type, job_type)] += 1

    # Flush per-(ad account, entity type, job type) expectation counts
    # as histogram measurements once the claim stream is exhausted.
    for ((ad_account_id, entity_type, job_type), count) in histogram_counter.items():
        Measure.histogram(
            f'{__name__}.{iter_expectations.__name__}.expectations_per_reality_claim',
            tags={'ad_account_id': ad_account_id, 'entity_type': entity_type, 'job_type': job_type},
        )(count)
Example #4
def iter_scorable(
    claims: Iterable[ExpectationClaim]
) -> Generator[ScorableClaim, None, None]:
    """Select signature for each expectation claim based on job history."""
    histogram_counter = defaultdict(int)
    for claim in claims:
        for scorable_claim in generate_scorable(claim):
            job_type = detect_job_type(claim.report_type, claim.entity_type)
            histogram_counter[(claim.ad_account_id, claim.entity_type,
                               job_type)] += 1
            yield scorable_claim

    for ((ad_account_id, entity_type, job_type),
         count) in histogram_counter.items():
        Measure.histogram(
            f'{__name__}.{iter_scorable.__name__}.scorable_claims_per_expectation_claim',
            tags={
                'ad_account_id': ad_account_id,
                'entity_type': entity_type,
                'job_type': job_type
            },
        )(count)
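Examples #3 and #4 share one measurement idiom: Measure.histogram(name, tags=...) returns a recorder, which is then immediately called with the value to record. A minimal sketch of that shape, assuming a statsd-like backend (the project's real Measure class may differ):

def histogram(metric_name: str, tags: dict = None):
    """Return a recorder that reports a value for metric_name."""
    def record(value):
        # A real backend would forward to statsd/Datadog here, e.g.
        # statsd_client.histogram(metric_name, value, tags=tags)
        print(f'histogram {metric_name} value={value} tags={tags}')
    return record

# Usage mirroring the call sites above:
histogram('expectations_per_reality_claim', tags={'job_type': 'paid'})(42)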
Example #5
 def job_type(self) -> str:
     return detect_job_type(self.report_type, self.report_variant)
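Example #5 is a lone method; in context it presumably lives on a claim class whose instances carry report_type and report_variant. A hypothetical host class (the class name and @property decorator are assumptions, not confirmed by the snippet):

class Claim:
    def __init__(self, report_type: str, report_variant: str = None):
        self.report_type = report_type
        self.report_variant = report_variant

    @property
    def job_type(self) -> str:
        # Same body as Example #5.
        return detect_job_type(self.report_type, self.report_variant)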
Example #6
 def historical_ratio(claim: ScorableClaim) -> float:
     """Multiplier based on past efforts to download job."""
     job_type = detect_job_type(claim.report_type, claim.report_variant)
     fn = SCORE_HISTORY_HANDLERS.get((job_type, claim.report_type),
                                     ScoreSkewHandlers.same_score)
     return fn(claim)
Example #7
 def skew_ratio(claim: ScorableClaim) -> float:
     job_type = detect_job_type(claim.report_type, claim.report_variant)
     fn = SCORE_SKEW_HANDLERS.get((job_type, claim.report_type),
                                  ScoreSkewHandlers.same_score)
     return fn(claim)
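Examples #6 and #7 rely on the same dispatch-table pattern: a dict keyed by (job_type, report_type) maps to a scoring handler, and dict.get falls back to ScoreSkewHandlers.same_score for unlisted combinations. A hedged sketch of that table's shape (the fallback returning 1.0 and the sample entry are illustrative assumptions, not the project's actual values):

class ScoreSkewHandlers:
    @staticmethod
    def same_score(claim) -> float:
        # Assumed neutral multiplier: leaves the score unchanged.
        return 1.0

SCORE_SKEW_HANDLERS = {
    # Hypothetical entry: halve the score of lifetime paid reports.
    (JobType.PAID_DATA, ReportType.lifetime): lambda claim: 0.5,
}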
Example #8
 def test_cover_all_known_entity_job_types(self):
     for entity_type in set.union(Entity.AA_SCOPED, Entity.NON_AA_SCOPED):
         with self.subTest(f'Entity type = "{entity_type}"'):
             assert (
                 detect_job_type(ReportType.entity, entity_type) != JobType.UNKNOWN
             ), f'Entity report for "{entity_type}" should be known job type'
Example #9
 def test_cover_all_known_lifetime_metrics_paid_job_types(self):
     for entity_type in Entity.AA_SCOPED:
         with self.subTest(f'Entity type = "{entity_type}"'):
             assert (
                 detect_job_type(ReportType.lifetime, entity_type) == JobType.PAID_DATA
             ), f'Lifetime report for "{entity_type}" should be paid job type'
Example #10
 def test_cover_all_known_lifetime_metrics_organic_job_types(self):
     for entity_type in Entity.NON_AA_SCOPED:
         with self.subTest(f'Entity type = "{entity_type}"'):
             assert (
                 detect_job_type(ReportType.lifetime, entity_type) == JobType.ORGANIC_DATA
             ), f'Lifetime report for "{entity_type}" should be organic job type'
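Taken together, the tests in Examples #1, #2, and #8 through #10 pin down the observable behavior of detect_job_type. A minimal sketch consistent with those tests (the real implementation may be structured differently):

_GLOBAL_REPORT_TYPES = {
    ReportType.sync_expectations,
    ReportType.sync_status,
    ReportType.import_accounts,
    ReportType.import_pages,
}

def detect_job_type(report_type, report_variant=None):
    # Global maintenance reports are global jobs regardless of variant.
    if report_type in _GLOBAL_REPORT_TYPES:
        return JobType.GLOBAL
    # Ad-account-scoped entities yield paid data; non-AA-scoped, organic.
    if report_variant in Entity.AA_SCOPED:
        return JobType.PAID_DATA
    if report_variant in Entity.NON_AA_SCOPED:
        return JobType.ORGANIC_DATA
    return JobType.UNKNOWN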
Example #11
def iter_persist_prioritized(
    sweep_id: str, prioritized_iter: Iterable[PrioritizationClaim]
) -> Generator[PrioritizationClaim, None, None]:
    """Persist prioritized jobs and pass-through context objects for inspection."""

    AccountCache.reset()

    with SortedJobsQueue(sweep_id).JobsWriter() as add_to_queue:

        _measurement_name_base = f'{__name__}.{iter_persist_prioritized.__name__}'

        _before_next_prioritized = time.time()
        for prioritization_claim in prioritized_iter:
            job_type = detect_job_type(prioritization_claim.report_type,
                                       prioritization_claim.entity_type)
            _measurement_tags = {
                'entity_type': prioritization_claim.entity_type,
                'report_type': prioritization_claim.report_type,
                'ad_account_id': prioritization_claim.ad_account_id,
                'job_type': job_type,
                'sweep_id': sweep_id,
            }

            Measure.timing(f'{_measurement_name_base}.next_prioritized',
                           tags=_measurement_tags,
                           sample_rate=0.01)(
                               (time.time() - _before_next_prioritized) * 1000)

            score = prioritization_claim.score
            if not should_persist(score):
                logger.debug(
                    f'Not persisting job {prioritization_claim.job_id} due to low score: {score}'
                )
                continue

            # The following are JobScope attributes we don't store on
            # JobID, so we need to store them separately.
            # See the JobScope object for exact attribute names.
            # At this point the persister forms the auxiliary data blob
            # for saving on Data Flower. We don't have to do that here:
            # it can be pre-computed and placed on the JobSignature.
            # TODO: contemplate moving auxiliary data formation to
            #       place where JobSignatures are generated and use that
            #       data for Data Flower (as it was originally intended
            #       but not implemented because saving each job's data
            #       individually to Data Flower was too slow)
            # So, here you would unpack
            # **job_kwargs
            # that you get from prioritization_claim.score_job_pairs
            # ... Until then:
            extra_data = {}
            if prioritization_claim.timezone:
                extra_data['ad_account_timezone_name'] = prioritization_claim.timezone

            with Measure.timer(f'{_measurement_name_base}.add_to_queue',
                               tags=_measurement_tags):
                if prioritization_claim.report_age_in_days is not None:
                    Measure.histogram(
                        f'{_measurement_name_base}.report_age',
                        tags=_measurement_tags)(
                            prioritization_claim.report_age_in_days)
                add_to_queue(prioritization_claim.job_id, score, **extra_data)

            # This timer includes the time the consumer of this generator
            # spends between reads from us. It is a good way to measure how
            # quickly we are consumed (what pauses occur between reads).
            with Measure.timer(f'{_measurement_name_base}.yield_result',
                               tags=_measurement_tags):
                yield prioritization_claim

            _before_next_prioritized = time.time()
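Because iter_persist_prioritized is a generator, nothing is queued until a consumer pulls from it; the yield_result timer above measures exactly those consumer-side pauses. A hypothetical driver loop (sweep_id and prioritized_claims are assumed to come from earlier pipeline stages):

for claim in iter_persist_prioritized(sweep_id, prioritized_claims):
    # Downstream inspection of the pass-through PrioritizationClaim;
    # this body is a placeholder.
    logger.debug(f'Persisted job {claim.job_id} with score {claim.score}')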