# NOTE: excerpted task code; project-internal names (JobScope, JobContext,
# Entity, PlatformTokenManager, PlatformApiContext, ChunkDumpStore,
# NormalStore, get_default_fields, add_vendor_data, generate_universal_id,
# etc.) come from the host project's own imports, which are omitted here.
from typing import Any, Dict, Generator

from facebook_business.adobjects import page
from facebook_business.adobjects.adsinsights import AdsInsights
from facebook_business.adobjects.business import Business
from facebook_business.adobjects.page import Page
from facebook_business.api import FacebookRequest


def collect_page(job_scope: JobScope, _job_context: JobContext):
    """
    Collect a single Facebook page
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # The token manager is not used to obtain a token here (the caller already
    # did that); we only use it to report usage of the token we were given.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        page_inst = page.Page(fbid=job_scope.entity_id, api=fb_ctx.api)
        page_fetched = page_inst.api_get(fields=get_default_fields(Page))
        report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                     job_scope)
        token_manager.report_usage(token, 2)

        record_id_data = job_scope.to_dict()
        record_id_data.update(entity_type=Entity.Page,
                              entity_id=job_scope.entity_id,
                              report_variant=None)
        entity_data = page_fetched.export_all_data()
        entity_data = add_vendor_data(
            entity_data, id=generate_universal_id(**record_id_data))
        store = NormalStore(job_scope)
        store.store(entity_data)
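

# A hedged usage sketch (not from the source repo): the kwargs mirror JobScope
# usages elsewhere in this file, while the way a platform token is attached to
# the scope is an assumption, so the actual call stays commented out.
example_page_scope = JobScope(
    sweep_id='sweep-1',          # hypothetical sweep id
    entity_id='1234567890',      # hypothetical Facebook Page id
    entity_type=Entity.Page,
    report_variant=Entity.Page,  # the guard above requires exactly Entity.Page
)
# collect_page(example_page_scope, JobContext())  # token injection omitted

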
def iter_collect_entities_per_page_post(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page post
    """
    entity_type = job_scope.report_variant

    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    page_token = page_token_manager.get_best_token(job_scope.ad_account_id)
    with PlatformApiContext(page_token) as fb_ctx:
        root_fb_entity = fb_ctx.to_fb_model(job_scope.entity_id, Entity.PagePost)

    entities = iter_native_entities_per_page_post(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)
    del record_id_base_data['entity_id']

    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id
            entity_data['page_post_id'] = job_scope.entity_id

            # Store the individual datum; cold storage derives whatever else
            # it needs from the job context
            store(entity_data)

            yield entity_data
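

# The collector above is lazy: nothing is fetched or stored until the
# generator is consumed (ChunkDumpStore presumably flushes its final chunk
# when the `with` block exits). A minimal driving loop, for illustration:
def drain_page_post_collection(job_scope: JobScope) -> int:
    count = 0
    for _entity_data in iter_collect_entities_per_page_post(job_scope):
        count += 1  # each yield corresponds to one stored datum
    return count

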
def collect_pages_from_business(job_scope: JobScope,
                                _job_context: JobContext) -> int:
    """
    Collect all active Facebook pages
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # The token manager is not used to obtain a token here (the caller already
    # did that); we only use it to report usage of the token we were given.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        fb_req = FacebookRequest(node_id="me",
                                 method="GET",
                                 endpoint="/businesses",
                                 api=fb_ctx.api,
                                 api_type='EDGE',
                                 target_class=Business)
        businesses = fb_req.execute()

    report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                 job_scope)
    token_manager.report_usage(token)

    entity_type = Entity.Page

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    cnt = 0
    for biz in businesses:
        client_pages = list(
            biz.get_client_pages(fields=get_default_fields(Page)))
        owned_pages = list(
            biz.get_owned_pages(fields=get_default_fields(Page)))
        pages_list = client_pages + owned_pages

        for page_inst in pages_list:

            entity_data = page_inst.export_all_data()
            record_id_base_data.update(entity_id=entity_data.get('id'))
            entity_data = add_vendor_data(
                entity_data, id=generate_universal_id(**record_id_base_data))

            store = NormalStore(job_scope)
            store.store(entity_data)
            cnt += 1

    report_job_status_task.delay(ExternalPlatformJobStatus.Done, job_scope)
    return cnt
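

# Unlike collect_page above, this task walks the Business edges ("/businesses"
# on the user node), reports ExternalPlatformJobStatus.Done itself, and returns
# the number of pages stored, which the caller can use for sweep accounting.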


def collect_adaccount(job_scope: JobScope) -> Dict[str, Any]:
    """
    Collects ad account data for an AdAccount-specific JobScope definition.
    :param JobScope job_scope: The JobScope as we get it from the task itself
    """
    if job_scope.report_variant != Entity.AdAccount:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.AdAccount}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    assert (
        job_scope.ad_account_id == job_scope.entity_id
    ), 'This is an ad account entity job, account_id should be equal to entity_id'

    # Used to report token usage by this job
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        ad_account = fb_ctx.to_fb_model(job_scope.ad_account_id,
                                        Entity.AdAccount)

        fields = get_default_fields(ad_account.__class__)

        # Read just the fields we need, then export the object to a plain dict
        ad_account_with_selected_fields = ad_account.api_get(fields=fields)
        ad_account_data_dict = ad_account_with_selected_fields.export_all_data()

        token_manager.report_usage(token)

        job_scope_base = {
            # Duplicate the job_scope data to avoid mutating it
            **job_scope.to_dict(),
            'entity_type': Entity.AdAccount,
            'report_variant': None,
        }

        augmented_ad_account_data = add_vendor_data(
            # Augment the data returned from the remote API with our vendor data
            ad_account_data_dict,
            id=generate_universal_id(**job_scope_base),
        )
        feedback_entity_task.delay(ad_account_data_dict,
                                   job_scope.report_variant)
        store = NormalStore(job_scope)
        store.store(augmented_ad_account_data)

        # TODO: feedback account? this probably wouldn't make sense at the moment
        # because ad accounts are discovered from console and their lifecycle is controlled from there.

        return ad_account_data_dict
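

# Note the asymmetry above: the vendor-augmented record goes to cold storage,
# while the plain exported dict is fed back to the system and returned. A
# caller can thus read fields off the return value directly (illustrative;
# assumes 'currency' is among the default AdAccount fields):
def get_adaccount_currency(job_scope: JobScope):
    return collect_adaccount(job_scope).get('currency')

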
def iter_collect_entities_per_page(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, [Entity.PagePost, Entity.PageVideo], Entity.Page,
        'ad_account_id')

    entities = iter_native_entities_per_page(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, \
            ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        cnt = 0
        for entity in entities:
            entity_data = entity.export_all_data()

            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePost:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum; cold storage derives whatever else
            # it needs from the job context
            store(entity_data)

            # Signal the new entity to the rest of the system
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data
            cnt += 1

            if cnt % 1000 == 0:
                # The default paging size for entities per parent is typically
                # around 200, so every 1000 results costs roughly 5 paged
                # requests to FB
                token_manager.report_usage(token, 5)

    token_manager.report_usage(token)
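

# Note on the dual stores above: for page posts the same datum is written
# twice, the pre-augmentation copy to the RAW bucket (custom namespace
# NAMESPACE_RAW) and the _augment_page_post output to the regular bucket,
# so the original API shape is preserved alongside the enriched one.

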
def sync_expectations(job_scope: JobScope):
    assert (job_scope.report_type == ReportType.sync_expectations
            ), 'Only sync_expectations report type is processed by this task'

    if job_scope.ad_account_id:
        # This task is scoped to a single ad account; no need to iterate over all
        ad_account_ids_iter = [job_scope.ad_account_id]
    else:
        ad_account_ids_iter = expecations_store.iter_expectations_ad_accounts(
            sweep_id=job_scope.sweep_id)

    for ad_account_id in ad_account_ids_iter:

        ad_account_scoped_job_scope = JobScope(job_scope.to_dict(),
                                               ad_account_id=ad_account_id,
                                               entity_type=Entity.AdAccount,
                                               entity_id=ad_account_id)

        with ChunkDumpStore(ad_account_scoped_job_scope,
                            chunk_size=200) as store:

            job_ids_iter = expecations_store.iter_expectations_per_ad_account(
                ad_account_id, ad_account_scoped_job_scope.sweep_id)

            for job_id in job_ids_iter:
                job_id_parts = parse_id_parts(job_id)

                # The default namespace is the platform namespace, and we
                # communicate out only those jobs
                if job_id_parts.namespace == JobScope.namespace:
                    store({
                        'job_id': job_id,
                        # 'status': 'expected',
                        'account_id': job_id_parts.ad_account_id,
                        'entity_type': job_id_parts.entity_type,
                        'entity_id': job_id_parts.entity_id,
                        'report_type': job_id_parts.report_type,
                        'report_variant': job_id_parts.report_variant,
                        'range_start': _to_date_string_if_set(job_id_parts.range_start),
                        'range_end': _to_date_string_if_set(job_id_parts.range_end),
                        'platform_namespace': job_id_parts.namespace,
                    })
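

# Illustrative driver (assumes the runner built a sweep-level JobScope whose
# report_type is ReportType.sync_expectations): narrowing the scope to one ad
# account reuses the single-account branch above.
def sync_expectations_for_account(base_scope: JobScope, ad_account_id: str):
    scoped = JobScope(base_scope.to_dict(), ad_account_id=ad_account_id)
    sync_expectations(scoped)

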
def iter_collect_entities_per_page_graph(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page using the Graph API
    """
    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    page_token = page_token_manager.get_best_token(job_scope.ad_account_id)
    with PlatformApiContext(page_token) as fb_ctx:
        page_root_fb_entity = fb_ctx.to_fb_model(
            job_scope.ad_account_id, Entity.Page)

    entity_type = job_scope.report_variant
    # page size reduced to avoid error:
    #  "Please reduce the amount of data you're asking for, then retry your request"
    entities = iter_native_entities_per_page_graph(page_root_fb_entity,
                                                   entity_type,
                                                   page_size=30)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, \
            ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePostPromotable:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum; cold storage derives whatever else
            # it needs from the job context
            store(entity_data)

            # Signal the new entity to the rest of the system
            feedback_entity_task.delay(entity_data, entity_type)
            yield entity_data


def iter_collect_entities_per_adaccount(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for an ad account
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, Entity.AA_SCOPED, Entity.AdAccount, 'ad_account_id')

    entities = iter_native_entities_per_adaccount(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for cnt, entity in enumerate(entities):
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    # FIXME: file a bug against the facebook-business SDK:
                    # get_ad_videos returns AbstractCrudObject instances rather
                    # than AdVideo, so entity.Field.id is not accessible there
                    # (only a problem for ad videos)
                    entity_id=entity_data.get('id'),
                    **record_id_base_data,
                ),
            )

            # Store the individual datum; cold storage derives whatever else
            # it needs from the job context
            store(entity_data)

            # Signal the new entity to the rest of the system
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data

            if cnt % 1000 == 0:
                # The default paging size for entities per parent is typically
                # around 200, so every 1000 results costs roughly 5 paged
                # requests to FB. (enumerate starts at 0, so this also fires
                # on the very first entity; a slight over-report, not a bug)
                token_manager.report_usage(token, 5)

    # Final token-usage report for this task
    token_manager.report_usage(token)


    def __init__(self, job_scope: JobScope, bucket_type: str = ColdStoreBucketType.ORIGINAL_BUCKET):
        super().__init__(job_scope)

        normative_entity_type = job_scope.report_variant
        assert normative_entity_type in Entity.ALL

        self.job_scope_base_data = job_scope.to_dict()
        # since we are converting per-parent into per-child
        # job signature, report_variant cannot be set
        self.job_scope_base_data.update(
            entity_type=normative_entity_type,
            is_derivative=True,  # keeps the scope from being counted as a done task by the looper
            report_variant=None,
        )
        self.id_attribute_name = {
            Entity.AdAccount: AdsInsights.Field.account_id,
            Entity.Campaign: AdsInsights.Field.campaign_id,
            Entity.AdSet: AdsInsights.Field.adset_id,
            Entity.Ad: AdsInsights.Field.ad_id,
        }[normative_entity_type]
        self.bucket_type = bucket_type
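
        # Illustrative note: with report_variant == Entity.Ad, id_attribute_name
        # resolves to AdsInsights.Field.ad_id, i.e. the column that identifies
        # the child entity when per-parent insights rows are split per child.
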
    def test_persister_saves_job_scope_auxiliary_data_to_data_flower(self):
        # Some context data does not fit in the job ID itself;
        # the Persister should store it in the Data Flower.

        sweep_id = random.gen_string_id()
        entity_id = random.gen_string_id()
        ad_account_id = random.gen_string_id()

        job_id = generate_id(ad_account_id=ad_account_id,
                             report_type=ReportType.lifetime,
                             report_variant=Entity.Campaign)

        prioritized_iter = [
            PrioritizationClaim(
                entity_id,
                Entity.Campaign,
                ReportType.lifetime,
                JobSignature(job_id),
                100,
                ad_account_id=ad_account_id,
                timezone='Europe/London',
                # TODO: contemplate moving auxiliary data formation to
                #       place where JobSignatures are generated and use that
                #       data for Data Flower (as it was originally intended
                #       but not implemented because saving each job's data
                #       individually to Data Flower was too slow)
            )
        ]

        persisted = persister.iter_persist_prioritized(sweep_id,
                                                       prioritized_iter)
        cnt = 0
        for item in persisted:
            cnt += 1
            # just need to spin the generator
            # so it does all the saving it needs to do per item
        assert cnt == 1

        # Now, finally, the testing:

        jobs_queued_actual = []
        with SortedJobsQueue(sweep_id).JobsReader() as jobs_iter:
            for job_id, job_scope_data, score in jobs_iter:
                jobs_queued_actual.append((job_id, job_scope_data, score))

        jobs_queued_should_be = [(
            job_id,
            # The contents of this dict are what we are testing here: the
            # value comes from Persister code, manually peeled off the *Claim
            # and injected into the Data Flower
            dict(ad_account_timezone_name='Europe/London'),
            100,
        )]

        assert jobs_queued_actual == jobs_queued_should_be

        # And another way of looking at it:
        # looper's iter_tasks preassembles JobScope and should apply the aux data to it.

        job_scope = None
        cnt = 0
        for celery_task, job_scope, job_context, score in TaskProducer(
                sweep_id).iter_tasks():
            cnt += 1
            # this just needs to spin once
        assert cnt == 1

        job_scope_should_be = JobScope(
            sweep_id=sweep_id,
            namespace='fb',
            ad_account_id=ad_account_id,
            report_type=ReportType.lifetime,
            report_variant=Entity.Campaign,
            # \/ This is what we are testing \/
            # comes from Persister code
            # manually peeled off *Claim and injected into Data Flower
            ad_account_timezone_name='Europe/London',
            score=100,
        )

        assert job_scope.to_dict() == job_scope_should_be.to_dict()