    def _iter_collect_organic_insights(
            cls, data_iter: Generator[Dict[str, Any], None, None],
            job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
        raw_store = batch_store.NormalStore(
            job_scope,
            bucket_type=ColdStoreBucketType.RAW_BUCKET,
            custom_namespace=NAMESPACE_RAW)
        orig_store = batch_store.NormalStore(
            job_scope, bucket_type=ColdStoreBucketType.ORIGINAL_BUCKET)
        common_vendor_data = {
            'ad_account_id': job_scope.ad_account_id,
            'entity_type': job_scope.report_variant,
            'report_type': job_scope.report_type,
            ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
        }

        data = list(data_iter)
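        # Spooling the whole iterator into memory (above) is assumed safe
        # here: an organic-insights payload for a single entity is a short
        # list of named metrics, not an unbounded stream.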
        raw_record = {
            'payload': data,
            'page_id': job_scope.ad_account_id,
            ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
        }
        vendor_data_raw = report_type_vendor_data_raw_extractor_map[
            job_scope.report_type](raw_record, **common_vendor_data)

        raw_record = add_vendor_data(raw_record, **vendor_data_raw)
        raw_store.store(raw_record)

        if data:
            # then, transpose it to correct form
            final_record = {
                'page_id': job_scope.ad_account_id,
                ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
            }
            for param_datum in data:
                final_record[param_datum['name']] = param_datum['values'][0]['value']
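            # Illustrative shape (hypothetical values): a metric datum like
            #     {'name': 'post_impressions', 'values': [{'value': 42}]}
            # is transposed into a flat column on the record:
            #     final_record['post_impressions'] = 42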

            vendor_data = report_type_vendor_data_extractor_map[
                job_scope.report_type](raw_record, **common_vendor_data)
            final_record = add_vendor_data(final_record, **vendor_data)
            orig_store.store(final_record)

            yield final_record
def iter_collect_entities_per_page_post(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page post
    """
    entity_type = job_scope.report_variant

    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    with PlatformApiContext(
            page_token_manager.get_best_token(
                job_scope.ad_account_id)) as fb_ctx:
        root_fb_entity = fb_ctx.to_fb_model(job_scope.entity_id,
                                            Entity.PagePost)

    entities = iter_native_entities_per_page_post(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)
    del record_id_base_data['entity_id']
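    # 'entity_id' must be dropped here: generate_universal_id below is called
    # with an explicit entity_id= kwarg per collected entity, and leaving the
    # key in record_id_base_data would raise a duplicate-keyword TypeError.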

    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id
            entity_data['page_post_id'] = job_scope.entity_id

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            yield entity_data
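# Hypothetical driver sketch: the generator above both persists and yields
# each datum, so a caller that only wants the side effects can simply
# drain it:
#
#     for datum in iter_collect_entities_per_page_post(job_scope):
#         pass  # storage and vendor-data augmentation happen inside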
def collect_page(job_scope: JobScope, _job_context: JobContext):
    """
    Collect a single facebook page
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report variant {job_scope.report_variant} specified is not {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        page_inst = page.Page(fbid=job_scope.entity_id, api=fb_ctx.api)
        page_fetched = page_inst.api_get(fields=get_default_fields(Page))
        report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                     job_scope)
        token_manager.report_usage(token, 2)

        record_id_data = job_scope.to_dict()
        record_id_data.update(entity_type=Entity.Page,
                              entity_id=job_scope.entity_id,
                              report_variant=None)
        entity_data = page_fetched.export_all_data()
        entity_data = add_vendor_data(
            entity_data, id=generate_universal_id(**record_id_data))
        store = NormalStore(job_scope)
        store.store(entity_data)
def collect_pages_from_business(job_scope: JobScope,
                                _job_context: JobContext) -> int:
    """
    Collect all facebook pages that are active
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report variant {job_scope.report_variant} specified is not {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        fb_req = FacebookRequest(node_id="me",
                                 method="GET",
                                 endpoint="/businesses",
                                 api=fb_ctx.api,
                                 api_type='EDGE',
                                 target_class=Business)
        businesses = fb_req.execute()
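        # execute() on an EDGE-type request is expected to return an
        # iterable, transparently-paging cursor of Business nodes, which
        # is why `businesses` can be looped over directly below.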

    report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                 job_scope)
    token_manager.report_usage(token)

    entity_type = Entity.Page

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    cnt = 0
    for biz in businesses:
        client_pages = list(
            biz.get_client_pages(fields=get_default_fields(Page)))
        owned_pages = list(
            biz.get_owned_pages(fields=get_default_fields(Page)))
        pages_list = client_pages + owned_pages

        for page_inst in pages_list:

            entity_data = page_inst.export_all_data()
            record_id_base_data.update(entity_id=entity_data.get('id'))
            entity_data = add_vendor_data(
                entity_data, id=generate_universal_id(**record_id_base_data))

            store = NormalStore(job_scope)
            store.store(entity_data)
            cnt += 1

    report_job_status_task.delay(ExternalPlatformJobStatus.Done, job_scope)
    return cnt
def collect_adaccount(job_scope: JobScope) -> Dict[str, Any]:
    """
    Collects ad account data for an AdAccount-specific JobScope definition.
    :param JobScope job_scope: The JobScope as we get it from the task itself
    """
    if job_scope.report_variant != Entity.AdAccount:
        raise ValueError(
            f"Report variant {job_scope.report_variant} specified is not {Entity.AdAccount}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    assert (
        job_scope.ad_account_id == job_scope.entity_id
    ), 'This is an ad account entity job; ad_account_id should equal entity_id'

    # Used to report token usage by this job
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        ad_account = fb_ctx.to_fb_model(job_scope.ad_account_id,
                                        Entity.AdAccount)

        fields = get_default_fields(ad_account.__class__)

        ad_account_with_selected_fields = ad_account.api_get(
            fields=fields)  # Read just the fields we need
        ad_account_data_dict = ad_account_with_selected_fields.export_all_data(
        )  # Export the object to a dict

        token_manager.report_usage(token)

        job_scope_base = {
            # Duplicate the job_scope data to avoid mutating it
            **job_scope.to_dict(),
            'entity_type': Entity.AdAccount,
            'report_variant': None,
        }

        augmented_ad_account_data = add_vendor_data(
            # Augment the data returned from the remote API with our vendor data
            ad_account_data_dict,
            id=generate_universal_id(**job_scope_base),
        )
        feedback_entity_task.delay(ad_account_data_dict,
                                   job_scope.report_variant)
        store = NormalStore(job_scope)
        store.store(augmented_ad_account_data)

        # TODO: feedback account? this probably wouldn't make sense at the moment
        # because ad accounts are discovered from console and their lifecycle is controlled from there.

        return ad_account_data_dict
    def test_add_new_vendor_block(self):

        data = {'a': 1}
        data_should_be = {'a': 1, '__oprm': {'id': 5}}

        data_actual = add_vendor_data(data, id=5)

        assert data_actual is data, 'must return the same instance, not a repackaged copy'
        assert data_actual == data_should_be
    def test_update_existing_vendor_block(self):

        data = {'a': 1, '__oprm': {'id': 5}}
        data_should_be = {'a': 1, '__oprm': {'id': 5, 'extra_attr': 7}}

        data_actual = add_vendor_data(data, extra_attr=7)

        assert data_actual is data, 'must return the same instance, not a repackaged copy'
        assert data_actual == data_should_be
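# A minimal sketch consistent with the two tests above (the real
# add_vendor_data lives elsewhere in the codebase; this name and body are
# illustrative only): fold the kwargs into the dict's '__oprm' vendor block
# in place and return the very same instance.
def _add_vendor_data_sketch(data, **vendor_attrs):
    data.setdefault('__oprm', {}).update(vendor_attrs)
    return data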
def iter_collect_entities_per_page(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, [Entity.PagePost, Entity.PageVideo], Entity.Page,
        'ad_account_id')

    entities = iter_native_entities_per_page(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(
            job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        cnt = 0
        for entity in entities:
            entity_data = entity.export_all_data()

            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePost:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data
            cnt += 1

            if cnt % 1000 == 0:
                # default paging size for entities per parent
                # is typically around 200. So, each 200 results
                # means about 5 hits to FB
                token_manager.report_usage(token, 5)

    token_manager.report_usage(token)
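# A minimal sketch of the chunked-writer protocol the loop above relies on
# (assumed internals; the real ChunkDumpStore lives in batch_store, and this
# class name is illustrative only): a context manager that is *called* once
# per datum, buffers up to chunk_size records, and flushes the remainder on
# exit.
class _ChunkDumpStoreSketch:
    def __init__(self, job_scope, chunk_size=50, **bucket_kwargs):
        self.job_scope = job_scope
        self.chunk_size = chunk_size
        self.bucket_kwargs = bucket_kwargs  # e.g. bucket_type, custom_namespace
        self.buffer = []

    def __enter__(self):
        return self

    def __call__(self, datum):
        self.buffer.append(datum)
        if len(self.buffer) >= self.chunk_size:
            self._flush()

    def _flush(self):
        # The real store would write self.buffer to cold storage under an
        # ID derived from job_scope plus a monotonically-increasing chunk
        # number; here we only model the buffering behavior.
        self.buffer.clear()

    def __exit__(self, *exc_info):
        if self.buffer:
            self._flush()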
def iter_collect_entities_per_page_graph(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page using graph API
    """
    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    with PlatformApiContext(
            page_token_manager.get_best_token(
                job_scope.ad_account_id)) as fb_ctx:
        page_root_fb_entity = fb_ctx.to_fb_model(job_scope.ad_account_id,
                                                 Entity.Page)

    entity_type = job_scope.report_variant
    # page size reduced to avoid error:
    #  "Please reduce the amount of data you're asking for, then retry your request"
    entities = iter_native_entities_per_page_graph(page_root_fb_entity,
                                                   entity_type,
                                                   page_size=30)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    with ChunkDumpStore(
            job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePostPromotable:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)
            yield entity_data
def iter_collect_entities_per_adaccount(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for an ad account
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, Entity.AA_SCOPED, Entity.AdAccount, 'ad_account_id')

    entities = iter_native_entities_per_adaccount(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        # start=1 so the periodic usage report below fires every 1000
        # entities rather than immediately on the first one
        for cnt, entity in enumerate(entities, start=1):
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    # FIXME: add a bug to facebook ads (get_ad_videos doesnt return ad videos but AbstractCrudObject)
                    # FIXME so it is unable to access entity.Field.id then (only a problem for ad videos)
                    entity_id=entity_data.get('id'),
                    **record_id_base_data,
                ),
            )

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data

            if cnt % 1000 == 0:
                # default paging size for entities per parent
                # is typically around 200. So, each 200 results
                # means about 5 hits to FB
                token_manager.report_usage(token, 5)

    # Report on the effective task status
    token_manager.report_usage(token)
    def __init__(self, job_scope: JobScope, report_entity_api_kind: str):
        if job_scope.report_type not in ReportType.ALL_METRICS:
            raise ValueError(
                f"Report type {job_scope.report_type} specified is not one of "
                f"the supported values: {ReportType.ALL_METRICS}")
        # cool. we are in the right place...

        self.report_params = {
            'fields': DEFAULT_REPORT_FIELDS,
            'action_attribution_windows': [
                # https://developers.facebook.com/docs/marketing-api/reference/adgroup/insights/
                # https://developers.facebook.com/docs/marketing-api/insights#sample
                # 'actions' and 'action_values' can contain values per different measurement window
                # In case of 'actions', default 'value' is always 1d_view PLUS 28d_click and cannot be removed.
                # In case of 'action_values', default 'value' is some weighted sum of
                #  1d_view AND 28d_click $ values, that may be smaller than raw 1d_view PLUS 28d_click $ values.
                # Many customers interpret their conversions / actions in different attribution windows.
                # The more windows we ask the data for, the less reliably it returns reports.
                # Be super conservative about asking for more / all.
                AdsInsights.ActionAttributionWindows.value_1d_view,
                AdsInsights.ActionAttributionWindows.value_7d_view,
                AdsInsights.ActionAttributionWindows.value_28d_view,
                AdsInsights.ActionAttributionWindows.value_1d_click,
                AdsInsights.ActionAttributionWindows.value_7d_click,
                AdsInsights.ActionAttributionWindows.value_28d_click,
            ],
        }
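        # Illustrative shape (hypothetical values) of one 'actions' entry when
        # several attribution windows are requested: per-window counts arrive
        # as extra keys next to the default 'value', e.g.
        #     {'action_type': 'link_click', 'value': '12',
        #      '1d_view': '2', '7d_click': '4', '28d_click': '10'}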

        # Next is (a) vs (b) - abstraction level determination
        is_per_parent_report = not job_scope.entity_id and job_scope.report_variant in Entity.ALL

        if is_per_parent_report:
            entity_id = job_scope.ad_account_id
            entity_type = Entity.AdAccount
            entity_type_reporting = job_scope.report_variant
            if report_entity_api_kind == ReportEntityApiKind.Ad:
                self.report_params.update(
                    level=ENUM_LEVEL_MAP[job_scope.report_variant])
        else:
            # direct, per-entity report
            entity_id = job_scope.entity_id
            entity_type = job_scope.entity_type
            entity_type_reporting = job_scope.report_variant
            if report_entity_api_kind == ReportEntityApiKind.Ad:
                self.report_params.update(
                    level=ENUM_LEVEL_MAP[entity_type_reporting])

        # Now, (c), (d), (e), (f), (g) choices
        # we already checked above that this is one of metrics report types
        # So we know it will be either lifetime or day-with-breakdown type
        # TODO: add fields listings appropriate for each type
        if job_scope.report_type == ReportType.lifetime:
            self.report_params.update(
                date_preset=AdsInsights.DatePreset.lifetime)
        elif job_scope.report_type in REPORT_TYPE_FB_BREAKDOWN_ENUM:  # some day-with-breakdown type
            self.report_params.update(
                time_increment=1,  # group by calendar day (in AA tz)
                time_range={
                    'since': _convert_and_validate_date_format(job_scope.range_start),
                    # No value for job_scope.range_end means 1-day report for range_start day
                    'until': _convert_and_validate_date_format(job_scope.range_end or job_scope.range_start),
                },
                breakdowns=REPORT_TYPE_FB_BREAKDOWN_ENUM[
                    job_scope.report_type],
            )
        else:
            raise ValueError(
                f"Report type {job_scope.report_type} does not have a mapped Platform-side breakdown value."
            )

        # Indicates that a datum returned in a per-parent report is by itself
        # naturally mapped to some single normative job, meaning each element
        # can be stored separately, but only under a normative ID computed on
        # the fly from the datum. This must be accompanied by a transform fn
        # that derives a normative ID from the data.

        # Special case: when the report type is per-specific-single-entity-ID
        # AND one of the per-day-with-breakdown types, we bundle the
        # per-Entity-ID-per-day data (e.g. 24 hourly records) before saving it.
        # This results in a single write to the cold store under a
        # single normative ID.
        is_whole_report_bundle_write = (
            # must be one of those per-day reports
            job_scope.report_type in ReportType.ALL_DAY_BREAKDOWNS and
            # except for DMA-based data, as these can be very long,
            # - 10s of thousands of records per day
            job_scope.report_type not in [
                ReportType.day_dma, ReportType.day_region,
                ReportType.day_country
            ] and
            # and the report is per single entity ID
            job_scope.entity_id and not job_scope.report_variant and
            # and report is for a single calendar day
            # ReportType.ALL_DAY_BREAKDOWNS means there must be a non-Null
            # value in time_range, but we check anyway
            self.report_params['time_range']['since']
            and self.report_params['time_range']['since']
            == self.report_params['time_range']['until'])
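        # For example (hypothetical scope): a day_hour report for one specific
        # ad (entity_id set, report_variant unset) with range_start equal to
        # range_end satisfies every condition above and is written as one
        # bundle; the same report broken down by DMA, or spanning several
        # days, is not.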

        # A more complex variant of whole_report_bundle_write where, while we
        # canNOT spool the entire report into memory to write it as one
        # bundle, we cannot really write each individual result out either,
        # as there will be a shit-load of them. We have to write in some sort
        # of batching mode, but cannot cleanly group the bundles into
        # per-normative-ID bundles; instead we will write under the effective
        # ID, with a suffix indicating the monotonically-increasing chunk
        # number.

        # Disabled but kept for reference, to compare against the shorter
        # version immediately below. These represent a good range of choices
        # for cold store handlers. When / if there is value in it, steal from
        # this commented-out code.
        # if is_naturally_normative_child:
        #     self.datum_handler = batch_store.NaturallyNormativeChildStore(job_scope)
        # elif is_whole_report_bundle_write:
        #     self.datum_handler = batch_store.MemorySpoolStore(job_scope)
        # elif is_chunk_write:
        #     self.datum_handler = batch_store.ChunkDumpStore(job_scope)
        # else:
        #     self.datum_handler = batch_store.NormalStore(job_scope)

        # let's be more aggressive about doing bundled writes to cold store
        # and (temporarily) get away from "normative" and single-datum writes
        # There are two ways we can get closer to bundled writes:
        #  - spool entire report in memory and flush out at the end, when we know we can tolerate that
        #  - spool large chunks of report in memory and flush them periodically if we fear large sizes in report.
        if is_whole_report_bundle_write:
            self.datum_handler = batch_store.MemorySpoolStore(job_scope)
        else:
            self.datum_handler = batch_store.ChunkDumpStore(job_scope,
                                                            chunk_size=200)

        with PlatformApiContext(job_scope.token) as fb_ctx:
            self.report_root_fb_entity = fb_ctx.to_fb_model(
                entity_id, entity_type)

        # here we configure code that will augment each datum with a record ID
        vendor_data_extractor = report_type_vendor_data_extractor_map[
            job_scope.report_type]
        if job_scope.report_type == ReportType.day_hour:
            # the hour report type's ID extractor needs an extra leading arg: the timezone
            vendor_data_extractor = functools.partial(
                vendor_data_extractor, job_scope.ad_account_timezone_name)

        aux_data = {
            'ad_account_id': job_scope.ad_account_id,
            'entity_type': entity_type_reporting,
            'report_type': job_scope.report_type,
        }

        self.augment_with_vendor_data = lambda data: add_vendor_data(
            data, **vendor_data_extractor(data, **aux_data))
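        # Illustrative flow (hypothetical values): for each insights datum,
        # vendor_data_extractor(datum, **aux_data) derives the normative
        # record attributes (e.g. {'id': <universal record id>}), and
        # add_vendor_data folds them into the datum's '__oprm' vendor block:
        #
        #     datum = {'date_start': '2019-05-01', 'impressions': 10}
        #     datum = self.augment_with_vendor_data(datum)
        #     # -> {'date_start': '2019-05-01', 'impressions': 10,
        #     #     '__oprm': {'id': '...'}}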