Esempio n. 1
0
def sync_stream(config, state, stream, sdk_client):
    """Route a stream to the sync routine that matches its tap_stream_id.

    A fresh auth token is requested first and handed to the chosen routine.

    Raises:
        Exception: if the stream's tap_stream_id belongs to none of the
            known report or endpoint families.
    """
    token = refresh_auth_token(sdk_client, None)
    stream_id = stream.tap_stream_id

    # Generic endpoints and report endpoints have very different
    # performance characteristics and constraints, so each family keeps
    # its own sync routine. The order below is the lookup precedence.
    handlers = (
        (SELLER_STATS_REPORT_TYPES, sync_seller_v2_stats_report),
        (STATISTICS_REPORT_TYPES, sync_statistics_report),
        (GENERIC_ENDPOINT_MAPPINGS, sync_generic_endpoint),
    )
    for known_ids, handler in handlers:
        if stream_id in known_ids:
            handler(config, state, stream, sdk_client, token)
            return

    raise Exception("Unrecognized tap_stream_id {}".format(stream_id))
Esempio n. 2
0
def sync_generic_endpoint(config, state, stream, sdk_client, token):
    """Sync a stream which is backed by a generic Criteo endpoint.

    Writes the stream's schema, fetches all rows from the endpoint mapped to
    ``stream.tap_stream_id`` and emits one Singer record per row, stamped
    with ``_sdc_report_datetime``.

    Args:
        config: Tap configuration mapping. ``advertiser_ids`` is read from
            it and, for the Audiences stream, split on commas.
        state: Singer state. Not used here; kept so every sync routine
            shares the same signature.
        stream: Catalog entry of the stream to sync.
        sdk_client: Client object passed through to the endpoint helpers.
        token: Current auth token. It is refreshed before every Audiences
            request.
    """
    stream = add_synthetic_keys_to_stream_schema(stream)
    stream = add_synthetic_keys_to_stream_metadata(stream)
    mdata = metadata.to_map(stream.metadata)
    primary_keys = metadata.get(mdata, (), "table-key-properties") or []
    LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys))
    # Build the schema dict once; it is reused for every row below.
    schema_dict = stream.schema.to_dict()
    singer.write_schema(stream.stream, schema_dict, primary_keys)

    advertiser_ids = config.get("advertiser_ids", None)
    if stream.tap_stream_id == "Audiences":
        # Audiences are fetched per advertiser, so collect the rows of
        # every advertiser instead of keeping only the last response.
        result = []
        if not advertiser_ids:
            # Nothing to query: warn and emit no records rather than
            # crashing on a missing/empty setting.
            LOGGER.warning(
                "%s stream needs at least one advertiser_id defined in config",
                stream.stream)
        else:
            for advertiser_id in advertiser_ids.split(","):
                token = refresh_auth_token(sdk_client, token)
                with metrics.http_request_timer(stream.tap_stream_id):
                    result.extend(
                        get_audiences_endpoint(sdk_client,
                                               advertiser_id,
                                               token=token))
    else:
        endpoint = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]
        call_kwargs = {"token": token}
        # These streams are called without an advertiser filter; every
        # other generic endpoint receives the configured advertiser_ids.
        if stream.tap_stream_id not in (
                "Portfolio",
                "AdvertiserInfo",
                "Sellers",
                "SellerBudgets",
                "SellerCampaigns",
        ):
            call_kwargs["advertiser_ids"] = advertiser_ids
        result = call_generic_endpoint(stream,
                                       sdk_client,
                                       endpoint["module"],
                                       endpoint["method"],
                                       **call_kwargs)

    result = convert_keys_snake_to_camel([_.to_dict() for _ in result])

    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()

        with Transformer() as bumble_bee:
            for row in result:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row = bumble_bee.transform(row, schema_dict)

                singer.write_record(stream.stream,
                                    row,
                                    time_extracted=time_extracted)
                counter.increment()

    LOGGER.info(
        "Done syncing %s records for the %s report for advertiser_ids %s",
        counter.value,
        stream.stream,
        advertiser_ids,
    )
Esempio n. 3
0
def sync_statistics_report(config, state, stream, sdk_client, token):
    """Sync a stream served by the Criteo Statistics endpoint, one day at a time.

    The schema is written with ``Day`` as the bookmark property. Syncing
    then walks from the start date up to ``get_end_date(config)``; after
    each day the next date is stored under ``last_attribution_window_date``
    so an interrupted run can resume. That bookmark is cleared once the
    loop finishes.

    Raises:
        ValueError: if more than 3 dimensions or no metric are selected.
    """
    advertiser_ids = config.get("advertiser_ids", "")
    mdata = metadata.to_map(stream.metadata)

    stream = add_synthetic_keys_to_stream_schema(stream)

    field_list = get_field_list(stream)

    def fields_tagged(behaviour):
        """Return the listed fields whose behaviour metadata equals *behaviour*."""
        tagged = []
        for name in field_list:
            tag = metadata.get(mdata, ("properties", name),
                               "tap-criteo.behaviour")
            if tag == behaviour:
                tagged.append(name)
        return tagged

    primary_keys = []
    LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys))
    singer.write_schema(stream.stream,
                        stream.schema.to_dict(),
                        primary_keys,
                        bookmark_properties=["Day"])

    # Resume an interrupted attribution window if one was recorded;
    # otherwise derive the start from the stream bookmark and the
    # conversion window.
    current_day = get_attribution_window_bookmark(state, advertiser_ids,
                                                  stream.stream)
    if current_day is None:
        stream_start = get_start_for_stream(config, state, advertiser_ids,
                                            stream.stream)
        current_day = apply_conversion_window(config, stream_start)

    # Criteo's documentation describes the StatisticsApi as supporting
    # between one and three dimensions and at least one metric. Only the
    # upper bound on dimensions is enforced here.
    report_dimensions = fields_tagged("dimension")
    LOGGER.info("Selected dimensions: %s", report_dimensions)
    if len(report_dimensions) > 3:
        raise ValueError(
            "%s stream only supports up to 3 selected dimensions" %
            stream.stream)

    report_metrics = fields_tagged("metric")
    LOGGER.info("Selected metrics: %s", report_metrics)
    if len(report_metrics) < 1:
        raise ValueError("%s stream must have at least 1 selected metric" %
                         stream.stream)

    while current_day <= get_end_date(config):
        token = refresh_auth_token(sdk_client, token)
        sync_statistics_for_day(config, state, stream, sdk_client, token,
                                current_day, report_metrics,
                                report_dimensions)
        current_day = current_day + relativedelta(days=1)
        # Record the next day to sync so a restart picks up from there.
        bookmarks.write_bookmark(
            state,
            state_key_name(advertiser_ids, stream.stream),
            "last_attribution_window_date",
            utils.strftime(current_day),
        )
        singer.write_state(state)

    # The whole window completed, so drop the resume marker.
    bookmarks.clear_bookmark(
        state,
        state_key_name(advertiser_ids, stream.stream),
        "last_attribution_window_date",
    )
    singer.write_state(state)
    LOGGER.info("Done syncing the %s report for advertiser_ids %s",
                stream.stream, advertiser_ids)