def read(self):
    if self.report_type == "ANALYTICS":
        entity_ids = self.get_active_entity_ids()
        total_jobs = (len(entity_ids) // MAX_ENTITY_IDS_PER_JOB) + 1
        logger.info(f"Processing a total of {total_jobs} jobs")
        data = []
        # Queue jobs in batches so no more than MAX_CONCURRENT_JOBS run at once
        for chunk_entity_ids in split_list(
            entity_ids, MAX_ENTITY_IDS_PER_JOB * MAX_CONCURRENT_JOBS
        ):
            job_ids = self.get_job_ids(chunk_entity_ids)
            data += self.get_analytics_report(job_ids)
    elif self.report_type == "REACH":
        data = self.get_reach_report()
    elif self.report_type == "ENTITY":
        if self.entity == "CARD":
            data = self.get_cards_report()
        else:
            data = self.get_campaign_management_report()

    def result_generator():
        for record in data:
            yield self.add_request_or_period_dates(record)

    yield JSONStream("results_" + self.account.id, result_generator())
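# The chunking above relies on a split_list helper that is not shown in this
# excerpt. A minimal sketch, assuming it simply yields fixed-size slices of a
# list (the actual implementation may differ, e.g. twitter_ads.utils.split_list):
def split_list(items, chunk_size):
    """Yield successive chunks of at most `chunk_size` items from `items`."""
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]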
def post_queued_async_jobs(
    client,
    account_id,
    report_name,
    report_entity,
    entity_ids,
    report_granularity,
    report_segment,
    metric_groups,
    placement,
    start_time,
    end_time,
    country_id,
    platform_id,
):
    queued_job_ids = []

    # CHUNK ENTITY_IDS LOOP
    chunk = 0  # chunk number
    # Make chunks of 20 of entity_ids
    for chunk_ids in split_list(entity_ids, 20):
        # POST async_queued_job for report entity chunk_ids
        # Reference: https://developer.twitter.com/en/docs/ads/analytics/api-reference/asynchronous#post-stats-jobs-accounts-account-id
        LOGGER.info("Report: {} - POST ASYNC queued_job, chunk#: {}".format(
            report_name, chunk))
        queued_job_path = "stats/jobs/accounts/{account_id}".replace(
            "{account_id}", account_id)
        queued_job_params = {
            # Required params
            "entity": report_entity,
            "entity_ids": ",".join(map(str, chunk_ids)),
            "metric_groups": ",".join(map(str, metric_groups)),
            "placement": placement,
            "granularity": report_granularity,
            "start_time": start_time,
            "end_time": end_time,
            # Optional params
            "segmentation_type": report_segment,
            "country": country_id,
            "platform": platform_id,
        }
        LOGGER.info("Report: {} - queued_job POST URL: {}/{}/{}".format(
            report_name, ADS_API_URL, API_VERSION, queued_job_path))
        LOGGER.info("Report: {} - queued_job params: {}".format(
            report_name, queued_job_params))

        # POST queued_job: asynchronous job
        queued_job = post_resource("queued_job", client, queued_job_path,
                                   queued_job_params)
        queued_job_data = queued_job.get("data")
        queued_job_id = queued_job_data.get("id_str")
        queued_job_ids.append(queued_job_id)
        LOGGER.info("queued_job_ids = {}".format(queued_job_ids))  # COMMENT OUT
        chunk = chunk + 1  # advance chunk counter used in the log messages
    # End: for chunk_ids in entity_ids

    return queued_job_ids
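# Illustrative call of post_queued_async_jobs. All values below are placeholders,
# not taken from the original source; `client` is assumed to be the authenticated
# Twitter Ads API client that post_resource expects:
# queued_job_ids = post_queued_async_jobs(
#     client=client,
#     account_id="abc123",
#     report_name="line_items_report",
#     report_entity="LINE_ITEM",
#     entity_ids=line_item_ids,
#     report_granularity="DAY",
#     report_segment=None,
#     metric_groups=["ENGAGEMENT", "BILLING"],
#     placement="ALL_ON_TWITTER",
#     start_time="2021-01-01T00:00:00Z",
#     end_time="2021-01-08T00:00:00Z",
#     country_id=None,
#     platform_id=None,
# )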
def get_job_ids(self, entity_ids):
    """
    Step 2 of 'ANALYTICS' report generation process:
    Create asynchronous analytics jobs and return their ids for progress tracking
    Documentation: https://developer.twitter.com/en/docs/ads/analytics/api-reference/asynchronous
    """
    return [
        ENTITY_OBJECTS[self.entity].queue_async_stats_job(
            self.account,
            chunk_entity_ids,
            self.metric_groups,
            granularity=self.granularity,
            placement=self.placement,
            start_time=self.start_date,
            end_time=self.end_date,
            segmentation_type=self.segmentation_type,
            platform=self.platform,
            country=self.country,
        ).id
        for chunk_entity_ids in split_list(entity_ids, MAX_ENTITY_IDS_PER_JOB)
    ]
def get_reach_report(self):
    """
    Get 'REACH' report through the 'Reach and Average Frequency' endpoint of Twitter Ads API.
    Documentation: https://developer.twitter.com/en/docs/ads/analytics/api-reference/reach
    """
    resource = f"/{API_VERSION}/stats/accounts/{self.account.id}/reach/{self.entity.lower()}s"
    entity_ids = self.get_active_entity_ids()
    for chunk_entity_ids in split_list(entity_ids, MAX_ENTITY_IDS_PER_JOB):
        try:
            params = {
                "account_id": self.account.id,
                # Request only the ids in the current chunk (the endpoint caps ids per request)
                f"{self.entity.lower()}_ids": ",".join(chunk_entity_ids),
                "start_time": self.start_date.strftime(API_DATEFORMAT),
                "end_time": self.end_date.strftime(API_DATEFORMAT),
            }
            request = Request(self.client, "get", resource, params=params)
            yield from Cursor(None, request)
        except Exception:
            ex_type, ex, tb = sys.exc_info()
            logger.warning(
                f"Failed to ingest reach report chunk with error: {ex}. "
                f"Traceback: {traceback.format_tb(tb)}"
            )
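# The module-level constants referenced in read(), get_job_ids() and
# get_reach_report() are not shown in this excerpt. A hedged sketch of plausible
# values: the 20-id cap matches the "chunks of 20" comments elsewhere in this
# file; the other two values are assumptions, not taken from the source:
MAX_ENTITY_IDS_PER_JOB = 20            # async analytics jobs accept at most 20 entity ids
MAX_CONCURRENT_JOBS = 100              # assumed cap on concurrently queued jobs
API_DATEFORMAT = "%Y-%m-%dT%H:%M:%SZ"  # assumed ISO 8601 format expected by the Ads API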
# see: https://dev.twitter.com/ads/analytics/metrics-and-segmentation
metric_groups = [METRIC_GROUP.BILLING]

# fetching stats on the instance
line_items[0].stats(metric_groups)

# fetching stats for multiple line items
ids = list(map(lambda x: x.id, line_items))

if not ids:
    print('Error: A minimum of 1 item must be provided for entity_ids')
    sys.exit()

sync_data = []
# Sync/Async endpoint can handle max 20 entity IDs per request
# so split the ids list into multiple requests
for chunk_ids in split_list(ids, 20):
    sync_data.append(LineItem.all_stats(account, chunk_ids, metric_groups))
print(sync_data)

# create async stats jobs and get job ids
queued_job_ids = []
for chunk_ids in split_list(ids, 20):
    queued_job_ids.append(
        LineItem.queue_async_stats_job(account, chunk_ids, metric_groups).id)
print(queued_job_ids)

# let the job complete
seconds = 30
time.sleep(seconds)
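# The excerpt above stops after the sleep. A hedged continuation following the
# same SDK pattern, assuming async_stats_job_result / async_stats_job_data are
# available on LineItem in the same way as queue_async_stats_job:
async_data = []
for result in LineItem.async_stats_job_result(account, job_ids=queued_job_ids):
    # each finished job result exposes a URL pointing at its gzipped JSON report
    async_data.append(LineItem.async_stats_job_data(account, url=result.url))
print(async_data)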
def sync_endpoint(
    client,
    state,
    start_date,
    stream_name,
    endpoint_config,
    tap_config,
    account_id=None,
    parent_ids=None,
    child_streams=None,
):
    # endpoint_config variables
    path = endpoint_config.get("path")
    LOGGER.info("Stream: {} - endpoint_config: {}".format(
        stream_name, endpoint_config))
    id_fields = endpoint_config.get("key_properties", [])
    parent_id_field = next(iter(id_fields), None)  # first ID field
    params = endpoint_config.get("params", {})
    bookmark_field = next(iter(endpoint_config.get("replication_keys", [])), None)
    datetime_format = endpoint_config.get("datetime_format")
    sub_types = endpoint_config.get("sub_types", ["none"])
    children = endpoint_config.get("children")

    if parent_ids is None:
        parent_ids = []
    if child_streams is None:
        child_streams = []

    # tap config variables
    # Twitter Ads does not accept True/False as boolean, must be true/false
    with_deleted = tap_config.get("with_deleted", "true")
    country_codes = tap_config.get("country_codes", "").replace(" ", "")
    country_code_list = country_codes.split(",")
    LOGGER.info("country_code_list = {}".format(country_code_list))  # COMMENT OUT
    if sub_types == ["{country_code_list}"]:
        sub_types = country_code_list
    LOGGER.info("sub_types = {}".format(sub_types))  # COMMENT OUT

    # Bookmark datetimes
    last_datetime = get_bookmark(state, stream_name, start_date)
    last_dttm = strptime_to_utc(last_datetime)

    # NOTE: Risk of syncing indefinitely and never getting bookmark
    max_bookmark_value = None
    total_records = 0

    # Loop through sub_types (for tweets endpoint), all other endpoints loop once
    for sub_type in sub_types:
        LOGGER.info("sub_type = {}".format(sub_type))  # COMMENT OUT

        # Reset params and path for each sub_type
        params = {}
        new_params = {}
        path = None
        params = endpoint_config.get("params", {})
        path = endpoint_config.get("path")

        # Replace keys/ids in path and params
        add_account_id = False  # Initial default
        if "{account_id}" in path:
            add_account_id = True
            path = path.replace("{account_id}", account_id)
        if parent_ids:
            parent_id_list = ",".join(map(str, parent_ids))
            path = path.replace("{parent_ids}", parent_id_list)

        key = None
        val = None
        for key, val in list(params.items()):
            new_val = val
            if isinstance(val, str):
                if key == "with_deleted":
                    new_val = val.replace("{with_deleted}", with_deleted)
                if "{account_ids}" in val:
                    new_val = val.replace("{account_ids}", account_id)
                if "{parent_ids}" in val:
                    new_val = val.replace("{parent_ids}", parent_id_list)
                if "{start_date}" in val:
                    new_val = val.replace("{start_date}", start_date)
                if "{country_codes}" in val:
                    new_val = val.replace("{country_codes}", country_codes)
                if "{sub_type}" in val:
                    new_val = val.replace("{sub_type}", sub_type)
            new_params[key] = new_val

        LOGGER.info("Stream: {} - Request URL: {}/{}/{}".format(
            stream_name, ADS_API_URL, API_VERSION, path))
        LOGGER.info("Stream: {} - Request params: {}".format(
            stream_name, new_params))

        # API Call
        cursor = get_resource(stream_name, client, path, new_params)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()

        i = 0
        with metrics.record_counter(stream_name) as counter:
            # Loop thru cursor records, break out if no more data or bookmark_value < last_dttm
            for record in cursor:
                # Get dictionary for record
                record_dict = obj_to_dict(record)
                if not record_dict:
                    # Finish looping
                    LOGGER.info("Stream: {} - Finished Looping, no more data".format(
                        stream_name))
                    break

                # Get record's bookmark_value
                # All bookmarked requests are sorted by updated_at descending
                #   'sort_by': ['updated_at-desc']
                # The first record is the max_bookmark_value
                if bookmark_field:
                    bookmark_value_str = record_dict.get(bookmark_field)
                    if bookmark_value_str:
                        # Tweets use a different datetime format: '%a %b %d %H:%M:%S %z %Y'
                        if datetime_format:
                            bookmark_value = datetime.strptime(
                                record_dict.get(bookmark_field), datetime_format)
                        # Other bookmarked endpoints use normal UTC format
                        else:
                            bookmark_value = strptime_to_utc(
                                record_dict.get(bookmark_field))

                        # If first record, set max_bookmark_value
                        if i == 0:
                            max_bookmark_dttm = bookmark_value
                            max_bookmark_value = max_bookmark_dttm.strftime(
                                "%Y-%m-%dT%H:%M:%S%z")
                            LOGGER.info("Stream: {} - max_bookmark_value: {}".format(
                                stream_name, max_bookmark_value))
                    else:
                        # pylint: disable=line-too-long
                        LOGGER.info("Stream: {} - NO BOOKMARK, bookmark_field: {}, record: {}".format(
                            stream_name, bookmark_field, record_dict))
                        # pylint: enable=line-too-long
                        bookmark_value = last_dttm
                    if bookmark_value < last_dttm:
                        # Finish looping
                        LOGGER.info("Stream: {} - Finished, bookmark value < last datetime".format(
                            stream_name))
                        break
                else:
                    bookmark_value = last_dttm

                # Check for PK fields
                for key in id_fields:
                    if not record_dict.get(key):
                        LOGGER.info("Stream: {} - Missing key {} in record: {}".format(
                            stream_name, key, record))

                # Transform record from transform.py
                prepared_record = transform_record(stream_name, record_dict)

                # Add account_id to record
                if add_account_id:
                    prepared_record["account_id"] = account_id

                write_record(stream_name, prepared_record, time_extracted=time_extracted)
                counter.increment()

                # Append parent_id to parent_ids
                parent_id = record_dict.get(parent_id_field)
                parent_ids.append(parent_id)

                # Increment counters
                i = i + 1
                total_records = total_records + 1
            # End: for record in cursor
        # End: with metrics as counter

        # Loop through children and chunks of parent_ids
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                if child_stream_name in child_streams:
                    update_currently_syncing(state, child_stream_name)
                    # pylint: disable=line-too-long
                    LOGGER.info("Child Stream: {} - START Syncing, parent_stream: {}, account_id: {}".format(
                        child_stream_name, stream_name, account_id))
                    total_child_records = 0
                    child_total_records = 0
                    # parent_id_limit: max list size for parent_ids
                    parent_id_limit = child_endpoint_config.get("parent_ids_limit", 1)
                    chunk = 0  # chunk number
                    # Make chunks of parent_ids
                    for chunk_ids in split_list(parent_ids, parent_id_limit):
                        # pylint: disable=line-too-long
                        LOGGER.info("Child Stream: {} - Syncing, chunk#: {}, parent_stream: {}, parent chunk_ids: {}".format(
                            child_stream_name, chunk, stream_name, chunk_ids))
                        # pylint: enable=line-too-long
                        child_total_records = sync_endpoint(
                            client=client,
                            state=state,
                            start_date=start_date,
                            stream_name=child_stream_name,
                            endpoint_config=child_endpoint_config,
                            tap_config=tap_config,
                            account_id=account_id,
                            parent_ids=chunk_ids,
                            child_streams=child_streams,
                        )
                        # pylint: disable=line-too-long
                        LOGGER.info("Child Stream: {} - Finished chunk#: {}, parent_stream: {}".format(
                            child_stream_name, chunk, stream_name))
                        # pylint: enable=line-too-long
                        total_child_records = total_child_records + child_total_records
                        chunk = chunk + 1
                    # End: for chunk in parent_id_chunks

                    # pylint: disable=line-too-long
                    LOGGER.info("Child Stream: {} - FINISHED Syncing, parent_stream: {}, account_id: {}".format(
                        child_stream_name, stream_name, account_id))
                    # pylint: enable=line-too-long
                    LOGGER.info("Child Stream: {} - total_records: {}".format(
                        child_stream_name, total_child_records))
                    update_currently_syncing(state, stream_name)
                # End: if child_stream_name in child_streams
            # End: for child_stream_name in children.items()
        # End: if children

        # pylint: disable=line-too-long
        LOGGER.info("Stream: {}, Account ID: {} - FINISHED Sub Type: {}, Total Sub Type Records: {}".format(
            stream_name, account_id, sub_type, i))
        # pylint: enable=line-too-long
    # End: for sub_type in sub_types

    # Update the state with the max_bookmark_value for the stream
    if bookmark_field:
        write_bookmark(state, stream_name, max_bookmark_value)

    return total_records
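# Illustrative endpoint_config for sync_endpoint above. The keys mirror the
# lookups in the function (path, key_properties, replication_keys, params,
# children, parent_ids_limit); the concrete endpoint, params and child stream
# are placeholders, not taken from the tap's actual STREAMS definition:
EXAMPLE_ENDPOINT_CONFIG = {
    "path": "accounts/{account_id}/line_items",
    "key_properties": ["id"],
    "replication_keys": ["updated_at"],
    "params": {
        "with_deleted": "{with_deleted}",
        "sort_by": ["updated_at-desc"],
        "count": 1000,
    },
    "children": {
        "targeting_criteria": {
            "path": "accounts/{account_id}/targeting_criteria",
            "key_properties": ["id"],
            "params": {
                "line_item_ids": "{parent_ids}",
                "with_deleted": "{with_deleted}",
            },
            "parent_ids_limit": 20,
        }
    },
}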