def add_period_to_params(self, params):
    """
    Add the time_increment, time_range and/or date_preset keys to parameters.
    - time_increment: available in Ad Insights queries
    - time_range and date_preset: available in Ad Insights queries,
      and in Ad Management queries at the campaign, adset or ad levels only
    """
    if self.ad_insights and self.time_increment:
        params["time_increment"] = self.time_increment

    if self.ad_insights or self.level in ["campaign", "adset", "ad"]:
        if self.start_date and self.end_date:
            logger.info("Date format used for request: start_date and end_date")
            params["time_range"] = self.create_time_range()
        elif self.date_preset:
            logger.info("Date format used for request: date_preset")
            params["date_preset"] = self.date_preset
        else:
            logger.warning("No date range provided - Last 30 days by default")
            logger.warning(
                "https://developers.facebook.com/docs/marketing-api/reference/ad-account/insights#parameters"
            )
def read(self):
    if self.report_type == "ANALYTICS":
        entity_ids = self.get_active_entity_ids()

        total_jobs = (len(entity_ids) // MAX_ENTITY_IDS_PER_JOB) + 1
        logger.info(f"Processing a total of {total_jobs} jobs")

        data = []
        for chunk_entity_ids in split_list(entity_ids, MAX_ENTITY_IDS_PER_JOB * MAX_CONCURRENT_JOBS):
            job_ids = self.get_job_ids(chunk_entity_ids)
            data += self.get_analytics_report(job_ids)

    elif self.report_type == "REACH":
        data = self.get_reach_report()

    elif self.report_type == "ENTITY":
        if self.entity == "CARD":
            data = self.get_cards_report()
        else:
            data = self.get_campaign_management_report()

    def result_generator():
        for record in data:
            yield self.add_request_or_period_dates(record)

    yield JSONStream("results_" + self.account.id, result_generator())
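# Hedged sketch (assumption): `split_list`, used in `read` above to batch entity ids into
# chunks of MAX_ENTITY_IDS_PER_JOB * MAX_CONCURRENT_JOBS, is presumably a simple chunking
# helper along these lines.
def split_list(list_to_split, chunk_size):
    """Yield successive chunks of `chunk_size` items from `list_to_split`."""
    for i in range(0, len(list_to_split), chunk_size):
        yield list_to_split[i:i + chunk_size]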
def read(self):
    for prefix in self._prefix_list:

        objects_sorted_by_time = sorted(
            self.list_objects(bucket=self._bucket, prefix=prefix),
            key=lambda o: self.get_timestamp(o),
        )

        for _object in objects_sorted_by_time:
            _object = self.to_object(_object)
            logger.info(f"Found {self._platform} file {self.get_key(_object)}")

            if not self.is_compatible_object(_object):
                logger.info(f"Wrong extension: Skipping file {self.get_key(_object)}")
                continue

            name = self.get_key(_object).split("/", self._dest_key_split)[-1]
            yield JSONStream(name, self._result_generator(_object))
def get_analytics_report(self, job_ids):
    """
    Get 'ANALYTICS' report through the 'Asynchronous Analytics' endpoint of Twitter Ads API.
    Documentation: https://developer.twitter.com/en/docs/ads/analytics/api-reference/asynchronous
    """
    all_responses = []

    for job_id in job_ids:
        logger.info(f"Processing job_id: {job_id}")

        # Poll the asynchronous job until completion, then collect and parse its raw output
        job_result = self._waiting_for_job_to_complete(job_id)
        raw_analytics_response = self.get_raw_analytics_response(job_result)
        all_responses.append(self.parse(raw_analytics_response))

    return chain(*all_responses)
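# Hedged sketch (assumption): `_waiting_for_job_to_complete` is expected to wrap the polling
# loop that previously lived inline (commented out) in `get_analytics_report`: fetch the job
# result, back off exponentially while the job is PROCESSING, and give up after MAX_WAITING_SEC.
# The final `return` is an assumption, since the caller uses the completed job result.
def _waiting_for_job_to_complete(self, job_id):
    job_result = self.get_job_result(job_id)
    waiting_sec = 2
    while job_result.status == "PROCESSING":
        logger.info(f"Waiting {waiting_sec} seconds for job to be completed")
        sleep(waiting_sec)
        if waiting_sec > MAX_WAITING_SEC:
            raise JobTimeOutError("Waited too long for job to be completed")
        waiting_sec *= 2
        job_result = self.get_job_result(job_id)
    return job_result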
def get_parsed_report(self, rep_desc, metrics, parent_dim_parsed=None):
    """
    Iterating over report pages, parsing them, and returning a list of iterators,
    containing dictionary-formatted records: {dimension: value, metric: value}

    The parent_dim_parsed argument (a dictionary: {dimension: value})
    should be passed if the request includes multiple dimension breakdowns,
    so that we can add their values to output records.
    """
    # Avoid a mutable default argument: fall back to an empty dict per call
    parent_dim_parsed = parent_dim_parsed or {}

    report_info = {
        "parent_dim": parent_dim_parsed,
        "dim": rep_desc["dimension"].split("variables/")[1],
        "metrics": metrics,
    }
    logger.info(f"Getting report: {report_info}")

    first_response = self.get_report_page(rep_desc)
    all_responses = [parse_response(first_response, metrics, parent_dim_parsed)]

    if first_response["totalPages"] > 1:
        for page_nb in range(1, first_response["totalPages"]):
            next_response = self.get_report_page(rep_desc, page_nb)
            all_responses += [parse_response(next_response, metrics, parent_dim_parsed)]

    return chain(*all_responses)
def test_format_data(self):
    reader = GoogleDCMReader(**self.kwargs)
    input_report = (
        row for row in [b"x", b"x", b"Report Fields", b"headers", b"1,2,3", b"4,5,6", b"Grand Total"]
    )
    expected = [
        {"date": "1", "impressions": "2", "clicks": "3"},
        {"date": "4", "impressions": "5", "clicks": "6"},
    ]
    input_list = list(reader.format_response(input_report))
    assert len(input_list) == len(expected)
    logger.info(f"{str(input_list)}\n{str(expected)}")
    for input_row, output in zip(input_list, expected):
        assert input_row == output
def __download_sdf(self, operation):
    request = self._client.media().download(resourceName=operation["response"]["resourceName"])
    request.uri = request.uri.replace("?alt=json", "?alt=media")
    sdf = io.FileIO(f"{self.BASE}/{self.ARCHIVE_NAME}.zip", mode="wb")
    downloader = MediaIoBaseDownload(sdf, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        logger.info(f"Download {int(status.progress() * 100)}%.")
def _wait_for_100_percent_completion(self, async_job):
    async_job.api_get()
    percent_completion = async_job[AdReportRun.Field.async_percent_completion]
    status = async_job[AdReportRun.Field.async_status]
    logger.info(f"{status}: {percent_completion}%")
    if status == "Job Failed":
        logger.info(status)
    elif percent_completion < 100:
        raise Exception(f"{status}: {percent_completion}")
def __init__(self, name, host, port=6379):
    if host:
        logger.info(f"Using checkpointing service: {host}:{port} ({name})")
        self._enabled = True
        self._name = name
        self._host = host
        self._port = port
        self._client = redis.Redis(host=host, port=port)
    else:
        self._enabled = False
        logger.info("No checkpointing")
def _wait_for_query(self, query_id):
    logger.info(f"Waiting for query {query_id} to complete running")
    query_infos = self.get_query(query_id)
    if query_infos["metadata"]["running"] or (
        "googleCloudStoragePathForLatestReport" not in query_infos["metadata"]
        and "googleDrivePathForLatestReport" not in query_infos["metadata"]
    ):
        raise Exception("Query still running.")
    else:
        return query_infos
def _load_access_info(self):
    logger.info("Retrieving Salesforce access token")
    res = requests.post(SALESFORCE_LOGIN_ENDPOINT, params=self._get_login_params())
    res.raise_for_status()
    payload = res.json()
    self._access_token = payload.get("access_token")
    self._instance_url = payload.get("instance_url")
    return self._access_token, self._instance_url
def __create_sdf_task(self, body):
    """
    Create an SDF asynchronous task of type googleapiclient.discovery.Resource.

    Args:
        body (dict): request body describing the data within the generated SDF file.

    Returns:
        operation (dict): contains the task metadata.
    """
    operation = self._client.sdfdownloadtasks().create(body=body).execute()
    logger.info(f"Operation {operation['name']} was created.")
    return operation
def _create_report_schedule(self):
    method, endpoint = API_ENDPOINTS["create_report_schedule"]
    payload = {
        "ReportScheduleName": self.report_schedule_name,
        "ReportTemplateId": self.report_template_id,
        "AdvertiserFilters": self.advertiser_ids,
        "ReportStartDateInclusive": self.start_date.isoformat(),
        "ReportEndDateExclusive": self.end_date.isoformat(),
        **DEFAULT_REPORT_SCHEDULE_ARGS,
    }
    logger.info(f"Creating ReportSchedule: {payload}")
    json_response = self._make_api_call(method, endpoint, payload)
    self.report_schedule_id = json_response["ReportScheduleId"]
def assert_report_file_ready(self, report_id, file_id):
    """Poke the report file status"""
    report_file = self._service.files().get(reportId=report_id, fileId=file_id).execute()

    status = report_file["status"]
    if status == "REPORT_AVAILABLE":
        logger.info(f"File status is {status}, ready to download.")
    elif status != "PROCESSING":
        raise FileNotFoundError(f"File status is {status}, processing failed.")
    else:
        raise FileNotFoundError("File status is PROCESSING")
def _wait_for_download_url(self):
    report_execution_details = self._get_report_execution_details()
    if report_execution_details["ReportExecutionState"] == "Pending":
        raise ReportScheduleNotReadyError(
            f"ReportSchedule '{self.report_schedule_id}' is still running."
        )
    else:
        # As the ReportSchedule that we just created runs only once,
        # the API response will include only one ReportDelivery (so we can get index "[0]")
        self.download_url = report_execution_details["ReportDeliveries"][0]["DownloadURL"]
        logger.info(
            f"ReportScheduleId '{self.report_schedule_id}' is ready. DownloadURL: {self.download_url}"
        )
def _create_engine(cls, host, port, user, password, database):
    logger.info(f"Connecting to MySQL Database {database} on {host}:{port}")
    url = sqlalchemy.engine.url.URL(
        drivername="mysql+pymysql",
        username=user,
        password=password,
        database=database,
        port=port,
        host=host,
    )
    return sqlalchemy.create_engine(url)
def __get_creatives(self):
    response = (
        self._client.advertisers()
        .creatives()
        .list(advertiserId=self.kwargs.get("advertiser_id"))
        .execute()
    )
    if len(response.keys()) == 0:
        # No data returned
        return {}
    else:
        all_creatives = response["creatives"]
        while "nextPageToken" in response:
            token = response["nextPageToken"]
            logger.info(f"Query a new page of creatives. Page id: {token}")
            response = (
                self._client.advertisers()
                .creatives()
                .list(advertiserId=self.kwargs.get("advertiser_id"), pageToken=token)
                .execute()
            )
            all_creatives.extend(response["creatives"])
        yield from all_creatives
def read(self): """ :return: stream that returns Radarly posts one by one """ date_ranges_and_posts_volumes: Dict = self.split_date_range() logger.info( f"API Compliant Date Ranges and Posts Volumes: {date_ranges_and_posts_volumes}" ) api_compliant_date_ranges = list(date_ranges_and_posts_volumes.keys()) t0 = time.time() ingestion_tracker = [] for i, date_range in enumerate(api_compliant_date_ranges): if self.throttle: current_time = time.time() - t0 ingestion_tracker.append(current_time) posts_ingested_over_window = (sum( np.array(ingestion_tracker) > current_time - self.api_window) * self.api_date_period_limit) if posts_ingested_over_window > self.throttling_threshold_coefficient * self.api_quarterly_posts_limit: sleep_duration = self.api_window * ( self.api_date_period_limit / self.api_quarterly_posts_limit) logger.info( f"Throttling activated: waiting for {sleep_duration} seconds..." ) time.sleep(sleep_duration) all_publications = self.get_publications_iterator(date_range) name = f"""radarly_{date_range[0].strftime("%Y-%m-%d-%H-%M-%S")}_{date_range[1].strftime( "%Y-%m-%d-%H-%M-%S")}""" def result_generator(): while True: try: pub = next(all_publications) yield dict(pub) except StopIteration: break except Exception: ex_type, ex, tb = sys.exc_info() logger.warning( f"Failed to ingest post with error: {ex}. Traceback: {traceback.print_tb(tb)}" ) yield JSONStream(name, result_generator())
def query_ad_insights(self, fields, params, object_id):
    """
    Ad Insights documentation:
    https://developers.facebook.com/docs/marketing-api/insights
    """
    logger.info(f"Running Facebook Ad Insights query on {self.object_type}_id: {object_id}")

    # Step 1 - Create Facebook object
    obj = self.create_object(object_id)

    # Step 2 - Run Ad Insights query on Facebook object
    report_job = self._get_report(obj, fields, params)
    yield from report_job.get_result()
def new_func(*args, **kwargs):
    _kwargs = {}
    for key, value in kwargs.items():
        if key in sensitive_fields:
            _kwargs[key] = "*****"
        else:
            _kwargs[key] = value
    logger.info(f"Calling {f.__name__} with ({_kwargs})")

    def processor():
        return f(*args, **kwargs)

    return update_wrapper(processor, f)
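# Hedged sketch (assumption): `new_func` above is the inner wrapper of a logging decorator
# that masks sensitive keyword arguments and defers the actual call by returning a `processor`
# closure. The enclosing factory is presumably shaped like this; `log_masked_call` and its
# default `sensitive_fields` are hypothetical names used only for illustration.
def log_masked_call(sensitive_fields=("password", "access_token")):
    def decorator(f):
        def new_func(*args, **kwargs):
            _kwargs = {k: "*****" if k in sensitive_fields else v for k, v in kwargs.items()}
            logger.info(f"Calling {f.__name__} with ({_kwargs})")

            def processor():
                return f(*args, **kwargs)

            return update_wrapper(processor, f)

        return update_wrapper(new_func, f)

    return decorator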
def add_report_filter(self, report_definition):
    """Check if a filter was provided and contains the necessary information"""
    if not self.report_filter:
        logger.info("No filter provided by user")
    elif all(
        required_param in self.report_filter.keys()
        for required_param in ("field", "operator", "values")
    ):
        report_definition["selector"]["predicates"] = {
            "field": self.report_filter["field"],
            "operator": self.report_filter["operator"],
            "values": self.report_filter["values"],
        }
    else:
        raise ClickException(
            "Wrong format for Report filter: should be a dictionary as string, with the following fields:\n"
            "Dictionary {'field','operator','values'}"
        )
def write(self, stream):
    """
    Write file to disk at location given as parameter.
    """
    file_name = self._file_name or stream.name
    path = os.path.join(self._directory, file_name)
    logger.info(f"Writing stream {file_name} to {path}")

    file = stream.as_file()
    with open(path, "wb") as h:
        while True:
            buffer = file.read(1024)
            if len(buffer) > 0:
                h.write(buffer)
            else:
                break
def log_sampling(report):
    """Log sampling data if a report has been sampled."""
    data = report.get("data", {})
    if data.get("samplesReadCounts") is not None:
        logger.warning("☝️ Report has been sampled.")
        sample_reads = data["samplesReadCounts"][0]
        sample_space = data["samplingSpaceSizes"][0]
        logger.warning(f"Sample reads: {sample_reads}")
        logger.warning(f"Sample space: {sample_space}")
        logger.warning(f"Sample percent: {100 * int(sample_reads) / int(sample_space)}%")
    else:
        logger.info("Report is not sampled.")
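# Illustrative example (assumed report shape): a sampled Google Analytics report exposes
# `samplesReadCounts` and `samplingSpaceSizes` in its "data" block; for the payload below,
# `log_sampling` would report a sampling rate of 100 * 500000 / 1000000 = 50.0%.
sampled_report = {
    "data": {
        "samplesReadCounts": ["500000"],
        "samplingSpaceSizes": ["1000000"],
        "rows": [],
    }
}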
def test_read(self, mock_get_payload, mock_Project, mock_RadarlyApi):
    mock_RadarlyApi.init.side_effect = lambda client_id, client_secret: logger.info(
        "Mock RadarlyApi successfully initiated"
    )
    mock_get_payload.side_effect = create_mock_payload
    mock_project_object = MagicMock()
    mock_project_object.get_all_publications = create_mock_publications_iterator
    mock_Project.find.return_value = mock_project_object

    reader = RadarlyReader(
        pid=1,
        client_id="xxx",
        client_secret="xxx",
        focus_id=(1, 2, 3),
        start_date=datetime(2020, 1, 1),
        end_date=datetime(2020, 1, 1, 3),
        api_request_limit=250,
        api_date_period_limit=10000,
        api_quarterly_posts_limit=45000,
        api_window=300,
        throttle=True,
        throttling_threshold_coefficient=0.95,
    )

    for stream in reader.read():
        line = stream.as_file().readline()
        line = json.loads(line)
        assert "date" in line.keys()
        assert "text" in line.keys()
def _run_query(self):
    self.initialize_analyticsreporting()

    response = (
        self._service.searchanalytics()
        .query(siteUrl=self.site_url, body=self.build_query())
        .execute()
    )
    yield response

    # Pagination
    while len(response.get("rows", [])) != 0:
        logger.info(f"{len(response.get('rows')) + self.start_row} lines successfully processed...")
        self.start_row += self.row_limit
        response = (
            self._service.searchanalytics()
            .query(siteUrl=self.site_url, body=self.build_query())
            .execute()
        )
        yield response
def monitor_usage(response):
    """
    Extracts the "X-Business-Use-Case-Usage" header from a FacebookResponse object.
    If one of the 3 API usage rates (call_count, total_cputime, total_time) is above 75%,
    puts the program to sleep for 5 minutes.
    Documentation: https://developers.facebook.com/docs/graph-api/overview/rate-limiting/
    """
    for header in response._headers:
        if header["name"] == "X-Business-Use-Case-Usage":
            usage_header = json.loads(header["value"])
            usage_header_values = list(usage_header.values())[0][0]
            usage_rates = [
                v for k, v in usage_header_values.items()
                if k in ["call_count", "total_cputime", "total_time"]
            ]
            if max(usage_rates) > 75:
                logger.info("75% rate limit reached. Sleeping for 5 minutes...")
                sleep(300)
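# Illustrative example (assumed header shape): the "X-Business-Use-Case-Usage" value is a JSON
# object keyed by business object id, each key mapping to a list of usage entries; the parsing
# above reads the first entry of the first key. For the sample value below, max(usage_rates)
# is 80, so the reader would sleep for 5 minutes.
sample_usage_value = '{"123456789": [{"type": "ads_insights", "call_count": 80, "total_cputime": 12, "total_time": 25}]}'
sample_usage_entry = list(json.loads(sample_usage_value).values())[0][0]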
def add_period_to_report_definition(self, report_definition):
    """Add Date period from provided start date and end date, when CUSTOM_DATE range is called"""
    if (self.date_range_type == "CUSTOM_DATE") and (not self.start_date or not self.end_date):
        raise NoDateDefinitionException(
            "You must define a couple start-date/end-date when using a custom_date"
        )
    elif self.date_range_type == "CUSTOM_DATE":
        logger.info("Date format used for request: Custom Date Range with start_date and end_date provided")
        report_definition["selector"]["dateRange"] = self.create_date_range(self.start_date, self.end_date)
    elif self.start_date is not None and self.end_date is not None and self.date_range_type != "CUSTOM_DATE":
        raise InconsistentDateDefinitionException(
            "You must define either the couple start_date and end_date, "
            "or a date_range different from CUSTOM_DATE, but not both"
        )
def __wait_sdf_download_request(self, operation):
    """
    Wait for an SDF task to be completed (i.e. the file is ready for download).

    Args:
        operation (dict): task metadata.

    Returns:
        operation (dict): task metadata updated with the resource location.
    """
    logger.info(f"Waiting for SDF operation {operation['name']} to complete running.")
    get_request = self._client.sdfdownloadtasks().operations().get(name=operation["name"])
    operation = get_request.execute()
    if "done" not in operation:
        raise RetryTimeoutError("The operation has taken more than 10 hours to complete.\n")
    return operation
def query(self, query):
    logger.info(f"Running Salesforce query: {query}")
    response = self._request_data(SALESFORCE_QUERY_ENDPOINT, {"q": query})
    generating = True
    while generating:
        for rec in response["records"]:
            yield rec
        if "nextRecordsUrl" in response:
            logger.info("Fetching next page of Salesforce results")
            response = self._request_data(response["nextRecordsUrl"])
        else:
            generating = False
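# Illustrative example (assumed response shape): the Salesforce query REST endpoint returns
# pages of this form; `query` above keeps following `nextRecordsUrl` until a page omits it.
# All values below are made up for illustration.
sample_page = {
    "totalSize": 4500,
    "done": False,
    "nextRecordsUrl": "/services/data/v42.0/query/01gXx0000000000AAA-2000",
    "records": [{"attributes": {"type": "Account"}, "Name": "Acme"}],
}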
def assert_report_file_ready(self, report_id):
    """Poll the API with the reportId until the report is ready, up to 100 times.

    Args:
        report_id: The ID SA360 has assigned to a report.
    """
    request = self._service.reports().get(reportId=report_id)
    report_data = request.execute()
    if report_data["isReportReady"]:
        logger.info("The report is ready.")

        # For large reports, SA360 automatically fragments the report into multiple files.
        # The 'files' property in the JSON object that SA360 returns contains the list of
        # URLs for the file fragments. To download a report, SA360 needs to know the report ID
        # and the index of a file fragment.
        return report_data
    else:
        logger.info("Report is not ready. Retrying...")
        raise FileNotFoundError