def __init__(self, cm_id: str=None, profile: str=None, email: str=None,
             project: str=None):
  self.email = email
  self.cm_id = cm_id
  self.cm_profile = profile
  self.project = project
  self.firestore = Firestore(email=email, project=project)
def __init__(self, report_id: str, email: str, project: str=None,
             timezone: str=None):
  self.email = email
  self.report_id = report_id
  self.project = project
  self.timezone = timezone
  self.firestore = Firestore()
def __init__(self, email: str, project: str, adh_customer: str,
             adh_query: str, api_key: str, days: int,
             dest_project: str=None, dest_dataset: str=None):
  """Constructor

  Sets up the ADH helper.

  Arguments:
      email {str} -- authenticated user email (for the token)
      project {str} -- GCP project
      adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
      adh_query {str} -- ADH query id
      api_key {str} -- API key (has to be set up in APIs and Libraries in GCP)
      days {int} -- lookback window (default: 60)
      dest_project {str} -- target GCP project for results
      dest_dataset {str} -- target BQ dataset for results
  """
  self.email = email
  self.project = project
  self.adh_customer = adh_customer
  self.adh_query = adh_query
  self.api_key = api_key
  self.days = days
  self.dest_project = dest_project
  self.dest_dataset = dest_dataset

  self.credentials = Credentials(email=email, project=project)
  self.storage = Cloud_Storage(email=email, project=project)
  self.firestore = Firestore(email=email, project=project)
class DCMReportRunner(ReportRunner):
  report_type = Type.CM

  def __init__(self, cm_id: str=None, profile: str=None, email: str=None,
               project: str=None):
    self.email = email
    self.cm_id = cm_id
    self.cm_profile = profile
    self.project = project
    self.firestore = Firestore(email=email, project=project)

  def run(self, unattended: bool=True) -> None:
    dcm = DCM(email=self.email, project=self.project, profile=self.cm_profile)
    if unattended:
      self._unattended_run(dcm)
    else:
      self._attended_run(dcm)

  def _attended_run(self, dcm: DCM) -> None:
    successful = []
    response = dcm.run_report(report_id=self.cm_id, synchronous=True)
    if response:
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

      while response['status'] == 'PROCESSING':
        time.sleep(60 * 0.5)
        response = dcm.report_state(report_id=self.cm_id,
                                    file_id=response['id'])
        buffer = StringIO()
        pprint.pprint(response, stream=buffer)
        logging.info(buffer.getvalue())

    report2bq = Report2BQ(
        cm=True, cm_id=self.cm_id, email=self.email, project=self.project,
        profile=self.cm_profile)
    report2bq.handle_report_fetcher(fetcher=dcm, report_id=self.cm_id)

  def _unattended_run(self, dcm: DCM) -> None:
    response = dcm.run_report(report_id=self.cm_id, synchronous=False)
    if response:
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

      runner = {
          'type': Type.CM.value,
          'project': self.project,
          'report_id': self.cm_id,
          'email': self.email,
          'profile': self.cm_profile,
          'file_id': response['id']
      }
      self.firestore.store_report_runner(runner)
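# Usage sketch for the runner above (ids and project are placeholders, not
# real values). An unattended run queues the report-file id in Firestore for
# the monitor to pick up later; an attended run polls Campaign Manager every
# 30 seconds until the file is ready and hands it straight to Report2BQ.
runner = DCMReportRunner(cm_id='12345678', profile='1234567',
                         email='analyst@example.com', project='my-gcp-project')
runner.run(unattended=True)     # fire and forget
# runner.run(unattended=False)  # block until the report file is available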
def __init__(self, dbm_id: str = None, email: str = None, project: str = None):
  self.email = email
  self.dbm_id = dbm_id
  self.project = project
  self.firestore = Firestore(email=email, project=project)
class DBMReportRunner(ReportRunner):
  report_type = Type.DV360

  def __init__(self, dbm_id: str = None, email: str = None,
               project: str = None):
    self.email = email
    self.dbm_id = dbm_id
    self.project = project
    self.firestore = Firestore(email=email, project=project)

  def run(self, unattended: bool = True):
    dbm = DBM(email=self.email, project=self.project)
    if unattended:
      self._unattended_run(dbm)
    else:
      self._attended_run(dbm)

  def _attended_run(self, dbm: DBM) -> None:
    response = dbm.run_report(self.dbm_id)
    if response:
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

    while True:
      status = dbm.report_state(self.dbm_id)
      logging.info(f'Report {self.dbm_id} status: {status}')
      if status == 'RUNNING':
        time.sleep(10)
      elif status == 'DONE':
        report2bq = Report2BQ(dv360=True, dv360_id=self.dbm_id,
                              email=self.email, project=self.project)
        report2bq.handle_report_fetcher(fetcher=dbm, report_id=self.dbm_id)
        break
      else:
        logging.error(
            f'DV360 Report {self.dbm_id} failed to run: {status}')
        break

  def _unattended_run(self, dbm: DBM) -> None:
    response = dbm.run_report(self.dbm_id)
    if response:
      runner = {
          'type': Type.DV360.value,
          'project': self.project,
          'report_id': self.dbm_id,
          'email': self.email,
      }
      self.firestore.store_report_runner(runner)
def firestore(self) -> Firestore: """The Firestore client wrapper Returns: Firestore: the wrapper """ return Firestore()
def firestore(self) -> Firestore: """Fetch the Firestore client on demand. Returns: Firestore: firestore client """ return Firestore()
def manage(self, **kwargs):
  firestore = Firestore(project=kwargs['project'], email=kwargs['email'])

  args = {
      'report': kwargs.get(
          'name',
          kwargs.get('file').split('/')[-1].split('.')[0]
              if kwargs.get('file') else None),
      'file': kwargs.get('file'),
      'firestore': firestore,
      'project': kwargs['project'],
      'email': kwargs['email'],
      **kwargs,
  }

  action = {
      'list': self.list_all,
      'show': self.show,
      'add': self.add,
      'delete': self.delete,
  }.get(kwargs['action'])

  if action:
    return action(**args)
  else:
    raise NotImplementedError()
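# Invocation sketch for the dispatcher above. The owning class is not shown in
# this snippet (assumed here to be SA360Manager); the kwargs mirror the keys
# consumed above, and 'action' selects one of list/show/add/delete.
SA360Manager().manage(action='show',
                      project='my-gcp-project',
                      email='analyst@example.com',
                      name='holiday_campaign')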
def __init__(self, email: str, project: str, append: bool = False,
             infer_schema: bool = False):
  self.email = email
  self.project = project
  self.creds = Credentials(email=email, project=project)
  self.credentials = storage.Client()._credentials
  self.transport = AuthorizedSession(credentials=self.credentials)
  self.append = append
  self.infer_schema = infer_schema
  self.firestore = Firestore(email=email, project=project)
  self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
  self.bucket = f'{self.project}-report2bq-upload'
def main(unusedargv):
  scheduler = Scheduler()

  with open(FLAGS.file) as reports:
    runners = json.loads(''.join(reports.readlines()))
    for runner in runners:
      id = f"{runner['report']}_{runner['AgencyId']}_{runner['AdvertiserId']}"
      Firestore().update_document(Type.SA360_RPT, f'{id}', runner)
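# Sketch of the runner file consumed by main() above. Only the keys used to
# build the Firestore document id ('report', 'AgencyId', 'AdvertiserId') are
# certain; every other field in each object is stored verbatim.
import json

sample = '''[
  {"report": "holiday_campaign",
   "AgencyId": "20100000000000000",
   "AdvertiserId": "21700000000000000"}
]'''

for runner in json.loads(sample):
  id = f"{runner['report']}_{runner['AgencyId']}_{runner['AdvertiserId']}"
  print(id)  # holiday_campaign_20100000000000000_21700000000000000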
def oauth_init(self, request: Request, project: str, email: str):
  project_credentials = json.loads(
      Files.fetch_file(
          '{project}-report2bq-tokens'.format(project=project),
          'client_secrets.json'),
      encoding='utf-8')

  _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                       scopes=self.SCOPES)
  _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

  authorization_url, state = _flow.authorization_url(
      access_type='offline', include_granted_scopes='true')

  firestore = Firestore()
  firestore.store_oauth_state(state=state, email=email, project=project)

  return redirect(authorization_url)
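# The redirect URI built by oauth_init above (and again in oauth_complete
# below) is derived from the deployed Cloud Function's environment, and must
# match the URI registered with the OAuth client. A standalone sketch with
# placeholder values:
import os

os.environ.setdefault('FUNCTION_REGION', 'us-central1')
os.environ.setdefault('GCP_PROJECT', 'my-gcp-project')

redirect_uri = (f"https://{os.environ.get('FUNCTION_REGION')}"
                f"-{os.environ.get('GCP_PROJECT')}"
                ".cloudfunctions.net/OAuthComplete")
print(redirect_uri)
# https://us-central1-my-gcp-project.cloudfunctions.net/OAuthComplete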
def list_all(self, firestore: Firestore, project: str, _print: bool = False,
             **unused):
  reports = firestore.list_documents(Type.SA360_RPT, '_reports')
  if _print:
    print(f'SA360 Dynamic Reports defined for project {project}')
    print()
    for report in reports:
      print(f' {report}')

  return reports
def show(self, firestore: Firestore, report: str, _print: bool = False,
         **unused):
  definition = firestore.get_document(Type.SA360_RPT, '_reports').get(report)
  if _print:
    print(f'SA360 Dynamic Report "{report}"')
    print()
    pprint.pprint(definition, indent=2, compact=False)

  return definition
def delete(self, firestore: Firestore, project: str, report: str, email: str,
           **unused):
  firestore.delete_document(Type.SA360_RPT, '_reports', report)

  scheduler = Scheduler()
  args = {
      'action': 'list',
      'email': email,
      'project': project,
      'html': False,
  }

  # Disable all runners for the now deleted report
  runners = list(
      runner['name'].split('/')[-1]
      for runner in scheduler.process(args)
      if report in runner['name'])

  for runner in runners:
    args = {
        'action': 'disable',
        'email': None,
        'project': project,
        'job_id': runner,
    }
    scheduler.process(args)
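# The scheduler's 'list' action above returns fully-qualified Cloud Scheduler
# job names; the disable loop keeps only the trailing job id. A standalone
# sketch (the name below is a placeholder, not a real job):
name = 'projects/my-gcp-project/locations/us-central1/jobs/run-sa360-holiday_campaign'
job_id = name.split('/')[-1]
print(job_id)  # run-sa360-holiday_campaign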
def oauth_complete(self, request: Request):
  logging.info(request.args)

  state = request.args.get('state', type=str)
  firestore = Firestore()
  email, project = firestore.get_oauth_state(state)

  project_credentials = json.loads(
      Files.fetch_file(
          '{project}-report2bq-tokens'.format(project=project),
          'client_secrets.json'),
      encoding='utf-8')

  _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                       scopes=self.SCOPES)
  _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

  r = urlparse(request.url)
  auth_response = urlunparse(
      ['https', r.netloc, r.path, r.params, r.query, r.fragment])
  _flow.fetch_token(authorization_response=auth_response)

  logging.info(_flow.credentials)

  token_details = {
      'access_token': _flow.credentials.token,
      'refresh_token': _flow.credentials.refresh_token
  }

  Cloud_Storage.write_file(
      '{project}-report2bq-tokens'.format(project=project),
      '{email}_user_token.json'.format(email=email),
      json.dumps(token_details).encode('utf-8'))

  firestore.delete_oauth_state(state=state)

  return 'Ok'
def __init__(self, product: Type, email=None, project=None, report_id=None,
             profile=None, sa360_url=None, force: bool = False,
             append: bool = False, infer_schema: bool = False,
             dest_project: str = None, dest_dataset: str = 'report2bq',
             notify_topic: str = None, notify_message: str = None):
  self.product = product
  self.force = force
  self.email = email
  self.append = append
  self.infer_schema = infer_schema

  self.report_id = report_id
  self.sa360_url = unquote(sa360_url) if sa360_url else None
  self.cm_profile = profile

  self.project = project
  self.dest_project = dest_project
  self.dest_dataset = dest_dataset

  self.notify_topic = notify_topic
  self.notify_message = notify_message

  self.firestore = Firestore(email=email, project=project)
def __init__(self, email: str, project: str, append: bool = False,
             infer_schema: bool = False):
  self.email = email
  self.project = project
  self.creds = Credentials(email=email, project=project)
  self.credentials = storage.Client()._credentials
  self.transport = AuthorizedSession(credentials=self.credentials)
  self.append = append
  self.infer_schema = infer_schema
  self.firestore = Firestore(email=email, project=project)

  # chunk_multiplier is set in the environment, but defaults to 64 - this
  # leads to a 64M chunk size we can throw around. Given the memory
  # constraints of a cloud function this seems like a good, safe number.
  self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
  self.bucket = f'{self.project}-report2bq-upload'
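# Worked example of the chunk-size arithmetic used above: CHUNK_MULTIPLIER
# (default 64) times 1 MiB, i.e. 64 MiB per chunk unless the environment says
# otherwise.
import os

chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
chunk_size = chunk_multiplier * 1024 * 1024
print(f'{chunk_size:,} bytes')  # 67,108,864 with the default multiplier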
def check_running_report(self, config: Dict[str, Any]):
  """Check a running CM report for completion.

  Arguments:
      config {Dict[str, Any]} -- the report data structure from Firestore
  """
  append = config['append'] if config and 'append' in config else False
  response = self.report_state(report_id=config['id'],
                               file_id=config['report_file']['id'])
  status = (response['status']
            if response and 'status' in response else 'UNKNOWN')

  logging.info('Report {report} status: {status}'.format(
      report=config['id'], status=status))

  firestore = Firestore(email=config['email'], project=self.project)
  if status == 'REPORT_AVAILABLE':
    # Remove job from running
    firestore.remove_report_runner(config['id'])

    # Send pubsub to trigger report2bq now
    topic = 'projects/{project}/topics/report2bq-trigger'.format(
        project=self.project)
    publisher = pubsub.PublisherClient()
    publisher.publish(topic=topic,
                      data=b'RUN',
                      cm_id=config['id'],
                      profile=config['profile_id'],
                      email=config['email'],
                      append=str(append),
                      project=self.project)

  elif status == 'FAILED' or status == 'CANCELLED':
    # Remove job from running
    logging.error(f'Report {config["id"]}: {inflection.humanize(status)}.')
    firestore.remove_report_runner(config['id'])
class ADH(object):
  """Run ADH queries

  This class runs ADH queries. Where the output goes is determined by the
  query itself, in ADH. All we can specify here is the date range - and we do
  this by accepting a lookback window and doing "yesterday - lookback".

  TODO (davidharcombe@) ADH Query Parameters
  """

  def __init__(self, email: str, project: str, adh_customer: str,
               adh_query: str, api_key: str, days: int,
               dest_project: str=None, dest_dataset: str=None):
    """Constructor

    Sets up the ADH helper.

    Arguments:
        email {str} -- authenticated user email (for the token)
        project {str} -- GCP project
        adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
        adh_query {str} -- ADH query id
        api_key {str} -- API key (has to be set up in APIs and Libraries in GCP)
        days {int} -- lookback window (default: 60)
        dest_project {str} -- target GCP project for results
        dest_dataset {str} -- target BQ dataset for results
    """
    self.email = email
    self.project = project
    self.adh_customer = adh_customer
    self.adh_query = adh_query
    self.api_key = api_key
    self.days = days
    self.dest_project = dest_project
    self.dest_dataset = dest_dataset

    self.credentials = Credentials(email=email, project=project)
    self.storage = Cloud_Storage(email=email, project=project)
    self.firestore = Firestore(email=email, project=project)

  def run(self, unattended: bool=True):
    """Run the ADH query

    Execute the ADH query, storing the run job result in Firestore. The data
    itself will be written to Big Query by ADH. Remember that ADH queries have
    many, many constraints so use this wisely: DON'T set up an hourly run -
    check with ADH.

    Keyword Arguments:
        unattended {bool} -- run unattended. Unused, but there for
            compatibility (default: {True})
    """
    query_details = self.fetch_query_details()
    if query_details:
      report = {
          'id': self.adh_query,
          'details': query_details,
          'customer_id': self.adh_customer,
          'table_name': self._sanitize_string(query_details['title']),
      }
      if self.dest_project:
        report['dest_project'] = self.dest_project

      if self.dest_dataset:
        report['dest_dataset'] = self.dest_dataset

      self.firestore.store_report_config(type=Type.ADH,
                                         report_data=report,
                                         id=self.adh_query)

      result = self.run_query(report)
      report['last_run'] = result
      self.firestore.store_report_config(type=Type.ADH,
                                         report_data=report,
                                         id=self.adh_query)

      logging.info('Result: {result}'.format(result=result))

  def _get_adh_service(self) -> Resource:
    """Create the ADH Service

    Use the discovery API to create the ADH service.

    Returns:
        Resource -- ADH service
    """
    adh_service = DiscoverService.get_service(Service.ADH, self.credentials,
                                              self.api_key)
    return adh_service

  def _sanitize_string(self, original: str) -> str:
    """Sanitize strings

    Convert any non-alphanumeric character into an '_' as per BQ requirements.

    Arguments:
        original {str} -- the string to sanitize

    Returns:
        str -- the sanitized string
    """
    return re.sub('[^a-zA-Z0-9,]', '_', original)

  def fetch_query_details(self) -> Dict[str, Any]:
    """Get the query details

    Returns:
        Dict[str, Any] -- the ADH analysis query definition
    """
    service = self._get_adh_service()

    query_id = 'customers/{customer_id}/analysisQueries/{query_id}'.format(
        customer_id=self.adh_customer, query_id=self.adh_query)
    query = service.customers().analysisQueries().get(name=query_id).execute()

    return query

  def run_query(self, query_details: Dict[str, Any]) -> Dict[str, Any]:
    """Run the ADH query

    Arguments:
        query_details {Dict[str, Any]} -- the details of the query job

    Returns:
        Dict[str, Any] -- result of the query run directive
    """
    service = self._get_adh_service()

    yesterday = datetime.now(tz=pytz.timezone('US/Eastern')) - timedelta(days=1)
    earliest = yesterday - timedelta(days=60)
    body = {
        "spec": {
            "startDate": {
                "year": earliest.year,
                "month": earliest.month,
                "day": earliest.day
            },
            "endDate": {
                "year": yesterday.year,
                "month": yesterday.month,
                "day": yesterday.day
            }
        },
        "destTable": '{project}.{dataset}.{table_name}'.format(
            project=query_details['dest_project']
                if 'dest_project' in query_details else self.project,
            dataset=query_details['dest_dataset']
                if 'dest_dataset' in query_details else 'adh_results',
            table_name=query_details['table_name']
        ),
        "customerId": query_details['customer_id']
    }
    result = service.customers().analysisQueries().start(
        name=query_details['details']['name'], body=body).execute()

    return result
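# Worked example of the date window computed in run_query above: "yesterday"
# in US/Eastern, minus a fixed 60-day lookback. Note the body hard-codes 60
# days rather than using the 'days' value passed to the constructor.
from datetime import datetime, timedelta
import pytz

yesterday = datetime.now(tz=pytz.timezone('US/Eastern')) - timedelta(days=1)
earliest = yesterday - timedelta(days=60)
print(earliest.date(), '->', yesterday.date())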
def process(self, data: Dict[str, Any], context):
  """Check all the running jobs

  Arguments:
      data {Dict[str, Any]} -- data sent from the PubSub message
      context {Dict[str, Any]} -- context data. unused
  """
  firestore = Firestore(in_cloud=True, email=None, project=None)
  documents = firestore.get_all_jobs()

  for document in documents:
    for T in [t for t in Type if not t.name.startswith('_')]:
      config = firestore.get_report_config(T, document.id)

      if config:
        if config.get('dest_project'):
          # authenticate against supplied project with supplied key
          project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
          client_key = json.loads(
              Cloud_Storage.fetch_file(
                  bucket=f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                  file=f"{config['email']}_user_token.json"))
          server_key = json.loads(
              Cloud_Storage.fetch_file(
                  bucket=f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                  file='client_secrets.json'))
          client_key['client_id'] = (
              server_key.get('web') or
              server_key.get('installed')).get('client_id')
          client_key['client_secret'] = (
              server_key.get('web') or
              server_key.get('installed')).get('client_secret')
          logging.info(client_key)
          creds = Credentials.from_authorized_user_info(client_key)
          bq = bigquery.Client(project=project, credentials=creds)

        else:
          bq = bigquery.Client()

        api_repr = document.get().to_dict()
        if api_repr:
          try:
            job = LoadJob.from_api_repr(api_repr, bq)
            job.reload()

            if job.state == 'DONE':
              if job.error_result:
                logging.error(job.errors)

              self._handle_finished(job=job, id=document.id, config=config,
                                    report_type=T)
              firestore.mark_import_job_complete(document.id, job)

          except Exception as e:
            logging.error(
                f'Error loading job {document.id} for monitoring.')

        break
class SA360Dynamic(ReportFetcher):
  report_type = Type.SA360_RPT
  email = None
  project = None
  profile = None

  def __init__(self, email: str, project: str, append: bool = False,
               infer_schema: bool = False):
    self.email = email
    self.project = project
    self.creds = Credentials(email=email, project=project)
    self.credentials = storage.Client()._credentials
    self.transport = AuthorizedSession(credentials=self.credentials)
    self.append = append
    self.infer_schema = infer_schema
    self.firestore = Firestore(email=email, project=project)
    self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
    self.bucket = f'{self.project}-report2bq-upload'

  def service(self) -> Resource:
    return discovery.get_service(service=Service.SA360, credentials=self.creds)

  def handle_report(self, run_config: Dict[str, Any]) -> bool:
    sa360_service = self.service()
    request = sa360_service.reports().get(reportId=run_config['file_id'])

    try:
      report = request.execute()
      if report['isReportReady']:
        report_config = self.firestore.get_document(
            type=Type.SA360_RPT, id=run_config['report_id'])

        csv_header, _ = self.read_header(report)
        schema = csv_helpers.create_table_schema(csv_header, None)
        report_config['schema'] = schema
        report_config['files'] = report['files']

        if 'dest_project' in run_config:
          report_config['dest_project'] = run_config['dest_project']
        if 'dest_dataset' in run_config:
          report_config['dest_dataset'] = run_config['dest_dataset']
        if 'notify_message' in run_config:
          report_config['notifier']['message'] = run_config['notify_message']

        # update the report details please...
        self.firestore.update_document(Type.SA360_RPT, run_config['report_id'],
                                       report_config)

        # ... then stream the file to GCS a la DV360/CM
        self.stream_to_gcs(report_details=report_config, run_config=run_config)

      return report['isReportReady']

    except Exception as e:
      logging.error(
          f'Report fetch error: Run {run_config["file_id"]} for report '
          f'{run_config["report_id"]}')
      return False

  def read_header(self, report_config: dict) -> list:
    r = urllib.request.Request(report_config['files'][0]['url'])
    for header in self.creds.auth_headers:
      r.add_header(header, self.creds.auth_headers[header])

    with closing(urlopen(r)) as report:
      data = report.read(self.chunk_multiplier * 1024 * 1024)
      bytes_io = BytesIO(data)

    return csv_helpers.get_column_types(bytes_io)

  @measure_memory
  def stream_to_gcs(self, report_details: Dict[str, Any],
                    run_config: Dict[str, Any]) -> None:
    """Multi-threaded stream to GCS

    Arguments:
        report_details (dict): report definition
        run_config (dict): run configuration
    """
    queue = Queue()

    report_id = run_config['report_id']

    # chunk_multiplier is set in the environment, but defaults to 64 - this
    # leads to a 64M chunk size we can throw around. Given the memory
    # constraints of a cloud function this seems like a good, safe number.
    chunk_size = self.chunk_multiplier * 1024 * 1024
    out_file = BytesIO()

    streamer = ThreadedGCSObjectStreamUpload(
        client=Cloud_Storage.client(),
        creds=credentials.Credentials(
            email=self.email, project=self.project).credentials,
        bucket_name=self.bucket,
        blob_name=f'{report_id}.csv',
        chunk_size=chunk_size,
        streamer_queue=queue)
    streamer.start()

    r = urllib.request.Request(report_details['files'][0]['url'])
    for header in self.creds.auth_headers:
      r.add_header(header, self.creds.auth_headers[header])

    with closing(urlopen(r)) as _report:
      _downloaded = 0
      chunk_id = 1
      _report_size = int(_report.headers['content-length'])
      while _downloaded < _report_size:
        chunk = _report.read(chunk_size)
        _downloaded += len(chunk)
        queue.put(chunk)
        chunk_id += 1

    queue.join()
    streamer.stop()
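# The download loop above is a plain producer/consumer split: read fixed-size
# chunks from a stream and hand them to a queue that a background uploader
# drains. A self-contained sketch of the same loop over an in-memory stream:
from io import BytesIO
from queue import Queue

chunk_size = 4
source = BytesIO(b'abcdefghij')
queue = Queue()

downloaded, size = 0, len(source.getvalue())
while downloaded < size:
  chunk = source.read(chunk_size)
  downloaded += len(chunk)
  queue.put(chunk)

print(queue.qsize())  # 3 chunks: b'abcd', b'efgh', b'ij'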
  if args.get('encode_key'):
    key = encode_key(_key)
  else:
    key = _key

  src_data['email'] = _key

  if args.get('local_store'):
    from classes.local_datastore import LocalDatastore
    f = LocalDatastore()

  if args.get('firestore'):
    from classes.firestore import Firestore
    f = Firestore()

  if args.get('secret_manager'):
    from classes.secret_manager import SecretManager
    f = SecretManager(project=_project, email=args.get('email'))

  f.update_document(type=Type._ADMIN, id=key, new_data=src_data)


def main(unused_argv):
  event = {
      'key': FLAGS.key,
      'file': FLAGS.file,
      'encode_key': FLAGS.encode_key,
      'local_store': FLAGS.local,
      'firestore': FLAGS.firestore,
def firestore(self) -> Firestore:
  return Firestore()  # email=self.email, project=self.project)
class RunMonitor(object):
  """Run the process watching running DV360/CM jobs

  This process is triggered by a Cloud Scheduler job every 5 minutes to watch
  the Firestore-held list of running DV360/CM jobs. If one is discovered to
  have completed, the Report2BQ process is invoked in the normal manner (via a
  PubSub message to the trigger queue).

  This process is not 100% necessary; if a report is defined with a "fetcher",
  then the fetcher will run as usual every hour and will pick up the change
  anyway. On the other hand, it allows a user to schedule a quick report to
  run (say) every 30 minutes, and not create a "fetcher" since this process
  takes the "fetcher"'s place.
  """
  firestore = Firestore()
  PS = pubsub.PublisherClient()
  schedules = None

  def process(self, data: Dict[str, Any], context):
    """Check all running reports

    Arguments:
        data {Dict[str, Any]} -- data passed in from the calling function,
            containing the attributes from the calling PubSub message
        context {} -- unused
    """
    self.project = os.environ['GCP_PROJECT']
    report_checker = {
        Type.DV360: self._check_dv360_report,
        Type.CM: self._check_cm_report,
        Type.SA360: self._check_sa360_report,
        Type.SA360_RPT: self._check_sa360_report
    }

    try:
      documents = self.firestore.get_all_running()
      for document in documents:
        with suppress(ValueError):
          run_config = document.get().to_dict()
          T = Type(run_config['type'])
          # config = self.firestore.get_report_config(T, document.id)
          job_config = self._fetch_schedule(type=T, run_config=run_config)
          report_checker[T](run_config=run_config, job_config=job_config)
          # break
        # else:
        #   logging.error(f'Invalid report: {document.get().to_dict()}')

    except Exception as e:
      logging.error(e)

  def _fetch_schedule(self, type: Type,
                      run_config: Dict[str, Any]) -> Dict[str, Any]:
    scheduler = Scheduler()
    (success, job_config) = scheduler.process({
        'action': 'get',
        'project': os.environ['GCP_PROJECT'],
        'email': run_config['email'],
        'html': False,
        'job_id': type.runner(run_config['report_id'])
    })

    return job_config

  def _check_dv360_report(self, job_config: Dict[str, Any],
                          run_config: Dict[str, Any]):
    """Check a running DV360 report for completion

    Arguments:
        job_config {Dict[str, Any]} -- the scheduled job definition
        run_config {Dict[str, Any]} -- the report data structure from Firestore
    """
    job_attributes = job_config['pubsubTarget']['attributes']
    dbm = DBM(email=job_attributes['email'], project=self.project)
    status = dbm.report_state(job_attributes['report_id'])
    append = (job_attributes['append']
              if job_attributes and 'append' in job_attributes else False)

    logging.info('Report {report} status: {status}'.format(
        report=job_attributes['report_id'], status=status))

    if status == 'DONE':
      # Remove job from running
      self.firestore.remove_report_runner(job_attributes['report_id'])

      # Send pubsub to trigger report2bq now
      topic = job_config['pubsubTarget']['topicName']
      self.PS.publish(topic=topic, data=b'RUN', **job_attributes)

    elif status == 'FAILED':
      # Remove job from running
      logging.error(f'Report {run_config["report_id"]} failed!')
      self.firestore.remove_report_runner(run_config['report_id'])

  def _check_cm_report(self, job_config: Dict[str, Any],
                       run_config: Dict[str, Any]):
    """Check a running CM report for completion

    Arguments:
        job_config {Dict[str, Any]} -- the scheduled job definition
        run_config {Dict[str, Any]} -- the report data structure from Firestore
    """
    job_attributes = job_config['pubsubTarget']['attributes']
    dcm = DCM(email=job_attributes['email'], project=self.project,
              profile=job_attributes['profile_id'])
    append = (job_attributes['append']
              if job_attributes and 'append' in job_attributes else False)
    # TODO: Add report_file.id to run_config
    response = dcm.report_state(report_id=job_attributes['report_id'],
                                file_id=run_config['report_file']['id'])
    status = (response['status']
              if response and 'status' in response else 'UNKNOWN')

    logging.info('Report {report} status: {status}'.format(
        report=job_attributes['report_id'], status=status))

    if status == 'REPORT_AVAILABLE':
      # Remove job from running
      self.firestore.remove_report_runner(job_attributes['report_id'])

      # Send pubsub to trigger report2bq now
      topic = 'projects/{project}/topics/report2bq-trigger'.format(
          project=self.project)
      self.PS.publish(topic=topic, data=b'RUN', **job_attributes)

    elif status == 'FAILED' or status == 'CANCELLED':
      # Remove job from running
      logging.error('Report {report} failed!'.format(
          report=job_attributes['report_id']))
      self.firestore.remove_report_runner(job_attributes['report_id'])

  def _check_sa360_report(self, job_config: Dict[str, Any],
                          run_config: Dict[str, Any]):
    sa360 = SA360(email=run_config['email'], project=self.project)

    # Merge configs
    job_attributes = (job_config['pubsubTarget']['attributes']
                      if 'pubsubTarget' in job_config else {})
    config = {**run_config, **job_attributes}

    if sa360.handle_offline_report(run_config=config):
      self.firestore.remove_report_runner(run_config['report_id'])
      logging.info(f'Report {run_config["report_id"]} done.')

    else:
      # SA360 reports can't fail - they won't start if there are errors, so
      # it's just not ready yet. Leave it here and try again later.
      logging.error(f'Report {run_config["report_id"]} not ready.')
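# The SA360 check above merges the scheduled job's attributes over the
# Firestore run config with {**run_config, **job_attributes}; later keys win,
# so a job attribute overrides the stored value. A standalone sketch:
run_config = {'report_id': 'sa360_holiday', 'email': 'owner@example.com'}
job_attributes = {'email': 'scheduler@example.com', 'dest_dataset': 'report2bq'}

config = {**run_config, **job_attributes}
print(config['email'])  # scheduler@example.com - the job attribute wins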
def datastore(self) -> AbstractDatastore:
  """The datastore property."""
  from classes.firestore import Firestore
  return Firestore()
def firestore(self) -> AbstractDatastore:
  return Firestore()
def add(self, firestore: Firestore, report: str, file: str, **unused):
  with open(file) as definition:
    cfg = json.loads(''.join(definition.readlines()))

  Firestore().update_document(Type.SA360_RPT, '_reports', {report: cfg})
class SA360(object):
  def __init__(self, email: str, project: str, append: bool = False,
               infer_schema: bool = False):
    self.email = email
    self.project = project
    self.creds = Credentials(email=email, project=project)
    self.credentials = storage.Client()._credentials
    self.transport = AuthorizedSession(credentials=self.credentials)
    self.append = append
    self.infer_schema = infer_schema
    self.firestore = Firestore(email=email, project=project)
    self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
    self.bucket = f'{self.project}-report2bq-upload'

  def _soupify(self, data: BytesIO) -> BeautifulSoup:
    return BeautifulSoup(data, 'lxml')

  def process(self, bucket: str, report_details: Dict[str, Any]) -> None:
    input_buffer = BytesIO()
    repeater = self._stream_processor(bucket=bucket,
                                      report_details=report_details,
                                      repeatable=False)
    # old_id, report_details['id'] = report_details['id'], f'{report_details["id"]}-repeat'
    # self.upload_report(bucket=bucket, report_details=report_details,
    #                    input_buffer=repeater)
    # report_details['id'] = old_id

  def handle_offline_report(self, run_config: Dict[str, Any]) -> bool:
    sa360_service = DiscoverService.get_service(Service.SA360, self.creds)
    request = sa360_service.reports().get(reportId=run_config['file_id'])

    try:
      report = request.execute()
      if report['isReportReady']:
        report_config = self.firestore.get_report_config(
            type=Type.SA360_RPT, id=run_config['report_id'])

        csv_header, csv_types = self.read_header(report)
        schema = CSVHelpers.create_table_schema(
            csv_header, csv_types if self.infer_schema else None)
        report_config['schema'] = schema
        report_config['files'] = report['files']

        if 'dest_project' in run_config:
          report_config['dest_project'] = run_config['dest_project']
        if 'dest_dataset' in run_config:
          report_config['dest_dataset'] = run_config['dest_dataset']
        if 'notify_topic' in run_config:
          report_config['notifier'] = {
              'topic': run_config['notify_topic'],
          }
          if 'notify_message' in run_config:
            report_config['notifier']['message'] = run_config['notify_message']

        # update the report details please...
        self.firestore.update_document(Type.SA360_RPT, run_config['report_id'],
                                       report_config)

        # ... then stream the file to GCS a la DV360/CM
        self._stream_report_to_gcs(report_details=report_config,
                                   run_config=run_config)

      return report['isReportReady']

    except Exception as e:
      logging.error(
          f'Report fetch error: Run {run_config["file_id"]} for report '
          f'{run_config["report_id"]}')
      return False

  def read_header(self, report_config: dict) -> list:
    r = urllib.request.Request(report_config['files'][0]['url'])
    for header in self.creds.get_auth_headers():
      r.add_header(header, self.creds.get_auth_headers()[header])

    with closing(urlopen(r)) as report:
      data = report.read(self.chunk_multiplier * 1024 * 1024)
      bytes_io = BytesIO(data)

    return CSVHelpers.get_column_types(bytes_io)

  @measure_memory
  def _stream_report_to_gcs(self, report_details: Dict[str, Any],
                            run_config: Dict[str, Any]) -> None:
    """Multi-threaded stream to GCS

    Arguments:
        report_details {dict} -- report definition
        run_config {dict} -- run configuration
    """
    queue = Queue()

    report_id = run_config['report_id']
    chunk_size = self.chunk_multiplier * 1024 * 1024
    out_file = BytesIO()

    streamer = ThreadedGCSObjectStreamUpload(client=Cloud_Storage.client(),
                                             bucket_name=self.bucket,
                                             blob_name=f'{report_id}.csv',
                                             chunk_size=chunk_size,
                                             queue=queue)
    streamer.start()

    r = urllib.request.Request(report_details['files'][0]['url'])
    for header in self.creds.get_auth_headers():
      r.add_header(header, self.creds.get_auth_headers()[header])

    with closing(urlopen(r)) as _report:
      _downloaded = 0
      chunk_id = 1
      _report_size = int(_report.headers['content-length'])
      while _downloaded < _report_size:
        chunk = _report.read(chunk_size)
        _downloaded += len(chunk)
        queue.put((chunk_id, chunk))
        chunk_id += 1

    queue.join()
    streamer.stop()

  @timeit
  def _fetch_data(self, report_details: Dict[str, Any],
                  buffer: BytesIO) -> int:
    try:
      report_url = report_details['url']

      request = requests.Download(report_url, stream=buffer)
      request.consume(transport=self.transport)
      return self._stream_size(buffer)

    except Exception as e:
      logging.error(e)

    return -1

  @timeit
  def _stream_size(self, buffer: BytesIO) -> int:
    pos = buffer.tell()
    buffer.seek(0, SEEK_END)
    size = buffer.tell()
    buffer.seek(pos)
    return size

  def _extract_keys(self, buffer: BytesIO, key: str) -> Tuple[str, BytesIO]:
    b = buffer.getvalue()
    start_pos = b.find((f'<{key}>').encode('utf-8'))
    if start_pos == -1:
      buffer.seek(0)
      extract = None
      new_stream = None
    else:
      end_pos = b.find((f'</{key}>').encode('utf-8'), start_pos)
      buffer.seek(start_pos)
      content = buffer.read(end_pos + len(f'</{key}>') - start_pos)
      extract = content.decode('utf-8')
      new_stream = BytesIO(buffer.read())

    return extract, new_stream

  @timeit
  def _get_connection(self, report_url: str):
    auth_headers = self.creds.get_auth_headers()
    conn = req.get(report_url, stream=True, headers=auth_headers)
    return conn

  def _find_fieldnames(self, buffer: BytesIO) -> Tuple[str, BytesIO]:
    header, buffer = self._extract_keys(buffer=buffer, key='thead')
    if header:
      fieldnames = [
          CSVHelpers.sanitize_string(field)
          for field in re.findall(r'\<th[^>]*\>([^<]*)\<\/th\>', header)
      ]
      # logging.info(f'Fields: {fieldnames}')
      del header
    else:
      fieldnames = None

    return fieldnames, buffer

  def _next_chunk(self, stream,
                  html_chunk_size: int = None) -> Tuple[bytes, bool]:
    _buffer = BytesIO()
    last_chunk = False
    while len(_buffer.getvalue()) < html_chunk_size and not last_chunk:
      try:
        _block = stream.__next__()
        if _block:
          _buffer.write(_block)
      except StopIteration:
        last_chunk = True

    return _buffer.getvalue(), last_chunk

  @measure_memory
  def _stream_processor(self, bucket: str, report_details: Dict[str, Any],
                        repeatable: bool = False) -> BytesIO:
    repeater = BytesIO()
    report_url = report_details['url']
    remainder = b''
    queue = Queue()
    output_buffer = StringIO()
    html_chunk_size = 2048 * 1024
    chunk_size = 1024 * 1024
    streamer = ThreadedGCSObjectStreamUpload(
        client=Cloud_Storage.client(credentials=self.creds),
        bucket_name=bucket,
        blob_name='{id}.csv'.format(id=report_details['id']),
        chunk_size=chunk_size,
        queue=queue)
    streamer.daemon = True
    streamer.start()

    try:
      chunk_id = 0
      conn = self._get_connection(report_url)
      _stream = conn.iter_content(chunk_size=html_chunk_size)
      source_size = 0

      done = False
      fieldnames = None

      while not done:
        # logging.info(f'Processing chunk {chunk_id}')
        # logging.info(f'Processing chunk {chunk_id}, remainder {remainder.decode("utf-8")}')
        chunk = BytesIO()
        chunk.write(remainder)
        remainder = b''

        block, done = self._next_chunk(_stream, html_chunk_size)
        source_size += len(block)
        # logging.info(f'{len(block):,}, begins {block[0:80]} : ends {block[-80:].decode("utf-8")}')
        if repeatable:
          repeater.write(block)

        chunk.write(block)
        if len(chunk.getvalue()) < html_chunk_size and not done:
          continue

        # logging.info(f'Chunk size {len(chunk.getvalue()):,} bytes')
        chunk.seek(0)

        if chunk_id == 0:
          fieldnames, chunk = self._find_fieldnames(buffer=chunk)

        # find last </tr> on any section but the last, chop off the last
        # portion and store
        last_tr_pos = chunk.getvalue().rfind(b'</tr>')
        if last_tr_pos == -1:
          # logging.debug(f'HALP! {chunk.getvalue()}')
          remainder = chunk.getvalue()
          continue

        else:
          last_tr_pos += 5
          chunk.seek(last_tr_pos)
          remainder = chunk.read()
          # logging.debug(f'Remainder: {remainder}')
          chunk.truncate(last_tr_pos)

        rows = []
        while True:
          tr, chunk = self._extract_keys(chunk, 'tr')
          if chunk:
            rows.append([
                unescape(field)
                for field in re.findall(r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
            ])
          else:
            break

        # queue for upload
        report_data = []
        for row in rows:
          report_data.append(dict(zip(fieldnames, row)))

        writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
        if chunk_id == 0:
          writer.writeheader()

        [writer.writerow(row) for row in report_data]

        output_buffer.seek(0)
        # logging.info(f'Sending chunk {chunk_id} size {len(output_buffer.getvalue())}')
        queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
        chunk_id += 1
        chunk = BytesIO()
        output_buffer.seek(0)
        output_buffer.truncate(0)

      logging.info(f'SA360 report length: {source_size:,} bytes')
      queue.join()
      streamer.stop()
      report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)
      return repeater

    except Exception as e:
      logging.error(e)

  @measure_memory
  def upload_report(self, bucket: str, report_details: Dict[str, Any],
                    input_buffer: BytesIO = None):
    output_buffer = StringIO()  # BytesIO()

    try:
      if not input_buffer:
        input_buffer = BytesIO()
        request = requests.Download(report_details['url'], stream=input_buffer)
        request.consume(transport=self.transport)
        logging.info('Report data size: {bytes}'.format(bytes=0))

      input_buffer.seek(0)
      soup = self._soupify(input_buffer)
      # del input_buffer

      headers = soup.find('thead').find_all('th')
      fieldnames = []
      for header in headers:
        fieldnames.append(CSVHelpers.sanitize_string(header.string))

      rows = soup.find('tbody').find_all('tr')
      report_data = []
      for row in rows:
        data = []
        for col in row.contents:
          data.append(col.string)
        report_data.append(dict(zip(fieldnames, data)))

      writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
      writer.writeheader()

      for row in report_data:
        writer.writerow(row)

      output_buffer.seek(0)
      Cloud_Storage.write_file(bucket=bucket,
                               file=f"{report_details['id']}.csv",
                               data=output_buffer.getvalue())
      report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)

    except Exception as e:
      logging.error(e)
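# Tiny demonstration of the row-extraction technique _stream_processor relies
# on: pull the cells out of one <tr>...</tr> span with the same regex, then
# unescape any HTML entities. The row content here is made up for illustration.
import re
from html import unescape

row = '<tr><td>brand &amp; generic</td><td>1,234</td></tr>'
cells = [unescape(field)
         for field in re.findall(r'\<td[^>]*\>([^<]*)\<\/td\>', row)]
print(cells)  # ['brand & generic', '1,234']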
class ReportLoader(object):
  """Run the report loading process

  This performs the CSV import into BQ. It is triggered by a finalize/create
  on a monitored GCS bucket, and will ONLY process CSVs. All other files
  written to that bucket will result in an error in the logs. The file must
  be named the same as the report id that is stored in Firestore - this is
  how the process knows which table/schema to use.

  Once started, the BQ Import Job (of type google.cloud.bigquery.LoadJob) is
  stored in Firestore, under the 'jobs' key. This is then monitored for
  completion by JobMonitor.
  """
  CS = storage.Client()    # uses default service account credentials
  FIRESTORE = Firestore()  # uses default service account credentials

  def process(self, data: Dict[str, Any], context):
    """Process an added file

    This is the entry point for the Cloud Function to create the BQ import
    job.

    Arguments:
        data {Dict[str, Any]} -- data sent from the PubSub message
        context {Dict[str, Any]} -- context data. unused
    """
    logging.info(data)
    bucket_name = data['bucket']
    file_name = data['name']

    if file_name.upper().endswith('CSV'):
      logging.info('Processing CSV file %s' % file_name)
      try:
        self._handle_csv(bucket_name, file_name)
      except Exception as e:
        logging.error('Error processing file %s\n%s' % (file_name, e))

    else:
      # Ignore it, it's probably the schema
      logging.warn('File added that will not be processed: %s' % file_name)

  def _get_report_config(self, id: str) -> (Type, Dict[str, Any]):
    """Fetch the report configuration

    Load the stored report configuration from Firestore and return the report
    type and config as a tuple.

    Arguments:
        id {str} -- report id, aka the CSV file name

    Returns:
        (Type, Dict[str, Any]) -- tuple containing the report type as an
            Enum, and the report configuration.
    """
    config = None
    for config_type in [Type.DV360, Type.CM, Type.SA360, Type.SA360_RPT]:
      config = self.FIRESTORE.get_report_config(config_type, id)
      if config:
        return config_type, config

    return None, None

  def _handle_csv(self, bucket_name: str, file_name: str):
    """Handle the CSV file

    Work out which type of job it is and send it to the appropriate uploader.

    Arguments:
        bucket_name {str} -- name of the source bucket
        file_name {str} -- name of the CSV file
    """
    # Load config file. Must be present to continue
    # This could be either DBM/DV360 or (D)CM
    report_id = file_name.split('/')[-1].split('.')[0]
    config_type, config = self._get_report_config(report_id)

    if not config_type:
      self._email_error(f'No config found for report {report_id}')
      raise Exception(f'No config found for report {report_id}')

    logging.info(config)

    # Insert with schema and table name from config
    if config_type == Type.DV360:
      job = self._import_dbm_report(bucket_name, file_name, config)
    elif config_type == Type.CM:
      job = self._import_dcm_report(bucket_name, file_name, config)
    elif config_type == Type.SA360:
      job = self._import_sa360_report(bucket_name, file_name, config)
    elif config_type == Type.SA360_RPT:
      job = self._import_sa360_report(bucket_name, file_name, config)

    # Store the completed job in Firestore
    if job:
      self.FIRESTORE.store_import_job_details(report_id, job)

  def _import_dbm_report(self, bucket_name, file_name,
                         config) -> bigquery.LoadJob:
    """Begin DV360 import

    These functions are identical, but need not be (and used not to be), to
    reflect the fact that at some point each product's CSVs could be subtly
    different, or that one product or another may switch from CSV to (say)
    JSON.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config

    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)

  def _import_dcm_report(self, bucket_name, file_name, config):
    """Begin CM import

    These functions are identical, but need not be (and used not to be), to
    reflect the fact that at some point each product's CSVs could be subtly
    different, or that one product or another may switch from CSV to (say)
    JSON.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config

    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)

  def _import_sa360_report(self, bucket_name, file_name, config):
    """Begin SA360 import

    These functions are identical, but need not be (and used not to be), to
    reflect the fact that at some point each product's CSVs could be subtly
    different, or that one product or another may switch from CSV to (say)
    JSON.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config

    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)

  def _import_report(self, bucket_name: str, file_name: str,
                     config: dict) -> bigquery.LoadJob:
    """Begin CSV import

    Create and start the Big Query import job.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config

    Returns:
        bigquery.LoadJob
    """
    if config.get('dest_project'):
      # authenticate against supplied project with supplied key
      project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
      client_key = json.loads(Cloud_Storage.fetch_file(
          bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
          file=f"{config['email']}_user_token.json"
      ))
      server_key = json.loads(Cloud_Storage.fetch_file(
          bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
          file='client_secrets.json'
      ))
      client_key['client_id'] = (
          server_key.get('web') or
          server_key.get('installed')).get('client_id')
      client_key['client_secret'] = (
          server_key.get('web') or
          server_key.get('installed')).get('client_secret')
      logging.info(client_key)
      creds = Credentials.from_authorized_user_info(client_key)
      bq = bigquery.Client(project=project, credentials=creds)

    else:
      project = os.environ.get('GCP_PROJECT')
      bq = bigquery.Client()

    dataset = (config.get('dest_dataset') or
               os.environ.get('BQ_DATASET') or 'report2bq')

    table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
    logging.info(
        f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

    json_schema = config['schema']
    schema = []
    _json_schema = []
    # Build the json format schema that the BQ LoadJob requires from the
    # text-based ones in the config
    for field in json_schema:
      f = bigquery.schema.SchemaField(name=field['name'],
                                      field_type=field['type'],
                                      mode=field['mode'])
      schema.append(f)
      _json_schema.append(f'{field["name"]}: {field["type"]}')

    table_ref = bq.dataset(dataset).table(table_name)

    # Default action is to completely replace the table each time. If
    # requested, however, we can do an append for (say) huge jobs where you
    # would see the table with 60 days once and then append 'yesterday' each
    # day.
    if config.get('append', False):
      if (self._table_exists(bq, table_ref) and
          not self._validate_schema(bq, table_ref, schema)):
        config_schema = '\n'.join(
            [f'{field.name}, {field.field_type}' for field in schema])
        target_schema = '\n'.join(
            [f'{field.name}, {field.field_type}'
             for field in bq.get_table(table_ref).schema])
        self._email_error(
            email=config['email'],
            message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
'''
        )
        logging.error(
            f"Mismatched schema for {project}.{dataset}.{table_name}, "
            "trying anyway")

      import_type = bigquery.WriteDisposition.WRITE_APPEND

    else:
      import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = import_type
    # Assume a CSV header is the first line unless otherwise specified in the
    # report's own config
    job_config.skip_leading_rows = config.get('csv_header_length', 1)
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.schema = schema
    # Allow a few errors, just in case
    job_config.max_bad_records = 10
    # Allow for DV360/CM (SA360 won't) to pass jagged rows, which they do
    job_config.allow_jagged_rows = True

    uri = f'gs://{bucket_name}/{file_name}'
    load_job = bq.load_table_from_uri(uri, table_ref,
                                      job_config=job_config)  # API request
    logging.info(f'Starting CSV import job {load_job.job_id}')

    return load_job

  def _table_exists(self, bq: bigquery.Client,
                    table_ref: bigquery.TableReference) -> bool:
    try:
      bq.get_table(table_ref)
      return True

    except NotFound:
      return False

  def _validate_schema(self, bq: bigquery.Client,
                       table_ref: bigquery.TableReference,
                       schema: List[bigquery.schema.SchemaField]) -> bool:
    _table = bq.get_table(table_ref)
    _schema = _table.schema

    return _schema == schema

  def _email_error(self, message: str, email: str=None,
                   error: Exception=None) -> None:
    _to = [email] if email else []
    _administrator = (
        os.environ.get('ADMINISTRATOR_EMAIL') or
        self.FIRESTORE.get_document(Type._ADMIN, 'admin').get('email'))
    _cc = [_administrator] if _administrator else []

    if _to or _cc:
      message = GMailMessage(
          to=_to,
          cc=_cc,
          subject=f'Error in report_loader',
          body=f'''
{message}

Error: {error if error else 'No exception.'}
''',
          project=os.environ.get('GCP_PROJECT'))

      GMail().send_message(
          message=message,
          credentials=Report2BQCredentials(
              email=email, project=os.environ.get('GCP_PROJECT'))
      )
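# Sketch of the schema translation _import_report performs: the Firestore
# config carries a plain-text schema; BigQuery's load job wants SchemaField
# objects. The field list here is a made-up example.
from google.cloud import bigquery

json_schema = [
    {'name': 'date', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'impressions', 'type': 'INT64', 'mode': 'NULLABLE'},
]

schema = [
    bigquery.schema.SchemaField(name=field['name'],
                                field_type=field['type'],
                                mode=field['mode'])
    for field in json_schema
]

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.source_format = bigquery.SourceFormat.CSV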