def upload_report(self, bucket: str, report_details: Dict[str, Any],
                  input_buffer: BytesIO = None):
  output_buffer = StringIO()

  try:
    # If no pre-fetched buffer was supplied, download the report now.
    if not input_buffer:
      input_buffer = BytesIO()
      request = requests.Download(report_details['url'], stream=input_buffer)
      request.consume(transport=self.transport)
      logging.info('Report data size: %d bytes', len(input_buffer.getvalue()))

    input_buffer.seek(0)
    soup = self._soupify(input_buffer)

    # Pull the column names from the table header and sanitize them.
    headers = soup.find('thead').find_all('th')
    fieldnames = [CSVHelpers.sanitize_string(header.string) for header in headers]

    # Convert each table row into a dict keyed by the sanitized field names.
    rows = soup.find('tbody').find_all('tr')
    report_data = []
    for row in rows:
      data = [col.string for col in row.contents]
      report_data.append(dict(zip(fieldnames, data)))

    # Re-emit the rows as CSV and write the result to GCS.
    writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
    writer.writeheader()
    for row in report_data:
      writer.writerow(row)

    output_buffer.seek(0)
    Cloud_Storage.write_file(bucket=bucket,
                             file=f"{report_details['id']}.csv",
                             data=output_buffer.getvalue())
    report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)

  except Exception as e:
    logging.error(e)
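# `_soupify` is not shown here; the working assumption is simply that it parses
# the downloaded HTML report into a BeautifulSoup tree. A minimal sketch
# (illustrative only; the parser choice is an assumption, not the project's
# confirmed implementation):
from bs4 import BeautifulSoup

def _soupify(self, data: BytesIO) -> BeautifulSoup:
  # BeautifulSoup accepts a file-like object directly.
  return BeautifulSoup(data, 'html.parser')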
def read_header(self, report_details: dict) -> Tuple[List[str], List[str]]:
  if 'current_path' not in report_details:
    return (None, None)

  with closing(urlopen(report_details['current_path'])) as report:
    data = report.read(self.chunk_multiplier * 1024 * 1024)
    bytes_io = io.BytesIO(data)

  return CSVHelpers.get_column_types(bytes_io)
def read_header(self, report_config: dict) -> Tuple[List[str], List[str]]:
  r = urllib.request.Request(report_config['files'][0]['url'])

  # Attach the OAuth headers needed to fetch the report file.
  for key, value in self.creds.get_auth_headers().items():
    r.add_header(key, value)

  with closing(urlopen(r)) as report:
    data = report.read(self.chunk_multiplier * 1024 * 1024)
    bytes_io = BytesIO(data)

  return CSVHelpers.get_column_types(bytes_io)
def read_header(self, report_details: dict) -> Tuple[List[str], List[str]]:
  if 'report_file' not in report_details:
    return (None, None)

  # Read the first chunk of the report and skip any preamble that appears
  # before the CSV data proper.
  data = self.read_data_chunk(report_details, 163840)
  bytes_io = io.BytesIO(data)
  csv_start = self.find_first_data_byte(bytes_io.getvalue())
  if csv_start == -1:
    bytes_io.seek(0)
  else:
    bytes_io.seek(csv_start)

  return CSVHelpers.get_column_types(io.BytesIO(bytes_io.read()))
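# For context: `find_first_data_byte` is used above but not shown. A minimal
# sketch of what it might do, assuming the report file prefixes the CSV payload
# with a 'Report Fields' marker line (the marker string and return convention
# are assumptions, not the project's confirmed implementation):
def find_first_data_byte(self, data: bytes) -> int:
  # Locate the marker that precedes the CSV header row; -1 means "not found",
  # in which case the caller simply starts at byte 0.
  marker = b'Report Fields\n'
  position = data.find(marker)
  return -1 if position == -1 else position + len(marker)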
def _find_fieldnames(self, buffer: BytesIO) -> Tuple[str, BytesIO]:
  # Pull the <thead> section out of the HTML report and derive the CSV field
  # names from its <th> cells.
  header, buffer = self._extract_keys(buffer=buffer, key='thead')
  if header:
    fieldnames = [
      CSVHelpers.sanitize_string(field)
      for field in re.findall(r'\<th[^>]*\>([^<]*)\<\/th\>', header)
    ]
    del header
  else:
    fieldnames = None

  return fieldnames, buffer
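# `_extract_keys` is referenced here and in the stream processor below but is
# not shown. A minimal sketch of the assumed behaviour -- cut the first
# <key>...</key> element out of the buffer, return its markup plus a new buffer
# holding whatever follows -- is given below; it is an illustration, not the
# project's confirmed implementation:
def _extract_keys(self, buffer: BytesIO, key: str) -> Tuple[str, BytesIO]:
  raw = buffer.getvalue()
  start_pos = raw.find(f'<{key}'.encode('utf-8'))
  if start_pos == -1:
    # No such element left: signal the caller to stop by returning no buffer.
    buffer.seek(0)
    return None, None

  end_tag = f'</{key}>'.encode('utf-8')
  end_pos = raw.find(end_tag, start_pos)
  buffer.seek(start_pos)
  content = buffer.read(end_pos + len(end_tag) - start_pos).decode('utf-8')
  # Everything after the closing tag becomes the new working buffer.
  return content, BytesIO(buffer.read())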
def handle_offline_report(self, run_config: Dict[str, Any]) -> bool:
  sa360_service = DiscoverService.get_service(Service.SA360, self.creds)
  request = sa360_service.reports().get(reportId=run_config['file_id'])

  try:
    report = request.execute()
    if report['isReportReady']:
      report_config = self.firestore.get_report_config(
        type=Type.SA360_RPT, id=run_config['report_id'])

      csv_header, csv_types = self.read_header(report)
      schema = CSVHelpers.create_table_schema(
        csv_header, csv_types if self.infer_schema else None)
      report_config['schema'] = schema
      report_config['files'] = report['files']

      if 'dest_project' in run_config:
        report_config['dest_project'] = run_config['dest_project']
      if 'dest_dataset' in run_config:
        report_config['dest_dataset'] = run_config['dest_dataset']
      if 'notify_topic' in run_config:
        report_config['notifier'] = {
          'topic': run_config['notify_topic'],
        }
        if 'notify_message' in run_config:
          report_config['notifier']['message'] = run_config['notify_message']

      # Update the report details...
      self.firestore.update_document(Type.SA360_RPT, run_config['report_id'],
                                     report_config)

      # ...then stream the file to GCS, as for DV360/CM.
      self._stream_report_to_gcs(report_details=report_config,
                                 run_config=run_config)

    return report['isReportReady']

  except Exception as e:
    logging.error(
      f'Report fetch error: Run {run_config["file_id"]} for report '
      f'{run_config["report_id"]}: {e}')
    return False
def handle_report_fetcher(self, fetcher: ReportFetcher):
  # Get the latest report file for this report id.
  report_object = fetcher.get_latest_report_file(self.report_id)

  # Fetch the stored report configuration.
  report_data = fetcher.fetch_report_config(report_object=report_object,
                                            report_id=self.report_id)

  # If the report has not changed since the last run (and no force flag is
  # set), there is nothing to do.
  last_report = self.firestore.get_report_config(fetcher.report_type,
                                                 self.report_id)
  if last_report:
    if report_data['last_updated'] == last_report['last_updated'] \
        and not self.force:
      logging.info('No change: ignoring.')
      return

  # Normalize the report details and fold in the runner's own settings.
  report_data = fetcher.normalize_report_details(report_object=report_object,
                                                 report_id=self.report_id)
  report_data['email'] = self.email
  report_data['append'] = self.append

  if self.dest_project:
    report_data['dest_project'] = self.dest_project
  if self.dest_dataset:
    report_data['dest_dataset'] = self.dest_dataset
  if self.notify_topic:
    report_data['notifier'] = {
      'topic': self.notify_topic,
    }
    if self.notify_message:
      report_data['notifier']['message'] = self.notify_message

  # If a report file exists, sniff its header to build the BQ schema and
  # stream the file to GCS.
  if report_object:
    csv_header, csv_types = fetcher.read_header(report_data)
    if csv_header:
      schema = CSVHelpers.create_table_schema(
        csv_header, csv_types if self.infer_schema else None)
      report_data['schema'] = schema

    fetcher.stream_to_gcs(f'{self.project}-report2bq-upload', report_data)

  self.firestore.store_report_config(fetcher.report_type, self.report_id,
                                     report_data)
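# `handle_report_fetcher` relies only on a small surface of the fetcher passed
# in. A sketch of the interface it assumes, with names taken from the calls
# above; the exact base class in the project may differ:
class ReportFetcher:
  report_type: Type

  def get_latest_report_file(self, report_id: str) -> Dict[str, Any]: ...
  def fetch_report_config(self, report_object: Dict[str, Any],
                          report_id: str) -> Dict[str, Any]: ...
  def normalize_report_details(self, report_object: Dict[str, Any],
                               report_id: str) -> Dict[str, Any]: ...
  def read_header(self,
                  report_data: Dict[str, Any]) -> Tuple[List[str], List[str]]: ...
  def stream_to_gcs(self, bucket: str, report_data: Dict[str, Any]) -> None: ...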
def _import_report(self, bucket_name: str, file_name: str,
                   config: dict) -> bigquery.LoadJob:
  """Begin CSV import

  Create and start the Big Query import job.

  Arguments:
      bucket_name {str} -- GCS bucket name
      file_name {str} -- CSV file name
      config {Dict[str, Any]} -- report config

  Returns:
      bigquery.LoadJob
  """
  if config.get('dest_project'):
    # Authenticate against the supplied project with the supplied key.
    project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
    client_key = json.loads(Cloud_Storage.fetch_file(
      bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
      file=f"{config['email']}_user_token.json"
    ))
    server_key = json.loads(Cloud_Storage.fetch_file(
      bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
      file='client_secrets.json'
    ))
    client_key['client_id'] = (server_key.get('web') or server_key.get('installed')).get('client_id')
    client_key['client_secret'] = (server_key.get('web') or server_key.get('installed')).get('client_secret')
    logging.info(client_key)
    creds = Credentials.from_authorized_user_info(client_key)
    bq = bigquery.Client(project=project, credentials=creds)

  else:
    project = os.environ.get('GCP_PROJECT')
    bq = bigquery.Client()

  dataset = config.get('dest_dataset') or os.environ.get('BQ_DATASET') or 'report2bq'
  table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
  logging.info(f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

  json_schema = config['schema']
  schema = []
  _json_schema = []
  # Build the json format schema that the BQ LoadJob requires from the
  # text-based ones in the config.
  for field in json_schema:
    f = bigquery.schema.SchemaField(name=field['name'],
                                    field_type=field['type'],
                                    mode=field['mode'])
    schema.append(f)
    _json_schema.append(f'{field["name"]}: {field["type"]}')

  table_ref = bq.dataset(dataset).table(table_name)

  # Default action is to completely replace the table each time. If requested,
  # however, we can do an append for (say) huge jobs where you would see the
  # table with 60 days once and then append 'yesterday' each day.
  if config.get('append', False):
    if self._table_exists(bq, table_ref) and not self._validate_schema(bq, table_ref, schema):
      config_schema = '\n'.join([f'{field.name}, {field.field_type}' for field in schema])
      target_schema = '\n'.join([f'{field.name}, {field.field_type}' for field in bq.get_table(table_ref).schema])
      self._email_error(
        email=config['email'],
        message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
'''
      )
      logging.error(f"Mismatched schema for {project}.{dataset}.{table_name}, trying anyway")

    import_type = bigquery.WriteDisposition.WRITE_APPEND

  else:
    import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

  job_config = bigquery.LoadJobConfig()
  job_config.write_disposition = import_type
  # Assume the CSV header is the first line unless otherwise specified in the
  # report's own config.
  job_config.skip_leading_rows = config.get('csv_header_length', 1)
  job_config.source_format = bigquery.SourceFormat.CSV
  job_config.schema = schema
  # Allow a few errors, just in case.
  job_config.max_bad_records = 10
  # Allow for DV360/CM (SA360 won't) to pass jagged rows, which they do.
  job_config.allow_jagged_rows = True

  uri = f'gs://{bucket_name}/{file_name}'
  load_job = bq.load_table_from_uri(uri, table_ref, job_config=job_config)  # API request
  logging.info(f'Starting CSV import job {load_job.job_id}')

  return load_job
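# `_table_exists` and `_validate_schema` are used above but not shown. Minimal
# sketches under the assumption that a schema "matches" when the (name, type)
# pairs line up; the project's real helpers may be stricter or looser:
from google.cloud.exceptions import NotFound

def _table_exists(self, bq: bigquery.Client,
                  table_ref: bigquery.TableReference) -> bool:
  try:
    bq.get_table(table_ref)
    return True
  except NotFound:
    return False

def _validate_schema(self, bq: bigquery.Client,
                     table_ref: bigquery.TableReference,
                     schema: List[bigquery.schema.SchemaField]) -> bool:
  # Compare the configured schema against the live table, field by field.
  table_schema = bq.get_table(table_ref).schema
  expected = [(f.name, f.field_type) for f in schema]
  actual = [(f.name, f.field_type) for f in table_schema]
  return expected == actual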
def _stream_processor(self, bucket: str, report_details: Dict[str, Any],
                      repeatable: bool = False) -> BytesIO:
  repeater = BytesIO()
  report_url = report_details['url']
  remainder = b''
  queue = Queue()
  output_buffer = StringIO()
  html_chunk_size = 2048 * 1024
  chunk_size = 1024 * 1024
  streamer = ThreadedGCSObjectStreamUpload(
    client=Cloud_Storage.client(credentials=self.creds),
    bucket_name=bucket,
    blob_name='{id}.csv'.format(id=report_details['id']),
    chunk_size=chunk_size,
    queue=queue)
  streamer.daemon = True
  streamer.start()

  try:
    chunk_id = 0
    conn = self._get_connection(report_url)
    _stream = conn.iter_content(chunk_size=html_chunk_size)
    source_size = 0

    done = False
    fieldnames = None

    while not done:
      # Start the working chunk with any partial row left over from the
      # previous pass.
      chunk = BytesIO()
      chunk.write(remainder)
      remainder = b''

      block, done = self._next_chunk(_stream, html_chunk_size)
      source_size += len(block)
      if repeatable:
        repeater.write(block)

      chunk.write(block)
      if len(chunk.getvalue()) < html_chunk_size and not done:
        continue

      chunk.seek(0)

      if chunk_id == 0:
        fieldnames, chunk = self._find_fieldnames(buffer=chunk)

      # Find the last </tr> in any chunk but the last, chop off the trailing
      # partial row and store it for the next pass.
      last_tr_pos = chunk.getvalue().rfind(b'</tr>')
      if last_tr_pos == -1:
        remainder = chunk.getvalue()
        continue

      else:
        last_tr_pos += 5
        chunk.seek(last_tr_pos)
        remainder = chunk.read()
        chunk.truncate(last_tr_pos)

      rows = []
      while True:
        tr, chunk = self._extract_keys(chunk, 'tr')
        if chunk:
          rows.append([
            unescape(field)
            for field in re.findall(r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
          ])
        else:
          break

      # Convert the rows to CSV and queue them for upload.
      report_data = []
      for row in rows:
        report_data.append(dict(zip(fieldnames, row)))

      writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
      if chunk_id == 0:
        writer.writeheader()

      writer.writerows(report_data)

      output_buffer.seek(0)
      queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
      chunk_id += 1
      chunk = BytesIO()
      output_buffer.seek(0)
      output_buffer.truncate(0)

    logging.info(f'SA360 report length: {source_size:,} bytes')
    queue.join()
    streamer.stop()
    report_details['schema'] = CSVHelpers.create_table_schema(fieldnames)
    return repeater

  except Exception as e:
    logging.error(e)
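# `_next_chunk` drives the loop above but is not shown. A sketch of the assumed
# contract -- accumulate blocks from the response iterator until roughly
# `chunk_size` bytes are buffered or the stream ends, and report whether the
# stream is exhausted -- is below; it is an illustration, not the project's
# confirmed implementation:
def _next_chunk(self, stream, chunk_size: int) -> Tuple[bytes, bool]:
  _buffer = BytesIO()
  last_chunk = False
  while len(_buffer.getvalue()) < chunk_size and not last_chunk:
    try:
      _buffer.write(next(stream))
    except StopIteration:
      # The HTTP response iterator is exhausted; tell the caller we are done.
      last_chunk = True
  return _buffer.getvalue(), last_chunk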