Code example #1
    def upload_report(self,
                      bucket: str,
                      report_details: Dict[str, Any],
                      input_buffer: BytesIO = None):
        output_buffer = StringIO()  # csv.DictWriter writes str, so a text buffer is needed here

        try:
            if not input_buffer:
                # No buffer was supplied, so stream the report into a new one.
                input_buffer = BytesIO()
                request = requests.Download(report_details['url'],
                                            stream=input_buffer)
                request.consume(transport=self.transport)
                logging.info('Report data size: {bytes}'.format(
                    bytes=len(input_buffer.getvalue())))

            input_buffer.seek(0)
            soup = self._soupify(input_buffer)
            # del input_buffer

            headers = soup.find('thead').find_all('th')
            fieldnames = []
            for header in headers:
                fieldnames.append(CSVHelpers.sanitize_string(header.string))

            rows = soup.find('tbody').find_all('tr')
            report_data = []
            for row in rows:
                data = []
                for col in row.contents:
                    data.append(col.string)
                report_data.append(dict(zip(fieldnames, data)))

            writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
            writer.writeheader()

            for row in report_data:
                writer.writerow(row)

            output_buffer.seek(0)
            Cloud_Storage.write_file(bucket=bucket,
                                     file=f"{report_details['id']}.csv",
                                     data=output_buffer.getvalue())
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)

        except Exception as e:
            logging.error(e)
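
A note on the two buffers above: the report is downloaded into a BytesIO (raw HTTP bytes), while output_buffer is a StringIO because csv.DictWriter writes str. A minimal, self-contained sketch of that conversion step, using hypothetical field and row data in place of the soupified report:

import csv
from io import BytesIO, StringIO

# Hypothetical data standing in for the parsed <thead>/<tbody> content above.
fieldnames = ['campaign', 'clicks']
report_data = [{'campaign': 'spring_sale', 'clicks': '42'}]

output_buffer = StringIO()                  # DictWriter needs a text stream
writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(report_data)

# If a bytes payload is needed downstream, encode the CSV text once at the end.
csv_bytes = BytesIO(output_buffer.getvalue().encode('utf-8'))
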
Code example #2
File: dbm.py  Project: NeoTim/report2bq
    def read_header(self, report_details: dict) -> Tuple[List[str], List[str]]:
        if 'current_path' not in report_details:
            return (None, None)

        with closing(urlopen(report_details['current_path'])) as report:
            # Only the first few MB are needed to determine the column types.
            data = report.read(self.chunk_multiplier * 1024 * 1024)
            bytes_io = io.BytesIO(data)

        return CSVHelpers.get_column_types(bytes_io)
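
The point of read_header is to avoid downloading the whole report just to learn its columns: only the first chunk_multiplier MB are fetched and handed to CSVHelpers.get_column_types. A minimal, self-contained sketch of the same header-sniffing idea using only the standard library (the sample data and the naive type guess are illustrative, not the project's implementation):

import csv
import io

def sniff_header(data: bytes):
    """Return (column names, naive column types) from the first chunk of a CSV."""
    reader = csv.reader(io.StringIO(data.decode('utf-8')))
    header = next(reader)
    first_row = next(reader, [])
    types = ['NUMERIC' if value.replace('.', '', 1).isdigit() else 'STRING'
             for value in first_row]
    return header, types

chunk = b'campaign,clicks\nspring_sale,42\n'   # stands in for report.read(...)
print(sniff_header(chunk))                     # (['campaign', 'clicks'], ['STRING', 'NUMERIC'])
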
Code example #3
    def read_header(self, report_config: dict) -> Tuple[List[str], List[str]]:
        r = urllib.request.Request(report_config['files'][0]['url'])
        for header, value in self.creds.get_auth_headers().items():
            r.add_header(header, value)

        with closing(urlopen(r)) as report:
            data = report.read(self.chunk_multiplier * 1024 * 1024)
            bytes_io = BytesIO(data)

        return CSVHelpers.get_column_types(bytes_io)
Code example #4
    def read_header(self, report_details: dict) -> Tuple[List[str], List[str]]:
        if 'report_file' not in report_details:
            return (None, None)

        # Read the first 160KB of the report - enough to hold the header row.
        data = self.read_data_chunk(report_details, 163840)
        bytes_io = io.BytesIO(data)
        # Skip past any preamble before the CSV data; -1 means none was found.
        csv_start = self.find_first_data_byte(bytes_io.getvalue())
        if csv_start == -1:
            bytes_io.seek(0)
        else:
            bytes_io.seek(csv_start)

        return CSVHelpers.get_column_types(io.BytesIO(bytes_io.read()))
Code example #5
    def _find_fieldnames(self, buffer: BytesIO) -> Tuple[str, BytesIO]:
        header, buffer = self._extract_keys(buffer=buffer, key='thead')
        if header:
            fieldnames = [
                CSVHelpers.sanitize_string(field)
                for field in re.findall(r'<th[^>]*>([^<]*)</th>', header)
            ]
            # logging.info(f'Fields: {fieldnames}')
            del header
        else:
            fieldnames = None

        return fieldnames, buffer
Code example #6
    def handle_offline_report(self, run_config: Dict[str, Any]) -> bool:
        sa360_service = DiscoverService.get_service(Service.SA360, self.creds)
        request = sa360_service.reports().get(reportId=run_config['file_id'])

        try:
            report = request.execute()

            if report['isReportReady']:
                report_config = self.firestore.get_report_config(
                    type=Type.SA360_RPT, id=run_config['report_id'])

                csv_header, csv_types = self.read_header(report)
                schema = CSVHelpers.create_table_schema(
                    csv_header, csv_types if self.infer_schema else None)
                report_config['schema'] = schema
                report_config['files'] = report['files']

                if 'dest_project' in run_config:
                    report_config['dest_project'] = run_config['dest_project']
                if 'dest_dataset' in run_config:
                    report_config['dest_dataset'] = run_config['dest_dataset']
                if 'notify_topic' in run_config:
                    report_config['notifier'] = {
                        'topic': run_config['notify_topic'],
                    }
                    if 'notify_message' in run_config:
                        report_config['notifier']['message'] = run_config[
                            'notify_message']

                # update the report details please...
                self.firestore.update_document(Type.SA360_RPT,
                                               run_config['report_id'],
                                               report_config)

                # ... then stream the file to GCS a la DV360/CM
                self._stream_report_to_gcs(report_details=report_config,
                                           run_config=run_config)

            return report['isReportReady']

        except Exception as e:
            logging.error(
                f'Report fetch error: Run {run_config["file_id"]} for report '
                f'{run_config["report_id"]}: {e}')
            return False
Code example #7
File: report2bq.py  Project: NeoTim/report2bq
    def handle_report_fetcher(self, fetcher: ReportFetcher):
        # Get Latest Report
        report_object = fetcher.get_latest_report_file(self.report_id)

        # Normalize Report Details
        report_data = fetcher.fetch_report_config(report_object=report_object,
                                                  report_id=self.report_id)
        last_report = self.firestore.get_report_config(fetcher.report_type,
                                                       self.report_id)

        if last_report:
            if (report_data['last_updated'] == last_report['last_updated']
                    and not self.force):
                logging.info('No change: ignoring.')
                return

        report_data = fetcher.normalize_report_details(
            report_object=report_object, report_id=self.report_id)

        report_data['email'] = self.email
        report_data['append'] = self.append

        if self.dest_project: report_data['dest_project'] = self.dest_project
        if self.dest_dataset: report_data['dest_dataset'] = self.dest_dataset
        if self.notify_topic:
            report_data['notifier'] = {
                'topic': self.notify_topic,
            }
            if self.notify_message:
                report_data['notifier']['message'] = self.notify_message

        if report_object:
            csv_header, csv_types = fetcher.read_header(report_data)
            if csv_header:
                schema = CSVHelpers.create_table_schema(
                    csv_header, csv_types if self.infer_schema else None)

                report_data['schema'] = schema
                fetcher.stream_to_gcs(f'{self.project}-report2bq-upload',
                                      report_data)

        self.firestore.store_report_config(fetcher.report_type, self.report_id,
                                           report_data)
Code example #8
File: report_loader.py  Project: NeoTim/report2bq
  def _import_report(self, bucket_name: str, file_name: str, config: dict) -> bigquery.LoadJob:
    """Begin CSV import

    Create and start the BigQuery import job.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    if config.get('dest_project'):
      # authenticate against supplied project with supplied key
      project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
      client_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file=f"{config['email']}_user_token.json"
      ))
      server_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file='client_secrets.json'
      ))
      client_key['client_id'] = (server_key.get('web') or server_key.get('installed')).get('client_id')
      client_key['client_secret'] = (server_key.get('web') or server_key.get('installed')).get('client_secret')
      logging.info(client_key)
      creds = Credentials.from_authorized_user_info(client_key)
      bq = bigquery.Client(project=project, credentials=creds)

    else:
      project = os.environ.get('GCP_PROJECT')
      bq = bigquery.Client()

    dataset = config.get('dest_dataset') or os.environ.get('BQ_DATASET') or 'report2bq'

    table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
    logging.info(f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

    json_schema = config['schema']
    schema = []
    _json_schema = []
    # Build the json format schema that the BQ LoadJob requires from the text-based ones in the config
    for field in json_schema:
      f = bigquery.schema.SchemaField(name=field['name'],
                                      field_type=field['type'],
                                      mode=field['mode'])
      schema.append(f)
      _json_schema.append(f'{field["name"]}: {field["type"]}')

    table_ref = bq.dataset(dataset).table(table_name)

    # The default action is to completely replace the table each time. If requested,
    # however, we can append instead: useful for (say) huge jobs where the table is
    # loaded with 60 days of data once and 'yesterday' is appended each day.
    if config.get('append', False):
      if self._table_exists(bq, table_ref) and not self._validate_schema(bq, table_ref, schema):
        config_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in schema])
        target_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in bq.get_table(table_ref).schema])
        self._email_error(
          email=config['email'], 
          message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
'''
        )
        logging.error(f"Mismatched schema for {project}.{dataset}.{table_name}, trying anyway")

      import_type = bigquery.WriteDisposition.WRITE_APPEND
      
    else:
      import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = import_type
    # Assume a CSV header is the first line unless otherwise specified in the report's own config
    job_config.skip_leading_rows = config.get('csv_header_length', 1)
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.schema = schema
    # Allow a few errors, just in case
    job_config.max_bad_records = 10
    # Allow jagged rows: DV360/CM produce them (SA360 does not)
    job_config.allow_jagged_rows = True
    
    uri = f'gs://{bucket_name}/{file_name}'
    load_job = bq.load_table_from_uri(
        uri, table_ref, job_config=job_config
    )  # API request
    logging.info(f'Starting CSV import job {load_job.job_id}')

    return load_job
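
_import_report returns the bigquery.LoadJob without waiting for it to finish. A caller that needs the outcome can block on the job; a minimal sketch of that (the loader, bucket_name, file_name and config names are placeholders, not part of the project's API):

import logging

# Hypothetical caller of the method above.
load_job = loader._import_report(bucket_name, file_name, config)
try:
    load_job.result(timeout=600)   # block until BigQuery finishes the load
    logging.info(f'Loaded {load_job.output_rows} rows into {load_job.destination}')
except Exception as e:
    logging.error(f'Load job {load_job.job_id} failed: {e} ({load_job.errors})')
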
Code example #9
    def _stream_processor(self,
                          bucket: str,
                          report_details: Dict[str, Any],
                          repeatable: bool = False) -> BytesIO:
        repeater = BytesIO()
        report_url = report_details['url']
        remainder = b''
        queue = Queue()
        output_buffer = StringIO()
        html_chunk_size = 2048 * 1024
        chunk_size = 1024 * 1024
        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(credentials=self.creds),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_details['id']),
            chunk_size=chunk_size,
            queue=queue)
        streamer.daemon = True
        streamer.start()

        try:
            chunk_id = 0
            conn = self._get_connection(report_url)
            _stream = conn.iter_content(chunk_size=html_chunk_size)
            source_size = 0

            done = False
            fieldnames = None

            while not done:
                # logging.info(f'Processing chunk {chunk_id}')
                # logging.info(f'Processing chunk {chunk_id}, remainder {remainder.decode("utf-8")}')
                chunk = BytesIO()
                chunk.write(remainder)
                remainder = b''

                block, done = self._next_chunk(_stream, html_chunk_size)
                source_size += len(block)
                # logging.info(f'{len(block):,}, begins {block[0:80]} : ends {block[-80:].decode("utf-8")}')
                if repeatable: repeater.write(block)
                chunk.write(block)
                if len(chunk.getvalue()) < html_chunk_size and not done:
                    continue

                # logging.info(f'Chunk size {len(chunk.getvalue()):,} bytes')
                chunk.seek(0)

                if chunk_id == 0:
                    fieldnames, chunk = self._find_fieldnames(buffer=chunk)

                # Find the last complete </tr> in this chunk; anything after it
                # is a partial row, so carve it off and carry it into the next chunk.
                last_tr_pos = chunk.getvalue().rfind(b'</tr>')
                if last_tr_pos == -1:
                    # logging.debug(f'HALP! {chunk.getvalue()}')
                    remainder = chunk.getvalue()
                    continue

                else:
                    last_tr_pos += 5
                    chunk.seek(last_tr_pos)
                    remainder = chunk.read()
                    # logging.debug(f'Remainder: {remainder}')
                    chunk.truncate(last_tr_pos)

                rows = []
                while True:
                    tr, chunk = self._extract_keys(chunk, 'tr')
                    if chunk:
                        rows.append([
                            unescape(field) for field in re.findall(
                                r'<td[^>]*>([^<]*)</td>', tr)
                        ])
                    else:
                        break

                # queue for upload
                report_data = []
                for row in rows:
                    report_data.append(dict(zip(fieldnames, row)))

                writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
                if chunk_id == 0: writer.writeheader()

                writer.writerows(report_data)

                output_buffer.seek(0)
                # logging.info(f'Sending chunk {chunk_id} size {len(output_buffer.getvalue())}')
                queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
                chunk_id += 1
                chunk = BytesIO()
                output_buffer.seek(0)
                output_buffer.truncate(0)

            logging.info(f'SA360 report length: {source_size:,} bytes')
            queue.join()
            streamer.stop()
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)
            return repeater

        except Exception as e:
            logging.error(e)
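
The trickiest part of _stream_processor is that table rows can straddle chunk boundaries: each pass keeps everything after the last closing </tr> as remainder and prepends it to the next chunk. That carry-over logic, isolated into a small self-contained helper (the function name and example data are illustrative, not part of the project):

from io import BytesIO
from typing import Tuple

def split_complete_rows(chunk: BytesIO) -> Tuple[bytes, bytes]:
    """Split a chunk into (complete rows, trailing partial row to carry forward)."""
    data = chunk.getvalue()
    last_tr = data.rfind(b'</tr>')
    if last_tr == -1:
        # No complete row in this chunk yet: carry everything into the next read.
        return b'', data
    cut = last_tr + len(b'</tr>')
    return data[:cut], data[cut:]

complete, remainder = split_complete_rows(
    BytesIO(b'<tr><td>a</td></tr><tr><td>b</td>'))
# complete  == b'<tr><td>a</td></tr>'
# remainder == b'<tr><td>b</td>'
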