Example #1
    def test_create_table_schema(self):
        schema = csv_helpers.create_table_schema(HEADER, TYPES)
        self.assertEqual([
            {
                'name': 'string',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'int',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            },
            {
                'name': 'float',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            },
            {
                'name': 'date',
                'type': 'DATE',
                'mode': 'NULLABLE'
            },
            {
                'name': 'datetime',
                'type': 'DATETIME',
                'mode': 'NULLABLE'
            },
            {
                'name': 'X0x2aSales_Confirm___Revenue___DDA',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ], schema)
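
The dictionaries asserted above follow BigQuery's REST field representation, so a schema produced by create_table_schema can be handed straight to the BigQuery client. A minimal sketch of consuming such a schema, assuming the google-cloud-bigquery library and a hypothetical table ID; this is an illustration, not part of the project under test:

    from google.cloud import bigquery

    def create_bq_table(schema_dicts, table_id='my_project.my_dataset.my_table'):
        """Create a BigQuery table from a list of REST-style field dicts."""
        client = bigquery.Client()
        # SchemaField.from_api_repr accepts exactly the
        # {'name': ..., 'type': ..., 'mode': ...} shape shown in the test above.
        schema = [bigquery.SchemaField.from_api_repr(field) for field in schema_dicts]
        table = bigquery.Table(table_id, schema=schema)
        return client.create_table(table, exists_ok=True)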
Example #2
    def _handle_partitioning(self, report_data: ReportConfig,
                             csv_header: List[str],
                             csv_types: List[str]) -> None:
        def _field_fix(field: Dict[str, str]) -> Dict[str, str]:
            if self._partition == 'infer' and field['type'] in [
                    'DATE', 'DATETIME'
            ]:
                return field
            elif not self._infer_schema:
                field['type'] = 'STRING'
            return field

        schema = list(
            map(_field_fix,
                csv_helpers.create_table_schema(csv_header, csv_types)))
        report_data.schema = schema
        if self._partition == 'infer':
            msg = [f'{F["name"]} - {F["type"]}' for F in schema]
            date_columns = \
              [F['name'] for F in schema if F['type'] in ['DATE', 'DATETIME']]
            if date_columns:
                report_data.partition = report_config.Partitioning.INFER
                report_data.partition_column = date_columns[0]
            else:
                logging.info(
                    'Inferred partitioning requested, but no DATE[TIME] columns '
                    'found in schema: %s', ", ".join(msg))
                if 'partition' in report_data:
                    report_data.pop('partition')
        elif self._partition:
            report_data.partition = report_config.Partitioning.INGESTION
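
To make the inference rule above concrete, here is a small standalone sketch of the same idea (a simplified illustration, not the project's code): DATE/DATETIME columns keep their type and the first one becomes the partition column, while with schema inference off everything else is downgraded to STRING.

    from typing import Dict, List, Optional, Tuple

    def infer_partition(schema: List[Dict[str, str]],
                        infer_schema: bool = False
                        ) -> Tuple[List[Dict[str, str]], Optional[str]]:
        """Return the (possibly downgraded) schema and the inferred partition column."""
        fixed = []
        for field in schema:
            if field['type'] in ('DATE', 'DATETIME'):
                fixed.append(field)                        # keep: partition candidate
            elif not infer_schema:
                fixed.append({**field, 'type': 'STRING'})  # downgrade everything else
            else:
                fixed.append(field)
        date_columns = [f['name'] for f in fixed if f['type'] in ('DATE', 'DATETIME')]
        return fixed, (date_columns[0] if date_columns else None)

    # Example: the 'date' column is chosen as the partition column.
    schema, partition_column = infer_partition([
        {'name': 'campaign', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'clicks', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'date', 'type': 'DATE', 'mode': 'NULLABLE'},
    ])
    assert partition_column == 'date'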
Example #3
    def handle_report(self, run_config: Dict[str, Any]) -> bool:
        sa360_service = self.service()
        request = sa360_service.reports().get(reportId=run_config['file_id'])

        try:
            report = request.execute()

            if report['isReportReady']:
                report_config = self.firestore.get_document(
                    type=Type.SA360_RPT, id=run_config['report_id'])

                csv_header, _ = self.read_header(report)
                schema = csv_helpers.create_table_schema(csv_header, None)
                report_config['schema'] = schema
                report_config['files'] = report['files']

                if 'dest_project' in run_config:
                    report_config['dest_project'] = run_config['dest_project']
                if 'dest_dataset' in run_config:
                    report_config['dest_dataset'] = run_config['dest_dataset']
                if 'notify_message' in run_config:
                    report_config['notifier']['message'] = run_config[
                        'notify_message']

                # update the report details please...
                self.firestore.update_document(Type.SA360_RPT,
                                               run_config['report_id'],
                                               report_config)

                # ... then stream the file to GCS a la DV360/CM
                self.stream_to_gcs(report_details=report_config,
                                   run_config=run_config)

            return report['isReportReady']

        except Exception as e:
            logging.error(
                f'Report fetch error: Run {run_config["file_id"]} for report '
                f'{run_config["report_id"]}: {e}')
            return False
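
handle_report returns False both when the report is not yet ready and when the fetch fails, so callers typically poll it. A minimal sketch of such a polling loop, assuming a hypothetical handler instance and run_config; the timeout and interval values are arbitrary:

    import time

    def wait_for_report(handler, run_config, timeout_secs=600, poll_secs=30):
        """Poll handle_report until the report is ready or the timeout expires."""
        deadline = time.monotonic() + timeout_secs
        while time.monotonic() < deadline:
            if handler.handle_report(run_config):
                return True          # report was ready and has been streamed to GCS
            time.sleep(poll_secs)    # not ready yet (or fetch failed); try again
        return False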
Example #4
            #       supported currently.
            #         -- davidharcombe@, 2021/04/09
            if report := response.get('reports'):
                report_json = json.dumps(report[0])
                result = \
                  ga360_report_response.GA360ReportResponse.from_json(report_json)

                # Convert report into CSV - handled by the dataclass itself.
                output_buffer = StringIO()
                result.to_csv(output=output_buffer)

                # Write schema to Firestore - update like any other.
                headers, types = csv_helpers.get_column_types(
                    BytesIO(output_buffer.getvalue().encode('utf-8')))
                schema = \
                  csv_helpers.create_table_schema(column_headers=headers,
                                                  column_types=None)
                runner['schema'] = schema
                self.firestore.update_document(self.report_type,
                                               self._report_id, runner)

                # Stream CSV to GCS. Should be able to use the un-threaded
                # streamer. We look for a 'CHUNK_MULTIPLIER' setting in the
                # environment, like everywhere else, but default to 128, making
                # the standard chunk size we process 128MB. Well within the 4GB
                # we're allowed for a cloud function. If the reports turn out to
                # be bigger than this (which I don't believe GA360 reports will
                # be), we should move to the ThreadedGCSObjectStreamUpload
                # version.
                chunk_size = int(os.environ.get('CHUNK_MULTIPLIER',
                                                128)) * 1024 * 1024
                streamer = GCSObjectStreamUpload(
                    creds=Credentials(email=self._email,
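
As a point of comparison, the chunked upload described in the comment above can also be sketched with the public google-cloud-storage file-like writer rather than the project's GCSObjectStreamUpload; bucket and blob names below are placeholders, and the int() around CHUNK_MULTIPLIER keeps the multiplier numeric even when it comes from the environment:

    import os
    from google.cloud import storage

    def upload_csv(csv_text: str,
                   bucket_name: str = 'my-report-bucket',
                   blob_name: str = 'ga360_report.csv') -> None:
        """Stream a CSV string to GCS in fixed-size chunks."""
        chunk_size = int(os.environ.get('CHUNK_MULTIPLIER', 128)) * 1024 * 1024
        client = storage.Client()
        blob = client.bucket(bucket_name).blob(blob_name, chunk_size=chunk_size)
        # blob.open('wb') returns a file-like writer that flushes to GCS as a
        # resumable upload, one chunk_size-sized piece at a time.
        with blob.open('wb') as gcs_file:
            gcs_file.write(csv_text.encode('utf-8'))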
Example #5
    def stream_to_gcs(self, bucket: str, report_details: ReportConfig) \
      -> Tuple[List[str], List[str]]:
        """Streams the data to Google Cloud Storage.

    This is to allow us to process much larger files than can be easily
    handled in toto in memory. Now we're limited to length of execution (900s)
    rather than size of 'stuff' (<2Gb).

    The response from SA360 is a _nasty_ piece of Microsoft Office format XML
    which has to be parsed and converted to a digestible CSV.

    Raises:
        SA360Exception: A custom SA360 exception because there can be a server
                        error returned when requesting, but the error is in
                        text and the HTTP code returned is _always_ a 200.
                        This is why the function is allowed to retry, as the
                        error is usually transient and caused by a failure to
                        connect to SA360's reporting back end.

    Returns:
        (fieldnames: List[str], fieldtypes: List[str]):
          the field names and types in the report.
    """
        report_url = report_details.url
        remainder = b''
        queue = Queue()
        output_buffer = StringIO()

        # size of pieces of xml we can safely download from the web report.
        html_chunk_size = 2048 * 1024
        chunk_size = self.chunk_multiplier * 1024 * 1024

        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(credentials=self.creds),
            creds=credentials.Credentials(email=self.email,
                                          project=self.project).credentials,
            bucket_name=bucket,
            blob_name=f'{report_details.id}.csv',
            chunk_size=chunk_size,
            streamer_queue=queue)
        streamer.daemon = True
        streamer.start()

        chunk_id = 0
        conn = self.get_connection(report_url)
        _stream = conn.iter_content(chunk_size=html_chunk_size)
        source_size = 0

        first = True
        done = False
        fieldnames = None
        fieldtypes = None

        while not done:
            chunk = BytesIO()
            chunk.write(remainder)
            remainder = b''

            block, done = self.next_chunk(_stream, html_chunk_size)
            source_size += len(block)
            chunk.write(block)
            if len(chunk.getvalue()) < html_chunk_size and not done:
                continue

            chunk.seek(0)

            if first:
                fieldnames, chunk = self.find_fieldnames(buffer=chunk)
                if len(fieldnames) == 1 and fieldnames[0] == 'Error':
                    error = \
                      unescape(re.sub(r'<[^.]+>', '', chunk.getvalue().decode('utf-8')))
                    # logging.error('SA360 Error: %s', error)
                    streamer.stop()
                    raise SA360Exception(error)

            # find last </tr> on any section but the last, chop off the last
            # portion and store
            last_tr_pos = chunk.getvalue().rfind(b'</tr>')
            if last_tr_pos == -1:
                remainder = chunk.getvalue()
                continue

            else:
                last_tr_pos += 5
                chunk.seek(last_tr_pos)
                remainder = chunk.read()
                chunk.truncate(last_tr_pos)

            rows = []
            while True:
                tr, chunk = self.extract_keys(chunk, 'tr')
                if chunk:
                    rows.append([
                        unescape(field) for field in re.findall(
                            r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
                    ])
                else:
                    break

            # queue for upload
            report_data = []
            for row in rows:
                report_data.append(dict(zip(fieldnames, row)))

            writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
            if first:
                writer.writeheader()

            writer.writerows(report_data)

            output_buffer.seek(0)

            if first:
                _, fieldtypes = \
                  csv_helpers.get_column_types(
                    BytesIO(output_buffer.getvalue().encode('utf-8')))

            queue.put(output_buffer.getvalue().encode('utf-8'))
            chunk_id += 1
            first = False
            chunk = BytesIO()
            output_buffer.seek(0)
            output_buffer.truncate(0)

        logging.info(f'SA360 report length: {source_size:,} bytes')
        queue.join()
        streamer.stop()
        report_details.schema = \
          csv_helpers.create_table_schema(fieldnames, fieldtypes)

        return fieldnames, fieldtypes
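
The docstring notes that the function is allowed to retry because SA360 can return a transient error with an HTTP 200. A minimal sketch of such a retry wrapper, assuming a hypothetical downloader instance; SA360Exception is the project's exception class raised above (its import is assumed), and the back-off values are arbitrary:

    import time

    def stream_with_retry(downloader, bucket, report_details,
                          attempts=3, backoff_secs=30):
        """Retry stream_to_gcs on SA360Exception, which signals a transient server error."""
        for attempt in range(1, attempts + 1):
            try:
                return downloader.stream_to_gcs(bucket=bucket,
                                                report_details=report_details)
            except SA360Exception:
                if attempt == attempts:
                    raise                           # give up after the final attempt
                time.sleep(backoff_secs * attempt)  # simple linear back-off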