Example 1
    def _refresh_credentials(self, project_credentials: Dict[str, str],
                             user_token: Dict[str, str]
                             ) -> google.oauth2.credentials.Credentials:
        # Remove top-level element
        secrets = project_credentials[
            'web'] if 'web' in project_credentials else project_credentials[
                'installed']

        # Init credentials
        creds = google.oauth2.credentials.Credentials(
            None,
            refresh_token=user_token['refresh_token'],
            token_uri="https://accounts.google.com/o/oauth2/token",
            client_id=secrets['client_id'],
            client_secret=secrets['client_secret'])

        # Force Refresh token
        creds.refresh(google.auth.transport.requests.Request())
        refresh_token_details = {
            'access_token': creds.token,
            'refresh_token': creds.refresh_token
        }

        Cloud_Storage.write_file(
            bucket=self.bucket,
            file=self.client_token,
            data=json.dumps(refresh_token_details).encode('utf-8'))

        return creds
Example 2
    def _read_json(self, config: ManagerConfiguration) -> List[Dict[str, Any]]:
        """Read the contens of a file as a json object.

    Args:
        config (ManagerConfiguration): the manager configuration

    Returns:
        List[Dict[str, Any]]: the file contents as json
    """
        objects = []

        if config.type == ManagerType.BIG_QUERY:
            query = ManagerInput(config)
            job = query.execute()
            objects = [dict(row) for row in job]

        else:
            if config.file:
                if config.gcs_stored:
                    content = \
                        Cloud_Storage(project=config.project,
                                      email=config.email).fetch_file(bucket=self.bucket,
                                                                     file=config.file)
                    objects = json.loads(content)
                else:
                    with open(config.file) as rpt:
                        objects = json.loads(''.join(rpt.readlines()))

            else:
                objects = self.firestore.list_documents(self.report_type)

        return objects
Example 3
 def token_details(self) -> Dict[str, Any]:
     """The users's refresh and access token."""
     # TODO: Remove the GCS check when fully migrated to Firestore.
     return self.datastore.get_document(type=Type._ADMIN, id='auth',
                                        key=self.encode_key(self._email)) or \
       json.loads(Cloud_Storage.fetch_file(bucket=self.bucket,
                                           file=self.client_token))
Example 4
  def storage(self) -> Cloud_Storage:
    """Fetch the GCS storage client on demand.

    Returns:
        Cloud_Storage: storage client
    """
    return Cloud_Storage()
Example 5
  def __init__(self, 
    email: str, project: str, adh_customer: str,
    adh_query: str, api_key: str, days: int,
    dest_project: str=None, dest_dataset: str=None):
    """Constructor

    Sets up the ADH helper.
    
    Arguments:
        email {str} -- authenticated user email (for the token)
        project {str} -- GCP project
        adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
        adh_query {str} -- ADH query id
        api_key {str} -- API Key (has to be set up in APIs and Libraries in GCP)
        days {int} -- Lookback window (default: 60)
        dest_project {str} -- target GCP project for results
        dest_dataset {str} -- target BQ dataset for results
    """
    self.email = email
    self.project = project
    self.adh_customer = adh_customer
    self.adh_query = adh_query
    self.api_key = api_key
    self.days = days
    self.dest_project = dest_project
    self.dest_dataset = dest_dataset

    self.credentials = Credentials(email=email, project=project)
    self.storage = Cloud_Storage(email=email, project=project)
    self.firestore = Firestore(email=email, project=project)
Example 6
    def project_credentials(self) -> Dict[str, Any]:
        """The project credentials.

    TODO: Remove the GCS check when fully migrated to Firestore."""
        return self.datastore.get_document(type=Type._ADMIN,
                                           id='auth', key='client_secret') or \
          json.loads(Cloud_Storage.fetch_file(bucket=self.bucket,
                                              file='client_secrets.json'))
Example 7
    def upload_report(self,
                      bucket: str,
                      report_details: Dict[str, Any],
                      input_buffer: BytesIO = None):
        output_buffer = StringIO()  #BytesIO()

        try:
            if not input_buffer:
                input_buffer = BytesIO()
                request = requests.Download(report_details['url'],
                                            stream=input_buffer)
                request.consume(transport=self.transport)
                logging.info('Report data size: %s bytes',
                             len(input_buffer.getvalue()))

            input_buffer.seek(0)
            soup = self._soupify(input_buffer)
            # del input_buffer

            headers = soup.find('thead').find_all('th')
            fieldnames = []
            for header in headers:
                fieldnames.append(CSVHelpers.sanitize_string(header.string))

            rows = soup.find('tbody').find_all('tr')
            report_data = []
            for row in rows:
                data = []
                for col in row.contents:
                    data.append(col.string)
                report_data.append(dict(zip(fieldnames, data)))

            writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
            writer.writeheader()

            for row in report_data:
                writer.writerow(row)

            output_buffer.seek(0)
            Cloud_Storage.write_file(bucket=bucket,
                                     file=f"{report_details['id']}.csv",
                                     data=output_buffer.getvalue())
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)

        except Exception as e:
            logging.error(e)
Example 8
 def __init__(self,
              in_cloud: bool = True,
              email: str = None,
              project: str = None):
     """
 Initialize Credential Class
 """
     self.project = project
     self.email = email
     self.bucket = f'{project}-report2bq-tokens'
     self.client_token = f'{email}_user_token.json'
     self.project_credentials = json.loads(
         Cloud_Storage.fetch_file(bucket=self.bucket,
                                  file='client_secrets.json'))
     self.token_details = json.loads(
         Cloud_Storage.fetch_file(bucket=self.bucket,
                                  file=self.client_token))
Example 9
    def validate(self, config: ManagerConfiguration, **unused) -> None:
        sa360_report_definitions = \
            self.firestore.get_document(self.report_type, '_reports')
        validation_results = []

        sa360_objects = self._read_json(config)

        for sa360_object in sa360_objects:
            if sa360_object == '_reports':
                continue
            creds = Credentials(project=config.project,
                                email=sa360_object['email'])
            sa360_service = \
                discovery.get_service(service=Service.SA360, credentials=creds)

            (valid, validation) = \
                self._report_validation(sa360_report_definitions,
                                        sa360_object, sa360_service)
            validation_results.append(validation)

        if validation_results:
            if config.type == ManagerType.BIG_QUERY:
                results = [json.loads(r.to_json()) for r in validation_results]
                # write to BQ
                client = bigquery.Client(project=config.project)
                table = client.dataset(
                    config.dataset).table('sa360_validation')
                job_config = bigquery.LoadJobConfig(
                    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
                    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)

                client.load_table_from_json(results,
                                            table,
                                            job_config=job_config)

            else:
                csv_output = f'{config.email}-<now>-validation.csv'
                if config.gcs_stored:
                    csv_bytes = io.StringIO()
                    writer = csv.DictWriter(csv_bytes,
                                            fieldnames=Validation.keys(),
                                            quoting=csv.QUOTE_ALL)
                    writer.writeheader()
                    writer.writerows([r.to_dict() for r in validation_results])
                    Cloud_Storage(project=config.project,
                                  email=config.email).write_file(
                                      bucket=self.bucket,
                                      file=csv_output,
                                      data=csv_bytes.getvalue())

                else:
                    with open(csv_output, 'w') as csv_file:
                        writer = csv.DictWriter(csv_file,
                                                fieldnames=Validation.keys(),
                                                quoting=csv.QUOTE_ALL)
                        writer.writeheader()
                        writer.writerows(
                            [r.to_dict() for r in validation_results])
Example 10
 def exec_module(self, module: types.ModuleType):
   try:
     # Fetch the code here as string:
     # GCS? BQ? Firestore? All good options
     filename = module.__name__.split('.')[-1]
     code = Cloud_Storage.fetch_file(
         bucket=(f'{os.environ.get("GCP_PROJECT")}-report2bq-postprocessor'),
         file=f'{filename}.py'
     )
     exec(code, vars(module))
   except Exception as e:
     raise ModuleNotFoundError(module.__name__) from e
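For context, the method above is the exec_module half of an import-system loader. A minimal sketch, not from this codebase, of how such a loader is usually wired into the import machinery (the class names and the 'postprocessor.' module prefix are assumptions):

import importlib.abc
import importlib.machinery
import sys
import types


class GCSLoader(importlib.abc.Loader):
  """Hypothetical loader wrapping an exec_module like the one above."""

  def create_module(self, spec) -> types.ModuleType:
    return None  # defer to the default module creation

  def exec_module(self, module: types.ModuleType) -> None:
    ...  # fetch the source (GCS, BQ, Firestore) and exec() it, as shown above


class GCSFinder(importlib.abc.MetaPathFinder):
  """Hypothetical finder routing an assumed 'postprocessor.' prefix to GCSLoader."""

  def find_spec(self, name, path=None, target=None):
    if not name.startswith('postprocessor.'):
      return None
    return importlib.machinery.ModuleSpec(name, GCSLoader())


sys.meta_path.append(GCSFinder())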
Example 11
    def store_credentials(self, creds: credentials.Credentials) -> None:
        """Stores the credentials.

    This function uses the datastore to store the user credentials for later.

    Args:
        creds (credentials.Credentials): the user credentials."""
        # TODO: Remove the GCS write when fully migrated to Firestore.
        if self._email:
            key = self.encode_key(self._email)
            refresh_token_details = {
                'access_token': creds.token,
                'refresh_token': creds.refresh_token,
                '_key': key
            }
            self.datastore.update_document(
                type=Type._ADMIN,
                id='auth',
                new_data={key: json.loads(creds.to_json())})
            Cloud_Storage.write_file(
                bucket=self.bucket,
                file=self.client_token,
                data=json.dumps(refresh_token_details).encode('utf-8'))
Example 12
def report_manager(event: Dict[str, Any], context=None) -> None:
    """Processes files added to the *_report_manager bucket.

  Arguments:
      event (Dict[str, Any]):  data sent from the PubSub message
      context (Dict[str, Any]):  context data. unused
  """
    logging.info(event)
    project = os.environ.get('GCP_PROJECT')

    bucket_name = event['bucket']
    file_name = event['name']
    *n, e = file_name.split('/')[-1].split('.')
    (name, extension) = ('.'.join(n).lower(), e.lower())

    if f := {
            f'{project}-report2bq-ga360-manager': GA360ReportManager,
            f'{project}-report2bq-sa360-manager': SA360Manager,
    }.get(bucket_name):
        logging.info('Processing file %s', file_name)
        try:
            args = {
                'report': name,
                'project': project,
                'file': file_name,
                'gcs_stored': True,
                'action': extension,
            }
            f().manage(**args)
            Cloud_Storage.rename(bucket=bucket_name,
                                 source=file_name,
                                 destination=f'{file_name}.processed')

        except NotImplementedError:
            logging.error(
                'Extension command %s is not a valid action. Ignoring.',
                extension)
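Note that report_manager only reads the 'bucket' and 'name' fields of the incoming event. A hedged local-invocation sketch (the bucket and object names below are made up; GCP_PROJECT is assumed to be set):

# Hypothetical event mirroring the GCS 'finalize' payload fields the
# function actually reads: 'bucket' and 'name'.
event = {
    'bucket': 'my-project-report2bq-sa360-manager',
    'name': 'reports/weekly_spend.install',
}
report_manager(event, context=None)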
Example 13
    def oauth_complete(self, request: Request):
        logging.info(request.args)

        state = request.args.get('state', type=str)
        firestore = Firestore()
        email, project = firestore.get_oauth_state(state)

        project_credentials = json.loads(Files.fetch_file(
            '{project}-report2bq-tokens'.format(project=project),
            'client_secrets.json'))

        _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                             scopes=self.SCOPES)
        _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

        r = urlparse(request.url)
        auth_response = urlunparse(
            ['https', r.netloc, r.path, r.params, r.query, r.fragment])
        _flow.fetch_token(authorization_response=auth_response)

        logging.info(_flow.credentials)

        token_details = {
            'access_token': _flow.credentials.token,
            'refresh_token': _flow.credentials.refresh_token
        }

        Cloud_Storage.write_file(
            '{project}-report2bq-tokens'.format(project=project),
            '{email}_user_token.json'.format(email=email),
            json.dumps(token_details).encode('utf-8'))

        firestore.delete_oauth_state(state=state)

        return 'Ok'
Example 14
    def stream_to_gcs(self, report_details: Dict[str, Any],
                      run_config: Dict[str, Any]) -> None:
        """Multi-threaded stream to GCS

    Arguments:
        bucket (str):  GCS Bucket
        report_details (dict):  Report definition
    """
        queue = Queue()

        report_id = run_config['report_id']

        # chunk_multiplier is set in the environment, but defaults to 64 - this leads to a
        # 64M chunk size we can throw around. Given the memory constraints of a cloud function
        # this seems like a good, safe number.
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = BytesIO()

        streamer = \
          ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(),
            creds=credentials.Credentials(
              email=self.email, project=self.project).credentials,
            bucket_name=self.bucket,
            blob_name=f'{report_id}.csv',
            chunk_size=chunk_size,
            streamer_queue=queue)
        streamer.start()

        r = urllib.request.Request(report_details['files'][0]['url'])
        for header in self.creds.auth_headers:
            r.add_header(header, self.creds.auth_headers[header])

        with closing(urlopen(r)) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                queue.put(chunk)
                chunk_id += 1

        queue.join()
        streamer.stop()
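The chunk_multiplier referenced in the comment above is not initialised in this excerpt; a plausible wiring, with the environment variable name being an assumption, would be:

import os

# Assumed: read the multiplier from the environment, defaulting to 64,
# which with the arithmetic above yields 64MB chunks.
chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
chunk_size = chunk_multiplier * 1024 * 1024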
Example 15
    def _read_email(self, file: str, gcs_stored: bool) -> str:
        """Read an email address from a file.

    Args:
        file (str): the file to process.
        gcs_stored (bool): is the file local or GCS?

    Returns:
        str: the email address
    """
        if gcs_stored:
            email = str(Cloud_Storage().fetch_file(bucket=self.bucket,
                                                   file=file),
                        encoding='utf-8').strip()

        else:
            with open(file, 'r') as _command_file:
                email = _command_file.readline().strip()

        return email
Example 16
    def _stream_report_to_gcs(self, report_details: Dict[str, Any],
                              run_config: Dict[str, Any]) -> None:
        """Multi-threaded stream to GCS
    
    Arguments:
        bucket {str} -- GCS Bucket
        report_details {dict} -- Report definition
    """
        queue = Queue()

        report_id = run_config['report_id']
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = BytesIO()

        streamer = ThreadedGCSObjectStreamUpload(client=Cloud_Storage.client(),
                                                 bucket_name=self.bucket,
                                                 blob_name=f'{report_id}.csv',
                                                 chunk_size=chunk_size,
                                                 queue=queue)
        streamer.start()

        r = urllib.request.Request(report_details['files'][0]['url'])
        for header in self.creds.get_auth_headers():
            r.add_header(header, self.creds.get_auth_headers()[header])

        with closing(urlopen(r)) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                queue.put((chunk_id, chunk))
                chunk_id += 1

        queue.join()
        streamer.stop()
Example 17
    def stream_to_gcs(self, bucket: str, report_data: ReportConfig):
        """Streams the report CSV to Cloud Storage.

    Arguments:
        bucket (str):  GCS Bucket
        report_data (ReportConfig):  Report definition
    """
        if not report_data.report_file:
            return

        queue = Queue()

        report_id = report_data.id
        file_id = report_data.report_file.id

        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = io.BytesIO()

        download_request = self.service.files().get_media(reportId=report_id,
                                                          fileId=file_id)
        downloader = http.MediaIoBaseDownload(out_file,
                                              download_request,
                                              chunksize=chunk_size)

        # Execute the get request and download the file.
        streamer = ThreadedGCSObjectStreamUpload(
            creds=credentials.Credentials(email=self.email,
                                          project=self.project).credentials,
            client=Cloud_Storage.client(),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_id),
            chunk_size=chunk_size,
            streamer_queue=queue)
        streamer.start()

        download_finished = False
        first = True
        while download_finished is False:
            status, download_finished = downloader.next_chunk()

            # Last chunk, drop the "Grand Total"
            if download_finished:
                total_pos = out_file.getvalue().rfind(b'Grand Total')
                if total_pos != -1:
                    out_file.truncate(total_pos)

            # First chunk, skip the pre-header
            if first:
                csv_start = self._find_first_data_byte(out_file.getvalue())
                out_file.seek(0 if csv_start == -1 else csv_start)
                first = False
            else:
                out_file.seek(0)

            logging.info(
                'Downloader status %s, %s of %s',
                f'{(status.resumable_progress/status.total_size):3.2%}',
                f'{status.resumable_progress:,}', f'{status.total_size:,}')

            chunk = out_file.read(chunk_size)
            queue.put(chunk)
            out_file.seek(0)
            out_file.truncate(0)

        queue.join()
        streamer.stop()
Example 18
    def stream_to_gcs(self, bucket: str, report_details: Dict[str,
                                                              Any]) -> None:
        """Multi-threaded stream to GCS
    
    Arguments:
        bucket {str} -- GCS Bucket
        report_details {dict} -- Report definition
    """
        if 'current_path' not in report_details:
            return

        queue = Queue()

        report_id = report_details['id']
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = io.BytesIO()

        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_id),
            chunk_size=chunk_size,
            queue=queue)
        streamer.start()

        with closing(urlopen(report_details['current_path'])) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                if _downloaded >= _report_size:
                    # last chunk... trim to footer if there is one, or last blank line if not
                    # NOTE: if no blank line (partial file?) NO TRIMMING WILL HAPPEN
                    # THIS SHOULD NEVER BE THE CASE
                    last = io.BytesIO(chunk)

                    # find the footer
                    blank_line_pos = chunk.rfind(b'\n\n')

                    # if we don't find it, there's no footer.
                    if blank_line_pos == -1:
                        logging.error(
                            'No footer delimiter found. Writing entire final chunk as is.'
                        )
                        queue.put((chunk_id, chunk))

                    else:
                        # read the footer
                        last.seek(blank_line_pos)
                        footer = last.readlines()
                        group_count = sum(
                            g.startswith(b'Group By:') for g in footer)
                        total_block_start = chunk.rfind(b'\n' +
                                                        b',' * group_count)

                        if total_block_start == -1:
                            last.truncate(blank_line_pos)

                        else:
                            last.truncate(total_block_start)

                        queue.put((chunk_id, last.getvalue()))
                        # break

                else:
                    queue.put((chunk_id, chunk))

                chunk_id += 1

        queue.join()
        streamer.stop()
Example 19
    def stream_to_gcs(self, bucket: str, report_details: ReportConfig) -> None:
        """Streams the report CSV to Cloud Storage.

    Arguments:
        bucket (str):  GCS Bucket
        report_details (ReportConfig):  Report definition
    """
        if not report_details.current_path:
            return

        queue = Queue()

        report_id = report_details.id
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = io.BytesIO()

        streamer = \
            ThreadedGCSObjectStreamUpload(
                client=Cloud_Storage.client(),
                creds=credentials.Credentials(
                    email=self.email, project=self.project).credentials,
                bucket_name=bucket,
                blob_name=f'{report_id}.csv',
                chunk_size=chunk_size,
                streamer_queue=queue)
        streamer.start()

        with closing(urlopen(report_details.current_path)) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            logging.info('Report is %s bytes', f'{_report_size:,}')
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                if _downloaded >= _report_size:
                    # last chunk... trim to footer if there is one, or last blank line if not
                    # NOTE: if no blank line (partial file?) NO TRIMMING WILL HAPPEN
                    # THIS SHOULD NEVER BE THE CASE
                    last = io.BytesIO(chunk)

                    # find the footer
                    blank_line_pos = chunk.rfind(b'\n\n')

                    # if we don't find it, there's no footer.
                    if blank_line_pos == -1:
                        logging.info(
                            ('No footer delimiter found. Writing entire '
                             'final chunk as is.'))
                        queue.put(chunk)

                    else:
                        # read the footer
                        last.seek(blank_line_pos)
                        footer = last.readlines()
                        group_count = sum(
                            g.startswith(b'Group By:') for g in footer)
                        total_block_start = chunk.rfind(b'\n' +
                                                        b',' * group_count)

                        if total_block_start == -1:
                            last.truncate(blank_line_pos)

                        else:
                            last.truncate(total_block_start)

                        queue.put(last.getvalue())
                        # break

                else:
                    queue.put(chunk)

                chunk_id += 1

        queue.join()
        streamer.stop()
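The final-chunk handling above can be read as a standalone helper. This sketch (hypothetical function name, same logic as the code above) restates the footer trimming:

import io


def trim_report_footer(chunk: bytes) -> bytes:
  """Drops the footer block (or the summary rows that follow it) from the
  last chunk of a report, mirroring the logic above."""
  blank_line_pos = chunk.rfind(b'\n\n')
  if blank_line_pos == -1:
    # No footer delimiter found: nothing can be trimmed safely.
    return chunk

  footer = io.BytesIO(chunk)
  footer.seek(blank_line_pos)
  group_count = sum(line.startswith(b'Group By:')
                    for line in footer.readlines())
  total_block_start = chunk.rfind(b'\n' + b',' * group_count)

  cut = blank_line_pos if total_block_start == -1 else total_block_start
  return chunk[:cut]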
Example 20
    def process(self, data: Dict[str, Any], context):
        """Check all the running jobs
    
    Arguments:
      event {Dict[str, Any]} -- data sent from the PubSub message
      context {Dict[str, Any]} -- context data. unused
    """
        firestore = Firestore(in_cloud=True, email=None, project=None)
        documents = firestore.get_all_jobs()

        for document in documents:
            for T in [t for t in Type if not t.name.startswith('_')]:
                config = firestore.get_report_config(T, document.id)

                if config:
                    if config.get('dest_project'):
                        # authenticate against supplied project with supplied key
                        project = config.get('dest_project') or os.environ.get(
                            'GCP_PROJECT')
                        client_key = json.loads(
                            Cloud_Storage.fetch_file(
                                bucket=
                                f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                                file=f"{config['email']}_user_token.json"))
                        server_key = json.loads(
                            Cloud_Storage.fetch_file(
                                bucket=
                                f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                                file='client_secrets.json'))
                        client_key['client_id'] = (
                            server_key.get('web')
                            or server_key.get('installed')).get('client_id')
                        client_key['client_secret'] = (
                            server_key.get('web') or
                            server_key.get('installed')).get('client_secret')
                        logging.info(client_key)
                        creds = Credentials.from_authorized_user_info(
                            client_key)
                        bq = bigquery.Client(project=project,
                                             credentials=creds)

                    else:
                        bq = bigquery.Client()

                    api_repr = document.get().to_dict()
                    if api_repr:
                        try:
                            job = LoadJob.from_api_repr(api_repr, bq)
                            job.reload()

                            if job.state == 'DONE':
                                if job.error_result:
                                    logging.error(job.errors)

                                self._handle_finished(job=job,
                                                      id=document.id,
                                                      config=config,
                                                      report_type=T)
                                firestore.mark_import_job_complete(
                                    document.id, job)

                        except Exception as e:
                            logging.error(
                                'Error loading job %s for monitoring: %s',
                                document.id, e)

                    break
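The token-merging block above also appears in example 24. A hedged helper capturing just that step (the function name and signature are assumptions; the logic mirrors the inline code above):

from google.oauth2.credentials import Credentials


def build_user_credentials(user_token: dict, client_secrets: dict) -> Credentials:
  """Merges the stored user token with the project client secrets, as done
  inline above, and returns refreshable user credentials."""
  secrets = client_secrets.get('web') or client_secrets.get('installed')
  info = dict(user_token)
  info['client_id'] = secrets.get('client_id')
  info['client_secret'] = secrets.get('client_secret')
  return Credentials.from_authorized_user_info(info)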
Example 21
    def stream_to_gcs(self, bucket: str, report_data: dict):
        """Multi-threaded stream to GCS
    
    Arguments:
        bucket {str} -- GCS Bucket
        report_data {dict} -- Report definition
    """
        if 'report_file' not in report_data:
            return

        queue = Queue()

        report_id = report_data['id']
        file_id = report_data['report_file']['id']

        chunk_size = 16 * 1024 * 1024
        out_file = io.BytesIO()

        download_request = self.service().files().get_media(reportId=report_id,
                                                            fileId=file_id)
        downloader = http.MediaIoBaseDownload(out_file,
                                              download_request,
                                              chunksize=chunk_size)

        # Execute the get request and download the file.
        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_id),
            chunk_size=chunk_size,
            queue=queue)
        streamer.start()

        download_finished = False
        chunk_id = 0
        while download_finished is False:
            status, download_finished = downloader.next_chunk()

            # Last chunk, drop the "Grand Total" shit
            if download_finished:
                total_pos = out_file.getvalue().rfind(b'Grand Total')
                if total_pos != -1:
                    out_file.truncate(total_pos)

            # First chunk, skip the pre-header rows
            if chunk_id == 0:
                csv_start = self.find_first_data_byte(out_file.getvalue())
                if csv_start == -1:
                    out_file.seek(0)
                else:
                    out_file.seek(csv_start)
            else:
                out_file.seek(0)

            logging.info(
                'Downloader status {percent:3.2%}, chunk {chunk} ({progress} of {size})'
                .format(percent=(status.resumable_progress /
                                 status.total_size),
                        progress=status.resumable_progress,
                        size=status.total_size,
                        chunk=chunk_id))

            chunk = out_file.read(chunk_size)
            # chunk = out_file.getvalue()
            queue.put((chunk_id, chunk))
            chunk_id += 1
            out_file.seek(0)
            out_file.truncate(0)

        queue.join()
        streamer.stop()
Example 22
    def _stream_processor(self,
                          bucket: str,
                          report_details: Dict[str, Any],
                          repeatable: bool = False) -> BytesIO:
        repeater = BytesIO()
        report_url = report_details['url']
        remainder = b''
        queue = Queue()
        output_buffer = StringIO()
        html_chunk_size = 2048 * 1024
        chunk_size = 1024 * 1024
        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(credentials=self.creds),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_details['id']),
            chunk_size=chunk_size,
            queue=queue)
        streamer.daemon = True
        streamer.start()

        try:
            chunk_id = 0
            conn = self._get_connection(report_url)
            _stream = conn.iter_content(chunk_size=html_chunk_size)
            source_size = 0

            done = False
            fieldnames = None

            while not done:
                # logging.info(f'Processing chunk {chunk_id}')
                # logging.info(f'Processing chunk {chunk_id}, remainder {remainder.decode("utf-8")}')
                chunk = BytesIO()
                chunk.write(remainder)
                remainder = b''

                block, done = self._next_chunk(_stream, html_chunk_size)
                source_size += len(block)
                # logging.info(f'{len(block):,}, begins {block[0:80]} : ends {block[-80:].decode("utf-8")}')
                if repeatable: repeater.write(block)
                chunk.write(block)
                if len(chunk.getvalue()) < html_chunk_size and not done:
                    continue

                # logging.info(f'Chunk size {len(chunk.getvalue()):,} bytes')
                chunk.seek(0)

                if chunk_id == 0:
                    fieldnames, chunk = self._find_fieldnames(buffer=chunk)

                # find last </tr> on any section but the last, chop off the last portion and store
                last_tr_pos = chunk.getvalue().rfind(b'</tr>')
                if last_tr_pos == -1:
                    # logging.debug(f'HALP! {chunk.getvalue()}')
                    remainder = chunk.getvalue()
                    continue

                else:
                    last_tr_pos += 5
                    chunk.seek(last_tr_pos)
                    remainder = chunk.read()
                    # logging.debug(f'Remainder: {remainder}')
                    chunk.truncate(last_tr_pos)

                rows = []
                while True:
                    tr, chunk = self._extract_keys(chunk, 'tr')
                    if chunk:
                        rows.append([
                            unescape(field) for field in re.findall(
                                r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
                        ])
                    else:
                        break

                # queue for upload
                report_data = []
                for row in rows:
                    report_data.append(dict(zip(fieldnames, row)))

                writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
                if chunk_id == 0: writer.writeheader()

                [writer.writerow(row) for row in report_data]

                output_buffer.seek(0)
                # logging.info(f'Sending chunk {chunk_id} size {len(output_buffer.getvalue())}')
                queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
                chunk_id += 1
                chunk = BytesIO()
                output_buffer.seek(0)
                output_buffer.truncate(0)

            logging.info(f'SA360 report length: {source_size:,} bytes')
            queue.join()
            streamer.stop()
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)
            return repeater

        except Exception as e:
            logging.error(e)
Example 23
    def stream_to_gcs(self, bucket: str, report_details: ReportConfig) \
      -> Tuple[List[str], List[str]]:
        """Streams the data to Google Cloud Storage.

    This is to allow us to process much larger files than can be easily
    handled in toto in memory. Now we're limited to length of execution (900s)
    rather than size of 'stuff' (<2Gb).

    The response from SA360 is a _nasty_ piece of Microsoft Office format XML
    which has to be parsed and converted to a digestible CSV.

    Raises:
        SA360Exception: A custom SA360 exception because there can be a server
                        error returned when requesting, but the error is in
                        text and the HTTP code returned is _always_ a 200.
                        This is why the function is allowed to retry, as the
                        error is usually transient and caused by a failure to
                        connect to SA360's reporting back end.

    Returns:
        (fieldnames: List[str], fieldtypes: List[str]):
          the field names and types in the report.
    """
        report_url = report_details.url
        remainder = b''
        queue = Queue()
        output_buffer = StringIO()

        # size of pieces of xml we can safely download from the web report.
        html_chunk_size = 2048 * 1024
        chunk_size = self.chunk_multiplier * 1024 * 1024

        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(credentials=self.creds),
            creds=credentials.Credentials(email=self.email,
                                          project=self.project).credentials,
            bucket_name=bucket,
            blob_name=f'{report_details.id}.csv',
            chunk_size=chunk_size,
            streamer_queue=queue)
        streamer.daemon = True
        streamer.start()

        chunk_id = 0
        conn = self.get_connection(report_url)
        _stream = conn.iter_content(chunk_size=html_chunk_size)
        source_size = 0

        first = True
        done = False
        fieldnames = None
        fieldtypes = None

        while not done:
            chunk = BytesIO()
            chunk.write(remainder)
            remainder = b''

            block, done = self.next_chunk(_stream, html_chunk_size)
            source_size += len(block)
            chunk.write(block)
            if len(chunk.getvalue()) < html_chunk_size and not done:
                continue

            chunk.seek(0)

            if first:
                fieldnames, chunk = self.find_fieldnames(buffer=chunk)
                if len(fieldnames) == 1 and fieldnames[0] == 'Error':
                    error = \
                      unescape(re.sub(r'<[^.]+>', '', chunk.getvalue().decode('utf-8')))
                    # logging.error('SA360 Error: %s', error)
                    streamer.stop()
                    raise SA360Exception(error)

            # find last </tr> on any section but the last, chop off the last
            # portion and store
            last_tr_pos = chunk.getvalue().rfind(b'</tr>')
            if last_tr_pos == -1:
                remainder = chunk.getvalue()
                continue

            else:
                last_tr_pos += 5
                chunk.seek(last_tr_pos)
                remainder = chunk.read()
                chunk.truncate(last_tr_pos)

            rows = []
            while True:
                tr, chunk = self.extract_keys(chunk, 'tr')
                if chunk:
                    rows.append([
                        unescape(field) for field in re.findall(
                            r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
                    ])
                else:
                    break

            # queue for upload
            report_data = []
            for row in rows:
                report_data.append(dict(zip(fieldnames, row)))

            writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
            if first:
                writer.writeheader()

            [writer.writerow(row) for row in report_data]

            output_buffer.seek(0)

            if first:
                _, fieldtypes = \
                  csv_helpers.get_column_types(
                    BytesIO(output_buffer.getvalue().encode('utf-8')))

            queue.put(output_buffer.getvalue().encode('utf-8'))
            chunk_id += 1
            first = False
            chunk = BytesIO()
            output_buffer.seek(0)
            output_buffer.truncate(0)

        logging.info(f'SA360 report length: {source_size:,} bytes')
        queue.join()
        streamer.stop()
        report_details.schema = \
          csv_helpers.create_table_schema(fieldnames, fieldtypes)

        return fieldnames, fieldtypes
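Since the docstring explains that SA360 errors surface as an SA360Exception and the call "is allowed to retry", here is a minimal caller-side retry sketch; the attempt count and backoff are assumptions, not the project's actual retry policy:

import time

# SA360Exception is assumed to be importable from the module above.


def stream_with_retries(report_fetcher, bucket: str, report_details,
                        attempts: int = 3, backoff: float = 30.0):
  """Retries stream_to_gcs when SA360 raises its transient error."""
  for attempt in range(1, attempts + 1):
    try:
      return report_fetcher.stream_to_gcs(bucket, report_details)
    except SA360Exception:
      if attempt == attempts:
        raise
      time.sleep(backoff * attempt)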
Example 24
  def _import_report(self, bucket_name: str, file_name: str, config: dict) -> bigquery.LoadJob:
    """Begin CSV import

    Create and start the Big Query import job.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    if config.get('dest_project'):
      # authenticate against supplied project with supplied key
      project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
      client_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file=f"{config['email']}_user_token.json"
      ))
      server_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file='client_secrets.json'
      ))
      client_key['client_id'] = (server_key.get('web') or server_key.get('installed')).get('client_id')
      client_key['client_secret'] = (server_key.get('web') or server_key.get('installed')).get('client_secret')
      logging.info(client_key)
      creds = Credentials.from_authorized_user_info(client_key)
      bq = bigquery.Client(project=project, credentials=creds)

    else:
      project = os.environ.get('GCP_PROJECT')
      bq = bigquery.Client()

    dataset = config.get('dest_dataset') or os.environ.get('BQ_DATASET') or 'report2bq'

    table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
    logging.info(f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

    json_schema = config['schema']
    schema = []
    _json_schema = []
    # Build the json format schema that the BQ LoadJob requires from the text-based ones in the config
    for field in json_schema:
      f = bigquery.schema.SchemaField(name=field['name'],
                                      field_type=field['type'],
                                      mode=field['mode'])
      schema.append(f)
      _json_schema.append(f'{field["name"]}: {field["type"]}')

    table_ref = bq.dataset(dataset).table(table_name)

    # The default action is to completely replace the table each time. If requested,
    # however, we can append instead - useful for (say) huge jobs where you load the
    # table with 60 days once and then append 'yesterday' each day.
    if config.get('append', False):
      if self._table_exists(bq, table_ref) and not self._validate_schema(bq, table_ref, schema):
        config_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in schema])
        target_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in bq.get_table(table_ref).schema])
        self._email_error(
          email=config['email'], 
          message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
'''
        )
        logging.error(f"Mismatched schema for {project}.{dataset}.{table_name}, trying anyway")

      import_type = bigquery.WriteDisposition.WRITE_APPEND
      
    else:
      import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = import_type
    # Assume a CSV header is the first line unless otherwise specified in the report's own config
    job_config.skip_leading_rows = config.get('csv_header_length', 1)
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.schema = schema
    # Allow a few errors, just in case
    job_config.max_bad_records = 10
    # Allow DV360/CM to pass jagged rows (SA360 won't), which they do
    job_config.allow_jagged_rows = True
    
    uri = f'gs://{bucket_name}/{file_name}'
    load_job = bq.load_table_from_uri(
        uri, table_ref, job_config=job_config
    )  # API request
    logging.info(f'Starting CSV import job {load_job.job_id}')

    return load_job
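For reference, the 'schema' entry the loop above converts into bigquery.schema.SchemaField objects is a list of plain name/type/mode dictionaries. A hypothetical minimal config fragment illustrating the shape (field names and values are examples only):

# Hypothetical report config fragment; only keys read by _import_report are shown.
config = {
    'email': 'analyst@example.com',
    'dest_dataset': 'report2bq',
    'csv_header_length': 1,
    'append': False,
    'schema': [
        {'name': 'campaign_id', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'impressions', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'report_date', 'type': 'DATE', 'mode': 'NULLABLE'},
    ],
}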