Example #1
 def __init__(self, cm_id: str=None, profile: str=None,
              email: str=None, project: str=None):
   self.email = email
   self.cm_id = cm_id
   self.cm_profile = profile
   self.project = project
   self.firestore = Firestore(email=email, project=project)
Example #2
  def __init__(self, report_id: str, email: str, project: str=None, timezone: str=None):
    self.email = email
    self.report_id = report_id
    self.project = project
    self.timezone = timezone

    self.firestore = Firestore()
Example #3
  def __init__(self, 
    email: str, project: str, adh_customer: str,
    adh_query: str, api_key: str, days: int,
    dest_project: str=None, dest_dataset: str=None):
    """Constructor

    Sets up the ADH helper
    
    Arguments:
        email {str} -- authenticated user email (for the token)
        project {str} -- GCP project
        adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
        adh_query {str} -- ADH query id
        api_key {str} -- API Key (has to be set up in APIs and Libraries in GCP)
        days {int} -- Lookback window (default: 60)
        dest_project {str} -- target GCP project for results
        dest_dataset {str} -- target BQ dataset for results
    """
    self.email = email
    self.project = project
    self.adh_customer = adh_customer
    self.adh_query = adh_query
    self.api_key = api_key
    self.days = days
    self.dest_project = dest_project
    self.dest_dataset = dest_dataset

    self.credentials = Credentials(email=email, project=project)
    self.storage = Cloud_Storage(email=email, project=project)
    self.firestore = Firestore(email=email, project=project)
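
For orientation, a minimal, hedged sketch of how this constructor might be called; the module path classes.adh and all argument values are illustrative assumptions, not taken from the project.

# Hypothetical instantiation of the ADH helper above; module path and values
# are placeholders for illustration only.
from classes.adh import ADH

adh = ADH(
    email='analyst@example.com',     # authenticated user (token owner)
    project='my-gcp-project',        # GCP project running Report2BQ
    adh_customer='123456789',        # 9-digit ADH customer id, no dashes
    adh_query='987654321',           # ADH query id
    api_key='<api key from GCP>',    # API key set up in APIs & Services
    days=60,                         # lookback window in days
    dest_project='my-bq-project',    # optional: project for the results
    dest_dataset='adh_results')      # optional: BQ dataset for the results
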
Example #4
class DCMReportRunner(ReportRunner):
  report_type = Type.CM

  def __init__(self, cm_id: str=None, profile: str=None,
               email: str=None, project: str=None):
    self.email = email
    self.cm_id = cm_id
    self.cm_profile = profile
    self.project = project
    self.firestore = Firestore(email=email, project=project)


  def run(self, unattended: bool=True) -> None:
    dcm = DCM(email=self.email, project=self.project, profile=self.cm_profile)
    
    if unattended:
      self._unattended_run(dcm)
    else:
      self._attended_run(dcm)


  def _attended_run(self, dcm: DCM) -> None:
    successful = []
    response = dcm.run_report(report_id=self.cm_id, synchronous=True)
    if response:
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

    while response['status'] == 'PROCESSING':
      time.sleep(60 * 0.5)
      response = dcm.report_state(report_id=self.cm_id, file_id=response['id'])
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

    report2bq = Report2BQ(
      cm=True, cm_id=self.cm_id, email=self.email, project=self.project,
      profile=self.cm_profile
    )
    report2bq.handle_report_fetcher(fetcher=dcm, report_id=self.cm_id)


  def _unattended_run(self, dcm: DCM) -> None:
    response = dcm.run_report(report_id=self.cm_id, synchronous=False)
    if response:
      buffer = StringIO()
      pprint.pprint(response, stream=buffer)
      logging.info(buffer.getvalue())

      runner = {
        'type': Type.CM.value,
        'project': self.project,
        'report_id': self.cm_id,
        'email': self.email,
        'profile': self.cm_profile,
        'file_id': response['id']
      }
      self.firestore.store_report_runner(runner)
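
A hedged usage sketch for the runner above; the module path and the id values are assumptions made for illustration.

# Hypothetical driver for DCMReportRunner; module path and ids are placeholders.
from classes.dcm_report_runner import DCMReportRunner

runner = DCMReportRunner(cm_id='12345678',            # CM report id
                         profile='1234567',           # CM profile id
                         email='analyst@example.com',
                         project='my-gcp-project')

# unattended=False polls the running report and loads it as soon as it is done;
# unattended=True just stores a runner document in Firestore for later checks.
runner.run(unattended=False)
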
Example #5
 def __init__(self,
              dbm_id: str = None,
              email: str = None,
              project: str = None):
     self.email = email
     self.dbm_id = dbm_id
     self.project = project
     self.firestore = Firestore(email=email, project=project)
Example #6
class DBMReportRunner(ReportRunner):
    report_type = Type.DV360

    def __init__(self,
                 dbm_id: str = None,
                 email: str = None,
                 project: str = None):
        self.email = email
        self.dbm_id = dbm_id
        self.project = project
        self.firestore = Firestore(email=email, project=project)

    def run(self, unattended: bool = True):
        dbm = DBM(email=self.email, project=self.project)

        if unattended:
            self._unattended_run(dbm)
        else:
            self._attended_run(dbm)

    def _attended_run(self, dbm: DBM) -> None:
        response = dbm.run_report(self.dbm_id)
        if response:
            buffer = StringIO()
            pprint.pprint(response, stream=buffer)
            logging.info(buffer.getvalue())

        while True:
            status = dbm.report_state(self.dbm_id)
            logging.info(f'Report {self.dbm_id} status: {status}')
            if status == 'RUNNING':
                time.sleep(10)

            elif status == 'DONE':
                report2bq = Report2BQ(dv360=True,
                                      dv360_id=self.dbm_id,
                                      email=self.email,
                                      project=self.project)
                report2bq.handle_report_fetcher(fetcher=dbm,
                                                report_id=self.dbm_id)
                break

            else:
                logging.error(
                    f'DV360 Report {self.dbm_id} failed to run: {status}')
                break

    def _unattended_run(self, dbm: DBM) -> None:
        response = dbm.run_report(self.dbm_id)
        if response:
            runner = {
                'type': Type.DV360.value,
                'project': self.project,
                'report_id': self.dbm_id,
                'email': self.email,
            }
            self.firestore.store_report_runner(runner)
Example #7
    def firestore(self) -> Firestore:
        """The Firestore client wrapper

    Returns:
        Firestore: the wrapper
    """
        return Firestore()
Example #8
  def firestore(self) -> Firestore:
    """Fetch the Firestore client on demand.

    Returns:
        Firestore: firestore client
    """
    return Firestore()
    def manage(self, **kwargs):
        firestore = Firestore(project=kwargs['project'], email=kwargs['email'])

        args = {
            'report':
            kwargs.get(
                'name',
                kwargs.get('file').split('/')[-1].split('.')[0]
                if kwargs.get('file') else None),
            'file':
            kwargs.get('file'),
            'firestore':
            firestore,
            'project':
            kwargs['project'],
            'email':
            kwargs['email'],
            **kwargs,
        }

        action = {
            'list': self.list_all,
            'show': self.show,
            'add': self.add,
            'delete': self.delete,
        }.get(kwargs['action'])

        if action:
            return action(**args)

        else:
            raise NotImplementedError()
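
A hedged sketch of driving the manage() dispatcher above. SA360Manager is an assumed name for the class that owns manage(), list_all, show, add and delete; the project, email and file values are placeholders.

# Hypothetical calls into the manage() dispatcher; names are placeholders.
manager = SA360Manager()

# List all stored SA360 dynamic report definitions for a project.
manager.manage(action='list', project='my-gcp-project',
               email='analyst@example.com', _print=True)

# Add a definition from a local JSON file; 'report' defaults to the file stem
# ('holiday_2020' here) because no explicit 'name' is passed.
manager.manage(action='add', project='my-gcp-project',
               email='analyst@example.com',
               file='definitions/holiday_2020.json')
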
Example #10
    def __init__(self,
                 email: str,
                 project: str,
                 append: bool = False,
                 infer_schema: bool = False):
        self.email = email
        self.project = project
        self.creds = Credentials(email=email, project=project)
        self.credentials = storage.Client()._credentials
        self.transport = AuthorizedSession(credentials=self.credentials)
        self.append = append
        self.infer_schema = infer_schema

        self.firestore = Firestore(email=email, project=project)

        self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
        self.bucket = f'{self.project}-report2bq-upload'
Example #11
def main(unusedargv):
    scheduler = Scheduler()
    with open(FLAGS.file) as reports:
        runners = json.loads(''.join(reports.readlines()))

        for runner in runners:
            id = f"{runner['report']}_{runner['AgencyId']}_{runner['AdvertiserId']}"
            Firestore().update_document(Type.SA360_RPT, f'{id}', runner)
Example #12
    def oauth_init(self, request: Request, project: str, email: str):
        project_credentials = json.loads(Files.fetch_file(
            '{project}-report2bq-tokens'.format(project=project),
            'client_secrets.json'),
                                         encoding='utf-8')

        _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                             scopes=self.SCOPES)

        _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

        authorization_url, state = _flow.authorization_url(
            access_type='offline', include_granted_scopes='true')

        firestore = Firestore()
        firestore.store_oauth_state(state=state, email=email, project=project)

        return redirect(authorization_url)
    def list_all(self,
                 firestore: Firestore,
                 project: str,
                 _print: bool = False,
                 **unused):
        reports = firestore.list_documents(Type.SA360_RPT, '_reports')
        if _print:
            print(f'SA360 Dynamic Reports defined for project {project}')
            print()
            for report in reports:
                print(f'  {report}')

        return reports
    def show(self,
             firestore: Firestore,
             report: str,
             _print: bool = False,
             **unused):
        definition = firestore.get_document(Type.SA360_RPT,
                                            '_reports').get(report)
        if _print:
            print(f'SA360 Dynamic Report "{report}"')
            print()
            pprint.pprint(definition, indent=2, compact=False)

        return definition
    def delete(self, firestore: Firestore, project: str, report: str,
               email: str, **unused):
        firestore.delete_document(Type.SA360_RPT, '_reports', report)
        scheduler = Scheduler()
        args = {
            'action': 'list',
            'email': email,
            'project': project,
            'html': False,
        }

        # Disable all runners for the now deleted report
        runners = list(runner['name'].split('/')[-1]
                       for runner in scheduler.process(args)
                       if report in runner['name'])
        for runner in runners:
            args = {
                'action': 'disable',
                'email': None,
                'project': project,
                'job_id': runner,
            }
            scheduler.process(args)
Example #16
    def oauth_complete(self, request: Request):
        logging.info(request.args)

        state = request.args.get('state', type=str)
        firestore = Firestore()
        email, project = firestore.get_oauth_state(state)

        project_credentials = json.loads(Files.fetch_file(
            '{project}-report2bq-tokens'.format(project=project),
            'client_secrets.json'),
                                         encoding='utf-8')

        _flow = flow.Flow.from_client_config(client_config=project_credentials,
                                             scopes=self.SCOPES)
        _flow.redirect_uri = f"https://{os.environ.get('FUNCTION_REGION')}-{os.environ.get('GCP_PROJECT')}.cloudfunctions.net/OAuthComplete"

        r = urlparse(request.url)
        auth_response = urlunparse(
            ['https', r.netloc, r.path, r.params, r.query, r.fragment])
        _flow.fetch_token(authorization_response=auth_response)

        logging.info(_flow.credentials)

        token_details = {
            'access_token': _flow.credentials.token,
            'refresh_token': _flow.credentials.refresh_token
        }

        Cloud_Storage.write_file(
            '{project}-report2bq-tokens'.format(project=project),
            '{email}_user_token.json'.format(email=email),
            json.dumps(token_details).encode('utf-8'))

        firestore.delete_oauth_state(state=state)

        return 'Ok'
Example #17
    def __init__(self,
                 product: Type,
                 email=None,
                 project=None,
                 report_id=None,
                 profile=None,
                 sa360_url=None,
                 force: bool = False,
                 append: bool = False,
                 infer_schema: bool = False,
                 dest_project: str = None,
                 dest_dataset: str = 'report2bq',
                 notify_topic: str = None,
                 notify_message: str = None):
        self.product = product

        self.force = force
        self.email = email
        self.append = append
        self.infer_schema = infer_schema

        self.report_id = report_id

        self.sa360_url = unquote(sa360_url) if sa360_url else None

        self.cm_profile = profile

        self.project = project

        self.dest_project = dest_project
        self.dest_dataset = dest_dataset

        self.notify_topic = notify_topic
        self.notify_message = notify_message

        self.firestore = Firestore(email=email, project=project)
Example #18
    def __init__(self,
                 email: str,
                 project: str,
                 append: bool = False,
                 infer_schema: bool = False):
        self.email = email
        self.project = project
        self.creds = Credentials(email=email, project=project)
        self.credentials = storage.Client()._credentials
        self.transport = AuthorizedSession(credentials=self.credentials)
        self.append = append
        self.infer_schema = infer_schema

        self.firestore = Firestore(email=email, project=project)

        # chunk_multiplier is set in the environment, but defaults to 64 - this
        # leads to a 64M chunk size we can throw around. Given the memory
        # constraints of a cloud function this seems like a good, safe number.
        self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
        self.bucket = f'{self.project}-report2bq-upload'
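
The comment above describes how the chunk size is derived; a tiny standalone sketch of that arithmetic (variable names are illustrative):

import os

# CHUNK_MULTIPLIER defaults to 64, giving 64 * 1024 * 1024 = 67,108,864 bytes,
# i.e. a 64MB chunk for each streamed read/write.
chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
chunk_size = chunk_multiplier * 1024 * 1024
print(f'chunk size: {chunk_size:,} bytes')
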
Example #19
    def check_running_report(self, config: Dict[str, Any]):
        """Check a running CM report for completion
    
    Arguments:
        config {Dict[str, Any]} -- The report data structure from Firestore
    """
        append = config['append'] if config and 'append' in config else False
        response = self.report_state(report_id=config['id'],
                                     file_id=config['report_file']['id'])
        status = response[
            'status'] if response and 'status' in response else 'UNKNOWN'

        logging.info('Report {report} status: {status}'.format(
            report=config['id'], status=status))
        firestore = Firestore(email=config['email'], project=self.project)
        if status == 'REPORT_AVAILABLE':
            # Remove job from running
            firestore.remove_report_runner(config['id'])

            # Send pubsub to trigger report2bq now
            topic = 'projects/{project}/topics/report2bq-trigger'.format(
                project=self.project)
            publisher = pubsub.PublisherClient()
            publisher.publish(topic=topic,
                              data=b'RUN',
                              cm_id=config['id'],
                              profile=config['profile_id'],
                              email=config['email'],
                              append=str(append),
                              project=self.project)

        elif status == 'FAILED' or status == 'CANCELLED':
            # Remove job from running
            logging.error(
                f'Report {config["id"]}: {inflection.humanize(status)}.')
            firestore.remove_report_runner(config['id'])
Example #20
class ADH(object):
  """Run ADH queries

  This class runs ADH queries. Where the output goes is determined by the query itself, in ADH. All
  we can specify here is the date range - and we do this by accepting a lookback window and doing
  "yesterday - lookback".

  TODO (davidharcombe@) ADH Query Parameters
  """
  
  def __init__(self, 
    email: str, project: str, adh_customer: str,
    adh_query: str, api_key: str, days: int,
    dest_project: str=None, dest_dataset: str=None):
    """Constructor

    Sets up the ADH helper
    
    Arguments:
        email {str} -- authenticated user email (for the token)
        project {str} -- GCP project
        adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
        adh_query {str} -- ADH query id
        api_key {str} -- API Key (has to be set up in APIs and Libraries in GCP)
        days {int} -- Lookback window (default: 60)
        dest_project {str} -- target GCP project for results
        dest_dataset {str} -- target BQ dataset for results
    """
    self.email = email
    self.project = project
    self.adh_customer = adh_customer
    self.adh_query = adh_query
    self.api_key = api_key
    self.days = days
    self.dest_project = dest_project
    self.dest_dataset = dest_dataset

    self.credentials = Credentials(email=email, project=project)
    self.storage = Cloud_Storage(email=email, project=project)
    self.firestore = Firestore(email=email, project=project)


  def run(self, unattended: bool=True):
    """Run the ADH query
    
    Execute the ADH query, storing the run job result in Firestore. The data itself will be written
    to Big Query by ADH.
    Remember that ADH queries have many, many constraints so use this wisely: DON'T set up
    an hourly run - check with ADH.

    Keyword Arguments:
        unattended {bool} -- run unattended. Unused, but there for compatibility (default: {True})
    """
    query_details = self.fetch_query_details()
    if query_details:
      report = {
        'id': self.adh_query,
        'details': query_details,
        'customer_id': self.adh_customer,
        'table_name': self._sanitize_string(query_details['title']),
      }
      if self.dest_project:
        report['dest_project'] = self.dest_project

      if self.dest_dataset:
        report['dest_dataset'] = self.dest_dataset

      self.firestore.store_report_config(
        type=Type.ADH,
        report_data=report,
        id=self.adh_query)
    
      result = self.run_query(report)
      report['last_run'] = result
      self.firestore.store_report_config(
        type=Type.ADH,
        report_data=report,
        id=self.adh_query)

      logging.info('Result: {result}'.format(result=result))


  def _get_adh_service(self) -> Resource:
    """Create the ADH Service

    Use the discovery API to create the ADH service
    
    Returns:
        Resource -- ADH service
    """
    adh_service = DiscoverService.get_service(Service.ADH, self.credentials, self.api_key)
    return adh_service


  def _sanitize_string(self, original: str) -> str:
    """Sanitize Strings

    Convert any non-alphanumeric character into an '_', as per BQ requirements

    Arguments:
        original {str} -- the string to sanitize

    Returns:
        str -- the sanitized string
    """
    return re.sub('[^a-zA-Z0-9,]', '_', original)

    
  def fetch_query_details(self) -> Dict[str, Any]:
    """Get the Query details
    
    Returns:
        Dict[str, Any] -- the ADH query definition
    """
    service = self._get_adh_service()

    query_id = 'customers/{customer_id}/analysisQueries/{query_id}'.format(
      customer_id=self.adh_customer,
      query_id=self.adh_query)
    query = service.customers().analysisQueries().get(name=query_id).execute()

    return query


  def run_query(self, query_details: Dict[str, Any]) -> Dict[str, Any]:
    """Run the ADH query
    
    Arguments:
        query_details {Dict[str, Any]} -- the details of the query job
    
    Returns:
        Dict[str, Any] -- result of the query run directive
    """
    service = self._get_adh_service()

    yesterday = datetime.now(tz=pytz.timezone('US/Eastern')) - timedelta(days=1)
    earliest = yesterday - timedelta(days=self.days)  # use the configured lookback window

    body = {
      "spec": {
        "startDate": {
          "year": earliest.year,
          "month": earliest.month,
          "day": earliest.day
        },
        "endDate": {
          "year": yesterday.year,
          "month": yesterday.month,
          "day": yesterday.day
        }
      },
      "destTable": '{project}.{dataset}.{table_name}'.format(
        project=query_details['dest_project'] if 'dest_project' in query_details else self.project, 
        dataset=query_details['dest_dataset'] if 'dest_dataset' in query_details else 'adh_results',
        table_name=query_details['table_name']
      ),
      "customerId": query_details['customer_id']
    }
    result = service.customers().analysisQueries().start(
      name=query_details['details']['name'], body=body).execute()

    return result
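
run_query() above builds its date range as "yesterday minus the lookback window". A standalone sketch of just that spec construction, with an illustrative 60-day window:

# Standalone sketch of the "yesterday - lookback" range used by run_query().
from datetime import datetime, timedelta
import pytz

days = 60
yesterday = datetime.now(tz=pytz.timezone('US/Eastern')) - timedelta(days=1)
earliest = yesterday - timedelta(days=days)

spec = {
    'startDate': {'year': earliest.year, 'month': earliest.month, 'day': earliest.day},
    'endDate': {'year': yesterday.year, 'month': yesterday.month, 'day': yesterday.day},
}
print(spec)
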
Example #21
    def process(self, data: Dict[str, Any], context):
        """Check all the running jobs
    
    Arguments:
      data {Dict[str, Any]} -- data sent from the PubSub message
      context {Dict[str, Any]} -- context data. unused
    """
        firestore = Firestore(in_cloud=True, email=None, project=None)
        documents = firestore.get_all_jobs()

        for document in documents:
            for T in [t for t in Type if not t.name.startswith('_')]:
                config = firestore.get_report_config(T, document.id)

                if config:
                    if config.get('dest_project'):
                        # authenticate against supplied project with supplied key
                        project = config.get('dest_project') or os.environ.get(
                            'GCP_PROJECT')
                        client_key = json.loads(
                            Cloud_Storage.fetch_file(
                                bucket=
                                f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                                file=f"{config['email']}_user_token.json"))
                        server_key = json.loads(
                            Cloud_Storage.fetch_file(
                                bucket=
                                f"{os.environ.get('GCP_PROJECT') or 'galvanic-card-234919'}-report2bq-tokens",
                                file='client_secrets.json'))
                        client_key['client_id'] = (
                            server_key.get('web')
                            or server_key.get('installed')).get('client_id')
                        client_key['client_secret'] = (
                            server_key.get('web') or
                            server_key.get('installed')).get('client_secret')
                        logging.info(client_key)
                        creds = Credentials.from_authorized_user_info(
                            client_key)
                        bq = bigquery.Client(project=project,
                                             credentials=creds)

                    else:
                        bq = bigquery.Client()

                    api_repr = document.get().to_dict()
                    if api_repr:
                        try:
                            job = LoadJob.from_api_repr(api_repr, bq)
                            job.reload()

                            if job.state == 'DONE':
                                if job.error_result:
                                    logging.error(job.errors)

                                self._handle_finished(job=job,
                                                      id=document.id,
                                                      config=config,
                                                      report_type=T)
                                firestore.mark_import_job_complete(
                                    document.id, job)

                        except Exception as e:
                            logging.error(
                                f'Error loading job {document.id} '
                                f'for monitoring: {e}')

                    break
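
When a destination project is set, the monitor above rebuilds user credentials from the stored token plus the OAuth client id/secret before opening a BigQuery client. A hedged, standalone sketch of that hand-off; every value is a placeholder.

# Hedged sketch of building user credentials for the destination project.
from google.oauth2.credentials import Credentials
from google.cloud import bigquery

client_key = {
    'refresh_token': '<stored refresh token>',
    'client_id': '<oauth client id from client_secrets.json>',
    'client_secret': '<oauth client secret from client_secrets.json>',
}
creds = Credentials.from_authorized_user_info(client_key)
bq = bigquery.Client(project='my-dest-project', credentials=creds)
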
Example #22
class SA360Dynamic(ReportFetcher):
    report_type = Type.SA360_RPT
    email = None
    project = None
    profile = None

    def __init__(self,
                 email: str,
                 project: str,
                 append: bool = False,
                 infer_schema: bool = False):
        self.email = email
        self.project = project
        self.creds = Credentials(email=email, project=project)
        self.credentials = storage.Client()._credentials
        self.transport = AuthorizedSession(credentials=self.credentials)
        self.append = append
        self.infer_schema = infer_schema

        self.firestore = Firestore(email=email, project=project)

        self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
        self.bucket = f'{self.project}-report2bq-upload'

    def service(self) -> Resource:
        return discovery.get_service(service=Service.SA360,
                                     credentials=self.creds)

    def handle_report(self, run_config: Dict[str, Any]) -> bool:
        sa360_service = self.service()
        request = sa360_service.reports().get(reportId=run_config['file_id'])

        try:
            report = request.execute()

            if report['isReportReady']:
                report_config = self.firestore.get_document(
                    type=Type.SA360_RPT, id=run_config['report_id'])

                csv_header, _ = self.read_header(report)
                schema = csv_helpers.create_table_schema(csv_header, None)
                report_config['schema'] = schema
                report_config['files'] = report['files']

                if 'dest_project' in run_config:
                    report_config['dest_project'] = run_config['dest_project']
                if 'dest_dataset' in run_config:
                    report_config['dest_dataset'] = run_config['dest_dataset']
                if 'notify_message' in run_config:
                    report_config['notifier']['message'] = run_config[
                        'notify_message']

                # update the report details please...
                self.firestore.update_document(Type.SA360_RPT,
                                               run_config['report_id'],
                                               report_config)

                # ... then stream the file to GCS a la DV360/CM
                self.stream_to_gcs(report_details=report_config,
                                   run_config=run_config)

            return report['isReportReady']

        except Exception as e:
            logging.error(
                f'Report fetch error: Run {run_config["file_id"]} for report {run_config["report_id"]}'
            )
            return False

    def read_header(self, report_config: dict) -> list:
        r = urllib.request.Request(report_config['files'][0]['url'])
        for header in self.creds.auth_headers:
            r.add_header(header, self.creds.auth_headers[header])

        with closing(urlopen(r)) as report:
            data = report.read(self.chunk_multiplier * 1024 * 1024)
            bytes_io = BytesIO(data)

        return csv_helpers.get_column_types(bytes_io)

    @measure_memory
    def stream_to_gcs(self, report_details: Dict[str, Any],
                      run_config: Dict[str, Any]) -> None:
        """Multi-threaded stream to GCS

    Arguments:
        report_details (dict):  Report definition
        run_config (dict):  Run configuration, including the report id
    """
        queue = Queue()

        report_id = run_config['report_id']

        # chunk_multiplier is set in the environment, but defaults to 64 - this leads to a
        # 64M chunk size we can throw around. Given the memory constraints of a cloud function
        # this seems like a good, safe number.
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = BytesIO()

        streamer = \
          ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(),
            creds=credentials.Credentials(
              email=self.email, project=self.project).credentials,
            bucket_name=self.bucket,
            blob_name=f'{report_id}.csv',
            chunk_size=chunk_size,
            streamer_queue=queue)
        streamer.start()

        r = urllib.request.Request(report_details['files'][0]['url'])
        for header in self.creds.auth_headers:
            r.add_header(header, self.creds.auth_headers[header])

        with closing(urlopen(r)) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                queue.put(chunk)
                chunk_id += 1

        queue.join()
        streamer.stop()
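
stream_to_gcs() above pairs a producer reading fixed-size chunks with a consumer thread fed through a Queue. ThreadedGCSObjectStreamUpload is project-specific, so here is a hedged, standalone sketch of the same pattern with a plain thread writing to a local file; the file paths are placeholders.

# Standalone sketch of the queue-based streaming pattern; a local file stands
# in for the GCS object streamer.
import threading
from queue import Queue

queue = Queue()

def consumer(path: str) -> None:
  # Drain chunks from the queue and append them to a local file.
  with open(path, 'wb') as out:
    while True:
      chunk = queue.get()
      if chunk is None:        # sentinel: the producer has finished
        queue.task_done()
        break
      out.write(chunk)
      queue.task_done()

worker = threading.Thread(target=consumer, args=('/tmp/report.csv',), daemon=True)
worker.start()

# Producer: read the source in fixed-size chunks and enqueue each one.
chunk_size = 1024 * 1024
with open('/tmp/source.csv', 'rb') as source:
  while True:
    chunk = source.read(chunk_size)
    if not chunk:
      break
    queue.put(chunk)

queue.put(None)                # tell the consumer to stop
queue.join()                   # block until every chunk has been handled
worker.join()                  # make sure the output file is closed
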
Example #23
    if args.get('encode_key'):
        key = encode_key(_key)

    else:
        key = _key

    src_data['email'] = _key

    if args.get('local_store'):
        from classes.local_datastore import LocalDatastore
        f = LocalDatastore()

    if args.get('firestore'):
        from classes.firestore import Firestore
        f = Firestore()

    if args.get('secret_manager'):
        from classes.secret_manager import SecretManager
        f = SecretManager(project=_project, email=args.get('email'))

    f.update_document(type=Type._ADMIN, id=key, new_data=src_data)


def main(unused_argv):
    event = {
        'key': FLAGS.key,
        'file': FLAGS.file,
        'encode_key': FLAGS.encode_key,
        'local_store': FLAGS.local,
        'firestore': FLAGS.firestore,
Example #24
 def firestore(self) -> Firestore:
     return Firestore()  #email=self.email, project=self.project)
Example #25
class RunMonitor(object):
    """Run the process watching running DV360/CM jobs

  This process is triggered by a Cloud Scheduler job every 5 minutes to watch the Firestore-held
  list of jobs for running DV360/CM processes. If one is discovered to have completed, the Report2BQ
  process is invoked in the normal manner (via a PubSub message to the trigger queue).

  This process is not 100% necessary; if a report is defined with a "fetcher", then the fetcher will 
  run as usual every hour and will pick up the change anyway. On the other hand, it allows for a user
  to schedule a quick report to run (say) every 30 minutes, and not create a "fetcher" since this process
  takes the "fetcher"'s place.
  """

    firestore = Firestore()
    PS = pubsub.PublisherClient()

    schedules = None

    def process(self, data: Dict[str, Any], context):
        """[summary]
    
    Arguments:
        data {Dict[str, Any]} -- Data passed in from the calling function, containing the attributes from the
                                 calling PubSub message
        context {} -- unused
    """
        self.project = os.environ['GCP_PROJECT']
        report_checker = {
            Type.DV360: self._check_dv360_report,
            Type.CM: self._check_cm_report,
            Type.SA360: self._check_sa360_report,
            Type.SA360_RPT: self._check_sa360_report
        }

        try:
            documents = self.firestore.get_all_running()
            for document in documents:
                with suppress(ValueError):
                    run_config = document.get().to_dict()
                    T = Type(run_config['type'])
                    # config = self.firestore.get_report_config(T, document.id)
                    job_config = self._fetch_schedule(type=T,
                                                      run_config=run_config)
                    report_checker[T](run_config=run_config,
                                      job_config=job_config)
                    # break
                    # else:
                    #   logging.error(f'Invalid report: {document.get().to_dict()}')

        except Exception as e:
            logging.error(e)

    def _fetch_schedule(self, type: Type,
                        run_config: Dict[str, Any]) -> Dict[str, Any]:
        scheduler = Scheduler()
        (success, job_config) = scheduler.process({
            'action':
            'get',
            'project':
            os.environ['GCP_PROJECT'],
            'email':
            run_config['email'],
            'html':
            False,
            'job_id':
            type.runner(run_config['report_id'])
        })

        return job_config

    def _check_dv360_report(self, job_config: Dict[str, Any],
                            run_config: Dict[str, Any]):
        """Check a running DV360 report for completion
    
    Arguments:
        job_config {Dict[str, Any]} -- the Cloud Scheduler job definition
        run_config {Dict[str, Any]} -- the report run data structure from Firestore
    """
        job_attributes = job_config['pubsubTarget']['attributes']
        dbm = DBM(email=job_attributes['email'], project=self.project)
        status = dbm.report_state(job_attributes['report_id'])
        append = job_attributes[
            'append'] if job_attributes and 'append' in job_attributes else False

        logging.info('Report {report} status: {status}'.format(
            report=job_attributes['report_id'], status=status))

        if status == 'DONE':
            # Remove job from running
            self.firestore.remove_report_runner(job_attributes['report_id'])

            # Send pubsub to trigger report2bq now
            topic = job_config['pubsubTarget']['topicName']
            self.PS.publish(topic=topic, data=b'RUN', **job_attributes)

        elif status == 'FAILED':
            # Remove job from running
            logging.error(f'Report {run_config["report_id"]} failed!')
            self.firestore.remove_report_runner(run_config['report_id'])

    def _check_cm_report(self, job_config: Dict[str, Any],
                         run_config: Dict[str, Any]):
        """Check a running CM report for completion
    
    Arguments:
        job_config {Dict[str, Any]} -- the Cloud Scheduler job definition
        run_config {Dict[str, Any]} -- the report run data structure from Firestore
    """
        job_attributes = job_config['pubsubTarget']['attributes']
        dcm = DCM(email=job_attributes['email'],
                  project=self.project,
                  profile=job_attributes['profile_id'])
        append = job_attributes[
            'append'] if job_attributes and 'append' in job_attributes else False
        # TODO: Add report_file.id to run_config
        response = dcm.report_state(report_id=job_attributes['report_id'],
                                    file_id=run_config['report_file']['id'])
        status = response[
            'status'] if response and 'status' in response else 'UNKNOWN'

        logging.info('Report {report} status: {status}'.format(
            report=job_attributes['report_id'], status=status))
        if status == 'REPORT_AVAILABLE':
            # Remove job from running
            self.firestore.remove_report_runner(job_attributes['report_id'])

            # Send pubsub to trigger report2bq now
            topic = 'projects/{project}/topics/report2bq-trigger'.format(
                project=self.project)
            self.PS.publish(topic=topic, data=b'RUN', **job_attributes)

        elif status == 'FAILED' or status == 'CANCELLED':
            # Remove job from running
            logging.error('Report {report} failed!'.format(
                report=job_attributes['report_id']))
            self.firestore.remove_report_runner(job_attributes['report_id'])

    def _check_sa360_report(self, job_config: Dict[str, Any],
                            run_config: Dict[str, Any]):
        sa360 = SA360(email=run_config['email'], project=self.project)

        # Merge configs
        job_attributes = job_config['pubsubTarget'][
            'attributes'] if 'pubsubTarget' in job_config else {}
        config = {**run_config, **job_attributes}

        if sa360.handle_offline_report(run_config=config):
            self.firestore.remove_report_runner(run_config['report_id'])
            logging.info(f'Report {run_config["report_id"]} done.')

        else:
            # SA360 ones can't fail - they won't start if there are errors, so it's just
            # not ready yet. So just leave it here and try again later.
            logging.error(f'Report {run_config["report_id"]} not ready.')
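
When a watched job has finished, the checkers above re-publish the job's attributes with a b'RUN' payload to the report2bq trigger topic. A hedged, standalone sketch of that publish call using google.cloud.pubsub_v1; the topic name and attribute values are placeholders.

# Hedged sketch of the trigger publish; topic and attributes are placeholders.
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic = 'projects/my-gcp-project/topics/report2bq-trigger'

# PubSub message attributes must be strings.
future = publisher.publish(topic, data=b'RUN',
                           report_id='12345678',
                           email='analyst@example.com',
                           project='my-gcp-project')
print(future.result())   # message id once the publish has completed
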
Example #26
 def datastore(self) -> AbstractDatastore:
     """The datastore property."""
     from classes.firestore import Firestore
     return Firestore()
Example #27
 def firestore(self) -> AbstractDatastore:
     return Firestore()
 def add(self, firestore: Firestore, report: str, file: str, **unused):
     with open(file) as definition:
         cfg = json.loads(''.join(definition.readlines()))
         Firestore().update_document(Type.SA360_RPT, '_reports',
                                     {report: cfg})
Example #29
class SA360(object):
    def __init__(self,
                 email: str,
                 project: str,
                 append: bool = False,
                 infer_schema: bool = False):
        self.email = email
        self.project = project
        self.creds = Credentials(email=email, project=project)
        self.credentials = storage.Client()._credentials
        self.transport = AuthorizedSession(credentials=self.credentials)
        self.append = append
        self.infer_schema = infer_schema

        self.firestore = Firestore(email=email, project=project)

        self.chunk_multiplier = int(os.environ.get('CHUNK_MULTIPLIER', 64))
        self.bucket = f'{self.project}-report2bq-upload'

    def _soupify(self, data: BytesIO) -> BeautifulSoup:
        return BeautifulSoup(data, 'lxml')

    def process(self, bucket: str, report_details: Dict[str, Any]) -> None:
        input_buffer = BytesIO()
        repeater = self._stream_processor(bucket=bucket,
                                          report_details=report_details,
                                          repeatable=False)
        # old_id, report_details['id'] = report_details['id'], f'{report_details["id"]}-repeat'
        # self.upload_report(bucket=bucket, report_details=report_details, input_buffer=repeater)
        # report_details['id'] = old_id

    def handle_offline_report(self, run_config: Dict[str, Any]) -> bool:
        sa360_service = DiscoverService.get_service(Service.SA360, self.creds)
        request = sa360_service.reports().get(reportId=run_config['file_id'])

        try:
            report = request.execute()

            if report['isReportReady']:
                report_config = self.firestore.get_report_config(
                    type=Type.SA360_RPT, id=run_config['report_id'])

                csv_header, csv_types = self.read_header(report)
                schema = CSVHelpers.create_table_schema(
                    csv_header, csv_types if self.infer_schema else None)
                report_config['schema'] = schema
                report_config['files'] = report['files']

                if 'dest_project' in run_config:
                    report_config['dest_project'] = run_config['dest_project']
                if 'dest_dataset' in run_config:
                    report_config['dest_dataset'] = run_config['dest_dataset']
                if 'notify_topic' in run_config:
                    report_config['notifier'] = {
                        'topic': run_config['notify_topic'],
                    }
                    if 'notify_message' in run_config:
                        report_config['notifier']['message'] = run_config[
                            'notify_message']

                # update the report details please...
                self.firestore.update_document(Type.SA360_RPT,
                                               run_config['report_id'],
                                               report_config)

                # ... then stream the file to GCS a la DV360/CM
                self._stream_report_to_gcs(report_details=report_config,
                                           run_config=run_config)

            return report['isReportReady']

        except Exception as e:
            logging.error(
                f'Report fetch error: Run {run_config["file_id"]} for report {run_config["report_id"]}'
            )
            return False

    def read_header(self, report_config: dict) -> list:
        r = urllib.request.Request(report_config['files'][0]['url'])
        for header in self.creds.get_auth_headers():
            r.add_header(header, self.creds.get_auth_headers()[header])

        with closing(urlopen(r)) as report:
            data = report.read(self.chunk_multiplier * 1024 * 1024)
            bytes_io = BytesIO(data)

        return CSVHelpers.get_column_types(bytes_io)

    @measure_memory
    def _stream_report_to_gcs(self, report_details: Dict[str, Any],
                              run_config: Dict[str, Any]) -> None:
        """Multi-threaded stream to GCS
    
    Arguments:
        report_details {dict} -- Report definition
        run_config {dict} -- Run configuration, including the report id
    """
        queue = Queue()

        report_id = run_config['report_id']
        chunk_size = self.chunk_multiplier * 1024 * 1024
        out_file = BytesIO()

        streamer = ThreadedGCSObjectStreamUpload(client=Cloud_Storage.client(),
                                                 bucket_name=self.bucket,
                                                 blob_name=f'{report_id}.csv',
                                                 chunk_size=chunk_size,
                                                 queue=queue)
        streamer.start()

        r = urllib.request.Request(report_details['files'][0]['url'])
        for header in self.creds.get_auth_headers():
            r.add_header(header, self.creds.get_auth_headers()[header])

        with closing(urlopen(r)) as _report:
            _downloaded = 0
            chunk_id = 1
            _report_size = int(_report.headers['content-length'])
            while _downloaded < _report_size:
                chunk = _report.read(chunk_size)
                _downloaded += len(chunk)
                queue.put((chunk_id, chunk))
                chunk_id += 1

        queue.join()
        streamer.stop()

    @timeit
    def _fetch_data(self, report_details: Dict[str, Any],
                    buffer: BytesIO) -> int:
        try:
            report_url = report_details['url']

            request = requests.Download(report_url, stream=buffer)
            request.consume(transport=self.transport)
            return self._stream_size(buffer)

        except Exception as e:
            logging.error(e)

        return -1

    @timeit
    def _stream_size(self, buffer: BytesIO) -> int:
        pos = buffer.tell()
        buffer.seek(0, SEEK_END)
        size = buffer.tell()
        buffer.seek(pos)
        return size

    def _extract_keys(self, buffer: BytesIO, key: str) -> Tuple[str, BytesIO]:
        b = buffer.getvalue()
        start_pos = b.find((f'<{key}>').encode('utf-8'))
        if start_pos == -1:
            buffer.seek(0)
            extract = None
            new_stream = None
        else:
            end_pos = b.find((f'</{key}>').encode('utf-8'), start_pos)
            buffer.seek(start_pos)
            content = buffer.read(end_pos + len(f'</{key}>') - start_pos)
            extract = content.decode('utf-8')
            new_stream = BytesIO(buffer.read())

        return extract, new_stream

    @timeit
    def _get_connection(self, report_url: str):
        auth_headers = self.creds.get_auth_headers()
        conn = req.get(report_url, stream=True, headers=auth_headers)
        return conn

    def _find_fieldnames(self, buffer: BytesIO) -> Tuple[str, BytesIO]:
        header, buffer = self._extract_keys(buffer=buffer, key='thead')
        if header:
            fieldnames = [
                CSVHelpers.sanitize_string(field)
                for field in re.findall(r'\<th[^>]*\>([^<]*)\<\/th\>', header)
            ]
            # logging.info(f'Fields: {fieldnames}')
            del header
        else:
            fieldnames = None

        return fieldnames, buffer

    def _next_chunk(self,
                    stream,
                    html_chunk_size: int = None) -> Tuple[bytes, bool]:
        _buffer = BytesIO()
        last_chunk = False
        while len(_buffer.getvalue()) < html_chunk_size and not last_chunk:
            try:
                _block = stream.__next__()
                if _block: _buffer.write(_block)
            except StopIteration:
                last_chunk = True

        return _buffer.getvalue(), last_chunk

    @measure_memory
    def _stream_processor(self,
                          bucket: str,
                          report_details: Dict[str, Any],
                          repeatable: bool = False) -> BytesIO:
        repeater = BytesIO()
        report_url = report_details['url']
        remainder = b''
        queue = Queue()
        output_buffer = StringIO()
        html_chunk_size = 2048 * 1024
        chunk_size = 1024 * 1024
        streamer = ThreadedGCSObjectStreamUpload(
            client=Cloud_Storage.client(credentials=self.creds),
            bucket_name=bucket,
            blob_name='{id}.csv'.format(id=report_details['id']),
            chunk_size=chunk_size,
            queue=queue)
        streamer.daemon = True
        streamer.start()

        try:
            chunk_id = 0
            conn = self._get_connection(report_url)
            _stream = conn.iter_content(chunk_size=html_chunk_size)
            source_size = 0

            done = False
            fieldnames = None

            while not done:
                # logging.info(f'Processing chunk {chunk_id}')
                # logging.info(f'Processing chunk {chunk_id}, remainder {remainder.decode("utf-8")}')
                chunk = BytesIO()
                chunk.write(remainder)
                remainder = b''

                block, done = self._next_chunk(_stream, html_chunk_size)
                source_size += len(block)
                # logging.info(f'{len(block):,}, begins {block[0:80]} : ends {block[-80:].decode("utf-8")}')
                if repeatable: repeater.write(block)
                chunk.write(block)
                if len(chunk.getvalue()) < html_chunk_size and not done:
                    continue

                # logging.info(f'Chunk size {len(chunk.getvalue()):,} bytes')
                chunk.seek(0)

                if chunk_id == 0:
                    fieldnames, chunk = self._find_fieldnames(buffer=chunk)

                # find last </tr> on any section but the last, chop off the last portion and store
                last_tr_pos = chunk.getvalue().rfind(b'</tr>')
                if last_tr_pos == -1:
                    # logging.debug(f'HALP! {chunk.getvalue()}')
                    remainder = chunk.getvalue()
                    continue

                else:
                    last_tr_pos += 5
                    chunk.seek(last_tr_pos)
                    remainder = chunk.read()
                    # logging.debug(f'Remainder: {remainder}')
                    chunk.truncate(last_tr_pos)

                rows = []
                while True:
                    tr, chunk = self._extract_keys(chunk, 'tr')
                    if chunk:
                        rows.append([
                            unescape(field) for field in re.findall(
                                r'\<td[^>]*\>([^<]*)\<\/td\>', tr)
                        ])
                    else:
                        break

                # queue for upload
                report_data = []
                for row in rows:
                    report_data.append(dict(zip(fieldnames, row)))

                writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
                if chunk_id == 0: writer.writeheader()

                [writer.writerow(row) for row in report_data]

                output_buffer.seek(0)
                # logging.info(f'Sending chunk {chunk_id} size {len(output_buffer.getvalue())}')
                queue.put((chunk_id, output_buffer.getvalue().encode('utf-8')))
                chunk_id += 1
                chunk = BytesIO()
                output_buffer.seek(0)
                output_buffer.truncate(0)

            logging.info(f'SA360 report length: {source_size:,} bytes')
            queue.join()
            streamer.stop()
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)
            return repeater

        except Exception as e:
            logging.error(e)

    @measure_memory
    def upload_report(self,
                      bucket: str,
                      report_details: Dict[str, Any],
                      input_buffer: BytesIO = None):
        output_buffer = StringIO()  #BytesIO()

        try:
            if not input_buffer:
                input_buffer = BytesIO()
                request = requests.Download(report_details['url'],
                                            stream=input_buffer)
                request.consume(transport=self.transport)
                logging.info('Report data size: {bytes}'.format(bytes=0))

            input_buffer.seek(0)
            soup = self._soupify(input_buffer)
            # del input_buffer

            headers = soup.find('thead').find_all('th')
            fieldnames = []
            for header in headers:
                fieldnames.append(CSVHelpers.sanitize_string(header.string))

            rows = soup.find('tbody').find_all('tr')
            report_data = []
            for row in rows:
                data = []
                for col in row.contents:
                    data.append(col.string)
                report_data.append(dict(zip(fieldnames, data)))

            writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
            writer.writeheader()

            for row in report_data:
                writer.writerow(row)

            output_buffer.seek(0)
            Cloud_Storage.write_file(bucket=bucket,
                                     file=f"{report_details['id']}.csv",
                                     data=output_buffer.getvalue())
            report_details['schema'] = CSVHelpers.create_table_schema(
                fieldnames)

        except Exception as e:
            logging.error(e)
Example #30
class ReportLoader(object):
  """Run the report loading process
  
  This performs the CSV import into BQ. It is triggered by a finalize/create on a
  monitored GCS bucket, and will ONLY process CSVs. All other files written to that
  bucket will result in an error in the logs. The file must be named the same as the report
  id that is stored in Firestore - this is how the process knows which table/schema to use.

  Once started, the BQ Import Job (of type google.cloud.bigquery.LoadJob) is stored in 
  Firestore, under the 'jobs' key. This is then monitored for completion by JobMonitor.
  """
  CS = storage.Client()     # uses default service account credentials
  FIRESTORE = Firestore()   # uses default service account credentials


  def process(self, data: Dict[str, Any], context):
    """Process an added file

    This is the entry point for the Cloud Function to create the BQ import job.
    
    Arguments:
        data {Dict[str, Any]} -- the GCS event data (bucket and object name)
        context {Dict[str, Any]} -- context data. unused
    """
    logging.info(data)
    bucket_name = data['bucket']
    file_name = data['name']

    if file_name.upper().endswith('CSV'):
      logging.info('Processing CSV file %s' % file_name)

      try:
        self._handle_csv(bucket_name, file_name)

      except Exception as e:
        logging.error('Error processing file %s\n%s' % (file_name, e))

    else:
      # Ignore it, it's probably the schema
      logging.warning('File added that will not be processed: %s' % file_name)


  def _get_report_config(self, id: str) -> (Type, Dict[str, Any]):
    """Fetch the report configuration

    Load the stored report configuration from Firestore and return the report type
    and config as a tuple
    
    Arguments:
        id {str} -- Report Id, aka CSV file name
    
    Returns:
        (Type, Dict[str, Any]) -- Tuple containing the report type as an Enum, and the
        report configuration.
    """
    config = None
    for config_type in [Type.DV360, Type.CM, Type.SA360, Type.SA360_RPT]:
      config = self.FIRESTORE.get_report_config(config_type, id)
      if config: return config_type, config

    return None, None


  def _handle_csv(self, bucket_name: str, file_name: str):
    """Handle the CSV file

    Work out which type of job it is and send it to the appropriate uploader
    
    Arguments:
        bucket_name {str} -- name of the source bucket
        file_name {str} -- name of the CSV file
    """
    # Load config file. Must be present to continue
    # This could be either DBM/DV360 or (D)CM
    report_id = file_name.split('/')[-1].split('.')[0]
    config_type, config = self._get_report_config(report_id)

    if not config_type:
      self._email_error(f'No config found for report {report_id}')
      raise Exception(f'No config found for report {report_id}')

    logging.info(config)

    # Insert with schema and table name from config
    if config_type == Type.DV360:
      job = self._import_dbm_report(bucket_name, file_name, config)

    elif config_type == Type.CM:
      job = self._import_dcm_report(bucket_name, file_name, config)

    elif config_type == Type.SA360:
      job = self._import_sa360_report(bucket_name, file_name, config)

    elif config_type == Type.SA360_RPT:
      job = self._import_sa360_report(bucket_name, file_name, config)

    # Store the completed job in Firestore
    if job:
      self.FIRESTORE.store_import_job_details(report_id, job)


  def _import_dbm_report(self, bucket_name, file_name, config) -> bigquery.LoadJob:
    """Begin DV360 import

    These functions are currently identical, but they need not be (and used not to be); they are
    kept separate because at some point each product's CSVs could be subtly different, or one
    product or another may switch from CSV to (say) JSON.
    
    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)


  def _import_dcm_report(self, bucket_name, file_name, config):
    """Begin CM import

    These functions are currently identical, but they need not be (and used not to be); they are
    kept separate because at some point each product's CSVs could be subtly different, or one
    product or another may switch from CSV to (say) JSON.
    
    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)


  def _import_sa360_report(self, bucket_name, file_name, config):
    """Begin SA360 import

    These functions are currently identical, but they need not be (and used not to be); they are
    kept separate because at some point each product's CSVs could be subtly different, or one
    product or another may switch from CSV to (say) JSON.
    
    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    return self._import_report(bucket_name, file_name, config)


  def _import_report(self, bucket_name: str, file_name: str, config: dict) -> bigquery.LoadJob:
    """Begin CSV import

    Create and start the Big Query import job.

    Arguments:
        bucket_name {str} -- GCS bucket name
        file_name {str} -- CSV file name
        config {Dict[str, Any]} -- report config
    
    Returns:
        bigquery.LoadJob
    """
    if config.get('dest_project'):
      # authenticate against supplied project with supplied key
      project = config.get('dest_project') or os.environ.get('GCP_PROJECT')
      client_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file=f"{config['email']}_user_token.json"
      ))
      server_key = json.loads(Cloud_Storage.fetch_file(
        bucket=f"{os.environ.get('GCP_PROJECT')}-report2bq-tokens",
        file='client_secrets.json'
      ))
      client_key['client_id'] = (server_key.get('web') or server_key.get('installed')).get('client_id')
      client_key['client_secret'] = (server_key.get('web') or server_key.get('installed')).get('client_secret')
      logging.info(client_key)
      creds = Credentials.from_authorized_user_info(client_key)
      bq = bigquery.Client(project=project, credentials=creds)

    else:
      project = os.environ.get('GCP_PROJECT')
      bq = bigquery.Client()

    dataset = config.get('dest_dataset') or os.environ.get('BQ_DATASET') or 'report2bq'

    table_name = config.get('table_name', CSVHelpers.sanitize_string(file_name))
    logging.info(f'bucket {bucket_name}, table {table_name}, file_name {file_name}')

    json_schema = config['schema']
    schema = []
    _json_schema = []
    # Build the json format schema that the BQ LoadJob requires from the text-based ones in the config
    for field in json_schema:
      f = bigquery.schema.SchemaField(name=field['name'],
                                      field_type=field['type'],
                                      mode=field['mode'])
      schema.append(f)
      _json_schema.append(f'{field["name"]}: {field["type"]}')

    table_ref = bq.dataset(dataset).table(table_name)

    # Default action is to completely replace the table each time. If requested, however then
    # we can do an append for (say) huge jobs where you would see the table with 60 days once
    # and then append 'yesterday' each day.
    if config.get('append', False):
      if self._table_exists(bq, table_ref) and not self._validate_schema(bq, table_ref, schema):
        config_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in schema])
        target_schema = '\n'.join([ f'{field.name}, {field.field_type}' for field in bq.get_table(table_ref).schema])
        self._email_error(
          email=config['email'], 
          message=f'''
Mismatched schema for {project}.{dataset}.{table_name}, trying anyway

Report has schema:
{config_schema}

Table has schema:
{target_schema}
'''
        )
        logging.error(f"Mismatched schema for {project}.{dataset}.{table_name}, trying anyway")

      import_type = bigquery.WriteDisposition.WRITE_APPEND
      
    else:
      import_type = bigquery.WriteDisposition.WRITE_TRUNCATE

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = import_type
    # Assume a CSV header is the first line unless otherwise specified in the report's own config
    job_config.skip_leading_rows = config.get('csv_header_length', 1)
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.schema = schema
    # Allow a few errors, just in case
    job_config.max_bad_records = 10
    # Allow for DV360/CM (SA360 won't) to pass jagged rows, which they do
    job_config.allow_jagged_rows = True
    
    uri = f'gs://{bucket_name}/{file_name}'
    load_job = bq.load_table_from_uri(
        uri, table_ref, job_config=job_config
    )  # API request
    logging.info(f'Starting CSV import job {load_job.job_id}')

    return load_job


  def _table_exists(self, bq: bigquery.Client, table_ref: bigquery.TableReference) -> bool:
    try:
        bq.get_table(table_ref)
        return True

    except NotFound:
        return False


  def _validate_schema(self, bq: bigquery.Client, table_ref: bigquery.TableReference, schema: List[bigquery.schema.SchemaField]) -> bool:
    _table = bq.get_table(table_ref)
    _schema = _table.schema

    return _schema == schema


  def _email_error(self, message: str, email: str=None, error: Exception=None) -> None:
    _to = [email] if email else []
    _administrator = os.environ.get('ADMINISTRATOR_EMAIL') or self.FIRESTORE.get_document(Type._ADMIN, 'admin').get('email')
    _cc = [_administrator] if _administrator else []

    if _to or _cc:
      message = GMailMessage(
        to=_to, 
        cc=_cc,
        subject=f'Error in report_loader',
        body=f'''
{message}

Error: {error if error else 'No exception.'}
''', 
        project=os.environ.get('GCP_PROJECT'))

      GMail().send_message(
        message=message,
        credentials=Report2BQCredentials(email=email, project=os.environ.get('GCP_PROJECT'))
      )
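
For context, a hedged sketch of how the process() entry point above might be exercised with a GCS object-finalize event; the bucket and object names are placeholders, and running it for real requires default GCP credentials plus a matching report config in Firestore.

# Hypothetical GCS finalize event; values are placeholders. The object name
# (minus the extension) must match a report id stored in Firestore.
event = {
    'bucket': 'my-gcp-project-report2bq-upload',
    'name': '12345678.csv',
}

ReportLoader().process(event, None)   # the context argument is unused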