Example no. 1
    def execute(self, *args):
        super().execute()

        valid = EssentialParameters(
            self.__class__.__name__, [self._tblname, self._bucket, self._dest_dir]
        )
        valid()

        os.makedirs(self._dest_dir, exist_ok=True)

        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        gbq_client = BigQuery.get_bigquery_client(key_filepath)
        gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

        gcs_client = Gcs.get_gcs_client(key_filepath)
        gcs_bucket = gcs_client.bucket(self._bucket)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        """
        gsc dir -> gs://{bucket_name}
                       /{dataset_name}/{table_name}
                       /{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
        """
        if self._filename:
            dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
        else:
            dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

        # job config settings
        job_config = bigquery.ExtractJobConfig()
        job_config.compression = bigquery.Compression.GZIP
        job_config.destination_format = bigquery.DestinationFormat.CSV

        # Execute query.
        job = gbq_client.extract_table(
            gbq_ref, dest_gcs, job_config=job_config, location=self._location
        )
        job.result()

        # Download from gcs
        for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary files
        for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
            blob.delete()
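To make the path layout concrete: with a bucket "my-bucket", dataset "sales", and table "orders" (hypothetical names), the extract job above writes objects such as

    gs://my-bucket/sales/orders/AbCdEfGh-20240102030405000000/*.csv.gz

where "AbCdEfGh" stands in for the eight random ASCII letters and the digit run for the strftime("%Y%m%d%H%M%S%f") timestamp.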
Example no. 2
 def test_get_destination_format_json(self):
     assert BigQuery.get_destination_format(".json") == "NEWLINE_DELIMITED_JSON"
Example no. 3
    def _save_as_file_via_gcs(self):
        self._logger.info("Save data as a file via GCS")
        os.makedirs(self._dest_dir, exist_ok=True)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms,)
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                    "Please see more information "
                    "https://github.com/BrainPad/cliboa/blob/master/docs/modules/bigquery_read.md"
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        gbq_client = BigQuery.get_bigquery_client(key_filepath)
        if self._dataset and self._tblname:
            table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
        elif self._dataset and not self._tblname:
            tmp_tbl = (
                "tmp_"
                + StringUtil().random_str(self._RANDOM_STR_LENGTH)
                + "_"
                + ymd_hms
            )
            table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
        gcs_client = Gcs.get_gcs_client(key_filepath)
        gcs_bucket = gcs_client.bucket(self._bucket)

        # extract job config settings
        ext_job_config = BigQuery.get_extract_job_config()
        ext_job_config.compression = BigQuery.get_compression_type()
        ext = ".csv"
        if self._filename:
            _, ext = os.path.splitext(self._filename)
            support_ext = [".csv", ".json"]
            if ext not in support_ext:
                raise InvalidParameter("%s is not supported as filename." % ext)
        ext_job_config.destination_format = BigQuery.get_destination_format(ext)

        comp_format_and_ext = {"GZIP": ".gz"}
        comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
        if self._filename:
            dest_gcs = "gs://%s/%s/%s%s" % (
                self._bucket,
                prefix,
                self._filename,
                comp_ext,
            )
        else:
            dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

        # Execute query.
        if self._query:
            query_job_config = BigQuery.get_query_job_config()
            query_job_config.destination = table_ref
            query_job_config.write_disposition = BigQuery.get_write_disposition()
            query_job = gbq_client.query(
                self._query, location=self._location, job_config=query_job_config
            )
            query_job.result()

        # Extract to GCS
        extract_job = gbq_client.extract_table(
            table_ref, dest_gcs, job_config=ext_job_config, location=self._location
        )
        extract_job.result()

        # Download from gcs
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary table
        if self._query:
            gbq_client.delete_table(table_ref)

        # Cleanup temporary files
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            blob.delete()
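Examples 1 and 3 both call a Gcs.get_gcs_client helper that is not shown in this listing. A minimal sketch of what it plausibly looks like, assuming the google-cloud-storage package and that key_filepath is either a path to a service-account JSON key or None; the fallback branch is an assumption, not confirmed by the excerpts:

    from google.cloud import storage

    class Gcs:
        @staticmethod
        def get_gcs_client(key_filepath=None):
            # Authenticate explicitly when a key file is given; otherwise
            # fall back to application-default credentials (assumed).
            if key_filepath:
                return storage.Client.from_service_account_json(key_filepath)
            return storage.Client()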
Example no. 4
 def test_get_compression_type(self):
     assert BigQuery.get_compression_type() == "GZIP"
Example no. 5
 def test_get_destination_format_csv(self):
     assert BigQuery.get_destination_format(".csv") == "CSV"
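The tests in Examples 2, 4, and 5 pin down two of the BigQuery helpers used above. A sketch that satisfies all three tests, assuming google-cloud-bigquery (whose Compression and DestinationFormat constants are plain strings); the CSV fallback for unknown extensions is a guess, not fixed by the tests:

    from google.cloud import bigquery

    class BigQuery:
        @staticmethod
        def get_compression_type():
            return bigquery.Compression.GZIP  # == "GZIP"

        @staticmethod
        def get_destination_format(ext):
            formats = {
                ".csv": bigquery.DestinationFormat.CSV,
                ".json": bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON,
            }
            # Unknown extensions fall back to CSV here (assumption).
            return formats.get(ext, bigquery.DestinationFormat.CSV)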
Example no. 6
 def test_get_query_job_config(self):
     self.assertTrue(
         isinstance(BigQuery.get_query_job_config(),
                    type(bigquery.QueryJobConfig())))
Example no. 7
 def test_get_extract_job_config_with_no_header(self):
     self.assertTrue(
         isinstance(
             BigQuery.get_extract_job_config(print_header=False),
             type(bigquery.ExtractJobConfig(print_header=False)),
         )
     )
Example no. 8
 def test_get_extract_job_config_with_header(self):
     self.assertTrue(
         isinstance(BigQuery.get_extract_job_config(),
                    type(bigquery.ExtractJobConfig())))
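Examples 6 through 8 suggest these helpers are thin wrappers over the google-cloud-bigquery job configs. A sketch consistent with all three tests:

    from google.cloud import bigquery

    class BigQuery:
        @staticmethod
        def get_extract_job_config(print_header=True):
            # print_header=False drops the header row from CSV exports.
            return bigquery.ExtractJobConfig(print_header=print_header)

        @staticmethod
        def get_query_job_config():
            return bigquery.QueryJobConfig()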
Example no. 9
 def test_get_bigquery_client_no_credentials(self):
     assert BigQuery.get_bigquery_client(None) == bigquery.Client()
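Example 9 only fixes the no-credentials case. A minimal sketch consistent with that test and with the key_filepath usage in Examples 1 and 3; the key-file branch is an assumption:

    from google.cloud import bigquery

    class BigQuery:
        @staticmethod
        def get_bigquery_client(key_filepath=None):
            if key_filepath is None:
                # No key file: application-default credentials, as Example 9 expects.
                return bigquery.Client()
            return bigquery.Client.from_service_account_json(key_filepath)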
Example no. 10
    def _save_as_file_via_gcs(self):
        self._logger.info("Save data as a file via GCS")
        os.makedirs(self._dest_dir, exist_ok=True)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % (
            StringUtil().random_str(self._RANDOM_STR_LENGTH),
            ymd_hms,
        )
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        gbq_client = BigQuery.get_bigquery_client(self._credentials)
        if self._dataset and self._tblname:
            table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
        elif self._dataset and not self._tblname:
            tmp_tbl = ("tmp_" +
                       StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" +
                       ymd_hms)
            table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)

        gcs_client = Gcs.get_gcs_client(self._credentials)
        gcs_bucket = gcs_client.get_bucket(self._bucket)

        # extract job config settings
        ext_job_config = BigQuery.get_extract_job_config()
        ext_job_config.compression = BigQuery.get_compression_type()
        ext = ".csv"
        if self._filename:
            _, ext = os.path.splitext(self._filename)
            support_ext = [".csv", ".json"]
            if ext not in support_ext:
                raise InvalidParameter("%s is not supported as filename." %
                                       ext)
        ext_job_config.destination_format = BigQuery.get_destination_format(
            ext)

        comp_format_and_ext = {"GZIP": ".gz"}
        comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
        if self._filename:
            dest_gcs = "gs://%s/%s/%s%s" % (
                self._bucket,
                prefix,
                self._filename,
                comp_ext,
            )
        else:
            dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext,
                                             comp_ext)

        # Execute query.
        if self._query:
            query_job_config = BigQuery.get_query_job_config()
            query_job_config.destination = table_ref
            query_job_config.write_disposition = BigQuery.get_write_disposition()
            query_job = gbq_client.query(
                self._query, location=self._location, job_config=query_job_config
            )
            query_job.result()

        # Extract to GCS
        extract_job = gbq_client.extract_table(
            table_ref, dest_gcs, job_config=ext_job_config, location=self._location
        )
        extract_job.result()

        # Download from gcs
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary table
        if self._query:
            gbq_client.delete_table(table_ref)

        # Cleanup temporary files
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            blob.delete()
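For reference, the core flow shared by Examples 1, 3, and 10 reduces to a handful of google-cloud calls. A standalone distillation with hypothetical bucket, dataset, and table names:

    from google.cloud import bigquery, storage

    gbq_client = bigquery.Client()
    gcs_client = storage.Client()

    # Export the table to compressed CSV shards under a unique GCS prefix.
    prefix = "my_dataset/my_table/export-20240102030405000000"
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV
    table_ref = gbq_client.dataset("my_dataset").table("my_table")
    gbq_client.extract_table(
        table_ref, "gs://my-bucket/%s/*.csv.gz" % prefix, job_config=job_config
    ).result()

    # Download each shard locally, then delete it from GCS.
    bucket = gcs_client.bucket("my-bucket")
    for blob in bucket.list_blobs(prefix=prefix):
        blob.download_to_filename(blob.name.rsplit("/", 1)[-1])
        blob.delete()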