Example 1
    def copy(self, table_obj, dataset_name, table_name, if_exists='fail',
             tmp_gcs_bucket=None, gcs_client=None, job_config=None, **load_kwargs):
        """
        Copy a :ref:`parsons-table` into Google BigQuery via Google Cloud Storage.

        `Args:`
            table_obj: obj
                The Parsons Table to copy into BigQuery.
            dataset_name: str
                The dataset name to load the data into.
            table_name: str
                The table name to load the data into.
            if_exists: str
                If the table already exists, either ``fail``, ``append``, ``drop``
                or ``truncate`` the table.
            tmp_gcs_bucket: str
                The name of the Google Cloud Storage bucket used to stage the data before
                loading it into BigQuery. Required if ``GCS_TEMP_BUCKET`` is not specified.
            gcs_client: object
                The GoogleCloudStorage Connector to use for loading data into Google Cloud Storage.
            job_config: object
                A LoadJobConfig object to provide to the underlying call to load_table_from_uri
                on the BigQuery client. The function will create its own if not provided.
            **load_kwargs: kwargs
                Arguments to pass to the underlying load_table_from_uri call on the BigQuery
                client.
        """
        tmp_gcs_bucket = check_env.check('GCS_TEMP_BUCKET', tmp_gcs_bucket)

        if if_exists not in ['fail', 'truncate', 'append', 'drop']:
            raise ValueError(f'Unexpected value for if_exists: {if_exists}, must be one of '
                             '"append", "drop", "truncate", or "fail"')

        table_exists = self.table_exists(dataset_name, table_name)

        if not job_config:
            job_config = bigquery.LoadJobConfig()
            job_config.autodetect = True

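        # These settings are applied even to a user-supplied job_config; the
        # write disposition may be overridden below based on if_exists.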
        job_config.skip_leading_rows = 1
        job_config.source_format = bigquery.SourceFormat.CSV
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY
        job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

        dataset_ref = self.client.dataset(dataset_name)

        if table_exists:
            if if_exists == 'fail':
                raise ValueError('Table already exists.')
            elif if_exists == 'drop':
                self.delete_table(dataset_name, table_name)
            elif if_exists == 'append':
                job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
            elif if_exists == 'truncate':
                job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

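        # Stage the table in GCS under a unique blob name to avoid collisions.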
        gcs_client = gcs_client or GoogleCloudStorage()
        temp_blob_name = f'{uuid.uuid4()}.csv'
        temp_blob_uri = gcs_client.upload_table(table_obj, tmp_gcs_bucket, temp_blob_name)

        # load CSV from Cloud Storage into BigQuery
        try:
            load_job = self.client.load_table_from_uri(
                temp_blob_uri, dataset_ref.table(table_name),
                job_config=job_config, **load_kwargs,
            )
            load_job.result()
        finally:
            gcs_client.delete_blob(tmp_gcs_bucket, temp_blob_name)
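
For context, a minimal sketch of how this copy method might be called, assuming the connector is exported as GoogleBigQuery at the package top level. The table contents, dataset, and bucket names below are hypothetical placeholders:

    from parsons import Table, GoogleBigQuery

    # Hypothetical names; the temp bucket must already exist.
    tbl = Table([{'id': 1, 'name': 'Jane'}, {'id': 2, 'name': 'Ali'}])
    bq = GoogleBigQuery()
    bq.copy(tbl, 'my_dataset', 'people',
            if_exists='append',
            tmp_gcs_bucket='my-temp-bucket')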
Example 2
class TestGoogleStorageBuckets(unittest.TestCase):
    def setUp(self):

        self.cloud = GoogleCloudStorage()

        # Creating and deleting too many buckets caused problems, so check
        # whether the bucket already exists before creating it.
        if not self.cloud.bucket_exists(TEMP_BUCKET_NAME):
            self.cloud.create_bucket(TEMP_BUCKET_NAME)

            # Upload a file
            tmp_file_path = files.string_to_temp_file('A little string',
                                                      suffix='.txt')
            self.cloud.put_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME,
                                tmp_file_path)

    def test_list_buckets(self):

        # Assert that it finds the correct buckets
        bucket_list = self.cloud.list_buckets()

        # Make sure that my bucket is in the list
        self.assertIn(TEMP_BUCKET_NAME, bucket_list)

    def test_bucket_exists(self):

        # Assert finds a bucket that exists
        self.assertTrue(self.cloud.bucket_exists(TEMP_BUCKET_NAME))

        # Assert doesn't find a bucket that doesn't exist
        self.assertFalse(self.cloud.bucket_exists('NOT_A_REAL_BUCKET'))

    def test_get_bucket(self):

        # Assert that a bucket object is returned
        self.assertIsInstance(self.cloud.get_bucket(TEMP_BUCKET_NAME),
                              storage.bucket.Bucket)

    def test_create_bucket(self):

        # Temporary bucket has already been created as part of set up, so just checking
        # that it really exists
        self.assertTrue(self.cloud.bucket_exists(TEMP_BUCKET_NAME))

    def test_delete_bucket(self):

        # Create another bucket, delete it and make sure it doesn't exist
        self.cloud.create_bucket(TEMP_BUCKET_NAME + '_2')
        self.cloud.delete_bucket(TEMP_BUCKET_NAME + '_2')
        self.assertFalse(self.cloud.bucket_exists(TEMP_BUCKET_NAME + '_2'))

    def test_list_blobs(self):

        blob_list = self.cloud.list_blobs(TEMP_BUCKET_NAME)

        # Make sure that my file is in the list
        self.assertIn(TEMP_FILE_NAME, blob_list)

        # Make sure that there is only one file in the bucket
        self.assertEqual(len(blob_list), 1)

    def test_blob_exists(self):

        # Assert that it thinks that the blob exists
        self.assertTrue(
            self.cloud.blob_exists(TEMP_BUCKET_NAME, TEMP_FILE_NAME))

        # Assert that it thinks that a non-existent blob doesn't exist
        self.assertFalse(self.cloud.blob_exists(TEMP_BUCKET_NAME, 'FAKE_BLOB'))

    def test_put_blob(self):

        # Already being tested as part of setUp
        pass

    def test_get_blob(self):

        # Assert that a blob object is returned
        self.assertIsInstance(
            self.cloud.get_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME),
            storage.blob.Blob)

    def test_download_blob(self):

        # Download blob and ensure that it is the expected file
        path = self.cloud.download_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME)
        with open(path, 'r') as f:
            self.assertEqual(f.read(), 'A little string')

    def test_delete_blob(self):

        file_name = 'delete_me.txt'

        # Upload a file
        tmp_file_path = files.string_to_temp_file('A little string',
                                                  suffix='.txt')
        self.cloud.put_blob(TEMP_BUCKET_NAME, file_name, tmp_file_path)

        # Delete the blob and check that it no longer exists.
        self.cloud.delete_blob(TEMP_BUCKET_NAME, file_name)
        self.assertFalse(self.cloud.blob_exists(TEMP_BUCKET_NAME, file_name))
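
The module-level imports and constants referenced by these tests are not shown; a plausible reconstruction (the bucket and file names here are assumptions):

    import unittest

    from google.cloud import storage
    from parsons import GoogleCloudStorage
    from parsons.utilities import files

    # Hypothetical values; a real suite would likely use a unique bucket name.
    TEMP_BUCKET_NAME = 'parsons_test_bucket'
    TEMP_FILE_NAME = 'test_file.txt'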
Example 3
    def to_gcs_csv(self,
                   bucket_name,
                   blob_name,
                   app_creds=None,
                   project=None,
                   compression=None,
                   encoding=None,
                   errors='strict',
                   write_header=True,
                   public_url=False,
                   public_url_expires=60,
                   **csvargs):
        """
        Writes the table to a Google Cloud Storage blob as a CSV.

        `Args:`
            bucket_name: str
                The bucket to upload to
            blob_name: str
                The name to give the blob. If it ends in '.gz' or '.zip', the file
                will be compressed.
            app_creds: str
                A credentials json string or a path to a json file. Not required
                if the ``GOOGLE_APPLICATION_CREDENTIALS`` env variable is set.
            project: str
                The project which the client acts on behalf of. If not passed,
                it is inferred from the environment.
            compression: str
                The compression type for the CSV. Currently ``None``, ``zip`` and
                ``gzip`` are supported. If specified, overrides the compression
                implied by the blob name suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                How encoding errors are handled (e.g. ``strict`` raises an error),
                as for Python's built-in ``open()``.
            write_header: boolean
                Include header in output
            public_url: boolean
                Create a public link to the file
            public_url_expires: int
                The time, in minutes, until the URL expires if ``public_url`` is set
                to ``True``.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        `Returns:`
            The public url if ``public_url`` is set to ``True``; otherwise ``None``.
        """  # noqa: W605

        compression = compression or files.compression_type_for_path(blob_name)

        csv_name = files.extract_file_name(blob_name,
                                           include_suffix=False) + '.csv'

        # Save the CSV as a temp file
        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 csv_name=csv_name,
                                 **csvargs)

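        # Imported locally (rather than at module top), presumably to avoid a
        # circular import between Table and the GCS connector.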
        from parsons.google.google_cloud_storage import GoogleCloudStorage
        gcs = GoogleCloudStorage(app_creds=app_creds, project=project)
        gcs.put_blob(bucket_name, blob_name, local_path)

        if public_url:
            return gcs.get_url(bucket_name,
                               blob_name,
                               expires_in=public_url_expires)
        else:
            return None
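
A minimal usage sketch; the bucket name and blob path are hypothetical, and the ``GOOGLE_APPLICATION_CREDENTIALS`` env variable is assumed to be set:

    from parsons import Table

    tbl = Table([{'id': 1, 'name': 'Jane'}])
    # Returns a signed URL valid for 30 minutes because public_url=True.
    url = tbl.to_gcs_csv('my-bucket', 'exports/people.csv',
                         public_url=True, public_url_expires=30)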