def test_run_async_load_job_whenLoadingFileWithHeader_headerIsSkiped(self):
        """skip_leading_rows=1 must drop the CSV header row during load.

        Uploads a two-line CSV (header + one data row) to GCS, runs an async
        load job with skip_leading_rows=1, then queries the destination table
        and checks that only the data row was imported.
        """
        bucket_name = "it_test_ems_gcp_toolkit"
        bucket = self.__get_test_bucket(bucket_name)
        blob_name = "sample_test_with_header.csv"
        blob = bucket.blob(blob_name)
        # Plain literal: the original used an f-string with no placeholders.
        blob.upload_from_string("HEADER\nROW\n")
        source_uri = f"gs://{bucket_name}/{blob_name}"
        config = EmsLoadJobConfig(
            source_uri_template=source_uri,
            destination_project_id=GCP_PROJECT_ID,
            destination_dataset=self.DATASET.dataset_id,
            destination_table="load_job_test_skip_header",
            schema={"fields": [{
                "type": "STRING",
                "name": "COLUMN"
            }]},
            write_disposition=EmsWriteDisposition.WRITE_TRUNCATE,
            skip_leading_rows=1)

        load_job_id = self.client.run_async_load_job("it_test", config)
        self.__wait_for_job_done(load_job_id)

        query = f"""
        SELECT * from `{config.destination_project_id}.{config.destination_dataset}.{config.destination_table}`
        """

        result = self.client.run_sync_query(query=query)
        expected = [{"COLUMN": "ROW"}]
        # assertEqual: assertEquals is a deprecated alias, removed in Python 3.12.
        self.assertEqual(expected, list(result))
# Example #2
 def setUp(self):
     """Build a fully-specified load-job config reused by the tests below."""
     self.ems_load_job_config = EmsLoadJobConfig(
         source_uri_template="gs://bucket_id/{blob_id}",
         schema=SCHEMA,
         destination_project_id="test_project",
         destination_dataset="test_dataset",
         destination_table="test_table",
         create_disposition=EmsCreateDisposition.CREATE_IF_NEEDED,
         write_disposition=EmsWriteDisposition.WRITE_APPEND)
    def test_run_async_load_job_loadsFileFromBucketToNewBigqueryTable(self):
        """End-to-end: a CSV uploaded to GCS is loaded into a new BQ table.

        Uploads a single CSV row with a randomized quantity (so a stale table
        from an earlier run cannot make the test pass), loads it with an
        explicit four-column schema, then queries the table back and checks
        every typed column round-tripped correctly.
        """
        bucket_name = "it_test_ems_gcp_toolkit"
        bucket = self.__get_test_bucket(bucket_name)
        blob_name = "sample_fruit_test.csv"
        blob = bucket.blob(blob_name)
        random_quantity = random.randint(10000, 99000)
        blob.upload_from_string(
            f"apple,{random_quantity},True,1970-01-01T12:00:00.000Z\n")
        source_uri = f"gs://{bucket_name}/{blob_name}"
        config = EmsLoadJobConfig(
            source_uri_template=source_uri,
            destination_project_id=GCP_PROJECT_ID,
            destination_dataset=self.DATASET.dataset_id,
            destination_table="load_job_test",
            schema={
                "fields": [{
                    "type": "STRING",
                    "name": "fruit"
                }, {
                    "type": "INT64",
                    "name": "quantity"
                }, {
                    "type": "BOOL",
                    "name": "is_delicious"
                }, {
                    "type": "TIMESTAMP",
                    "name": "best_before"
                }]
            },
            write_disposition=EmsWriteDisposition.WRITE_TRUNCATE)

        load_job_id = self.client.run_async_load_job("it_test", config)
        self.__wait_for_job_done(load_job_id)

        query = f"""
        SELECT * from `{config.destination_project_id}.{config.destination_dataset}.{config.destination_table}`
        """

        result = self.client.run_sync_query(query=query)
        expected = [{
            "fruit": "apple",
            "quantity": random_quantity,
            "is_delicious": True,
            # BigQuery TIMESTAMP values come back timezone-aware (UTC).
            "best_before": datetime.datetime(
                1970, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)
        }]
        # assertEqual: assertEquals is a deprecated alias, removed in Python 3.12.
        self.assertEqual(expected, list(result))
 def setUp(self):
     """Create a DONE load job carrying a known error payload for the tests."""
     self.expected_error_result = {"some": "error", "happened": "here"}
     self.load_config = EmsLoadJobConfig(
         source_uri_template="",
         schema={"fields": [{"name": "f", "type": "INT64"}]},
         destination_project_id="dummy-project-id",
         destination_dataset="dummy-dataset",
         destination_table="dummy-project-id.dummy-dataset.dummy-table")
     self.ems_load_job = EmsLoadJob("test-job-id",
                                    self.load_config,
                                    EmsJobState.DONE,
                                    self.expected_error_result)
    def test_run_async_load_job_submitsLoadJobAndReturnsJobIdWithProperConfig(
            self, bigquery_module_patch: bigquery):
        """run_async_load_job must pass a faithfully translated LoadJobConfig.

        Wires a mocked BigQuery client, submits an EmsLoadJobConfig, then
        inspects the keyword arguments of load_table_from_uri: source URI,
        job-id prefix, dispositions, labels and the JSON schema translated
        into SchemaField objects.
        """
        project_id = "some-project-id"
        source_uri = "gs://some-source-uri/to_object"
        bigquery_module_patch.Client.return_value = self.client_mock
        input_json_schema = {
            "fields": [{
                "type": "STRING",
                "name": "f1"
            }, {
                "mode": "REQUIRED",
                "type": "INTEGER",
                "name": "f2"
            }]
        }
        load_job_config = EmsLoadJobConfig(
            destination_project_id="some-destination-project-id",
            destination_dataset="some-destination-dataset",
            destination_table="some-destination-table",
            schema=input_json_schema,
            source_uri_template=source_uri,
            labels={"label1": "label1_value"})
        self.load_job_mock = Mock(LoadJob)
        self.load_job_mock.job_id = self.JOB_ID
        self.client_mock.load_table_from_uri.return_value = self.load_job_mock

        ems_bigquery_client = EmsBigqueryClient(project_id)
        result_job_id = ems_bigquery_client.run_async_load_job(
            "prefix", load_job_config)

        arguments = self.client_mock.load_table_from_uri.call_args_list[0][1]
        self.assertEqual(arguments["source_uris"], source_uri)
        self.assertEqual(arguments["job_id_prefix"], "prefix")
        # Assert against the constant the mock was wired with, not a
        # duplicated string literal ("some-job-id" in the original).
        self.assertEqual(result_job_id, self.JOB_ID)
        job_config = arguments["job_config"]
        self.assertIsInstance(job_config, LoadJobConfig)
        self.assertEqual(job_config.create_disposition,
                         EmsCreateDisposition.CREATE_IF_NEEDED.value)
        self.assertEqual(job_config.write_disposition,
                         EmsWriteDisposition.WRITE_APPEND.value)
        self.assertEqual(job_config.labels, {"label1": "label1_value"})

        field1 = SchemaField("f1", "STRING")
        field2 = SchemaField("f2", "INTEGER", "REQUIRED")
        self.assertEqual(job_config.schema, [field1, field2])
    def test_get_job_list_returnsLoadJob(self):
        """A submitted load job must appear in get_jobs_with_prefix."""
        schema = {"fields": [{"name": "some_name", "type": "STRING"}]}
        config = EmsLoadJobConfig(
            schema,
            "gs://some-non-existing-bucket-id/blob-id",
            destination_project_id=GCP_PROJECT_ID,
            destination_dataset="it_test_dataset",
            destination_table="some_table")
        min_creation_time = datetime.datetime.utcnow()
        submitted_job_id = self.client.run_async_load_job("load_job_test",
                                                          config)
        self.__wait_for_job_done(submitted_job_id)
        listed_jobs = self.client.get_jobs_with_prefix("load_job_test",
                                                       min_creation_time)

        self.assertTrue(
            any(job.job_id == submitted_job_id for job in listed_jobs))
    def test_get_job_list_returnsAllKindOfJobs(self):
        """Query, load and extract jobs sharing a prefix are all listed."""
        load_config = EmsLoadJobConfig(
            {"fields": [{"name": "some_name", "type": "STRING"}]},
            "gs://some-non-existing-bucket-id/blob-id",
            destination_project_id=GCP_PROJECT_ID,
            destination_dataset="it_test_dataset",
            destination_table="some_table")
        extract_config = EmsExtractJobConfig(
            compression=Compression.NONE,
            destination_format=DestinationFormat.CSV,
            field_delimiter=",",
            print_header=False)
        destination_uris = ["gs://some-non-existing-bucket-id/destination1"]

        min_creation_time = datetime.datetime.utcnow()
        query_job_id = self.client.run_async_query(self.DUMMY_QUERY,
                                                   job_id_prefix="it_job")
        load_job_id = self.client.run_async_load_job(job_id_prefix="it_job",
                                                     config=load_config)
        extract_job_id = self.client.run_async_extract_job(
            job_id_prefix="it_job",
            table=self.__get_table_path(),
            destination_uris=destination_uris,
            job_config=extract_config)

        for submitted_id in (query_job_id, load_job_id, extract_job_id):
            self.__wait_for_job_done(submitted_id)
        listed_jobs = self.client.get_jobs_with_prefix("it_job",
                                                       min_creation_time)
        job_types = [type(job) for job in listed_jobs]

        self.assertEqual(3, len(job_types))
        for expected_type in (EmsQueryJob, EmsLoadJob, EmsExtractJob):
            self.assertIn(expected_type, job_types)
# Example #8
 def test_destination_table_ifTableIsEmptyString_raisesValueError(self):
     """An empty destination table name must be rejected."""
     with self.assertRaises(ValueError):
         EmsLoadJobConfig(
             schema=SCHEMA,
             source_uri_template="",
             destination_table="")
# Example #9
 def test_destination_dataset_ifDatasetIsNone_raisesValueError(self):
     """A None destination dataset must be rejected."""
     with self.assertRaises(ValueError):
         EmsLoadJobConfig(
             schema=SCHEMA,
             source_uri_template="",
             destination_dataset=None)
# Example #10
 def test_destination_project_id_ifProjectIdIsMultipleWhitespaces_raisesValueError(
         self):
     """A whitespace-only destination project id must be rejected."""
     with self.assertRaises(ValueError):
         EmsLoadJobConfig(
             schema=SCHEMA,
             source_uri_template="",
             destination_project_id="     \t  ")
# Example #11
 def test_destination_project_id_ifProjectIdIsEmptyString_raisesValueError(
         self):
     """An empty destination project id must be rejected."""
     with self.assertRaises(ValueError):
         EmsLoadJobConfig(
             schema=SCHEMA,
             source_uri_template="",
             destination_project_id="")
# Example #12
    def __convert_to_ems_job(job):
        """Translate a google-cloud-bigquery job into the matching Ems* job.

        Handles QueryJob, LoadJob and ExtractJob; any other job type is
        logged and mapped to None.

        :param job: a bigquery QueryJob, LoadJob or ExtractJob instance
        :return: EmsQueryJob / EmsLoadJob / EmsExtractJob, or None
        """
        if isinstance(job, QueryJob):
            destination = job.destination
            # A query without a destination table yields (None, None, None).
            table_id, dataset_id, project_id = \
                (destination.table_id, destination.dataset_id, destination.project) \
                    if destination is not None else (None, None, None)

            config = EmsQueryJobConfig(
                priority=EmsJobPriority[job.priority],
                destination_project_id=project_id,
                destination_dataset=dataset_id,
                destination_table=table_id,
                create_disposition=EmsBigqueryClient.
                __convert_to_ems_create_disposition(job.create_disposition),
                write_disposition=EmsBigqueryClient.
                __convert_to_ems_write_disposition(job.write_disposition),
                time_partitioning=EmsBigqueryClient.
                __convert_to_ems_time_partitioning(job.time_partitioning),
                labels=job.labels)
            return EmsQueryJob(job.job_id, job.query, config,
                               EmsJobState(job.state), job.error_result,
                               job.created)
        elif isinstance(job, LoadJob):
            destination = job.destination
            table_id, dataset_id, project_id = destination.table_id, destination.dataset_id, destination.project
            schema = {
                "fields": _build_schema_resource(job.schema)
            } if job.schema else []

            config = EmsLoadJobConfig(
                schema=schema,
                source_uri_template=job.source_uris[0]
                if job.source_uris else None,
                destination_project_id=project_id,
                destination_dataset=dataset_id,
                destination_table=table_id,
                create_disposition=EmsBigqueryClient.
                __convert_to_ems_create_disposition(job.create_disposition),
                write_disposition=EmsBigqueryClient.
                __convert_to_ems_write_disposition(job.write_disposition),
                labels=job.labels)

            # Propagate the job's error result, matching the QueryJob and
            # ExtractJob branches (the original hard-coded None here).
            return EmsLoadJob(job_id=job.job_id,
                              load_config=config,
                              state=EmsJobState(job.state),
                              error_result=job.error_result,
                              created=job.created)
        elif isinstance(job, ExtractJob):
            table = f'{job.source.project}.{job.source.dataset_id}.{job.source.table_id}'
            destination_uris = job.destination_uris
            job_config = EmsExtractJobConfig(
                compression=Compression(job.compression)
                if job.compression else Compression.NONE,
                destination_format=DestinationFormat(job.destination_format)
                if job.destination_format else DestinationFormat.CSV,
                field_delimiter=job.field_delimiter,
                print_header=job.print_header,
                labels=job.labels)
            return EmsExtractJob(job_id=job.job_id,
                                 table=table,
                                 destination_uris=destination_uris,
                                 job_config=job_config,
                                 state=EmsJobState(job.state),
                                 error_result=job.error_result,
                                 created=job.created)
        else:
            LOGGER.warning(
                f"Unexpected job type for : {job.job_id}, with type class: {job.__class__}"
            )
            return None