Example 1
    def export_csv(self,
                   bucket_name: str,
                   bucket_path: str,
                   dataset: str,
                   table: str,
                   sep: str = "\t") -> str:

        bucket_url = f"gs://{bucket_name}/{self.config.lake_path}/{bucket_path}"

        logging.info(
            f"DataWarehouse.export_csv {bucket_url} to {dataset}.{table} ...")
        client = self._get_client()

        dataset_ref = DatasetReference(self.config.gcp_project, dataset)

        to_export = TableReference(dataset_ref, table)
        config = ExtractJobConfig()
        config.field_delimiter = sep
        config.destination_format = bigquery.DestinationFormat.CSV

        extract_job = client.extract_table(to_export,
                                           bucket_url,
                                           job_config=config)
        result = extract_job.result()

        logging.info(
            f"DataWarehouse.export_csv {bucket_url} to {dataset}.{table} Complete!"
        )

        return bucket_url
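
For reference, the same extract pattern can be run directly against the BigQuery client, outside any wrapper class. A minimal sketch with placeholder project, dataset, table, and bucket names:

from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, ExtractJobConfig, TableReference

# Placeholder identifiers -- substitute your own project, dataset, table, and bucket.
client = bigquery.Client(project="my-project")
to_export = TableReference(DatasetReference("my-project", "my_dataset"), "my_table")

config = ExtractJobConfig()
config.field_delimiter = "\t"
config.destination_format = bigquery.DestinationFormat.CSV

# Start the extract job and block until it finishes.
extract_job = client.extract_table(to_export,
                                   "gs://my-bucket/exports/my_table-*.csv",
                                   job_config=config)
extract_job.result()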
Example 2
    def export(self,
               tbl_ref,
               gcs_base_dir,
               file_format="csv",
               compression=None,
               preview=True):
        """
        :param tbl_ref: reference identifying the table(s) to export
        :param gcs_base_dir: GCS base bucket/prefix; each table is exported to its own subdirectory
        :param file_format: (Optional) csv, json or avro; defaults to csv if it cannot be determined
        :param compression: (Optional) defaults to None; can be gzip
        :param preview: if True, only print the planned exports without running them
        """
        tables = self.get_tables(tbl_ref)
        jobs = list()
        jc = ExtractJobConfig()
        jc.compression = compression
        jc.destination_format = self.__get_bq_format(file_format)
        for tbl in tables:
            gcs_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
            table_ref = TableReference(dataset_ref=tbl_ref.dataset_ref,
                                       table_id=tbl)
            print("--  {}{} => {} ".format("preview: " if preview else "", tbl,
                                           gcs_uri))
            if preview:
                continue

            jobs.append(
                self.connect(tbl_ref.project).extract_table(table_ref,
                                                            gcs_uri,
                                                            job_config=jc))
        self.__check_jobs(jobs)
Example 3
    def __init__(self, table_name=None, dataset_name=None, project_name=None):
        """ table name: [project.][dataset.]table. could be tbl_*_sth, or tbl_[min-max] for table names in range """
        self.project = project_name
        self.dataset = dataset_name
        self.table = table_name
        self.table_min = None  # to qualify date partitioned table, table min name
        self.table_max = None  # table max name

        if table_name:
            name_parts = table_name.replace(":", ".").split(".")
            if len(name_parts) == 3:  # project.dataset.table
                self.project = name_parts[0]
                self.dataset = name_parts[1]
                self.table = name_parts[2]
            elif len(name_parts) == 2:  # dataset.table
                self.dataset = name_parts[0]
                self.table = name_parts[1]

            tbl_parts = self.table.split("[")
            if len(tbl_parts) == 2:
                self.table = tbl_parts[0]
                dt_parts = tbl_parts[1].rstrip("]").split("-")
                self.table_min = self.table.rstrip(
                    "*") + dt_parts[0] if dt_parts[0] else None
                self.table_max = self.table.rstrip(
                    "*") + dt_parts[1] if dt_parts[1] else None

        if not self.project:
            self.project = DtTblRef.default_project()
        self.dataset_ref = DatasetReference(dataset_id=self.dataset,
                                            project=self.project)
        self.table_ref = TableReference(dataset_ref=self.dataset_ref,
                                        table_id=self.table)
Example 4
    def copy_table(self):
        # Overwrite parent method for alternative approach
        # If target table was created by a previous run, drop it first
        self.drop_table_if_exists(self.table_name,
                                  self.schema_name + self.schema_suffix,
                                  self.database_name)
        # Create target table if source table exists
        if self.test_if_table_exists(
                table_name=self.table_name,
                schema_name=self.schema_name,
                project_id=self.database_name,
        ):
            self.log.info("Copying table into temporary dataset!")
            conn = self.dwh_hook.dbconn
            ds_old = conn.get_dataset(self.schema_name)
            ds_new = conn.get_dataset(self.schema_name + self.schema_suffix)
            table_old = conn.get_table(table=TableReference(
                dataset_ref=ds_old, table_id=self.table_name))
            table_new = ds_new.table(self.table_name)
            copy_job = conn.copy_table(
                table_old,
                table_new,
                job_config=CopyJobConfig(write_disposition="WRITE_TRUNCATE"),
            )

            copy_job.result()  # Waits until the job is done
            assert copy_job.state == "DONE", "Unexpected job state: {0}".format(
                copy_job.state)
            self.log.info("Successfully copied {0}!".format(
                copy_job.__dict__["_properties"]["configuration"]["copy"]
                ["destinationTable"]["tableId"]))
Example 5
    def table(self, table_id):
        """Constructs a TableReference.

        :type table_id: str
        :param table_id: the ID of the table.

        :rtype: :class:`~google.cloud.bigquery.table.TableReference`
        :returns: a TableReference for a table in this dataset.
        """
        return TableReference(self, table_id)
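
The shorthand above is equivalent to constructing the reference explicitly; a brief illustration with placeholder names:

from google.cloud.bigquery import DatasetReference, TableReference

dataset_ref = DatasetReference("my-project", "my_dataset")
# DatasetReference.table(...) and the explicit constructor yield equal references.
assert dataset_ref.table("my_table") == TableReference(dataset_ref, "my_table")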
Example 6
    def copy_table(
            self,
            source_table_path,  # type: str
            destination_table_name,  # type: str
            destination_dataset=None,  # type: Optional[str]
            destination_project=None,  # type: Optional[str]
            replace_existing_table=False  # type: bool
    ):
        # type: (...) -> None
        """
        Copies the table at source_table_path to the location
        destination_project.destination_dataset.destination_table_name. If the destination project
        or dataset aren't set, the class default will be used.

        Args:
            source_table_path: The path of the table to copy.
            destination_table_name: The name of the table to copy to.
            destination_dataset: The name of the destination dataset. If unset, the client default
                dataset will be used.
            destination_project: The name of the destination project. If unset, the client default
                project will be used.
            replace_existing_table: If True, if the destination table already exists, it will delete
                it and copy the source table in its place.

        Raises:
            RuntimeError if the destination table already exists and replace_existing_table is False
            or the destination dataset does not exist
        """

        destination_dataset = destination_dataset or self.default_dataset_id
        destination_project = destination_project or self.project_id

        dataset_ref = DatasetReference(destination_project,
                                       destination_dataset)

        if not self.dataset_exists(dataset_ref):
            raise RuntimeError(
                'The dataset {} does not exist in project {}.'.format(
                    destination_dataset, destination_project))

        dest_table_ref = TableReference(dataset_ref, destination_table_name)

        if self.table_exists(dest_table_ref):
            if replace_existing_table:
                self.delete_table(dest_table_ref)
            else:
                raise RuntimeError(
                    'The table {} already exists in dataset {}.'.format(
                        destination_table_name, destination_dataset))

        dest_table_path = self.path(destination_table_name,
                                    destination_dataset, destination_project)

        self.create_table_from_query(
            'SELECT * FROM `{}`'.format(source_table_path), dest_table_path)
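
An alternative to copying via SELECT * is the client's native copy job, as Example 4 above uses; a minimal sketch with placeholder names and default write behavior:

from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, TableReference

client = bigquery.Client(project="my-project")
source = TableReference(DatasetReference("my-project", "src_dataset"), "src_table")
destination = TableReference(DatasetReference("my-project", "dst_dataset"), "dst_table")

# copy_table starts a BigQuery copy job; result() blocks until it completes.
client.copy_table(source, destination).result()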
Example 7
def _get_table_reference(self, table_id):
    """Constructs a TableReference.

    Args:
        table_id (str): The ID of the table.

    Returns:
        google.cloud.bigquery.table.TableReference:
            A table reference for a table in this dataset.
    """
    return TableReference(self, table_id)
Example 8
    def delete(self, tbl_ref, preview):
        datasets = self.get_datasets(tbl_ref)
        for dt in datasets:
            dt_ref = DtTblRef(tbl_ref.table, dt, tbl_ref.project)
            tables = self.get_tables(dt_ref, types=['TABLE', 'VIEW'])
            for tbl in tables:
                print("--  {}delete {}.{} ".format(
                    "preview: " if preview else "", dt, tbl))
                if preview:
                    continue
                # Delete from the dataset currently being iterated.
                self.connect(tbl_ref.project).delete_table(
                    TableReference(dt_ref.dataset_ref, tbl))
Example 9
    def get_table_reference_from_path(self, table_path):
        # type: (str) -> TableReference
        """
        Returns a TableReference for a given path to a BigQuery table.

        Args:
            table_path: A BigQuery table path in the form project.dataset.table

        Returns:
            A TableReference for the table specified by the path
        """
        _, dataset, table = self.parse_table_path(table_path)
        dataset_ref = DatasetReference(self.project_id, dataset)
        return TableReference(dataset_ref, table)
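
For the plain project.dataset.table case, the library also ships its own parser, which avoids hand-splitting the path; a short sketch with a placeholder path:

from google.cloud.bigquery import TableReference

# Parses "project.dataset.table"; a default_project can be passed for shorter forms.
ref = TableReference.from_string("my-project.my_dataset.my_table")
print(ref.project, ref.dataset_id, ref.table_id)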
Example 10
    def get_schema(self, dataset_id, table_name, project_id=None):
        # type: (str, str, Optional[str]) -> List[SchemaField]
        """Returns the schema of a table.

        Args:
          dataset_id: The dataset to query.
          table_name: The name of the table.
          project_id: The project ID of the table.
        Returns:
          A list of SchemaFields representing the schema.
        """

        dataset_ref = DatasetReference(project_id if project_id else self.project_id, dataset_id)
        table = self.gclient.get_table(TableReference(dataset_ref, table_name))

        return table.schema
Example 11
    def drop_table_if_exists(self, table_name, schema_name, project_id=None):
        if self.test_if_table_exists(
                table_name=table_name,
                schema_name=schema_name,
                project_id=project_id,
        ):
            conn = self.dwh_hook.dbconn
            conn.delete_table(
                conn.get_table(
                    TableReference(
                        dataset_ref=FakeDatasetRef(
                            dataset_id=schema_name,
                            project_id=project_id or self.database_name,
                        ),
                        table_id=table_name,
                    )))
Example 12
    def __init__(self, table_name, dataset_name=None, project_name=None):
        self._project = project_name
        self._dataset = dataset_name
        self._table = table_name

        parts = table_name.replace(":", ".").split(".")
        if len(parts) == 3:  # project.dataset.table
            self._project = parts[0]
            self._dataset = parts[1]
            self._table = parts[2]
        elif len(parts) == 2:  # dataset.table
            self._dataset = parts[0]
            self._table = parts[1]

        self._dataset_ref = DatasetReference(dataset_id=self._dataset,
                                             project=self._project)
        self._table_ref = TableReference(dataset_ref=self._dataset_ref,
                                         table_id=self._table)
Example 13
    def __load_many(self, dt_ref, tables, gcs_base_dir, file_format, jc,
                    preview):
        """
        :param tables: table names to load
        :param gcs_base_dir: GCS base directory; each table loads from its own subdirectory
        """
        jobs = list()
        for tbl in tables:
            data_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
            table_ref = TableReference(dataset_ref=dt_ref.dataset_ref,
                                       table_id=tbl)
            print("--  {}{} <= {} ".format("preview: " if preview else "", tbl,
                                           data_uri))
            if preview:
                continue

            jobs.append(
                self.connect(dt_ref.project).load_table_from_uri(
                    data_uri, table_ref, job_config=jc))
        self.__check_jobs(jobs)
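
The inverse of the export in Example 2, done directly with the client: loading GCS shards into a TableReference. A minimal sketch assuming CSV input and placeholder names:

from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, LoadJobConfig, TableReference

client = bigquery.Client(project="my-project")
table_ref = TableReference(DatasetReference("my-project", "my_dataset"), "events")

jc = LoadJobConfig()
jc.source_format = bigquery.SourceFormat.CSV
jc.autodetect = True

# A wildcard URI loads a whole directory of shards in one job.
client.load_table_from_uri("gs://my-bucket/exports/events/*.csv",
                           table_ref, job_config=jc).result()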
Example 14
    def create_tables_from_dict(
            self,
            table_names_to_schemas,  # type: Dict[str, List[SchemaField]]
            dataset_id=None,  # type: Optional[str]
            replace_existing_tables=False,  # type: Optional[bool]
    ):
        # type: (...) -> None
        """Creates a set of tables from a dictionary of table names to their schemas.

        Args:
          table_names_to_schemas: A dictionary of:
            key: The table name.
            value: A list of SchemaField objects.
          dataset_id: The dataset in which to create tables. If not specified, use default dataset.
          replace_existing_tables: If True, delete and re-create tables. Otherwise, checks to see
              if any of the requested tables exist. If they do, it will raise a RuntimeError.

        Raises:
            RuntimeError if replace_existing_tables is False and any of the tables requested for
                creation already exist
        """
        dataset_id = dataset_id or self.default_dataset_id
        dataset_ref = DatasetReference(self.project_id, dataset_id)

        # If the flag isn't set to replace existing tables, raise an error if any tables we're
        # trying to create already exist.
        if not replace_existing_tables:
            self._raise_if_tables_exist(table_names_to_schemas.keys(),
                                        dataset_id)

        for name, schema in six.iteritems(table_names_to_schemas):
            table_ref = TableReference(dataset_ref, name)
            # Use the Table object so it retains its schema.
            table = bigquery.Table(table_ref, schema=schema)

            if self.table_exists(table) and replace_existing_tables:
                self.delete_table(table)
            self.create_table(table)
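
A stripped-down version of the same create path, using the client directly with a two-column placeholder schema:

from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, SchemaField, TableReference

client = bigquery.Client(project="my-project")
table_ref = TableReference(DatasetReference("my-project", "my_dataset"), "events")

# Build a Table object so it retains its schema, then create it.
table = bigquery.Table(table_ref, schema=[
    SchemaField("id", "STRING"),
    SchemaField("created_at", "TIMESTAMP"),
])
client.create_table(table)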
Example 15
def test_basic(url_with_everything):
    project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url_with_everything)

    assert project_id == 'some-project'
    assert location == 'some-location'
    assert dataset_id == 'some-dataset'
    assert arraysize == 1000
    assert credentials_path == '/some/path/to.json'
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize('param, value', [
    ('clustering_fields', ['a', 'b', 'c']),
    ('create_disposition', 'CREATE_IF_NEEDED'),
    ('destination', TableReference(DatasetReference('different-project', 'different-dataset'), 'table')),
    ('destination_encryption_configuration',
     lambda enc: enc.kms_key_name == EncryptionConfiguration('some-configuration').kms_key_name),
    ('dry_run', True),
    ('labels', {'a': 'b', 'c': 'd'}),
    ('maximum_bytes_billed', 1000),
    ('priority', 'INTERACTIVE'),
    ('schema_update_options', ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']),
    ('use_query_cache', True),
    ('write_disposition', 'WRITE_APPEND'),
])
def test_all_values(url_with_everything, param, value):
    job_config = parse_url(url_with_everything)[5]

    config_value = getattr(job_config, param)
    if callable(value):
        assert value(config_value)
    else:
        assert config_value == value
Example 16
        def final_func(schema_name, schema_suffix, dwh_conn_id):
            # final: move new data into the final dataset
            conn = EWAHBaseHook.get_hook_from_conn_id(dwh_conn_id).dbconn
            # get dataset objects
            try:  # create final dataset if not exists
                ds_final = conn.get_dataset(schema_name)
            except Exception:
                print("Creating dataset {0}".format(schema_name))
                ds_final = conn.create_dataset(schema_name)
            ds_temp = conn.get_dataset(schema_name + schema_suffix)

            # copy all tables from temp dataset to final dataset
            new_tables = conn.list_tables(ds_temp)
            new_table_ids = [
                table.table_id for table in conn.list_tables(ds_temp)
            ]
            old_table_ids = [
                table.table_id for table in conn.list_tables(ds_final)
            ]
            copy_jobs = []
            for table in new_tables:
                print("Copying table {0} from temp to final dataset".format(
                    table.table_id))
                try:
                    old_table = conn.get_table(table=TableReference(
                        dataset_ref=ds_final, table_id=table.table_id))
                    conn.delete_table(old_table)
                except Exception:
                    # ignore failure, fails if old table does not exist to begin with
                    pass
                finally:
                    final_table = ds_final.table(table.table_id)
                    copy_jobs.append(conn.copy_table(table, final_table))

            # delete tables that don't exist in temp dataset from final dataset
            for table_id in old_table_ids:
                if table_id not in new_table_ids:
                    print("Deleting table {0}".format(table_id))
                    conn.delete_table(
                        conn.get_table(
                            TableReference(dataset_ref=ds_final,
                                           table_id=table_id)))

            # make sure all copy jobs succeeded
            while copy_jobs:
                sleep(0.1)
                job = copy_jobs.pop(0)
                job.result()
                assert job.state in ("RUNNING", "DONE")
                if job.state == "RUNNING":
                    copy_jobs.append(job)
                else:
                    print("Successfully copied {0}".format(
                        job.__dict__["_properties"]["configuration"]["copy"]
                        ["destinationTable"]["tableId"]))

            # delete temp dataset
            print("Deleting temp dataset.")
            conn.delete_dataset(ds_temp,
                                delete_contents=True,
                                not_found_ok=False)

            print("Done.")
Example 17
    assert location == "some-location"
    assert dataset_id == "some-dataset"
    assert arraysize == 1000
    assert credentials_path == "/some/path/to.json"
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize(
    "param, value, default",
    [
        ("clustering_fields", ["a", "b", "c"], None),
        ("create_disposition", "CREATE_IF_NEEDED", None),
        (
            "destination",
            TableReference(
                DatasetReference("different-project", "different-dataset"), "table"
            ),
            None,
        ),
        (
            "destination_encryption_configuration",
            lambda enc: enc.kms_key_name
            == EncryptionConfiguration("some-configuration").kms_key_name,
            None,
        ),
        ("dry_run", True, None),
        ("labels", {"a": "b", "c": "d"}, {}),
        ("maximum_bytes_billed", 1000, None),
        ("priority", "INTERACTIVE", None),
        (
            "schema_update_options",
Example 18
    def commit(self):
        # The commit is where the upload is actually done for BigQuery (special case).
        # The _create_or_update_table method can be called multiple times;
        # each time, data is appended to the .avro file. When "committing",
        # this .avro file is uploaded and, depending on the load strategy, used.
        if not hasattr(self, "avro_file_name"):
            # There was no data ever uploaded
            # Do nothing
            self.log.info("Nothing to upload!")
            return

        # Clean up after yourself first
        self.avro_writer.close()

        # Fetch the relevant configuration
        project_id = self.table_creation_config.get("database_name",
                                                    self.database_name)
        assert project_id, "Missing Project ID!"
        load_strategy = self.table_creation_config["load_strategy"]
        primary_key = self.table_creation_config["primary_key"]
        schema_name = self.table_creation_config["schema_name"]
        schema_suffix = self.table_creation_config["schema_suffix"]
        table_name_final = self.table_creation_config["table_name"]
        table_suffix = "__ewah_tmp"

        columns_definition = self.table_creation_config["columns_definition"]
        new_schema_name = schema_name + schema_suffix

        is_full_refresh = (load_strategy == EC.LS_INSERT_REPLACE
                           or not self.test_if_table_exists(
                               table_name=table_name_final,
                               schema_name=new_schema_name,
                               project_id=project_id,
                           ))

        conn = self.dwh_hook.dbconn
        ds_new = conn.get_dataset(new_schema_name)

        # Create temp table with .avro file
        if is_full_refresh:
            # temp table is also the final table for full refresh!
            table_name = table_name_final
        else:
            table_name = table_name_final + table_suffix

        # Drop temp table if it already exists
        if self.test_if_table_exists(
                table_name=table_name,
                schema_name=new_schema_name,
                project_id=project_id,
        ):
            # Drop table before re-creating it
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))
        # Create temp table with .avro file
        table_obj = Table(".".join([project_id, new_schema_name, table_name]))
        if is_full_refresh and self.partition_field:
            table_obj.time_partitioning = bigquery.TimePartitioning(
                type_=self.partition_type,
                field=self.partition_field,
            )
            if self.require_partition_filter:
                table_obj.require_partition_filter = True
        self.log.info("Uploading data into table now...")
        with open(self.avro_file_name, "rb") as source_file:
            job = conn.load_table_from_file(
                file_obj=source_file,
                destination=table_obj,
                job_id_prefix="ewah_",
                rewind=True,
                job_config=LoadJobConfig(
                    autodetect=False,
                    source_format="AVRO",
                    schema=[
                        SchemaField(name=name, field_type=field["data_type"])
                        for name, field in columns_definition.items()
                    ],
                ),
            )
            try:
                job.result()
            except Exception:
                self.log.info("Errors occurred - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

        if not is_full_refresh:
            # Need to merge new rows into the existing table

            fields_pk = set(primary_key or [])
            fields_all = set(columns_definition.keys() or [])
            fields_non_pk = fields_all - fields_pk

            if load_strategy == EC.LS_UPSERT:
                assert fields_pk
            elif load_strategy == EC.LS_INSERT_ADD:
                fields_pk = []  # Ignore if set
            else:
                raise Exception("Not implemented!")

            merge_statement = """
                MERGE INTO `{target}` AS TARGET
                USING `{source}` AS SOURCE
                ON {condition}

                WHEN MATCHED THEN
                    UPDATE SET {update_fields}

                WHEN NOT MATCHED THEN
                    INSERT ({insert_fields})
                    VALUES ({insert_fields})
            """.format(
                target=".".join(
                    [project_id, new_schema_name, table_name_final]),
                source=".".join([project_id, new_schema_name, table_name]),
                condition=" AND ".join([
                    "TARGET.`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_pk
                ]) or "FALSE",
                insert_fields="`{0}`".format("`, `".join(fields_all)),
                update_fields=", ".join([
                    "`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_non_pk
                ]),
            )

            self.log.info(
                "Executing query:\n\n{0}\n\n".format(merge_statement))
            job = conn.query(
                query=merge_statement,
                job_id_prefix="ewah_",
            )
            try:
                job.result()
            except Exception:
                self.log.info("Errors occurred - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

            # Remove old temp table from dataset
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))

        self.log.info("Done!")
Example 19
class _Base(unittest.TestCase):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.table import TableReference

    ENDPOINT = "https://bigquery.googleapis.com"
    PROJECT = "project"
    SOURCE1 = "http://example.com/source1.csv"
    DS_ID = "dataset_id"
    DS_REF = DatasetReference(PROJECT, DS_ID)
    TABLE_ID = "table_id"
    TABLE_REF = TableReference(DS_REF, TABLE_ID)
    JOB_ID = "JOB_ID"
    JOB_TYPE = "unknown"
    KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1"

    def _make_one(self, *args, **kw):
        return self._get_target_class()(*args, **kw)

    def _setUpConstants(self):
        import datetime
        from google.cloud._helpers import UTC

        self.WHEN_TS = 1437767599.006
        self.WHEN = datetime.datetime.utcfromtimestamp(
            self.WHEN_TS).replace(tzinfo=UTC)
        self.ETAG = "ETAG"
        self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID)
        self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format(
            self.ENDPOINT, self.PROJECT, self.JOB_ID)
        self.USER_EMAIL = "*****@*****.**"

    def _table_ref(self, table_id):
        from google.cloud.bigquery.table import TableReference

        return TableReference(self.DS_REF, table_id)

    def _make_resource(self, started=False, ended=False, location="US"):
        self._setUpConstants()
        return _make_job_resource(
            creation_time_ms=int(self.WHEN_TS * 1000),
            started_time_ms=int(self.WHEN_TS * 1000),
            ended_time_ms=int(self.WHEN_TS * 1000) + 1000000,
            started=started,
            ended=ended,
            etag=self.ETAG,
            endpoint=self.ENDPOINT,
            job_type=self.JOB_TYPE,
            job_id=self.JOB_ID,
            project_id=self.PROJECT,
            user_email=self.USER_EMAIL,
            location=location,
        )

    def _verifyInitialReadonlyProperties(self, job):
        # root elements of resource
        self.assertIsNone(job.etag)
        self.assertIsNone(job.self_link)
        self.assertIsNone(job.user_email)

        # derived from resource['statistics']
        self.assertIsNone(job.created)
        self.assertIsNone(job.started)
        self.assertIsNone(job.ended)

        # derived from resource['status']
        self.assertIsNone(job.error_result)
        self.assertIsNone(job.errors)
        self.assertIsNone(job.state)

    def _verifyReadonlyResourceProperties(self, job, resource):
        from datetime import timedelta

        statistics = resource.get("statistics", {})

        if "creationTime" in statistics:
            self.assertEqual(job.created, self.WHEN)
        else:
            self.assertIsNone(job.created)

        if "startTime" in statistics:
            self.assertEqual(job.started, self.WHEN)
        else:
            self.assertIsNone(job.started)

        if "endTime" in statistics:
            self.assertEqual(job.ended, self.WHEN + timedelta(seconds=1000))
        else:
            self.assertIsNone(job.ended)

        if "etag" in resource:
            self.assertEqual(job.etag, self.ETAG)
        else:
            self.assertIsNone(job.etag)

        if "selfLink" in resource:
            self.assertEqual(job.self_link, self.RESOURCE_URL)
        else:
            self.assertIsNone(job.self_link)

        if "user_email" in resource:
            self.assertEqual(job.user_email, self.USER_EMAIL)
        else:
            self.assertIsNone(job.user_email)
Example 20
    def _table_ref(self, table_id):
        from google.cloud.bigquery.table import TableReference

        return TableReference(self.DS_REF, table_id)
Example 21
    project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(
        url_with_everything)

    assert project_id == 'some-project'
    assert location == 'some-location'
    assert dataset_id == 'some-dataset'
    assert arraysize == 1000
    assert credentials_path == '/some/path/to.json'
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize('param, value', [
    ('clustering_fields', ['a', 'b', 'c']),
    ('create_disposition', 'CREATE_IF_NEEDED'),
    ('destination',
     TableReference(DatasetReference('different-project', 'different-dataset'),
                    'table')),
    ('destination_encryption_configuration', lambda enc: enc.kms_key_name ==
     EncryptionConfiguration('some-configuration').kms_key_name),
    ('dry_run', True),
    ('labels', {
        'a': 'b',
        'c': 'd'
    }),
    ('maximum_bytes_billed', 1000),
    ('priority', 'INTERACTIVE'),
    ('schema_update_options',
     ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']),
    ('use_query_cache', True),
    ('write_disposition', 'WRITE_APPEND'),
])
def test_all_values(url_with_everything, param, value):
    job_config = parse_url(url_with_everything)[5]

    config_value = getattr(job_config, param)
    if callable(value):
        assert value(config_value)
    else:
        assert config_value == value


def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " +
                             str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query[
                "destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " +
                                 label_group)
            labels[key] = value

        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(
                query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " +
                query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " +
                             query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(
            query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError(
            "url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError(
            "url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(
                query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " +
                query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
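
Tying this back to TableReference: the destination query parameter is what ends up on job_config.destination. A hedged usage sketch, assuming the parse_url function above together with its imports and a SQLAlchemy-style URL (all names are placeholders):

from sqlalchemy.engine.url import make_url

url = make_url(
    "bigquery://some-project/some-dataset"
    "?destination=other-project.other-dataset.results"
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url)
# job_config.destination is a TableReference for other-project.other-dataset.results
print(job_config.destination)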