Example 1
    def run(
        self,
        uri: str = None,
        dataset_id: str = None,
        table: str = None,
        project: str = None,
        schema: List[bigquery.SchemaField] = None,
        location: str = "US",
        credentials: dict = None,
        **kwargs,
    ):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after
        initialization.

        Args:
            - uri (str, optional): GCS path to load data from
            - dataset_id (str, optional): the id of a destination dataset to write the
                records to; if not provided here, will default to the one provided at initialization
            - table (str, optional): the name of a destination table to write the
                records to; if not provided here, will default to the one provided at initialization
            - project (str, optional): the project to initialize the BigQuery Client with; if
                not provided, will default to the one inferred from your credentials
            - schema (List[bigquery.SchemaField], optional): the schema to use when creating
                the table
            - location (str, optional): location of the dataset that will be written to;
                defaults to "US"
            - credentials (dict, optional): a JSON document containing Google Cloud
                credentials.  You should provide these at runtime with an upstream Secret task.
                If not provided, Prefect will first check `context` for `GCP_CREDENTIALS` and
                lastly will use default Google client logic.
            - **kwargs (optional): additional kwargs to pass to the `bigquery.LoadJobConfig`;
                see the documentation here:
                https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

        Raises:
            - ValueError: if any required argument hasn't been provided
            - ValueError: if the load job results in an error

        Returns:
            - the response from `load_table_from_uri`
        """
        # check for any argument inconsistencies
        if dataset_id is None or table is None:
            raise ValueError("Both dataset_id and table must be provided.")

        # create client
        client = get_bigquery_client(project=project, credentials=credentials)

        # get table reference
        table_ref = client.dataset(dataset_id).table(table)

        # load data
        autodetect = kwargs.pop("autodetect", True)
        job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
        if schema:
            job_config.schema = schema
        load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
        load_job.result()  # block until job is finished
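
A usage sketch for the load-from-GCS run method above. The task class name and import path (Prefect's BigQueryLoadGoogleCloudStorage, inferred from the signature), the flow wiring, and the secret name are assumptions for illustration:

# Hypothetical usage; bucket, dataset, table, and secret name are placeholders.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadGoogleCloudStorage  # assumed import path

load_from_gcs = BigQueryLoadGoogleCloudStorage(dataset_id="analytics", table="events")

with Flow("load-gcs-to-bigquery") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    load_from_gcs(uri="gs://my-bucket/exports/events-*.json", credentials=creds)

flow.run()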
Example 2
    def run(
        self,
        project: str = None,
        credentials: dict = None,
        dataset: str = None,
        table: str = None,
        schema: List[bigquery.SchemaField] = None,
    ):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after
        initialization.

        Args:
            - project (str, optional): the project to initialize the BigQuery Client with; if
                not provided, will default to the one inferred from your credentials
            - credentials (dict, optional): a JSON document containing Google Cloud
                credentials.  You should provide these at runtime with an upstream Secret task.
                If not provided, Prefect will first check `context` for `GCP_CREDENTIALS` and
                lastly will use default Google client logic.
            - dataset (str, optional): the name of the dataset in which the table will be created
            - table (str, optional): the name of a table to create
            - schema (List[bigquery.SchemaField], optional): the schema to use when creating
                the table

        Returns:
            - None

        Raises:
            - SUCCESS: a `SUCCESS` signal if the table already exists
        """
        client = get_bigquery_client(project=project, credentials=credentials)

        try:
            dataset_ref = client.get_dataset(dataset)
        except NotFound:
            self.logger.debug(
                "Dataset {} not found, creating...".format(dataset))
            dataset_ref = client.create_dataset(dataset)

        table_ref = dataset_ref.table(table)
        try:
            client.get_table(table_ref)
            raise SUCCESS("{dataset}.{table} already exists.".format(
                dataset=dataset, table=table))
        except NotFound:
            self.logger.debug("Table {} not found, creating...".format(table))
            table = bigquery.Table(table_ref, schema=schema)

            # partitioning
            if self.time_partitioning:
                table.time_partitioning = self.time_partitioning

            # cluster for optimal data sorting/access
            if self.clustering_fields:
                table.clustering_fields = self.clustering_fields
            client.create_table(table)
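
A usage sketch, assuming the run method above belongs to Prefect's CreateBigQueryTable task (inferred from the signature) and that time_partitioning and clustering_fields are accepted at initialization, as the references to self.time_partitioning and self.clustering_fields suggest:

# Hypothetical usage; class name, constructor arguments, and field names are assumptions.
from google.cloud import bigquery
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import CreateBigQueryTable  # assumed import path

schema = [
    bigquery.SchemaField("user_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("event_ts", "TIMESTAMP", mode="REQUIRED"),
]

create_table = CreateBigQueryTable(
    time_partitioning=bigquery.TimePartitioning(field="event_ts"),
    clustering_fields=["user_id"],
)

with Flow("bq-create-table") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    create_table(dataset="analytics", table="events", schema=schema, credentials=creds)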
Example 3
def get_client(project: str,
               credentials: dict,
               credentials_secret: str = None) -> bigquery.Client:
    creds = None
    if credentials_secret is not None:
        warnings.warn(
            "The `credentials_secret` argument is deprecated. Use a `Secret` task "
            "to pass the credentials value at runtime instead.",
            UserWarning,
        )
        # TODO: make this optional so that the machine can authenticate externally to Prefect
        creds = Secret(credentials_secret).get()
    return get_bigquery_client(credentials=creds, project=project)
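
Because credentials_secret is deprecated, the non-deprecated path is to resolve the credentials yourself and pass them through the credentials argument; a minimal sketch, where the secret name and project are placeholders:

# Preferred call path; secret name and project are placeholders.
from prefect.client import Secret

creds = Secret("GCP_CREDENTIALS").get()
client = get_client(project="my-project", credentials=creds)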
Example 4
def extract_ga_table(project: str, gcp_credentials: dict, dataset: str, date: str, output_root: str):
    """
    Runs a BigQuery extract job, extracting the Google Analytics `ga_sessions` table for a
    given date to a GCS location as gzip-compressed newline-delimited JSON.
    """
    table_name = "ga_sessions_{}".format(date)
    dest_filename = "{}_*.json.gz".format(table_name)
    base_extraction_path = os.path.join(output_root, dataset, date)
    destination_uri = os.path.join(base_extraction_path, dest_filename)

    client = get_bigquery_client(credentials=gcp_credentials, project=project)

    dataset = client.dataset(dataset, project=project)
    table = dataset.table(table_name)
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
    job_config.compression = "GZIP"
    extract_job = client.extract_table(table, destination_uri, job_config=job_config)
    extract_job.result()

    return base_extraction_path
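
A call sketch for a single day of sessions; the project, dataset, bucket, and secret name are placeholders:

# Placeholder values; gcp_credentials would normally come from a Secret.
from prefect.client import Secret

creds = Secret("GCP_CREDENTIALS").get()
extraction_path = extract_ga_table(
    project="my-project",
    gcp_credentials=creds,
    dataset="my_ga_dataset",
    date="20240101",
    output_root="gs://my-bucket/ga-exports",
)
# extraction_path -> "gs://my-bucket/ga-exports/my_ga_dataset/20240101"; the "*" in the
# destination filename lets BigQuery shard the export across multiple gzipped files.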
Example 5
    def run(
        self,
        query: str = None,
        query_params: List[tuple] = None,
        project: str = None,
        location: str = "US",
        dry_run_max_bytes: int = None,
        credentials: dict = None,
        dataset_dest: str = None,
        table_dest: str = None,
        job_config: dict = None,
    ):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after initialization.

        Args:
            - query (str, optional): a string of the query to execute
            - query_params (list[tuple], optional): a list of 3-tuples specifying
                BigQuery query parameters; currently only scalar query parameters are supported. See
                [the Google documentation](https://cloud.google.com/bigquery/docs/parameterized-queries#bigquery-query-params-python)
                for more details on how both the query and the query parameters should be formatted
            - project (str, optional): the project to initialize the BigQuery Client with; if not provided,
                will default to the one inferred from your credentials
            - location (str, optional): location of the dataset that will be queried; defaults to "US"
            - dry_run_max_bytes (int, optional): if provided, the maximum number of bytes the query is allowed
                to process; this will be determined by executing a dry run and raising a `ValueError` if the
                maximum is exceeded
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not provided, Prefect will
                first check `context` for `GCP_CREDENTIALS` and lastly will use default Google client logic.
            - dataset_dest (str, optional): the optional name of a destination dataset to write the
                query results to, if you don't want them returned; if provided, `table_dest` must also be
                provided
            - table_dest (str, optional): the optional name of a destination table to write the
                query results to, if you don't want them returned; if provided, `dataset_dest` must also be
                provided
            - job_config (dict, optional): an optional dictionary of job configuration parameters; note that
                the parameters provided here must be pickleable (e.g., dataset references will be rejected)

        Raises:
            - ValueError: if the `query` is `None`
            - ValueError: if only one of `dataset_dest` / `table_dest` is provided
            - ValueError: if the query would exceed `dry_run_max_bytes`

        Returns:
            - list: a fully populated list of Query results, with one item per row
        """
        ## check for any argument inconsistencies
        if query is None:
            raise ValueError("No query provided.")
        if sum([dataset_dest is None, table_dest is None]) == 1:
            raise ValueError(
                "Both `dataset_dest` and `table_dest` must be provided if writing to a destination table."
            )

        ## create client
        client = get_bigquery_client(project=project, credentials=credentials)

        ## setup jobconfig
        job_config = bigquery.QueryJobConfig(**(job_config or {}))  # tolerate job_config=None
        if query_params is not None:
            hydrated_params = [
                bigquery.ScalarQueryParameter(*qp) for qp in query_params
            ]
            job_config.query_parameters = hydrated_params

        ## perform dry_run if requested
        if dry_run_max_bytes is not None:
            old_info = dict(dry_run=job_config.dry_run,
                            use_query_cache=job_config.use_query_cache)
            job_config.dry_run = True
            job_config.use_query_cache = False
            self.logger.debug("Performing a dry run...")
            query_job = client.query(query,
                                     location=location,
                                     job_config=job_config)
            if query_job.total_bytes_processed > dry_run_max_bytes:
                raise ValueError(
                    "Query will process {0} bytes which is above the set maximum of {1} for this task."
                    .format(query_job.total_bytes_processed,
                            dry_run_max_bytes))
            job_config.dry_run = old_info["dry_run"]
            job_config.use_query_cache = old_info["use_query_cache"]

        ## if writing to a destination table
        if dataset_dest is not None:
            table_ref = client.dataset(dataset_dest).table(table_dest)
            job_config.destination = table_ref

        query_job = client.query(query,
                                 location=location,
                                 job_config=job_config)
        return list(query_job.result())
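
A usage sketch with scalar query parameters and a dry-run cap; the class name and import path are assumed (Prefect's BigQueryTask, inferred from the signature), and the public dataset is only an example:

# Hypothetical usage; query, parameters, and byte cap are illustrative.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryTask  # assumed import path

bq_query = BigQueryTask()

with Flow("bq-parameterized-query") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    rows = bq_query(
        query="""
            SELECT name, SUM(number) AS total
            FROM `bigquery-public-data.usa_names.usa_1910_2013`
            WHERE state = @state
            GROUP BY name
            ORDER BY total DESC
            LIMIT 10
        """,
        query_params=[("state", "STRING", "TX")],  # (name, type, value) 3-tuples
        dry_run_max_bytes=10 * 1024**3,            # refuse queries scanning more than 10 GiB
        credentials=creds,
    )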
Example 6
    def run(self,
            records: List[dict],
            dataset_id: str = None,
            table: str = None,
            project: str = None,
            location: str = "US",
            credentials: dict = None,
            **kwargs):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after initialization.

        Args:
            - records (list[dict]): the list of records to insert as rows into
                the BigQuery table; each item in the list should be a dictionary whose keys correspond
                to columns in the table
            - dataset_id (str, optional): the id of a destination dataset to write the
                records to; if not provided here, will default to the one provided at initialization
            - table (str, optional): the name of a destination table to write the
                records to; if not provided here, will default to the one provided at initialization
            - project (str, optional): the project to initialize the BigQuery Client with; if not provided,
                will default to the one inferred from your credentials
            - location (str, optional): location of the dataset that will be written to; defaults to "US"
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not provided, Prefect will
                first check `context` for `GCP_CREDENTIALS` and lastly will use default Google client logic.
            - **kwargs (optional): additional kwargs to pass to the
                `insert_rows_json` method; see the documentation here:
                https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

        Raises:
            - ValueError: if any required argument hasn't been provided
            - ValueError: if any of the records result in errors

        Returns:
            - the response from `insert_rows_json`
        """
        ## check for any argument inconsistencies
        if dataset_id is None or table is None:
            raise ValueError("Both dataset_id and table must be provided.")

        ## create client
        client = get_bigquery_client(project=project, credentials=credentials)

        ## get table reference
        table_ref = client.dataset(dataset_id).table(table)

        ## stream data in
        response = client.insert_rows_json(table=table_ref,
                                           json_rows=records,
                                           **kwargs)

        errors = []
        output = []
        for row in response:
            output.append(row)
            if "errors" in row:
                errors.append(row["errors"])

        if errors:
            raise ValueError(errors)

        return output
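
A streaming-insert usage sketch; the class name and import path are assumed (Prefect's BigQueryStreamingInsert, inferred from the signature), and the dataset, table, and record shapes are illustrative:

# Hypothetical usage; dataset, table, and secret name are placeholders.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryStreamingInsert  # assumed import path

insert_rows = BigQueryStreamingInsert(dataset_id="analytics", table="events")

records = [
    {"user_id": "u-1", "event": "signup"},
    {"user_id": "u-2", "event": "login"},
]

with Flow("bq-streaming-insert") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    insert_rows(records=records, credentials=creds)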
Example 7
    def run(
        self,
        file: Union[str, Path] = None,
        rewind: bool = False,
        size: int = None,
        num_retries: int = 6,
        dataset_id: str = None,
        table: str = None,
        project: str = None,
        schema: List[bigquery.SchemaField] = None,
        location: str = "US",
        credentials: dict = None,
        **kwargs,
    ):
        """
        Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
        initialization.

        Args:
            - file (Union[str, path-like object], optional): a string or path-like object of
                the file to be loaded
            - rewind (bool, optional): if True, seek to the beginning of the file handle before
                reading the file
            - size (int, optional):  the number of bytes to read from the file handle. If size
                is None or large, resumable upload will be used. Otherwise, multipart upload
                will be used.
            - dataset_id (str, optional): the id of a destination dataset to write the records
                to; if not provided here, will default to the one provided at initialization
            - table (str, optional): the name of a destination table to write the records to;
                if not provided here, will default to the one provided at initialization
            - project (str, optional): the project to initialize the BigQuery Client with; if
                not provided, will default to the one inferred from your credentials
            - schema (List[bigquery.SchemaField], optional): the schema to use when creating
                the table
            - location (str, optional): location of the dataset that will be written to;
                defaults to "US"
            - credentials (dict, optional): a JSON document containing Google Cloud
                credentials.  You should provide these at runtime with an upstream Secret task.
            - **kwargs (optional): additional kwargs to pass to the `bigquery.LoadJobConfig`;
                see the documentation here:
                https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

        Raises:
            - ValueError: if any required argument hasn't been provided or the file does not exist
            - IOError: if file can't be opened and read
            - ValueError: if the load job results in an error

        Returns:
            - the response from `load_table_from_file`
        """
        # check for any argument inconsistencies
        if dataset_id is None or table is None:
            raise ValueError("Both dataset_id and table must be provided.")
        try:
            path = Path(file)
        except Exception:
            raise ValueError("A string or path-like object must be provided.")
        if not path.is_file():
            raise ValueError(f"File {path.as_posix()} does not exist.")

        # create client
        client = get_bigquery_client(
            project=project,
            credentials=credentials,
        )

        # get table reference
        table_ref = client.dataset(dataset_id).table(table)

        # configure job
        autodetect = kwargs.pop("autodetect", True)
        job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
        if schema:
            job_config.schema = schema

        # load data
        try:
            with open(file, "rb") as file_obj:
                load_job = client.load_table_from_file(
                    file_obj,
                    table_ref,
                    rewind,
                    size,
                    num_retries,
                    job_config=job_config,
                )
        except IOError:
            raise IOError(f"Can't open and read from {path.as_posix()}.")

        load_job.result()  # block until job is finished
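
A usage sketch loading a local newline-delimited JSON file; the class name and import path are assumed (Prefect's BigQueryLoadFile, inferred from the signature), and extra kwargs such as source_format are forwarded to bigquery.LoadJobConfig by the run method above:

# Hypothetical usage; file path, dataset, and table are placeholders.
from google.cloud import bigquery
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadFile  # assumed import path

load_file = BigQueryLoadFile(dataset_id="analytics", table="events")

with Flow("bq-load-local-file") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    load_file(
        file="data/events.jsonl",
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        credentials=creds,
    )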
Example 8
        try:
            path = Path(file)
        except Exception as value_error:
            raise ValueError(
                "A string or path-like object must be provided."
            ) from value_error
        if not path.is_file():
            raise ValueError(f"File {path.as_posix()} does not exist.")

        # create client
        client = get_bigquery_client(project=project, credentials=credentials)

        # get table reference
        table_ref = client.dataset(dataset_id).table(table)

        # configure job
        autodetect = kwargs.pop("autodetect", True)
        job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
        if schema:
            job_config.schema = schema

        # load data
        try:
            with open(file, "rb") as file_obj:
                load_job = client.load_table_from_file(
                    file_obj,