Example #1
from google.cloud import bigquery_datatransfer


def scheduled_query(transfer_client, dataset_id, table_id, query_string,
                    schedule, name):
    """Create a scheduled query that appends results to dataset_id.table_id."""

    # The project where the query job runs is the same as the project
    # containing the destination dataset.

    parent = transfer_client.common_project_path('zapp-case-study')

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name=name,
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": table_id,
            "write_disposition": "WRITE_APPEND",
            "partitioning_field": "",
        },
        schedule=schedule)

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
        ))

    print("Created scheduled query '{}'".format(transfer_config.name))
Example #2
    def scheduled_query(self):

        transfer_client = bigquery_datatransfer.DataTransferServiceClient()

        # The project where the query job runs is the same as the project
        # containing the destination dataset.
        project_id = "your-project-id"
        dataset_id = "your_dataset_id"

        # This service account will be used to execute the scheduled queries. Omit
        # this request parameter to run the query as the user with the credentials
        # associated with this client.
        service_account_name = "*****@*****.**"

        # Use standard SQL syntax for the query.
        query_string = """
        SELECT
        CURRENT_TIMESTAMP() as current_time,
        @run_time as intended_run_time,
        @run_date as intended_run_date,
        17 as some_integer
        """

        parent = transfer_client.common_project_path(project_id)

        transfer_config = bigquery_datatransfer.TransferConfig(
            destination_dataset_id=dataset_id,
            display_name="Your Scheduled Query Name",
            data_source_id="scheduled_query",
            params={
                "query": query_string,
                "destination_table_name_template": "your_table_{run_date}",
                "write_disposition": "WRITE_TRUNCATE",
                "partitioning_field": "",
            },
            schedule="every 24 hours",
        )

        transfer_config = transfer_client.create_transfer_config(
            bigquery_datatransfer.CreateTransferConfigRequest(
                parent=parent,
                transfer_config=transfer_config,
                service_account_name=service_account_name,
            ))

        print("Created scheduled query '{}'".format(transfer_config.name))
Example #3
def create_scheduled_query(override_values=None):
    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = "your-project-id"
    dataset_id = "your_dataset_id"

    # This service account will be used to execute the scheduled queries. Omit
    # this request parameter to run the query as the user with the credentials
    # associated with this client.
    service_account_name = "*****@*****.**"
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    if override_values is None:
        override_values = {}
    project_id = override_values.get("project_id", project_id)
    dataset_id = override_values.get("dataset_id", dataset_id)
    service_account_name = override_values.get(
        "service_account_name", service_account_name
    )
    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]

    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = transfer_client.common_project_path(project_id)

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
            service_account_name=service_account_name,
        ))

    print("Created scheduled query '{}'".format(transfer_config.name))
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
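
A minimal sketch of how a test harness might call this sample and clean up
afterwards; the override values and the cleanup step are illustrative, not
part of the original sample:

from google.cloud import bigquery_datatransfer

config = create_scheduled_query(
    override_values={
        "project_id": "test-project",
        "dataset_id": "test_dataset",
    }
)

# Delete the config afterwards so test runs do not accumulate.
transfer_client = bigquery_datatransfer.DataTransferServiceClient()
transfer_client.delete_transfer_config(name=config.name)
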
query_string = """
SELECT
  CURRENT_TIMESTAMP() as current_time,
  @run_time as intended_run_time,
  @run_date as intended_run_date,
  17 as some_integer
"""

parent = transfer_client.common_project_path(project_id)

transfer_config = bigquery_datatransfer.TransferConfig(
    destination_dataset_id=dataset_id,
    display_name="Your Scheduled Query Name",
    data_source_id="scheduled_query",
    params={
        "query": query_string,
        "destination_table_name_template": "mascots{20210115}",
        "write_disposition": "WRITE_TRUNCATE",
        "partitioning_field": "",
    },
    schedule="every 24 hours",
)

transfer_config = transfer_client.create_transfer_config(
    bigquery_datatransfer.CreateTransferConfigRequest(
        parent=parent,
        transfer_config=transfer_config,
        service_account_name=service_account_name,
    ))

print("Created scheduled query '{}'".format(transfer_config.name))
Example #5
    def execute(self, data):

        # setup
        import json  # used by the branches below that return JSON payloads
        client = self.client
        bigquery = self.bigquery
        bigquery_datatransfer = self.bigquery_datatransfer
        datetime = self.datetime
        pytz = self.pytz
        time = self.time
        name = data.get("titleName")
        emails = data.get("emails")
        query = data.get("query")
        table = ""
        #
        #

        # create a dataset first if needed
        dataset_main = self.make_dataset()
        table_id = "{}.{}".format(dataset_main, name)
        #

        # create a table if needed
        table = self.make_table(table_id)
        #
                

        # an interactive query
        if(self.env.get("interactive")):
            try:
                 
                """
                    query =
                    SELECT name, SUM(number) as total_people
                    FROM `bigquery-public-data.usa_names.usa_1910_2013`
                    WHERE state = 'TX'
                    GROUP BY name, state
                    ORDER BY total_people DESC
                    LIMIT 20
                """
                query_job = client.query(query)  # Make an API request.

                
                return json.dumps({
                    "schema":[{"field":x} for x in ["Name","Count"]],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            "Name":row[0],
                            "Count":row["total_people"] 
                        }
                        for row in query_job
                    ]
                })
                    
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # a batch query
        elif(self.env.get("batch")):
            try:
                job_config = bigquery.QueryJobConfig(
                    # Run at batch priority, which won't count toward concurrent rate limit.
                    priority=bigquery.QueryPriority.BATCH
                )

                """
                    query =
                    SELECT corpus
                    FROM `bigquery-public-data.samples.shakespeare`
                    GROUP BY corpus;
                """

                # Start the query, passing in the extra configuration.
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                query_job = client.get_job(
                    query_job.job_id, location=query_job.location
                )  # Make an API request.

                return "Job {} is currently in state {}".format(query_job.job_id, query_job.state)             
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 

        # a dry run
        elif (self.env.get("dry_run")):
            try:
                """ query
                    
                    SELECT name, COUNT(*) as name_count 
                    FROM `bigquery-public-data.usa_names.usa_1910_2013` 
                    WHERE state = 'WA' 
                    GROUP BY name
                """                
                job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)

                # Start the query, passing in the extra configuration.
                query_job = client.query(
                    query,
                    job_config=job_config,
                )  # Make an API request.

                # A dry run query completes immediately.
                return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)                
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # write to a permanent table
        elif self.env.get("permanent_table"):
            try:
                 
                """ query
                SELECT corpus
                FROM `bigquery-public-data.samples.shakespeare`
                GROUP BY corpus;
                """                
                job_config = bigquery.QueryJobConfig(destination=table_id)
                # Start the query, passing in the extra configuration.
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                # for queries that return massive results using legacy SQL
                # job_config = bigquery.QueryJobConfig(
                #     allow_large_results=True, destination=table_id, use_legacy_sql=True
                # )
                
                query_job.result()  # Wait for the job to complete.

                return "Query results loaded to the table {}".format(table_id)           
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # query caching
        elif (self.env.get("cache_enable/disable")):
            try:
                """ query

                    SELECT corpus
                    FROM `bigquery-public-data.samples.shakespeare`
                    GROUP BY corpus;
                """                
                job_config = bigquery.QueryJobConfig(
                    use_query_cache=False  # False disables the cache; True (the default) enables it.
                    # To require a cache hit instead, set create_disposition="CREATE_NEVER".
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.
                query_job.result()
                return "was this query cached? {}".format(query_job.cache_hit)
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # regular parameterized query
        elif self.env.get("parameterized"):
            try:
                """ query = 
                 
                     SELECT word, word_count
                     FROM `bigquery-public-data.samples.shakespeare`
                     WHERE corpus = @corpus
                     AND word_count >= @min_word_count
                     ORDER BY word_count DESC;
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("corpus", "STRING", "romeoandjuliet"),
                        bigquery.ScalarQueryParameter("min_word_count", "INT64", 250),
                    ]
                )
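                # The parameter names here must match the @corpus and
                # @min_word_count placeholders in the query text.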
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                return json.dumps({
                    "schema":[{"field":x} for x in ["word","word_count"]],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            "word":row["word"],
                            "word_count":row["word_count"] 
                        }
                        for row in query_job
                    ]
                })


            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # array parameterized
        elif(self.env.get("array_parameterized")):
            try:
                schema = ["name","count"]
                """query = 
                    
                    SELECT name, sum(number) as count
                    FROM `bigquery-public-data.usa_names.usa_1910_2013`
                    WHERE gender = @gender
                    AND state IN UNNEST(@states)
                    GROUP BY name
                    ORDER BY count DESC
                    LIMIT 10;
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("gender", "STRING", "M"),
                        bigquery.ArrayQueryParameter("states", "STRING", ["WA", "WI", "WV", "WY"]),
                    ]
                )
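                # ArrayQueryParameter binds the @states list consumed by
                # IN UNNEST(@states) in the query.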
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                return json.dumps({
                    "schema":[{"field":x} for x in schema],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]:row[schema[0]],
                            schema[1]:row[schema[1]] 
                        }
                        for row in query_job
                    ]
                })
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # timestamp parameters
        elif(self.env.get("timestamp_parameterized")):
            try:
                schema = ["date","f0_"]
                """query = 
                    
                    SELECT TIMESTAMP_ADD(@ts_value, INTERVAL 1 HOUR);
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter(
                            "ts_value",
                            "TIMESTAMP",
                            datetime.datetime(2016, 12, 7, 8, 0, tzinfo=pytz.UTC),
                        )
                    ]
                )
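                # Note: datetime.timezone.utc would work here as well;
                # pytz is not strictly required for a UTC timestamp.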
                query_job = client.query(query, job_config=job_config)  # Make an API request.    
                return json.dumps(
                    {
                        "schema":[{"field":x} for x in schema],
                        "data":[
                            # Row values can be accessed by field name or index.
                            {
                                "date":row[0].strftime("%m/%d/%Y, %H:%M:%S"),
                                schema[1]:row[schema[1]] 
                            }
                            for row in query_job
                        ]
                    },
                    default=str
                )
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # struct parameters
        elif(self.env.get("struct_parameterized")):
            
            try:
                schema = ["s"]
                """ query = 
                SELECT @struct_value AS s;
                """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.StructQueryParameter(
                            "struct_value",
                            bigquery.ScalarQueryParameter("x", "INT64", 1),
                            bigquery.ScalarQueryParameter("y", "STRING", "foo"),
                        )
                    ]
                )
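                # The STRUCT is returned as a dict-like value, e.g.
                # {'x': 1, 'y': 'foo'}, hence the str() conversion below.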
                query_job = client.query(query, job_config=job_config)  # Make an API request.
                return json.dumps({
                    "schema":[{"field":x} for x in schema],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]:str(row[schema[0]]),
                        }
                        for row in query_job
                    ]
                })           
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # wildcard
        elif self.env.get("mutliple_by_wildcard"):
            try:
                job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
                # Replace the asterisk with a digit, or change the table
                # suffixes, to see how the amount of data scanned changes.
                """ query
                
                SELECT
                max,
                ROUND((max-32)*5/9,1) celsius,
                mo,
                da,
                year
                FROM
                `bigquery-public-data.noaa_gsod.gsod194*` 
                WHERE
                max != 9999.9 # code for missing data
                AND ( _TABLE_SUFFIX = '0'
                    OR _TABLE_SUFFIX = '4' )
                ORDER BY
                max DESC 
                """
                # Start the query, passing in the extra configuration.
                query_job = client.query(
                    query,
                    job_config=job_config,
                )  # Make an API request.

                # A dry run query completes immediately.
                return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)                
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 
                
        # scheduled query
            # uses the BigQuery Data Transfer Service client
        elif self.env.get("scheduled"):
            try:
                # Use standard SQL syntax for the query.
                """ query = 
                
                SELECT
                CURRENT_TIMESTAMP() as current_time,
                @run_time as intended_run_time,
                @run_date as intended_run_date,
                17 as some_integer
                """

                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                
                project_id = client.project
                dataset_id = self.dataset_names[0]
                # This service account will be used to execute the scheduled queries. Omit
                # this request parameter to run the query as the user with the credentials
                # associated with this client. The account needs the appropriate
                # roles and permissions.
                # service_account_name = "*****@*****.**"


                parent = transfer_client.common_project_path(project_id)

                transfer_config = bigquery_datatransfer.TransferConfig(
                    destination_dataset_id=dataset_id,
                    display_name="Your Scheduled Query Name",
                    data_source_id="scheduled_query",
                    params={
                        "query": query,
                        "destination_table_name_template": "your_table_{run_date}",
                        "write_disposition": "WRITE_APPEND",
                        "partitioning_field": "",
                    },
                    schedule="every 24 hours",
                )
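                # {run_date} in the destination table template is substituted
                # with each run's date (YYYYMMDD) at execution time.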

                transfer_config = transfer_client.create_transfer_config(
                    bigquery_datatransfer.CreateTransferConfigRequest(
                        parent=parent,
                        transfer_config=transfer_config,
                        # service_account_name=service_account_name,
                    )
                )

                return "Created scheduled query '{}'".format(transfer_config.name)       
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #        

        # check schedule status
            # the configs appear under Scheduled Queries in the console
        elif self.env.get("scheduled_status"):
            try:
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                project_id = client.project
                parent = transfer_client.common_project_path(project_id)

                configs = transfer_client.list_transfer_configs(parent=parent)
                result = "Got the following configs:"
                for config in configs:
                    result += f"\n\tID: {config.name}, Schedule: {config.schedule}"      
                return result   
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # manual runs
            # used to immediately execute a query
        elif self.env.get("manual_run"):
            try:
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                # In this branch the "query" input carries the transfer config
                # resource name rather than SQL.
                transfer_config_name = query
                now = datetime.datetime.now(datetime.timezone.utc)
                start_time = now - datetime.timedelta(days=5)
                end_time = now - datetime.timedelta(days=2)

                # Some data sources, such as scheduled_query only support daily run.
                # Truncate start_time and end_time to midnight time (00:00AM UTC).
                start_time = datetime.datetime(
                    start_time.year, start_time.month, start_time.day, tzinfo=datetime.timezone.utc
                )
                end_time = datetime.datetime(
                    end_time.year, end_time.month, end_time.day, tzinfo=datetime.timezone.utc
                )

                response = transfer_client.schedule_transfer_runs(
                    parent=transfer_config_name,
                    start_time=start_time,
                    end_time=end_time,
                )
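                # For a daily scheduled query this typically creates one
                # backfill run per day inside [start_time, end_time].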

                result = "Started transfer runs:"
                for run in response.runs:
                    result += f"\n\tbackfill: {run.run_time} run: {run.name}"    
                return result        
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 

        # delete scheduled query
        elif(self.env.get("scheduled_delete")):
            try:
                import google.api_core.exceptions
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                # As above, the "query" input carries the transfer config name.
                transfer_config_name = query
                transfer_client.delete_transfer_config(name=transfer_config_name)
                return f"Deleted transfer config: {transfer_config_name}"
            except google.api_core.exceptions.NotFound:
                return "Transfer config not found."                            
            except Exception as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        return "Check the backend env dictionary you did set it so the backend didnt do anything"