def stop_scheduler(act_cloud_scheduler, doc_ref):
    """Delete every Cloud Scheduler job tracked under this document."""
    jobs = doc_ref.collection('jobs').stream()
    # TODO: remove reference to fixed project and location
    project = 'example5-237118'
    location = 'europe-west1'
    client = scheduler_v1.CloudSchedulerClient()
    for job in jobs:
        name = client.job_path(project, location, job.id)
        client.delete_job(name)
        job.reference.delete()

def start_scheduler(act_cloud_scheduler, doc_ref, trigger_resource):
    topic = act_cloud_scheduler.get('topic')
    print(topic)
    cron_config = act_cloud_scheduler.get('cron')
    print(cron_config)
    # TODO: remove reference to fixed project and location
    project = 'projects/example5-237118'
    client = scheduler_v1.CloudSchedulerClient()
    parent = client.location_path('example5-237118', 'europe-west1')
    complete_topic_name = project + '/topics/' + topic
    print(complete_topic_name)
    print('Function triggered by change to: %s' % trigger_resource)
    job = {
        "pubsub_target": {
            "topic_name": complete_topic_name,
            "data": bytes(trigger_resource, 'utf-8'),
        },
        "schedule": cron_config,  # e.g. "* * * * *"
    }
    print("job: " + str(job))
    # See https://googleapis.github.io/google-cloud-python/latest/scheduler/gapic/v1/api.html
    response = client.create_job(parent, job)
    # The job id is the last path segment of the returned resource name.
    job_name = response.name[response.name.rindex('/') + 1:]
    print(job_name)
    print(response)
    doc_ref.collection('jobs').document(job_name).set({'content': str(response)})

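# A minimal sketch of how the two handlers above might be wired into a
# Firestore-triggered Cloud Function. The entry point name, the collection
# layout, and the 'cloud_scheduler' field name are assumptions; only the use
# of context.resource follows the standard background-function contract.
from google.cloud import firestore
from google.cloud import scheduler_v1

db = firestore.Client()

def on_activity_write(data, context):
    # e.g. "projects/<project>/databases/(default)/documents/activities/<id>"
    trigger_resource = context.resource
    doc_ref = db.document(trigger_resource.split('/documents/')[1])
    act = doc_ref.get().to_dict() or {}
    act_cloud_scheduler = act.get('cloud_scheduler', {})  # assumed field name
    if act_cloud_scheduler.get('cron'):
        start_scheduler(act_cloud_scheduler, doc_ref, trigger_resource)
    else:
        stop_scheduler(act_cloud_scheduler, doc_ref)
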
def test_update_job(self):
    # Setup Expected Response
    name = "name3373707"
    description = "description-1724546052"
    schedule = "schedule-697920873"
    time_zone = "timeZone36848094"
    expected_response = {
        "name": name,
        "description": description,
        "schedule": schedule,
        "time_zone": time_zone,
    }
    expected_response = job_pb2.Job(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup Request
    job = {}
    update_mask = {}

    response = client.update_job(job, update_mask)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = cloudscheduler_pb2.UpdateJobRequest(
        job=job, update_mask=update_mask
    )
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request

def test_list_jobs(self):
    # Setup Expected Response
    next_page_token = ""
    jobs_element = {}
    jobs = [jobs_element]
    expected_response = {"next_page_token": next_page_token, "jobs": jobs}
    expected_response = cloudscheduler_pb2.ListJobsResponse(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup Request
    parent = client.location_path("[PROJECT]", "[LOCATION]")

    paged_list_response = client.list_jobs(parent)
    resources = list(paged_list_response)
    assert len(resources) == 1

    assert expected_response.jobs[0] == resources[0]
    assert len(channel.requests) == 1
    expected_request = cloudscheduler_pb2.ListJobsRequest(parent=parent)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request

def test_run_job(self):
    # Setup Expected Response
    name_2 = "name2-1052831874"
    description = "description-1724546052"
    schedule = "schedule-697920873"
    time_zone = "timeZone36848094"
    expected_response = {
        "name": name_2,
        "description": description,
        "schedule": schedule,
        "time_zone": time_zone,
    }
    expected_response = job_pb2.Job(**expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup Request
    name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

    response = client.run_job(name)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = cloudscheduler_pb2.RunJobRequest(name=name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request

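# The tests above depend on ChannelStub and CustomException, which are defined
# elsewhere in the test module. A minimal sketch of compatible test doubles,
# modeled on the stubs that ship with generated GAPIC test suites (the real
# ones cover more of the grpc.Channel surface):
class CustomException(Exception):
    pass

class MultiCallableStub(object):
    """Stub for a unary-unary multicallable: records the request, replays a response."""

    def __init__(self, method, channel_stub):
        self.method = method
        self.channel_stub = channel_stub

    def __call__(self, request, timeout=None, metadata=None, credentials=None):
        self.channel_stub.requests.append((self.method, request))
        response = self.channel_stub.responses.pop(0) if self.channel_stub.responses else None
        if isinstance(response, Exception):
            raise response
        return response

class ChannelStub(object):
    """Fake gRPC channel that hands out MultiCallableStub instances."""

    def __init__(self, responses=None):
        self.responses = list(responses or [])
        self.requests = []

    def unary_unary(self, method, request_serializer=None, response_deserializer=None):
        return MultiCallableStub(method, self)
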
def sync(event, context):
    """Sync projects with cloud datastore."""
    del event, context  # Unused.
    with ndb.Client().context():
        github_client = Github(get_access_token())
        repo = github_client.get_repo('google/oss-fuzz')
        projects = get_projects(repo)
        cloud_scheduler_client = scheduler_v1.CloudSchedulerClient()
        sync_projects(cloud_scheduler_client, projects)

def sync(event, context):
    """Sync projects with cloud datastore."""
    del event, context  # Unused.
    with ndb.Client().context():
        git_creds = get_github_creds()
        github_client = Github(git_creds.client_id, git_creds.client_secret)
        repo = github_client.get_repo('google/oss-fuzz')
        projects = get_projects(repo)
        cloud_scheduler_client = scheduler_v1.CloudSchedulerClient()
        sync_projects(cloud_scheduler_client, projects)

def test_list_jobs_exception(self):
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup request
    parent = client.location_path("[PROJECT]", "[LOCATION]")

    paged_list_response = client.list_jobs(parent)
    with pytest.raises(CustomException):
        list(paged_list_response)

def test_run_job_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup request
    name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

    with pytest.raises(CustomException):
        client.run_job(name)

def test_update_job_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup request
    job = {}
    update_mask = {}

    with pytest.raises(CustomException):
        client.update_job(job, update_mask)

def test_delete_job(self):
    channel = ChannelStub()
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = scheduler_v1.CloudSchedulerClient()

    # Setup Request
    name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

    client.delete_job(name)

    assert len(channel.requests) == 1
    expected_request = cloudscheduler_pb2.DeleteJobRequest(name=name)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request

def schedule_all_jobs(project_id: str, location_id: str, time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes data into the prod bucket runs every 4 hours
    _schedule_job(
        path="/publish",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    _schedule_job(
        path="/convert_json_1",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # The convert to JSON task is split in two because otherwise it takes too long
    _schedule_job(
        path="/convert_json_2",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    job_urls_seen = set()
    for data_pipeline in get_pipelines():

        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = data_source.config.get("automation", {}).get("schedule", "0 * * * *")

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = data_source.config.get("automation", {}).get("job_group", idx)
            job_url = f"/update_table?table={data_pipeline.table}&job_group={job_group}"
            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)

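# schedule_job and clear_jobs are helpers defined elsewhere in this module
# (both versions of schedule_all_jobs rely on them). A minimal sketch of what
# they might look like against the v1 client surface used above; the App
# Engine HTTP target is an assumption:
def clear_jobs(client, project_id: str, location_id: str) -> None:
    """Delete every job under the given project/location."""
    parent = f"projects/{project_id}/locations/{location_id}"
    for job in client.list_jobs(parent):
        client.delete_job(job.name)

def schedule_job(client, project_id: str, location_id: str, time_zone: str,
                 schedule: str, path: str) -> None:
    """Create a job that hits `path` on the default App Engine service on `schedule`."""
    parent = f"projects/{project_id}/locations/{location_id}"
    job = {
        "app_engine_http_target": {"relative_uri": path},
        "schedule": schedule,
        "time_zone": time_zone,
    }
    client.create_job(parent, job)
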
def schedule_all_jobs(project_id: str, location_id: str, time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Read the list of all known locations, since we will be splitting some jobs based on that
    location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes combined tables into the prod bucket runs every 2 hours
    _schedule_job(
        path="/publish_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # The job that publishes aggregate outputs runs every 4 hours
    _schedule_job(
        # Run in a separate, preemptible instance
        path="/deferred/publish_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/4 * * *",
    )

    # The job that publishes breakdown outputs runs every 4 hours
    _schedule_job(
        path="/deferred/publish_subset_tables",
        # Offset by 90 minutes to run after publishing
        schedule="30 1-23/4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 120 minutes to run after subset tables are published
            schedule="0 2-23/4 * * *",
        )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    job_urls_seen = set()
    for data_pipeline in get_pipelines():

        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # If the job is deferred, then prepend the token to the path
            job_prefix = "/deferred" if automation_opts.get("deferred") else ""

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = automation_opts.get("job_group", idx)
            job_url = f"{job_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"
            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)

    # V3 publish jobs start here

    # Publish the tables with all location keys every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_global_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # Publish the main aggregated table every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/2 * * *",
    )

    # Break down the outputs by location key every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            # Offset by 60 minutes to let other hourly tasks finish
            schedule="0 1-23/2 * * *",
        )

    # Publish outputs in JSON format every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 90 minutes to let other hourly tasks finish
            schedule="30 1-23/2 * * *",
        )

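# _split_into_subsets is not shown; it presumably partitions the sorted
# location keys into `bin_count` contiguous chunks so each publish job covers
# a contiguous key range. A minimal sketch under that assumption:
import math
from typing import Iterable, List

def _split_into_subsets(keys: Iterable[str], bin_count: int) -> List[List[str]]:
    keys = sorted(keys)
    bin_size = max(1, math.ceil(len(keys) / bin_count))
    return [keys[i:i + bin_size] for i in range(0, len(keys), bin_size)]
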
'''
    Scheduler to run crawler trigger
    Every day at noon
'''
import os
import json

from google.api_core.exceptions import NotFound
from google.cloud import scheduler_v1

# Create non-empty data to be passed to pub/sub msg
data = {}
data = json.dumps(data).encode('utf-8')

# Instantiate scheduler client
client = scheduler_v1.CloudSchedulerClient()

# Get env vars and create job/topic strings
project_name = os.getenv('GOOGLE_CLOUD_PROJECT')
appengine_location = os.getenv('APPENG_LOCATION')
job_id = 'pronova_cntx_cronjob'
topic_id = os.getenv('TOPIC_NAME_CNTX')
job_name = f'projects/{project_name}/locations/{appengine_location}/jobs/{job_id}'
topic_name = f'projects/{project_name}/topics/{topic_id}'
parent = client.location_path(project_name, appengine_location)

job = {
    'name': job_name,
    'pubsub_target': {
        'topic_name': topic_name,
        'data': data,
    },
    # The original fragment is truncated at this point; the schedule below is
    # an assumption taken from the module docstring ("Every day at noon").
    'schedule': '0 12 * * *',
}

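# A plausible continuation of the fragment above, assuming the NotFound import
# exists to support an update-or-create flow; this is a sketch, not the
# original code:
try:
    # Refresh the job if it already exists
    client.update_job(job)
except NotFound:
    # Otherwise create it from scratch
    client.create_job(parent, job)
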
def test_create_job(self):
    client = scheduler_v1.CloudSchedulerClient()
    project_id = os.environ.get("PROJECT_ID")
    location_id = "us-central1"
    parent = f"projects/{project_id}/locations/{location_id}"
    client.list_jobs(parent=parent)

def test_create_job(self):
    client = scheduler_v1.CloudSchedulerClient()
    parent = client.location_path(os.environ.get("PROJECT_ID"), "us-central1")
    client.list_jobs(parent)

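# The two variants above span client generations: location_path only exists on
# the pre-2.0 (GAPIC v1) client, while building the parent string by hand works
# on both. A sketch of the same smoke test against the 2.x surface, where
# requests may also be passed as a dict; PROJECT_ID is carried over from the
# tests above:
import os
from google.cloud import scheduler_v1

def test_list_jobs_v2():
    client = scheduler_v1.CloudSchedulerClient()
    parent = f"projects/{os.environ['PROJECT_ID']}/locations/us-central1"
    jobs = list(client.list_jobs(request={"parent": parent}))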