Example no. 1
def stop_scheduler(act_cloud_scheduler, doc_ref):
    # TODO: remove reference to fixed project and location
    project = 'example5-237118'
    location = 'europe-west1'
    # Create the client once and reuse it for every job.
    client = scheduler_v1.CloudSchedulerClient()
    jobs = doc_ref.collection('jobs').stream()
    for job in jobs:
        name = client.job_path(project, location, job.id)
        client.delete_job(name)
        job.reference.delete()
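
The delete loop above targets the 1.x client, where methods take positional arguments. On the newer 2.x releases of google-cloud-scheduler the methods are keyword-based; a minimal sketch of the same cleanup under that assumption (the Firestore layout is the one from Example no. 1):

from google.cloud import scheduler_v1


def stop_scheduler_v2(doc_ref, project, location):
    """Delete every Cloud Scheduler job tracked in the 'jobs' subcollection (sketch for 2.x)."""
    client = scheduler_v1.CloudSchedulerClient()
    for job in doc_ref.collection('jobs').stream():
        # The Firestore document id is assumed to be the Cloud Scheduler job id.
        name = client.job_path(project, location, job.id)
        client.delete_job(name=name)  # keyword argument on google-cloud-scheduler >= 2.0
        job.reference.delete()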
Example no. 2
def start_scheduler(act_cloud_scheduler, doc_ref, trigger_resource):
    topic = act_cloud_scheduler.get('topic')
    print(topic)
    cron_config = act_cloud_scheduler.get('cron')
    print(cron_config)
    # TODO: remove reference to fixed project and location
    project = 'projects/example5-237118'
    client = scheduler_v1.CloudSchedulerClient()
    parent = client.location_path('example5-237118', 'europe-west1')
    complete_topic_name = project + '/topics/' + topic
    print(complete_topic_name)
    print('Function triggered by change to: %s' % trigger_resource)
    job = {
        "pubsub_target": {
            "topic_name": complete_topic_name,
            "data": bytes(trigger_resource, 'utf-8')
        },
        "schedule": cron_config  # e.g. "* * * * *"
    }
    print("job: " + str(job))
    # See https://googleapis.github.io/google-cloud-python/latest/scheduler/gapic/v1/api.html
    response = client.create_job(parent, job)
    # response.name is the full resource path; keep only the trailing job id.
    job_name = response.name[response.name.rindex('/') + 1:]
    print(job_name)
    print(response)
    doc_ref.collection('jobs').document(job_name).set(
        {'content': str(response)})
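
The TODO above flags the hardcoded project and location. A minimal sketch of the same Pub/Sub-target job creation driven by environment variables instead; GOOGLE_CLOUD_PROJECT and SCHEDULER_LOCATION are assumed variable names, not part of the original snippet:

import os

from google.cloud import scheduler_v1


def start_scheduler_from_env(act_cloud_scheduler, doc_ref, trigger_resource):
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    location_id = os.environ.get('SCHEDULER_LOCATION', 'europe-west1')
    client = scheduler_v1.CloudSchedulerClient()
    parent = client.location_path(project_id, location_id)
    topic_name = 'projects/{}/topics/{}'.format(project_id, act_cloud_scheduler.get('topic'))
    job = {
        'pubsub_target': {
            'topic_name': topic_name,
            'data': trigger_resource.encode('utf-8'),
        },
        'schedule': act_cloud_scheduler.get('cron'),
    }
    response = client.create_job(parent, job)
    job_id = response.name.rsplit('/', 1)[-1]  # keep only the job id segment
    doc_ref.collection('jobs').document(job_id).set({'content': str(response)})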
Example no. 3
    def test_update_job(self):
        # Setup Expected Response
        name = "name3373707"
        description = "description-1724546052"
        schedule = "schedule-697920873"
        time_zone = "timeZone36848094"
        expected_response = {
            "name": name,
            "description": description,
            "schedule": schedule,
            "time_zone": time_zone,
        }
        expected_response = job_pb2.Job(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup Request
        job = {}
        update_mask = {}

        response = client.update_job(job, update_mask)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = cloudscheduler_pb2.UpdateJobRequest(
            job=job, update_mask=update_mask
        )
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
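
Examples no. 3, 4, 5, 8, 9, 10 and 11 all follow the same pattern: google.api_core.grpc_helpers.create_channel is patched so the client talks to a stub channel that replays canned responses and records every outgoing request, which is what the assertions on channel.requests compare against. ChannelStub and CustomException come from the library's own test helpers and are not reproduced here; a rough, hypothetical sketch of the stub behavior those tests rely on:

class ChannelStub(object):
    """Hypothetical stand-in for a gRPC channel: replays canned responses and records requests."""

    def __init__(self, responses=()):
        self.responses = list(responses)
        self.requests = []  # filled with (method_name, request) tuples

    def unary_unary(self, method, request_serializer=None, response_deserializer=None):
        def callable_stub(request, **kwargs):
            self.requests.append((method, request))
            response = self.responses.pop(0)
            if isinstance(response, Exception):
                raise response
            return response

        return callable_stub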
Example no. 4
    def test_list_jobs(self):
        # Setup Expected Response
        next_page_token = ""
        jobs_element = {}
        jobs = [jobs_element]
        expected_response = {"next_page_token": next_page_token, "jobs": jobs}
        expected_response = cloudscheduler_pb2.ListJobsResponse(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup Request
        parent = client.location_path("[PROJECT]", "[LOCATION]")

        paged_list_response = client.list_jobs(parent)
        resources = list(paged_list_response)
        assert len(resources) == 1

        assert expected_response.jobs[0] == resources[0]

        assert len(channel.requests) == 1
        expected_request = cloudscheduler_pb2.ListJobsRequest(parent=parent)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example no. 5
    def test_run_job(self):
        # Setup Expected Response
        name_2 = "name2-1052831874"
        description = "description-1724546052"
        schedule = "schedule-697920873"
        time_zone = "timeZone36848094"
        expected_response = {
            "name": name_2,
            "description": description,
            "schedule": schedule,
            "time_zone": time_zone,
        }
        expected_response = job_pb2.Job(**expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup Request
        name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

        response = client.run_job(name)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = cloudscheduler_pb2.RunJobRequest(name=name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example no. 6
def sync(event, context):
  """Sync projects with cloud datastore."""
  del event, context  # Unused.

  with ndb.Client().context():
    github_client = Github(get_access_token())
    repo = github_client.get_repo('google/oss-fuzz')
    projects = get_projects(repo)
    cloud_scheduler_client = scheduler_v1.CloudSchedulerClient()
    sync_projects(cloud_scheduler_client, projects)
Example no. 7
def sync(event, context):
    """Sync projects with cloud datastore."""
    del event, context  # Unused.

    with ndb.Client().context():
        git_creds = get_github_creds()
        github_client = Github(git_creds.client_id, git_creds.client_secret)
        repo = github_client.get_repo('google/oss-fuzz')
        projects = get_projects(repo)
        cloud_scheduler_client = scheduler_v1.CloudSchedulerClient()
        sync_projects(cloud_scheduler_client, projects)
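
Both sync() variants hand off to sync_projects, which is not part of these snippets. Purely for illustration, a hypothetical reconciliation could create one Pub/Sub-target job per project and remove jobs whose project disappeared; every id, topic and schedule below is an assumption:

def sync_projects(cloud_scheduler_client, projects):
    """Illustrative only: keep one scheduler job per project (ids and schedule are assumptions)."""
    parent = cloud_scheduler_client.location_path('my-project', 'us-central1')
    existing = {job.name.rsplit('/', 1)[-1]: job
                for job in cloud_scheduler_client.list_jobs(parent)}

    # Create a job for every project that does not have one yet.
    for project in projects:
        job_id = project + '-scheduler'
        if job_id in existing:
            continue
        cloud_scheduler_client.create_job(parent, {
            'name': parent + '/jobs/' + job_id,
            'pubsub_target': {
                'topic_name': 'projects/my-project/topics/request-build',
                'data': project.encode('utf-8'),
            },
            'schedule': '0 6 * * *',
        })

    # Delete jobs whose project is no longer in the list.
    for job_id, job in existing.items():
        if job_id[:-len('-scheduler')] not in projects:
            cloud_scheduler_client.delete_job(job.name)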
Example no. 8
    def test_list_jobs_exception(self):
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup request
        parent = client.location_path("[PROJECT]", "[LOCATION]")

        paged_list_response = client.list_jobs(parent)
        with pytest.raises(CustomException):
            list(paged_list_response)
Example no. 9
    def test_run_job_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup request
        name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

        with pytest.raises(CustomException):
            client.run_job(name)
Example no. 10
    def test_update_job_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup request
        job = {}
        update_mask = {}

        with pytest.raises(CustomException):
            client.update_job(job, update_mask)
Example no. 11
    def test_delete_job(self):
        channel = ChannelStub()
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = scheduler_v1.CloudSchedulerClient()

        # Setup Request
        name = client.job_path("[PROJECT]", "[LOCATION]", "[JOB]")

        client.delete_job(name)

        assert len(channel.requests) == 1
        expected_request = cloudscheduler_pb2.DeleteJobRequest(name=name)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example no. 12
def schedule_all_jobs(project_id: str, location_id: str,
                      time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes data into the prod bucket runs every 4 hours
    _schedule_job(
        path="/publish",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    _schedule_job(
        path="/convert_json_1",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # The convert to JSON task is split in two because otherwise it takes too long
    _schedule_job(
        path="/convert_json_2",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    job_urls_seen = set()

    for data_pipeline in get_pipelines():
        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = data_source.config.get("automation",
                                               {}).get("schedule", "0 * * * *")

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = data_source.config.get("automation",
                                               {}).get("job_group", idx)
            job_url = f"/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)
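
clear_jobs and schedule_job are project helpers that these snippets do not include. A hypothetical sketch of both against the same 1.x client surface, assuming the jobs target App Engine endpoints (the real implementations may differ):

def clear_jobs(client, project_id: str, location_id: str) -> None:
    """Delete every existing job in the given project/location (illustrative)."""
    parent = client.location_path(project_id, location_id)
    for job in client.list_jobs(parent):
        client.delete_job(job.name)


def schedule_job(path: str, schedule: str, client=None, project_id: str = "",
                 location_id: str = "", time_zone: str = "UTC") -> None:
    """Create a job that calls an App Engine endpoint on the given cron schedule (illustrative)."""
    parent = client.location_path(project_id, location_id)
    client.create_job(parent, {
        "app_engine_http_target": {"relative_uri": path},
        "schedule": schedule,
        "time_zone": time_zone,
    })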
Example no. 13
def schedule_all_jobs(project_id: str, location_id: str,
                      time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Read the list of all known locations, since we will be splitting some jobs based on that
    location_keys = list(
        table_read_column(SRC / "data" / "metadata.csv", "key"))

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes combined tables into the prod bucket runs every 2 hours
    _schedule_job(
        path="/publish_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # The job that publishes aggregate outputs runs every 4 hours
    _schedule_job(
        # Run in a separate, preemptible instance
        path="/deferred/publish_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/4 * * *",
    )

    # The job that publishes breakdown outputs runs every 4 hours
    _schedule_job(
        path="/deferred/publish_subset_tables",
        # Offset by 90 minutes to run after publishing
        schedule="30 1-23/4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 120 minutes to run after subset tables are published
            schedule="0 2-23/4 * * *",
        )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    job_urls_seen = set()

    for data_pipeline in get_pipelines():
        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # If the job is deferred, then prepend the token to the path
            job_prefix = "/deferred" if automation_opts.get("deferred") else ""

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = automation_opts.get("job_group", idx)
            job_url = f"{job_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)

    # V3 publish jobs start here

    # Publish the tables with all location keys every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_global_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # Publish the main aggregated table every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/2 * * *",
    )

    # Break down the outputs by location key every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            # Offset by 60 minutes to let other hourly tasks finish
            schedule="0 1-23/2 * * *",
        )

    # Publish outputs in JSON format every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 90 minutes to let other hourly tasks finish
            schedule="30 1-23/2 * * *",
        )
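
_split_into_subsets is another helper that is not shown. From how it is called, it slices the sorted location keys into bin_count contiguous chunks so each publish job only handles one slice; a hypothetical sketch:

from typing import Iterable, List


def _split_into_subsets(keys: Iterable[str], bin_count: int) -> List[List[str]]:
    """Split sorted keys into bin_count contiguous, roughly equal chunks (illustrative)."""
    sorted_keys = sorted(keys)
    bin_size = -(-len(sorted_keys) // bin_count)  # ceiling division
    return [sorted_keys[i:i + bin_size] for i in range(0, len(sorted_keys), bin_size)]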
Example no. 14
'''
Scheduler to run crawler trigger
Every day at noon
'''

import os
import json
from google.api_core.exceptions import NotFound
from google.cloud import scheduler_v1

# Create non-empty data to be passed to pub/sub msg
data = {}
data = json.dumps(data).encode('utf-8')

# Instantiate scheduler client
client = scheduler_v1.CloudSchedulerClient()

# Get env vars and create job/topic strings
project_name = os.getenv('GOOGLE_CLOUD_PROJECT')
appengine_location = os.getenv('APPENG_LOCATION')
job_id = 'pronova_cntx_cronjob'
topic_id = os.getenv('TOPIC_NAME_CNTX')
job_name = f'projects/{project_name}/locations/{appengine_location}/jobs/{job_id}'
topic_name = f'projects/{project_name}/topics/{topic_id}'

parent = client.location_path(project_name, appengine_location)
job = {
    'name': job_name,
    'pubsub_target': {
        'topic_name': topic_name,
        'data': data
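
The snippet above is cut off inside the job definition. Going by the docstring ('Every day at noon') and the NotFound import, a plausible completion would look like the following; the schedule, time zone and delete-then-recreate step are assumptions, not the original code:

    },
    'schedule': '0 12 * * *',  # assumption: daily at noon, per the docstring
    'time_zone': 'UTC',        # assumption
}

# Recreate the job idempotently: remove any previous copy, then create it again.
try:
    client.delete_job(job_name)
except NotFound:
    pass
response = client.create_job(parent, job)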
Example no. 15
    def test_create_job(self):
        client = scheduler_v1.CloudSchedulerClient()
        project_id = os.environ.get("PROJECT_ID")
        location_id = "us-central1"
        parent = f"projects/{project_id}/locations/{location_id}"
        client.list_jobs(parent=parent)
Example no. 16
    def test_create_job(self):
        client = scheduler_v1.CloudSchedulerClient()
        parent = client.location_path(os.environ.get("PROJECT_ID"),
                                      "us-central1")
        client.list_jobs(parent)
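
Examples no. 15 and 16 build the parent resource in two equivalent ways: by formatting the projects/{project}/locations/{location} string by hand and by calling the location_path helper. The helper exists only on the older 1.x client (on 2.x the closest equivalent appears to be common_location_path), so the hand-formatted string from Example no. 15 is the more portable choice:

parent = f"projects/{project_id}/locations/{location_id}"
client.list_jobs(parent=parent)  # keyword form works on 2.x; 1.x also accepts list_jobs(parent)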