Example #1
def install_job(job_dict):
    job_name = job_dict['id']

    # attempt to delete current job, if any:
    _remove_job_by_name(job_name)

    log.info('Adding job {}:\n{}'.format(job_name, json.dumps(job_dict)))
    sdk_cmd.service_request('POST', 'metronome', '/v1/jobs', json=job_dict)
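These snippets assume the usual module-level imports (json, logging, traceback, retrying, plus the DC/OS SDK test helpers such as sdk_cmd, sdk_plan, sdk_jobs and sdk_hosts). As a minimal usage sketch, install_job would be handed a Metronome-style job definition; the job id and run settings below are illustrative, not taken from this page:

# Hypothetical job definition following the Metronome v1 job schema:
example_job = {
    'id': 'example-job',
    'run': {
        'cmd': 'echo hello',
        'cpus': 0.1,
        'mem': 32,
        'disk': 0,
    }
}
install_job(example_job)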
Example #2
def _remove_job_by_name(job_name):
    try:
        # Metronome doesn't understand 'True' -- only 'true' will do.
        sdk_cmd.service_request('DELETE',
                                'metronome',
                                '/v1/jobs/{}'.format(job_name),
                                retry=False,
                                params={'stopCurrentJobRuns': 'true'})
    except Exception:
        log.info(
            'Failed to remove any existing job named {} (this is likely as expected):\n{}'
            .format(job_name, traceback.format_exc()))
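Two design choices are worth noting here: retry=False stops sdk_cmd from retrying the DELETE, and the broad exception handler swallows the failure, because the job usually does not exist yet and a 404 here is expected rather than an error. The traceback is still logged at info level for debugging.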
Example #3
    def wait():
        # Note: We COULD directly query the run here via /v1/jobs/<job_name>/runs/<run_id>, but that
        # only works for active runs -- for whatever reason the run will disappear after it's done.
        # Therefore we have to query the full run history from the parent job and find our run_id there.
        run_history = sdk_cmd.service_request('GET',
                                              'metronome',
                                              '/v1/jobs/{}'.format(job_name),
                                              retry=False,
                                              params={
                                                  'embed': 'history'
                                              }).json()['history']

        successful_run_ids = [
            run['id'] for run in run_history['successfulFinishedRuns']
        ]
        failed_run_ids = [
            run['id'] for run in run_history['failedFinishedRuns']
        ]

        log.info(
            'Job {} run history (waiting for successful {}): successful={} failed={}'
            .format(job_name, run_id, successful_run_ids, failed_run_ids))

        # Note: If a job has restart.policy=ON_FAILURE, it won't show up in failed_run_ids even when it fails.
        #       Instead it will just keep restarting automatically until it succeeds or is deleted.
        if raise_on_failure and run_id in failed_run_ids:
            raise Exception(
                'Job {} with id {} has failed, exiting early'.format(
                    job_name, run_id))

        return run_id in successful_run_ids
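By itself, wait() is only a predicate: it returns True once run_id appears among the successful runs, and raises early if the run shows up as failed. Example #5 below supplies the missing piece, wrapping the function with retrying.retry so it is polled once per second until it returns a truthy result or the timeout elapses.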
Example #4
def wait_for_plan():
    response = sdk_cmd.service_request('GET',
                                       service_name,
                                       '/v1/plans/{}'.format(plan),
                                       raise_on_error=False)
    if response.status_code == 417:
        return response  # avoid throwing, return plan with errors
    response.raise_for_status()
    return response
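A sketch of how such a helper might be polled, using the retrying decorator from Example #5. It assumes the plan JSON exposes a top-level 'status' field that reads 'COMPLETE' when the plan has finished, and that treating a 417 response as "not done yet" is acceptable; both are assumptions, not shown on this page:

import retrying

@retrying.retry(wait_fixed=1000,
                stop_max_delay=15 * 60 * 1000,
                retry_on_result=lambda res: not res)
def wait_for_complete():
    # wait_for_plan() is the helper above; 'service_name' and 'plan'
    # are assumed to be in scope.
    response = wait_for_plan()
    if response.status_code == 417:
        return False  # plan has errors; keep polling (assumption)
    return response.json().get('status') == 'COMPLETE'

wait_for_complete()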
Example #5
def run_job(job_dict, timeout_seconds=600, raise_on_failure=True):
    job_name = job_dict['id']

    # Start job run, get run ID to poll against:
    run_id = sdk_cmd.service_request('POST',
                                     'metronome',
                                     '/v1/jobs/{}/runs'.format(job_name),
                                     log_args=False).json()['id']
    log.info('Started job {}: run id {}'.format(job_name, run_id))

    # Wait for run to succeed, throw if run fails:
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=timeout_seconds * 1000,
                    retry_on_result=lambda res: not res)
    def wait():
        # Note: We COULD directly query the run here via /v1/jobs/<job_name>/runs/<run_id>, but that
        # only works for active runs -- for whatever reason the run will disappear after it's done.
        # Therefore we have to query the full run history from the parent job and find our run_id there.
        run_history = sdk_cmd.service_request('GET',
                                              'metronome',
                                              '/v1/jobs/{}'.format(job_name),
                                              retry=False,
                                              params={
                                                  'embed': 'history'
                                              }).json()['history']

        successful_run_ids = [
            run['id'] for run in run_history['successfulFinishedRuns']
        ]
        failed_run_ids = [
            run['id'] for run in run_history['failedFinishedRuns']
        ]

        log.info(
            'Job {} run history (waiting for successful {}): successful={} failed={}'
            .format(job_name, run_id, successful_run_ids, failed_run_ids))

        # Note: If a job has restart.policy=ON_FAILURE, it won't show up in failed_run_ids even when it fails.
        #       Instead it will just keep restarting automatically until it succeeds or is deleted.
        if raise_on_failure and run_id in failed_run_ids:
            raise Exception(
                'Job {} with id {} has failed, exiting early'.format(
                    job_name, run_id))

        return run_id in successful_run_ids

    wait()

    return run_id
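Putting Examples #1 and #5 together, a typical call sequence installs a job and then runs it to completion. example_job refers to the illustrative job definition sketched under Example #1:

install_job(example_job)
run_id = run_job(example_job, timeout_seconds=300)
log.info('Job run {} completed successfully'.format(run_id))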
Example #6
def get_scheduler_metrics(service_name, timeout_seconds=15 * 60):
    """Returns a dict tree of Scheduler metrics fetched directly from the scheduler.
    Returned data will match the content of /service/<svc_name>/v1/metrics.
    """
    return sdk_cmd.service_request('GET', service_name, '/v1/metrics').json()
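A hedged sketch of reading the result, assuming the scheduler exposes Dropwizard-style metrics grouped under keys like 'gauges' and 'counters' (the service name is illustrative):

# Assumption: Dropwizard-style layout with a 'counters' group whose
# entries each carry a 'count' value.
metrics = get_scheduler_metrics('my-service')
for name, counter in metrics.get('counters', {}).items():
    log.info('counter {}: {}'.format(name, counter.get('count')))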
Example #7
def test_cassandra_migration():
    backup_service_name = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    restore_service_name = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    backup_node_address = os.getenv('BACKUP_NODE_ADDRESS',
                                    config.DEFAULT_NODE_ADDRESS)
    backup_node_port = os.getenv('BACKUP_NODE_PORT', config.DEFAULT_NODE_PORT)

    backup_write_data_job = config.get_write_data_job(backup_node_address,
                                                      backup_node_port)
    backup_verify_data_job = config.get_verify_data_job(
        backup_node_address, backup_node_port)
    backup_delete_data_job = config.get_delete_data_job(
        backup_node_address, backup_node_port)
    backup_verify_deletion_job = config.get_verify_deletion_job(
        backup_node_address, backup_node_port)

    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME',
                                    'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    backup_install_job_context = sdk_jobs.InstallJobContext([
        backup_write_data_job, backup_verify_data_job, backup_delete_data_job,
        backup_verify_deletion_job
    ])
    backup_run_job_context = sdk_jobs.RunJobContext(
        before_jobs=[backup_write_data_job, backup_verify_data_job],
        after_jobs=[backup_delete_data_job, backup_verify_deletion_job])
    # Install and run the write/delete data jobs against the backup cluster,
    # which runs dcos-cassandra-service.
    with backup_install_job_context, backup_run_job_context:
        # Back this cluster up to S3
        backup_parameters = {
            'backup_name': plan_parameters['SNAPSHOT_NAME'],
            's3_access_key': plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key': plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location': 's3://{}'.format(plan_parameters['S3_BUCKET_NAME']),
        }
        sdk_cmd.service_request('PUT',
                                backup_service_name,
                                '/v1/backup/start',
                                json=backup_parameters)
        sdk_plan.wait_for_completed_deployment(backup_service_name)

    # Restore data to the second instance:
    restore_node_address = os.getenv(
        'RESTORE_NODE_ADDRESS',
        sdk_hosts.autoip_host('sdk-cassandra', 'node-0-server'))
    restore_node_port = os.getenv('RESTORE_NODE_PORT', '9052')

    restore_write_data_job = config.get_write_data_job(restore_node_address,
                                                       restore_node_port)
    restore_verify_data_job = config.get_verify_data_job(
        restore_node_address, restore_node_port)
    restore_delete_data_job = config.get_delete_data_job(
        restore_node_address, restore_node_port)
    restore_verify_deletion_job = config.get_verify_deletion_job(
        restore_node_address, restore_node_port)

    restore_install_job_context = sdk_jobs.InstallJobContext([
        restore_write_data_job, restore_verify_data_job,
        restore_delete_data_job, restore_verify_deletion_job
    ])
    restore_run_job_context = sdk_jobs.RunJobContext(after_jobs=[
        restore_verify_data_job, restore_delete_data_job,
        restore_verify_deletion_job
    ])
    with restore_install_job_context, restore_run_job_context:
        sdk_plan.start_plan(restore_service_name,
                            'restore-s3',
                            parameters=plan_parameters)
        sdk_plan.wait_for_completed_plan(restore_service_name, 'restore-s3')
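The sdk_jobs context managers are not shown on this page. Based on the helpers in Examples #1 and #2, a plausible minimal shape for InstallJobContext might look like the following; this is an illustrative reconstruction, not the library's actual code:

class InstallJobContext(object):
    # Hypothetical sketch: install the given Metronome jobs on entry,
    # remove them again on exit.
    def __init__(self, jobs):
        self.jobs = jobs

    def __enter__(self):
        for job in self.jobs:
            install_job(job)

    def __exit__(self, exc_type, exc_value, tb):
        for job in self.jobs:
            _remove_job_by_name(job['id'])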
Example #8
def start_plan(service_name, plan, parameters=None):
    sdk_cmd.service_request('POST',
                            service_name,
                            '/v1/plans/{}/start'.format(plan),
                            json=parameters if parameters is not None else {})
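This pairs naturally with the polling sketch after Example #4: first start the plan, then poll its endpoint until it reports completion. That is exactly the sequence Example #7 follows via sdk_plan.start_plan and sdk_plan.wait_for_completed_plan.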
Example #9
def list_plans(service_name, timeout_seconds=TIMEOUT_SECONDS):
    return sdk_cmd.service_request('GET', service_name, '/v1/plans').json()
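list_plans returns the decoded JSON as-is; a minimal usage sketch with an illustrative service name:

plans = list_plans('my-service')
log.info('Available plans: {}'.format(plans))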