def install_job(job_dict):
    job_name = job_dict['id']

    # attempt to delete current job, if any:
    _remove_job_by_name(job_name)

    log.info('Adding job {}:\n{}'.format(job_name, json.dumps(job_dict)))
    sdk_cmd.service_request('POST', 'metronome', '/v1/jobs', json=job_dict)
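# Hypothetical usage sketch (not part of the original module): build a minimal
# Metronome job definition and install it with install_job(). The job id,
# command, and resource values below are illustrative assumptions only.
example_job = {
    'id': 'example.write-data',  # hypothetical job name
    'run': {
        'cmd': 'echo "writing test data"',
        'cpus': 0.1,
        'mem': 128,
        'disk': 0,
    },
}
install_job(example_job)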
def _remove_job_by_name(job_name):
    try:
        # Metronome doesn't understand 'True' -- only 'true' will do.
        sdk_cmd.service_request(
            'DELETE',
            'metronome',
            '/v1/jobs/{}'.format(job_name),
            retry=False,
            params={'stopCurrentJobRuns': 'true'})
    except Exception:
        log.info(
            'Failed to remove any existing job named {} (this is likely as expected):\n{}'
            .format(job_name, traceback.format_exc()))
def wait_for_plan():
    response = sdk_cmd.service_request(
        'GET',
        service_name,
        '/v1/plans/{}'.format(plan),
        raise_on_error=False)
    if response.status_code == 417:
        return response  # avoid throwing, return plan with errors
    response.raise_for_status()
    return response
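# Sketch (an assumption, not the original wrapper): wait_for_plan() above reads
# service_name and plan from an enclosing function's scope and is typically
# retried until the plan endpoint responds; a parameterized stand-in could look
# like this.
@retrying.retry(
    wait_fixed=1000,                    # poll roughly once per second
    stop_max_delay=5 * 60 * 1000,       # illustrative 5 minute cap
    retry_on_exception=lambda e: True)  # retry through transient HTTP errors
def fetch_plan(service_name, plan):
    response = sdk_cmd.service_request(
        'GET', service_name, '/v1/plans/{}'.format(plan), raise_on_error=False)
    if response.status_code == 417:
        return response  # plan exists but reports errors; don't raise
    response.raise_for_status()
    return response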
def run_job(job_dict, timeout_seconds=600, raise_on_failure=True):
    job_name = job_dict['id']

    # Start job run, get run ID to poll against:
    run_id = sdk_cmd.service_request(
        'POST',
        'metronome',
        '/v1/jobs/{}/runs'.format(job_name),
        log_args=False).json()['id']
    log.info('Started job {}: run id {}'.format(job_name, run_id))

    # Wait for run to succeed, throw if run fails:
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=timeout_seconds * 1000,
        retry_on_result=lambda res: not res)
    def wait():
        # Note: We COULD directly query the run here via /v1/jobs/<job_name>/runs/<run_id>, but that
        # only works for active runs -- for whatever reason the run will disappear after it's done.
        # Therefore we have to query the full run history from the parent job and find our run_id there.
        run_history = sdk_cmd.service_request(
            'GET',
            'metronome',
            '/v1/jobs/{}'.format(job_name),
            retry=False,
            params={'embed': 'history'}).json()['history']
        successful_run_ids = [run['id'] for run in run_history['successfulFinishedRuns']]
        failed_run_ids = [run['id'] for run in run_history['failedFinishedRuns']]
        log.info(
            'Job {} run history (waiting for successful {}): successful={} failed={}'
            .format(job_name, run_id, successful_run_ids, failed_run_ids))
        # Note: If a job has restart.policy=ON_FAILURE, it won't show up in failed_run_ids even when it fails.
        # Instead it will just keep restarting automatically until it succeeds or is deleted.
        if raise_on_failure and run_id in failed_run_ids:
            raise Exception(
                'Job {} with id {} has failed, exiting early'.format(job_name, run_id))
        return run_id in successful_run_ids

    wait()

    return run_id
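# Hypothetical end-to-end usage (job name and command are illustrative
# assumptions): install a Metronome job, run it to completion, then remove it.
verify_job = {
    'id': 'example.verify-data',
    'run': {'cmd': 'echo "verifying test data"', 'cpus': 0.1, 'mem': 128, 'disk': 0},
}
install_job(verify_job)
try:
    run_id = run_job(verify_job, timeout_seconds=300)  # raises if the run fails
    log.info('Run {} of {} completed successfully'.format(run_id, verify_job['id']))
finally:
    _remove_job_by_name(verify_job['id'])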
def get_scheduler_metrics(service_name, timeout_seconds=15 * 60):
    """Returns a dict tree of Scheduler metrics fetched directly from the scheduler.

    Returned data will match the content of /service/<svc_name>/v1/metrics.
    """
    return sdk_cmd.service_request('GET', service_name, '/v1/metrics').json()
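# Sketch of how the metrics tree might be inspected. Assumption: the payload
# layout below (top-level 'gauges' map with per-metric 'value' entries) is
# illustrative only and may differ per scheduler.
metrics = get_scheduler_metrics('sdk-cassandra')
for name, gauge in metrics.get('gauges', {}).items():
    log.info('gauge {} = {}'.format(name, gauge.get('value')))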
def test_cassandra_migration():
    backup_service_name = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    restore_service_name = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    backup_node_address = os.getenv('BACKUP_NODE_ADDRESS', config.DEFAULT_NODE_ADDRESS)
    backup_node_port = os.getenv('BACKUP_NODE_PORT', config.DEFAULT_NODE_PORT)
    backup_write_data_job = config.get_write_data_job(backup_node_address, backup_node_port)
    backup_verify_data_job = config.get_verify_data_job(backup_node_address, backup_node_port)
    backup_delete_data_job = config.get_delete_data_job(backup_node_address, backup_node_port)
    backup_verify_deletion_job = config.get_verify_deletion_job(backup_node_address, backup_node_port)

    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    backup_install_job_context = sdk_jobs.InstallJobContext([
        backup_write_data_job, backup_verify_data_job,
        backup_delete_data_job, backup_verify_deletion_job])
    backup_run_job_context = sdk_jobs.RunJobContext(
        before_jobs=[backup_write_data_job, backup_verify_data_job],
        after_jobs=[backup_delete_data_job, backup_verify_deletion_job])

    # Install and run the write/delete data jobs against the backup cluster,
    # running dcos-cassandra-service:
    with backup_install_job_context, backup_run_job_context:
        # Back this cluster up to S3
        backup_parameters = {
            'backup_name': plan_parameters['SNAPSHOT_NAME'],
            's3_access_key': plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key': plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location': 's3://{}'.format(plan_parameters['S3_BUCKET_NAME']),
        }
        sdk_cmd.service_request('PUT', backup_service_name, '/v1/backup/start', json=backup_parameters)
        sdk_plan.wait_for_completed_deployment(backup_service_name)

    # Restore data to second instance:
    restore_node_address = os.getenv(
        'RESTORE_NODE_ADDRESS', sdk_hosts.autoip_host('sdk-cassandra', 'node-0-server'))
    restore_node_port = os.getenv('RESTORE_NODE_PORT', '9052')
    restore_write_data_job = config.get_write_data_job(restore_node_address, restore_node_port)
    restore_verify_data_job = config.get_verify_data_job(restore_node_address, restore_node_port)
    restore_delete_data_job = config.get_delete_data_job(restore_node_address, restore_node_port)
    restore_verify_deletion_job = config.get_verify_deletion_job(restore_node_address, restore_node_port)

    restore_install_job_context = sdk_jobs.InstallJobContext([
        restore_write_data_job, restore_verify_data_job,
        restore_delete_data_job, restore_verify_deletion_job])
    restore_run_job_context = sdk_jobs.RunJobContext(after_jobs=[
        restore_verify_data_job, restore_delete_data_job, restore_verify_deletion_job])
    with restore_install_job_context, restore_run_job_context:
        sdk_plan.start_plan(restore_service_name, 'restore-s3', parameters=plan_parameters)
        sdk_plan.wait_for_completed_plan(restore_service_name, 'restore-s3')
def start_plan(service_name, plan, parameters=None):
    sdk_cmd.service_request(
        'POST',
        service_name,
        '/v1/plans/{}/start'.format(plan),
        json=parameters if parameters is not None else {})
def list_plans(service_name, timeout_seconds=TIMEOUT_SECONDS):
    return sdk_cmd.service_request('GET', service_name, '/v1/plans').json()
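# Hypothetical workflow sketch: list the plans a service exposes, then kick one
# off with runtime parameters. The 'restore-s3' plan name comes from the test
# above; the SNAPSHOT_NAME value here is an illustrative assumption.
service_name = 'sdk-cassandra'
log.info('Available plans for {}: {}'.format(service_name, list_plans(service_name)))
start_plan(service_name, 'restore-s3', parameters={'SNAPSHOT_NAME': 'example-snapshot'})
# Completion would then be polled, e.g. via sdk_plan.wait_for_completed_plan()
# as done in test_cassandra_migration above.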