def __init__(self, args, environ):
  self.args = args

  # Fetch all required environment variables, exiting if unset.
  self.environ = sys_util.copy_from_env(
      ["CROMWELL", "CROMWELL_CONF", "JVM_OPTS"], environ)

  cromwell_conf = self.environ["CROMWELL_CONF"]
  cromwell_jar = self.environ["CROMWELL"]
  raw_jvm_flags = self.environ["JVM_OPTS"]

  jvm_flags = None
  if raw_jvm_flags:
    jvm_flags = raw_jvm_flags.split(" ")

  # Verify that the output directory is empty (or not there).
  if self.args.output_dir and not file_util.verify_gcs_dir_empty_or_missing(
      self.args.output_dir):
    sys_util.exit_with_error(
        "Output directory not empty: %s" % self.args.output_dir)

  # Plug the working directory and the project id into the Cromwell conf.
  self.fill_cromwell_conf(cromwell_conf, self.args.working_dir,
                          self.args.project)

  # Set up the Cromwell driver.
  self.driver = cromwell_driver.CromwellDriver(cromwell_conf, cromwell_jar,
                                               jvm_flags)
  self.driver.start()
def verify_gcs_dir_empty_or_missing(path):
  """Verify that the output "directory" does not exist or is empty."""
  # Use the storage API directly instead of gsutil.
  # gsutil does not return explicit error codes, so detecting a
  # non-existent path would require capturing and parsing the error message.

  # Verify the input is a GCS path.
  if not path.startswith('gs://'):
    sys_util.exit_with_error("Path is not a GCS path: '%s'" % path)

  # Tokenize the path into bucket and prefix.
  parts = path[len('gs://'):].split('/', 1)
  bucket = parts[0]
  prefix = parts[1] if len(parts) > 1 else None

  # Get the storage endpoint.
  credentials = GoogleCredentials.get_application_default()
  service = discovery.build(
      'storage', 'v1', credentials=credentials, cache_discovery=False)

  # Build the request - only the object name is needed.
  fields = 'nextPageToken,items(name)'
  request = service.objects().list(
      bucket=bucket, prefix=prefix, fields=fields, maxResults=2)

  # If we get more than 1 item, we are done (directory not empty).
  # If we get zero items, we are done (directory empty).
  # If we get exactly 1 item, check whether it is a "directory object".
  items = []
  while request and len(items) < 2:
    try:
      response = request.execute()
    except HttpError as err:
      error = json.loads(err.content)['error']
      sys_util.exit_with_error(
          "%s %s: '%s'" % (error['code'], error['message'], path))

    items.extend(response.get('items', []))
    request = service.objects().list_next(request, response)

  if not items:
    return True

  # A single placeholder object whose name matches the prefix means the
  # "directory" exists but is empty. Guard against prefix being None
  # (a bare gs://bucket path), which would otherwise raise AttributeError.
  if len(items) == 1 and prefix and (
      items[0]['name'].rstrip('/') == prefix.rstrip('/')):
    return True

  return False
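# A minimal usage sketch for the check above, mirroring how the __init__
# methods in this section call it. The module names match the calls seen
# elsewhere here; the bucket path is a hypothetical placeholder.
import file_util
import sys_util

output_dir = 'gs://my-bucket/pipeline/outputs'  # hypothetical path
if not file_util.verify_gcs_dir_empty_or_missing(output_dir):
  sys_util.exit_with_error("Output directory not empty: %s" % output_dir)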
def gsutil_cp(source_files, dest_dir):
  """Copies files to GCS and exits on error."""
  cp_cmd = ['gsutil', 'cp'] + source_files + [dest_dir]
  logging.info("Copying %s to %s", source_files, dest_dir)

  # Copies can fail, so include retries...
  for attempt in range(3):
    p = subprocess.Popen(cp_cmd, stderr=subprocess.PIPE)
    return_code = p.wait()
    if not return_code:
      return
    logging.warning("Copy %s to %s failed: attempt %d",
                    source_files, dest_dir, attempt)

  sys_util.exit_with_error(
      "copying files from %s to %s failed: %s" % (
          source_files, dest_dir, p.stderr.read()))
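# A short usage sketch for gsutil_cp, assuming it lives in file_util as the
# other snippets here suggest. The local filenames and destination are
# hypothetical, and gsutil must be available on PATH.
import file_util

file_util.gsutil_cp(['outputs.json', 'metadata.json'],
                    'gs://my-bucket/pipeline/outputs/')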
def __init__(self, args, environ):
  self.args = args

  # Fetch all required environment variables, exiting if unset.
  self.environ = sys_util.copy_from_env(
      ['CROMWELL', 'CROMWELL_CONF'], environ)

  cromwell_conf = self.environ['CROMWELL_CONF']
  cromwell_jar = self.environ['CROMWELL']

  # Verify that the output directory is empty (or not there).
  if not file_util.verify_gcs_dir_empty_or_missing(self.args.output_dir):
    sys_util.exit_with_error(
        "Output directory not empty: %s" % self.args.output_dir)

  # Plug the working directory and the project id into the Cromwell conf.
  self.fill_cromwell_conf(cromwell_conf, self.args.working_dir,
                          self.args.project)

  # Set up the Cromwell driver.
  self.driver = cromwell_driver.CromwellDriver(cromwell_conf, cromwell_jar)
  self.driver.start()
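# The __init__ methods above rely on sys_util.copy_from_env to pull required
# variables out of the environment, "exiting if unset". A minimal sketch of
# what such a helper might look like; the real implementation may differ.
import sys

def copy_from_env(keys, environ):
  """Returns a dict of the named variables, exiting if any is unset."""
  missing = [key for key in keys if key not in environ]
  if missing:
    sys.exit("Missing required environment variables: %s" % ", ".join(missing))
  return {key: environ[key] for key in keys}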
def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
  """Post new job to the server and poll for completion."""

  # Add required input files.
  with open(wdl, 'r') as f:
    wdl_source = f.read()
  with open(workflow_inputs, 'r') as f:
    wf_inputs = f.read()

  files = {
      'wdlSource': wdl_source,
      'workflowInputs': wf_inputs,
  }

  # Add workflow options if specified.
  if workflow_options:
    with open(workflow_options, 'r') as f:
      wf_options = f.read()
    files['workflowOptions'] = wf_options

  # After Cromwell starts, it may take a few seconds to be ready for requests.
  # Poll up to a minute for a successful connect and submit.
  job = None
  max_time_wait = 60
  wait_interval = 5

  time.sleep(wait_interval)
  # NOTE: `session` (a requests.Session) is assumed to come from the
  # enclosing scope in this variant.
  for attempt in range(max_time_wait // wait_interval):
    try:
      job = self.fetch(session, post=True, files=files)
      break
    except requests.exceptions.ConnectionError as e:
      logging.info("Failed to connect to Cromwell (attempt %d): %s",
                   attempt + 1, e)
      time.sleep(wait_interval)

  if not job:
    sys_util.exit_with_error(
        "Failed to connect to Cromwell after {0} seconds".format(
            max_time_wait))

  if job['status'] != 'Submitted':
    sys_util.exit_with_error(
        "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
            job['status']))

  # Job is running.
  cromwell_id = job['id']
  logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

  # Poll Cromwell for job completion.
  attempt = 0
  max_failed_attempts = 3
  while True:
    time.sleep(sleep_time)

    # Cromwell occasionally fails to respond to the status request.
    # Only give up after 3 consecutive failed requests.
    try:
      status_json = self.fetch(session, wf_id=cromwell_id, method='status')
      attempt = 0
    except requests.exceptions.ConnectionError as e:
      attempt += 1
      logging.info("Error polling Cromwell job status (attempt %d): %s",
                   attempt, e)
      if attempt >= max_failed_attempts:
        sys_util.exit_with_error(
            "Cromwell did not respond for %d consecutive requests" % attempt)
      continue

    status = status_json['status']
    if status == 'Succeeded':
      break
    elif status == 'Submitted':
      pass
    elif status == 'Running':
      pass
    else:
      sys_util.exit_with_error(
          "Status of job is not Submitted, Running, or Succeeded: %s" % status)

    logging.info("Cromwell job status: %s", status)

  # Cromwell produces a list of outputs and full job details.
  outputs = self.fetch(session, wf_id=cromwell_id, method='outputs')
  metadata = self.fetch(session, wf_id=cromwell_id, method='metadata')

  return outputs, metadata
def batch(self, submission_id, wdl, inputs, options, batch_limit,
          query_limit):
    logging.info("Starting batch request. Waiting for cromwell to start...")
    self.logger.log(
        "Beginning batch request",
        batch_limit=batch_limit,
        query_limit=query_limit,
    )
    time.sleep(60)
    with open(wdl, 'r') as wdlReader:
        with open(options, 'r') as optionReader:
            opts = json.load(optionReader)
            opts['google_labels'] = {
                'lapdog-submission-id': 'id-' + submission_id,
                'lapdog-execution-role': 'worker'
            }
            data = {
                'workflowSource': wdlReader.read(),
                # 'workflowInputs': json.dumps([line for line in reader]),
                'workflowOptions': json.dumps(opts),
            }
    logging.info("Starting the following configuration: " + json.dumps(data))
    output = []
    first = True
    with open(inputs, 'r') as inputReader:
        reader = csv.DictReader(inputReader, delimiter='\t',
                                lineterminator='\n')
        with requests.Session() as session:
            for batch in clump(reader, batch_limit):
                self.check_cromwell()
                logging.info("Running a new batch of %d workflows"
                             % batch_limit)
                chunk = []
                if not first:
                    # Restart cromwell between batches to reclaim resources.
                    logging.info("Restarting cromwell...")
                    self.cromwell_proc.kill()
                    self.cromwell_proc = None
                    time.sleep(10)
                    self.start(self.mem)
                    time.sleep(20)
                    logging.info("Resuming next batch")
                else:
                    first = False
                for group in clump(batch, query_limit):
                    logging.info("Starting a chunk of %d workflows"
                                 % query_limit)
                    group = [line for line in group]
                    logging.info("There are %d workflows in this group"
                                 % len(group))
                    response = None
                    for attempt in range(10):
                        try:
                            data['workflowInputs'] = json.dumps(
                                [unpack(line) for line in group])
                            self.logger.log('Launching workflow batch',
                                            json=data)
                            response = session.post(
                                'http://localhost:8000/api/workflows/v1/batch',
                                files=data)
                            response = response.json()
                            logging.info("Submitted jobs. Begin polling")
                            break
                        except requests.exceptions.ConnectionError as e:
                            self.logger.log_exception()
                            traceback.print_exc()
                            self.check_cromwell()
                            logging.info(
                                "Failed to connect to Cromwell (attempt %d): %s",
                                attempt + 1, e)
                            time.sleep(30)
                        except ValueError:
                            self.logger.log_exception(
                                "JSON Decode error",
                                response=(response.text
                                          if response is not None else None),
                            )
                            traceback.print_exc()
                            self.check_cromwell()
                            logging.error(
                                "Unexpected response from Cromwell: (%d) : %s"
                                % (response.status_code, response.text))
                            raise
                    if not response:
                        self.check_cromwell()
                        self.logger.log("Cromwell timeout", severity="WARNING")
                        sys_util.exit_with_error(
                            "Failed to connect to Cromwell after {0} seconds"
                            .format(300))
                    logging.info("Raw response: " + repr(response))
                    for job in response:
                        if (job['status'] != 'Submitted'
                                and job['status'] != 'Running'):
                            # Abort every job in this response before exiting.
                            for pending in response:
                                self.abort(pending['id'])
                            self.logger.log(
                                'Unexpected job status',
                                status=job['status'],
                                jobs=response,
                                severity='ERROR')
                            sys_util.exit_with_error(
                                "Job {} status from Cromwell was not "
                                "'Submitted', instead '{}'".format(
                                    job['id'], job['status']))
                        else:
                            chunk.append(job)
                    self.batch_submission = True
                    self.check_cromwell()

                    # Registered per chunk; aborts outstanding jobs if the
                    # process exits mid-submission.
                    @atexit.register
                    def abort_all_jobs():
                        if self.batch_submission:
                            for job in response:
                                self.abort(job['id'])

                    # Give the submitted chunk a minute to spin up.
                    for i in range(12):
                        time.sleep(5)

                attempt = 0
                max_failed_attempts = 3
                known_failures = set()
                while True:
                    for i in range(3):
                        time.sleep(10)
                        self.check_cromwell()
                    # Cromwell occasionally fails to respond to the status
                    # request. Only give up after 3 consecutive failed requests.
                    try:
                        # Fetch each job's status, pausing briefly between
                        # requests to avoid hammering the server.
                        status_json = [[
                            self.fetch(session, wf_id=job['id'],
                                       method='status'),
                            time.sleep(0.1)
                        ][0] for job in chunk]
                        attempt = 0
                    except requests.exceptions.ConnectionError as e:
                        self.logger.log_exception()
                        attempt += 1
                        logging.info(
                            "Error polling Cromwell job status (attempt %d): %s",
                            attempt, e)
                        self.check_cromwell()
                        if attempt >= max_failed_attempts:
                            self.logger.log(
                                'Cromwell crash with active workflows',
                                jobs=chunk,
                                severity='WARNING')
                            sys_util.exit_with_error(
                                "Cromwell did not respond for %d consecutive"
                                " requests" % attempt)
                        continue
                    statuses = {job['status'] for job in status_json}
                    # logging.info("<WORKFLOW STATUS UPDATE> %s"
                    #              % json.dumps(status_json))
                    if 'Failed' in statuses:
                        new_failures = [
                            job for job in status_json
                            if job['status'] == 'Failed'
                            and job['id'] not in known_failures
                        ]
                        if len(new_failures):
                            sys.stderr.write(
                                "The following jobs failed: %s\n" % (', '.join(
                                    '%s (%s)' % (job['id'], job['status'])
                                    for job in new_failures)))
                            known_failures |= {
                                job['id'] for job in new_failures
                            }
                    if not len(statuses - {'Succeeded', 'Failed', 'Aborted'}):
                        logging.info("All workflows in terminal states")
                        self.logger.log(
                            'Batch complete',
                            json=status_json,
                        )
                        break
                self.batch_submission = False
                output += [{
                    'workflow_id': job['id'],
                    'workflow_status': job['status'],
                    'workflow_output': self.fetch(
                        session, wf_id=job['id'], method='outputs')
                    if job['status'] == 'Succeeded' else None,
                    'workflow_metadata': self.fetch(
                        session, wf_id=job['id'], method='metadata')
                    if job['status'] == 'Succeeded' else None,
                } for job in status_json]
                self.check_cromwell()
                if 'Aborted' in statuses:
                    # Quit now. No reason to start a new batch to get aborted.
                    self.logger.log('Submission aborted', json=output)
                    sys.stderr.write(
                        "There were aborted workflows. "
                        "Aborting submission now.")
                    return output
    logging.info("<SUBMISSION COMPLETE. FINALIZING DATA>")
    self.logger.log('Submission complete. Finalizing data', json=output)
    return output
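# The batch method above chunks its input with a clump helper that is not
# shown here. A plausible sketch, assuming clump simply yields successive
# groups of at most `size` items from any iterable:
import itertools

def clump(iterable, size):
    """Yields tuples of up to `size` consecutive items from `iterable`."""
    iterator = iter(iterable)
    while True:
        group = tuple(itertools.islice(iterator, size))
        if not group:
            return
        yield group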
def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
  """Post new job to the server and poll for completion."""

  # Add required input files.
  with open(wdl, 'rb') as f:
    wdl_source = f.read()
  with open(workflow_inputs, 'rb') as f:
    wf_inputs = f.read()

  files = {
      'wdlSource': wdl_source,
      'workflowInputs': wf_inputs,
  }

  # Add workflow options if specified.
  if workflow_options:
    with open(workflow_options, 'rb') as f:
      wf_options = f.read()
    files['workflowOptions'] = wf_options

  # After Cromwell starts, it may take a few seconds to be ready for requests.
  # Poll up to a minute for a successful connect and submit.
  job = None
  max_time_wait = 60
  wait_interval = 5

  time.sleep(wait_interval)
  for attempt in range(max_time_wait // wait_interval):
    try:
      job = self.fetch(post=True, files=files)
      break
    except requests.exceptions.ConnectionError as e:
      logging.info("Failed to connect to Cromwell (attempt %d): %s",
                   attempt + 1, e)
      time.sleep(wait_interval)

  if not job:
    sys_util.exit_with_error(
        "Failed to connect to Cromwell after {0} seconds".format(
            max_time_wait))

  if job['status'] != 'Submitted':
    sys_util.exit_with_error(
        "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
            job['status']))

  # Job is running.
  cromwell_id = job['id']
  logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

  # Poll Cromwell for job completion.
  attempt = 0
  max_failed_attempts = 3
  while True:
    time.sleep(sleep_time)

    # Cromwell occasionally fails to respond to the status request.
    # Only give up after 3 consecutive failed requests.
    try:
      status_json = self.fetch(wf_id=cromwell_id, method='status')
      attempt = 0
    except requests.exceptions.ConnectionError as e:
      attempt += 1
      logging.info("Error polling Cromwell job status (attempt %d): %s",
                   attempt, e)
      if attempt >= max_failed_attempts:
        sys_util.exit_with_error(
            "Cromwell did not respond for %d consecutive requests" % attempt)
      continue

    status = status_json['status']
    if status == 'Succeeded':
      break
    elif status == 'Submitted':
      pass
    elif status == 'Running':
      pass
    else:
      sys_util.exit_with_error(
          "Status of job is not Submitted, Running, or Succeeded: %s" % status)

    logging.info("Cromwell job status: %s", status)

  # Cromwell produces a list of outputs and full job details.
  outputs = self.fetch(wf_id=cromwell_id, method='outputs')
  metadata = self.fetch(wf_id=cromwell_id, method='metadata')

  return outputs, metadata
def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
  """Post new job to the server and poll for completion."""

  # Add required input files.
  with open(wdl, 'rb') as f:
    wdl_source = f.read()
  with open(workflow_inputs, 'rb') as f:
    wf_inputs = f.read()

  files = {
      'wdlSource': wdl_source,
      'workflowInputs': wf_inputs,
  }

  # Add workflow options if specified.
  if workflow_options:
    with open(workflow_options, 'rb') as f:
      wf_options = f.read()
    files['workflowOptions'] = wf_options

  # After Cromwell starts, it may take a few seconds to be ready for requests.
  # Try up to a minute to connect.
  job = None
  max_time_wait = 60
  wait_interval = 5

  for attempt in range(max_time_wait // wait_interval):
    try:
      job = self.fetch(post=True, files=files)
      break
    except requests.exceptions.ConnectionError as e:
      logging.info("Failed to connect to Cromwell (%d): %s", attempt, e)
      time.sleep(wait_interval)

  if not job:
    sys_util.exit_with_error(
        "Failed to connect to Cromwell after {0} seconds".format(
            max_time_wait))

  if job['status'] != 'Submitted':
    sys_util.exit_with_error(
        "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
            job['status']))

  # Job is running.
  cromwell_id = job['id']
  logging.info("Cromwell job id: %s", cromwell_id)

  # Poll for completion.
  while True:
    time.sleep(sleep_time)
    status_json = self.fetch(wf_id=cromwell_id, method='status')
    status = status_json['status']
    if status == 'Succeeded':
      break
    elif status == 'Running':
      pass
    else:
      sys_util.exit_with_error(
          'Status of job is not Running or Succeeded: %s' % status)

  logging.info("Succeeded")

  # Cromwell produces a list of outputs and full job details.
  outputs = self.fetch(wf_id=cromwell_id, method='outputs')
  metadata = self.fetch(wf_id=cromwell_id, method='metadata')

  return outputs, metadata
def submit(
    self,
    wdl,
    workflow_inputs,
    workflow_options,
    workflow_dependencies,
    sleep_time=15,
):
  """Post new job to the server and poll for completion."""

  # Add required input files.
  with open(wdl, "rb") as f:
    wf_source = f.read()
  with open(workflow_inputs, "rb") as f:
    wf_inputs = f.read()

  files = {
      "workflowSource": wf_source,
      "workflowInputs": wf_inputs,
  }

  if workflow_dependencies:
    with open(workflow_dependencies, "rb") as f:
      # Read as Base64 byte string.
      wf_dependencies = f.read()
      # Convert to a binary zip file.
      files["workflowDependencies"] = base64.decodebytes(wf_dependencies)

  # Add workflow options if specified.
  if workflow_options:
    with open(workflow_options, "rb") as f:
      wf_options = f.read()
    files["workflowOptions"] = wf_options

  # After Cromwell starts, it may take a few seconds to be ready for requests.
  # Poll up to a minute for a successful connect and submit.
  job = None
  max_time_wait = 60
  wait_interval = 5

  time.sleep(wait_interval)
  for attempt in range(max_time_wait // wait_interval):
    try:
      job = self.fetch(post=True, files=files)
      break
    except requests.exceptions.ConnectionError as e:
      logging.info("Failed to connect to Cromwell (attempt %d): %s",
                   attempt + 1, e)
      time.sleep(wait_interval)

  if not job:
    sys_util.exit_with_error(
        "Failed to connect to Cromwell after {0} seconds".format(
            max_time_wait))

  if job["status"] != "Submitted":
    sys_util.exit_with_error(
        "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
            job["status"]))

  # Job is running.
  cromwell_id = job["id"]
  logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

  # Poll Cromwell for job completion.
  attempt = 0
  max_failed_attempts = 3
  while True:
    time.sleep(sleep_time)

    # Cromwell occasionally fails to respond to the status request.
    # Only give up after 3 consecutive failed requests.
    try:
      status_json = self.fetch(wf_id=cromwell_id, method="status")
      attempt = 0
    except requests.exceptions.ConnectionError as e:
      attempt += 1
      logging.info("Error polling Cromwell job status (attempt %d): %s",
                   attempt, e)
      if attempt >= max_failed_attempts:
        sys_util.exit_with_error(
            "Cromwell did not respond for %d consecutive requests" % attempt)
      continue

    status = status_json["status"]
    if status == "Succeeded":
      break
    elif status == "Submitted":
      pass
    elif status == "Running":
      pass
    else:
      sys_util.exit_with_error(
          "Status of job is not Submitted, Running, or Succeeded: %s" % status)

    logging.info("Cromwell job status: %s", status)

  # Cromwell produces a list of outputs and full job details.
  outputs = self.fetch(wf_id=cromwell_id, method="outputs")
  metadata = self.fetch(wf_id=cromwell_id, method="metadata")

  return outputs, metadata
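# Every submit/batch variant above delegates HTTP access to a fetch helper
# that is not shown. A minimal sketch consistent with how it is called,
# assuming Cromwell listens on localhost:8000 as in the batch method above;
# the real method's signature may differ (some variants also take a session
# as the first argument, others default to the requests module).
import requests

def fetch(wf_id=None, method=None, post=False, files=None, session=requests):
  """Issues a request against Cromwell's workflows API and returns JSON."""
  url = 'http://localhost:8000/api/workflows/v1'
  if wf_id is not None:
    url += '/' + wf_id
  if method is not None:
    url += '/' + method
  response = session.post(url, files=files) if post else session.get(url)
  response.raise_for_status()
  return response.json()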