def test_log_event(mock_log_write): '''Test a successful log event''' log.log_event('fake_event_name', values={"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}, warning=True, flush=False, extra_fields={"extra_field_1": "abc"}) mock_log_write.assert_has_calls([ call(obj_data={"event": "fake_event_name", "extra_field_1": "abc", "values": {"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}}, warning=True, debug=False, flush=False) ])
def manage_reference_downloads_cache(self): command.make_dirs(PURGE_SENTINEL_DIR) command.touch(PURGE_SENTINEL) time.sleep( 3 ) # 1 should be enough for the sentinel to predate all current stage downloads present_set, missing_set = self.references_roll_call() total_set = present_set | missing_set if total_set: present = len(present_set) missing = len(missing_set) total = len(total_set) log.write( f"Reference download cache: {present} of {total} large files ({100 * present / total:3.1f} percent) already exist locally." ) structured_report = { "summary_stats": { "cache_requests": total, "cache_hits": present, "cache_misses": missing, "cache_hit_rate": present / total }, "per_request_stats": { f: "hit" if f in present_set else "miss" for f in sorted(total_set) } } log.log_event("Reference downloads cache efficiency report", values=structured_report) idseq_dag.util.s3.make_space( ) # make sure to touch this stage's files before deleting LRU ones
def log_stats(self, total_chunks): # This happens after the last call to tracker.note_outcome(), making it # safe to access tracker.outcome_data directly without holding a lock. # But, to be safe... with self.lock: report = self.status_report(total_chunks) outcomes = deepcopy(self.outcome_data) log.log_event(f"STATUS REPORT FOR {self.service}", values=report) log.log_event(f"FULL OUTCOME DATA FOR {self.service}", values=outcomes)
def _log_alignment_batch_job_status(self, job_id, job_queue, job_definition, chunk_id, status): log.log_event('alignment_batch_job_status', values={ 'job_id': job_id, 'chunk_id': chunk_id, 'job_queue': job_queue, 'job_definition': job_definition, 'status': status, 'alignment_algorithm': self.alignment_algorithm, })
def release(self): log.log_event("trace_lock", values={ "lock_name": self._lock_name, "thread_name": threading.current_thread().name, "state": "released" }, debug=self.debug) self._lock.release()
def run(self): # lz4 is "scalable with multi-cores CPU" so we let it parallelize # itself. See https://github.com/lz4/lz4 . for file_list in self.input_files_local: for input_file in file_list: if input_file.endswith(('.gz', '.zip', '.lz4')): log.log_event( f'Skipping already-compressed file {input_file}') else: command.execute(self.get_command(input_file))
def test_log_event_3(mock_log_write): '''Test a successful log event with duration''' start_time = datetime.datetime(2019, 5, 27, 16, 0, 0) now = start_time + datetime.timedelta(minutes=1, seconds=18) mock_datetime = Mock(datetime.datetime) mock_datetime.now.return_value = now with patch.object(datetime, 'datetime', mock_datetime): log.log_event('fake_event_name', values={"name_a": 1}, warning=False, flush=False, start_time=start_time) mock_log_write.assert_has_calls([ call(obj_data={"event": "fake_event_name", "values": {"name_a": 1}, "duration_ms": 78000}, warning=False, debug=False, flush=False) ])
def parse_and_validate_conf(dag_json): ''' Validate the json format. see examples/*.json. Required fields are: "output_dir_s3": base results folder. a pipeline version number will be appended for real output folder. "targets": lists of files that are given or would be generated "steps": steps that species actions to generate input and output "given_targets": input files that are given ''' dag = json.loads(open(dag_json).read()) log.log_event("pipeline_flow.dag_json_loaded", values={ "file": dag_json, "contents": dag }) output_dir = dag["output_dir_s3"] # noqa targets = dag["targets"] steps = dag["steps"] given_targets = dag["given_targets"] dag['name'] = dag.get("name", _get_name_from_path(dag_json)) covered_targets = set() for s in steps: # validate each step in/out are valid targets for itarget in s["in"]: if itarget not in targets: raise ValueError("input %s doesn't exit for step %s" % (itarget, s["out"])) if s["out"] not in targets: raise ValueError("%s target doesn't exit" % s["out"]) if s["out"] in covered_targets: raise ValueError("%s hasn't been generated in other steps" % s["out"]) covered_targets.add(s["out"]) for target_name, target_data in given_targets.items(): # validate the given targets exist in s3 s3_path = target_data["s3_dir"] covered_targets.add(target_name) for file_name in targets[target_name]: s3_file = os.path.join(s3_path, file_name) if not idseq_dag.util.s3.check_s3_presence(s3_file): raise ValueError("%s file doesn't exist" % s3_file) # Check that all targets are covered # ALL Inputs Outputs VALIDATED for target_name in targets.keys(): if target_name not in covered_targets: raise ValueError("%s couldn't be generated from the steps" % target_name) return dag
def _get_job_status(self, session, job_id): batch_job_desc_bucket = session.resource("s3").Bucket(self.batch_job_desc_bucket) key = f"job_descriptions/{job_id}" try: job_desc_object = batch_job_desc_bucket.Object(key) return json.loads(job_desc_object.get()["Body"].read())["status"] except ClientError as e: if e.response['Error']['Code'] == 'NoSuchKey': # Warn that the object is missing so any issue with the s3 mechanism can be identified log.log_event("missing_job_description_ojbect", values={key: key}, debug=True) # Return submitted because a missing job status probably means it hasn't been added yet return "SUBMITTED" else: raise e
def _run_batch_job(self, session, job_name, job_queue, job_definition, command, environment, chunk_id, retries): client = session.client("batch") response = client.submit_job(jobName=job_name, jobQueue=job_queue, jobDefinition=job_definition, containerOverrides={ "command": command, "environment": environment, }, retryStrategy={"attempts": retries}) job_id = response["jobId"] self._log_alignment_batch_job_status(job_id, job_queue, job_definition, chunk_id, 'SUBMITTED') chunks_in_flight = min( self.chunk_count, MAX_CHUNKS_IN_FLIGHT ) # use min in case we have fewer chunks than the maximum mean_delay = chunks_in_flight # ~1 chunk per second to avoid throttling, delay = mean_delay + random.randint( -mean_delay // 2, mean_delay // 2) # Add some noise to de-synchronize chunks status = "SUBMITTED" # the job this is monitoring has an timeout and the job this runs in has a timeout while True: try: status = self._get_job_status(session, job_id) except ClientError as e: # If we get throttled, randomly wait to de-synchronize the requests if e.response['Error']['Code'] == "TooManyRequestsException": log.log_event("describe_jobs_rate_limit_error", values={"job_id": job_id}, warning=True) # Possibly implement a backoff here if throttling becomes an issue else: log.log_event( "unexpected_client_error_while_polling_job_status", values={"job_id": job_id}) raise e if status == "SUCCEEDED": self._log_alignment_batch_job_status(job_id, job_queue, job_definition, chunk_id, status) return job_id if status == "FAILED": log.log_event("alignment_batch_job_failed", values={ 'job_id': job_id, 'chunk_id': chunk_id, 'alignment_algorithm': self.alignment_algorithm }) self._log_alignment_batch_job_status(job_id, job_queue, job_definition, chunk_id, status) raise BatchJobFailed("chunk alignment failed") time.sleep(delay)
def acquire(self): v = { "lock_name": self._lock_name, "thread_name": threading.current_thread().name } if self._lock.acquire(False): log.log_event("trace_lock", values={ **v, "state": "acquired" }, debug=self.debug) else: log.log_event("trace_lock", values={ **v, "state": "waiting" }, debug=self.debug) self._lock.acquire(True) log.log_event("trace_lock", values={ **v, "state": "acquired_after_wait" }, debug=self.debug)
def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir, remote_work_dir, remote_username, input_files, key_path, service, lazy_run): """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch group machines and handle their execution. """ assert service in ("gsnap", "rapsearch2") chunk_id = int(input_files[0].split(part_suffix)[-1]) multihit_basename = f"multihit-{service}-out{part_suffix}{chunk_id}.m8" multihit_local_outfile = os.path.join(self.chunks_result_dir_local, multihit_basename) multihit_remote_outfile = os.path.join(remote_work_dir, multihit_basename) multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3, multihit_basename) def aws_cp_operation(input_fa): return "aws s3 cp --only-show-errors {src} {dest}".format( src=shlex.quote( os.path.join(self.chunks_result_dir_s3, input_fa)), dest=shlex.quote(os.path.join(remote_work_dir, input_fa))) download_input_from_s3 = " ; ".join(map(aws_cp_operation, input_files)) # Clean up remote work directory before running # This ensures that files from a failed previous run that may still be on the instance # are removed so they don't corrupt the current run base_str = "rm -rf {remote_work_dir} ; mkdir -p {remote_work_dir} ; {download_input_from_s3} ; " environment = self.additional_attributes["environment"] # See step class docstrings for more parameter details. if service == "gsnap": commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 48 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}" else: commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}" commands = commands.format( remote_work_dir=shlex.quote(remote_work_dir), download_input_from_s3=download_input_from_s3, remote_home_dir=shlex.quote(remote_home_dir), remote_index_dir=shlex.quote(remote_index_dir), remote_input_files=" ".join( shlex.quote(remote_work_dir + "/" + input_fa) for input_fa in input_files), multihit_remote_outfile=shlex.quote(multihit_remote_outfile) if service == "gsnap" else shlex.quote(multihit_remote_outfile[:-3]) # Strip the .m8 for RAPSearch as it adds that ) if lazy_run and fetch_from_s3(multihit_s3_outfile, multihit_local_outfile, okay_if_missing=True, allow_s3mi=False): log.write( f"finished alignment for chunk {chunk_id} with {service} by lazily fetching last result" ) else: chunk_timeout = int( self.additional_attributes.get( f"{service.lower()}_chunk_timeout", DEFAULT_CHUNK_TIMEOUT)) for try_number in range(1, CHUNK_MAX_TRIES + 1): log.write( f"waiting for {service} server for chunk {chunk_id}. Try #{try_number}" ) with ASGInstance(service, key_path, remote_username, environment, chunk_id, try_number, self.additional_attributes) as instance_ip: # Try/Except block needs to be inside the ASGInstance context. # A failure to acquire an ASGInstnace is and should be unrecoverable. chunk_status = None elapsed = 0.0 try: t_start = time.time() try: command.execute(command.remote( commands, key_path, remote_username, instance_ip), timeout=chunk_timeout) except: chunk_status = ChunkStatus.CRASH raise finally: elapsed = time.time() - t_start if chunk_status == ChunkStatus.CRASH and elapsed >= chunk_timeout: chunk_status = ChunkStatus.TIMEOUT output_corrupt = self.__check_if_output_is_corrupt( service, key_path, remote_username, instance_ip, multihit_remote_outfile, chunk_id, try_number) if output_corrupt: chunk_status = ChunkStatus.CORRUPT_OUTPUT assert not output_corrupt, output_corrupt # Yay, chunk succeeded. Copy from server and break out of retry loop. try: self.__copy_multihit_remote_outfile( key_path, remote_username, instance_ip, multihit_remote_outfile, multihit_local_outfile) chunk_status = ChunkStatus.SUCCESS break except: # If we failed to copy from the server, it's as bad as a crash in alignment. chunk_status = ChunkStatus.CRASH raise except Exception as e: # 1. No backoff needed here before retrying. We rate limit chunk dispatch (the ASGInstance # acquisition above is blocking). ASGInstance acquisition also tries to ensure that every # chunk flight gets its first try before any retry is dispatched. # 2. If the reason we failed is timeout on the server, we don't retry. The operator must decide # whether to QC the data more, or use smaller chunk size. In fact, we only retry for CRASH and # CORRUPT_OUTPUT. # 3. If this is the last attempt, we gotta re-raise the exception. # 4. Elapsed time is only the time spent in alignment. It excludes the time spent waiting to # acquire ASGinstance. log.log_event('alignment_remote_error', values={ "chunk": chunk_id, "try_number": try_number, "CHUNK_MAX_TRIES": CHUNK_MAX_TRIES, "chunk_status": chunk_status, "elapsed": elapsed, "chunk_timeout": chunk_timeout, "exception": log.parse_exception(e) }) retrying_might_help = chunk_status in ( ChunkStatus.CORRUPT_OUTPUT, ChunkStatus.CRASH) if try_number < CHUNK_MAX_TRIES and retrying_might_help: # Retry! continue else: # End of the road. raise finally: # None chunk_status indicates code bug above. An exception has been raised already # for it, and it says nothing about whether the alignment succeeded or not. if chunk_status != None: chunk_status_tracker(service).note_outcome( instance_ip, chunk_id, elapsed, chunk_status, try_number) self.__delete_remote_dir(remote_work_dir, key_path, remote_username, instance_ip) # Upload to s3 with self.iostream_upload: # Limit concurrent uploads so as not to stall the pipeline. command.execute( command_patterns.SingleCommand( cmd="aws", args=[ "s3", "cp", "--only-show-errors", multihit_local_outfile, os.path.join(self.chunks_result_dir_s3, "") ])) log.write( f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}" ) # Whether lazy or not lazy, we've now got the chunk result locally here. return multihit_local_outfile
def run_chunk(self, part_suffix, input_files, lazy_run): """ Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch group machines and handle their execution. """ chunk_id = int(input_files[0].split(part_suffix)[-1]) multihit_basename = f"multihit-{self.alignment_algorithm}-out{part_suffix}{chunk_id}.m8" multihit_local_outfile = os.path.join(self.chunks_result_dir_local, multihit_basename) multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3, multihit_basename) session = boto3.session.Session() if lazy_run and download_from_s3(session, multihit_s3_outfile, multihit_local_outfile): log.write( f"finished alignment for chunk {chunk_id} with {self.alignment_algorithm} by lazily fetching last result" ) return multihit_local_outfile deployment_environment = os.environ["DEPLOYMENT_ENVIRONMENT"] # TODO: (tmorse) remove compat hack https://jira.czi.team/browse/IDSEQ-2568 if deployment_environment == "production": deployment_environment = "prod" priority_name = os.environ.get("PRIORITY_NAME", "normal") index_dir_suffix = self.additional_attributes["index_dir_suffix"] pattern = r's3://.+/samples/([0-9]+)/([0-9]+)/' m = re.match(pattern, self.chunks_result_dir_s3) if m: project_id, sample_id = m.group(1), m.group(2) else: project_id, sample_id = '0', '0' provisioning_model = 'SPOT' # TODO: parameterize these https://jira.czi.team/browse/IDSEQ-2673 job_name = f"idseq-{deployment_environment}-{self.alignment_algorithm}-project-{project_id}-sample-{sample_id}-part-{chunk_id}" job_queue = f"idseq-{deployment_environment}-{self.alignment_algorithm}-{provisioning_model}-{index_dir_suffix}-{priority_name}" job_definition = f"idseq-{deployment_environment}-{self.alignment_algorithm}" environment = [ { 'name': f"INPUT_PATH_{i}", 'value': os.path.join(self.chunks_result_dir_s3, input_file), } for i, input_file in enumerate(input_files) ] + [{ 'name': "OUTPUT_PATH", 'value': multihit_s3_outfile, }] try: job_id = self._run_batch_job(session=session, job_name=job_name, job_queue=job_queue, job_definition=job_definition, environment=environment, chunk_id=chunk_id, retries=2) except BatchJobFailed: provisioning_model = 'EC2' # TODO: parameterize this https://jira.czi.team/browse/IDSEQ-2673 job_queue = f"idseq-{deployment_environment}-{self.alignment_algorithm}-{provisioning_model}-{index_dir_suffix}-{priority_name}" job_id = self._run_batch_job(session=session, job_name=job_name, job_queue=job_queue, job_definition=job_definition, environment=environment, chunk_id=chunk_id, retries=1) for _ in range(12): if download_from_s3(session, multihit_s3_outfile, multihit_local_outfile): break time.sleep(10) else: log.log_event("chunk_result_missing_in_s3", values={ 'job_id': job_id, 'chunk_id': chunk_id, 'job_queue': job_queue, 'job_definition': job_definition, 'alignment_algorithm': self.alignment_algorithm, }) raise Exception("Chunk result is missing from s3") log.log_event("alignment_batch_chunk_result_downloaded", values={ 'job_id': job_id, 'chunk_id': chunk_id, 'job_queue': job_queue, 'job_definition': job_definition, 'alignment_algorithm': self.alignment_algorithm, }) return multihit_local_outfile