Example #1
    def test_log_event(mock_log_write):
        '''Test a successful log event'''
        log.log_event('fake_event_name', values={"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}, warning=True, flush=False, extra_fields={"extra_field_1": "abc"})

        mock_log_write.assert_has_calls([
            call(obj_data={"event": "fake_event_name", "extra_field_1": "abc", "values": {"name_a": 1, "name_b": "fake_string", "name_c": 3.1415}}, warning=True, debug=False, flush=False)
        ])
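Taken together with Example #7 further down, this test pins down the observable behaviour of log.log_event: extra_fields are merged into the top level of the emitted object, values become a nested "values" field, and a start_time yields a duration_ms in milliseconds. The following is a minimal sketch consistent with those assertions, not the actual idseq_dag.util.log implementation; the default argument values and the write() stub are assumptions.

    import datetime
    import json

    def write(obj_data=None, warning=False, debug=False, flush=True):
        # Stand-in for the module-level writer that the tests patch as mock_log_write.
        print(json.dumps(obj_data))

    def log_event(event_name, values=None, start_time=None, warning=False,
                  debug=False, flush=True, extra_fields=None):
        obj_data = {"event": event_name}
        obj_data.update(extra_fields or {})      # extra fields sit at the top level
        if values is not None:
            obj_data["values"] = values          # structured payload
        if start_time is not None:
            # Duration in whole milliseconds, as asserted in Example #7 below.
            delta = datetime.datetime.now() - start_time
            obj_data["duration_ms"] = int(delta.total_seconds() * 1000)
        write(obj_data=obj_data, warning=warning, debug=debug, flush=flush)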
 def manage_reference_downloads_cache(self):
     command.make_dirs(PURGE_SENTINEL_DIR)
     command.touch(PURGE_SENTINEL)
     time.sleep(
         3
     )  # 1 should be enough for the sentinel to predate all current stage downloads
     present_set, missing_set = self.references_roll_call()
     total_set = present_set | missing_set
     if total_set:
         present = len(present_set)
         missing = len(missing_set)
         total = len(total_set)
         log.write(
             f"Reference download cache: {present} of {total} large files ({100 * present / total:3.1f} percent) already exist locally."
         )
         structured_report = {
             "summary_stats": {
                 "cache_requests": total,
                 "cache_hits": present,
                 "cache_misses": missing,
                 "cache_hit_rate": present / total
             },
             "per_request_stats": {
                 f: "hit" if f in present_set else "miss"
                 for f in sorted(total_set)
             }
         }
         log.log_event("Reference downloads cache efficiency report",
                       values=structured_report)
     idseq_dag.util.s3.make_space(
     )  # make sure to touch this stage's files before deleting LRU ones
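For concreteness, if the roll call found three of four large reference files already on local disk, the structured event above would carry a payload shaped like the following (the bucket and file names are invented for illustration):

    # Hypothetical payload for the "Reference downloads cache efficiency report" event.
    structured_report = {
        "summary_stats": {
            "cache_requests": 4,
            "cache_hits": 3,
            "cache_misses": 1,
            "cache_hit_rate": 0.75,
        },
        "per_request_stats": {
            "s3://example-bucket/references/nr.loc.db": "hit",
            "s3://example-bucket/references/nt.loc.db": "hit",
            "s3://example-bucket/references/nt_k16.tar": "miss",
            "s3://example-bucket/references/taxid-lineages.db": "hit",
        },
    }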
Example #3
 def log_stats(self, total_chunks):
     # This happens after the last call to tracker.note_outcome(), making it
     # safe to access tracker.outcome_data directly without holding a lock.
     # But, to be safe...
     with self.lock:
         report = self.status_report(total_chunks)
         outcomes = deepcopy(self.outcome_data)
     log.log_event(f"STATUS REPORT FOR {self.service}", values=report)
     log.log_event(f"FULL OUTCOME DATA FOR {self.service}", values=outcomes)
 def _log_alignment_batch_job_status(self, job_id, job_queue, job_definition, chunk_id, status):
     log.log_event('alignment_batch_job_status', values={
         'job_id': job_id,
         'chunk_id': chunk_id,
         'job_queue': job_queue,
         'job_definition': job_definition,
         'status': status,
         'alignment_algorithm': self.alignment_algorithm,
     })
 def release(self):
     log.log_event("trace_lock",
                   values={
                       "lock_name": self._lock_name,
                       "thread_name": threading.current_thread().name,
                       "state": "released"
                   },
                   debug=self.debug)
     self._lock.release()
 def run(self):
     # lz4 is "scalable with multi-cores CPU" so we let it parallelize
     # itself. See https://github.com/lz4/lz4 .
     for file_list in self.input_files_local:
         for input_file in file_list:
             if input_file.endswith(('.gz', '.zip', '.lz4')):
                 log.log_event(
                     f'Skipping already-compressed file {input_file}')
             else:
                 command.execute(self.get_command(input_file))
Example #7
    def test_log_event_3(mock_log_write):
        '''Test a successful log event with duration'''
        start_time = datetime.datetime(2019, 5, 27, 16, 0, 0)
        now = start_time + datetime.timedelta(minutes=1, seconds=18)
        mock_datetime = Mock(datetime.datetime)
        mock_datetime.now.return_value = now

        with patch.object(datetime, 'datetime', mock_datetime):
            log.log_event('fake_event_name', values={"name_a": 1}, warning=False, flush=False, start_time=start_time)

        mock_log_write.assert_has_calls([
            call(obj_data={"event": "fake_event_name", "values": {"name_a": 1}, "duration_ms": 78000}, warning=False, debug=False, flush=False)
        ])
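With the patched clock, now - start_time is 1 minute 18 seconds, i.e. 78 seconds, so the asserted duration_ms of 78000 confirms that the duration is reported in whole milliseconds.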
    def parse_and_validate_conf(dag_json):
        '''
        Validate the JSON format; see examples/*.json.
        Required fields are:
          "output_dir_s3": base results folder; a pipeline version number will be appended to form the real output folder.
          "targets": lists of files that are given or would be generated
          "steps": steps that specify the actions to generate input and output
          "given_targets": input files that are given
        '''
        with open(dag_json) as f:
            dag = json.load(f)

        log.log_event("pipeline_flow.dag_json_loaded",
                      values={
                          "file": dag_json,
                          "contents": dag
                      })
        output_dir = dag["output_dir_s3"]  # noqa
        targets = dag["targets"]
        steps = dag["steps"]
        given_targets = dag["given_targets"]
        dag['name'] = dag.get("name", _get_name_from_path(dag_json))
        covered_targets = set()
        for s in steps:
            # validate each step in/out are valid targets
            for itarget in s["in"]:
                if itarget not in targets:
                    raise ValueError("input %s doesn't exit for step %s" %
                                     (itarget, s["out"]))
            if s["out"] not in targets:
                raise ValueError("%s target doesn't exit" % s["out"])
            if s["out"] in covered_targets:
                raise ValueError("%s hasn't been generated in other steps" %
                                 s["out"])
            covered_targets.add(s["out"])
        for target_name, target_data in given_targets.items():
            # validate the given targets exist in s3
            s3_path = target_data["s3_dir"]
            covered_targets.add(target_name)
            for file_name in targets[target_name]:
                s3_file = os.path.join(s3_path, file_name)
                if not idseq_dag.util.s3.check_s3_presence(s3_file):
                    raise ValueError("%s file doesn't exist" % s3_file)
        # Check that all targets are covered
        # ALL Inputs Outputs VALIDATED
        for target_name in targets.keys():
            if target_name not in covered_targets:
                raise ValueError("%s couldn't be generated from the steps" %
                                 target_name)

        return dag
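As a concrete illustration, a minimal DAG dictionary that would pass the validation above might look like the following, assuming the listed S3 objects actually exist. All paths and names are invented, and real DAGs carry additional step fields (such as "class" and "module") that this function does not check.

    # Hypothetical minimal DAG accepted by parse_and_validate_conf; every target
    # is either given or produced by exactly one step.
    minimal_dag = {
        "output_dir_s3": "s3://example-bucket/results",
        "targets": {
            "fastqs": ["reads_R1.fastq.gz", "reads_R2.fastq.gz"],
            "star_out": ["unmapped_R1.fa", "unmapped_R2.fa"],
        },
        "given_targets": {
            "fastqs": {"s3_dir": "s3://example-bucket/samples/1/fastqs"},
        },
        "steps": [
            {"in": ["fastqs"], "out": "star_out"},
        ],
    }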
 def _get_job_status(self, session, job_id):
     batch_job_desc_bucket = session.resource("s3").Bucket(self.batch_job_desc_bucket)
     key = f"job_descriptions/{job_id}"
     try:
         job_desc_object = batch_job_desc_bucket.Object(key)
         return json.loads(job_desc_object.get()["Body"].read())["status"]
     except ClientError as e:
         if e.response['Error']['Code'] == 'NoSuchKey':
             # Warn that the object is missing so any issue with the s3 mechanism can be identified
             log.log_event("missing_job_description_ojbect", values={key: key}, debug=True)
             # Return submitted because a missing job status probably means it hasn't been added yet
             return "SUBMITTED"
         else:
             raise e
    def _run_batch_job(self, session, job_name, job_queue, job_definition,
                       command, environment, chunk_id, retries):
        client = session.client("batch")
        response = client.submit_job(jobName=job_name,
                                     jobQueue=job_queue,
                                     jobDefinition=job_definition,
                                     containerOverrides={
                                         "command": command,
                                         "environment": environment,
                                     },
                                     retryStrategy={"attempts": retries})
        job_id = response["jobId"]
        self._log_alignment_batch_job_status(job_id, job_queue, job_definition,
                                             chunk_id, 'SUBMITTED')

        chunks_in_flight = min(
            self.chunk_count, MAX_CHUNKS_IN_FLIGHT
        )  # use min in case we have fewer chunks than the maximum
        mean_delay = chunks_in_flight  # ~1 chunk per second to avoid throttling,
        delay = mean_delay + random.randint(
            -mean_delay // 2,
            mean_delay // 2)  # Add some noise to de-synchronize chunks
        status = "SUBMITTED"
        # the job this is monitoring has a timeout and the job this runs in has a timeout
        while True:
            try:
                status = self._get_job_status(session, job_id)
            except ClientError as e:
                # If we get throttled, randomly wait to de-synchronize the requests
                if e.response['Error']['Code'] == "TooManyRequestsException":
                    log.log_event("describe_jobs_rate_limit_error",
                                  values={"job_id": job_id},
                                  warning=True)
                    # Possibly implement a backoff here if throttling becomes an issue
                else:
                    log.log_event(
                        "unexpected_client_error_while_polling_job_status",
                        values={"job_id": job_id})
                    raise e

            if status == "SUCCEEDED":
                self._log_alignment_batch_job_status(job_id, job_queue,
                                                     job_definition, chunk_id,
                                                     status)
                return job_id
            if status == "FAILED":
                log.log_event("alignment_batch_job_failed",
                              values={
                                  'job_id': job_id,
                                  'chunk_id': chunk_id,
                                  'alignment_algorithm':
                                  self.alignment_algorithm
                              })
                self._log_alignment_batch_job_status(job_id, job_queue,
                                                     job_definition, chunk_id,
                                                     status)
                raise BatchJobFailed("chunk alignment failed")
            time.sleep(delay)
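The comment above leaves a throttling backoff as a possible future improvement. One way that could look, sketched as a standalone helper with an arbitrary 64-second cap and full jitter (none of this is in the original code):

    import random
    import time
    from botocore.exceptions import ClientError

    def get_status_with_backoff(get_status, max_attempts=6):
        """Retry a status poll with exponential backoff and full jitter on throttling."""
        for attempt in range(max_attempts - 1):
            try:
                return get_status()
            except ClientError as e:
                if e.response["Error"]["Code"] != "TooManyRequestsException":
                    raise
                # Sleep between 0 and min(64, 2**attempt) seconds before retrying.
                time.sleep(random.uniform(0, min(64, 2 ** attempt)))
        return get_status()  # final attempt; any error propagates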
Example #11
 def acquire(self):
     v = {
         "lock_name": self._lock_name,
         "thread_name": threading.current_thread().name
     }
     if self._lock.acquire(False):
         log.log_event("trace_lock",
                       values={
                           **v, "state": "acquired"
                       },
                       debug=self.debug)
     else:
         log.log_event("trace_lock",
                       values={
                           **v, "state": "waiting"
                       },
                       debug=self.debug)
         self._lock.acquire(True)
         log.log_event("trace_lock",
                       values={
                           **v, "state": "acquired_after_wait"
                       },
                       debug=self.debug)
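The acquire() method here and the release() method shown earlier suggest a lock wrapper that logs lock contention. A minimal context-manager wrapper in that spirit could look like the sketch below; the class name TraceLock, the constructor, and the simplified method bodies are assumptions, not the actual idseq_dag definition.

    import threading

    class TraceLock:
        """Hypothetical wrapper; the real acquire()/release() bodies with their
        trace_lock log events are the ones shown in the examples above."""

        def __init__(self, lock_name, debug=True):
            self._lock = threading.RLock()
            self._lock_name = lock_name
            self.debug = debug

        def acquire(self):
            self._lock.acquire()   # plus the "acquired"/"waiting" events above

        def release(self):
            self._lock.release()   # plus the "released" event above

        def __enter__(self):
            self.acquire()
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.release()

    # Usage: every critical section then emits matching trace_lock events.
    with TraceLock("results_dir", debug=False):
        pass  # critical section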
Example #12
    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.
        """
        assert service in ("gsnap", "rapsearch2")

        chunk_id = int(input_files[0].split(part_suffix)[-1])
        multihit_basename = f"multihit-{service}-out{part_suffix}{chunk_id}.m8"
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        def aws_cp_operation(input_fa):
            return "aws s3 cp --only-show-errors {src} {dest}".format(
                src=shlex.quote(
                    os.path.join(self.chunks_result_dir_s3, input_fa)),
                dest=shlex.quote(os.path.join(remote_work_dir, input_fa)))

        download_input_from_s3 = " ; ".join(map(aws_cp_operation, input_files))

        # Clean up remote work directory before running
        #   This ensures that files from a failed previous run that may still be on the instance
        #   are removed so they don't corrupt the current run
        base_str = "rm -rf {remote_work_dir} ; mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        environment = self.additional_attributes["environment"]

        # See step class docstrings for more parameter details.
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 48 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=shlex.quote(remote_work_dir),
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=shlex.quote(remote_home_dir),
            remote_index_dir=shlex.quote(remote_index_dir),
            remote_input_files=" ".join(
                shlex.quote(remote_work_dir + "/" + input_fa)
                for input_fa in input_files),
            multihit_remote_outfile=shlex.quote(multihit_remote_outfile) if
            service == "gsnap" else shlex.quote(multihit_remote_outfile[:-3])
            # Strip the .m8 for RAPSearch as it adds that
        )

        if lazy_run and fetch_from_s3(multihit_s3_outfile,
                                      multihit_local_outfile,
                                      okay_if_missing=True,
                                      allow_s3mi=False):
            log.write(
                f"finished alignment for chunk {chunk_id} with {service} by lazily fetching last result"
            )
        else:
            chunk_timeout = int(
                self.additional_attributes.get(
                    f"{service.lower()}_chunk_timeout", DEFAULT_CHUNK_TIMEOUT))
            for try_number in range(1, CHUNK_MAX_TRIES + 1):
                log.write(
                    f"waiting for {service} server for chunk {chunk_id}. Try #{try_number}"
                )
                with ASGInstance(service, key_path, remote_username,
                                 environment, chunk_id, try_number,
                                 self.additional_attributes) as instance_ip:
                    # Try/Except block needs to be inside the ASGInstance context.
                    # A failure to acquire an ASGInstance is, and should be, unrecoverable.
                    chunk_status = None
                    elapsed = 0.0
                    try:
                        t_start = time.time()
                        try:
                            command.execute(command.remote(
                                commands, key_path, remote_username,
                                instance_ip),
                                            timeout=chunk_timeout)
                        except:
                            chunk_status = ChunkStatus.CRASH
                            raise
                        finally:
                            elapsed = time.time() - t_start
                            if chunk_status == ChunkStatus.CRASH and elapsed >= chunk_timeout:
                                chunk_status = ChunkStatus.TIMEOUT

                        output_corrupt = self.__check_if_output_is_corrupt(
                            service, key_path, remote_username, instance_ip,
                            multihit_remote_outfile, chunk_id, try_number)

                        if output_corrupt:
                            chunk_status = ChunkStatus.CORRUPT_OUTPUT
                            assert not output_corrupt, output_corrupt

                        # Yay, chunk succeeded.  Copy from server and break out of retry loop.
                        try:
                            self.__copy_multihit_remote_outfile(
                                key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile)
                            chunk_status = ChunkStatus.SUCCESS
                            break
                        except:
                            # If we failed to copy from the server, it's as bad as a crash in alignment.
                            chunk_status = ChunkStatus.CRASH
                            raise

                    except Exception as e:

                        # 1. No backoff needed here before retrying.  We rate limit chunk dispatch (the ASGInstance
                        # acquisition above is blocking).  ASGInstance acquisition also tries to ensure that every
                        # chunk flight gets its first try before any retry is dispatched.

                        # 2. If the reason we failed is timeout on the server, we don't retry.  The operator must decide
                        # whether to QC the data more, or use smaller chunk size.  In fact, we only retry for CRASH and
                        # CORRUPT_OUTPUT.

                        # 3. If this is the last attempt, we gotta re-raise the exception.

                        # 4. Elapsed time is only the time spent in alignment.  It excludes the time spent waiting to
                        # acquire ASGinstance.

                        log.log_event('alignment_remote_error',
                                      values={
                                          "chunk": chunk_id,
                                          "try_number": try_number,
                                          "CHUNK_MAX_TRIES": CHUNK_MAX_TRIES,
                                          "chunk_status": chunk_status,
                                          "elapsed": elapsed,
                                          "chunk_timeout": chunk_timeout,
                                          "exception": log.parse_exception(e)
                                      })
                        retrying_might_help = chunk_status in (
                            ChunkStatus.CORRUPT_OUTPUT, ChunkStatus.CRASH)
                        if try_number < CHUNK_MAX_TRIES and retrying_might_help:
                            # Retry!
                            continue
                        else:
                            # End of the road.
                            raise
                    finally:
                        # A None chunk_status indicates a code bug above.  An exception has already been raised
                        # for it, and it says nothing about whether the alignment succeeded or not.
                        if chunk_status is not None:
                            chunk_status_tracker(service).note_outcome(
                                instance_ip, chunk_id, elapsed, chunk_status,
                                try_number)
                        self.__delete_remote_dir(remote_work_dir, key_path,
                                                 remote_username, instance_ip)

            # Upload to s3
            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="aws",
                        args=[
                            "s3", "cp", "--only-show-errors",
                            multihit_local_outfile,
                            os.path.join(self.chunks_result_dir_s3, "")
                        ]))
            log.write(
                f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}"
            )

        # Whether lazy or not lazy, we've now got the chunk result locally here.
        return multihit_local_outfile
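run_chunk relies on a ChunkStatus value that is not part of this listing; based on the members referenced above, it presumably provides at least the following (whether it is an Enum, and its exact values, are assumptions):

    from enum import Enum

    class ChunkStatus(Enum):
        # Members referenced by run_chunk and chunk_status_tracker above.
        SUCCESS = "success"
        CRASH = "crash"
        TIMEOUT = "timeout"
        CORRUPT_OUTPUT = "corrupt_output"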
    def run_chunk(self, part_suffix, input_files, lazy_run):
        """
        Dispatch a chunk as an AWS Batch job for distributed GSNAP or RAPSearch
        alignment and handle its execution.
        """

        chunk_id = int(input_files[0].split(part_suffix)[-1])
        multihit_basename = f"multihit-{self.alignment_algorithm}-out{part_suffix}{chunk_id}.m8"
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        session = boto3.session.Session()
        if lazy_run and download_from_s3(session, multihit_s3_outfile,
                                         multihit_local_outfile):
            log.write(
                f"finished alignment for chunk {chunk_id} with {self.alignment_algorithm} by lazily fetching last result"
            )
            return multihit_local_outfile

        deployment_environment = os.environ["DEPLOYMENT_ENVIRONMENT"]
        # TODO: (tmorse) remove compat hack https://jira.czi.team/browse/IDSEQ-2568
        if deployment_environment == "production":
            deployment_environment = "prod"
        priority_name = os.environ.get("PRIORITY_NAME", "normal")

        index_dir_suffix = self.additional_attributes["index_dir_suffix"]

        pattern = r's3://.+/samples/([0-9]+)/([0-9]+)/'
        m = re.match(pattern, self.chunks_result_dir_s3)
        if m:
            project_id, sample_id = m.group(1), m.group(2)
        else:
            project_id, sample_id = '0', '0'

        provisioning_model = 'SPOT'

        # TODO: parameterize these https://jira.czi.team/browse/IDSEQ-2673
        job_name = f"idseq-{deployment_environment}-{self.alignment_algorithm}-project-{project_id}-sample-{sample_id}-part-{chunk_id}"
        job_queue = f"idseq-{deployment_environment}-{self.alignment_algorithm}-{provisioning_model}-{index_dir_suffix}-{priority_name}"
        job_definition = f"idseq-{deployment_environment}-{self.alignment_algorithm}"

        environment = [
            {
                'name': f"INPUT_PATH_{i}",
                'value': os.path.join(self.chunks_result_dir_s3, input_file),
            } for i, input_file in enumerate(input_files)
        ] + [{
            'name': "OUTPUT_PATH",
            'value': multihit_s3_outfile,
        }]

        try:
            job_id = self._run_batch_job(session=session,
                                         job_name=job_name,
                                         job_queue=job_queue,
                                         job_definition=job_definition,
                                         environment=environment,
                                         chunk_id=chunk_id,
                                         retries=2)
        except BatchJobFailed:
            provisioning_model = 'EC2'
            # TODO: parameterize this https://jira.czi.team/browse/IDSEQ-2673
            job_queue = f"idseq-{deployment_environment}-{self.alignment_algorithm}-{provisioning_model}-{index_dir_suffix}-{priority_name}"
            job_id = self._run_batch_job(session=session,
                                         job_name=job_name,
                                         job_queue=job_queue,
                                         job_definition=job_definition,
                                         environment=environment,
                                         chunk_id=chunk_id,
                                         retries=1)

        for _ in range(12):
            if download_from_s3(session, multihit_s3_outfile,
                                multihit_local_outfile):
                break
            time.sleep(10)
        else:
            log.log_event("chunk_result_missing_in_s3",
                          values={
                              'job_id': job_id,
                              'chunk_id': chunk_id,
                              'job_queue': job_queue,
                              'job_definition': job_definition,
                              'alignment_algorithm': self.alignment_algorithm,
                          })
            raise Exception("Chunk result is missing from s3")

        log.log_event("alignment_batch_chunk_result_downloaded",
                      values={
                          'job_id': job_id,
                          'chunk_id': chunk_id,
                          'job_queue': job_queue,
                          'job_definition': job_definition,
                          'alignment_algorithm': self.alignment_algorithm,
                      })

        return multihit_local_outfile
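Unlike the SSH/ASGInstance variant of run_chunk above, this version dispatches each chunk as an AWS Batch job: it first submits the chunk to the SPOT job queue, resubmits to the on-demand EC2 queue only if that job ultimately fails with BatchJobFailed, and then polls S3 for the result file for up to roughly two minutes (12 attempts, 10 seconds apart) before raising.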