Example #1
 def __check_if_output_is_corrupt(
         self,
         service,
         key_path,
         remote_username,
         instance_ip,  # self unused
         multihit_remote_outfile,
         chunk_id,
         try_number):
     # Check if every row has correct number of columns (12) in the output
     # file on the remote machine
     if service == "gsnap":
         verification_command = "cat %s" % shlex.quote(
             multihit_remote_outfile)
     else:
         # For rapsearch, first remove header lines starting with '#'
         verification_command = "grep -v '^#' %s" % shlex.quote(
             multihit_remote_outfile)
     verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
     min_column_number_string = command.execute_with_output(
         command.remote(verification_command, key_path, remote_username,
                        instance_ip))
     min_column_number = PipelineStepRunAlignmentRemotely.__interpret_min_column_number_string(
         min_column_number_string, CORRECT_NUMBER_OF_OUTPUT_COLUMNS,
         try_number)
     error = None
     if min_column_number != CORRECT_NUMBER_OF_OUTPUT_COLUMNS:
         msg = "Chunk %s output corrupt; not copying to S3. min_column_number = %d -> expected = %d."
         msg += " Re-start pipeline to try again."
         error = msg % (chunk_id, min_column_number,
                        CORRECT_NUMBER_OF_OUTPUT_COLUMNS)
     return error
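
The static helper __interpret_min_column_number_string referenced above is not shown in this snippet. A minimal sketch, based on the inline version in Example #4 (the log module is assumed to behave as in the surrounding codebase):

@staticmethod
def __interpret_min_column_number_string(min_column_number_string,
                                         correct_number_of_output_columns,
                                         try_number):
    # An empty string means the output file had no data rows ("no hits"),
    # which counts as well-formed output.
    if min_column_number_string:
        min_column_number = float(min_column_number_string)
        log.write(
            "Try no. %d: Smallest number of columns observed in any line was %d"
            % (try_number, min_column_number))
    else:
        log.write("Try no. %d: No hits" % try_number)
        min_column_number = correct_number_of_output_columns
    return min_column_number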
Example #2
 def __delete_remote_dir(self, remote_dir, key_path, remote_username,
                         instance_ip):
     """
     Delete a directory on a remote machine.
     This needs to happen while we are holding the machine reservation,
     i.e., inside the "with ASGInstance" context.
     """
     rm_command = f"rm -rf {remote_dir}"
     command.execute(
         command.remote(rm_command, key_path, remote_username, instance_ip))
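
The command.remote helper used here and throughout these examples belongs to the surrounding codebase and is not shown. A rough sketch of what it plausibly produces, assuming it wraps a shell command in an ssh invocation (the exact options are assumptions):

import shlex

def remote(base_command, key_path, remote_username, instance_ip):
    # Assumed sketch: run base_command on the remote instance over SSH.
    # The real helper may add options such as ServerAliveInterval
    # (see the comment in Example #3).
    return "ssh -i %s %s@%s %s" % (key_path, remote_username, instance_ip,
                                   shlex.quote(base_command))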
Example #3
 def poll_server(ip):
     # ServerAliveInterval fixes an issue with containers keeping an SSH
     # connection open even after the worker machines have finished running.
     commands = "ps aux | grep %s | grep -v bash || echo error" % service_name
     output = command.execute_with_output(
         command.remote(commands, key_path, remote_username, ip),
         timeout=MAX_POLLING_LATENCY).rstrip().split("\n")
     if output != ["error"]:
         with dict_mutex:
             if dict_writable:
                 ip_nproc_dict[ip] = len(output) - 1
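
poll_server closes over several names from its enclosing scope (service_name, key_path, remote_username, MAX_POLLING_LATENCY, dict_mutex, dict_writable, ip_nproc_dict). A hypothetical driver showing how it might be fanned out across worker instances (instance_ips is an assumed input, not part of the original code):

import threading

dict_mutex = threading.RLock()
dict_writable = True
ip_nproc_dict = {}

# Poll all workers concurrently; each thread records how many service
# processes its instance is running.
threads = [threading.Thread(target=poll_server, args=(ip,))
           for ip in instance_ips]
for t in threads:
    t.start()
for t in threads:
    t.join()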
Example #4
    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.
        """
        assert service in ("gsnap", "rapsearch2")

        chunk_id = input_files[0].split(part_suffix)[-1]
        # TODO: Switch to Python 3.6, which supports interpolation in string
        # formatting; that will halve the number of lines below.
        multihit_basename = "multihit-{service}-out{part_suffix}{chunk_id}.m8".format(
            service=service,
            part_suffix=part_suffix,
            chunk_id=chunk_id,
        )
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        base_str = "aws s3 cp --only-show-errors {s3_path}/{input_fa} {remote_work_dir}/{input_fa} "
        download_input_from_s3 = " ; ".join(
            base_str.format(s3_path=self.chunks_result_dir_s3,
                            input_fa=input_fa,
                            remote_work_dir=remote_work_dir)
            for input_fa in input_files)

        base_str = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=remote_work_dir,
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=remote_home_dir,
            remote_index_dir=remote_index_dir,
            remote_input_files=" ".join(remote_work_dir + "/" + input_fa
                                        for input_fa in input_files),
            multihit_remote_outfile=multihit_remote_outfile
            if service == "gsnap" else multihit_remote_outfile[:-3]
            # Strip the .m8 for RAPSearch as it adds that
        )

        if not lazy_run or not fetch_from_s3(multihit_s3_outfile,
                                             multihit_local_outfile):
            correct_number_of_output_columns = 12
            min_column_number = 0
            max_tries = 2
            try_number = 1
            instance_ip = ""

            def interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number):
                if min_column_number_string:
                    min_column_number = float(min_column_number_string)
                    log.write(
                        "Try no. %d: Smallest number of columns observed in any line was %d"
                        % (try_number, min_column_number))
                else:
                    log.write("Try no. %d: No hits" % try_number)
                    min_column_number = correct_number_of_output_columns
                return min_column_number

            # Check if every row has correct number of columns (12) in the output
            # file on the remote machine
            while min_column_number != correct_number_of_output_columns \
                    and try_number <= max_tries:
                log.write("waiting for {} server for chunk {}".format(
                    service, chunk_id))
                max_concurrent = self.additional_attributes["max_concurrent"]
                environment = self.additional_attributes["environment"]

                instance_ip = server.wait_for_server_ip(
                    service, key_path, remote_username, environment,
                    max_concurrent, chunk_id)
                log.write("starting alignment for chunk %s on %s server %s" %
                          (chunk_id, service, instance_ip))
                command.execute(
                    command.remote(commands, key_path, remote_username,
                                   instance_ip))

                if service == "gsnap":
                    verification_command = "cat %s" % multihit_remote_outfile
                else:
                    # For rapsearch, first remove header lines starting with '#'
                    verification_command = "grep -v '^#' %s" % multihit_remote_outfile
                verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
                min_column_number_string = command.execute_with_output(
                    command.remote(verification_command, key_path,
                                   remote_username, instance_ip))
                min_column_number = interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number)
                try_number += 1

            msg = "Chunk %s output corrupt; not copying to S3. Re-start pipeline " \
                  "to try again." % chunk_id
            assert min_column_number == correct_number_of_output_columns, msg

            # Move output from remote machine to local machine
            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command.scp(key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile))
                command.execute(
                    "aws s3 cp --only-show-errors %s %s/" %
                    (multihit_local_outfile, self.chunks_result_dir_s3))
            log.write("finished alignment for chunk %s on %s server %s" %
                      (chunk_id, service, instance_ip))
        return multihit_local_outfile
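
The verification command assembled above ends with "awk '{print NF}' | sort -nu | head -n 1", which prints the smallest number of whitespace-separated fields found on any line of the output file. For illustration, a pure-Python equivalent of that shell pipeline (min_column_count is a hypothetical helper, not part of the codebase):

def min_column_count(path, skip_comments=False):
    # Equivalent of: [grep -v '^#' |] awk '{print NF}' | sort -nu | head -n 1
    counts = []
    with open(path) as f:
        for line in f:
            if skip_comments and line.startswith("#"):
                continue
            counts.append(len(line.split()))
    # An empty file yields None, mirroring the empty string the shell
    # pipeline would produce.
    return min(counts) if counts else None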
Example #5
    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.
        """
        assert service in ("gsnap", "rapsearch2")

        chunk_id = int(input_files[0].split(part_suffix)[-1])
        multihit_basename = f"multihit-{service}-out{part_suffix}{chunk_id}.m8"
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        def aws_cp_operation(input_fa):
            return "aws s3 cp --only-show-errors {src} {dest}".format(
                src=shlex.quote(
                    os.path.join(self.chunks_result_dir_s3, input_fa)),
                dest=shlex.quote(os.path.join(remote_work_dir, input_fa)))

        download_input_from_s3 = " ; ".join(map(aws_cp_operation, input_files))

        # Clean up remote work directory before running
        #   This ensures that files from a failed previous run that may still be on the instance
        #   are removed so they don't corrupt the current run
        base_str = "rm -rf {remote_work_dir} ; mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        environment = self.additional_attributes["environment"]

        # See step class docstrings for more parameter details.
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 48 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=shlex.quote(remote_work_dir),
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=shlex.quote(remote_home_dir),
            remote_index_dir=shlex.quote(remote_index_dir),
            remote_input_files=" ".join(
                shlex.quote(remote_work_dir + "/" + input_fa)
                for input_fa in input_files),
            multihit_remote_outfile=shlex.quote(multihit_remote_outfile) if
            service == "gsnap" else shlex.quote(multihit_remote_outfile[:-3])
            # Strip the .m8 for RAPSearch as it adds that
        )

        if lazy_run and fetch_from_s3(multihit_s3_outfile,
                                      multihit_local_outfile,
                                      okay_if_missing=True,
                                      allow_s3mi=False):
            log.write(
                f"finished alignment for chunk {chunk_id} with {service} by lazily fetching last result"
            )
        else:
            chunk_timeout = int(
                self.additional_attributes.get(
                    f"{service.lower()}_chunk_timeout", DEFAULT_CHUNK_TIMEOUT))
            for try_number in range(1, CHUNK_MAX_TRIES + 1):
                log.write(
                    f"waiting for {service} server for chunk {chunk_id}. Try #{try_number}"
                )
                with ASGInstance(service, key_path, remote_username,
                                 environment, chunk_id, try_number,
                                 self.additional_attributes) as instance_ip:
                    # The try/except block needs to be inside the ASGInstance context.
                    # A failure to acquire an ASGInstance is, and should be, unrecoverable.
                    chunk_status = None
                    elapsed = 0.0
                    try:
                        t_start = time.time()
                        try:
                            command.execute(command.remote(
                                commands, key_path, remote_username,
                                instance_ip),
                                            timeout=chunk_timeout)
                        except:
                            chunk_status = ChunkStatus.CRASH
                            raise
                        finally:
                            elapsed = time.time() - t_start
                            if chunk_status == ChunkStatus.CRASH and elapsed >= chunk_timeout:
                                chunk_status = ChunkStatus.TIMEOUT

                        output_corrupt = self.__check_if_output_is_corrupt(
                            service, key_path, remote_username, instance_ip,
                            multihit_remote_outfile, chunk_id, try_number)

                        if output_corrupt:
                            chunk_status = ChunkStatus.CORRUPT_OUTPUT
                            assert not output_corrupt, output_corrupt

                        # Yay, chunk succeeded.  Copy from server and break out of retry loop.
                        try:
                            self.__copy_multihit_remote_outfile(
                                key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile)
                            chunk_status = ChunkStatus.SUCCESS
                            break
                        except:
                            # If we failed to copy from the server, it's as bad as a crash in alignment.
                            chunk_status = ChunkStatus.CRASH
                            raise

                    except Exception as e:

                        # 1. No backoff needed here before retrying.  We rate limit chunk dispatch (the ASGInstance
                        # acquisition above is blocking).  ASGInstance acquisition also tries to ensure that every
                        # chunk in flight gets its first try before any retry is dispatched.

                        # 2. If the reason we failed is a timeout on the server, we don't retry.  The operator must
                        # decide whether to QC the data more, or to use a smaller chunk size.  In fact, we only retry
                        # for CRASH and CORRUPT_OUTPUT.

                        # 3. If this is the last attempt, we must re-raise the exception.

                        # 4. Elapsed time is only the time spent in alignment.  It excludes the time spent waiting to
                        # acquire the ASGInstance.

                        log.log_event('alignment_remote_error',
                                      values={
                                          "chunk": chunk_id,
                                          "try_number": try_number,
                                          "CHUNK_MAX_TRIES": CHUNK_MAX_TRIES,
                                          "chunk_status": chunk_status,
                                          "elapsed": elapsed,
                                          "chunk_timeout": chunk_timeout,
                                          "exception": log.parse_exception(e)
                                      })
                        retrying_might_help = chunk_status in (
                            ChunkStatus.CORRUPT_OUTPUT, ChunkStatus.CRASH)
                        if try_number < CHUNK_MAX_TRIES and retrying_might_help:
                            # Retry!
                            continue
                        else:
                            # End of the road.
                            raise
                    finally:
                        # A None chunk_status indicates a code bug above; an exception has
                        # already been raised for it, and it says nothing about whether the
                        # alignment succeeded.
                        if chunk_status is not None:
                            chunk_status_tracker(service).note_outcome(
                                instance_ip, chunk_id, elapsed, chunk_status,
                                try_number)
                        self.__delete_remote_dir(remote_work_dir, key_path,
                                                 remote_username, instance_ip)

            # Upload to s3
            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="aws",
                        args=[
                            "s3", "cp", "--only-show-errors",
                            multihit_local_outfile,
                            os.path.join(self.chunks_result_dir_s3, "")
                        ]))
            log.write(
                f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}"
            )

        # Whether lazy or not lazy, we've now got the chunk result locally here.
        return multihit_local_outfile
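
Example #5 relies on a ChunkStatus enumeration and a __copy_multihit_remote_outfile helper defined elsewhere in the codebase. Minimal sketches consistent with the calls above (the member values and the scp-based body are assumptions; the copy step mirrors Example #4):

from enum import Enum

class ChunkStatus(Enum):
    SUCCESS = "success"
    CRASH = "crash"
    TIMEOUT = "timeout"
    CORRUPT_OUTPUT = "corrupt_output"

def __copy_multihit_remote_outfile(self, key_path, remote_username,
                                   instance_ip, multihit_remote_outfile,
                                   multihit_local_outfile):
    # Assumed sketch: fetch the chunk result from the worker via scp,
    # as in the copy step of Example #4.
    command.execute(
        command.scp(key_path, remote_username, instance_ip,
                    multihit_remote_outfile, multihit_local_outfile))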