Example #1
def count_reads(filename):
    '''
    Count reads in a given FASTA or FASTQ file.
    '''
    # Detect gzip compression via the two-byte magic header, then peek at the
    # first character of the (possibly decompressed) stream to tell FASTA from FASTQ.
    with open(filename, "rb") as gz_fh:
        is_gzipped = gz_fh.read(2).startswith(GZIP_MAGIC_HEADER)
    opener = gzip.open if is_gzipped else open
    with opener(filename, "rb") as fmt_fh:
        first_char = fmt_fh.read(1).decode()
    with open(filename, "rb") as fh:
        if first_char == ">":
            cmd = "grep -c '^>'"
            if is_gzipped:
                cmd = "gunzip | " + cmd
            return int(
                run(cmd, stdin=fh, stdout=PIPE, check=True, shell=True).stdout)
        elif first_char == "@":
            cmd = "wc -l"
            if is_gzipped:
                cmd = "gunzip | " + cmd
            num_lines = int(
                run(cmd, stdin=fh, stdout=PIPE, check=True, shell=True).stdout)
            if num_lines % 4 != 0:
                raise InvalidFileFormatError(
                    "File does not follow fastq format")
            return num_lines // 4
        raise InvalidFileFormatError("Unable to recognize file format")
    def quick_check_file(self, file, is_fastq, max_fragments_to_check=100):
        num_fragments = 0
        fragment_length = 0

        with open(file, 'r', encoding='utf-8') as input_f:
            while True:
                num_fragments += 1
                if num_fragments > max_fragments_to_check:
                    break

                identifier_l = input_f.readline()
                if len(identifier_l) == 0:  # EOF
                    if num_fragments == 1:
                        raise InsufficientReadsError(
                            "The input file contains 0 reads")
                    break

                read_l = input_f.readline()
                if len(read_l) == 0:  # unexpected EOF
                    raise InvalidFileFormatError("Invalid input file")

                if is_fastq:
                    identifier2_l = input_f.readline()
                    if len(identifier2_l) == 0:
                        raise InvalidFileFormatError("Invalid FASTQ file")

                    quality_l = input_f.readline()
                    if len(quality_l) == 0:
                        raise InvalidFileFormatError("Invalid FASTQ file")

                    if identifier_l[0] != '@' or identifier2_l[0] != '+':
                        # may be a FASTQ file with multi-line reads; requires the full check
                        return False
                else:
                    if identifier_l[0] != '>':
                        # may be a FASTA file with multi-line reads; requires the full check
                        return False

                if fragment_length == 0:
                    fragment_length = len(read_l)
                    if fragment_length < vc.READ_LEN_CUTOFF_LOW or fragment_length > vc.READ_LEN_CUTOFF_MID:
                        # non-standard fragment lengths require more detailed examination
                        return False

                if fragment_length != len(read_l) or (
                        is_fastq and fragment_length != len(quality_l)):
                    # file does not meet "quick check" requirements since fragments/quality
                    # scores are not all of same length
                    return False

        return True
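quick_check_file and the two methods that follow reference a constants module (vc) and an additional exception that are defined elsewhere in the pipeline package. The sketch below shows plausible stand-ins so the control flow is easier to follow; the concrete numbers and bucket names are illustrative assumptions, not the real values.

# Illustrative stand-ins for the validation constants and exceptions referenced
# by quick_check_file, _full_check_and_truncate_file, and run (assumptions only).
class InsufficientReadsError(Exception):
    pass


class vc:
    READ_LEN_CUTOFF_LOW = 35        # assumed lower bound for a "normal" read
    READ_LEN_CUTOFF_MID = 500       # assumed upper bound for a "normal" read
    READ_LEN_CUTOFF_HIGH = 100000   # assumed hard cap; longer reads get truncated
    MAX_LINE_LENGTH = 100000        # assumed per-line limit fed to the awk validator
    BUCKET_TOO_SHORT = "reads_too_short"   # assumed bucket key names
    BUCKET_NORMAL = "reads_normal"
    BUCKET_LONG = "reads_long"
    BUCKET_TOO_LONG = "reads_too_long"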
    def _full_check_and_truncate_file(self, infile, outfile, is_fastq,
                                      max_fragments, num_inputs):
        num_fragments = 0

        with open(infile, 'r',
                  encoding='utf-8') as input_f, open(outfile, 'w') as output_f:
            next_line = input_f.readline()
            while True:
                num_fragments += 1
                if num_fragments > max_fragments:
                    break

                identifier_l = next_line
                if len(identifier_l) == 0:  # EOF
                    break

                read_l = input_f.readline()
                if len(read_l) == 0:
                    raise InvalidFileFormatError("Invalid input file")

                read_l = read_l.rstrip()
                next_line = input_f.readline()
                while next_line and next_line[0] not in ('>', '@', '+'):
                    read_l += next_line.rstrip()
                    next_line = input_f.readline()

                if is_fastq:
                    identifier2_l = next_line
                    if len(identifier2_l) == 0:
                        raise InvalidFileFormatError("Invalid FASTQ file")

                    quality_l = input_f.readline()
                    if len(quality_l) == 0:
                        raise InvalidFileFormatError("Invalid FASTQ file")

                    quality_l = quality_l.rstrip()
                    next_line = input_f.readline()
                    while next_line and next_line[0] not in ('>', '@', '+'):
                        quality_l += next_line.rstrip()
                        next_line = input_f.readline()

                if is_fastq:
                    if identifier_l[0] != '@':
                        raise InvalidFileFormatError("Invalid FASTQ file")
                    if identifier2_l[0] != '+':
                        raise InvalidFileFormatError("Invalid FASTQ file")
                else:
                    if identifier_l[0] != '>':
                        raise InvalidFileFormatError("Invalid FASTA file")

                # At this point, identifier_l and identifier2_l end in a newline and
                # read_l and quality_l do not end in a newline
                read_len = len(read_l)

                # Force read and quality lengths to be identical, either by padding quality
                # with the last quality score or truncating quality score
                if is_fastq:
                    if read_len > len(quality_l):
                        quality_l += (quality_l[-1] *
                                      (read_len - len(quality_l)))
                    elif read_len < len(quality_l):
                        quality_l = quality_l[0:read_len]

                if read_len < vc.READ_LEN_CUTOFF_LOW:
                    self.summary_dict[vc.BUCKET_TOO_SHORT] += 1
                    if num_inputs == 1:
                        continue
                elif read_len < vc.READ_LEN_CUTOFF_MID:
                    self.summary_dict[vc.BUCKET_NORMAL] += 1
                elif read_len < vc.READ_LEN_CUTOFF_HIGH:
                    self.summary_dict[vc.BUCKET_LONG] += 1
                else:
                    self.summary_dict[vc.BUCKET_TOO_LONG] += 1
                    read_l = read_l[0:vc.READ_LEN_CUTOFF_HIGH]
                    if is_fastq:
                        quality_l = quality_l[0:vc.READ_LEN_CUTOFF_HIGH]

                output_f.write(identifier_l + read_l + "\n")
                if is_fastq:
                    output_f.write(identifier2_l + quality_l + "\n")

        return num_fragments
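The quality-length reconciliation step above (pad with the last quality score, or truncate) can be isolated into a small helper to see its effect. This is a sketch for illustration only, not a function the pipeline actually exposes.

# Sketch of the read/quality length reconciliation used above (illustrative only).
def reconcile_quality(read_l, quality_l):
    read_len = len(read_l)
    if read_len > len(quality_l):
        # pad by repeating the last quality score
        return quality_l + quality_l[-1] * (read_len - len(quality_l))
    # truncate extra quality scores
    return quality_l[:read_len]


assert reconcile_quality("ACGTACGT", "IIIII") == "IIIIIIII"
assert reconcile_quality("ACGT", "IIIIIIII") == "IIII"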
    def run(self):
        # Setup
        input_files = self.input_files_local[0][0:2]
        num_inputs = len(input_files)
        assert num_inputs in [1, 2], 'Invalid number of input files'
        output_files = self.output_files_local()[1:3]
        summary_file = self.output_files_local()[0]
        max_fragments = self.additional_attributes["truncate_fragments_to"]

        file_ext = self.additional_attributes.get("file_ext")
        assert file_ext in ['fastq', 'fasta'], 'Invalid file extension'

        is_fastq = file_ext == 'fastq'

        try:
            for i in range(num_inputs):
                input_file = input_files[i]
                splited_input_file_name, splited_input_file_ext = os.path.splitext(
                    input_file)

                num_lines = self.calc_max_num_lines(is_fastq, max_fragments)

                # unzip if .gz file
                if splited_input_file_ext == '.gz':
                    input_files[i] = splited_input_file_name
                    try:
                        # test if a valid gzip file
                        command.execute(
                            command_patterns.SingleCommand(
                                cmd="gzip", args=["-t", input_file]))
                        # then decompress it
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'''gzip -dc "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                                named_args={
                                    "input_file":
                                    input_file,
                                    "awk_script_file":
                                    command.get_resource_filename(
                                        "scripts/fastq-fasta-line-validation.awk"
                                    ),
                                    "max_line_length":
                                    vc.MAX_LINE_LENGTH,
                                    "num_lines":
                                    num_lines,
                                    "output_file":
                                    splited_input_file_name
                                }))
                    except Exception as e:
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta/gzip file") from e
                else:
                    # Validate and truncate the input file to keep behavior consistent with gz input files
                    try:
                        tmp_file = splited_input_file_name + ".tmp"
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'''cat "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                                named_args={
                                    "input_file":
                                    input_file,
                                    "awk_script_file":
                                    command.get_resource_filename(
                                        "scripts/fastq-fasta-line-validation.awk"
                                    ),
                                    "max_line_length":
                                    vc.MAX_LINE_LENGTH,
                                    "num_lines":
                                    num_lines,
                                    "output_file":
                                    tmp_file
                                }))
                        input_files[i] = tmp_file
                    except Exception as e:
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta file") from e

            # keep a dictionary of the distribution of read lengths in the files
            self.summary_dict = {
                vc.BUCKET_TOO_SHORT: 0,
                vc.BUCKET_NORMAL: 0,
                vc.BUCKET_LONG: 0,
                vc.BUCKET_TOO_LONG: 0
            }

            quick_check_passed = \
                self.quick_check_file(input_files[0], is_fastq) and \
                (num_inputs == 1 or self.quick_check_file(input_files[1], is_fastq))

            all_fragments = []

            for infile, outfile in zip(input_files, output_files):
                if quick_check_passed:
                    num_fragments = self.truncate_file(infile, outfile,
                                                       is_fastq, max_fragments)
                else:
                    num_fragments = self._full_check_and_truncate_file(
                        infile, outfile, is_fastq, max_fragments, num_inputs)
                all_fragments.append(num_fragments)

            if len(all_fragments) == 2 and abs(all_fragments[1] -
                                               all_fragments[0]) > 1000:
                raise InvalidFileFormatError(
                    "Paired input files need to contain the same number of reads"
                )

            with open(summary_file, 'w') as summary_f:
                json.dump(self.summary_dict, summary_f)

        except Exception as e:
            with open(summary_file, 'w') as summary_f:
                json.dump({'Validation error': str(e)}, summary_f)
            s3_path = self.s3_path(summary_file)
            s3.upload_with_retries(summary_file, s3_path)
            raise e

        return
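On success, run() serializes the read-length histogram to the summary file; on failure it writes a one-key error object before re-raising. A sketch of both shapes is below; the bucket keys depend on the real vc.BUCKET_* values and here follow the illustrative stand-ins above.

# Sketch of the summary JSON written by run() (key names are assumptions).
example_success_summary = {
    "reads_too_short": 12,
    "reads_normal": 98765,
    "reads_long": 40,
    "reads_too_long": 3,
}

example_failure_summary = {
    "Validation error": "Invalid FASTQ file",
}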