Ejemplo n.º 1
0
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))

        for pair_filenames in self.pairs:
            assert len(pair_filenames
                       ) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.extend(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'
                                  ] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            if start_clips:
                summarize_clips(read_in_fragment_names[i], 'start',
                                start_clips[i])

            if end_clips:
                summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

                prefix = read_in_fragment_names[i]

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
Ejemplo n.º 2
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(cores),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        cutoff = '55%'  #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        #Run shrimp

        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) ==
                3  #A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            if '--qv-offset' not in self.shrimp_options:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',',
                                       '_'), workspace.name.replace(',', '_'))
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')
Ejemplo n.º 3
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, "No reference sequences given"
        assert self.reads or self.pairs or self.interleaved, "No reads given"
        for pair in self.pairs:
            assert len(pair) == 2, "Two files required in each pair: section"

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            "-E": None,
            "-T": None,
            "-N": str(cores),
            "-n": "2",
            "-w": "200%",
            "-p": "opp-in",
            "-I": "0,500",
            "-X": None,
        }

        if self.sam_unaligned:
            default_options["--sam-unaligned"] = None

        if self.half_paired:
            default_options["--half-paired"] = None
        else:
            default_options["--no-half-paired"] = None

        cutoff = "55%"  # Default changed in SHRiMP 2.0.2
        if "-h" in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

        # Run shrimp

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")
        bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

        temp_filename = io.abspath(self.output_dir, "temp.bam")

        log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
        log_file = open(log_filename, "wb")

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith("@"):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ["-p", "-I"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2 :]
            for flag in ["--half-paired"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1 :]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3 for filename in filenames  # A little ugly
            )
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            if "--qv-offset" not in self.shrimp_options:
                guesses = []
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
                assert (
                    len(set(guesses)) == 1
                ), "Conflicting quality offset guesses, please specify --qv-offset manually."
                default_options["--qv-offset"] = str(guesses[0])

            default_options["--read-group"] = "%s,%s" % (
                workspace.name.replace(",", "_"),
                workspace.name.replace(",", "_"),
            )
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >>sys.stderr, "Running", " ".join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status("")
Ejemplo n.º 4
0
    def run(self):
        log = self.log
        
        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)
        
        prefix = self.prefix
        log_name = os.path.split(prefix)[1]
        
        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects
    
        iterators = [ ]        
        filenames = [ ]
        any_paired = False
        
        for filename in self.reads:
            filenames.append(filename)
            iterators.append(itertools.izip(
                 io.read_sequences(filename, qualities=True)
            ))
        
        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(itertools.izip(
                io.read_sequences(pair_filenames[0], qualities=True),
                io.read_sequences(pair_filenames[1], qualities=True)
            ))
        
        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(deinterleave(
                io.read_sequences(filename, qualities=True)
            ))
        
        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
        
        assert iterators, 'Nothing to clip'
        
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
        if qoffset is None:
            guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)    
    
        quality_cutoff_char = chr(qoffset + quality_cutoff)
        
        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)
        
        adaptor_seqs = [ ]
        adaptor_names = [ ]
        if self.adaptor_clip:
            if self.adaptor_file:
                adaptor_iter = io.read_sequences(self.adaptor_file)
            else:
                adaptor_iter = ADAPTORS
            for name, seq in adaptor_iter:
                seq = seq.upper().replace('U','T')
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
        
        start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
        end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq
    
        f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
        if fragment_reads == 2:
            names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
            f_paired = map(io.open_possibly_compressed_writer, names)
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
        
        n_single = 0
        n_paired = 0
        
        n_in_single = 0
        n_in_paired = 0
        total_in_length = [ 0 ] * fragment_reads
        
        n_out = [ 0 ] * fragment_reads
        n_q_clipped = [ 0 ] * fragment_reads
        n_a_clipped = [ 0 ] * fragment_reads
        n_homopolymers = [ 0 ] * fragment_reads
        total_out_length = [ 0 ] * fragment_reads
        
        #log.attach(open(prefix + '_log.txt', 'wb'))
        
        for iterator in iterators:
          for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))
        
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]
                
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start
        
                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') ) 
                    continue
        
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
            
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]    
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]    
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
    
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') ) 
                    continue
        
                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
    
            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
             
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
            
                if len(graduates) == 1:
                    n_single += 1

                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1

                    # Write the pair to an interleaved file or separate l/r files
                    for (lr,(name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)
                
        
        grace.status('')
        
        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            map(lambda f: f.close(), f_paired)
        f_single.close()
        
        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location, total) 
            
            if not clips:
                return
    
            for i in xrange(min(clips), max(clips)+1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)
                    
                    counts = collections.defaultdict(int)
                    for item2 in item: counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,key=lambda item2:counts[item2],reverse=True)[:2]:
                        log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')
                    
                log.quietly_log('\n')
            log.quietly_log('\n')


        if n_in_paired:
            log.datum(log_name,'read-pairs', n_in_paired)                      
        if n_in_single:
            log.datum(log_name,'single reads', n_in_single)                      
        
        for i in xrange(fragment_reads):
            if start_clips:
                summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        
            if end_clips:
                summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

                prefix = read_in_fragment_names[i]
                
            log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',  float(total_in_length[i]) / (n_in_single+n_in_paired))                     
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])                     
        
        if fragment_reads == 2:
            log.datum(log_name,'pairs kept after clipping', n_paired)                      
        log.datum(log_name, 'reads kept after clipping', n_single)
Ejemplo n.º 5
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = [ ]
        for item in self.reads:
            read_sets.append( ([item], False) )
        for item in self.pairs:
            read_sets.append( (item, True) )
        for item in self.interleaved:
            read_sets.append( ([item], True) )

        #Create working directory
        
        workspace = self.get_workspace()
        workspace.setup_reference(self.references)        
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())
                
        default_options = { 
            '-E' : None, 
            '-T' : None, 
            '-N' : str(cores), 
            '-n':'2', 
            '-w':'200%',
            '-p': 'opp-in', 
            '-I': '0,500', 
            '-X':None,
        }
        
        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None
        
        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        cutoff = '55%' #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[ self.shrimp_options.index('-h')+1 ]
        
        #Run shrimp
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
        
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        
        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')
        
        sam_eater = sam.Bam_writer(temp_filename)
        
        sam_header_sent = [False]
        n_seen = [0]
        
        def eat(f):
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True
        
        def remove_pair_options(options):
            for flag in ['-p','-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+1:]
            return options
        
        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]
               
            has_qualities = all(
                len( io.read_sequences(filename, qualities=True).next() ) == 3  #A little ugly
                for filename in filenames
            )
            if has_qualities:
                options.append( '--fastq' )
            
            if len(filenames) == 1:
                reads_parameters = [ filenames[0] ]
            else:
                reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]
            
            if '--qv-offset' not in self.shrimp_options:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str( io.guess_quality_offset(*filenames) )
                
            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',','_'),
                workspace.name.replace(',','_')
            )
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])
            
            if not is_paired:
               options = remove_pair_options(options)
            
            grace.status('')
            
            full_param = reference.shrimp_command(self.cs, options + reads_parameters)
            
            print >> sys.stderr, 'Running', ' '.join(full_param)
            
            with io.pipe_from(full_param,
                    stderr=log_file,
                    cores=cores) as f:
                eat(f)
        
        log_file.close()
        
        sam_eater.close()
        
        grace.status('Sort')
        
        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)
        
        os.unlink(temp_filename)
        
        grace.status('')
Ejemplo n.º 6
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        read_sets = [ ]
        for item in self.reads:
            read_sets.append( ([item], False) )
        for item in self.pairs:
            read_sets.append( (item, True) )
        for item in self.interleaved:
            read_sets.append( ([item], True) )
        
        default_options = { '-E' : None, '-T' : None, '-N' : str(grace.how_many_cpus()), '-n':'2', '-w':'200%',
                            '-p': 'opp-in', '-I': '0,500', '-X':None }
        
        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None
        
        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None


        cutoff = '55%' #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[ self.shrimp_options.index('-h')+1 ]
        
        #Create working directory
        
        workspace = self.get_workspace() #working_directory.Working(self.output_dir, must_exist=False)
        workspace.setup_reference(self.references)        
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()
        
        #workspace = io.Workspace(self.output_dir)
        #
        #workspace.update_param( 
        #    shrimp_cutoff = cutoff
        #)
        #
        ##Make copy of reference sequences
        #
        #reference_filename = io.abspath(self.output_dir,'reference.fa')
        #reference_file = open(reference_filename,'wb')
        #
        #reference_genbank_filename = io.abspath(self.output_dir,'reference.gbk')
        #reference_genbank_file = open(reference_genbank_filename,'wb')
        #any_genbank = [ False ]
        #
        #def genbank_callback(name, record):
        #    """ Make a copy of any genbank files passed in. """
        #    from Bio import SeqIO
        #    
        #    SeqIO.write([record], reference_genbank_file, 'genbank')
        #    
        #    f = open(os.path.join(
        #        self.output_dir,
        #        grace.filesystem_friendly_name(name) + '.gbk'
        #    ), 'wb')
        #    SeqIO.write([record], f, 'genbank')
        #    f.close()
        #    
        #    any_genbank[0] = True
        #
        #for filename in self.references:
        #    for name, sequence in io.read_sequences(filename, genbank_callback=genbank_callback):
        #        #Don't retain any comment
        #        name = name.split()[0]
        #        io.write_fasta(reference_file, name, sequence.upper())
        #        
        #        f = open(os.path.join(
        #            self.output_dir,
        #            grace.filesystem_friendly_name(name) + '.fa'
        #        ), 'wb')
        #        io.write_fasta(f, name, sequence.upper())
        #        f.close()
        #        
        #
        #reference_file.close()
        #reference_genbank_file.close()
        #if not any_genbank[0]:
        #    os.unlink(reference_genbank_filename)
        #
        ## Create an index of the reference sequences
        #io.execute([
        #    'samtools', 'faidx', reference_filename
        #])
        
        #Run shrimp
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')
        
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        
        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')
        
        sam_eater = sam.Bam_writer(temp_filename)
        
        #if self.cs:
        #    program = 'gmapper-cs'
        #else:
        #    program = 'gmapper-ls'
        
        sam_header_sent = [False]
        n_seen = [0]
        
        def eat(process):
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
                
            assert process.wait() == 0, 'shrimp failed'
            sam_header_sent[0] = True
        
        def remove_pair_options(options):
            for flag in ['-p','-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+1:]
            return options
        
        if '--qv-offset' not in self.shrimp_options:
            guesses = [ ]
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])
                
        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]
               
            has_qualities = all(
                len( io.read_sequences(filename, qualities=True).next() ) == 3  #A little ugly
                for filename in filenames
            )
            if has_qualities:
                options.append( '--fastq' )
            #    temp_read_filename = io.abspath(working_dir, 'temp.fa')
            #else:
            #    temp_read_filename = io.abspath(working_dir, 'temp.fq')
            
            #try:
            
            #if len(filenames) == 1: # gmapper can cope with gzipped     and filenames[0].endswith('.fa') or filenames[0].endswith('.fq'):
            #    actual_read_filename = filenames[0]
            #else:
            #    actual_read_filename = temp_read_filename
            #    grace.status('Copying reads')
            #    f = open(temp_read_filename, 'wb')
            #    if has_qualities:
            #        for reads in itertools.izip(*[ io.read_sequences(filename, qualities=True) for filename in filenames ]):
            #            for name, seq, qual in reads:
            #                io.write_fastq(f, name, seq, qual)
            #    else:
            #        for reads in itertools.izip(*[ io.read_sequences(filename) for filename in filenames ]):
            #            for name, seq in reads:
            #                io.write_fasta(f, name, seq)
            #    f.close()
            #    grace.status('')
            
            if len(filenames) == 1:
                reads_parameters = [ filenames[0] ]
            else:
                reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]
            
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])
            
            if not is_paired:
               options = remove_pair_options(options)
            
            grace.status('')
            
            full_param = reference.shrimp_command(self.cs, options + reads_parameters)
            
            print >> sys.stderr, 'Running', ' '.join(full_param)
            
            p = io.run(full_param,
                    stdout=subprocess.PIPE,
                    stderr=log_file)
            eat(p)
                        
            #finally:
            #    if os.path.exists(temp_read_filename):
            #        os.unlink(temp_read_filename)
        
        log_file.close()
        
        sam_eater.close()
        
        grace.status('Sort')
        
        io.execute([
            'samtools', 'sort', '-n', temp_filename, bam_prefix
        ])
        
        os.unlink(temp_filename)
        
        grace.status('')
Ejemplo n.º 7
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(grace.how_many_cpus()),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        cutoff = '55%'  #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        #Create working directory

        workspace = self.get_workspace(
        )  #working_directory.Working(self.output_dir, must_exist=False)
        workspace.setup_reference(self.references)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        #workspace = io.Workspace(self.output_dir)
        #
        #workspace.update_param(
        #    shrimp_cutoff = cutoff
        #)
        #
        ##Make copy of reference sequences
        #
        #reference_filename = io.abspath(self.output_dir,'reference.fa')
        #reference_file = open(reference_filename,'wb')
        #
        #reference_genbank_filename = io.abspath(self.output_dir,'reference.gbk')
        #reference_genbank_file = open(reference_genbank_filename,'wb')
        #any_genbank = [ False ]
        #
        #def genbank_callback(name, record):
        #    """ Make a copy of any genbank files passed in. """
        #    from Bio import SeqIO
        #
        #    SeqIO.write([record], reference_genbank_file, 'genbank')
        #
        #    f = open(os.path.join(
        #        self.output_dir,
        #        grace.filesystem_friendly_name(name) + '.gbk'
        #    ), 'wb')
        #    SeqIO.write([record], f, 'genbank')
        #    f.close()
        #
        #    any_genbank[0] = True
        #
        #for filename in self.references:
        #    for name, sequence in io.read_sequences(filename, genbank_callback=genbank_callback):
        #        #Don't retain any comment
        #        name = name.split()[0]
        #        io.write_fasta(reference_file, name, sequence.upper())
        #
        #        f = open(os.path.join(
        #            self.output_dir,
        #            grace.filesystem_friendly_name(name) + '.fa'
        #        ), 'wb')
        #        io.write_fasta(f, name, sequence.upper())
        #        f.close()
        #
        #
        #reference_file.close()
        #reference_genbank_file.close()
        #if not any_genbank[0]:
        #    os.unlink(reference_genbank_filename)
        #
        ## Create an index of the reference sequences
        #io.execute([
        #    'samtools', 'faidx', reference_filename
        #])

        #Run shrimp

        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        #if self.cs:
        #    program = 'gmapper-cs'
        #else:
        #    program = 'gmapper-ls'

        sam_header_sent = [False]
        n_seen = [0]

        def eat(process):
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)

            assert process.wait() == 0, 'shrimp failed'
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        if '--qv-offset' not in self.shrimp_options:
            guesses = []
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) ==
                3  #A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')
            #    temp_read_filename = io.abspath(working_dir, 'temp.fa')
            #else:
            #    temp_read_filename = io.abspath(working_dir, 'temp.fq')

            #try:

            #if len(filenames) == 1: # gmapper can cope with gzipped     and filenames[0].endswith('.fa') or filenames[0].endswith('.fq'):
            #    actual_read_filename = filenames[0]
            #else:
            #    actual_read_filename = temp_read_filename
            #    grace.status('Copying reads')
            #    f = open(temp_read_filename, 'wb')
            #    if has_qualities:
            #        for reads in itertools.izip(*[ io.read_sequences(filename, qualities=True) for filename in filenames ]):
            #            for name, seq, qual in reads:
            #                io.write_fastq(f, name, seq, qual)
            #    else:
            #        for reads in itertools.izip(*[ io.read_sequences(filename) for filename in filenames ]):
            #            for name, seq in reads:
            #                io.write_fasta(f, name, seq)
            #    f.close()
            #    grace.status('')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)

            #finally:
            #    if os.path.exists(temp_read_filename):
            #        os.unlink(temp_read_filename)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status('')