Example #1
    def run(self):
        seqs = []
        seen = 0
        for filename in self.filenames:
            for seq in io.read_sequences(filename, qualities=True):
                seen += 1
                if seen % 100000 == 0:
                    grace.status('Scanned ' + grace.pretty_number(seen))

                if len(seqs) < self.n:
                    seqs.append(seq)
                elif random.random() * seen < self.n:  # reservoir sampling: replace with probability n/seen
                    seqs[random.randrange(self.n)] = seq
        grace.status('')

        print >> sys.stderr, 'Sampled', grace.pretty_number(
            len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

        if not seqs: return
        qualities = len(seqs[0]) == 3  # (name, seq, qual) tuples rather than (name, seq)
        if qualities:
            for name, seq, qual in seqs:
                io.write_fastq(sys.stdout, name, seq, qual)
        else:
            for name, seq in seqs:
                io.write_fasta(sys.stdout, name, seq)
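Examples 1 and 12 are reservoir sampling: the first n sequences fill the reservoir, after which the k-th sequence replaces a random slot with probability n/k, leaving a uniform sample of the whole stream. A minimal standalone sketch of the same idea (reservoir_sample is our name, not nesoni API):

import random

def reservoir_sample(items, n):
    # Keep a uniform random sample of n items from a stream of unknown length.
    sample = []
    for seen, item in enumerate(items, 1):
        if len(sample) < n:
            sample.append(item)
        elif random.random() * seen < n:
            # The seen-th item enters the sample with probability n/seen.
            sample[random.randrange(n)] = item
    return sample

For example, reservoir_sample(xrange(1000000), 10) returns 10 uniformly chosen integers after a single pass.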
Example #2
File: sam.py Project: mscook/nesoni
    def run(self):
        headers = bam_headers(self.input)
        writer = Bam_writer(self.prefix + '.bam', headers)
        chrom = None
        endpoints = []
        total = 0
        discarded = 0
        for record in Bam_reader(self.input):
            if record.flag & FLAG_UNMAPPED: continue

            total += 1

            if chrom != record.rname:
                chrom = record.rname
                endpoints = []
            while endpoints and endpoints[0] <= record.pos:
                heapq.heappop(endpoints)

            if len(endpoints) >= self.depth:
                discarded += 1
                continue

            heapq.heappush(endpoints, record.pos + record.length)

            record.flag &= ~(FLAG_PAIRED | FLAG_PROPER | FLAG_MATE_UNMAPPED
                             | FLAG_MATE_REVERSE | FLAG_FIRST | FLAG_SECOND)
            record.mrnm = '*'
            record.mpos = 0
            writer.write(record)
        writer.close()

        self.log.log(
            'Discarded %s alignments out of %s.\n' %
            (grace.pretty_number(discarded), grace.pretty_number(total)))
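Examples 2 and 3 cap coverage depth in one pass over a position-sorted BAM: a min-heap of endpoints holds the kept alignments that still overlap the current position, so its size is the kept depth there. The technique distilled to plain intervals (cap_depth is an illustrative name):

import heapq

def cap_depth(intervals, max_depth):
    # intervals: (start, end) pairs sorted by start, end-exclusive.
    endpoints = []  # min-heap of end positions of intervals kept so far
    for start, end in intervals:
        while endpoints and endpoints[0] <= start:
            heapq.heappop(endpoints)  # these kept intervals ended before this start
        if len(endpoints) < max_depth:
            heapq.heappush(endpoints, end)
            yield start, end

list(cap_depth([(0, 5), (1, 6), (2, 7)], 2)) keeps the first two intervals and discards the third, just as the examples discard a record once `depth` kept alignments already span record.pos.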
Example #3
    def run(self):
        headers = bam_headers(self.input)
        writer = Bam_writer(self.prefix+'.bam', headers)
        chrom = None
        endpoints = [ ]
        total = 0
        discarded = 0
        for record in Bam_reader(self.input):
            if record.flag & FLAG_UNMAPPED: continue
        
            total += 1
            
            if chrom != record.rname:
                chrom = record.rname
                endpoints = [ ]
            while endpoints and endpoints[0] <= record.pos:
                heapq.heappop(endpoints)
            
            if len(endpoints) >= self.depth:
                discarded += 1
                continue

            heapq.heappush(endpoints, record.pos+record.length)
            
            record.flag &= ~(FLAG_PAIRED|FLAG_PROPER|FLAG_MATE_UNMAPPED|FLAG_MATE_REVERSE|FLAG_FIRST|FLAG_SECOND)
            record.mrnm = '*'
            record.mpos = 0
            writer.write(record)
        writer.close()
        
        self.log.log('Discarded %s alignments out of %s.\n' % (grace.pretty_number(discarded),grace.pretty_number(total)))
Example #4
def reader(working_dirs, references, use_reference, annotations={}):
    for name, sequence in references:
        features = annotations.get(sequence, [])
    
        if use_reference:
            readers = [ reference_reader(sequence) ]
        else:
            readers = [ ]
        
        readers.extend( evidence_reader(working_dir, name) for working_dir in working_dirs )
        
        active_features = [ ]
        feature_pos = 0        
        
        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status('%s %s' % (name, grace.pretty_number(i)))
            
            active_features = [ item for item in active_features if item.location.nofuzzy_end > i ]
            while feature_pos < len(features) and \
                  features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1
        
            for is_insertion in (True, False):
                 yield Calls(name, i, is_insertion, [ item.next() for item in readers ], active_features)
        
        for reader in readers:
            for item in reader:
                raise grace.Error('Unexpected extra data in evidence file')

    grace.status('')
Example #5
def reader(working_dirs, references, use_reference, annotations={}):
    for name, sequence in references:
        features = annotations.get(sequence, [])

        if use_reference:
            readers = [reference_reader(sequence)]
        else:
            readers = []

        readers.extend(evidence_reader(working_dir, name) for working_dir in working_dirs)

        active_features = []
        feature_pos = 0

        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status("%s %s" % (name, grace.pretty_number(i)))

            active_features = [item for item in active_features if item.location.nofuzzy_end > i]
            while feature_pos < len(features) and features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1

            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion, [item.next() for item in readers], active_features)

        for reader in readers:
            for item in reader:
                raise grace.Error("Unexpected extra data in evidence file")

    grace.status("")
Example #6
 def eat(f):
     for line in f:
         if line.startswith('@'):
             if sam_header_sent[0]: continue
         else:
             n_seen[0] += 1
             if n_seen[0] % 100000 == 0:
                 grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
         sam_eater.write_raw(line)
     sam_header_sent[0] = True
Example #7
def normalize_files(dirnames, prefix, min_depth):
    contents = [
        read_userplot(os.path.join(item, prefix + '-depth.userplot'))
        for item in dirnames
    ]
    data = [item[2] for item in contents]

    if contents[0][1]:
        j_range = xrange(1, len(contents[2][0]))
    else:
        j_range = [0]

    totals = [0.0] * len(data)
    n = 0

    for i in xrange(len(contents[0][2])):
        for j in j_range:
            depths = [item[i][j] for item in data]
            good = True
            for item in depths:
                if item < min_depth:
                    good = False
                    break
            if not good: continue

            for k, item in enumerate(depths):
                totals[k] += math.log(item)
            n += 1

    print prefix, 'sites used:', grace.pretty_number(n)

    if n == 0:
        print 'Can\'t normalize, skipping.'
        print
        return

    avg_total = sum(totals) / len(data)
    norm_divisors = [math.exp((item - avg_total) / n) for item in totals]

    print 'Relative abundances:'
    for i in xrange(len(dirnames)):
        print '  %.3f %s' % (norm_divisors[i], dirnames[i])
    print

    #for i, item in enumerate(contents):
    #    write_normalized_userplot(item, 1.0/norm_divisors[i], os.path.join(dirnames[i],prefix+'-norm.userplot'))
    for i, dirname in enumerate(dirnames):
        for filename in os.listdir(dirname):
            if not filename.startswith(prefix): continue
            if not filename.endswith('.userplot'): continue
            if filename.endswith('-norm.userplot'): continue
            fullname = os.path.join(dirname, filename)
            full_outname = fullname[:-9] + '-norm.userplot'
            write_normalized_userplot(read_userplot(fullname),
                                      1.0 / norm_divisors[i], full_outname)
Example #8
def normalize_files(dirnames, prefix, min_depth):
    contents = [ read_userplot(os.path.join(item,prefix+'-depth.userplot')) for item in dirnames ]
    data = [ item[2] for item in contents ]

    if contents[0][1]:
        j_range = xrange(1,len(contents[2][0]))
    else:
        j_range = [0]
    
    totals = [ 0.0 ] * len(data)
    n = 0

    for i in xrange(len(contents[0][2])):
        for j in j_range:
            depths = [ item[i][j] for item in data ]
            good = True
            for item in depths: 
                if item < min_depth: 
                    good = False 
                    break
            if not good: continue
            
            for k, item in enumerate(depths):
                totals[k] += math.log(item)
            n += 1
    
    print prefix, 'sites used:', grace.pretty_number(n)
    
    if n == 0:
        print 'Can\'t normalize, skipping.'
        print
        return
    
    avg_total = sum( totals ) / len(data)
    norm_divisors = [ math.exp( (item - avg_total)/n ) for item in totals ]

    print 'Relative abundances:'
    for i in xrange(len(dirnames)):
        print '  %.3f %s' % (norm_divisors[i], dirnames[i])
    print
    
    #for i, item in enumerate(contents):
    #    write_normalized_userplot(item, 1.0/norm_divisors[i], os.path.join(dirnames[i],prefix+'-norm.userplot'))
    for i, dirname in enumerate(dirnames):
        for filename in os.listdir(dirname):
            if not filename.startswith(prefix): continue
            if not filename.endswith('.userplot'): continue
            if filename.endswith('-norm.userplot'): continue
            fullname = os.path.join(dirname, filename)
            full_outname = fullname[:-9] + '-norm.userplot'
            write_normalized_userplot(
                read_userplot(fullname),
                1.0/norm_divisors[i],
                full_outname)
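The normalization in Examples 7 and 8 is geometric-mean scaling: totals[k] accumulates log depths over the n sites that reach min_depth in every sample, so exp(totals[k]/n) is sample k's geometric mean depth, and exp((totals[k] - avg_total)/n) is that mean relative to the across-sample average. The core arithmetic, assuming every site is usable (geometric_mean_divisors is our name):

import math

def geometric_mean_divisors(depth_columns):
    # depth_columns: one list of per-site depths per sample, all depths > 0.
    totals = [sum(math.log(d) for d in column) for column in depth_columns]
    n = len(depth_columns[0])
    avg_total = sum(totals) / len(totals)
    # divisor_k = sample k's geometric mean depth / the samples' average geometric mean
    return [math.exp((t - avg_total) / n) for t in totals]

Dividing each sample's depths by its divisor equalizes the geometric means, which is less sensitive to a few very deep sites than scaling by arithmetic means.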
Example #9
 def eat(f):
     for line in f:
         if line.startswith('@'):
             if sam_header_sent[0]: continue
         else:
             n_seen[0] += 1
             if n_seen[0] % 100000 == 0:
                 grace.status('%s alignments produced' %
                              grace.pretty_number(n_seen[0]))
         sam_eater.write_raw(line)
     sam_header_sent[0] = True
Example #10
 def eat(process):
     for line in process.stdout:
         if line.startswith('@'):
             if sam_header_sent[0]: continue
         else:
             n_seen[0] += 1
             if n_seen[0] % 100000 == 0:
                 grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
         sam_eater.write_raw(line)
         
     assert process.wait() == 0, 'shrimp failed'
     sam_header_sent[0] = True
Example #11
        def eat(process):
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)

            assert process.wait() == 0, 'shrimp failed'
            sam_header_sent[0] = True
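The one-element lists sam_header_sent and n_seen in the eat() closures (Examples 6 and 9 to 11) are a Python 2 idiom: an inner function may mutate an outer list but cannot rebind an outer name, since Python 2 has no nonlocal. The pattern in miniature (make_counter is illustrative):

def make_counter():
    count = [0]  # boxed so the closure can update it
    def bump():
        count[0] += 1  # with a bare int, 'count += 1' would raise UnboundLocalError
        return count[0]
    return bump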
Example #12
 def run(self):
     seqs = [ ]
     seen = 0
     for filename in self.filenames:
         for seq in io.read_sequences(filename, qualities=True):
             seen += 1
             if seen % 100000 == 0: grace.status('Scanned '+grace.pretty_number(seen))
             
             if len(seqs) < self.n:
                 seqs.append(seq)
             elif random.random() * seen < self.n:  # reservoir sampling: replace with probability n/seen
                 seqs[ random.randrange(self.n) ] = seq
     grace.status('')
     
     print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
 
     if not seqs: return
     qualities = len(seqs[0]) == 3  # (name, seq, qual) tuples rather than (name, seq)
     if qualities:
         for name, seq, qual in seqs:
             io.write_fastq(sys.stdout, name, seq, qual)
     else:
         for name, seq in seqs:
             io.write_fasta(sys.stdout, name, seq)
Example #13
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int,
                                              5000000)

    input_reference_filenames = []
    reads_filenames = []

    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)]
                                for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames,
                                    *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files

        N = [0]

        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c',
                                  command)

            #print 'SHRiMP %d running' % my_number

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size: break

            if not read_set: break

            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
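Example 13 keeps at most max_shrimps SHRiMP processes alive at once: do_shrimp() launches one in the background and returns a finalize closure, and the shrimps list is a FIFO of pending finalizers, the oldest being waited on before a new batch starts. Stripped of the SHRiMP details, the pattern is (run_bounded and start are illustrative names):

def run_bounded(jobs, max_live, start):
    # start(job) launches background work and returns a finalizer;
    # calling the finalizer blocks until that job completes.
    live = []
    for job in jobs:
        if len(live) >= max_live:
            live.pop(0)()  # wait for the oldest job before launching another
        live.append(start(job))
    while live:
        live.pop(0)()      # drain the remaining jobs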
Example #14
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        ignore_quality = chr(33+self.ignore_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
                    best_score = 0
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(good_quality_end):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor,
                            # at least until the end of good quality sequence.
                            # However if there is evidence of the adaptor beyond
                            # the end of good quality, we still want to know that,
                            # and count it towards the number of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
                            while True:
                                if (score > best_score and 
                                    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= a_end+len(self.adaptor) or i >= len(seq):
                                    break
                            
                                if qual[i] >= ignore_quality:
                                    if seq[i] == self.adaptor[i-a_end]:
                                        score += 1
                                        adaptor_bases += 1
                                    else:
                                        score -= 4
                                i += 1
                                
                            #if a_end >= len(seq): break
                            
                            # poly(A) tail only within good quality region.
                            if a_end >= good_quality_end: break
                            
                            
                            if qual[a_end] >= ignore_quality:
                                if seq[a_end] == 'A':
                                    aonly_score += 1
                                else:
                                    aonly_score -= 4
                                    if aonly_score <= 0: break
                            a_end += 1
                        
                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 
                            'I' if item<ignore_quality 
                            else ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start > self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
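Example 14's quality clip replaces a simple cut-at-first-bad-base rule with a prefix score: +1 for each base at or above clip_quality, -9 for each base below, and the read is cut at the best-scoring prefix, so one stray bad base does not end a read but a run of them does. The scoring on its own (best_prefix_end is our name):

def best_prefix_end(qual, clip_quality):
    # qual: quality string; clip_quality: threshold character.
    best_end = best_score = score = 0
    for i in xrange(len(qual)):
        if qual[i] >= clip_quality:
            score += 1
        else:
            score -= 9  # a low-quality base must be paid for by nine good ones
        if score > best_score:
            best_score, best_end = score, i + 1
    return best_end  # clip the read to seq[:best_end], qual[:best_end]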
Example #15
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))

        for pair_filenames in self.pairs:
            assert len(pair_filenames
                       ) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.extend(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'
                                  ] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            prefix = read_in_fragment_names[i]

            if start_clips:
                summarize_clips(prefix, 'start', start_clips[i])

            if end_clips:
                summarize_clips(prefix, 'end', end_clips[i])

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
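Example 15's quality clip instead keeps the longest run of bases at or above the quality cutoff (and unambiguous, when clip_ambiguous is set): start marks the beginning of the current run, and best_start/best_len remember the best run closed so far. In isolation, ignoring trim_start and trim_end (longest_good_run is our name):

def longest_good_run(seq, qual, cutoff_char):
    start = best_start = best_len = 0
    for j in xrange(len(seq)):
        if qual[j] < cutoff_char or seq[j] not in 'ACGT':
            if best_len < j - start:
                best_start, best_len = start, j - start
            start = j + 1  # this run is broken; the next one starts after j
    if best_len < len(seq) - start:   # the final run may be the longest
        best_start, best_len = start, len(seq) - start
    return seq[best_start:best_start + best_len]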
Example #16
def make_ambiguity_bigwig_by_readname(prefix, bam_filenames, stop_after=None, subsample=1):
    #import pysam
    
    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])
    
    with open(prefix+"-chrom.sizes","wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"],entry["LN"]))
    
    chrom_names = [ entry["SN"] for entry in header["SQ"] ]
    chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ]

    #alf.close()

    unambiguous = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])
    total = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])
    
    old = grace.status("Ambiguity bigwig")

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0
        
        sub = subsample-1
        for (key,items) in itertools.groupby(alf, lambda item: item.query_name):
            sub = (sub + 1) % subsample
            if sub: continue
        
            items = [ item for item in items if not item.is_unmapped and not item.is_supplementary ]
            if not items:
                continue
            
            # Only use top scoring alignments
            AS = [ item.get_AS() for item in items ]
            best_AS = max(AS)
            items = [ item for item, this_AS in zip(items,AS) if this_AS >= best_AS ]
        
            for item in items:
                #spanner = fragment_split_coverage([item])
                spanner = fragment_coverage([item])        #TODO fixme when blocks available
                spanner = scale_spanner(1.0/len(items), spanner)
                total[item.reference_name].add(spanner)
                if len(items) == 1:
                    unambiguous[item.reference_name].add(spanner)
                
            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n))
        
        alf.close()


    ambiguities = [ ]
    for i in xrange(len(total)):
        u = unambiguous[chrom_names[i]].get()
        t = map_spanner(lambda x: x*1j, total[chrom_names[i]].get())
        c = pile([u,t],initial=0.0)
        c = map_spanner(lambda x: max(0.0,x.imag-x.real)/max(x.imag,1.0), c)
        ambiguities.append(c)

    bedgraph(prefix+".bedgraph", zip(chrom_names, [ item for item in ambiguities ]))
    subprocess.check_call([
        "wigToBigWig",prefix+".bedgraph",prefix+"-chrom.sizes",prefix+".bw"])
    os.unlink(prefix+".bedgraph")
    os.unlink(prefix+"-chrom.sizes")
    
    grace.status(old)
Example #17
def make_ambiguity_bigwig(prefix, bam_filenames, stop_after=None, subsample=1):
    #import pysam
    
    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])
    
    with open(prefix+"-chrom.sizes","wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"],entry["LN"]))
    
    chrom_names = [ entry["SN"] for entry in header["SQ"] ]
    chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ]

    #alf.close()

    unambiguous = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])
    total = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0
        
        sub = subsample-1
        for item in alf:
            if item.is_unmapped or item.is_supplementary:
                continue
        
            sub = (sub + 1) % subsample
            if sub: continue
        
            #spanner = fragment_split_coverage([item])
            spanner = fragment_coverage([item])        #TODO fixme when blocks available
            total[item.reference_name].add(spanner)
            
            NH = 1
            for item2 in item.extra:
                if item2.startswith("NH:i:"):
                    NH = int(item2[5:])
            if NH == 1:
                unambiguous[item.reference_name].add(spanner)
                
            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0: print prefix, filename, grace.pretty_number(n)
        
        alf.close()


    ambiguities = [ ]
    for i in xrange(len(total)):
        u = unambiguous[chrom_names[i]].get()
        t = map_spanner(lambda x: x*1j, total[chrom_names[i]].get())
        c = pile([u,t],initial=0.0)
        c = map_spanner(lambda x: max(0.0,x.imag-x.real)/max(x.imag,1.0), c)
        ambiguities.append(c)

    bedgraph(prefix+".bedgraph", zip(chrom_names, [ item for item in ambiguities ]))
    subprocess.check_call([
        "wigToBigWig",prefix+".bedgraph",prefix+"-chrom.sizes",prefix+".bw"])
    os.unlink(prefix+".bedgraph")
    os.unlink(prefix+"-chrom.sizes")
Example #18
def make_bigwig(prefix, bam_filenames, make_spanner, fragments=False, stop_after=None, scale=1.0, polya=False):
    have_pysam = False
    try:
        import pysam
        have_pysam = True
    except ImportError:
        pass
    
    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])
    
    with open(prefix+"-chrom.sizes","wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"],entry["LN"]))
    
    chrom_names = [ entry["SN"] for entry in header["SQ"] ]
    chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ]

    #alf.close()

    forward = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])
    reverse = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ])
    
    old = grace.status("Bigwig")

    for filename in bam_filenames:
        if have_pysam:
            alf = pysam.AlignmentFile(filename)
        else:
            alf = sam.Bam_reader(filename)
        
        n = 0
        
        if not fragments:
            for item in alf:
                if item.is_unmapped or item.is_secondary or item.is_supplementary:
                    continue
                    
                if polya and not alignment_is_polya(item):
                    continue
            
                # Assume --> <-- oriented read pairs
                which = forward if bool(item.is_reverse) == bool(item.is_read2) else reverse
                which[item.reference_name].add( make_spanner(item) )
                    
                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n))
        
        else:
            for item in iter_fragments(alf):
                if polya and not any(alignment_is_polya(al) for al in item):
                    continue
            
                # Assume --> <-- oriented read pairs
                which = forward if bool(item[0].is_reverse) == bool(item[0].is_read2) else reverse
                which[item[0].reference_name].add( make_spanner(item) )
                    
                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n))
        
        if have_pysam:
            alf.close()

    bedgraph(prefix+"-fwd.bedgraph", zip(chrom_names, [ scale_spanner(scale, forward[item].get()) for item in chrom_names ]))
    subprocess.check_call([
        "wigToBigWig",prefix+"-fwd.bedgraph",prefix+"-chrom.sizes",prefix+"-fwd.bw"])
    os.unlink(prefix+"-fwd.bedgraph")
        
    bedgraph(prefix+"-rev.bedgraph", zip(chrom_names, [ scale_spanner(scale, reverse[item].get()) for item in chrom_names ]))    
    subprocess.check_call([
        "wigToBigWig",prefix+"-rev.bedgraph",prefix+"-chrom.sizes",prefix+"-rev.bw"])
    os.unlink(prefix+"-rev.bedgraph")
    
    os.unlink(prefix+"-chrom.sizes")
    
    grace.status(old)
Example #19
def make_ambiguity_bigwig_by_readname(prefix,
                                      bam_filenames,
                                      stop_after=None,
                                      subsample=1):
    #import pysam

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

    #alf.close()

    unambiguous = dict([(i, Piler(j))
                        for i, j in zip(chrom_names, chrom_sizes)])
    total = dict([(i, Piler(j)) for i, j in zip(chrom_names, chrom_sizes)])

    old = grace.status("Ambiguity bigwig")

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0

        sub = subsample - 1
        for (key, items) in itertools.groupby(alf,
                                              lambda item: item.query_name):
            sub = (sub + 1) % subsample
            if sub: continue

            items = [
                item for item in items
                if not item.is_unmapped and not item.is_supplementary
            ]
            if not items:
                continue

            # Only use top scoring alignments
            AS = [item.get_AS() for item in items]
            best_AS = max(AS)
            items = [
                item for item, this_AS in zip(items, AS) if this_AS >= best_AS
            ]

            for item in items:
                #spanner = fragment_split_coverage([item])
                spanner = fragment_coverage(
                    [item])  #TODO fixme when blocks available
                spanner = scale_spanner(1.0 / len(items), spanner)
                total[item.reference_name].add(spanner)
                if len(items) == 1:
                    unambiguous[item.reference_name].add(spanner)

            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0:
                grace.status(
                    os.path.basename(prefix) + " " + filename + " " +
                    grace.pretty_number(n))

        alf.close()

    ambiguities = []
    for i in xrange(len(total)):
        u = unambiguous[chrom_names[i]].get()
        t = map_spanner(lambda x: x * 1j, total[chrom_names[i]].get())
        c = pile([u, t], initial=0.0)
        c = map_spanner(lambda x: max(0.0, x.imag - x.real) / max(x.imag, 1.0),
                        c)
        ambiguities.append(c)

    bedgraph(prefix + ".bedgraph",
             zip(chrom_names, [item for item in ambiguities]))
    subprocess.check_call([
        "wigToBigWig", prefix + ".bedgraph", prefix + "-chrom.sizes",
        prefix + ".bw"
    ])
    os.unlink(prefix + ".bedgraph")
    os.unlink(prefix + "-chrom.sizes")

    grace.status(old)
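Both ambiguity functions pile two depth tracks at once by packing them into complex numbers: unambiguous depth stays in the real part, total depth is rotated into the imaginary part with *1j, and after summing, each position's ambiguity is the ambiguous fraction of total coverage. The per-position arithmetic on plain lists (ambiguity_track is our name):

def ambiguity_track(unambiguous, total):
    # real part = unambiguous depth, imaginary part = total depth
    packed = [u + t * 1j for u, t in zip(unambiguous, total)]
    # ambiguous fraction of coverage, clamped to >= 0 and safe at zero depth
    return [max(0.0, x.imag - x.real) / max(x.imag, 1.0) for x in packed]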
Example #20
def make_ambiguity_bigwig(prefix, bam_filenames, stop_after=None, subsample=1):
    #import pysam

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

    #alf.close()

    unambiguous = dict([(i, Piler(j))
                        for i, j in zip(chrom_names, chrom_sizes)])
    total = dict([(i, Piler(j)) for i, j in zip(chrom_names, chrom_sizes)])

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0

        sub = subsample - 1
        for item in alf:
            if item.is_unmapped or item.is_supplementary:
                continue

            sub = (sub + 1) % subsample
            if sub: continue

            #spanner = fragment_split_coverage([item])
            spanner = fragment_coverage([item
                                         ])  #TODO fixme when blocks available
            total[item.reference_name].add(spanner)

            NH = 1
            for item2 in item.extra:
                if item2.startswith("NH:i:"):
                    NH = int(item2[5:])
            if NH == 1:
                unambiguous[item.reference_name].add(spanner)

            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0: print prefix, filename, grace.pretty_number(n)

        alf.close()

    ambiguities = []
    for i in xrange(len(total)):
        u = unambiguous[chrom_names[i]].get()
        t = map_spanner(lambda x: x * 1j, total[chrom_names[i]].get())
        c = pile([u, t], initial=0.0)
        c = map_spanner(lambda x: max(0.0, x.imag - x.real) / max(x.imag, 1.0),
                        c)
        ambiguities.append(c)

    bedgraph(prefix + ".bedgraph",
             zip(chrom_names, [item for item in ambiguities]))
    subprocess.check_call([
        "wigToBigWig", prefix + ".bedgraph", prefix + "-chrom.sizes",
        prefix + ".bw"
    ])
    os.unlink(prefix + ".bedgraph")
    os.unlink(prefix + "-chrom.sizes")
Example #21
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
        #is_core = (what == 'core') 
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core') 
        
        workspace = self.get_workspace()
        
        for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            
            good = [ True ] * len(seq)
            
            for working_dir in self.working_dirs:
                if is_core:
                   suffix = '-depth.userplot'
                else:
                   suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                   if good[i]:
                       if is_core:
                           good[i] = data[i] >= self.mincov
                       else:
                           good[i] = data[i] < self.mincov
    
            # Close holes: a run of up to maxdiff non-good bases flanked by
            # good bases is marked good.
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i-start <= self.maxdiff:
                        for j in xrange(start,i): good[j] = True
                        n_holes += 1
                    start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')
            
            
            f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else 'N')
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else seq[i].lower())
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
            f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
            start = 0
            n_good = [0]          #In lists so emit() can write to them
            n_good_bases = [0]
            def emit(i):
                if i-start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i-start
                io.write_fasta(
                    f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start+1,i),
                    seq[start:i]
                )
            for i in xrange(1,len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()
            
            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
            self.log.log('\n')
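
The hole-closing pass is worth seeing in isolation: any run of at most maxdiff non-good bases that is flanked by good bases is flipped to good, and the initial sentinel value of start guarantees a leading bad run is never closed. A minimal standalone sketch (the function name close_holes is made up for illustration):

# Sketch: close short "holes" in a boolean goodness mask.
def close_holes(good, maxdiff):
    good = list(good)
    start = -maxdiff - 1              # start of the current candidate hole
    n_holes = 0
    for i in range(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in range(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1             # a new hole can begin after this base
    return good, n_holes

print(close_holes([True, False, False, True, False, False, False, True], 2))
# ([True, True, True, True, False, False, False, True], 1)
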
Example No. 22
def bam_iter_fragments(filename, status_text='Processing'):
    # Alignments must be grouped by read name (e.g. a name-sorted BAM):
    # fragments are formed by itertools.groupby on qname.
    reader = Bam_reader(filename)
    
    old_status = grace.status(status_text)

    n = 0
    n_ambiguous = 0
    for read_name, alignment_iter in itertools.groupby(reader, lambda read: read.qname):
        if n % 100000 == 0:
            grace.status(status_text + ' fragment %s' % grace.pretty_number(n))
        n += 1
        
        unpaired = [ ]
        first = [ ]
        second = [ ]
        unmapped = [ ]
        for al in alignment_iter:
            if al.flag&FLAG_UNMAPPED:
                unmapped.append(al)            
            elif not al.flag&FLAG_PAIRED or al.flag&FLAG_MATE_UNMAPPED:
                unpaired.append((al,))
            elif al.flag&FLAG_FIRST:
                first.append(al)
            elif al.flag&FLAG_SECOND:
                second.append(al)
            else:
                assert False, 'Read in pair that is neither first nor second'
        
        pairs = [ ]
        
        unused = set(first + second)
        
        second_index = { }
        for al in second:
            key = (al.rname, al.pos)
            if key not in second_index: second_index[key] = [ ]
            second_index[key].append(al)
        
        for al1 in first:
            key = (al1.get_mrnm(), al1.mpos)
            for al2 in second_index.get(key, ()):
                if al2.get_mrnm() != al1.rname or \
                   al2.mpos != al1.pos:
                    continue
                
                if al1 not in unused or al2 not in unused:
                    # pfh says: I am displeased that the pairing is sometimes ambiguous
                    n_ambiguous += 1
                    continue
                
                pairs.append( (al1, al2) )
                unused.remove(al1)
                unused.remove(al2)
        
        if unused:
            print unused
        assert not unused, 'Alignment pairing not even pretending to make sense. Is the BAM file sorted by read name?'
        
        yield read_name, pairs + unpaired, unmapped
    
    grace.status(old_status)
    if n_ambiguous:
        print >> sys.stderr 
        print >> sys.stderr, 'The alignment pairing was unclear %s times, and alignments were paired arbitrarily.' % grace.pretty_number(n_ambiguous)
        print >> sys.stderr, 'Blame the SAM format.'
        print >> sys.stderr 
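
The pairing above is a coordinate cross-check: second-of-pair alignments are indexed by (rname, pos), each first-of-pair alignment looks up its mate coordinates (mrnm, mpos) in that index, and a candidate is accepted only if its own mate coordinates point back at the first alignment. A minimal sketch with a stand-in record type; Al and pair_up are hypothetical names, not part of nesoni.

# Sketch: pair first/second alignments by mutual coordinate cross-check.
import collections

Al = collections.namedtuple('Al', 'rname pos mrnm mpos')

def pair_up(first, second):
    index = collections.defaultdict(list)
    for al in second:
        index[(al.rname, al.pos)].append(al)
    pairs = []
    used = set()
    for al1 in first:
        for al2 in index.get((al1.mrnm, al1.mpos), ()):
            if (al2.mrnm, al2.mpos) != (al1.rname, al1.pos):
                continue              # mate does not point back, not a pair
            if id(al1) in used or id(al2) in used:
                continue              # already paired; pairing was ambiguous
            pairs.append((al1, al2))
            used.add(id(al1))
            used.add(id(al2))
    return pairs

a = Al('chr1', 100, 'chr1', 250)
b = Al('chr1', 250, 'chr1', 100)
print(pair_up([a], [b]))              # [(a, b)]
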
Example No. 23
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()
        
    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')
    
    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)
        
    input_reference_filenames = [ ]
    reads_filenames = [ ]
    
    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)
    
    output_dir = [ ]  #As list so can write to from function. Gah.
    
    def front_command(args):
        grace.expect_no_further_options(args)
        
        if len(args) < 1:
            return
        
        output_dir.append(args[0])        
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])
    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args])
    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])
    def shrimp_options_command(args):
        shrimp_options.extend(args)
    
    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)
    
    
    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1
    
    output_dir = output_dir[0]
    
    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'
    
    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    
    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'
    
    
    reference_filename = os.path.join(output_dir,'reference.fa')
    reference_file = open(reference_filename,'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)
            
            total_reference_sequences += 1
            total_reference_bases += len(sequence)
            
    reference_file.close()
    
    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '')
    
    assert total_reference_bases, 'Reference sequence file is empty' 
    
    config = {
        'references' : input_reference_filenames,
        'reads' : reads_filenames,
        'stride' : stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()
    
    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')
    
    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')
    
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)
    
    #warn_low_threshold = True
    
    try: #Cleanup temporary files
        
        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1
            
            tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number))
            tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number))
            
            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)
            
            f = open(tempname,'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()
        
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command)
            #print 'SHRiMP %d running' % my_number
            
            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'
                
                hits = { } # read_name -> [ hit line ]
                
                f = open(tempname_out,'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None,1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()
                                
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()
        
                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number
            return finalize
        
        
        shrimps = [ ]
        
        reader = iter_reads(config)
        read_count = 0
        
        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]                
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)
                
                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')                    
                #    warn_low_threshold = False
            
                read_count += 1
                if read_set_bases >= batch_size: break
                
            if not read_set: break
        
            if len(shrimps) >= max_shrimps:
                #At capacity: wait for the oldest SHRiMP to finish
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )
            
            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))
        
        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) )
            shrimps.pop(0)()
        
        grace.status('')
        
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)
        
        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Example No. 24
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique,
                                        'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'First argument looks like a working directory; expected an output directory'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(
            path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)),
                 'wb')
        io.write_fasta(
            f, name,
            ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))]))
        f.close()

        f = open(
            path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)),
            'wb')
        io.write_fasta(
            f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                              for i in xrange(len(seq))]))
        f.close()

        f_good = open(
            path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)),
            'wb')
        f_nongood = open(
            path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)),
            'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i), seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(
            sum(good)), 'bases are ' + what + ', of', grace.pretty_number(
                len(seq)), 'in reference sequence'
        print grace.pretty_number(
            n_good[0]), 'parts at least', grace.pretty_number(
                minsize), 'bases long with', grace.pretty_number(
                    n_good_bases[0]), 'total bases'

        print
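
Writing the *_parts.fa files amounts to segmenting the good/bad mask into maximal runs and keeping only runs of at least --minsize bases. A minimal sketch that returns half-open (start, end, is_good) intervals; the function name runs is made up for illustration.

# Sketch: maximal runs of equal goodness, dropping runs shorter than minsize.
def runs(good, minsize):
    parts = []
    start = 0
    for i in range(1, len(good) + 1):
        if i == len(good) or good[i] != good[start]:
            if i - start >= minsize:
                parts.append((start, i, good[start]))
            start = i
    return parts

print(runs([True]*5 + [False]*2 + [True]*3, minsize=3))
# [(0, 5, True), (7, 10, True)] -- the 2-base bad run is dropped
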
Example No. 25
def recombination(args):
    grace.expect_no_further_options(args)
    if len(args) != 2:
        print >> sys.stderr, USAGE
        raise grace.Help_shown()

    working_dir, seq_name = args

    references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa')))
    
    depth = { }
    prefixes = { }
    suffixes = { }
    for name in references:
        depth[name] = numpy.zeros(len(references[name]), 'int64')
        prefixes[name] = [ [] for base in references[name] ]
        suffixes[name] = [ [] for base in references[name] ]
    def register_divergence(hit):
        if not hit.query_forward:
            hit = hit.reversed()
        
        margin = 20
        
        if hit.target_end - hit.target_start < 20: 
            return False
        
        depth[hit.target_name][hit.target_start : hit.target_end] += 1

        any_divergence = False        #renamed from `any`, which shadowed the builtin
        
        if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq):
            suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] )
            any_divergence = True
        
        if hit.query_start >= margin: # and hit.target_start > 0:
            prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] )
            any_divergence = True
        
        return any_divergence

    n = 0
    for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir):
        # Skip reads containing Ns
        if 'N' in read_seq: continue
    
        for line in hits:
            register_divergence(alignment_from_shrimp(line, references, read_name, read_seq))
        
        n += 1
        #if n > 100000:
        #    break
            
        if n%10000 == 0:
            grace.status('Processing read %s' % grace.pretty_number(n))

    grace.status('')

    
    def show_items(items):
        original_length = len(items)
        cut = 0
        while len(items) > 80:
            cut += 1
            items = [ item for item in items if item[0] >= cut ]
        for item in items:
            print item[1]
        if len(items) < original_length:
            print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) 
    
    def score(items):
        #Simpson-type concentration: sum(n_i^2)/(sum n_i)^2, the probability
        #that two randomly chosen sequences fall in the same group.
        if not items: return 1.0
        return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2)
    
    def summarize_prefixes(seqs, pad):
        seqs = sorted(seqs, key=lambda seq: seq[::-1])
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]):
                ss = list(iterator)
                anylong = any( item != seq for item in ss )            
                n = len(ss)
                items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) )
            
            if score(items) >= 1.0/20: break
            cut -= 1
        
        show_items(items)
        
    def summarize_suffixes(seqs, pad):
        seqs = sorted(seqs)
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]):
                ss = list(iterator)            
                anylong = any( item != seq for item in ss )            
                n = len(ss)
                items.append( (n, ('%'+str(pad)+'s')%('%d x '%n) + seq + ('...' if anylong else '')) )
            
            if score(items) >= 1.0/20: break
            cut -= 1
        show_items(items)
    
    print 'Position        Depth        Changed prefixes             Changed suffixes'
    print '                          Count    % of depth       Count    % of depth'
    for i in xrange(len(references[seq_name])):
        print '%8d   %10d %9d  %11s       %9d  %11s' % (
            i+1,
            depth[seq_name][i],
            len(prefixes[seq_name][i]),
            '%.3f%%' % (len(prefixes[seq_name][i])*100.0/depth[seq_name][i]) if prefixes[seq_name][i] else '',  
            len(suffixes[seq_name][i]),
            '%.3f%%' % (len(suffixes[seq_name][i])*100.0/depth[seq_name][i]) if suffixes[seq_name][i] else '')
        #summarize_suffixes(suffixes[name][i], references[name][i+1:], references[name], suffix_depth[name][i])

    print
    print 'Details'
    print
    for i in xrange(len(references[seq_name])):
        print '%-80s*' % ('Base %d' % (i+1))
        
        print pad_slice(references[seq_name], i-80,i+1+80)        
        summarize_prefixes(prefixes[seq_name][i], 80)
        summarize_suffixes(suffixes[seq_name][i], 81)
        print
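
The score() used by summarize_prefixes and summarize_suffixes is a Simpson-type concentration index: for group sizes n_i it computes sum(n_i^2) / (sum n_i)^2, the probability that two randomly chosen sequences fall in the same group, and the grouping key is shortened until this reaches 1/20. A standalone sketch on bare counts:

# Sketch: Simpson-type concentration of a grouping, as in score() above.
def concentration(counts):
    if not counts:
        return 1.0
    total = float(sum(counts))
    return sum(n * n for n in counts) / (total * total)

print(concentration([10]))        # 1.0  -- one group, fully concentrated
print(concentration([1] * 100))   # 0.01 -- 100 singletons, below the 1/20 cutoff
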
Example No. 26
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        min_quality = chr(33+self.quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    best_score = 0
                    best_a_start = len(seq)
                    best_a_end = len(seq)
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = len(seq)
                    best_aonly_end = len(seq)
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(len(seq)):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                                            
                            score = aonly_score
                            adaptor_bases = 0
                            for i in xrange(a_end,min(a_end+len(self.adaptor),len(seq))):
                                if qual[i] >= min_quality:
                                    if seq[i] == self.adaptor[i-a_end]:
                                        score += 1
                                        adaptor_bases += 1
                                    else:
                                        score -= 4
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases
                        
                            if a_end >= len(seq): break
                            if qual[a_end] >= min_quality:
                                if seq[a_end] == 'A':
                                    aonly_score += 1
                                else:
                                    aonly_score -= 4
                                    if aonly_score <= 0: break
                            a_end += 1
                        
                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 'X' if item<min_quality else ' ' for item in qual )
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor
                        print ' '*aonly_start + 'A'*(aonly_end-aonly_start)
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start > self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
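
The clip point is chosen by scoring: bases past a candidate poly(A) end are compared to the adaptor, +1 for a match and -4 for a mismatch, on top of the same +1/-4 scoring of the A-run itself. A minimal sketch of just the adaptor half, ignoring base quality; adaptor_score is a made-up name and the sequences below are examples only.

# Sketch: +1/-4 adaptor match scoring from position a_end onward.
def adaptor_score(seq, a_end, adaptor):
    score = 0
    matched = 0
    for i in range(a_end, min(a_end + len(adaptor), len(seq))):
        if seq[i] == adaptor[i - a_end]:
            score += 1
            matched += 1
        else:
            score -= 4
    return score, matched

print(adaptor_score('TTTAAAAAGATCGG', 8, 'GATCGGAAGAGC'))   # (6, 6)
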
Example No. 27
    def run(self):
        log = self.log
        
        prefix = self.prefix
        log_name = os.path.split(prefix)[1]
        
        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects
    
        iterators = [ ]        
        filenames = [ ]
        any_paired = False
        
        for filename in self.reads:
            filenames.append(filename)
            iterators.append(itertools.izip(
                 io.read_sequences(filename, qualities=True)
            ))
        
        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(itertools.izip(
                io.read_sequences(pair_filenames[0], qualities=True),
                io.read_sequences(pair_filenames[1], qualities=True)
            ))
        
        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(deinterleave(
                io.read_sequences(filename, qualities=True)
            ))
        
        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
        
        assert iterators, 'Nothing to clip'
        
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
        if qoffset is None:
            guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)    
    
        quality_cutoff_char = chr(qoffset + quality_cutoff)
        
        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)
        
        adaptor_seqs = [ ]
        adaptor_names = [ ]
        if self.adaptor_clip:
            if self.adaptor_file:
                adaptor_iter = io.read_sequences(self.adaptor_file)
            else:
                adaptor_iter = ADAPTORS
            for name, seq in adaptor_iter:
                seq = seq.upper().replace('U','T')
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
        
        start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
        end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq
    
        f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
        if fragment_reads == 2:
            names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
            f_paired = map(io.open_possibly_compressed_writer, names)
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
        
        n_single = 0
        n_paired = 0
        
        n_in_single = 0
        n_in_paired = 0
        total_in_length = [ 0 ] * fragment_reads
        
        n_out = [ 0 ] * fragment_reads
        n_q_clipped = [ 0 ] * fragment_reads
        n_a_clipped = [ 0 ] * fragment_reads
        n_homopolymers = [ 0 ] * fragment_reads
        total_out_length = [ 0 ] * fragment_reads
        
        #log.attach(open(prefix + '_log.txt', 'wb'))
        
        for iterator in iterators:
          for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))
        
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]
                
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start
        
                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') ) 
                    continue
        
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
            
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]    
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]    
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
    
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') ) 
                    continue
        
                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
    
            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
             
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
            
                if len(graduates) == 1:
                    n_single += 1

                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1

                    # Write the pair to an interleaved file or separate l/r files
                    for (lr,(name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)
                
        
        grace.status('')
        
        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            map(lambda f: f.close(), f_paired)
        f_single.close()
        
        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location, total) 
            
            if not clips:
                return
    
            for i in xrange(min(clips), max(clips)+1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)
                    
                    counts = collections.defaultdict(int)
                    for item2 in item: counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,key=lambda item2:counts[item2],reverse=True)[:2]:
                        log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')
                    
                log.quietly_log('\n')
            log.quietly_log('\n')


        if n_in_paired:
            log.datum(log_name,'read-pairs', n_in_paired)                      
        if n_in_single:
            log.datum(log_name,'single reads', n_in_single)                      
        
        for i in xrange(fragment_reads):
            prefix = read_in_fragment_names[i]
            
            if start_clips:
                summarize_clips(prefix, 'start', start_clips[i])
        
            if end_clips:
                summarize_clips(prefix, 'end', end_clips[i])
            
            log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',  float(total_in_length[i]) / (n_in_single+n_in_paired))                     
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])                     
        
        if fragment_reads == 2:
            log.datum(log_name,'pairs kept after clipping', n_paired)                      
        log.datum(log_name, 'reads kept after clipping', n_single)
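
Quality clipping keeps the longest stretch containing no base below the quality cutoff and, when clip_ambiguous is set, no non-ACGT base. A minimal sketch of that scan, without the trim_start/trim_end handling; longest_clean_run is a made-up name.

# Sketch: longest run free of low-quality or ambiguous bases.
def longest_clean_run(seq, qual, cutoff_char, clip_ambiguous=True):
    start = 0
    best_start = 0
    best_len = 0
    for j in range(len(seq)):
        if qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT'):
            if j - start > best_len:
                best_start, best_len = start, j - start
            start = j + 1
    if len(seq) - start > best_len:
        best_start, best_len = start, len(seq) - start
    return seq[best_start:best_start + best_len]

print(longest_clean_run('ACGTNACGTACGT', 'IIIII!IIIIIII', chr(33 + 10)))
# 'CGTACGT' -- the stretch after the 'N' and the low-quality base
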
Example No. 28
    def run(self):

        assert self.what in (
            'core',
            'unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        for name, seq in io.read_sequences(
                working_directory.Working(self.working_dirs[0]).get_reference(
                ).reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)

            good = [True] * len(seq)

            for working_dir in self.working_dirs:
                if is_core:
                    suffix = '-depth.userplot'
                else:
                    suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            #Close holes
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                     'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else 'N')
                                  for i in xrange(len(seq))]))
            f.close()

            f = open(
                workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                  for i in xrange(len(seq))]))
            f.close()

            f_good = open(
                workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            f_nongood = open(
                workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]

            def emit(i):
                if i - start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i - start
                io.write_fasta(f_good if good[start] else f_nongood,
                               '%s:%d..%d' % (name, start + 1, i),
                               seq[start:i])

            for i in xrange(1, len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(len(seq)) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good[0]) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases[0]) + ' total bases\n')
            self.log.log('\n')
Example No. 29
def bam_iter_fragments(filename, status_text='Processing'):
    reader = Bam_reader(filename)

    n = 0
    n_ambiguous = 0
    for read_name, alignment_iter in itertools.groupby(reader, lambda read: read.qname):
        if n % 100000 == 0:
            grace.status(status_text + ' fragment %s' % grace.pretty_number(n))
        n += 1
        
        unpaired = [ ]
        first = [ ]
        second = [ ]
        unmapped = [ ]
        for al in alignment_iter:
            if al.flag&FLAG_UNMAPPED:
                unmapped.append(al)            
            elif not al.flag&FLAG_PAIRED or al.flag&FLAG_MATE_UNMAPPED:
                unpaired.append((al,))
            elif al.flag&FLAG_FIRST:
                first.append(al)
            elif al.flag&FLAG_SECOND:
                second.append(al)
            else:
                assert False, 'Read in pair that is neither first nor second'
        
        pairs = [ ]
        
        unused = set(first + second)
        
        second_index = { }
        for al in second:
            key = (al.rname, al.pos)
            if key not in second_index: second_index[key] = [ ]
            second_index[key].append(al)
        
        for al1 in first:
            key = (al1.get_mrnm(), al1.mpos)
            for al2 in second_index.get(key, ()):
                if al2.get_mrnm() != al1.rname or \
                   al2.mpos != al1.pos:
                    continue
                
                if al1 not in unused or al2 not in unused:
                    # pfh says: I am displeased that the pairing is sometimes ambiguous
                    n_ambiguous += 1
                    continue
                
                pairs.append( (al1, al2) )
                unused.remove(al1)
                unused.remove(al2)
        
        if unused:
            print unused
        assert not unused, 'Alignment pairing not even pretending to make sense. Is the BAM file sorted by read name?'
        
        yield read_name, pairs + unpaired, unmapped
    
    grace.status('')
    if n_ambiguous:
        print >> sys.stderr 
        print >> sys.stderr, 'The alignment pairing was unclear %s times, and alignments were paired arbitrarily.' % grace.pretty_number(n_ambiguous)
        print >> sys.stderr, 'Blame the SAM format.'
        print >> sys.stderr 
Example No. 30
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        #ignore_quality = chr(33+self.ignore_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
                    best_score = self.min_score-1
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(len(seq)):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor,
                            ## at least until the end of good quality sequence.
                            # However if there is evidence of the adaptor beyond
                            # the end of good quality, we still want to know that,
                            # and count it towards the number of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
                            abort_score = best_score-len(self.adaptor)
                            abort_i = min(len(seq), a_end+len(self.adaptor))
                            while score >= abort_score:
                                #if (score > best_score and 
                                #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                if score > best_score:
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= abort_i:
                                    break
                            
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                                i += 1
                                
                            #if a_end >= len(seq): break
                            
                            # Modified 2018-03-21
                            # poly(A) tail only within good quality region.
                            #if a_end >= good_quality_end: break
                            #if qual[a_end] >= ignore_quality:
                            #    if seq[a_end] == 'A':
                            #        aonly_score += 1
                            #    else:
                            #        aonly_score -= 4
                            #        if aonly_score <= 0: break

                            if a_end >= len(seq): break

                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else: #if qual[a_end] >= ignore_quality:
                                aonly_score -= 4
                            #else:
                            #    aonly_score -= 1                       

                            a_end += 1
                    
                    # 2018-03-21:
                    # Only call a tail if it starts within the good-quality
                    # region; otherwise report no tail at all.
                    if best_a_start > good_quality_end:
                        best_a_start = good_quality_end
                        best_a_end = good_quality_end
                        best_adaptor_bases = 0
                        best_score = 0

                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 
                            ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start >= self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
                    
                    # Option to do a quick subsample
                    if self.only and self.only <= n:
                        break
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
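
Unlike the earlier clip-runs variant, this one first estimates good_quality_end as the prefix maximizing a running score of +1 per base at or above the quality cutoff and -9 per base below it, and refuses to call a tail starting beyond that point. A minimal sketch of the prefix scan; good_quality_end as a free function is made up for illustration.

# Sketch: choose the prefix with the best +1/-9 quality score.
def good_quality_end(qual, cutoff_char):
    best = 0
    end = 0
    score = 0
    for i, q in enumerate(qual):
        score += 1 if q >= cutoff_char else -9
        if score > best:
            best = score
            end = i + 1
    return end

print(good_quality_end('IIIIII!!I', chr(33 + 20)))   # 6
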
Example No. 31
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'First argument looks like a working directory; expected an output directory'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i-start <= maxdiff:
                    for j in xrange(start,i): good[j] = True
                    n_holes += 1
                start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
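        # Three output views of the reference: non-good bases replaced by N,
        # non-good bases lowercased, and contiguous parts of at least minsize
        # bases split into good / non-good FASTA files.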
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]    
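        # Single-element lists let emit() mutate the counters from the
        # enclosing scope (Python 2 has no nonlocal).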
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
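The hole-closing pass above can be exercised in isolation; a self-contained sketch of the same logic (the function name and toy mask are illustrative, not part of the original):

def close_holes(good, maxdiff):
    # Fill any run of non-good positions no longer than maxdiff that is
    # flanked by good positions, exactly as the loop above does.
    good = list(good)
    start = -maxdiff - 1   # one past the last good position seen so far
    n_holes = 0
    for i in range(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in range(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1
    return good, n_holes

mask = [True, False, True, False, False, False, True]
print(close_holes(mask, 2))
# ([True, True, True, False, False, False, True], 1) -- the single-base
# hole is filled, the three-base hole (longer than maxdiff) is left open.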
Ejemplo n.º 32
0
def make_bigwig(prefix,
                bam_filenames,
                make_spanner,
                fragments=False,
                stop_after=None,
                scale=1.0,
                polya=False):
    have_pysam = False
    try:
        import pysam
        have_pysam = True
    except ImportError:
        pass

    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

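    # One Piler per chromosome and strand accumulates coverage spanners.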
    forward = dict((name, Piler(size)) for name, size in zip(chrom_names, chrom_sizes))
    reverse = dict((name, Piler(size)) for name, size in zip(chrom_names, chrom_sizes))

    old = grace.status("Bigwig")

    for filename in bam_filenames:
        if have_pysam:
            alf = pysam.AlignmentFile(filename)
        else:
            alf = sam.Bam_reader(filename)

        n = 0

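        # Two modes: pile individual alignments, or whole fragments
        # (alignments grouped into read pairs by iter_fragments).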
        if not fragments:
            for item in alf:
                if item.is_unmapped or item.is_secondary or item.is_supplementary:
                    continue

                if polya and not alignment_is_polya(item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item.is_reverse) == bool(
                    item.is_read2) else reverse
                which[item.reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0:
                    grace.status(
                        os.path.basename(prefix) + " " + filename + " " +
                        grace.pretty_number(n))

        else:
            for item in iter_fragments(alf):
                if polya and not any(alignment_is_polya(al) for al in item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item[0].is_reverse) == bool(
                    item[0].is_read2) else reverse
                which[item[0].reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0:
                    grace.status(
                        os.path.basename(prefix) + " " + filename + " " +
                        grace.pretty_number(n))

        if have_pysam:
            alf.close()

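    # Write each strand as bedgraph, convert with UCSC wigToBigWig, then
    # clean up the intermediate files.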
    bedgraph(
        prefix + "-fwd.bedgraph",
        zip(chrom_names, [
            scale_spanner(scale, forward[item].get()) for item in chrom_names
        ]))
    subprocess.check_call([
        "wigToBigWig", prefix + "-fwd.bedgraph", prefix + "-chrom.sizes",
        prefix + "-fwd.bw"
    ])
    os.unlink(prefix + "-fwd.bedgraph")

    bedgraph(
        prefix + "-rev.bedgraph",
        zip(chrom_names, [
            scale_spanner(scale, reverse[item].get()) for item in chrom_names
        ]))
    subprocess.check_call([
        "wigToBigWig", prefix + "-rev.bedgraph", prefix + "-chrom.sizes",
        prefix + "-rev.bw"
    ])
    os.unlink(prefix + "-rev.bedgraph")

    os.unlink(prefix + "-chrom.sizes")

    grace.status(old)
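A hedged usage sketch of make_bigwig: the spanner factory below is a stub, since the span objects Piler.add accepts are defined elsewhere in this module; the file names are illustrative, and wigToBigWig must be on the PATH.

def my_spanner(item):
    # Hypothetical: map an alignment (or, with fragments=True, a list of
    # alignments) to whatever span representation Piler.add expects.
    raise NotImplementedError

make_bigwig(
    'out/sample',        # produces out/sample-fwd.bw and out/sample-rev.bw
    ['sample.bam'],
    my_spanner,
    fragments=True,      # pile whole read pairs rather than single alignments
    stop_after=None,
    scale=1.0,
    polya=False)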