Example #1
0
    def run(self):
        """Reservoir-sample up to self.n sequences from the input files.

        Reads every sequence (with qualities) from self.filenames, keeps
        a uniform random sample of at most self.n of them, and writes the
        sample to stdout as FASTQ.
        """
        seqs = []
        seen = 0
        for filename in self.filenames:
            for seq in io.read_sequences(filename, qualities=True):
                seen += 1
                if seen % 100000 == 0:
                    grace.status('Scanned ' + grace.pretty_number(seen))

                if len(seqs) < self.n:
                    seqs.append(seq)
                elif random.random() * seen < self.n:
                    # Algorithm R: once the reservoir is full, replace a
                    # random entry with probability n/seen so every input
                    # sequence is equally likely to be kept.  (The original
                    # condition was inverted, biasing toward late reads.)
                    seqs[random.randrange(self.n)] = seq
        grace.status('')

        print >> sys.stderr, 'Sampled', grace.pretty_number(
            len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

        if not seqs: return
        # NOTE(review): len(seqs[0]) is 2 or 3, both truthy, so the else
        # branch below looks unreachable -- confirm read_sequences output.
        qualities = len(seqs[0])
        if qualities:
            for name, seq, qual in seqs:
                io.write_fastq(sys.stdout, name, seq, qual)
        else:
            for name, seq in seqs:
                io.write_fastq(sys.stdout, name, seq)
Example #2
0
 def convert(filename):
     """Return a FASTQ filename for *filename*, converting if necessary.

     A file that is already FASTQ (plain, gzip or bzip2 compressed) is
     returned unchanged; anything else is rewritten as a temporary FASTQ
     file whose name is returned.
     """
     # Already acceptable FASTQ?  Hand it straight to the caller.
     if selection.matches(
             'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
             io.get_file_info(filename)):
         return filename
     converted = tempname()
     with open(converted, 'wb') as out:
         reads = io.read_sequences(filename, qualities='required')
         for read_name, read_seq, read_qual in reads:
             io.write_fastq(out, read_name, read_seq, read_qual)
     return converted
Example #3
0
 def convert(filename):
     """Return a plain/gzip/bzip2 FASTQ file for *filename*.

     If the file already matches the FASTQ selector it is returned as-is;
     otherwise it is converted into a temporary FASTQ file and that
     temporary name is returned.
     """
     fastq_selector = (
         'type-fastq:[compression-none/compression-gzip/compression-bzip2]')
     file_info = io.get_file_info(filename)
     if selection.matches(fastq_selector, file_info):
         return filename
     out_name = tempname()
     out = open(out_name, 'wb')
     try:
         sequences = io.read_sequences(filename, qualities='required')
         for rec_name, rec_seq, rec_qual in sequences:
             io.write_fastq(out, rec_name, rec_seq, rec_qual)
     finally:
         out.close()
     return out_name
Example #4
0
    def run(self):
        """Shred each input sequence into overlapping windows.

        Emits a window of length self.size every self.stride bases
        (clipped at the sequence ends).  Output is FASTQ with a constant
        quality character when self.quality is set, FASTA otherwise.
        """
        f = self.begin_output()

        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                # Keep only the first whitespace-delimited token of the name.
                name_parts = name.split(None, 1)
                name = name_parts[0]
                # Start before zero so the first window is anchored at the
                # sequence start; each window is clipped to [0, len(seq)].
                for i in xrange(-self.size + self.stride, len(seq), self.stride):
                    start = max(0, min(len(seq), i))
                    end = max(0, min(len(seq), i + self.size))
                    # 1-based inclusive coordinates in the shred name.
                    shred_name = '%s:%d..%d' % (name, start + 1, end)
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end],
                                       chr(33 + self.quality) * (end - start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
Example #5
0
    def run(self):
        """Shred each input sequence into overlapping windows.

        Emits a window of length self.size every self.stride bases
        (clipped at the sequence ends).  Output is FASTQ with a constant
        quality character when self.quality is set, FASTA otherwise.
        """
        f = self.begin_output()

        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                # Keep only the first whitespace-delimited token of the name.
                name_parts = name.split(None, 1)
                name = name_parts[0]
                # Start before zero so the first window is anchored at the
                # sequence start; each window is clipped to [0, len(seq)].
                for i in xrange(-self.size + self.stride, len(seq),
                                self.stride):
                    start = max(0, min(len(seq), i))
                    end = max(0, min(len(seq), i + self.size))
                    # 1-based inclusive coordinates in the shred name.
                    shred_name = '%s:%d..%d' % (name, start + 1, end)
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end],
                                       chr(33 + self.quality) * (end - start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
Example #6
0
 def run(self):
     """Reservoir-sample up to self.n sequences from self.filenames and
     write the sample to stdout as FASTQ."""
     seqs = [ ]
     seen = 0
     for filename in self.filenames:
         for seq in io.read_sequences(filename, qualities=True):
             seen += 1
             if seen % 100000 == 0: grace.status('Scanned '+grace.pretty_number(seen))
             
             if len(seqs) < self.n:
                 seqs.append(seq)
             elif random.random() * seen < self.n:
                 # Algorithm R: replace a random reservoir entry with
                 # probability n/seen so the sample stays uniform.  (The
                 # original condition was inverted.)
                 seqs[ random.randrange(self.n) ] = seq
     grace.status('')
     
     print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
 
     if not seqs: return
     # NOTE(review): len(seqs[0]) is 2 or 3, both truthy, so the else
     # branch below looks unreachable -- confirm read_sequences output.
     qualities = len(seqs[0])
     if qualities:
         for name, seq, qual in seqs:
             io.write_fastq(sys.stdout, name, seq, qual)
     else:
         for name, seq in seqs:
             io.write_fastq(sys.stdout, name, seq)
Example #7
0
    def run(self):
        """Align reads to the reference with bowtie2, producing a sorted BAM.

        Pairs, interleaved files and single-end reads are first converted
        to (possibly compressed) FASTQ as needed, aligned with bowtie2,
        piped through 'samtools view' into a temporary BAM, and finally
        name-sorted into the workspace's 'alignments' BAM.
        """
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
    
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
        
        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)        
        reference = working.get_reference()
        
        log_file = open(self.log_filename(),'wb')
              
        with workspace.tempspace(dir=working.working_dir) as temp:
            # Counter kept in a list so the nested functions can mutate it
            # (no 'nonlocal' in Python 2).
            n = [ 0 ]
            def tempname():
                # Fresh temporary FASTQ filename inside the tempspace.
                n[0] += 1
                return temp/('%d.fq'%n[0])
            def convert(filename):
                # Return a FASTQ file for 'filename': the file itself if it
                # is already (possibly compressed) FASTQ, otherwise a
                # freshly written temporary FASTQ conversion.
                info = io.get_file_info(filename)
                ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
                if ok:
                    return filename            
                result_name = tempname()
                with open(result_name,'wb') as f:
                    for name, seq, qual in io.read_sequences(filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name
            
            ones = [ ]     # left mates, parallel to 'twos'
            twos = [ ]     # right mates
            singles = [ ]  # unpaired reads
            
            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))
            
            # De-interleave each interleaved file into a left/right pair.
            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name,seq,qual)
                        
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error('Interleaved file contains odd number of sequences')
                        io.write_fastq(right, name,seq,qual)
            
            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            # Common bowtie2 invocation; per-input-type options appended below.
            command = (
                [ 'bowtie2', 
                    '--threads', str(cores),
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,                    
                    ] + 
                self.bowtie_options + 
                [ '-x', reference.get_bowtie_index_prefix() ]
                )
            commands = [ ]
            if ones:
                commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
            if singles:
                commands.append(command + [ '-U', ','.join(singles) ])
            
            temp_bam_name = temp/'temp.bam'

            # Concatenate the SAM output of every bowtie2 run into one BAM,
            # keeping header lines ('@'-prefixed) from the first run only.
            with io.pipe_to(
                     ['samtools', 'view', '-S', '-b', '-'],
                     stdout=open(temp_bam_name,'wb'),
                     stderr=log_file
                     ) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')            
                    with io.pipe_from(
                        command,
                        stderr=log_file,
                        cores=cores
                        ) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            #io.execute([
            #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
            #    ])
            
            # Name-sort so downstream consumers see mates adjacently.
            sam.sort_bam(temp_bam_name, working/'alignments', by_name=True, cores=self.cores)
            
        log_file.close()
Example #8
0
    def run(self):
        """Align reads to the reference with bowtie2, producing a sorted BAM.

        Pairs, interleaved files and single-end reads are first converted
        to (possibly compressed) FASTQ as needed, aligned with bowtie2,
        piped through 'samtools view' into a temporary BAM, and finally
        name-sorted into the workspace's 'alignments' BAM.
        """
        assert self.reads or self.pairs or self.interleaved, 'No reads given'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)
        reference = working.get_reference()

        log_file = open(self.log_filename(), 'wb')

        with workspace.tempspace(dir=working.working_dir) as temp:
            # Counter kept in a list so the nested functions can mutate it
            # (no 'nonlocal' in Python 2).
            n = [0]

            def tempname():
                # Fresh temporary FASTQ filename inside the tempspace.
                n[0] += 1
                return temp / ('%d.fq' % n[0])

            def convert(filename):
                # Return a FASTQ file for 'filename': the file itself if it
                # is already (possibly compressed) FASTQ, otherwise a
                # freshly written temporary FASTQ conversion.
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []     # left mates, parallel to 'twos'
            twos = []     # right mates
            singles = []  # unpaired reads

            for pair in self.pairs:
                assert len(
                    pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            # De-interleave each interleaved file into a left/right pair.
            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences'
                            )
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            # Common bowtie2 invocation; per-input-type options appended below.
            command = ([
                'bowtie2',
                '--threads',
                str(cores),
                '--rg-id',
                '1',
                '--rg',
                'SM:' + working.name,
            ] + self.bowtie_options +
                       ['-x', reference.get_bowtie_index_prefix()])
            commands = []
            if ones:
                commands.append(command +
                                ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp / 'temp.bam'

            # Concatenate the SAM output of every bowtie2 run into one BAM,
            # keeping header lines ('@'-prefixed) from the first run only.
            with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                            stdout=open(temp_bam_name, 'wb'),
                            stderr=log_file) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')
                    with io.pipe_from(command, stderr=log_file,
                                      cores=cores) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            #io.execute([
            #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
            #    ])

            # Name-sort so downstream consumers see mates adjacently.
            sam.sort_bam(temp_bam_name,
                         working / 'alignments',
                         by_name=True,
                         cores=self.cores)

        log_file.close()
Example #9
0
    def run(self):
        """Exercise the variant-calling pipelines on a synthetic reference.

        Builds a reference with random flanks around self.ref, simulates
        reads for each requested variant, runs the nesoni consensus and
        freebayes/VCF pipelines, and logs whether each recovers the
        primary (first) variant.
        """
        workspace = self.get_workspace()
        
        read_length = 100
        # Left flank whose final base differs from the reference's first
        # base, so the variant boundary is unambiguous.
        left = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank
        
        # Right flank whose first base differs from the reference's last base.
        right = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank+right
        
        # Global read counter for unique read names.  (The original reused
        # the loop variable 'i', which enumerate() reset every iteration,
        # so read names could collide across variants.)
        n_reads = 0
        
        variants_used = [ ]
        
        with open(workspace/'reads.fq','wb') as f:
            for variant in self.variants:
                # 'SEQxN' simulates N reads of SEQ; the default is 10.
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append( (variant,count) )
                seq = left+variant+right
                for j in xrange(count):
                    # Random read position guaranteed to overlap the variant.
                    pos = len(variant)+random.randrange(read_length-len(variant))
                    read = seq[pos:pos+read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    n_reads += 1
                    io.write_fastq(f,'read_%s_%d' % (variant,n_reads),read,chr(64+30)*len(read))

        reference = left+self.ref+right
        primary_variant = left+variants_used[0][0]+right

        with open(workspace/'reference.fa','wb') as f:
            io.write_fasta(f,'chr1',reference)
        
        legion.remake_needed()
        
        self.analysis(
            workspace/'sample',
            workspace/'reference.fa',
            reads = [ workspace/'reads.fq' ],
            ).run()
        
        self.freebayes(
            workspace/'freebayes',
            workspace/'sample',
            ).run()
        
        self.vcf_filter(
            workspace/'filtered',
            workspace/'freebayes.vcf',
            ).run()
        
        Vcf_patch(
            workspace/'patch',
            workspace/('sample','reference'),
            workspace/'filtered.vcf'
            ).run()
        
        patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]
        
        masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()
        
        with open(workspace/'freebayes.vcf','rU') as f:
            raw_count = len(list(vcf.Reader(f)))
        
        # Count filtered variants from the already-open handle (the original
        # bound an unused Reader and re-opened the file a second time).
        with open(workspace/'filtered.vcf','rU') as f:
            filtered_count = len(list(vcf.Reader(f)))
        
        with open(workspace/('sample','report.txt'),'rb') as f:
            nesoni_count = len(f.readlines()) - 1  # first line is a header

        self.log.log('\n')
        self.log.datum(workspace.name,'changes found by "nesoni consensus:"', nesoni_count)
        self.log.datum(workspace.name,'is correctly patched by "nesoni consensus:"', masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name,'raw variants', raw_count)
        self.log.datum(workspace.name,'variants after filtering', filtered_count)
        self.log.datum(workspace.name,'is correctly patched by VCF pipeline', patched == primary_variant)
        self.log.log('\n')
Example #10
0
    def run(self):
        """Exercise the variant-calling pipelines on a synthetic reference.

        Builds a reference with random flanks around self.ref, simulates
        reads for each requested variant, runs the nesoni consensus and
        freebayes/VCF pipelines, and logs whether each recovers the
        primary (first) variant.
        """
        workspace = self.get_workspace()

        read_length = 100
        # Left flank whose final base differs from the reference's first
        # base, so the variant boundary is unambiguous.
        left = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank

        # Right flank whose first base differs from the reference's last base.
        right = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank + right

        # Global read counter for unique read names.  (The original reused
        # the loop variable 'i', which enumerate() reset every iteration,
        # so read names could collide across variants.)
        n_reads = 0

        variants_used = []

        with open(workspace / 'reads.fq', 'wb') as f:
            for variant in self.variants:
                # 'SEQxN' simulates N reads of SEQ; the default is 10.
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append((variant, count))
                seq = left + variant + right
                for j in xrange(count):
                    # Random read position guaranteed to overlap the variant.
                    pos = len(variant) + random.randrange(read_length -
                                                          len(variant))
                    read = seq[pos:pos + read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    n_reads += 1
                    io.write_fastq(f, 'read_%s_%d' % (variant, n_reads), read,
                                   chr(64 + 30) * len(read))

        reference = left + self.ref + right
        primary_variant = left + variants_used[0][0] + right

        with open(workspace / 'reference.fa', 'wb') as f:
            io.write_fasta(f, 'chr1', reference)

        legion.remake_needed()

        self.analysis(
            workspace / 'sample',
            workspace / 'reference.fa',
            reads=[workspace / 'reads.fq'],
        ).run()

        self.freebayes(
            workspace / 'freebayes',
            workspace / 'sample',
        ).run()

        self.vcf_filter(
            workspace / 'filtered',
            workspace / 'freebayes.vcf',
        ).run()

        Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
                  workspace / 'filtered.vcf').run()

        patched = io.read_sequences(workspace /
                                    ('patch', 'sample.fa')).next()[1]

        masked = io.read_sequences(
            workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

        with open(workspace / 'freebayes.vcf', 'rU') as f:
            raw_count = len(list(vcf.Reader(f)))

        # Count filtered variants from the already-open handle (the original
        # bound an unused Reader and re-opened the file a second time).
        with open(workspace / 'filtered.vcf', 'rU') as f:
            filtered_count = len(list(vcf.Reader(f)))

        with open(workspace / ('sample', 'report.txt'), 'rb') as f:
            nesoni_count = len(f.readlines()) - 1  # first line is a header

        self.log.log('\n')
        self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                       nesoni_count)
        self.log.datum(workspace.name,
                       'is correctly patched by "nesoni consensus:"',
                       masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name, 'raw variants', raw_count)
        self.log.datum(workspace.name, 'variants after filtering',
                       filtered_count)
        self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                       patched == primary_variant)
        self.log.log('\n')