def run(self):
    """Uniformly sample up to self.n sequences from the input files
    using reservoir sampling, then write them to stdout.
    """
    seqs = []
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned ' + grace.pretty_number(seen))
            if len(seqs) < self.n:
                # Fill the reservoir.
                seqs.append(seq)
            elif random.random() * seen < self.n:
                # Standard reservoir sampling (Algorithm R): replace a
                # random slot with probability n/seen.  The original
                # condition was inverted (n <= random()*seen), which
                # biases the sample toward late reads.
                seqs[random.randrange(self.n)] = seq
    grace.status('')
    print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
    if not seqs:
        return
    # read_sequences yields (name, seq, qual) for FASTQ and (name, seq)
    # for FASTA -- presumably; verify against io.read_sequences.  The
    # original `qualities = len(seqs[0])` was always truthy (2 or 3),
    # so the FASTA branch was unreachable (and called write_fastq with
    # no qualities, which would have failed).
    qualities = len(seqs[0]) == 3
    if qualities:
        for name, seq, qual in seqs:
            io.write_fastq(sys.stdout, name, seq, qual)
    else:
        for name, seq in seqs:
            io.write_fasta(sys.stdout, name, seq)
def convert(filename):
    """Return a FASTQ file path for *filename*.

    If the file is already FASTQ (plain, gzip, or bzip2 compressed) it
    is returned unchanged; otherwise the sequences are copied into a
    temporary FASTQ file and that path is returned.
    """
    already_fastq = selection.matches(
        'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
        io.get_file_info(filename))
    if already_fastq:
        return filename
    converted = tempname()
    with open(converted, 'wb') as out:
        for record in io.read_sequences(filename, qualities='required'):
            io.write_fastq(out, record[0], record[1], record[2])
    return converted
def convert(filename):
    """Ensure *filename* is usable as FASTQ input.

    Files that already match the FASTQ type (optionally gzip/bzip2
    compressed) pass through untouched; anything else is rewritten to
    a temporary FASTQ file whose name is returned.
    """
    info = io.get_file_info(filename)
    if not selection.matches(
            'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
            info):
        result_name = tempname()
        out = open(result_name, 'wb')
        try:
            for name, seq, qual in io.read_sequences(filename,
                                                     qualities='required'):
                io.write_fastq(out, name, seq, qual)
        finally:
            out.close()
        return result_name
    return filename
def run(self):
    """Shred each input sequence into windows of self.size bases,
    advancing by self.stride, and write them as FASTA (or FASTQ with a
    constant quality if self.quality is set).
    """
    f = self.begin_output()
    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            # Keep only the first whitespace-delimited word of the name.
            name = name.split(None, 1)[0]
            # Start before 0 so the first (partial) window is emitted;
            # start/end are clamped to the sequence bounds.
            for i in xrange(-self.size + self.stride, len(seq), self.stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + self.size))
                # 1-based inclusive coordinates in the shred name.
                shred_name = '%s:%d..%d' % (name, start + 1, end)
                # (Removed unused local `shred_seq = seq` from the original.)
                if self.quality:
                    io.write_fastq(f, shred_name, seq[start:end],
                                   chr(33 + self.quality) * (end - start))
                else:
                    io.write_fasta(f, shred_name, seq[start:end])
    self.end_output(f)
def run(self):
    """Cut every input sequence into overlapping shreds.

    Windows are self.size long and start every self.stride bases; the
    leading window may be partial.  Output is FASTA, or FASTQ with a
    uniform quality character when self.quality is non-zero.
    """
    out = self.begin_output()
    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            # Drop everything after the first whitespace in the name.
            name = name.split(None, 1)[0]
            seq_len = len(seq)
            for i in xrange(-self.size + self.stride, seq_len, self.stride):
                # Clamp the window to the sequence.
                start = max(0, min(seq_len, i))
                end = max(0, min(seq_len, i + self.size))
                # Name records the 1-based inclusive span.
                shred_name = '%s:%d..%d' % (name, start + 1, end)
                # (The original assigned an unused `shred_seq = seq` here.)
                if self.quality:
                    io.write_fastq(out, shred_name, seq[start:end],
                                   chr(33 + self.quality) * (end - start))
                else:
                    io.write_fasta(out, shred_name, seq[start:end])
    self.end_output(out)
def run(self):
    """Reservoir-sample self.n sequences from the inputs and emit them
    on stdout.
    """
    seqs = []
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned ' + grace.pretty_number(seen))
            if len(seqs) < self.n:
                seqs.append(seq)
            elif random.random() * seen < self.n:
                # Replace a random reservoir slot with probability
                # n/seen -- the textbook reservoir-sampling rule.  The
                # original test (`self.n <= random.random() * seen`)
                # was the complement, skewing the sample toward the
                # final reads of the input.
                seqs[random.randrange(self.n)] = seq
    grace.status('')
    print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
    if not seqs:
        return
    # Records are (name, seq, qual) or (name, seq); the original
    # `qualities = len(seqs[0])` could never be falsy, so the FASTA
    # branch was dead (and mis-called write_fastq without qualities).
    qualities = len(seqs[0]) == 3
    if qualities:
        for name, seq, qual in seqs:
            io.write_fastq(sys.stdout, name, seq, qual)
    else:
        for name, seq in seqs:
            io.write_fasta(sys.stdout, name, seq)
def run(self):
    # Align the given reads against the reference using bowtie2 and
    # produce a name-sorted BAM file in the workspace.
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    # Subprocess stderr (bowtie2 / samtools chatter) is captured here.
    log_file = open(self.log_filename(),'wb')

    with workspace.tempspace(dir=working.working_dir) as temp:
        # Counter boxed in a list so the nested tempname() can mutate
        # it (Python 2 has no `nonlocal`).
        n = [ 0 ]
        def tempname():
            # Next fresh temporary FASTQ filename inside the tempspace.
            n[0] += 1
            return temp/('%d.fq'%n[0])

        def convert(filename):
            # Pass through files bowtie2 can read directly (FASTQ,
            # plain or gzip/bzip2); otherwise rewrite as temporary FASTQ.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name,'wb') as f:
                for name, seq, qual in io.read_sequences(
                        filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = [ ]     # left mates of pairs
        twos = [ ]     # right mates of pairs
        singles = [ ]  # unpaired reads

        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))

        # Split each interleaved file into separate left/right FASTQs,
        # since bowtie2 takes pairs as -1/-2 file lists.
        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name,'wb') as left, \
                 open(right_name,'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name,seq,qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error('Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name,seq,qual)

        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())

        # Shared bowtie2 invocation; per-run input options are appended
        # below for the paired and unpaired cases.
        command = (
            [ 'bowtie2',
              '--threads', str(cores),
              '--rg-id', '1',
              '--rg', 'SM:'+working.name,
            ] + self.bowtie_options
              + [ '-x', reference.get_bowtie_index_prefix() ]
            )

        commands = [ ]
        if ones:
            commands.append(command + [ '-1', ','.join(ones),
                                        '-2', ','.join(twos) ])
        if singles:
            commands.append(command + [ '-U', ','.join(singles) ])

        temp_bam_name = temp/'temp.bam'

        # Concatenate the SAM output of every bowtie2 run into a single
        # "samtools view" pipe producing one BAM, keeping the @-header
        # lines only from the first run.
        with io.pipe_to(
                ['samtools', 'view', '-S', '-b', '-'],
                stdout=open(temp_bam_name,'wb'),
                stderr=log_file) as f:
            header_sent = False
            # NOTE: loop variable shadows the `command` template above.
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(
                        command, stderr=log_file, cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True

        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        sam.sort_bam(temp_bam_name, working/'alignments',
                     by_name=True, cores=self.cores)

    log_file.close()
def run(self):
    # Run bowtie2 over all supplied read sets and leave a name-sorted
    # BAM of the alignments in this sample's workspace.
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    # All subprocess stderr output is collected into the log file.
    log_file = open(self.log_filename(), 'wb')

    with workspace.tempspace(dir=working.working_dir) as temp:
        # List-wrapped counter: lets the closure below increment it
        # without `nonlocal` (unavailable in Python 2).
        n = [0]

        def tempname():
            # Allocate the next temporary FASTQ path in the tempspace.
            n[0] += 1
            return temp / ('%d.fq' % n[0])

        def convert(filename):
            # Files already in FASTQ form (possibly gzip/bzip2
            # compressed) are used as-is; others get converted to a
            # temporary FASTQ first.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(
                        filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = []     # first-mate files
        twos = []     # second-mate files
        singles = []  # unpaired read files

        for pair in self.pairs:
            assert len(
                pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))

        # bowtie2 wants separate -1/-2 inputs, so de-interleave each
        # interleaved file into a left and a right FASTQ.
        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name,'wb') as left, \
                 open(right_name,'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error(
                            'Interleaved file contains odd number of sequences'
                        )
                    io.write_fastq(right, name, seq, qual)

        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())

        # Template command line; input-specific options are appended
        # for the paired and the unpaired invocation respectively.
        command = ([
            'bowtie2',
            '--threads', str(cores),
            '--rg-id', '1',
            '--rg', 'SM:' + working.name,
        ] + self.bowtie_options + ['-x', reference.get_bowtie_index_prefix()])

        commands = []
        if ones:
            commands.append(command +
                            ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])

        temp_bam_name = temp / 'temp.bam'

        # Feed every bowtie2 run's SAM output through one
        # "samtools view -S -b" pipe, writing the @-header lines only
        # for the first run so the merged stream stays a valid SAM.
        with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                        stdout=open(temp_bam_name, 'wb'),
                        stderr=log_file) as f:
            header_sent = False
            # NOTE: this loop variable shadows the template `command`.
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file,
                                  cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True

        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        sam.sort_bam(temp_bam_name, working / 'alignments',
                     by_name=True, cores=self.cores)

    log_file.close()
def run(self):
    """Simulate reads carrying the requested variants, run both the
    'nesoni consensus:' pipeline and the freebayes/VCF pipeline on
    them, and log whether each recovers the primary variant.
    """
    workspace = self.get_workspace()
    read_length = 100

    # Build random flanks; the base immediately adjacent to the
    # reference allele is regenerated until it differs from the
    # allele's edge, so the variant boundary is unambiguous.
    left = rand_seq(read_length-1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]: break
    left += flank
    right = rand_seq(read_length-1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]: break
    right = flank+right

    i = 0
    variants_used = [ ]

    # Write simulated reads.  A variant spec may carry a repeat count
    # suffix, e.g. "ACGTx20"; the default is 10 reads per variant.
    with open(workspace/'reads.fq','wb') as f:
        for i, variant in enumerate(self.variants):
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append( (variant,count) )
            seq = left+variant+right
            for j in xrange(count):
                # Window positioned so the read overlaps the variant.
                pos = len(variant)+random.randrange(read_length-len(variant))
                read = seq[pos:pos+read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                # NOTE(review): `i` doubles as enumerate index and read
                # counter; names stay unique because the variant string
                # is part of the read name.
                i += 1
                io.write_fastq(f,'read_%s_%d' % (variant,i),read,chr(64+30)*len(read))

    reference = left+self.ref+right
    primary_variant = left+variants_used[0][0]+right
    with open(workspace/'reference.fa','wb') as f:
        io.write_fasta(f,'chr1',reference)

    legion.remake_needed()

    # Run the two analysis pipelines.
    self.analysis(
        workspace/'sample',
        workspace/'reference.fa',
        reads = [ workspace/'reads.fq' ],
        ).run()
    self.freebayes(
        workspace/'freebayes',
        workspace/'sample',
        ).run()
    self.vcf_filter(
        workspace/'filtered',
        workspace/'freebayes.vcf',
        ).run()
    Vcf_patch(
        workspace/'patch',
        workspace/('sample','reference'),
        workspace/'filtered.vcf'
        ).run()

    patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]
    masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()

    with open(workspace/'freebayes.vcf','rU') as f:
        raw_count = len(list(vcf.Reader(f)))
    with open(workspace/'filtered.vcf','rU') as f:
        # BUGFIX: the original built a Reader on `f` but ignored it and
        # re-opened filtered.vcf through a second, never-closed handle.
        # Count records from the already-open handle instead.
        filtered_count = len(list(vcf.Reader(f)))
    with open(workspace/('sample','report.txt'),'rb') as f:
        # First line is a header.
        nesoni_count = len(f.readlines()) - 1

    self.log.log('\n')
    self.log.datum(workspace.name,'changes found by "nesoni consensus:"', nesoni_count)
    self.log.datum(workspace.name,'is correctly patched by "nesoni consensus:"', masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name,'raw variants', raw_count)
    self.log.datum(workspace.name,'variants after filtering', filtered_count)
    self.log.datum(workspace.name,'is correctly patched by VCF pipeline', patched == primary_variant)
    self.log.log('\n')
def run(self):
    """End-to-end test: generate reads containing the given variants,
    run the consensus and VCF variant-calling pipelines, and record in
    the log whether each pipeline patches the primary variant correctly.
    """
    workspace = self.get_workspace()
    read_length = 100

    # Random flanking sequence on each side; the base touching the
    # reference allele is redrawn until it differs from the allele
    # edge so variant boundaries cannot be ambiguous.
    left = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]: break
    left += flank
    right = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]: break
    right = flank + right

    i = 0
    variants_used = []

    # Emit simulated reads; "SEQxN" specs request N reads, default 10.
    with open(workspace / 'reads.fq', 'wb') as f:
        for i, variant in enumerate(self.variants):
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append((variant, count))
            seq = left + variant + right
            for j in xrange(count):
                # Pick a start so the read overlaps the variant site.
                pos = len(variant) + random.randrange(read_length - len(variant))
                read = seq[pos:pos + read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                # NOTE(review): `i` serves as both enumerate index and
                # running read counter; uniqueness is preserved because
                # the variant string appears in each read name.
                i += 1
                io.write_fastq(f, 'read_%s_%d' % (variant, i), read,
                               chr(64 + 30) * len(read))

    reference = left + self.ref + right
    primary_variant = left + variants_used[0][0] + right
    with open(workspace / 'reference.fa', 'wb') as f:
        io.write_fasta(f, 'chr1', reference)

    legion.remake_needed()

    # Pipeline stages.
    self.analysis(
        workspace / 'sample',
        workspace / 'reference.fa',
        reads=[workspace / 'reads.fq'],
    ).run()
    self.freebayes(
        workspace / 'freebayes',
        workspace / 'sample',
    ).run()
    self.vcf_filter(
        workspace / 'filtered',
        workspace / 'freebayes.vcf',
    ).run()
    Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
              workspace / 'filtered.vcf').run()

    patched = io.read_sequences(workspace /
                                ('patch', 'sample.fa')).next()[1]
    masked = io.read_sequences(
        workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

    with open(workspace / 'freebayes.vcf', 'rU') as f:
        raw_count = len(list(vcf.Reader(f)))
    with open(workspace / 'filtered.vcf', 'rU') as f:
        # BUGFIX: the original ignored the Reader built on `f` and
        # counted via a second open() that was never closed; count from
        # the managed handle instead.
        filtered_count = len(list(vcf.Reader(f)))
    with open(workspace / ('sample', 'report.txt'), 'rb') as f:
        # Subtract the header line.
        nesoni_count = len(f.readlines()) - 1

    self.log.log('\n')
    self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                   nesoni_count)
    self.log.datum(workspace.name,
                   'is correctly patched by "nesoni consensus:"',
                   masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name, 'raw variants', raw_count)
    self.log.datum(workspace.name, 'variants after filtering',
                   filtered_count)
    self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                   patched == primary_variant)
    self.log.log('\n')