def run(self):
    """Launch IGV with ``self.genome`` selected and ``self.files`` loaded.

    When ``self.genome`` is a directory, the genome file is taken to be
    ``<directory>/<directory name>.genome``.  A batch script is written into
    a temporary workspace and handed to IGV with ``-b``.
    """
    genome = self.genome
    if os.path.isdir(genome):
        # Take the name from the absolute path: os.path.split('ref/')[1] is
        # the empty string, which used to produce 'ref/.genome' whenever the
        # directory was given with a trailing separator.
        dir_name = os.path.basename(os.path.abspath(genome))
        genome = os.path.join(genome, dir_name + '.genome')
        # Echo the resolved genome file (single-argument form prints the
        # same text with or without print_function).
        print(genome)

    # NOTE: an earlier, now disabled, approach rewrote
    # ~/igv/prefs.properties to drop its DEFAULT_GENOME_KEY= line before
    # launching IGV.

    with workspace.tempspace() as temp:
        with open(temp/'batch.txt', 'wb') as f:
            print >> f, 'new'
            print >> f, 'preference LAST_TRACK_DIRECTORY', os.getcwd()
            print >> f, 'preference LAST_GENOME_IMPORT_DIRECTORY', os.getcwd()
            print >> f, 'genome '+os.path.abspath(genome)
            for filename in self.files:
                print >> f, 'load '+os.path.abspath(filename)

        # The batch file must be closed (flushed) before IGV reads it.
        io.execute([
            'java', '-Xmx32000m',
            #Flags from igb.sh script:
            '-Dproduction=true',
            '-Dapple.laf.useScreenMenuBar=true',
            '-Djava.net.preferIPv4Stack=true',
            '-jar', io.find_jar('igv.jar'),
            '-b', temp/'batch.txt',
        ])
def run(self):
    """Build a reference directory from extracted genes and a renamed genome.

    ``self.genes`` is a comma separated list of specifications, each with
    exactly four '/'-separated fields.  ``self.rename`` is an optional comma
    separated list of ``old=new`` sequence name replacements, applied to both
    the annotations and the genome sequences.
    """
    extractions = [spec.split('/') for spec in self.genes.split(',')]
    for fields in extractions:
        assert len(fields) == 4

    rename = {}
    if self.rename:
        for old, new in (pair.split('=') for pair in self.rename.split(',')):
            rename[old] = new

    # Result unused here; presumably called so the workspace exists -- TODO confirm.
    self.get_workspace()

    with workspace.tempspace() as temp:
        gff_path = temp/'temp.gff'
        fasta_path = temp/'temp.fa'

        features = list(annotation.read_annotations(self.annotation))
        for feature in features:
            feature.seqid = rename.get(feature.seqid, feature.seqid)
        annotation.write_gff3(
            gff_path, get_genes(features, extractions, self.log))
        # Drop the annotation list before streaming the genome.
        del features

        with open(fasta_path, 'wb') as fasta:
            for name, seq in io.read_sequences(self.genome):
                # Only the first whitespace-delimited word is the name.
                short_name = name.split()[0]
                io.write_fasta(fasta, rename.get(short_name, short_name), seq)

        maker = reference_directory.Make_tt_reference(
            self.output_dir,
            filenames=[fasta_path, gff_path] + self.extra,
            index=self.index,
            shrimp=self.shrimp,
            bowtie=self.bowtie,
            star=self.star,
        )
        maker.run()
def run(self):
    """Extract the requested genes and build a reference directory.

    Parses ``self.genes`` (comma separated, four '/'-separated fields each)
    and the optional ``self.rename`` list of ``old=new`` pairs, rewrites the
    annotation and genome with renamed sequence ids into temporary files, and
    runs ``Make_tt_reference`` on them.
    """
    extractions = []
    for gene_spec in self.genes.split(','):
        parts = gene_spec.split('/')
        assert len(parts) == 4
        extractions.append(parts)

    rename = {}
    renames = self.rename.split(',') if self.rename else []
    for entry in renames:
        old, new = entry.split('=')
        rename[old] = new

    # Return value is not needed in this method.
    self.get_workspace()

    with workspace.tempspace() as temp:
        records = list(annotation.read_annotations(self.annotation))
        for record in records:
            record.seqid = rename.get(record.seqid, record.seqid)
        genes = get_genes(records, extractions, self.log)
        annotation.write_gff3(temp/'temp.gff', genes)
        # Release the annotation list before the genome is streamed.
        del records

        with open(temp/'temp.fa', 'wb') as out:
            for title, seq in io.read_sequences(self.genome):
                # Keep the first word of the title, then apply any rename.
                seq_name = title.split()[0]
                seq_name = rename.get(seq_name, seq_name)
                io.write_fasta(out, seq_name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames=[temp/'temp.fa', temp/'temp.gff'],
            index=self.index,
        ).run()
def run(self):
    """Start IGV on ``self.genome`` and load ``self.files`` via a batch script."""
    with workspace.tempspace() as temp:
        batch_path = temp/'batch.txt'

        # One batch command per line: reset, pick the genome, load each file.
        batch_lines = ['new', 'genome '+os.path.abspath(self.genome)]
        for filename in self.files:
            batch_lines.append('load '+os.path.abspath(filename))

        with open(batch_path, 'wb') as f:
            for batch_line in batch_lines:
                print >> f, batch_line

        igv_command = ['java', '-jar', io.find_jar('igv.jar'), '-b', batch_path]
        io.execute(igv_command)
def tryout(self, ref, variants):
    """Run the templated job in a temporary directory and summarise its log.

    Returns a tuple ``(nesoni_count, nesoni_good, vcf_count, vcf_good)``:
    the two counts are parsed as integers and the two flags are mapped from
    the logged strings 'yes'/'no' (any other value raises ``KeyError``).
    """
    yes_no = {'yes': True, 'no': False}

    with workspace.tempspace() as temp:
        job = self.template(temp.working_dir, ref=ref, variants=variants)
        job.run()

        # Each mined entry is turned into one key/value pair of the result.
        mined = reporting.mine_logs([job.log_filename()])
        result = dict(tuple(entry.values()) for entry in mined)

        nesoni_count = int(result['changes found by "nesoni consensus:"'])
        nesoni_good = yes_no[result['is correctly patched by "nesoni consensus:"']]
        vcf_count = int(result['variants after filtering'])
        vcf_good = yes_no[result['is correctly patched by VCF pipeline']]

    return nesoni_count, nesoni_good, vcf_count, vcf_good
def run(self):
    """Start IGV with directory preferences, genome and files set by batch script.

    The batch script points IGV's last track and genome-import directories
    at the current working directory, selects ``self.genome`` and loads each
    of ``self.files``.
    """
    with workspace.tempspace() as temp:
        batch_path = temp/'batch.txt'
        cwd = os.getcwd()

        batch_lines = [
            'new',
            ' '.join(['preference LAST_TRACK_DIRECTORY', cwd]),
            ' '.join(['preference LAST_GENOME_IMPORT_DIRECTORY', cwd]),
            'genome '+os.path.abspath(self.genome),
        ]
        batch_lines.extend(
            'load '+os.path.abspath(filename) for filename in self.files)

        with open(batch_path, 'wb') as f:
            for batch_line in batch_lines:
                print >> f, batch_line

        # Run only after the batch file has been closed.
        io.execute(['java', '-jar', io.find_jar('igv.jar'), '-b', batch_path])
def run(self):
    """Fetch an Ensembl genome and GFF3, extract genes, build a reference.

    Requires ``self.release``, ``self.species``, ``self.assembly`` and
    ``self.dna`` to be set.  When ``self.download`` is true both files are
    fetched with rsync into the ``ensembl`` sub-workspace; otherwise they are
    read from there as-is.
    """
    for required in (self.release, self.species, self.assembly, self.dna):
        assert required

    extractions = [spec.split('/') for spec in self.genes.split(',')]
    for fields in extractions:
        assert len(fields) == 4

    rename = {}
    if self.rename:
        for pair in self.rename.split(','):
            old, new = pair.split('=')
            rename[old] = new

    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')

    release_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release
    species_dir = self.species.lower()

    genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
    genome_url = release_url+"/fasta/"+species_dir+"/dna/"+genome_filename

    gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
    gff_url = release_url+"/gff3/"+species_dir+"/"+gff_filename

    if self.download:
        downloads = [
            (genome_url, genome_filename),
            (gff_url, gff_filename),
        ]
        for url, local_name in downloads:
            self.log.log("Fetching "+url+"\n")
            io.execute(['rsync', '-aP', url, ensembl/local_name])

    with workspace.tempspace() as temp:
        features = list(annotation.read_annotations(ensembl/gff_filename))
        for feature in features:
            feature.seqid = rename.get(feature.seqid, feature.seqid)
        annotation.write_gff3(
            temp/'temp.gff', get_genes(features, extractions, self.log))
        # Release the annotation list before the genome is streamed.
        del features

        with open(temp/'temp.fa', 'wb') as fasta:
            for title, seq in io.read_sequences(ensembl/genome_filename):
                # Keep only the first word of the title, then rename.
                seq_name = title.split()[0]
                io.write_fasta(fasta, rename.get(seq_name, seq_name), seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames=[temp/'temp.fa', temp/'temp.gff'],
            index=self.index,
        ).run()
def tryout(self, ref, variants):
    """Run the templated job in a temporary directory and summarise its log.

    Exactly one mined log record is expected (anything else raises
    ``ValueError`` on unpacking).  Returns
    ``(nesoni_count, nesoni_good, vcf_count, vcf_good)``, with the flags
    mapped from the logged strings 'yes'/'no'.
    """
    as_bool = {'yes': True, 'no': False}

    with workspace.tempspace() as temp:
        job = self.template(temp.working_dir, ref=ref, variants=variants)
        job.run()

        # An older version built a dict from each mined item's values;
        # mine_logs is now used as a mapping holding a single record.
        mined = reporting.mine_logs([job.log_filename()])
        (result,) = mined.values()

        nesoni_count = int(result['changes found by "nesoni consensus:"'])
        nesoni_good = as_bool[result['is correctly patched by "nesoni consensus:"']]
        vcf_count = int(result['variants after filtering'])
        vcf_good = as_bool[result['is correctly patched by VCF pipeline']]

    return nesoni_count, nesoni_good, vcf_count, vcf_good
def run(self):
    """Align the configured reads with bowtie2 into name-sorted alignments.

    Inputs that are not already FASTQ (uncompressed, gzip or bzip2) are
    converted to temporary FASTQ files, and interleaved files are split into
    separate left/right files.  bowtie2 is run once for paired reads and once
    for single reads; the output is piped through ``samtools view -S -b -``
    into a temporary BAM, keeping ``@`` header lines from the first run only,
    and the BAM is then sorted by name into ``alignments`` in the workspace.

    Raises:
        AssertionError: if no reads are given, or a "pair:" section does not
            hold exactly two files.
        grace.Error: if an interleaved file has an odd number of sequences.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    # The log is opened with "with" so it is closed even when a step fails
    # (it used to be closed only on the success path).
    with open(self.log_filename(), 'wb') as log_file:
        with workspace.tempspace(dir=working.working_dir) as temp:
            # Boxed counter so the nested function can update it.
            n = [0]

            def tempname():
                n[0] += 1
                return temp/('%d.fq' % n[0])

            def convert(filename):
                # Files bowtie2 can read directly are passed through as-is.
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []
            twos = []
            singles = []

            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name, 'wb') as left, \
                     open(right_name, 'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = next(reader)
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        # Every left read must be followed by its mate.
                        try:
                            name, seq, qual = next(reader)
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences')
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = (
                [
                    'bowtie2',
                    '--threads', str(cores),
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,
                ] +
                self.bowtie_options +
                ['-x', reference.get_bowtie_index_prefix()]
            )
            commands = []
            if ones:
                commands.append(
                    command + ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp/'temp.bam'

            # The BAM handle is now closed explicitly instead of being left
            # to garbage collection.
            with open(temp_bam_name, 'wb') as bam_out:
                with io.pipe_to(
                        ['samtools', 'view', '-S', '-b', '-'],
                        stdout=bam_out,
                        stderr=log_file
                        ) as f:
                    header_sent = False
                    for command in commands:
                        self.log.log('Running:\n' + ' '.join(command) + '\n')
                        with io.pipe_from(
                                command,
                                stderr=log_file,
                                cores=cores
                                ) as f_out:
                            for line in f_out:
                                if not header_sent or not line.startswith('@'):
                                    f.write(line)
                        # Header lines of later runs are dropped.
                        header_sent = True

            # Used in place of a direct "samtools sort -n" invocation.
            sam.sort_bam(temp_bam_name, working/'alignments',
                         by_name=True, cores=self.cores)
def run(self):
    """Run bowtie2 over the configured reads and store name-sorted alignments.

    Non-FASTQ inputs are converted to temporary FASTQ files and interleaved
    inputs are split into left/right files.  Paired and single reads are
    aligned in separate bowtie2 runs whose output is streamed through
    ``samtools view -S -b -`` into ``temp.bam`` (only the first run
    contributes ``@`` header lines), which is finally sorted by name into
    ``alignments`` in the workspace.

    Raises:
        AssertionError: if no reads are given, or a "pair:" section does not
            hold exactly two files.
        grace.Error: if an interleaved file has an odd number of sequences.
    """
    assert self.reads or self.pairs or self.interleaved, 'No reads given'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    # Context managers close the log file on every exit path; previously an
    # exception anywhere below skipped log_file.close().
    with open(self.log_filename(), 'wb') as log_file:
        with workspace.tempspace(dir=working.working_dir) as temp:
            # Mutable cell shared with tempname() below.
            n = [0]

            def tempname():
                n[0] += 1
                return temp / ('%d.fq' % n[0])

            def convert(filename):
                # Already-suitable FASTQ files are used in place.
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []
            twos = []
            singles = []

            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name, 'wb') as left, \
                     open(right_name, 'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = next(reader)
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        # A left read without a following mate is an error.
                        try:
                            name, seq, qual = next(reader)
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences'
                            )
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = ([
                'bowtie2',
                '--threads', str(cores),
                '--rg-id', '1',
                '--rg', 'SM:' + working.name,
            ] + self.bowtie_options +
                ['-x', reference.get_bowtie_index_prefix()])
            commands = []
            if ones:
                commands.append(
                    command + ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp / 'temp.bam'

            # Own the BAM output handle here so it is reliably closed.
            with open(temp_bam_name, 'wb') as bam_out:
                with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                                stdout=bam_out,
                                stderr=log_file) as f:
                    header_sent = False
                    for command in commands:
                        self.log.log('Running:\n' + ' '.join(command) + '\n')
                        with io.pipe_from(command,
                                          stderr=log_file,
                                          cores=cores) as f_out:
                            for line in f_out:
                                if not header_sent or not line.startswith('@'):
                                    f.write(line)
                        # From the second run on, '@' header lines are skipped.
                        header_sent = True

            # Used in place of a direct "samtools sort -n" invocation.
            sam.sort_bam(temp_bam_name,
                         working / 'alignments',
                         by_name=True,
                         cores=self.cores)
def run(self):
    """Build a reference directory from a UCSC gene table and chromFa archive.

    Every table row becomes an mRNA annotation; grouped mRNAs get a parent
    gene annotation, and each exon yields an exon annotation plus a CDS
    annotation where it overlaps the coding region.  The chromosome archive
    is (optionally downloaded and) unpacked, and everything is passed to
    ``Make_tt_reference``.
    """
    assert self.ucsc_name, 'Need a UCSC genome name'

    scratch = _ucsc_scratch(self)

    # Load annotations
    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    table = scratch.get_table(self.table)
    get_name = scratch.getter(self.name)
    get_product = scratch.getter(self.product)

    strand_codes = {'+': 1, '-': -1}

    mrnas = []
    for record in table:
        mrna = annotation.Annotation(
            seqid=record.chrom,
            source=source,
            type='mRNA',
            strand=strand_codes[record.strand],
            start=int(record.txStart),
            end=int(record.txEnd),
            attr={
                'ID': record.name,
                'Name': get_name(record),
                'Product': get_product(record),
            },
        )
        # Keep the table row so exon/CDS fields can be read later.
        mrna.record = record
        mrnas.append(mrna)

    _uniquify_ids(mrnas)

    annotations = []

    for group in _grouped_features(mrnas):
        # The gene ID is built from the member IDs before they get their
        # '-mRNA' suffix.
        gene_id = '/'.join(member.attr['ID'] for member in group)
        for member in group:
            member.attr['Parent'] = gene_id
            member.attr['ID'] = member.attr['ID'] + '-mRNA'

        first = group[0]
        gene = annotation.Annotation(
            source=source,
            type='gene',
            seqid=first.seqid,
            strand=first.strand,
            start=min(member.start for member in group),
            end=max(member.end for member in group),
            attr={
                'ID': gene_id,
                'Name': annotation_tools.join_descriptions(
                    [member.attr['Name'] for member in group], '/'),
                'Product': annotation_tools.join_descriptions(
                    [member.attr['Product'] for member in group], '/'),
            },
        )
        annotations.append(gene)

        for member in group:
            annotations.append(member)

            exon_starts = _parse_ints(member.record.exonStarts)
            exon_ends = _parse_ints(member.record.exonEnds)
            cds_start = int(member.record.cdsStart)
            cds_end = int(member.record.cdsEnd)

            for start, end in zip(exon_starts, exon_ends):
                annotations.append(annotation.Annotation(
                    source=source,
                    type='exon',
                    seqid=member.seqid,
                    strand=member.strand,
                    start=start,
                    end=end,
                    attr={'Parent': member.attr['ID']},
                ))

                # Emit a CDS only for the non-empty overlap of the exon
                # with the coding region.
                coding_start = max(cds_start, start)
                coding_end = min(cds_end, end)
                if coding_start < coding_end:
                    annotations.append(annotation.Annotation(
                        source=source,
                        type='CDS',
                        seqid=member.seqid,
                        strand=member.strand,
                        start=coding_start,
                        end=coding_end,
                        #TODO: phase
                        attr={'Parent': member.attr['ID']},
                    ))

    # Load sequence
    archive = scratch.ucsc/'chromFa.tar.gz'

    if self.download:
        io.execute([
            'rsync', '-P',
            'rsync://hgdownload.cse.ucsc.edu/goldenPath/'+self.ucsc_name+'/bigZips/chromFa.tar.gz',
            archive,
        ])

    with workspace.tempspace() as temp:
        io.execute(['tar', '-C', temp.working_dir, '-zxf', archive])

        # List the extracted files before the GFF is written into the
        # same directory, so the GFF is not picked up as a sequence.
        extracted = natural_sorted(os.listdir(temp.working_dir))
        sequences = [temp/filename for filename in extracted]

        gff_path = temp/'reference.gff'
        with open(gff_path, 'wb') as f:
            annotation.write_gff3_header(f)
            for entry in annotations:
                print >> f, entry.as_gff()

        Make_tt_reference(
            self.output_dir,
            filenames=sequences + [gff_path],
            index=self.index,
        ).run()
def run(self):
    """Call variants with freebayes over the given samples, writing a VCF.

    Each sample may be a BAM file, a working directory (its filtered sorted
    BAM, tags and reference are used) or the single reference FASTA file.
    Multiple BAMs are merged first.  A ``##sampleTags=`` line per working
    directory is inserted just before the first non-``##`` line of the
    freebayes output.

    Raises:
        AssertionError: if more than one reference FASTA file is given.
        grace.Error: if no reference can be determined.
    """
    bams = []
    extra = []
    reference = None
    fallback_reference = None

    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
            continue

        if os.path.isdir(sample):
            working = working_directory.Working(sample, True)
            bams.append(working.get_filtered_sorted_bam())
            extra.append('##sampleTags=' + ','.join(working.get_tags()))
            # The first working directory supplies the fallback reference.
            if fallback_reference is None:
                fallback_reference = working.get_reference().reference_fasta_filename()
            continue

        if io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample

    if reference is None:
        reference = fallback_reference
    if reference is None:
        raise grace.Error('No reference FASTA file given.')

    with nesoni.Stage() as stage:
        tempspace = stage.enter(workspace.tempspace())

        if self.depth_limit:
            with nesoni.Stage() as stage2:
                for i, bam in enumerate(bams):
                    limiter = sam.Bam_depth_limit(
                        tempspace/('%d' % i),
                        bam,
                        depth=self.depth_limit,
                    )
                    limiter.process_make(stage2)
                    bams[i] = tempspace/('%d.bam' % i)

        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace/'merged', bams=bams, index=False).run()
            bams = [tempspace/'merged.bam']

        command = [
            'freebayes',
            '-f', reference,
            '--ploidy', str(self.ploidy),
            '--pvar', str(self.pvar),
        ]
        command = command + self.freebayes_options + bams

        self.log.log('Running: '+' '.join(command)+'\n')

        f_out = stage.enter(open(self.prefix+'.vcf', 'wb'))
        f_in = stage.enter(io.pipe_from(command))

        extra_pending = True
        for line in f_in:
            if extra_pending and not line.startswith('##'):
                for extra_line in extra:
                    f_out.write(extra_line+'\n')
                extra_pending = False
            f_out.write(line)

    # Index after the stage has closed the output file.
    index_vcf(self.prefix+'.vcf')
def run(self):
    """Run freebayes on the configured samples and index the resulting VCF.

    Samples are classified as BAM files, working directories or the
    reference FASTA.  Working directories contribute their filtered sorted
    BAM, a ``##sampleTags=`` header line and, if no FASTA was given, the
    reference.  With ``self.depth_limit`` set, each BAM is first depth
    limited; more than one BAM is merged into a single file.

    Raises:
        AssertionError: if more than one reference FASTA file is given.
        grace.Error: if no reference can be determined.
    """
    bams = []
    tag_lines = []
    reference = None
    workdir_reference = None

    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
        elif os.path.isdir(sample):
            working = working_directory.Working(sample, True)
            bams.append(working.get_filtered_sorted_bam())
            tag_lines.append('##sampleTags=' + ','.join(working.get_tags()))
            if workdir_reference is None:
                ref = working.get_reference()
                workdir_reference = ref.reference_fasta_filename()
        elif io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample

    # An explicit FASTA wins; otherwise use the working directory reference.
    if reference is None:
        reference = workdir_reference
        if reference is None:
            raise grace.Error('No reference FASTA file given.')

    vcf_filename = self.prefix + '.vcf'

    with nesoni.Stage() as stage:
        tempspace = stage.enter(workspace.tempspace())

        if self.depth_limit:
            with nesoni.Stage() as stage2:
                for index, original_bam in enumerate(bams):
                    sam.Bam_depth_limit(
                        tempspace / ('%d' % index),
                        original_bam,
                        depth=self.depth_limit).process_make(stage2)
                    bams[index] = tempspace / ('%d.bam' % index)

        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace / 'merged', bams=bams, index=False).run()
            bams = [tempspace / 'merged.bam']

        command = ['freebayes', '-f', reference]
        command += ['--ploidy', str(self.ploidy)]
        command += ['--pvar', str(self.pvar)]
        command = command + self.freebayes_options + bams

        self.log.log('Running: ' + ' '.join(command) + '\n')

        f_out = stage.enter(open(vcf_filename, 'wb'))
        f_in = stage.enter(io.pipe_from(command))

        done_extra = False
        for line in f_in:
            # The tag lines go in once, ahead of the first non-'##' line.
            if not (done_extra or line.startswith('##')):
                for tag_line in tag_lines:
                    f_out.write(tag_line + '\n')
                done_extra = True
            f_out.write(line)

    # The stage has closed the VCF by now.
    index_vcf(vcf_filename)
def run(self):
    """Create a reference directory from UCSC table annotations and sequence.

    Table rows are converted to mRNA annotations, grouped under gene
    annotations, and expanded into exon and CDS child annotations.  The
    ``chromFa.tar.gz`` archive is fetched with rsync when ``self.download``
    is set, unpacked into a temporary directory, and handed to
    ``Make_tt_reference`` together with the generated GFF.
    """
    assert self.ucsc_name, 'Need a UCSC genome name'

    scratch = _ucsc_scratch(self)

    # Load annotations
    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    table = scratch.get_table(self.table)
    get_name = scratch.getter(self.name)
    get_product = scratch.getter(self.product)

    def child_of(parent, kind, start, end):
        # Child feature on the parent's sequence and strand.
        return annotation.Annotation(
            source=source,
            type=kind,
            seqid=parent.seqid,
            strand=parent.strand,
            start=start,
            end=end,
            attr={
                'Parent': parent.attr['ID'],
            })

    mrnas = []
    for row in table:
        transcript = annotation.Annotation(
            seqid=row.chrom,
            source=source,
            type='mRNA',
            strand={'+': 1, '-': -1}[row.strand],
            start=int(row.txStart),
            end=int(row.txEnd),
            attr={
                'ID': row.name,
                'Name': get_name(row),
                'Product': get_product(row),
            })
        # The originating row is needed below for exon and CDS coordinates.
        transcript.record = row
        mrnas.append(transcript)

    _uniquify_ids(mrnas)

    annotations = []

    for group in _grouped_features(mrnas):
        # Join the IDs first: they are suffixed with '-mRNA' right after.
        ID = '/'.join(transcript.attr['ID'] for transcript in group)
        for transcript in group:
            transcript.attr['Parent'] = ID
            transcript.attr['ID'] = transcript.attr['ID'] + '-mRNA'

        annotations.append(
            annotation.Annotation(
                source=source,
                type='gene',
                seqid=group[0].seqid,
                strand=group[0].strand,
                start=min(transcript.start for transcript in group),
                end=max(transcript.end for transcript in group),
                attr={
                    'ID': ID,
                    'Name': annotation_tools.join_descriptions(
                        [transcript.attr['Name'] for transcript in group],
                        '/'),
                    'Product': annotation_tools.join_descriptions(
                        [transcript.attr['Product'] for transcript in group],
                        '/'),
                }))

        for transcript in group:
            annotations.append(transcript)

            row = transcript.record
            exonStarts = _parse_ints(row.exonStarts)
            exonEnds = _parse_ints(row.exonEnds)
            cdsStart = int(row.cdsStart)
            cdsEnd = int(row.cdsEnd)

            for start, end in zip(exonStarts, exonEnds):
                annotations.append(child_of(transcript, 'exon', start, end))

                # Skip exons with no coding overlap.
                if not max(cdsStart, start) < min(cdsEnd, end):
                    continue
                #TODO: phase
                annotations.append(
                    child_of(transcript, 'CDS',
                             max(cdsStart, start), min(cdsEnd, end)))

    # Load sequence
    if self.download:
        rsync_command = [
            'rsync', '-P',
            'rsync://hgdownload.cse.ucsc.edu/goldenPath/' + self.ucsc_name +
            '/bigZips/chromFa.tar.gz',
            scratch.ucsc / 'chromFa.tar.gz',
        ]
        io.execute(rsync_command)

    with workspace.tempspace() as temp:
        untar_command = [
            'tar', '-C', temp.working_dir, '-zxf',
            scratch.ucsc / 'chromFa.tar.gz',
        ]
        io.execute(untar_command)

        # Listed before reference.gff is created in the same directory.
        sequences = []
        for filename in natural_sorted(os.listdir(temp.working_dir)):
            sequences.append(temp / filename)

        with open(temp / 'reference.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for feature in annotations:
                print >> f, feature.as_gff()

        builder = Make_tt_reference(
            self.output_dir,
            filenames=sequences + [temp / 'reference.gff'],
            index=self.index,
        )
        builder.run()