def build_bowtie_index(self):
    io.execute([
        'bowtie2-build',
        self.reference_fasta_filename(),
        self/'bowtie',
        ])
def build_snpeff(self):
    jar = io.find_jar('snpEff.jar')

    with open(self/'snpeff.config','wb') as f:
        print >> f, 'data_dir = snpeff'
        print >> f, 'genomes : ' + self.name
        print >> f, self.name + '.genome : ' + self.name

    snpwork = io.Workspace(self/'snpeff', must_exist=False)
    snpwork_genome = io.Workspace(snpwork/self.name, must_exist=False)
    snpwork_genomes = io.Workspace(snpwork/'genomes', must_exist=False)

    annotations = self.annotations_filename()
    assert annotations
    with open(snpwork_genome/'genes.gff','wb') as f:
        for record in annotation.read_annotations(annotations):
            if record.end <= record.start:
                continue
            if not record.attr:
                record.attr['attributes'] = 'none'
            print >> f, record.as_gff()

    with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            io.write_fasta(f, name, seq)

    io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
        JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
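# Hedged sketch (not in the original source): the layout build_snpeff writes
# appears to match what snpEff expects for a custom genome --
#   snpeff.config            data_dir plus genome name registration
#   snpeff/<name>/genes.gff  annotations for the genome
#   snpeff/genomes/<name>.fa reference sequence
# After building, the database can be applied to a VCF much as elsewhere in
# this codebase ('ref' is a hypothetical reference object):
#
#   ref.build_snpeff()
#   io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
#       JAR=io.find_jar('snpEff.jar'), GENOME=ref.name,
#       VCF='calls.vcf', CONFIG=ref/'snpeff.config')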
def get_table(self, table_name):
    if table_name not in self.tables:
        if self.action.download:
            for filename in [ table_name+'.txt.gz', table_name+'.sql' ]:
                io.execute([
                    'rsync', '-P',
                    'rsync://hgdownload.cse.ucsc.edu/goldenPath/' +
                        self.action.ucsc_name + '/database/' + filename,
                    self.ucsc/filename,
                    ])

        fields = [ ]
        with open(self.ucsc/table_name+'.sql','rU') as f:
            for line in f:
                if line.startswith('  `'):
                    parts = line.strip().split()
                    assert parts[0][0] == '`' and parts[0][-1] == '`'
                    fields.append(parts[0][1:-1])
        tup_class = collections.namedtuple(table_name, fields)

        data = [ ]
        with gzip.open(self.ucsc/table_name+'.txt.gz','rb') as f:
            for line in f:
                data.append(tup_class(*line.rstrip('\n').split('\t')))
        self.tables[table_name] = data

    return self.tables[table_name]
def run(self):
    genome = self.genome
    if os.path.isdir(genome):
        genome = os.path.join(genome, os.path.split(genome)[1]+'.genome')
    print genome

    #pref_filename = os.path.join(os.path.expanduser('~'),'igv','prefs.properties')
    #if os.path.exists(pref_filename):
    #    with open(pref_filename,'rb') as f:
    #        lines = f.readlines()
    #    with open(pref_filename,'wb') as f:
    #        for line in lines:
    #            if line.startswith('DEFAULT_GENOME_KEY='):
    #                #line = 'DEFAULT_GENOME_KEY=\n'
    #                continue
    #            f.write(line)

    with workspace.tempspace() as temp:
        with open(temp/'batch.txt','wb') as f:
            print >> f, 'new'
            print >> f, 'preference LAST_TRACK_DIRECTORY', os.getcwd()
            print >> f, 'preference LAST_GENOME_IMPORT_DIRECTORY', os.getcwd()
            print >> f, 'genome '+os.path.abspath(genome)
            for filename in self.files:
                print >> f, 'load '+os.path.abspath(filename)
        io.execute([
            'java', '-Xmx32000m',
            #Flags from igv.sh script:
            '-Dproduction=true', '-Dapple.laf.useScreenMenuBar=true',
            '-Djava.net.preferIPv4Stack=true',
            '-jar', io.find_jar('igv.jar'),
            '-b', temp/'batch.txt',
            ])
def sort_bam(in_filename, out_prefix, by_name=False, cores=8):
    cores = min(cores, legion.coordinator().get_cores())
    megs = max(10, 800 // cores)
    io.execute(
        [ 'samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs ] +
        ([ '-n' ] if by_name else [ ]) +
        [ in_filename, out_prefix ],
        cores=cores)
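# Hedged usage sketch (hypothetical filenames; assumes samtools is on PATH and
# nesoni's io/legion modules are importable). With cores=4 each sort thread
# gets max(10, 800//4) = 200M of sort memory:
#
#   sort_bam('reads.bam', 'demo', cores=4)               # writes demo.bam, coordinate order
#   sort_bam('reads.bam', 'demo_by_name', by_name=True)  # queryname order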
def index_vcf(filename):
    """ IGV index a VCF file.
        Don't fail if igvtools fails (eg not installed).
    """
    try:
        io.execute('igvtools index FILENAME', FILENAME=filename)
    except (OSError, AssertionError):
        print >> sys.stderr, 'Failed to index VCF file with igvtools. Continuing.'
def sort_and_index(in_filename, out_prefix):
    io.execute([ 'samtools', 'sort', in_filename, out_prefix ])
    io.execute([ 'samtools', 'index', out_prefix + '.bam' ])
def build_bowtie_index(self, log_to=sys.stdout):
    io.execute([
        'bowtie2-build',
        self.reference_fasta_filename(),
        self/'bowtie',
        ],
        stdout=log_to,
        )
def run(self):
    base = os.path.split(self.prefix)[1]

    annotations = [ ]
    sequences = [ ]
    for filename in self.filenames:
        recognized = False   #Renamed from 'any', which shadowed the builtin
        if io.is_sequence_file(filename):
            sequences.append(filename)
            recognized = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            recognized = True
        assert recognized, 'File is neither a recognized sequence nor an annotation file'

    cytoband_filename = os.path.join(self.prefix, base+'_cytoband.txt')
    property_filename = os.path.join(self.prefix, 'property.txt')
    gff_filename = os.path.join(self.prefix, base+'.gff')
    output_filenames = [ cytoband_filename, property_filename, gff_filename ]

    if not os.path.exists(self.prefix):
        os.mkdir(self.prefix)

    f = open(property_filename,'wb')
    print >> f, 'ordered=true'
    print >> f, 'id=%s' % base
    print >> f, 'name=%s' % (self.name or base)
    print >> f, 'cytobandFile=%s_cytoband.txt' % base
    print >> f, 'geneFile=%s.gff' % base
    print >> f, 'sequenceLocation=%s' % base
    f.close()

    trivia.As_gff(
        output=gff_filename,
        filenames=annotations,
        exclude=[ 'gene', 'source' ],
        ).run()

    f_cyt = open(cytoband_filename,'wb')
    for filename in sequences:
        for name, seq in io.read_sequences(filename):
            assert '/' not in name
            f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
            f.write(seq)
            f.close()
            print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
    f_cyt.close()

    genome_filename = self.prefix + '.genome'
    if os.path.exists(genome_filename):
        os.unlink(genome_filename)
    io.execute(
        [ 'zip', '-j', io.abspath(genome_filename) ] +
        [ io.abspath(item) for item in output_filenames ]
        )
    for filename in output_filenames:
        if os.path.exists(filename):
            os.unlink(filename)
def run(self):
    with workspace.tempspace() as temp:
        with open(temp/'batch.txt','wb') as f:
            print >> f, 'new'
            print >> f, 'genome '+os.path.abspath(self.genome)
            for filename in self.files:
                print >> f, 'load '+os.path.abspath(filename)
        io.execute(['java', '-jar', io.find_jar('igv.jar'), '-b', temp/'batch.txt'])
def build_shrimp_mmap(self, cs=False):
    suffix = '-cs' if cs else '-ls'

    grace.status('Building SHRiMP mmap')
    io.execute([
        'gmapper' + suffix,
        '--save', self.object_filename('reference' + suffix),
        self.reference_fasta_filename(),
        ])
    grace.status('')
def run(self):
    from nesoni import io

    f_in = self.begin_input()
    f_out = self.begin_output()
    try:
        io.execute(self.command, stdin=f_in, stdout=f_out)
    finally:
        self.end_output(f_out)
        self.end_input(f_in)
def run(self):
    with workspace.tempspace() as temp:
        with open(temp/'batch.txt','wb') as f:
            print >> f, 'new'
            print >> f, 'preference LAST_TRACK_DIRECTORY', os.getcwd()
            print >> f, 'preference LAST_GENOME_IMPORT_DIRECTORY', os.getcwd()
            print >> f, 'genome '+os.path.abspath(self.genome)
            for filename in self.files:
                print >> f, 'load '+os.path.abspath(filename)
        io.execute(['java', '-jar', io.find_jar('igv.jar'), '-b', temp/'batch.txt'])
def run(self):
    reference = reference_directory.Reference(self.reference, must_exist=True)
    jar = io.find_jar('snpEff.jar')
    with open(self.prefix + '.vcf','wb') as f:
        io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
            JAR=jar, GENOME=reference.name, VCF=self.vcf,
            CONFIG=reference/'snpeff.config',
            stdout=f)
    index_vcf(self.prefix+'.vcf')
def run(self):
    assert self.release
    assert self.species
    assert self.assembly
    assert self.dna

    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')

    genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
    genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename

    gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
    gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename

    if self.download:
        self.log.log("Fetching "+genome_url+"\n")
        io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
        self.log.log("Fetching "+gff_url+"\n")
        io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(ensembl/gff_filename))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa','wb') as f:
            for name, seq in io.read_sequences(ensembl/genome_filename):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()
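# Worked example of the URL construction above, using hypothetical parameter
# values release='98', species='Danio_rerio', assembly='GRCz11',
# dna='dna_sm.toplevel':
#
#   genome_url = rsync://ftp.ensembl.org/ensembl/pub/release-98/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.toplevel.fa.gz
#   gff_url    = rsync://ftp.ensembl.org/ensembl/pub/release-98/gff3/danio_rerio/Danio_rerio.GRCz11.98.gff3.gz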
def run(self):
    from nesoni import io

    assert self.command, 'Nothing to execute!'

    print self.ident()

    f_in = self.begin_input()
    f_out = self.begin_output()
    try:
        io.execute(
            self.command[:1] + self.execution_options + self.command[1:],
            stdin=f_in, stdout=f_out)
    finally:
        self.end_output(f_out)
        self.end_input(f_in)
def run(self):
    work = self.get_workspace()
    acc = self.run_accession
    io.execute(
        'wget -c URL',
        URL='http://ftp-private.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
            % (acc[:3], acc[:6], acc, acc),
        cwd=work.working_dir,
        )
    io.execute(
        'fastq-dump --split-files --bzip2 FILENAME',
        FILENAME=acc+'.sra',
        cwd=work.working_dir,
        )
def run(self):
    work = self.get_workspace()
    acc = self.run_accession
    io.execute(
        'wget -c URL',
        #URL='http://ftp-private.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
        URL='http://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
            % (acc[:3], acc[:6], acc, acc),
        cwd=work.working_dir,
        )
    io.execute(
        'fastq-dump --split-files --bzip2 FILENAME',
        FILENAME='./' + acc + '.sra',
        cwd=work.working_dir,
        )
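# Worked example of the path scheme above: for run accession 'SRR1234567',
# acc[:3] is 'SRR' and acc[:6] is 'SRR123', so the fetched URL is
#   http://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR123/SRR1234567/SRR1234567.sra
# and fastq-dump then unpacks ./SRR1234567.sra into per-mate, bzip2-compressed
# fastq files in the working directory.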
def run(self):
    assert self.sort in ('queryname', 'coordinate')

    jar = io.find_jar('MergeSamFiles.jar', 'MergeSamFiles is part of the Picard package.')
    io.execute(
        [ 'java', '-jar', jar,
          'USE_THREADING=true',
          'TMP_DIR='+tempfile.gettempdir(),   #Force picard to use same temp dir as Python
          'SORT_ORDER='+self.sort,
          'OUTPUT='+self.prefix+'.bam',
        ] +
        [ 'INPUT='+item for item in self.bams ]
        )

    if self.sort == 'coordinate' and self.index:
        jar = io.find_jar('BuildBamIndex.jar', 'BuildBamIndex is part of the Picard package.')
        io.execute([ 'java', '-jar', jar, 'INPUT='+self.prefix+'.bam' ])
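# Hedged usage sketch: the enclosing class is not shown, so 'Merge_bams' and
# its constructor arguments are hypothetical stand-ins for whatever tool wraps
# this run(). The intended effect: merge a.bam and b.bam coordinate-sorted
# into merged.bam, then index it with Picard's BuildBamIndex.
#
#   Merge_bams(prefix='merged', bams=['a.bam','b.bam'],
#       sort='coordinate', index=True).run()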
def href(self, filename, title=None, image=False):
    relative = self.workspace.path_as_relative_path(filename)
    if title is None:
        title = os.path.split(filename)[1]
        size = os.stat(filename).st_size
        if size >= 1<<30:
            title += ' (%.1fGb)' % (float(size)/(1<<30))
        elif size >= 1<<20:
            title += ' (%.1fMb)' % (float(size)/(1<<20))
        elif size >= 1<<10:
            title += ' (%.1fkb)' % (float(size)/(1<<10))
    if image:
        thumb_name = 'thumb-'+os.path.splitext(relative)[0]+'.png'
        thumb_filename = self.workspace/thumb_name
        io.execute(['convert', '-thumbnail', '50x50', filename, thumb_filename])
        title = ('<span style="display: inline-block; width: 50px;"><img src="%s"/></span> ' % thumb_name) + title
    return '<a href="%s">%s</a>' % (relative, title)
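# Worked example of the size-suffix logic above (hypothetical file): a
# 2,500,000 byte counts.csv is rendered as 'counts.csv (2.4Mb)', since
# 2500000/(1<<20) is about 2.38; files smaller than 1024 bytes get no suffix.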
def set_sequences(self, filenames):
    reference_genbank_filename = self / 'reference.gbk'
    reference_filename = self / 'reference.fa'

    reference_genbank_file = open(reference_genbank_filename,'wb')
    any_genbank = [ False ]

    def genbank_callback(name, record):
        """ Make a copy of any genbank files passed in. """
        from Bio import SeqIO

        SeqIO.write([record], reference_genbank_file, 'genbank')

        f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
        SeqIO.write([record], f, 'genbank')
        f.close()

        any_genbank[0] = True

    lengths = [ ]
    seen = set()
    f = open(reference_filename, 'wb')
    for filename in filenames:
        for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
            name = name.split()[0]
            assert name not in seen, 'Duplicate chromosome name: ' + name
            seen.add(name)
            lengths.append((name, len(seq)))
            io.write_fasta(f, name, seq)
    f.close()
    self.set_object(lengths, 'reference-lengths.pickle.gz')

    reference_genbank_file.close()
    if not any_genbank[0]:
        os.unlink(reference_genbank_filename)

    # Create an index of the reference sequences for samtools
    io.execute([ 'samtools', 'faidx', reference_filename ])
def run(self):
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)
    workspace.update_param(snp_cost = self.snp_cost)

    #assert os.path.exists(self.reference), 'Reference file does not exist'
    #reference_filename = workspace._object_filename('reference.fa')
    #if os.path.exists(reference_filename):
    #    os.unlink(reference_filename)
    #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        sort_input_filename = temp_filename
        writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
        f = open(self.input, 'rb')
        while True:
            data = f.read(1<<20)
            if not data:
                break
            writer.write(data)
        writer.close()
        f.close()

    grace.status('Sort')

    io.execute([ 'samtools', 'sort', '-n', sort_input_filename, bam_prefix ])

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status('')
def href(self, filename, title=None, image=False):
    relative = self.workspace.path_as_relative_path(filename)
    if title is None:
        title = os.path.split(filename)[1]
        size = os.stat(filename).st_size
        if size >= 1 << 30:
            title += ' (%.1fGb)' % (float(size) / (1 << 30))
        elif size >= 1 << 20:
            title += ' (%.1fMb)' % (float(size) / (1 << 20))
        elif size >= 1 << 10:
            title += ' (%.1fkb)' % (float(size) / (1 << 10))
    if image:
        thumb_name = 'thumb-' + relative
        thumb_filename = self.workspace / thumb_name
        io.execute(['convert', '-thumbnail', '50x50', filename, thumb_filename])
        title = ('<span style="display: inline-block; width: 50px;"><img src="%s"/></span> ' % thumb_name) + title
    return '<a href="%s">%s</a>' % (relative, title)
def run(self):
    workspace = working_directory.Working(self.output_dir)
    workspace.setup_reference(self.reference)

    # assert os.path.exists(self.reference), 'Reference file does not exist'
    # reference_filename = workspace._object_filename('reference.fa')
    # if os.path.exists(reference_filename):
    #     os.unlink(reference_filename)
    # os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")

    if sam.is_bam(self.input):
        sort_input_filename = self.input
        temp_filename = None
    else:
        temp_filename = io.abspath(self.output_dir, "temp.bam")
        sort_input_filename = temp_filename
        writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
        f = open(self.input, "rb")
        while True:
            data = f.read(1 << 20)
            if not data:
                break
            writer.write(data)
        writer.close()
        f.close()

    grace.status("Sort")

    io.execute(["samtools", "sort", "-n", sort_input_filename, bam_prefix])

    if temp_filename is not None:
        os.unlink(temp_filename)

    grace.status("")
def run(self):
    assert self.method in ("limma", "fitnoise1", "fitnoise2"), "Unknown method."
    assert self.method != "limma" or not self.empirical_controls

    title = self.get_title()

    n_alt = len(self.alt)
    n_null = len(self.null)

    suffix = '-dedup' if self.dedup else ''

    genewise_filename = join(self.analysis,'expression','genewise'+suffix,'counts.csv')
    genewise_norm_filename = join(self.analysis,'expression','genewise'+suffix,'norm.csv')

    primarypeakwise_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'counts.csv')
    primarypeakwise_norm_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'norm.csv')

    peakwise_filename = join(self.analysis,'expression','peakwise'+suffix,'counts.csv')
    peakwise_norm_filename = join(self.analysis,'expression','peakwise'+suffix,'norm.csv')

    pairwise_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs.csv')
    pairwise_norm_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs-norm.csv')

    reader = io.Table_reader(genewise_filename, 'Count')
    reader.close()

    samples = [ item for i, item in enumerate(reader.headings)
                if reader.groups[i] == 'Count' ]

    tags = { }
    for item in samples:
        tags[item] = [ item ]
    for line in reader.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            tags[parts[0]] = parts

    model = [ ]
    for term in self.alt + self.null:
        spec = selection.term_specification(term)
        model.append([ selection.weight(spec, tags[item]) for item in samples ])
    model = zip(*model) #Transpose

    select = [ any(row) for row in model ]
    model = [ row for row, selected in zip(model, select) if selected ]
    model_columns = [ selection.term_name(item) for item in self.alt + self.null ]
    model_rows = [ item for keep, item in zip(select, samples) if keep ]

    #degust complains if name starts with '-', delimits with commas
    model_columns = [ ('.' if item[:1] == '-' else '') + item.replace(',',';')
                      for item in model_columns ]

    pairs_n_alt = n_alt
    pairs_select = select + select
    pairs_model = (
        [ (0,) * n_alt + row + (0,) for row in model ] +
        [ row[:n_alt] + row + (1,) for row in model ]
        )
    pairs_model_columns = (
        [ item+'-interaction' for item in model_columns[:n_alt] ] +
        model_columns +
        [ 'pair2' ]
        )
    pairs_model_rows = (
        [ item+'-peak1' for item in model_rows ] +
        [ item+'-peak2' for item in model_rows ]
        )

    design_str = '['+('-'*(8*n_alt-2))+'] test coefficients\n'
    for row, name in zip(model, model_rows):
        design_str += "%s %s\n" % (''.join('%7g ' % item for item in row), name)

    print
    print "Design matrix"
    print design_str
    print
    print 'Pair design matrix'
    print '['+('-'*(8*n_alt-2))+'] test coefficients'
    for row, name in zip(pairs_model, pairs_model_rows):
        print ''.join('%7g ' % item for item in row), name
    print

    workspace = self.get_workspace()

    runr.run_script(TEST_R, self.tell,
        DIR = workspace.working_dir,
        METHOD = self.method,
        WEIGHT = self.weight,
        EMPIRICAL_CONTROLS = self.empirical_controls,
        MIN_READS = self.min_reads,
        BIOTYPE = self.biotype,
        RELATION = self.relation,
        QUANTILE_TAIL = self.quantile_tail,
        DO_EXPRESSION = self.do_expression,
        DO_TAIL_LENGTH = self.do_tail_length,
        VERBOSE = self.verbose,
        GENEWISE_FILENAME = genewise_filename,
        GENEWISE_NORM_FILENAME = genewise_norm_filename,
        PRIMARYPEAKWISE_FILENAME = primarypeakwise_filename,
        PRIMARYPEAKWISE_NORM_FILENAME = primarypeakwise_norm_filename,
        PEAKWISE_FILENAME = peakwise_filename,
        PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
        PAIRWISE_FILENAME = pairwise_filename,
        PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
        N_ALT = n_alt,
        SELECT = select,
        MODEL = model,
        MODEL_COLUMNS = model_columns,
        PAIRS_N_ALT = pairs_n_alt,
        PAIRS_SELECT = pairs_select,
        PAIRS_MODEL = pairs_model,
        PAIRS_MODEL_COLUMNS = pairs_model_columns,
        )
    if self.tell:
        return

    reporter = reporting.Reporter(workspace.working_dir, title, style=web.style())

    if self.dedup:
        reporter.p('Read deduplication was used.')

    reporter.write('<table>\n')

    for is_expression, entities, result, aveexpr, subtitle, terms in [
            (True, 'genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
            (False, 'genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
            (True, 'primary peaks', 'primarypeakwise-voom', 'avg.expression', 'Primary-peakwise expression level', model_columns[:n_alt]),
            (False, 'primary peaks', 'primarypeakwise-tail', 'avg.tail', 'Primary-peakwise tail length', model_columns[:n_alt]),
            (True, 'peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
            (False, 'peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
            (True, 'peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
            (False, 'peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
            ]:
        #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
        #n = 0
        #n_01 = 0
        #n_05 = 0
        #for row in data.values():
        #    fdr = float(row['adj.P.Val'])
        #    if fdr <= 0.01: n_01 += 1
        #    if fdr <= 0.05: n_05 += 1
        #    n += 1

        if is_expression and not self.do_expression:
            continue
        if not is_expression and not self.do_tail_length:
            continue

        io.execute([
            'degust.py',
            '--name', title + ' : ' + subtitle,
            '--avg', aveexpr,
            '--primary', 'baseline',
            '--logFC', ','.join(terms),
            '--fdr', 'adj.P.Val',
            '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,'+aveexpr,
            '--notour', '1',
            '--out', workspace/(result+'.html'),
            workspace/(result+'-toptable.csv'),
            ])

        with open(workspace/(result+'.txt'),'rU') as f:
            lines = f.readlines()

        reporter.write('<tr><td valign="top" width="33%">')
        reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
        #reporter.p( '%d %s, %d with fdr<=0.01, %d with fdr<=0.05' % (n,entities,n_01,n_05) )
        line = reporter.href(workspace/(result+'-toptable.csv'), 'Spreadsheet')
        if result.endswith('voom'):
            line += ', ' + reporter.href(workspace/(result+'.png'), 'voom plot')
        reporter.p(line)
        for line in lines[-2:]:
            reporter.p(line.strip())
        reporter.write('</td><td valign="top"><br/><br/>')
        for line in lines[:-2]:
            reporter.write(line.strip() + '<br/>\n')
        reporter.write('</td></tr>')

    reporter.write('</table>\n')

    reporter.subheading("Design matrix")
    reporter.write('<pre>' + design_str + '</pre>')

    reporter.close()
def run(self):
    assert self.ucsc_name, 'Need a UCSC genome name'

    scratch = _ucsc_scratch(self)

    # Load annotations

    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    table = scratch.get_table(self.table)
    get_name = scratch.getter(self.name)
    get_product = scratch.getter(self.product)

    mrnas = [ ]

    for item in table:
        ann = annotation.Annotation(
            seqid = item.chrom,
            source = source,
            type = 'mRNA',
            strand = {'+':1, '-':-1}[item.strand],
            start = int(item.txStart),
            end = int(item.txEnd),
            attr = {
                'ID' : item.name,
                'Name' : get_name(item),
                'Product' : get_product(item),
                #'UCSC_name2' : item.name2,
                }
            )
        ann.record = item
        mrnas.append(ann)

    _uniquify_ids(mrnas)

    annotations = [ ]

    for group in _grouped_features(mrnas):
        ID = '/'.join(item.attr['ID'] for item in group)
        for item in group:
            item.attr['Parent'] = ID
            item.attr['ID'] = item.attr['ID'] + '-mRNA'

        annotations.append(annotation.Annotation(
            source = source,
            type = 'gene',
            seqid = group[0].seqid,
            strand = group[0].strand,
            start = min(item.start for item in group),
            end = max(item.end for item in group),
            attr = {
                'ID' : ID,
                'Name' : annotation_tools.join_descriptions([ item.attr['Name'] for item in group ], '/'),
                'Product' : annotation_tools.join_descriptions([ item.attr['Product'] for item in group ], '/'),
                #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
                }
            ))

        for item in group:
            annotations.append(item)

            exonStarts = _parse_ints(item.record.exonStarts)
            exonEnds = _parse_ints(item.record.exonEnds)
            cdsStart = int(item.record.cdsStart)
            cdsEnd = int(item.record.cdsEnd)
            for start, end in zip(exonStarts, exonEnds):
                annotations.append(annotation.Annotation(
                    source = source,
                    type = 'exon',
                    seqid = item.seqid,
                    strand = item.strand,
                    start = start,
                    end = end,
                    attr = {
                        'Parent' : item.attr['ID'],
                        }
                    ))
                if max(cdsStart, start) < min(cdsEnd, end):
                    annotations.append(annotation.Annotation(
                        source = source,
                        type = 'CDS',
                        seqid = item.seqid,
                        strand = item.strand,
                        start = max(cdsStart, start),
                        end = min(cdsEnd, end),
                        #TODO: phase
                        attr = {
                            'Parent' : item.attr['ID'],
                            }
                        ))

    # Load sequence

    if self.download:
        io.execute([
            'rsync', '-P',
            'rsync://hgdownload.cse.ucsc.edu/goldenPath/'+self.ucsc_name+'/bigZips/chromFa.tar.gz',
            scratch.ucsc/'chromFa.tar.gz',
            ])

    with workspace.tempspace() as temp:
        io.execute(['tar', '-C', temp.working_dir, '-zxf', scratch.ucsc/'chromFa.tar.gz'])

        sequences = [ temp/item for item in natural_sorted(os.listdir(temp.working_dir)) ]

        with open(temp/'reference.gff','wb') as f:
            annotation.write_gff3_header(f)
            for item in annotations:
                print >> f, item.as_gff()

        Make_tt_reference(
            self.output_dir,
            filenames = sequences + [ temp/'reference.gff' ],
            index = self.index,
            ).run()
def run(self):
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    tags = { }
    for item in reader.metadata.get('sampleTags', [ ]):
        parts = item.split(',')
        tags[parts[0]] = parts

    assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'
    samples = [ 'reference' ] + reader.samples

    for sample in samples:
        if sample not in tags:
            tags[sample] = [ sample, 'all' ]

    samples = selection.select_and_sort(
        self.select, self.sort, samples, lambda sample: tags[sample])

    required = [ i for i, sample in enumerate(samples)
                 if selection.matches(self.require, tags[sample]) ]

    sample_number = dict((b,a) for a,b in enumerate(reader.samples))

    items = [ ]
    for record in reader:
        variants = get_variants(record)
        genotypes = [ ]
        counts = [ ]
        qualities = [ ]
        for sample in samples:
            if sample == 'reference':
                genotypes.append([0])
                counts.append([1])
                qualities.append(float('inf'))
            else:
                genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                qualities.append(record.samples[sample_number[sample]].data.GQ)

        # Only output when there are at least two genotypes
        any_interesting = False
        for i in xrange(len(genotypes)):
            for j in xrange(i):
                if (genotypes[i] is not None and genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                    any_interesting = True
                    break
            if any_interesting:
                break
        if not any_interesting:
            continue

        if any(genotypes[i] is None for i in required):
            continue

        if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
            continue

        snpeff = snpeff_describe(record.INFO.get('EFF',''))
        if not any(selection.matches(self.snpeff_filter, item[1])
                   for item in (snpeff or [('',[])])):
            continue

        items.append(_Nway_record(
            variants=variants, genotypes=genotypes, counts=counts,
            qualities=qualities, snpeff=snpeff, record=record))

    self.log.log('%d variants\n\n' % len(items))

    if self.as_ == 'table':
        self._write_table(samples, items)
    elif self.as_ == 'nexus':
        self._write_nexus(samples, items)
    elif self.as_ == 'splitstree':
        self._write_nexus(samples, items)

        io.execute(
            'SplitsTree +g -i INPUT -x COMMAND',
            no_display=True,
            INPUT=self.prefix + '.nex',
            COMMAND='UPDATE; '
                'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                'QUIT'
                % (self.prefix, self.prefix, len(items)),
            )
    elif self.as_ == 'vcf':
        self._write_vcf(samples, items, reader)
    else:
        raise grace.Error('Unknown output format: ' + self.as_)
def run(self):
    assert self.reads or self.pairs or self.interleaved, 'No reads given'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    log_file = open(self.log_filename(),'wb')

    with workspace.tempspace(dir=working.working_dir) as temp:
        n = [ 0 ]
        def tempname():
            n[0] += 1
            return temp/('%d.fq'%n[0])
        def convert(filename):
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name,'wb') as f:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = [ ]
        twos = [ ]
        singles = [ ]

        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))

        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name,'wb') as left, \
                 open(right_name,'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)

                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error('Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name, seq, qual)

        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())

        command = (
            [ 'bowtie2',
              '--threads', str(cores),
              '--rg-id', '1',
              '--rg', 'SM:'+working.name,
            ] +
            self.bowtie_options +
            [ '-x', reference.get_bowtie_index_prefix() ]
            )
        commands = [ ]
        if ones:
            commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
        if singles:
            commands.append(command + [ '-U', ','.join(singles) ])

        temp_bam_name = temp/'temp.bam'

        with io.pipe_to(
                ['samtools', 'view', '-S', '-b', '-'],
                stdout=open(temp_bam_name,'wb'), stderr=log_file
                ) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file, cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True

        io.execute([ 'samtools', 'sort', '-n', temp_bam_name, working/'alignments' ])

    log_file.close()
def sort_and_index_bam(in_filename, out_prefix, cores=8):
    sort_bam(in_filename, out_prefix, cores=cores)
    io.execute([ 'samtools', 'index', out_prefix + '.bam' ])
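# Hedged usage sketch (hypothetical filenames): sort_and_index_bam chains the
# two steps, leaving demo.bam and the demo.bam.bai index that samtools writes
# alongside it:
#
#   sort_and_index_bam('alignments_raw.bam', 'demo', cores=4)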
def run(self):
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index and self.shrimp else False,
        ls = False,
        bowtie = 'ifavailable' if self.index and self.bowtie else False,
        ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    exon_index = span_index.index_annotations([
        item for item in annotations if item.type == "exon" ])
    mrna_end_index = span_index.index_annotations([
        item.three_prime() for item in annotations if item.type == "mRNA" ])

    mrna_utrs = [ ]
    gene_utrs = [ ]

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
        assert mrnas, "Gene without any mRNAs: "+gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(item.start for item in mrnas)
        gene.end = max(item.end for item in mrnas)
        gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))

        gene_utr_5primes = [ ]

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))

            cdss = [ item for item in mrna.children if item.type == 'CDS' ]
            exons = [ item for item in mrna.children if item.type == 'exon' ]
            if not exons:
                continue

            #link up annotations sorts children, so final is really final
            for item in exons[:-1]:
                item.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss:
                continue

            mrna_utr_5primes = [ ]
            if gene.strand >= 0:
                cds_3prime = max(item.end for item in cdss)
                for item in exons:
                    if item.end >= cds_3prime:
                        mrna_utr_5primes.append(max(item.start, cds_3prime))
            else:
                cds_3prime = min(item.start for item in cdss)
                for item in exons:
                    if item.start <= cds_3prime:
                        mrna_utr_5primes.append(min(item.end, cds_3prime))

            if mrna.strand >= 0:
                utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                utr_end = max(utr_start+1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                utr_start = min(mrna.start, utr_end-1)
                gene_utr_5primes.append(utr_end)

            attr = mrna.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = mrna.seqid,
                strand = mrna.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            max_ext = _max_extension(utr, exon_index, mrna_end_index)
            utr.attr["max_extension"] = str(max_ext)
            #Only include if there is an annotated 3' UTR or end is not in the
            #middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(utr)

        if gene.strand >= 0:
            utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
            utr_end = max(utr_start+1, gene.end)
        else:
            utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
            utr_start = min(gene.start, utr_end-1)

        attr = gene.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID']+'-3UTR'
        attr['color'] = '#008888'
        utr = annotation.Annotation(
            source = 'tt',
            type = 'three_prime_utr',
            seqid = gene.seqid,
            strand = gene.strand,
            start = utr_start,
            end = utr_end,
            attr = attr,
            )
        utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
        gene_utrs.append(utr)

    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)

    if self.index and self.star and grace.can_execute("STAR"):
        star_work = workspace.Workspace(work/'star')
        io.execute([
            'STAR', '--runMode', 'genomeGenerate',
            '--outFileNamePrefix', star_work.working_dir+'/',
            '--genomeDir', star_work.working_dir,
            '--genomeFastaFiles', work/'reference.fa',
            '--sjdbGTFfile', work/'reference.gff',
            '--sjdbGTFtagExonParentTranscript', 'Parent',
            '--sjdbOverhang', '100',
            ])

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    title = self.get_title()

    n_alt = len(self.alt)
    n_null = len(self.null)

    suffix = '-dedup' if self.dedup else ''

    genewise_filename = join(self.analysis,'expression','genewise'+suffix,'counts.csv')
    genewise_norm_filename = join(self.analysis,'expression','genewise'+suffix,'norm.csv')

    peakwise_filename = join(self.analysis,'expression','peakwise'+suffix,'counts.csv')
    peakwise_norm_filename = join(self.analysis,'expression','peakwise'+suffix,'norm.csv')

    pairwise_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs.csv')
    pairwise_norm_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs-norm.csv')

    reader = io.Table_reader(genewise_filename, 'Count')
    reader.close()

    samples = [ item for i, item in enumerate(reader.headings)
                if reader.groups[i] == 'Count' ]

    tags = { }
    for item in samples:
        tags[item] = [ item ]
    for line in reader.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            tags[parts[0]] = parts

    model = [ ]
    for term in self.alt + self.null:
        spec = term_specification(term)
        model.append([ 1 if selection.matches(spec, tags[item]) else 0 for item in samples ])
    model = zip(*model) #Transpose

    select = [ any(row) for row in model ]
    model = [ row for row, selected in zip(model, select) if selected ]
    model_columns = [ term_name(item) for item in self.alt + self.null ]

    pairs_n_alt = n_alt
    pairs_select = select + select
    pairs_model = (
        [ (0,) * n_alt + row + (0,) for row in model ] +
        [ row[:n_alt] + row + (1,) for row in model ]
        )
    pairs_model_columns = (
        [ item+'-interaction' for item in model_columns[:n_alt] ] +
        model_columns +
        [ 'pair2' ]
        )

    workspace = self.get_workspace()

    runr.run_script(TEST_R, self.tell,
        SOURCE = os.path.join(os.path.dirname(__file__),'tail_tools.R'),
        DIR = workspace.working_dir,
        MIN_READS = self.min_reads,
        GENEWISE_FILENAME = genewise_filename,
        GENEWISE_NORM_FILENAME = genewise_norm_filename,
        PEAKWISE_FILENAME = peakwise_filename,
        PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
        PAIRWISE_FILENAME = pairwise_filename,
        PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
        N_ALT = n_alt,
        SELECT = select,
        MODEL = model,
        MODEL_COLUMNS = model_columns,
        PAIRS_N_ALT = pairs_n_alt,
        PAIRS_SELECT = pairs_select,
        PAIRS_MODEL = pairs_model,
        PAIRS_MODEL_COLUMNS = pairs_model_columns,
        )
    if self.tell:
        return

    reporter = reporting.Reporter(workspace.working_dir, title)

    if self.dedup:
        reporter.p('Read deduplication was used.')

    for entities, result, aveexpr, subtitle, terms in [
            ('genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
            ('genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
            ('peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
            ('peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
            ('peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
            ('peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
            ]:
        #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
        #n = 0
        #n_01 = 0
        #n_05 = 0
        #for row in data.values():
        #    fdr = float(row['adj.P.Val'])
        #    if fdr <= 0.01: n_01 += 1
        #    if fdr <= 0.05: n_05 += 1
        #    n += 1

        io.execute([
            'degust.py',
            '--name', title + ' : ' + subtitle,
            '--avg', aveexpr,
            '--primary', 'baseline',
            '--logFC', ','.join(terms),
            '--fdr', 'adj.P.Val',
            '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,'+aveexpr,
            '--notour', '1',
            '--out', workspace/(result+'.html'),
            workspace/(result+'-toptable.csv'),
            ])

        reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
        #reporter.p( '%d %s, %d with fdr<=0.01, %d with fdr<=0.05' % (n,entities,n_01,n_05) )

        with open(workspace/(result+'.txt'),'rU') as f:
            for line in f:
                reporter.write(line.strip() + '<br/>\n')

    reporter.close()
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    read_sets = [ ]
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(grace.how_many_cpus()),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
        }

    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None

    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    cutoff = '55%' #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h')+1]

    #Create working directory
    workspace = self.get_workspace() #working_directory.Working(self.output_dir, must_exist=False)
    workspace.setup_reference(self.references)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    #workspace = io.Workspace(self.output_dir)
    #
    #workspace.update_param(
    #    shrimp_cutoff = cutoff
    #)
    #
    ##Make copy of reference sequences
    #
    #reference_filename = io.abspath(self.output_dir,'reference.fa')
    #reference_file = open(reference_filename,'wb')
    #
    #reference_genbank_filename = io.abspath(self.output_dir,'reference.gbk')
    #reference_genbank_file = open(reference_genbank_filename,'wb')
    #any_genbank = [ False ]
    #
    #def genbank_callback(name, record):
    #    """ Make a copy of any genbank files passed in. """
    #    from Bio import SeqIO
    #
    #    SeqIO.write([record], reference_genbank_file, 'genbank')
    #
    #    f = open(os.path.join(
    #        self.output_dir,
    #        grace.filesystem_friendly_name(name) + '.gbk'
    #    ), 'wb')
    #    SeqIO.write([record], f, 'genbank')
    #    f.close()
    #
    #    any_genbank[0] = True
    #
    #for filename in self.references:
    #    for name, sequence in io.read_sequences(filename, genbank_callback=genbank_callback):
    #        #Don't retain any comment
    #        name = name.split()[0]
    #        io.write_fasta(reference_file, name, sequence.upper())
    #
    #        f = open(os.path.join(
    #            self.output_dir,
    #            grace.filesystem_friendly_name(name) + '.fa'
    #        ), 'wb')
    #        io.write_fasta(f, name, sequence.upper())
    #        f.close()
    #
    #reference_file.close()
    #reference_genbank_file.close()
    #if not any_genbank[0]:
    #    os.unlink(reference_genbank_filename)
    #
    ## Create an index of the reference sequences
    #io.execute([
    #    'samtools', 'faidx', reference_filename
    #])

    #Run shrimp

    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

    temp_filename = io.abspath(self.output_dir, 'temp.bam')

    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')

    sam_eater = sam.Bam_writer(temp_filename)

    #if self.cs:
    #    program = 'gmapper-cs'
    #else:
    #    program = 'gmapper-ls'

    sam_header_sent = [ False ]
    n_seen = [ 0 ]

    def eat(process):
        for line in process.stdout:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        assert process.wait() == 0, 'shrimp failed'
        sam_header_sent[0] = True

    def remove_pair_options(options):
        for flag in ['-p','-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos+2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos+1:]
        return options

    if '--qv-offset' not in self.shrimp_options:
        guesses = [ ]
        for filenames, is_paired in read_sets:
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
        default_options['--qv-offset'] = str(guesses[0])

    for filenames, is_paired in read_sets:
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3 #A little ugly
            for filename in filenames
            )
        if has_qualities:
            options.append('--fastq')
        #    temp_read_filename = io.abspath(working_dir, 'temp.fa')
        #else:
        #    temp_read_filename = io.abspath(working_dir, 'temp.fq')

        #try:
        #if len(filenames) == 1: # gmapper can cope with gzipped
        #   and filenames[0].endswith('.fa') or filenames[0].endswith('.fq'):
        #    actual_read_filename = filenames[0]
        #else:
        #    actual_read_filename = temp_read_filename
        #    grace.status('Copying reads')
        #    f = open(temp_read_filename, 'wb')
        #    if has_qualities:
        #        for reads in itertools.izip(*[ io.read_sequences(filename, qualities=True) for filename in filenames ]):
        #            for name, seq, qual in reads:
        #                io.write_fastq(f, name, seq, qual)
        #    else:
        #        for reads in itertools.izip(*[ io.read_sequences(filename) for filename in filenames ]):
        #            for name, seq in reads:
        #                io.write_fasta(f, name, seq)
        #    f.close()
        #    grace.status('')

        if len(filenames) == 1:
            reads_parameters = [ filenames[0] ]
        else:
            reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]

        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status('')

        full_param = reference.shrimp_command(self.cs, options + reads_parameters)

        print >> sys.stderr, 'Running', ' '.join(full_param)

        p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
        eat(p)

    #finally:
    #    if os.path.exists(temp_read_filename):
    #        os.unlink(temp_read_filename)

    log_file.close()

    sam_eater.close()

    grace.status('Sort')

    io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status('')
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    read_sets = [ ]
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
        }

    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None

    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None

    cutoff = "55%"  # Default changed in SHRiMP 2.0.2
    if "-h" in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

    # Run shrimp

    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")
    bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

    temp_filename = io.abspath(self.output_dir, "temp.bam")

    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
    log_file = open(log_filename, "wb")

    sam_eater = sam.Bam_writer(temp_filename)

    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  # A little ugly
            for filename in filenames
            )
        if has_qualities:
            options.append("--fastq")

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

        if "--qv-offset" not in self.shrimp_options:
            guesses = [ ]
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, \
                "Conflicting quality offset guesses, please specify --qv-offset manually."
            default_options["--qv-offset"] = str(guesses[0])

        default_options["--read-group"] = "%s,%s" % (
            workspace.name.replace(",", "_"),
            workspace.name.replace(",", "_"),
            )

        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status("")

        full_param = reference.shrimp_command(self.cs, options + reads_parameters)

        print >> sys.stderr, "Running", " ".join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()

    sam_eater.close()

    grace.status("Sort")

    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status("")