def parse_fragFile(fragfile, chrom_dict=None):
    """
    Parse fragment file to create segment info bed file and fragment bed file.

    The first line of *fragfile* is treated as a header and skipped, and so is
    every line containing ``FragIsNotValid``. Every other line is split on
    tabs and produces two rows in the segment info file (start and end
    segments) and three rows in the fragment bed file (the fragment itself,
    its start sequence and its end sequence).

    :param fragfile: path to the tab-separated fragment file.
    :param chrom_dict: optional mapping applied to the chromosome identifier
        read from the second column; unknown identifiers are kept unchanged.
    :returns: list ``[segInfoBedFile, fragmentBedFile]`` with the names of the
        two files written.
    """
    if chrom_dict is None:
        chrom_dict = {}
    segInfoBedFile = unique_filename_in()
    fragmentBedFile = unique_filename_in()
    # Context managers close all three handles even when a malformed line
    # raises in the middle of the parsing loop.
    with open(segInfoBedFile, 'w') as o, \
            open(fragmentBedFile, 'w') as obed, \
            open(fragfile, 'r') as f:
        next(f, None)  # skip the header line; an empty file yields empty outputs
        for line in f:
            if 'FragIsNotValid' in line:
                continue
            s = line.strip().split('\t')
            chrom = chrom_dict.get(s[1], s[1])
            # Name field shared by both segments; the four trailing '0' are
            # placeholder slots appended to every name.
            fragmentInfo = '|'.join([
                '', s[0],
                chrom + ':' + str(int(s[2]) + 1) + '-' + s[3],
                'indexOfSecondRestSiteOcc=' + s[10],
                'status=' + s[-1],
                'length=' + str(int(s[3]) - int(s[2])),
                '0', '0', '0', '0'])
            o.write('\t'.join([chrom, s[5], s[6], 'type=startSegment' + fragmentInfo]) + '\n')
            o.write('\t'.join([chrom, s[8], s[9], 'type=endSegment' + fragmentInfo]) + '\n')
            row = [chrom, s[2], s[3], 'frag' + s[0]]
            obed.write('\t'.join(row) + '\n')
            row[1:3] = [s[5], s[6]]
            obed.write('\t'.join(row) + '_startSeq\n')
            row[1:3] = [s[8], s[9]]
            obed.write('\t'.join(row) + '_endSeq\n')
    return [segInfoBedFile, fragmentBedFile]
def motif_scan(ex, bedlist, assembly, groups, via, logfile):
    """
    Scan the regions of every group for that group's motifs.

    Each entry of ``group['motif']`` is resolved, in order, as an existing
    path, as a path relative to the parent of the remote working directory,
    or as a ``"<id> <name>"`` pair fetched through ``_gnrp.get_motif_PWM``.

    :param bedlist: dict ``{group id: bed file}``.
    :param groups: dict ``{group id: group description}``.
    :returns: dict ``{group id: result of save_motif_profile}``.
    """
    logfile.write("Scanning motifs\n")
    logfile.flush()
    parent_dir = os.path.split(ex.remote_working_directory)[0]
    results = {}
    for gid, bedfile in bedlist.iteritems():
        logfile.write("\n%i: " % gid)
        logfile.flush()
        group = groups[gid]
        pwm_by_name = {}
        for mot in group.get('motif', []):
            in_parent = os.path.join(parent_dir, mot)
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_by_name[mname] = mot
            elif os.path.exists(in_parent):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_by_name[mname] = in_parent
            else:
                # Not a file: expect "<genrep id> <motif name>".
                _gnid, mname = mot.split(' ')
                pwm_by_name[mname] = _gnrp.get_motif_PWM(
                    int(_gnid), mname, output=unique_filename_in())
            logfile.write(mname + ", ")
            logfile.flush()
        description = set_file_descr(group['name'] + '_motifs.bed',
                                     type='bed', ucsc='1', step='motifs',
                                     groupId=gid)
        outfile = unique_filename_in()
        header = "track name='%s_motifs'" % group['name']
        results[gid] = save_motif_profile(ex, pwm_by_name, assembly, bedfile,
                                          keep_max_only=True, output=outfile,
                                          header=header,
                                          description=description, via=via)
    return results
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """
    Plot, for every group, the average signal profile around each motif.

    Motif occurrences are read from the group's bed file and grouped by the
    pair (text before ":" in the name field, length of the text after it).
    Signals are accumulated per bin over all chromosomes, divided by the
    number of regions, plotted with ``lineplot`` and dumped to a text matrix.

    :param bedlist: dict ``{group id: motif bed file}``.
    :param signals: dict ``{group id: list of signal tracks}``.
    :param chrnames: chromosome names to iterate over.
    :returns: dict ``{group id: {'pdf': file name, 'mat': [(motif name, file)]}}``.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist)
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # Collect (start, end) pairs per (motif name, motif length).
            regions_by_motif = {}
            for r in tmotif.read(chrom):
                name_parts = r[3].split(":")
                key = (name_parts[0], len(name_parts[1]))
                regions_by_motif.setdefault(key, []).append(r[1:3])
            for motif, regs in regions_by_motif.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * flank,
                                               len(group_signals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                features = sorted_stream(
                    segment_features(
                        FeatureStream(regs, fields=['start', 'end']),
                        nbins=motif[1], upstream=_plot_flank,
                        downstream=_plot_flank))
                scored = score_by_feature(
                    [s.read(chrom) for s in group_signals], features)
                for t in scored:
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        remaining = len(data)
        for motif, dat in data.iteritems():
            remaining -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            # Flank positions keep their integer offsets, motif positions are
            # labelled "1".."nbins".
            X = list(range(-flank, flank + nbins))
            for k in range(nbins):
                X[k + flank] = str(k + 1)
            # Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2], output=files[gid]['pdf'], new=new,
                     last=(remaining == 0), legend=snames, main=mname)
            new = False
            matrix_file = unique_filename_in()
            with open(matrix_file, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x) for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, matrix_file))
    return files
def wellington(bed, bam, output=None, options=[]):
    """
    Binds the ``wellington_footprints.py`` program:
    `<http://pythonhosted.org/pyDNase/scripts.html#wellington-footprints-py>`_.

    A fresh output directory is created for every call. The returned
    ``return_value`` is the pair ``(outdir, output)``.
    """
    if output is None:
        output = unique_filename_in()
    outdir = unique_filename_in()
    os.mkdir(outdir)
    command = ["wellington_footprints.py", "-o", output]
    command = command + options + [bed, bam, outdir]
    return {'arguments': command, 'return_value': (outdir, output)}
def FDR_threshold(ex, motif, background, assembly, regions, alpha=.1, nb_samples=1, via='lsf'):
    """
    Computes a score threshold for 'motif' on 'regions' based on a false
    discovery rate < alpha and returns the threshold, or a dictionary with
    keys thresholds and values simulated FDRs when alpha < 0.

    True positives are the scan hits on the real sequences, false positives
    the hits on ``nb_samples`` scans of the shuffled sequences. Scores are
    rounded to integers before counting.
    """
    fasta, _size = assembly.fasta_from_regions(regions, ex=ex)
    shuf_fasta, _shuf_size = assembly.fasta_from_regions(regions, shuffled=True, ex=ex)
    output = unique_filename_in()
    # Threshold at -100 to get all scores!
    future = motif_scan.nonblocking(ex, fasta, motif, background, -100,
                                    stdout=output, via=via)
    shuf_futures = {}
    for _sample in range(nb_samples):
        shuf_out = unique_filename_in()
        shuf_futures[shuf_out] = motif_scan.nonblocking(
            ex, shuf_fasta, motif, background, -100, stdout=shuf_out, via=via)
    future.wait()

    # Histogram of rounded scores on the real sequences.
    TP_scores = {}
    ntp = 0
    with open(output, 'r') as fin:
        for line in fin:
            score = int(round(float(line.split("\t")[2])))
            TP_scores[score] = TP_scores.get(score, 0) + 1
            ntp += 1

    # Descending score grid, padded with one value above the maximum and
    # one below the scan cutoff.
    scores = sorted(TP_scores, reverse=True)
    scores = [scores[0] + 1] + scores + [-101]

    # Shuffled hits are binned on the largest grid value not above them.
    FP_scores = dict.fromkeys(scores, 0)
    nfp = 0
    for shuf_file, fut in shuf_futures.iteritems():
        fut.wait()
        with open(shuf_file, 'r') as fin:
            for line in fin:
                fscore = int(round(float(line.split("\t")[2])))
                tscore = max(k for k in scores if k <= fscore)
                FP_scores[tscore] += 1
                nfp += 1

    # Turn both histograms into cumulative counts from the highest score.
    TP_scores[scores[-1]] = ntp
    TP_scores[scores[0]] = 0
    FP_scores[scores[-1]] = nfp
    for prev, sc in zip(scores, scores[1:-1]):
        TP_scores[sc] += TP_scores[prev]
        FP_scores[sc] += FP_scores[prev]

    # Walk thresholds upwards, tracking the best FDR seen so far.
    cur_fdr = 1.0
    threshold = scores[0]
    for k in sorted(FP_scores):
        tp = TP_scores[k]
        if tp > 0:
            ratio = FP_scores[k] / float(tp)
            if ratio < cur_fdr:
                cur_fdr = ratio
        if cur_fdr <= alpha:
            threshold = k
            break
        FP_scores[k] = cur_fdr
    if alpha < 0:
        return FP_scores
    return threshold
def wellington(bed, bam, output=None, options=[]):
    """
    Binds the ``wellington_footprints.py`` program:
    `<http://pythonhosted.org/pyDNase/scripts.html#wellington-footprints-py>`_.

    A fresh output directory is created for every call. The returned
    ``return_value`` is the pair ``(outdir, output)``.
    """
    if output is None:
        output = unique_filename_in()
    outdir = unique_filename_in()
    os.mkdir(outdir)
    command = ["wellington_footprints.py", "-o", output]
    command = command + options + [bed, bam, outdir]
    return {'arguments': command, 'return_value': (outdir, output)}
def getRestEnzymeOccAndSeq(fasta_file, prim_site, sec_site, l_seg, l_type='typeI'):
    """
    Creates segments and fragments files of the new library from the genome
    sequence (via a call to getRestEnzymeOccAndSeq.pl).

    Any ``l_type`` other than ``'typeI'`` selects the ``_typeII`` script.
    The ``return_value`` is ``[segFile, fragFile, logFile]``.
    """
    segFile = unique_filename_in()
    fragFile = unique_filename_in()
    logFile = unique_filename_in()
    if l_type == 'typeI':
        progname = "getRestEnzymeOccAndSeq.pl"
    else:
        progname = "getRestEnzymeOccAndSeq_typeII.pl"
    arguments = [
        progname,
        "-i", fasta_file,
        "-m", prim_site,
        "-s", sec_site,
        "-l", l_seg,
        "-o", segFile,
        "-f", fragFile,
        "-x", logFile,
    ]
    return {'arguments': arguments,
            'return_value': [segFile, fragFile, logFile]}
def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options={}):
    """Bind 'soapsplice'. Return a text file containing the list of junctions.

    :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
    :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
    :param index: (str) path to the SOAPsplice index.
    :param output: (str) output file name.
    :param path_to_soapsplice: (str) path to the SOAPsplice executable.
        If not specified, the program must be in your $PATH.
    :param options: (dict) SOAPsplice options, given as {opt: value}.
    :rtype: str

    Main options::

        -p: number of threads, <= 20. [1]
        -S: 1: forward strand, 2: reverse strand, 3: both. [3]
        -m: maximum mismatch for one-segment alignment, <= 5. [3]
        -g: maximum indel for one-segment alignment, <= 2. [2]
        -i: length of tail that can be ignored in one-segment alignment. [7]
        -t: longest gap between two segments in two-segment alignment. [500000]
        -a: shortest length of a segment in two-segment alignment. [8]
        -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
        -L: maximum distance between paired-end reads. [500000]
        -l: minimum distance between paired-end reads. [50]
        -I: insert length of paired-end reads.
    """
    if not output:
        output = unique_filename_in()
    executable = path_to_soapsplice or 'soapsplice'
    command = [executable,
               '-d', index,
               '-1', unmapped_R1,
               '-2', unmapped_R2,
               '-o', output,
               '-f', '2']
    # Append every user option as a "<flag> <value>" pair.
    for flag, value in options.items():
        command.append(str(flag))
        command.append(str(value))
    return {"arguments": command, "return_value": output}
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample.

    *outall* is read as a text track whose columns are chromosome, position,
    reference, one column per sample, then gene, location_type and distance.
    For every sample whose first character differs from the reference, a
    one-base feature named ``"<ref>><alt>"`` is appended to that sample's
    track. Each track is then closed and added to *ex*.
    """
    infields = (['chromosome', 'position', 'reference'] + sample_names
                + ['gene', 'location_type', 'distance'])
    intrack = track(outall, format='text', fields=infields,
                    chrmeta=assembly.chrmeta, intypes={'position': int})
    # The three trailing annotation columns are not needed here.
    instream = intrack.read(fields=infields[:-3])

    outtracks = {}
    for sample_name in sample_names:
        out = unique_filename_in() + '.bed.gz'
        t = track(out, fields=['name'])
        t.make_header(name=sample_name + "_SNPs")
        outtracks[sample_name] = (t, out)

    for x in instream:
        coord = (x[0], x[1] - 1, x[1])
        ref = x[2]
        # Work out all annotations for the row before writing any of them.
        snp = {}
        for n, name in enumerate(sample_names):
            base = x[3 + n][0]
            if base == ref:
                snp[name] = None
            else:
                snp[name] = "%s>%s" % (ref, base)
        for name, tr in outtracks.iteritems():
            if snp[name]:
                tr[0].write([coord + (snp[name],)], mode='append')

    for name, tr in outtracks.iteritems():
        tr[0].close()
        description = set_file_descr(name + "_SNPs.bed.gz", type='bed',
                                     step='tracks', gdv='1', ucsc='1')
        ex.add(tr[1], description=description)
def run_DE(data_file):
    """Run limma.R on *data_file*.

    The separator is passed as ``-s`` and the result file as ``-o``; the
    result file name is also the ``return_value``.
    """
    output_file = unique_filename_in()
    return {
        'arguments': ["limma.R", data_file, "-s", "$'\t'", "-o", output_file],
        'return_value': output_file,
    }
def run_wellington(ex, tests, names, assembly, via, logfile):
    """
    Run ``wellington`` per chromosome for every (bed, bam[, extension]) test.

    For each test the bed regions of a chromosome are optionally extended by
    ``bed_bam[2]`` on both sides, fused, and written to a temporary bed file;
    a job is submitted only when that file is not empty. Once all jobs have
    finished the outputs are passed to ``save_wellington``.

    :param tests: sequence of tuples ``(bed, bam)`` or ``(bed, bam, extension)``.
    :param names: dict whose ``'tests'`` entry gives, per test, a pair indexed
        as ``name[1]`` for logging.
    :returns: the value returned by ``save_wellington``.
    """
    logfile.write("Running Wellington:\n")
    logfile.flush()
    futures = {}
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        extended = len(bed_bam) > 2
        for chrom in assembly.chrnames:
            chrom_bed = unique_filename_in()
            with track(chrom_bed, format="bed", fields=tbed.fields) as chrom_track:
                regions = tbed.read(chrom)
                if extended:
                    regions = neighborhood(regions,
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                chrom_track.write(fusion(regions), clip=True)
            # Skip chromosomes without any region.
            if os.path.getsize(chrom_bed) > 0:
                futures[(chrom, name)] = wellington.nonblocking(
                    ex, chrom_bed, bed_bam[1], via=via, memory=8)
    for (chrom, name), job in futures.iteritems():
        logfile.write(name[1] + " " + chrom + ", ")
        logfile.flush()
        wellout[name].append(job.wait())
    logfile.write("\n")
    logfile.flush()
    return save_wellington(ex, wellout, assembly.chrmeta)
def run_wellington(ex, tests, names, assembly, via, logfile):
    """
    Run ``wellington`` per chromosome for every (bed, bam[, extension]) test.

    For each test the bed regions of a chromosome are optionally extended by
    ``bed_bam[2]`` on both sides, fused, and written to a temporary bed file;
    a job is submitted only when that file is not empty. Once all jobs have
    finished the outputs are passed to ``save_wellington``.

    :param tests: sequence of tuples ``(bed, bam)`` or ``(bed, bam, extension)``.
    :param names: dict whose ``'tests'`` entry gives, per test, a pair indexed
        as ``name[1]`` for logging.
    :returns: the value returned by ``save_wellington``.
    """
    logfile.write("Running Wellington:\n")
    logfile.flush()
    futures = {}
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        extended = len(bed_bam) > 2
        for chrom in assembly.chrnames:
            chrom_bed = unique_filename_in()
            with track(chrom_bed, format="bed", fields=tbed.fields) as chrom_track:
                regions = tbed.read(chrom)
                if extended:
                    regions = neighborhood(regions,
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                chrom_track.write(fusion(regions), clip=True)
            # Skip chromosomes without any region.
            if os.path.getsize(chrom_bed) > 0:
                futures[(chrom, name)] = wellington.nonblocking(
                    ex, chrom_bed, bed_bam[1], via=via, memory=8)
    for (chrom, name), job in futures.iteritems():
        logfile.write(name[1] + " " + chrom + ", ")
        logfile.flush()
        wellout[name].append(job.wait())
    logfile.write("\n")
    logfile.flush()
    return save_wellington(ex, wellout, assembly.chrmeta)
def _begin(output, format, new, ratio=1.375, **kwargs): """Initializes the plot in *R*.""" if new: if output is None: output = unique_filename_in() if format == 'pdf': robjects.r('pdf("%s",paper="a4",height=8*%f,width=8)' % (output, ratio)) elif format == 'png': robjects.r('png("%s",height=800*%f,width=800,type="cairo")' % (output, ratio)) else: raise ValueError("Format not supported: %s" % format) pars = "lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=1.1,mar=c(4,4,1,1),las=1,pch=20" if len(kwargs.get('mfrow', [])) == 2: pars += ",mfrow=c(%i,%i)" % tuple(kwargs['mfrow']) robjects.r('par(%s)' % pars) opts = '' if 'log' in kwargs: opts += ',log="%s"' % kwargs['log'] if 'xlim' in kwargs: opts += ',xlim=c(%f,%f)' % tuple(kwargs['xlim']) if 'ylim' in kwargs: opts += ',ylim=c(%f,%f)' % tuple(kwargs['ylim']) opts += ',main="%s"' % kwargs.get('main', '') opts += ',xlab="%s"' % kwargs.get('xlab', '') opts += ',ylab="%s"' % kwargs.get('ylab', '') return opts, output
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """
    Plot, for every group, the average signal profile around each motif.

    Motif occurrences are read from the group's bed file and grouped by the
    pair (text before ":" in the name field, length of the text after it).
    Signals are accumulated per bin over all chromosomes, divided by the
    number of regions, plotted with ``lineplot`` and dumped to a text matrix.

    :param bedlist: dict ``{group id: motif bed file}``.
    :param signals: dict ``{group id: list of signal tracks}``.
    :param chrnames: chromosome names to iterate over.
    :returns: dict ``{group id: {'pdf': file name, 'mat': [(motif name, file)]}}``.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist)
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # Collect (start, end) pairs per (motif name, motif length).
            regions_by_motif = {}
            for r in tmotif.read(chrom):
                name_parts = r[3].split(":")
                key = (name_parts[0], len(name_parts[1]))
                regions_by_motif.setdefault(key, []).append(r[1:3])
            for motif, regs in regions_by_motif.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * flank,
                                               len(group_signals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                features = sorted_stream(
                    segment_features(
                        FeatureStream(regs, fields=['start', 'end']),
                        nbins=motif[1], upstream=_plot_flank,
                        downstream=_plot_flank))
                scored = score_by_feature(
                    [s.read(chrom) for s in group_signals], features)
                for t in scored:
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        remaining = len(data)
        for motif, dat in data.iteritems():
            remaining -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            # Flank positions keep their integer offsets, motif positions are
            # labelled "1".."nbins".
            X = list(range(-flank, flank + nbins))
            for k in range(nbins):
                X[k + flank] = str(k + 1)
            # Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2], output=files[gid]['pdf'], new=new,
                     last=(remaining == 0), legend=snames, main=mname)
            new = False
            matrix_file = unique_filename_in()
            with open(matrix_file, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x) for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, matrix_file))
    return files
def removeNA(fileToClean):
    """
    Remove lines with NA in the 4th column of a file (mainly used with
    bedgraph).

    Lines starting with ``track`` are always copied. Any other line is copied
    unless its 4th tab-separated column is exactly ``"NA"``. Lines with fewer
    than four columns (e.g. blank lines) cannot hold an NA score and are
    copied unchanged instead of raising ``IndexError``.

    :param fileToClean: path to the input file.
    :returns: name of the newly written, filtered file.
    """
    fileNoNA = unique_filename_in()
    # Both handles are closed even if reading or writing fails midway.
    with open(fileNoNA, 'w') as resfile, open(fileToClean) as f:
        for line in f:
            if line[0:5] == 'track':
                resfile.write(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) < 4 or fields[3] != "NA":
                resfile.write(line)
    return fileNoNA
def run_microbiome(options=[], output=None):
    """
    Bind ``run_microbiome.py``.

    Every option is converted to a string; list or tuple options are joined
    with commas into a single argument. The output file name is appended last
    and is also the ``return_value``.
    """
    if output is None:
        output = unique_filename_in()
    rendered = []
    for opt in options:
        if isinstance(opt, (list, tuple)):
            rendered.append(",".join(str(item) for item in opt))
        else:
            rendered.append(str(opt))
    command = ["run_microbiome.py"] + rendered + [output]
    return {'arguments': command, 'return_value': output}
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath,
                      outdir=None, via='lsf'):
    """
    Completes the segment info bed file with the coverage in repeats of each
    segment. For now, works only for mm9, hg19 and dm3.

    *infile* is either a dict ``{chromosome: files}`` or a single such value;
    only the first element of each value is used. Without *outdir* all results
    go to one new ``.bed`` file whose name is returned; with *outdir* one
    ``<chromosome>.bed`` file per entry is written there and *outdir* is
    returned. When the repeats file does not exist the inputs are passed
    through unchanged.
    """
    single_output = outdir is None
    if not isinstance(infile, dict):
        infile = {"": infile}
    if single_output:
        resfile = unique_filename_in() + ".bed"
        outf = open(resfile, 'w')
    repeatsFile = os.path.join(repeatsPath, genomeName, genomeName + '_rmsk.bed')

    if not os.path.exists(repeatsFile):
        print("coverage in repeats not calculated as file " + repeatsFile + " does not exist.")
        if not single_output:
            for chrom, inf in infile.iteritems():
                shutil.copy(inf[0], os.path.join(outdir, chrom + ".bed"))
            return outdir
        outf.close()
        cat([inf[0] for inf in infile.values()], out=resfile)
        return resfile

    # Launch all coverage jobs first, then collect them one by one.
    jobs = {}
    for chrom, inf in infile.iteritems():
        tmpfile = unique_filename_in()
        jobs[chrom] = (tmpfile,
                       coverageBed.nonblocking(ex, repeatsFile, inf[0],
                                               via=via, stdout=tmpfile))
    for chrom, (covfile, job) in jobs.iteritems():
        if not single_output:
            outf = open(os.path.join(outdir, chrom + ".bed"), 'w')
        job.wait()
        coverout = track(covfile, format='text',
                         fields=['chr', 'start', 'end', 'name',
                                 'c1', 'c2', 'c3', 'c4'])
        for s in sorted_stream(coverout.read(), [chrom]):
            # Replace the last four '|'-separated name items by the four
            # coverage columns.
            s_split = s[3].split('|')
            infos = '|'.join(s_split[0:(len(s_split) - 4)] + list(s[4:8]))
            outf.write('\t'.join([str(x) for x in s[0:3] + (infos,)]) + '\n')
        if not single_output:
            outf.close()
    if single_output:
        outf.close()
        return resfile
    return outdir
def camelPeaks(scores_fwd, scores_rev, peaks, chromosome_name, chromosome_length,
               read_extension, script_path):
    """Runs the 'camelPeaks.py' wrapper script on the 'scores_fwd',
    'scores_rev' and 'peaks' input with parameters 'chromosome_name' (name of
    chromosome to process) and 'read_extension', using functions from
    'script_path'/deconv_fcts.R.

    The output prefix is generated here and passed with ``-o``; the binding's
    ``return_value`` itself is None.
    """
    output = unique_filename_in()
    command = [
        "camelPeaks.py",
        "-p", peaks,
        "-f", scores_fwd,
        "-r", scores_rev,
        "-o", output,
        "-c", chromosome_name,
        "-l", str(chromosome_length),
        "-e", str(read_extension),
        "-z", script_path,
        "-s", "1500",
    ]
    return {'arguments': command, 'return_value': None}
def removeNA(fileToClean):
    """
    Remove lines with NA in the 4th column of a file (mainly used with
    bedgraph).

    Lines starting with ``track`` are always copied. Any other line is copied
    unless its 4th tab-separated column is exactly ``"NA"``. Lines with fewer
    than four columns (e.g. blank lines) cannot hold an NA score and are
    copied unchanged instead of raising ``IndexError``.

    :param fileToClean: path to the input file.
    :returns: name of the newly written, filtered file.
    """
    fileNoNA = unique_filename_in()
    # Both handles are closed even if reading or writing fails midway.
    with open(fileNoNA, 'w') as resfile, open(fileToClean) as f:
        for line in f:
            if line[0:5] == 'track':
                resfile.write(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) < 4 or fields[3] != "NA":
                resfile.write(line)
    return fileNoNA
def gtf_from_bam_header(bam):
    """In case of alignment on a custom sequence.

    Writes one 'exon' GTF line per reference sequence listed in the bam
    header, spanning 1 to the sequence length. The gene name is the second
    '|'-separated item of the sequence name when there is one, otherwise the
    whole name. Returns the name of the GTF file.
    """
    bamtrack = track(bam, format='bam')
    gtf = unique_filename_in() + '.gtf'
    with open(gtf, "wb") as g:
        for c, meta in bamtrack.chrmeta.iteritems():
            if "|" in c:
                n = c.split("|")[1]
            else:
                n = c
            attributes = ('exon_id "%s"; transcript_id "%s"; gene_id "%s"; gene_name "%s"'
                          % (c, c, c, n))
            columns = [c, '', 'exon', '1', str(meta['length']),
                       '.', '.', '.', attributes]
            g.write('\t'.join(columns) + '\n')
    bamtrack.close()
    return gtf
def motif_scan(ex, bedlist, assembly, groups, via, logfile):
    """
    Scan the regions of every group for that group's motifs.

    Each entry of ``group['motif']`` is resolved, in order, as an existing
    path, as a path relative to the parent of the remote working directory,
    or as a ``"<id> <name>"`` pair fetched through ``_gnrp.get_motif_PWM``.

    :param bedlist: dict ``{group id: bed file}``.
    :param groups: dict ``{group id: group description}``.
    :returns: dict ``{group id: result of save_motif_profile}``.
    """
    logfile.write("Scanning motifs\n")
    logfile.flush()
    parent_dir = os.path.split(ex.remote_working_directory)[0]
    results = {}
    for gid, bedfile in bedlist.iteritems():
        logfile.write("\n%i: " % gid)
        logfile.flush()
        group = groups[gid]
        pwm_by_name = {}
        for mot in group.get('motif', []):
            in_parent = os.path.join(parent_dir, mot)
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_by_name[mname] = mot
            elif os.path.exists(in_parent):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_by_name[mname] = in_parent
            else:
                # Not a file: expect "<genrep id> <motif name>".
                _gnid, mname = mot.split(' ')
                pwm_by_name[mname] = _gnrp.get_motif_PWM(
                    int(_gnid), mname, output=unique_filename_in())
            logfile.write(mname + ", ")
            logfile.flush()
        description = set_file_descr(group['name'] + '_motifs.bed',
                                     type='bed', ucsc='1', step='motifs',
                                     groupId=gid)
        outfile = unique_filename_in()
        header = "track name='%s_motifs'" % group['name']
        results[gid] = save_motif_profile(ex, pwm_by_name, assembly, bedfile,
                                          keep_max_only=True, output=outfile,
                                          header=header,
                                          description=description, via=via)
    return results
def parse_meme_xml(ex, meme_file, chrmeta):
    """
    Parse meme xml file and convert to track.

    For every motif a matrix file is written with one row per
    ``alphabet_array`` (a leading ``1`` followed by the A, C, G, T values).
    Every ``scanned_site`` becomes a feature whose coordinates are computed
    from the region encoded in the sequence name (``<name>|<chr>:<start>-<end>``)
    and the site's ``position`` attribute.

    :param ex: execution, passed to ``touch``.
    :param meme_file: path to the MEME xml output.
    :param chrmeta: chromosome metadata for the output track.
    :returns: dict ``{'sql': track file name, 'matrices': {motif id: matrix file}}``.
    """
    from xml.etree import ElementTree as ET
    touch(ex, meme_file)
    tree = ET.parse(meme_file)
    ncol = {}
    allmatrices = {}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid], 'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A': 0, 'letter_C': 0, 'letter_G': 0, 'letter_T': 0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n"
                              % (m['letter_A'], m['letter_C'],
                                 m['letter_G'], m['letter_T']))

    def _xmltree(_t):
        """Yield one (chr, start, end, motif id, score, strand) per site."""
        seq_name = {}
        seq_chr = None
        seq_start = None
        for it in _t.getiterator():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            if it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                name, seq_chr, seq_start, _seq_end = re.search(
                    r'(.*)\|(.+):(\d+)-(\d+)', name).groups()
                seq_start = int(seq_start)
            if it.tag == 'scanned_site':
                # Offset from the region start, kept separately: reusing one
                # variable made the offsets of successive sites of the same
                # sequence add up.
                start = seq_start + int(it.attrib['position']) - 1
                end = start + ncol[it.attrib['motif_id']]
                strnd = 1 if it.attrib['strand'] == 'plus' else -1
                score = it.attrib['pvalue']
                yield (seq_chr, str(start), str(end),
                       it.attrib['motif_id'], score, strnd)

    outsql = unique_filename_in() + ".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype': 'qualitative'},
                     fields=['start', 'end', 'name', 'score', 'strand'])
    outtrack.write(FeatureStream(_xmltree(tree), fields=['chr'] + outtrack.fields))
    outtrack.close()
    return {'sql': outsql, 'matrices': allmatrices}
def transcriptome_gtf_from_genrep(assembly):
    """In case of mapping on the transcriptome - if it still ever happens.

    Writes one 'exon' GTF line per transcript of the assembly's transcript
    mapping, spanning 1 to the transcript length on the '+' strand, with the
    gene id, gene name and genomic locus as attributes. Returns the file name.
    """
    tmap = assembly.get_transcript_mapping()
    gtf = unique_filename_in()
    with open(gtf, "wb") as g:
        for t in tmap.itervalues():
            attributes = ('gene_id "%s"; gene_name "%s"; gene_locus "%s:%i-%i"'
                          % (t.gene_id, t.gene_name, t.chrom, t.start, t.end))
            columns = [t.id, 'Ensembl', 'exon', 1, t.length,
                       '.', '+', '.', attributes]
            g.write('\t'.join(str(col) for col in columns) + '\n')
    del tmap
    return gtf
def transcriptome_gtf_from_genrep(assembly):
    """In case of mapping on the transcriptome - if it still ever happens.

    Writes one 'exon' GTF line per transcript of the assembly's transcript
    mapping, spanning 1 to the transcript length on the '+' strand, with the
    gene id, gene name and genomic locus as attributes. Returns the file name.
    """
    tmap = assembly.get_transcript_mapping()
    gtf = unique_filename_in()
    with open(gtf, "wb") as g:
        for t in tmap.itervalues():
            attributes = ('gene_id "%s"; gene_name "%s"; gene_locus "%s:%i-%i"'
                          % (t.gene_id, t.gene_name, t.chrom, t.start, t.end))
            columns = [t.id, 'Ensembl', 'exon', 1, t.length,
                       '.', '+', '.', attributes]
            g.write('\t'.join(str(col) for col in columns) + '\n')
    del tmap
    return gtf
def differential_analysis(counts_file, feature_type):
    """
    Run ``DE.differential_analysis`` on *counts_file* and register its outputs.

    The first line of each result file is stripped off and used in the
    descriptive name ``<feature_type>_differential_<first line>.txt``; the
    remainder is copied to a new file that is added to ``ex``.
    """
    diff_files = DE.differential_analysis(counts_file)
    if diff_files is None:
        return
    for diff in diff_files:
        # Remove first line
        diff_nohead = unique_filename_in()
        with open(diff) as src:
            head = src.readline().strip()
            with open(diff_nohead, "wb") as dst:
                for row in src:
                    dst.write(row)
        oname = feature_type + "_differential_" + head + ".txt"
        desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
        ex.add(diff_nohead, description=desc)
def combine_counts(counts, idsColsKey, idsColsCounts, output="combined_counts.txt"):
    """
    Merge several tab-separated count files into one table.

    Every input file starts with a header line. Rows are identified by the
    columns *idsColsKey*; the columns *idsColsCounts* of every file are laid
    side by side, followed by the remaining ("info") columns. Square brackets
    are stripped from all input lines.

    A key missing from a file gets one empty cell per count column, so rows
    stay aligned with the header. A key that first appears in a later file is
    added (with its info columns taken from that file) instead of raising
    ``KeyError``.

    :param counts: sequence of input file names.
    :param idsColsKey: column index, or list/tuple of indices, forming the key.
    :param idsColsCounts: column index, or list/tuple of indices, of the counts.
    :param output: output file name; a fresh name is generated if None or ''.
    :returns: the output file name.
    """
    if output in [None, '']:
        output = unique_filename_in()
    if not isinstance(idsColsKey, (list, tuple)):
        idsColsKey = [idsColsKey]
    if not isinstance(idsColsCounts, (list, tuple)):
        idsColsCounts = [idsColsCounts]
    # Normalise to lists so that a tuple and a list can be combined.
    idsColsKey = list(idsColsKey)
    idsColsCounts = list(idsColsCounts)
    non_info_cols = set(idsColsKey + idsColsCounts)
    # Placeholder spanning as many cells as there are count columns.
    empty_counts = '\t'.join([''] * len(idsColsCounts))

    def _fields(line):
        """Split a raw line into cells, dropping square brackets."""
        return line.strip('\n').replace("[", "").replace("]", "").split("\t")

    all_counts = {}
    infos = {}
    leninfos = 0
    for i, filename in enumerate(counts):
        with open(filename) as f:
            s = _fields(next(f))
            if i == 0:
                # 1st file: initialization of the header parts
                _colinfos = [ss for n, ss in enumerate(s)
                             if n not in non_info_cols]
                leninfos = len(_colinfos)
                h_infos = '\t'.join(_colinfos)
                h_counts = '\t'.join([s[n] for n in idsColsCounts])
                h_key = '\t'.join([s[n] for n in idsColsKey])
            else:
                h_counts += '\t'.join([''] + [s[n] for n in idsColsCounts])
            for line in f:
                s = _fields(line)
                curKey = '\t'.join([s[n] for n in idsColsKey])
                if i == 0 or curKey not in all_counts:
                    all_counts[curKey] = [empty_counts] * len(counts)
                    curInfo = [ss for n, ss in enumerate(s)
                               if n not in non_info_cols]
                    if len(curInfo) < leninfos:
                        curInfo.extend([''] * (leninfos - len(curInfo)))
                    infos[curKey] = '\t'.join(curInfo)
                all_counts[curKey][i] = '\t'.join([s[n] for n in idsColsCounts])

    with open(output, 'w') as out:
        out.write(h_key + '\t' + h_counts + '\t' + h_infos + '\n')
        for k, per_file in all_counts.items():
            out.write(k + '\t' + '\t'.join(str(c) for c in per_file)
                      + '\t' + infos.get(k, '') + '\n')
    return output
def differential_analysis(counts_file, feature_type):
    """
    Run ``DE.differential_analysis`` on *counts_file* and register its outputs.

    The first line of each result file is stripped off and used in the
    descriptive name ``<feature_type>_differential_<first line>.txt``; the
    remainder is copied to a new file that is added to ``ex``.
    """
    diff_files = DE.differential_analysis(counts_file)
    if diff_files is None:
        return
    for diff in diff_files:
        # Remove first line
        diff_nohead = unique_filename_in()
        with open(diff) as src:
            head = src.readline().strip()
            with open(diff_nohead, "wb") as dst:
                for row in src:
                    dst.write(row)
        oname = feature_type + "_differential_" + head + ".txt"
        desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
        ex.add(diff_nohead, description=desc)
def camelPeaks(scores_fwd, scores_rev, peaks, chromosome_name, chromosome_length,
               read_extension, script_path):
    """Runs the 'camelPeaks.py' wrapper script on the 'scores_fwd',
    'scores_rev' and 'peaks' input with parameters 'chromosome_name' (name of
    chromosome to process) and 'read_extension', using functions from
    'script_path'/deconv_fcts.R.

    The output prefix is generated here and passed with ``-o``; the binding's
    ``return_value`` itself is None.
    """
    output = unique_filename_in()
    command = [
        "camelPeaks.py",
        "-p", peaks,
        "-f", scores_fwd,
        "-r", scores_rev,
        "-o", output,
        "-c", chromosome_name,
        "-l", str(chromosome_length),
        "-e", str(read_extension),
        "-z", script_path,
        "-s", "1500",
    ]
    return {'arguments': command, 'return_value': None}
def gtf_from_bam_header(bam):
    """In case of alignment on a custom sequence.

    Writes one 'exon' GTF line per reference sequence listed in the bam
    header, spanning 1 to the sequence length. The gene name is the second
    '|'-separated item of the sequence name when there is one, otherwise the
    whole name. Returns the name of the GTF file.
    """
    bamtrack = track(bam, format='bam')
    gtf = unique_filename_in() + '.gtf'
    with open(gtf, "wb") as g:
        for c, meta in bamtrack.chrmeta.iteritems():
            if "|" in c:
                n = c.split("|")[1]
            else:
                n = c
            attributes = ('exon_id "%s"; transcript_id "%s"; gene_id "%s"; gene_name "%s"'
                          % (c, c, c, n))
            columns = [c, '', 'exon', '1', str(meta['length']),
                       '.', '.', '.', attributes]
            g.write('\t'.join(columns) + '\n')
    bamtrack.close()
    return gtf
def save_wellington(ex, wellout, chrmeta):
    """
    Gather per-chromosome Wellington outputs into one set of files per test.

    For every ``(group id, name)`` key of *wellout*, a dummy file is added to
    *ex* and three files are associated to it: a gzipped bed of footprints at
    FDR 0.01, a gzipped bed with one track per p-value cutoff, and the
    concatenated wig file.

    :param wellout: dict ``{(group id, name): [(output dir, output prefix), ...]}``.
    :param chrmeta: unused here (kept for the disabled bigWig conversion).
    :returns: dict ``{group id: FDR 0.01 bed file name}``.
    """
    bedlist = {}
    for name, wlist in wellout.iteritems():
        gid, label = name[0], name[1]
        wellall = unique_filename_in()

        # Dummy file the real outputs are attached to.
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(label + '_wellington_files',
                                          type='none', view='admin',
                                          step='footprints', groupId=gid))

        # BED at FDR 1%
        fdr_bed = wellall + "FDR01.bed.gz"
        bedlist[gid] = fdr_bed
        bedzip = gzip.open(fdr_bed, 'wb')
        bedzip.write("track name='" + label + "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) + ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                for row in _bed:
                    bedzip.write(row)
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(label + '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')

        # BED at p-values [...]: one track per cutoff file found in the first
        # output directory.
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        cutoff_dir = os.path.join(wlist[0][0], "p_value_cutoffs")
        for bfile in os.listdir(cutoff_dir):
            # Text between the last "." of the stem ([1:]) and ".bed" ([:-4]).
            cut = os.path.splitext(bfile[:-4])[1][1:]
            bedzip.write("track name='" + label + "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(wdir, "p_value_cutoffs",
                                        wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    for row in _bed:
                        bedzip.write(row)
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(label + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')

        # WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist],
            wellall + ".wig")
        ex.add(wellall + ".wig",
               description=set_file_descr(label + '_WellingtonFootprints.wig',
                                          type='wig', ucsc='1',
                                          step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
def macs(read_length, genome_size, bamfile, ctrlbam=None, args=None):
    """Binding for the ``macs`` peak caller `<http://liulab.dfci.harvard.edu/MACS/>`_.

    Takes one (optionally two) bam file(s) and the 'read_length' and
    'genome_size' parameters passed to ``macs``. Extra arguments are only
    used when *args* is a list; ``-n``, ``-s``, ``--verbose`` and
    ``--keep-dup`` given there take precedence over the defaults.

    Returns the file prefix ('-n' option of ``macs``).
    """
    command = ["macs14", "-t", bamfile, "-f", "BAM", "-g", str(genome_size)]
    if isinstance(args, list):
        command.extend(args)
    if ctrlbam is not None:
        command.extend(["-c", ctrlbam])
    if "-n" in command:
        outname = command[command.index("-n") + 1]
    else:
        outname = unique_filename_in()
        command.extend(["-n", outname])
    if read_length > 0 and "-s" not in command:
        command.extend(["-s", str(read_length)])
    # Defaults applied only when the caller did not set the flag.
    for flag, value in (("--verbose", "1"), ("--keep-dup", "all")):
        if flag not in command:
            command.extend([flag, value])
    return {"arguments": command, "return_value": outname}
def macs(read_length, genome_size, bamfile, ctrlbam=None, args=None):
    """Binding for the ``macs`` peak caller `<http://liulab.dfci.harvard.edu/MACS/>`_.

    Takes one (optionally two) bam file(s) and the 'read_length' and
    'genome_size' parameters passed to ``macs``. Extra arguments are only
    used when *args* is a list; ``-n``, ``-s``, ``--verbose`` and
    ``--keep-dup`` given there take precedence over the defaults.

    Returns the file prefix ('-n' option of ``macs``).
    """
    command = ["macs14", "-t", bamfile, "-f", "BAM", "-g", str(genome_size)]
    if isinstance(args, list):
        command.extend(args)
    if ctrlbam is not None:
        command.extend(["-c", ctrlbam])
    if "-n" in command:
        outname = command[command.index("-n") + 1]
    else:
        outname = unique_filename_in()
        command.extend(["-n", outname])
    if read_length > 0 and "-s" not in command:
        command.extend(["-s", str(read_length)])
    # Defaults applied only when the caller did not set the flag.
    for flag, value in (("--verbose", "1"), ("--keep-dup", "all")):
        if flag not in command:
            command.extend([flag, value])
    return {"arguments": command, "return_value": outname}
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library.

    Runs ``getRestEnzymeOccAndSeq`` on every chromosome fasta file, parses the
    resulting fragment files, adds the coverage in repeats, then concatenates
    and gzips the per-chromosome bed files.

    :param assembly_or_fasta: a ``genrep.Assembly`` or a fasta file from which
        one is built.
    :param params: dict with keys 'primary', 'secondary', 'length', 'type'
        and 'species'.
    :returns: ``[libfiles, bedfiles, resfile, infos_lib]``, or four ``None``
        when the primary site is shorter than 2 characters.
    """
    if len(params['primary']) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary=' + params['primary'] + " ; " + 'secondary=' + params['secondary'])
        return [None, None, None, None]
    if not isinstance(assembly_or_fasta, genrep.Assembly):
        assembly_or_fasta = genrep.Assembly(ex=ex, fasta=assembly_or_fasta)
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((meta['ac'], chrom)
                     for chrom, meta in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom

    # Submit one job per chromosome before waiting for any of them.
    libfiles = {}
    for chrom, fasta in allfiles.iteritems():
        libfiles[chrom] = getRestEnzymeOccAndSeq.nonblocking(
            ex, fasta, params['primary'], params['secondary'],
            params['length'], params['type'], via=via)

    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        outputs = future.wait()
        libfiles[chrom] = outputs
        fragfile = outputs[1]
        if not os.path.getsize(fragfile) > 0:
            # Fragment file still empty: wait, then touch it before parsing.
            time.sleep(60)
            touch(ex, fragfile)
        bedfiles[chrom] = parse_fragFile(fragfile, chrom_map)

    coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile, chrom + ".bed") for chrom in chrnames]
    merged_bed = resfile + ".bed"
    cat(bedchrom, out=merged_bed)
    gzipfile(ex, [merged_bed] + bedchrom)

    enz_list = []
    enzyme1_id = getEnzymeSeqId(params['primary'], True, enz_list, url)
    enzyme2_id = getEnzymeSeqId(params['secondary'], True, enz_list, url)
    infos_lib = {
        'assembly_name': params['species'],
        'enzyme1_id': enzyme1_id,
        'enzyme2_id': enzyme2_id,
        'segment_length': params['length'],
        'type': params['type'],
        'filename': resfile,
    }
    return [libfiles, bedfiles, resfile, infos_lib]
def convert_junc_file(self, filename):
    """Convert a .junc SOAPsplice output file to bed format. Return the file name.

    :param filename: (str) name of the .junc file to convert.
    """
    chrmeta = self.assembly.chrmeta
    junc_track = track(filename, format='txt',
                       fields=['chr', 'start', 'end', 'strand', 'score'],
                       chrmeta=chrmeta)
    # Translate chromosome names
    renamed = map_chromosomes(junc_track.read(), self.assembly.chromosomes)
    # Add junction IDs: a 'name' field numbered from 0 in reading order
    with_name = duplicate(renamed, 'strand', 'name')
    counter = itertools.count()
    labelled = apply(with_name, 'name',
                     lambda x: 'junction' + str(next(counter)))
    # Convert to bed format
    bed = unique_filename_in() + '.bed'
    out = track(bed, fields=labelled.fields, chrmeta=chrmeta)
    # NOTE(review): the output track is not closed here - confirm that
    # write() alone flushes everything to disk.
    out.write(labelled)
    return bed
def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options=None):
    """Bind 'soapsplice'. Build the command line producing a text file of junctions.

    :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
    :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
    :param index: (str) path to the SOAPsplice index.
    :param output: (str) output file name. A unique name is generated if empty.
    :param path_to_soapsplice: (str) path to the SOAPsplice executable.
        If not specified, the program must be in your $PATH.
    :param options: (dict) SOAPsplice options, given as {opt: value}.
        ``None`` (the default) means no extra option.
    :returns: dict with the keys ``"arguments"`` (the argument list, extra
        options appended last) and ``"return_value"`` (the output name).

    Main options::

        -p: number of threads, <= 20. [1]
        -S: 1: forward strand, 2: reverse strand, 3: both. [3]
        -m: maximum mismatch for one-segment alignment, <= 5. [3]
        -g: maximum indel for one-segment alignment, <= 2. [2]
        -i: length of tail that can be ignored in one-segment alignment. [7]
        -t: longest gap between two segments in two-segment alignment. [500000]
        -a: shortest length of a segment in two-segment alignment. [8]
        -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
        -L: maximum distance between paired-end reads. [500000]
        -l: minimum distance between paired-end reads. [50]
        -I: insert length of paired-end reads.
    """
    if not output:
        output = unique_filename_in()
    path_to_soapsplice = path_to_soapsplice or 'soapsplice'
    args = [
        path_to_soapsplice,
        '-d', index,
        '-1', unmapped_R1,
        '-2', unmapped_R2,
        '-o', output,
        '-f', '2',
    ]
    opts = []
    # `options or {}` replaces the former mutable `{}` default; `.items()`
    # works on both Python 2 and Python 3 dictionaries.
    for k, v in (options or {}).items():
        opts.extend([str(k), str(v)])
    return {"arguments": args + opts, "return_value": output}
def bam_to_annot_counts(bamfiles, annotations_file, pref_name='', output=None):
    '''
    Scan each bam file of a list and calculate the corrected counts for each
    annotation key present in the "annotations_file".

    Every read contributes ``1 / NH`` (NH tag, 1 when absent or given as a
    string) to the annotation whose key is the part of the reference name
    before the first "|", and to the grand total whether or not that key is
    annotated. Reads with ``NH < 1`` are skipped.

    :param bamfiles: iterable of bam file names, opened with ``pysam.Samfile``.
    :param annotations_file: tab-separated file with one header line; the
        first column is the annotation key.
    :param pref_name: suffix of the two added column names
        (``counts_<pref_name>`` and ``%counts_<pref_name>``).
    :param output: output file name; a unique name is generated if None.
    :returns: the output file name. Each annotation row is written as key,
        count, percentage of the total, then its original columns. The
        percentage is 0 when no read was counted at all.
    '''
    if output is None:
        output = unique_filename_in()
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    annotations = {}
    counts = {}
    with open(annotations_file) as f:
        header = next(f).strip('\n').split("\t")
        for line in f:
            s = line.strip('\n').split("\t")
            k = s.pop(0)
            annotations[k] = s
            counts[k] = 0
    tot = 0
    for bamfile in bamfiles:
        infile = pysam.Samfile(bamfile)
        try:
            for read in infile:
                nh = dict(read.tags).get('NH', 1)
                if isinstance(nh, string_types):
                    nh = 1
                if nh < 1:
                    continue
                inh = 1.0 / nh
                rname = infile.getrname(read.rname).split("|")[0]
                if rname in counts:
                    counts[rname] += inh
                ## still increment if not in counts?
                tot += inh
        finally:
            # Release the handle even if a read cannot be processed.
            infile.close()
    with open(output, 'w') as out:
        out.write('\t'.join(
            [header[0], 'counts_' + pref_name, '%counts_' + pref_name]
            + header[1:]) + '\n')
        for k, fields in annotations.items():
            # Without any counted read the total is 0: report 0% rather than
            # failing with ZeroDivisionError.
            pc = 100 * counts[k] / tot if tot else 0.0
            out.write('\t'.join(
                [k, "%.2f" % counts[k], "%.3f" % pc] + fields) + '\n')
    return output
def bam_to_annot_counts(bamfiles, annotations_file, pref_name="", output=None):
    """
    Scan each bam file of a list and calculate the corrected counts for each
    annotation key present in the "annotations_file".

    Every read contributes ``1 / NH`` (NH tag, 1 when absent or given as a
    string) to the annotation whose key is the part of the reference name
    before the first "|", and to the grand total whether or not that key is
    annotated. Reads with ``NH < 1`` are skipped.

    :param bamfiles: iterable of bam file names, opened with ``pysam.Samfile``.
    :param annotations_file: tab-separated file with one header line; the
        first column is the annotation key.
    :param pref_name: suffix of the two added column names
        (``counts_<pref_name>`` and ``%counts_<pref_name>``).
    :param output: output file name; a unique name is generated if None.
    :returns: the output file name. Each annotation row is written as key,
        count, percentage of the total, then its original columns. The
        percentage is 0 when no read was counted at all.
    """
    if output is None:
        output = unique_filename_in()
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    annotations = {}
    counts = {}
    with open(annotations_file) as f:
        header = next(f).strip("\n").split("\t")
        for line in f:
            s = line.strip("\n").split("\t")
            k = s.pop(0)
            annotations[k] = s
            counts[k] = 0
    tot = 0
    for bamfile in bamfiles:
        infile = pysam.Samfile(bamfile)
        try:
            for read in infile:
                nh = dict(read.tags).get("NH", 1)
                if isinstance(nh, string_types):
                    nh = 1
                if nh < 1:
                    continue
                inh = 1.0 / nh
                rname = infile.getrname(read.rname).split("|")[0]
                if rname in counts:
                    counts[rname] += inh
                ## still increment if not in counts?
                tot += inh
        finally:
            # Release the handle even if a read cannot be processed.
            infile.close()
    with open(output, "w") as out:
        out.write("\t".join([header[0], "counts_" + pref_name, "%counts_" + pref_name] + header[1:]) + "\n")
        for k, fields in annotations.items():
            # Without any counted read the total is 0: report 0% rather than
            # failing with ZeroDivisionError.
            pc = 100 * counts[k] / tot if tot else 0.0
            out.write("\t".join([k, "%.2f" % counts[k], "%.3f" % pc] + fields) + "\n")
    return output
def getCountsPerLevel(infile, level=None, output=None):
    """Sum the counts of a tab-separated table per value of the *level* column.

    The first line of *infile* is a header that must contain *level* and
    'Kingdom'. Counts are read from the second column. Rows shorter than the
    header are padded with empty strings, and an empty level value is
    reported as 'Unnanotated'.

    :param infile: name of the input table.
    :param level: name of the header column to group by.
    :param output: output file name; a unique name is generated if None.
    :returns: the output file name. Each output row holds the level value,
        the columns from *level* towards (not including) 'Kingdom' as found in
        the last row seen for that value, the summed count and its percentage
        of the grand total (0 when the grand total is 0).
    :raises ValueError: if *level* (including the default None) or 'Kingdom'
        is not a header column.
    """
    if output is None:
        output = unique_filename_in()
    counts = {}
    lineage = {}
    tot = 0
    idColCounts = 1
    with open(infile) as f:
        header = next(f).strip('\n').split('\t')
        try:
            level_idx = header.index(level)
        except ValueError:
            # %-formatting keeps the message usable when level is None.
            raise ValueError("No column corresponds to %s in file %s"
                             % (level, infile))
        level_top = header.index('Kingdom')
        # Walk from the level column towards 'Kingdom', whichever side it is.
        step = 1 if level_top > level_idx else -1
        colrange = range(level_idx, level_top, step)
        header_out = [header[n] for n in colrange]
        name = header[idColCounts]
        for line in f:
            s = line.strip('\n').split('\t')
            if len(s) < len(header):
                s.extend([''] * (len(header) - len(s)))
            value = float(s[idColCounts])
            key = s[level_idx]
            tot += value
            counts[key] = counts.get(key, 0.0) + value
            lineage[key] = [s[n] for n in colrange]
    with open(output, 'w') as out:
        out_header = [level] + header_out + ["counts_" + name, "%counts_" + name]
        out.write("\t".join(out_header) + "\n")
        for k, v in lineage.items():
            # A zero grand total would otherwise raise ZeroDivisionError.
            pc = 100 * counts[k] / tot if tot else 0.0
            curk = k or 'Unnanotated'
            out.write(
                "\t".join([curk] + v + ["%.2f" % counts[k], "%.3f" % pc]) + "\n")
    return output
def convert_junc_file(self, filename):
    """Convert a .junc SOAPsplice output file to bed format. Return the file name.

    :param filename: (str) name of the .junc file to convert.
    :returns: name of the written file, a unique name ending in '.bed'.
    """
    assembly = self.assembly
    source = track(filename, format='txt', chrmeta=assembly.chrmeta,
                   fields=['chr', 'start', 'end', 'strand', 'score'])
    rows = source.read()
    # Translate chromosome names
    rows = map_chromosomes(rows, assembly.chromosomes)
    # Add junction IDs: the new 'name' field starts as a copy of 'strand' and
    # is then replaced by 'junction<N>', N counting up from 0.
    rows = duplicate(rows, 'strand', 'name')
    ids = itertools.count()
    rows = apply(rows, 'name', lambda _value: 'junction' + str(ids.next()))
    # Convert to bed format
    bed = '%s.bed' % unique_filename_in()
    target = track(bed, fields=rows.fields, chrmeta=assembly.chrmeta)
    target.write(rows)
    return bed
def combine_counts(counts, idsColsKey, idsColsCounts, output="combined_counts.txt"):
    """Merge the count columns of several tab-separated tables into one table.

    Every file has one header line. "[" and "]" are removed from all lines
    before splitting. Rows are matched across files by the key columns; the
    header of the key and info columns comes from the first file.

    :param counts: sequence of input file names.
    :param idsColsKey: index, or list/tuple of indices, of the key columns.
    :param idsColsCounts: index, or list/tuple of indices, of the count columns.
    :param output: output file name; a unique name is generated when None or "".
    :returns: the output file name. Each row holds the key, the count columns
        of every file in input order (empty where the key is absent from a
        file), then the remaining columns as seen where the key first
        appeared (for keys of the first file: its last occurrence there).
    :raises ValueError: if *counts* is empty.
    """
    if output in [None, ""]:
        output = unique_filename_in()
    if not counts:
        raise ValueError("combine_counts needs at least one count file")
    if not isinstance(idsColsKey, (list, tuple)):
        idsColsKey = [idsColsKey]
    if not isinstance(idsColsCounts, (list, tuple)):
        idsColsCounts = [idsColsCounts]
    skipped = set(idsColsKey) | set(idsColsCounts)
    nfiles = len(counts)
    # Placeholder with as many (empty) cells as there are count columns, so
    # rows stay aligned when a key is missing from a file.
    missing = "\t".join([""] * len(idsColsCounts))

    def _split(line):
        return line.strip("\n").replace("[", "").replace("]", "").split("\t")

    all_counts = {}
    infos = {}
    leninfos = 0
    h_key = ""
    h_infos = ""
    h_counts = []
    for i, filename in enumerate(counts):
        with open(filename) as f:
            s = _split(next(f))
            if i == 0:
                # 1st file: it defines the key and info headers
                _colinfos = [ss for n, ss in enumerate(s) if n not in skipped]
                leninfos = len(_colinfos)
                h_infos = "\t".join(_colinfos)
                h_key = "\t".join([s[n] for n in idsColsKey])
            h_counts.extend([s[n] for n in idsColsCounts])
            for line in f:
                s = _split(line)
                curKey = "\t".join([s[n] for n in idsColsKey])
                # Keys first met in a later file used to raise KeyError; they
                # now get their own row, with info taken from that file.
                if i == 0 or curKey not in all_counts:
                    all_counts[curKey] = [missing] * nfiles
                    curInfo = [ss for n, ss in enumerate(s) if n not in skipped]
                    if len(curInfo) < leninfos:
                        curInfo.extend([""] * (leninfos - len(curInfo)))
                    infos[curKey] = "\t".join(curInfo)
                all_counts[curKey][i] = "\t".join([s[n] for n in idsColsCounts])
    with open(output, "w") as out:
        out.write(h_key + "\t" + "\t".join(h_counts) + "\t" + h_infos + "\n")
        for k, v in all_counts.items():
            out.write(k + "\t" + "\t".join(str(c) for c in v) + "\t" + infos.get(k, "") + "\n")
    return output
def main():
    """Command-line entry point: write the reads of a fastq file as fasta.

    Options are declared from the module-level ``opts`` list. For every read
    the fasta header is ``name_sequence_quality`` and the written sequence is
    the slice of ``opt.length`` bases starting at the 1-based ``opt.start``.

    :returns: 0 on success, 1 after printing the usage message when the
        input file is missing.
    """
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        for spec in opts:
            if len(spec) == 4:
                parser.add_option(spec[0], spec[1], help=spec[2], **spec[3])
            elif len(spec) == 3:
                parser.add_option(spec[0], help=spec[1], **spec[2])
        (opt, args) = parser.parse_args()
        if not (opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")
        rlen = int(opt.length)
        rskip = int(opt.start) - 1
        if opt.debug:
            # The original message formatted `n` and `x`, which are not
            # defined in this function; report the start and length in use.
            print(""" fastqToFasta.py i=%s n=%i x=%i """ % (opt.input, rskip + 1, rlen))
        fq = pysam.FastqFile(opt.input)
        try:
            faFile = opt.output or unique_filename_in()
            with open(faFile, "w") as fa:
                for s in fq:
                    seq = s.sequence[rskip:(rskip + rlen)]
                    header = "_".join([s.name, s.sequence, s.quality])
                    fa.write(">" + header + "\n" + seq + "\n")
        finally:
            # Close the fastq handle even if writing fails.
            fq.close()
    except Usage as err:
        # Same bytes as the former `print >> sys.stderr, '\n', err.msg, '\n'`.
        sys.stderr.write('\n%s \n\n' % err.msg)
        if parser:
            parser.print_help()
        return 1
    return 0
def main():
    """Command-line entry point: write the reads of a fastq file as fasta.

    Options are declared from the module-level ``opts`` list. For every read
    the fasta header is ``name_sequence_quality`` and the written sequence is
    the slice of ``opt.length`` bases starting at the 1-based ``opt.start``.

    :returns: 0 on success, 1 after printing the usage message when the
        input file is missing.
    """
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        for spec in opts:
            if len(spec) == 4:
                parser.add_option(spec[0],spec[1],help=spec[2],**spec[3])
            elif len(spec) == 3:
                parser.add_option(spec[0],help=spec[1],**spec[2])
        (opt, args) = parser.parse_args()
        if not(opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")
        rlen = int(opt.length)
        rskip = int(opt.start)-1
        if opt.debug:
            # The original message formatted `n` and `x`, which are not
            # defined in this function; report the start and length in use.
            print(""" fastqToFasta.py i=%s n=%i x=%i """ %(opt.input,rskip+1,rlen))
        fq = pysam.FastqFile(opt.input)
        try:
            faFile = opt.output or unique_filename_in()
            with open(faFile,"w") as fa:
                for s in fq:
                    seq = s.sequence[rskip:(rskip+rlen)]
                    header = "_".join([s.name,s.sequence,s.quality])
                    fa.write(">"+header+"\n"+seq+"\n")
        finally:
            # Close the fastq handle even if writing fails.
            fq.close()
    except Usage as err:
        # Same bytes as the former `print >>sys.stderr, '\n',err.msg,'\n'`.
        sys.stderr.write('\n%s \n\n' %err.msg)
        if parser:
            parser.print_help()
        return 1
    return 0
def _begin(output,format,new,ratio=1.375,**kwargs): """Initializes the plot in *R*.""" if new: if output is None: output = unique_filename_in() if format == 'pdf': robjects.r('pdf("%s",paper="a4",height=8*%f,width=8)' %(output,ratio)) elif format == 'png': robjects.r('png("%s",height=800*%f,width=800,type="cairo")' %(output,ratio)) else: raise ValueError("Format not supported: %s" %format) pars = "lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=1.1,mar=c(4,4,1,1),las=1,pch=20" if len(kwargs.get('mfrow',[])) == 2: pars += ",mfrow=c(%i,%i)" %tuple(kwargs['mfrow']) robjects.r('par(%s)' %pars) opts = '' if 'log' in kwargs: opts += ',log="%s"' %kwargs['log'] if 'xlim' in kwargs: opts += ',xlim=c(%f,%f)' %tuple(kwargs['xlim']) if 'ylim' in kwargs: opts += ',ylim=c(%f,%f)' %tuple(kwargs['ylim']) opts += ',main="%s"' %kwargs.get('main','') opts += ',xlab="%s"' %kwargs.get('xlab','') opts += ',ylab="%s"' %kwargs.get('ylab','') return opts, output
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample.

    *outall* is read as a text track whose columns are chromosome, position,
    reference, one column per sample, then gene, location_type and distance
    (the last three are not read). For each sample whose first character in
    its column differs from the reference, a feature ``(chromosome,
    position-1, position)`` named ``"<ref>><char>"`` is appended to that
    sample's ``.bed.gz`` track. Every track is finally closed and added to
    *ex*.
    """
    leading = ['chromosome', 'position', 'reference']
    trailing = ['gene', 'location_type', 'distance']
    infields = leading + sample_names + trailing
    intrack = track(outall, format='text', fields=infields,
                    chrmeta=assembly.chrmeta, intypes={'position': int})
    instream = intrack.read(fields=leading + sample_names)

    # One output track per sample: sample name -> (track, file name)
    outtracks = {}
    for sample in sample_names:
        path = unique_filename_in() + '.bed.gz'
        sample_track = track(path, fields=['name'])
        sample_track.make_header(name=sample + "_SNPs")
        outtracks[sample] = (sample_track, path)

    for row in instream:
        coord = (row[0], row[1] - 1, row[1])
        ref = row[2]
        # Annotation per sample, None where the sample shows the reference.
        calls = {}
        for n, sample in enumerate(sample_names):
            base = row[3 + n][0]
            if base == ref:
                calls[sample] = None
            else:
                calls[sample] = "%s>%s" % (ref, base)
        for sample, pair in outtracks.iteritems():
            if calls[sample]:
                pair[0].write([coord + (calls[sample], )], mode='append')

    for sample, pair in outtracks.iteritems():
        pair[0].close()
        description = set_file_descr(sample + "_SNPs.bed.gz", type='bed',
                                     step='tracks', gdv='1', ucsc='1')
        ex.add(pair[1], description=description)
def getCountsPerLevel(infile, level=None, output=None):
    """Sum the counts of a tab-separated table per value of the *level* column.

    The first line of *infile* is a header that must contain *level* and
    "Kingdom". Counts are read from the second column. Rows shorter than the
    header are padded with empty strings, and an empty level value is
    reported as "Unnanotated".

    :param infile: name of the input table.
    :param level: name of the header column to group by.
    :param output: output file name; a unique name is generated if None.
    :returns: the output file name. Each output row holds the level value,
        the columns from *level* towards (not including) "Kingdom" as found in
        the last row seen for that value, the summed count and its percentage
        of the grand total (0 when the grand total is 0).
    :raises ValueError: if *level* (including the default None) or "Kingdom"
        is not a header column.
    """
    if output is None:
        output = unique_filename_in()
    counts = {}
    lineage = {}
    tot = 0
    idColCounts = 1
    with open(infile) as f:
        header = next(f).strip("\n").split("\t")
        try:
            level_idx = header.index(level)
        except ValueError:
            # %-formatting keeps the message usable when level is None.
            raise ValueError("No column corresponds to %s in file %s" % (level, infile))
        level_top = header.index("Kingdom")
        # Walk from the level column towards "Kingdom", whichever side it is.
        step = 1 if level_top > level_idx else -1
        colrange = range(level_idx, level_top, step)
        header_out = [header[n] for n in colrange]
        name = header[idColCounts]
        for line in f:
            s = line.strip("\n").split("\t")
            if len(s) < len(header):
                s.extend([""] * (len(header) - len(s)))
            value = float(s[idColCounts])
            key = s[level_idx]
            tot += value
            counts[key] = counts.get(key, 0.0) + value
            lineage[key] = [s[n] for n in colrange]
    with open(output, "w") as out:
        out_header = [level] + header_out + ["counts_" + name, "%counts_" + name]
        out.write("\t".join(out_header) + "\n")
        for k, v in lineage.items():
            # A zero grand total would otherwise raise ZeroDivisionError.
            pc = 100 * counts[k] / tot if tot else 0.0
            curk = k or "Unnanotated"
            out.write("\t".join([curk] + v + ["%.2f" % counts[k], "%.3f" % pc]) + "\n")
    return output
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

    * BAM files from replicates within the same group are merged
    * MACS is called to identify enriched regions (only peak summit +- 300 will be used), this can be by-passed by providing a bed file to any group
    * Wellington is called to identify footprints within these enriched regions
    * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
    * Average DNAse profiles around motifs are plotted

    :param ex: execution object; results are registered with ``ex.add``.
    :param job: object read for ``files`` (group id -> runs), ``groups``
        (group id -> dict with 'name', 'control' and optionally 'bedfile',
        'motif') and ``options``.
    :param assembly: read for ``chrmeta`` and ``chrnames`` and passed to the
        helper steps.
    :param logfile: file-like object receiving progress messages.
    :param via: passed through to the helper steps.
    :returns: 0
    :raises TypeError: if a value of ``job.files`` is not a dict.
    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    # Parent of the remote working directory: relative bed files are looked up there.
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError(
                "Files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        # A single run given directly (it has a 'bam' key) is wrapped so that
        # it looks like a {run_id: run} dictionary.
        if 'bam' in mapped:
            mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            # Use the group's bed file if it exists, as given or relative to
            # supdir; otherwise None.
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(
                    os.path.join(supdir, job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    # Without any control group, a single None placeholder is used.
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    # Only done if at least one group has a non-empty 'motif' other than 'null'.
    if any([
            gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()
    ]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        # Signal tracks per test group id.
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            # NOTE(review): unlike in the first loop, a run given directly
            # (with a 'bam' key) is not wrapped here - confirm this is intended.
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                # Densities are computed unless usable stranded 'wig' files
                # already exist; merge=-1 and read_extension=1 are fixed here.
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(
                        ex, m["bam"], assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1, read_extension=1,
                        convert=False,
                        b2w_args=[], via=via)
                    wig.append(dict(
                        (s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            # Several runs: merge their densities, per strand, into wig[0].
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                # A control's signal is attached to every test group.
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([
                    track(w, info={'name': _trn % s})
                    for s, w in wig[0].iteritems()
                ])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            # Empty placeholder file that the pdf and the tar archive are
            # associated to.
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none', view='admin',
                                              step='motifs', groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf', step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            # Archive the per-motif matrices as <group>_<motif>.txt
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname + '_footprints_plots.tar.gz',
                                              type='tar', step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local', logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow.

    For each group the runs' bam files are merged, a pileup job producing a
    vcf file is launched per chromosome, and all vcf files are archived. The
    vcf files are then combined per chromosome by ``all_snps`` and
    ``exon_snps`` into two text tables, a per-sample alignment of the SNP
    bases is written, bed tracks are created with ``create_tracks`` and, if
    the job option 'make_bigwigs' is set, coverage, heterozygosity and
    quality tracks are written for each group.

    :param ex: execution object; results are registered with ``ex.add``.
    :param job: object read for ``files``, ``groups`` and ``options``.
    :param assembly: read for ``fasta_by_chrom``, ``chrmeta`` and ``name``.
    :param minsnp: converted to float and passed to ``all_snps``.
    :param mincov: passed to ``all_snps``.
    :param path_to_ref: not used in this function.
    :param via: passed to the pileup jobs.
    :param logfile: file-like object receiving progress messages.
    :param debugfile: passed to ``all_snps`` and ``exon_snps``.
    :returns: 0
    """
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        # Write a header file in which the sequence names known to the
        # assembly are replaced by their 'ac' value; it is passed to pileup.
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            # Temporarily store (vcf file name, pending job).
            vcfs[chrom][gid] = (vcf, pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                        via=via, stdout=vcf))
        logfile.write(" ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write(" ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    # Write the header line of both tables; the file names are then handed to
    # all_snps / exon_snps.
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    # One growing sequence per column: the assembly itself, then each sample.
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write(" > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write(" - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write(" - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        # Append the first character of each column (from index 3 on) of
        # every SNP row to the corresponding sequence.
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    # Alignment file: a line with the number of sequences and their length,
    # then one "name<TAB>sequence" line per sequence.
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        """Return three lists of (start, end, score) for the pileup columns in
        [startpos, endpos): coverage, heterozygosity and mean base quality
        (quality characters are decoded with an offset of 33). Only positive
        scores are kept."""
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            # Counts of A, C, G, T and, in the last slot, anything else.
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            # NOTE(review): this division happens before the `coverage > 0`
            # test below and would fail on a column with coverage 0 - confirm
            # such columns cannot occur.
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
            # yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)
    return 0
#!/usr/bin/env python
# Convert a tab-separated export file to a fasta-like file.
#
# Options: -i input file (required), -o output file (a unique name if
# omitted), -n 1-based start of the extracted subsequence (default 1),
# -x length of the extracted subsequence (default 22).
from bbcflib.common import unique_filename_in
import sys, getopt, os

opts = dict(getopt.getopt(sys.argv[1:], "i:o:n:x:", [])[0])
exportFile = opts['-i']
n = opts.get('-n') or 1
x = opts.get('-x') or 22
print("In fastqToFasta")
# The input name used to be printed through the undefined name `fqFile`.
print("i=" + exportFile)
print("n=" + str(n))
# This line used to print n a second time.
print("x=" + str(x))
faFile = opts.get('-o') or unique_filename_in()
i = 1
n = int(n)
x = int(x)
with open(faFile, "w") as output:
    with open(exportFile, "r") as f:
        for s in f:
            s = s.strip('\n').split('\t')
            # Columns 9 and 10 and the last column identify the line; the
            # subsequence is cut out of column 9.
            # NOTE(review): nothing separates the last column from the
            # subsequence, so both end up on the same ">" line - confirm
            # whether a "\n" is missing here.
            output.write(">line" + str(i) + ":" + s[8] + ":" + s[9] + ":" +
                         s[-1] + s[8][(n - 1):(n + x - 1)] + "\n")
            i = i + 1
# Command-line options: -i fastq file (required), -o output file (a unique
# name if omitted), -n and -x integers (defaults 1 and 22).
opts = dict(getopt.getopt(sys.argv[1:],"i:o:n:x:",[])[0])
fqFile=opts['-i']
n=opts.get('-n') or 1
x=opts.get('-x') or 22
print("In fastqToFasta")
print("i="+fqFile)
print("n="+str(n))
# This line used to print n a second time.
print("x="+str(x))
faFile=opts.get('-o') or unique_filename_in()
# NOTE(review): nothing visible in this section writes to or closes `output`.
output=open(faFile,"w")
i=1; nextIsQual=0; nextIsSeq=0; n=int(n);x=int(x)
read_length=getReadLength(fqFile)
print("readLength="+str(read_length))
with open(fqFile,"r") as f:
    for s in f:
        s=s.strip('\n')
        i=i+1
        if re.search(r'^@',s) and nextIsSeq == 0: #to avoid situations where the quality starts with either "@" or "+"
            nextIsSeq=1
            continue
        if re.search(r'^\+',s) and nextIsQual == 0: #to avoid situations where the quality starts with either "@" or "+"
            nextIsQual=1
            nextIsSeq=0
def _outfile(kw): return kw.pop('outfile', unique_filename_in())
def _fetch_symlink(self, link_name, to=None): """Fetch the data from a file in the LIMS into *to*. *link_name* is a (list of) URL to a .tar.gz file in the LIMS. These .tar.gz files all contain only one file, which we write to *to*. If *to* is omitted, then the data is written to a randomly named file in the current working directory. If *to* is a directory, the data is written to a randomly named file in that directory. Otherwise *to* is taken as the full path to the file to write to. ``_fetch_symlink`` returns the path to the output file, including its filename. """ def _concat_all(target,llist): with open(target, 'w') as output_file: for link in llist: try: url = self._open_url(link) tar = None if re.sub('.gz[ip]*','',link).endswith(".tar"): tar = tarfile.open(fileobj=url, mode='r|gz') # Since the tar file contains exactly one file, calling # ``next()`` on the tar gives us the file we want. We cannot # use ``getnames()[0]`` or similar methods, since they scan # all the way through the file, and we cannot rewind on HTTP # responses. tar_filename = tar.next() # extractfile returns a file-like object we can stream from. 
input_file = tar.extractfile(tar_filename) elif not(link.endswith(".gz")): input_file = url else: input_file = gzip.GzipFile(fileobj=StringIO.StringIO(url.read())) while True: chunk = input_file.read(4096) if chunk == '': break else: output_file.write(chunk) input_file.close() if tar: tar.close() except Exception as e: raise Exception("Problem with file %s: %s"%(link,e)) if to == None: target = unique_filename_in() elif os.path.isdir(to): target = os.path.join(to, unique_filename_in(to)) else: target = to if isinstance(link_name,dict): linknext = ([],[]) for k in sorted(link_name.keys()): if k[0] > 1: linknext[1].append(link_name[k]) else: linknext[0].append(link_name[k]) link_name = linknext if isinstance(link_name,str): link_name = [link_name] if isinstance(link_name,list): link_name = (link_name,[]) _concat_all(target,link_name[0]) if len(link_name[1])>0: _concat_all(target+"_R2",link_name[1]) return (target,target+"_R2") return target
def run(**kwargs):
    """
    Wrapper function to execute any operation contained in this package, directly from
    file inputs. Arguments are:

    :param operation: (str) the name of the function to be called.
    :param output: (str) a filename or a directory to write the results into.
        Defaults to "./"; in a directory, a unique ".sql" file name is used.
    :param assembly: (str) a genome assembly identifier if needed.
    :param chromosome: (str) a chromosome name if operation must be restricted to a single chromosome.
    :param datatype: (str) if given, stored as the 'datatype' info of the output track(s).
    :param ...: additional parameters passed to `operation`. The arguments
        listed as loadable for the operation are comma-separated track file
        names; they are replaced by lists of streams before the call.
    :returns: the output file name, or a list of file names
        (``<output>_<n>.<format>``) when the operation returns a list of streams.
    :raises ValueError: if no module provides `operation`.

    Example::

        run(operation="score_by_feature", output="score_output.bed", chromosome="chr1",
            trackScores="density_file.sql", trackFeatures="genes.sql")
    """
    from bbcflib import genrep

    def _map(fct):
        # Import each module of the package in turn and return the name of
        # the first one whose class (named like the module) has attribute *fct*.
        for module in _module_list:
            __import__(_here+module)
            smod = sys.modules[_here+module]
            if hasattr(getattr(smod, module)(),fct): return module
        return None

    funct = kwargs.pop("operation",'None')
    module = _map(funct)
    if module is None:
        raise ValueError("No such operation %s." %funct)
    output = kwargs.pop("output","./") or "./"
    if os.path.isdir(output):
        output = os.path.join(output,unique_filename_in(output)+".sql")
        format = "sql"
    else:
        # The format is the file extension ("sql" if there is none); for
        # gzipped files it is "<inner extension>.<gz|gzip>".
        format = os.path.splitext(output)[1][1:] or "sql"
        if format in ['gz','gzip']:
            # NOTE(review): str.strip removes a set of characters from both
            # ends, not a suffix - names beginning or ending with '.', 'g',
            # 'z', 'i' or 'p' lose more than the extension. Confirm intent.
            format = os.path.splitext(output.strip("."+format))[1][1:]+"."+format
    smod = sys.modules[_here+module]
    # Open one track per comma-separated file name of each loadable argument.
    trackSet = {}
    for targ in getattr(smod, module)().loadable(funct):
        trackSet[targ] = [track(t) for t in kwargs[targ].split(",")]
    assembly = None
    if 'assembly' in kwargs:
        assembly = kwargs.pop('assembly')
    if assembly:
        chrmeta = genrep.Assembly(assembly).chrmeta
    else:
        # `targ` is the last loadable argument of the loop above: chrmeta
        # comes from its first track.
        chrmeta = trackSet[targ][0].chrmeta
    if 'chromosome' in kwargs:
        chrom = kwargs.pop('chromosome')
        chrmeta = {chrom: chrmeta.get(chrom,{})}
    # The first chromosome creates the output file(s); the following ones
    # are appended below.
    chr = chrmeta.keys()[0]
    info = None
    if 'datatype' in kwargs: info = {'datatype': kwargs.pop('datatype')}
    files = None
    for targ in getattr(smod, module)().loadable(funct):
        kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
    funct_output = getattr(smod, funct)(**kwargs)
    if isinstance(funct_output,list):
        # Several output streams: one file per stream.
        files = []
        for n,stream in enumerate(funct_output):
            # NOTE(review): same str.strip caveat as above; for "res.sql"
            # this yields "res._0.sql".
            outf = "%s_%i.%s" %(output.strip(format),n,format)
            files.append(outf)
            fields = stream.fields
            track(outf,chrmeta=chrmeta,fields=fields,
                  info=info).write(stream,chrom=chr)
        for chr in chrmeta.keys()[1:]:
            for targ in getattr(smod, module)().loadable(funct):
                kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
            funct_output = getattr(smod, funct)(**kwargs)
            for n,stream in enumerate(funct_output):
                track(files[n],chrmeta=chrmeta).write(stream,chrom=chr,mode='append')
    else:
        # Single output stream written to *output*.
        files = output
        fields = funct_output.fields
        track(files,chrmeta=chrmeta,fields=fields,
              info=info).write(funct_output,chrom=chr)
        for chr in chrmeta.keys()[1:]:
            for targ in getattr(smod, module)().loadable(funct):
                kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
            funct_output = getattr(smod, funct)(**kwargs)
            track(files,chrmeta=chrmeta).write(funct_output,chrom=chr,mode='append')
    return files
def save_wellington(ex, wellout, chrmeta):
    """Gather the Wellington output files of each group and add them to *ex*.

    For every ``(group id, group name)`` key of *wellout*, whose value is a
    list of ``(directory, prefix)`` pairs, three files are built and
    associated to an empty placeholder file: a gzipped bed of the footprints
    at FDR 0.01, a gzipped bed with one track per file found in the
    "p_value_cutoffs" directory of the first pair, and the concatenated wig
    files.

    :param chrmeta: not used (the bigWig conversion that needed it is disabled).
    :returns: dict mapping each group id to its FDR 0.01 bed file name.
    """
    def _append_file(path, sink):
        # Copy the text file *path*, line by line, into the open handle *sink*.
        with open(path) as src:
            for line in src:
                sink.write(line)

    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        gname = name[1]
        gid = name[0]
        ex.add(wellall,
               description=set_file_descr(gname + '_wellington_files',
                                          type='none', view='admin',
                                          step='footprints', groupId=gid))
        #### BED at FDR 1%
        fdr_bed = wellall + "FDR01.bed.gz"
        bedlist[gid] = fdr_bed
        bedzip = gzip.open(fdr_bed, 'wb')
        bedzip.write("track name='" + gname + "_WellingtonFootprints_FDR_0.01'\n")
        for pair in wlist:
            _append_file(os.path.join(*pair) + ".WellingtonFootprints.FDR.0.01.bed",
                         bedzip)
        bedzip.close()
        ex.add(fdr_bed,
               description=set_file_descr(gname + '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        pval_bed = wellall + "PvalCutoffs.bed.gz"
        bedzip = gzip.open(pval_bed, 'wb')
        cutoff_dir = os.path.join(wlist[0][0], "p_value_cutoffs")
        for bfile in os.listdir(cutoff_dir):
            # The cutoff is what lies between the last "." and the final ".bed"
            without_bed = bfile[:-4]
            cut = os.path.splitext(without_bed)[1][1:]
            bedzip.write("track name='" + gname
                         + "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _append_file(os.path.join(wdir, "p_value_cutoffs",
                                          wpref + ".WellingtonFootprints." + cut + ".bed"),
                             bedzip)
        bedzip.close()
        ex.add(pval_bed,
               description=set_file_descr(
                   gname + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                   type='bed', ucsc='1', step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG (kept as wig: the conversion to bigWig is disabled)
        wig_file = wellall + ".wig"
        wig_parts = [os.path.join(*pair) + ".WellingtonFootprints.wig"
                     for pair in wlist]
        cat(wig_parts, wig_file)
        ex.add(wig_file,
               description=set_file_descr(gname + '_WellingtonFootprints.wig',
                                          type='wig', ucsc='1',
                                          step='footprints', groupId=gid),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
def run_microbiome(options=[], output=None):
    """Build the command line of the ``run_microbiome.py`` script.

    :param options: list of arguments; a list or tuple item is rendered as its
        items joined with ",", any other item with ``str``.
    :param output: output file name, appended as the last argument; a unique
        name is generated if None.
    :returns: dict with the keys "arguments" (the command as a list) and
        "return_value" (the output file name).
    """
    if output is None:
        output = unique_filename_in()
    command = ["run_microbiome.py"]
    for item in options:
        if isinstance(item, (list, tuple)):
            command.append(",".join(map(str, item)))
        else:
            command.append(str(item))
    command.append(output)
    return {"arguments": command, "return_value": output}