def run_nfr(args):
    """Call nucleosome-free regions (NFRs) and write tabix-indexed outputs.

    Requires either a BAM file or a precomputed insertion track.  Writes
    <out>.nfrpos.bed.gz (+ tabix index) and, when no insertion track was
    supplied, also <out>.ins.bedgraph.gz (+ index).
    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        # default output prefix: calls filename minus its last three extensions
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        # with a fasta available, validate chromosomes and keep PWM flanks in bounds
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source = "BAM file")
    chunks.merge()
    maxQueueSize = args.cores * 10  # NOTE(review): computed but never used in this block
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper, fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # create/truncate the output file; a dedicated writer process appends to it
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        # no insertion track supplied: compute and write one alongside the NFRs
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                # in this mode the helper returns an (nfr, insertion) pair
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')  # sentinel telling the writer process to finish
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    # bgzip-compress and tabix-index outputs, removing the plain-text files
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
def get_counts(args):
    """Count properly-paired fragments overlapping each region of a BED file.

    For each chunk in args.bed, counts plus-strand reads of a proper pair whose
    insert size lies in [args.lower, args.upper] and whose left or right end
    falls inside the chunk.  Writes one count per line to <out>.counts.txt.gz.
    """
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed)
    # FIX: dtype was np.int, an alias removed in NumPy 1.24; builtin int is the
    # documented equivalent (maps to the platform default integer, np.int_)
    mat = np.zeros(len(chunks), dtype=int)
    bamHandle = AlignmentFile(args.bam)
    # enumerate replaces the manual j counter from the original
    for j, chunk in enumerate(chunks):
        # fetch with args.upper slop so fragments extending into the chunk are seen
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    # shift left position by +4 to the Tn5 insertion center
                    l_pos = read.pos + 4
                    # correct by 8 bp so the length is insertion-to-insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
def get_signal(args):
    """Aggregate signal from a bedgraph track around a set of sites.

    Fans the regions out to a worker pool, optionally saves per-site tracks
    (args.all) and, unless args.no_agg is set, plots and saves the aggregate.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed, strand_col=args.strand)
    params = _signalParams(args.bg, args.sizes, args.up, args.down, args.exp, args.scale, args.positive, args.all)
    workers = Pool(processes=args.cores)
    pieces = workers.map(_signalHelper, zip(chunks.split(items=min(args.cores * 20, len(chunks))), itertools.repeat(params)))
    workers.close()
    workers.join()
    if args.all:
        # keep every per-site track, then collapse (NaNs treated as zero)
        mat = np.vstack(pieces)
        np.savetxt(args.out + ".tracks.txt.gz", mat, delimiter=",", fmt="%1.5g")
        mat[np.isnan(mat)] = 0
        result = np.sum(mat, axis=0)
    else:
        result = sum(pieces)
    if args.no_agg:
        return
    if args.norm:
        result = result / len(chunks)
    fig = plt.figure()
    plt.plot(range(-args.up, args.down + 1), result)
    plt.xlabel("Position relative to Site")
    plt.ylabel("Signal Intensity")
    fig.savefig(args.out + '.agg.track.eps')
    plt.close(fig)
    np.savetxt(args.out + '.agg.track.txt', result, delimiter="\t")
def get_counts(args):
    """Count properly-paired fragments overlapping each region of a BED file.

    Counts plus-strand proper-pair reads whose insert size lies within
    [args.lower, args.upper] and whose left or right end falls inside the
    chunk.  Writes one count per line to <out>.counts.txt.gz.
    """
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed)
    # FIX: dtype was np.int, removed in NumPy 1.24; builtin int is equivalent
    mat = np.zeros(len(chunks), dtype=int)
    bamHandle = AlignmentFile(args.bam)
    for j, chunk in enumerate(chunks):
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    # shift to Tn5 insertion center (+4 on the plus strand)
                    l_pos = read.pos + 4
                    # correct by 8 bp so the length is insertion-to-insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
def get_cov(args, bases = 50000, splitsize = 1000):
    """Compute a coverage track and write it as a tabix-indexed bedgraph.

    Regions come from args.bed when given, otherwise whole chromosomes from
    the BAM header split into `splitsize`-bp windows.  Coverage is computed
    in a worker pool and streamed to a writer process, then the bedgraph is
    bgzipped and indexed as <out>.cov.bedgraph.gz.
    """
    if not args.out:
        # derive output prefix from the bed (if given) or bam filename
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        # FIX: use floor division so `items` is an int (true division yields
        # a float under Python 3); identical result for these int defaults
        sets = chunks.split(items = bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    # bound the queue so workers cannot run far ahead of the writer
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1, args.cores - 1))
    # create/truncate the output file; the writer process appends to it
    out_handle = open(args.out + '.cov.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args = (write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')  # sentinel for the writer process
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
def get_nucleotide(args):
    """Compute (di)nucleotide frequencies around sites and save a text table.

    Distributes chunks to a worker pool, sums the per-chunk count matrices,
    converts them to frequencies, optionally normalizes by genome-wide
    nucleotide frequencies (args.norm), and writes <out>.nucfreq.txt.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed, strand_col=args.strand)
    params = _NucleotideParameters(args.up, args.down, args.fasta, args.dinucleotide)
    sets = chunks.split(bases=10000)
    pool = Pool(processes=args.cores)
    tmp = pool.map(_nucleotideHelper, list(zip(sets, itertools.repeat(params))))
    pool.close()
    pool.join()
    result = np.zeros(params.matsize)
    n = 0.0
    for counts, sites in tmp:
        result += counts
        n += sites
    # FIX: was `result // n` (floor division), which truncated the frequency
    # matrix toward zero; true division matches the sibling implementation
    result = result / n
    if args.norm:
        # normalize against genome-wide nucleotide frequencies
        normfreqs = seq.getNucFreqs(params.fasta, params.nucleotides)
        result = result / np.reshape(np.repeat(normfreqs, result.shape[1]), result.shape)
    #save text output
    out = np.hstack((np.array(params.nucleotides)[:, np.newaxis], result.astype('|S8')))
    np.savetxt(args.out + '.nucfreq.txt', out, delimiter="\t", fmt='%s')
def get_signal(args):
    """Aggregate signal from a bedgraph track around a set of sites.

    Optionally saves every per-site track (args.all); unless args.no_agg is
    set, plots and saves the aggregate profile.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed, strand_col = args.strand)
    params = _signalParams(args.bg, args.sizes, args.up, args.down,args.exp, args.scale, args.positive, args.all)
    # cap the number of work items so tiny inputs aren't over-split
    sets = chunks.split(items = min(args.cores*20,len(chunks)))
    pool = Pool(processes = args.cores)
    tmp = pool.map(_signalHelper, zip(sets,itertools.repeat(params)))
    pool.close()
    pool.join()
    if args.all:
        # keep every per-site track, then collapse (NaNs treated as zero)
        mat = np.vstack(tmp)
        np.savetxt(args.out + ".tracks.txt.gz", mat, delimiter = ",", fmt="%1.5g")
        mat[np.isnan(mat)]=0
        result = np.sum(mat, axis = 0)
    else:
        result = sum(tmp)
    if not args.no_agg:
        if args.norm:
            # average per site rather than total
            result = result / len(chunks)
        fig = plt.figure()
        plt.plot(range(-args.up,args.down+1),result)
        plt.xlabel("Position relative to Site")
        plt.ylabel("Signal Intensity")
        fig.savefig(args.out+'.agg.track.eps')
        plt.close(fig)
        np.savetxt(args.out+'.agg.track.txt',result,delimiter="\t")
def make_bias_vplot(args):
    """Build a sequence-bias V-plot around sites, plot it, and save the VMat.

    Per-chunk matrices are computed in a worker pool and summed; extra
    diagnostic plots are produced when args.plot_extra is set.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed, strand_col = args.strand)
    sets = chunks.split(items = min(args.cores*20,len(chunks)))
    params = _BiasVplotParams(flank = args.flank, lower = args.lower, upper = args.upper, bg = args.bg, sizes = args.sizes, scale = args.scale, pwm = args.pwm, fasta = args.fasta)
    pool = Pool(processes = args.cores)
    tmp = pool.map(_biasVplotHelper, zip(sets,itertools.repeat(params)))
    pool.close()
    pool.join()
    result = sum(tmp)
    ##Turn matrix into VMat object
    vmat=VMat(result,args.lower,args.upper)
    vmat.plot(filename=args.out+".Bias.Vplot.eps")
    if args.plot_extra:
        ##get insertion profile represented by vplot
        vmat.converto1d()
        vmat.plot_1d(filename=args.out+'.InsertionProfile.eps')
        #get insert size distribution represented by vplot
        vmat.plot_insertsize(filename= args.out + ".InsertSizes.eps")
    ##save
    vmat.save(args.out+".Bias.VMat")
def make_bias_vplot(args):
    """Build a sequence-bias V-plot around sites, plot it, and save the VMat.

    Per-chunk matrices are computed in a worker pool and summed; extra
    diagnostic plots are produced when args.plot_extra is set.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    sites = ChunkList.read(args.bed, strand_col=args.strand)
    params = _BiasVplotParams(flank=args.flank, lower=args.lower, upper=args.upper, bg=args.bg, sizes=args.sizes, scale=args.scale, pwm=args.pwm, fasta=args.fasta)
    workers = Pool(processes=args.cores)
    partial_mats = workers.map(_biasVplotHelper, zip(sites.split(items=min(args.cores * 20, len(sites))), itertools.repeat(params)))
    workers.close()
    workers.join()
    # combine the per-chunk matrices into a single VMat object
    vmat = VMat(sum(partial_mats), args.lower, args.upper)
    vmat.plot(filename=args.out + ".Bias.Vplot.eps")
    if args.plot_extra:
        # insertion profile implied by the vplot
        vmat.converto1d()
        vmat.plot_1d(filename=args.out + '.InsertionProfile.eps')
        # insert-size distribution implied by the vplot
        vmat.plot_insertsize(filename=args.out + ".InsertSizes.eps")
    vmat.save(args.out + ".Bias.VMat")
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """Compute a sequence-bias score track and write a tabix-indexed bedgraph.

    Regions come from args.bed when given, otherwise whole chromosomes split
    into `splitsize`-bp windows.  Scores are computed in a worker pool and
    streamed to a writer process, then bgzipped and indexed as
    <out>.Scores.bedgraph.gz.
    """
    if args.out is None:
        # derive output prefix from the bed (if given) or fasta filename
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        # FIX: floor division keeps `items` an int (true division yields a
        # float under Python 3); identical result for these int defaults
        sets = chunks.split(items = bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    # bound the queue so workers cannot run far ahead of the writer
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1, args.cores - 1))
    # create/truncate the output file; the writer process appends to it
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args = (write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j, itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')  # sentinel for the writer process
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
def setUp(self):
    """Build a bias-matrix fixture for the Test_BiasMat tests."""
    self.chunk = ChunkList.read('example/example.bed')[0]
    chrom, start, end = self.chunk.chrom, self.chunk.start, self.chunk.end
    track = InsertionBiasTrack(chrom, start, end)
    track.read_track('example/example.Scores.bedgraph.gz')
    self.biastrack = track
    self.biasmat = BiasMat2D(chrom, start + 100, end - 100, 100, 200)
    self.biasmat.makeBiasMat(self.biastrack)
def setUp(self):
    """Set up Test_occupancy fixtures: chunk, VMat, fragment matrix, signal.

    FIX: the original opened 'example/example.VMat' twice in a row and
    discarded the first object; a single open is sufficient.
    """
    bed_list = ChunkList.read('example/example.bed')
    self.chunk = bed_list[0]
    self.vmat = V.VMat.open('example/example.VMat')
    self.mat = FragmentMat2D(self.chunk.chrom, self.chunk.start - self.vmat.w, self.chunk.end + self.vmat.w, self.vmat.lower, self.vmat.upper)
    self.mat.makeFragmentMat('example/example.bam')
    self.signal = Nuc.SignalTrack(self.chunk.chrom, self.chunk.start, self.chunk.end)
    self.signal.calculateSignal(self.mat, self.vmat)
def setUp(self):
    """Construct the bias track and bias matrix used by Test_BiasMat."""
    regions = ChunkList.read('example/example.bed')
    self.chunk = regions[0]
    self.biastrack = InsertionBiasTrack(self.chunk.chrom, self.chunk.start, self.chunk.end)
    self.biastrack.read_track('example/example.Scores.bedgraph.gz')
    # bias matrix covers the chunk trimmed by 100 bp on each side
    self.biasmat = BiasMat2D(self.chunk.chrom, self.chunk.start + 100, self.chunk.end - 100, 100, 200)
    self.biasmat.makeBiasMat(self.biastrack)
def setUp(self):
    """Prepare a SignalDistribution fixture for background-variance tests."""
    region = ChunkList.read('example/example.bed')[0]
    vplot = V.VMat.open('example/example.VMat')
    bias_track = InsertionBiasTrack(region.chrom, region.start, region.end)
    bias_track.read_track('example/example.Scores.bedgraph.gz')
    # bias matrix covers the region trimmed by 200 bp on each side
    bias_mat = BiasMat2D(region.chrom, region.start + 200, region.end - 200, 100, 250)
    bias_mat.makeBiasMat(bias_track)
    self.signaldist = Nuc.SignalDistribution(region.start + 300, vplot, bias_mat, 35)
def run_diff(args, bases=500000):
    """Run differential occupancy calling between two fragment-size models.

    NOTE(review): this function references `fragment_dist`, which is never
    defined here (only `fragment_dist1` and `fragment_dist2` are built), so
    calling it raises NameError.  The intended argument to
    OccupancyParameters needs to be confirmed before fixing.
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs, min_offset=args.flank + args.upper / 2 + max(pwm.up, pwm.down))
    chunks.merge()
    maxQueueSize = max(2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))  # NOTE(review): computed but never used
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals=FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals=FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()
    # NOTE(review): `fragment_dist` is undefined -> NameError at runtime
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep=args.nuc_sep, min_occ=args.min_occ, flank=args.flank, bam=args.bam, ci=args.confidence_interval)
    sets = chunks.split(bases=bases)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    # create/truncate the output file; the writer process appends to it
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target=_writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)  # NOTE(review): accumulated nowhere in this block
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')  # sentinel for the writer process
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset="bed", force=True)
def get_pwm(args, bases=50000, splitsize=1000):
    """Compute nucleotide frequencies around ATAC insertion sites as a PWM.

    Accumulates per-chunk count matrices from a worker pool, normalizes by
    background nucleotide frequencies, optionally symmetrizes the flanks
    (args.sym), and saves <out>.PWM.txt.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    chrs = read_chrom_sizes_from_fasta(args.fasta)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize, offset=args.flank)
        # FIX: floor division keeps `items` an int (true division yields a
        # float under Python 3)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed, chromDict=chrs, min_offset=args.flank)
        sets = chunks.split(bases=bases)
    params = _PWMParameters(bam=args.bam, up=args.flank, down=args.flank, fasta=args.fasta, lower=args.lower, upper=args.upper, atac=args.atac, sym=args.sym)
    pool = Pool(processes=args.cores)
    tmp = pool.map(_pwmHelper, zip(sets, itertools.repeat(params)))
    pool.close()
    pool.join()
    n = 0.0
    result = np.zeros((len(params.nucleotides), params.up + params.down + 1))
    for counts, sites in tmp:
        result += counts
        n += sites
    result /= n
    if args.bed:
        normfreqs = seq.getNucFreqsFromChunkList(chunks, args.fasta, params.nucleotides)
    else:
        normfreqs = seq.getNucFreqs(args.fasta, params.nucleotides)
    result = result / np.reshape(np.repeat(normfreqs, result.shape[1]), result.shape)
    if args.sym:
        # Symmetrize: average left flank with the reverse-complemented right flank
        left = result[:, 0:(args.flank + 1)]
        right = result[:, args.flank:]
        rightflipped = np.fliplr(np.flipud(right))
        combined = (left + rightflipped) / 2
        result = np.hstack((combined, np.fliplr(np.flipud(combined[:, 0:args.flank]))))
    #save
    pwm = PWM(result, args.flank, args.flank, params.nucleotides)
    pwm.save(args.out + '.PWM.txt')
def setUp(self):
    """Set up Test_occupancy fixtures: chunk, VMat, fragment matrix, signal.

    FIX: the original opened 'example/example.VMat' twice in a row and
    discarded the first object; a single open is sufficient.
    """
    bed_list = ChunkList.read('example/example.bed')
    self.chunk = bed_list[0]
    self.vmat = V.VMat.open('example/example.VMat')
    self.mat = FragmentMat2D(self.chunk.chrom, self.chunk.start - self.vmat.w, self.chunk.end + self.vmat.w, self.vmat.lower, self.vmat.upper)
    self.mat.makeFragmentMat('example/example.bam')
    self.signal = Nuc.SignalTrack(self.chunk.chrom, self.chunk.start, self.chunk.end)
    self.signal.calculateSignal(self.mat, self.vmat)
def run_nfr(args):
    """Call nucleosome-free regions (NFRs) and write tabix-indexed outputs.

    Requires either a BAM file or a precomputed insertion track.  Writes
    <out>.nfrpos.bed.gz and, when no insertion track was supplied, also
    <out>.ins.bedgraph.gz.
    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        # default output prefix: calls filename minus its last three extensions
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    chunks = ChunkList.read(args.bed)
    chunks.merge()
    maxQueueSize = args.cores * 10  # NOTE(review): computed but never used
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper, fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # create/truncate the output file; a dedicated writer process appends to it
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        # no insertion track supplied: compute and write one alongside the NFRs
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                # in this mode the helper returns an (nfr, insertion) pair
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')  # sentinel telling the writer process to finish
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    # bgzip-compress and tabix-index outputs, removing the plain-text files
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
def make_bias_track(args, bases=500000, splitsize=1000):
    """Compute a sequence-bias score track and write a tabix-indexed bedgraph.

    Regions come from args.bed when given, otherwise whole chromosomes split
    into `splitsize`-bp windows.  Scores are computed in a worker pool and
    streamed to a writer process, then bgzipped and indexed.
    """
    if args.out is None:
        # derive output prefix from the bed (if given) or fasta filename
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.checkChroms(list(params.chrs.keys()))
        chunks.merge()
        sets = chunks.split(bases=bases)
    # bound the queue so workers cannot run far ahead of the writer
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    # create/truncate the output file; the writer process appends to it
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, list(zip(j, itertools.repeat(params))))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')  # sentinel for the writer process
    write_process.join()
    # bgzip-compress and tabix-index, removing the plain-text bedgraph
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset="bed", force=True)
def get_ins(args, bases=50000, splitsize=1000):
    """Compute an insertion track and write it as a tabix-indexed bedgraph.

    Regions come from args.bed when given, otherwise whole chromosomes from
    the BAM header split into `splitsize`-bp windows.  Insertions (optionally
    smoothed via args.smooth) are computed in a worker pool and streamed to a
    writer process, then bgzipped and indexed as <out>.ins.bedgraph.gz.
    """
    if not args.out:
        # derive output prefix from the bed (if given) or bam filename
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        # FIX: floor division keeps `items` an int; the original `bases /
        # splitsize` produces a float under Python 3 true division
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    # bound the queue so workers cannot run far ahead of the writer
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    # create/truncate the output file; the writer process appends to it
    out_handle = open(args.out + '.ins.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeIns, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        if args.smooth:
            tmp = pool1.map(_insHelperSmooth, list(zip(j, itertools.repeat(args))))
        else:
            tmp = pool1.map(_insHelper, list(zip(j, itertools.repeat(args))))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')  # sentinel for the writer process
    write_process.join()
    pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.ins.bedgraph')
    pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset="bed", force=True)
def get_sizes(args):
    """Tally fragment-size frequencies from a BAM; save and optionally plot."""
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    sizes = FragmentSizes(lower = args.lower, upper = args.upper, atac = args.atac)
    if args.bed:
        # restrict the tally to (merged) regions from the BED file
        regions = ChunkList.read(args.bed)
        regions.merge()
        sizes.calculateSizes(args.bam, regions)
    else:
        sizes.calculateSizes(args.bam)
    sizes.save(args.out + '.fragmentsizes.txt')
    if args.no_plot:
        return
    # plot the size distribution
    fig = plt.figure()
    plt.plot(range(sizes.lower, sizes.upper), sizes.get(sizes.lower, sizes.upper), label = args.out)
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.fragmentsizes.eps')
    plt.close(fig)
def get_pwm(args, bases = 50000, splitsize = 1000):
    """Compute nucleotide frequencies around ATAC insertion sites as a PWM.

    Accumulates per-chunk count matrices from a worker pool, normalizes by
    background nucleotide frequencies, optionally symmetrizes the flanks
    (args.sym), and saves <out>.PWM.txt.
    """
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    chrs = read_chrom_sizes_from_fasta(args.fasta)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize, offset = args.flank)
        # FIX: floor division keeps `items` an int (true division yields a
        # float under Python 3); identical result for these int defaults
        sets = chunks.split(items = bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank)
        sets = chunks.split(bases = bases)
    params = _PWMParameters(bam = args.bam, up = args.flank, down = args.flank, fasta = args.fasta, lower = args.lower, upper = args.upper, atac = args.atac, sym = args.sym)
    pool = Pool(processes = args.cores)
    tmp = pool.map(_pwmHelper, zip(sets, itertools.repeat(params)))
    pool.close()
    pool.join()
    n = 0.0
    result = np.zeros((len(params.nucleotides), params.up + params.down + 1))
    for counts, sites in tmp:
        result += counts
        n += sites
    result /= n
    if args.bed:
        normfreqs = seq.getNucFreqsFromChunkList(chunks, args.fasta, params.nucleotides)
    else:
        normfreqs = seq.getNucFreqs(args.fasta, params.nucleotides)
    result = result / np.reshape(np.repeat(normfreqs, result.shape[1]), result.shape)
    if args.sym:
        # Symmetrize: average left flank with the reverse-complemented right flank
        left = result[:, 0:(args.flank + 1)]
        right = result[:, args.flank:]
        rightflipped = np.fliplr(np.flipud(right))
        combined = (left + rightflipped) / 2
        result = np.hstack((combined, np.fliplr(np.flipud(combined[:, 0:args.flank]))))
    #save
    pwm = PWM(result, args.flank, args.flank, params.nucleotides)
    pwm.save(args.out + '.PWM.txt')
def get_nucleotide(args):
    """Compute (di)nucleotide frequency content around sites and save a table."""
    if not args.out:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed, strand_col = args.strand)
    params = _NucleotideParameters(args.up, args.down, args.fasta, args.dinucleotide)
    pool = Pool(processes = args.cores)
    partials = pool.map(_nucleotideHelper, zip(chunks.split(bases = 10000), itertools.repeat(params)))
    pool.close()
    pool.join()
    # sum count matrices and site totals across chunks
    total = np.zeros(params.matsize)
    count = 0.0
    for counts, sites in partials:
        total += counts
        count += sites
    freqs = total / count
    if args.norm:
        # normalize against genome-wide nucleotide frequencies
        normfreqs = seq.getNucFreqs(params.fasta, params.nucleotides)
        freqs = freqs / np.reshape(np.repeat(normfreqs, freqs.shape[1]), freqs.shape)
    # combine row labels with stringified frequencies for the text output
    out = np.hstack((np.array(params.nucleotides)[:, np.newaxis], freqs.astype('|S8')))
    np.savetxt(args.out + '.nucfreq.txt', out, delimiter="\t", fmt = '%s')
def run_diff(args, bases = 500000):
    """Run differential occupancy calling between two fragment-size models.

    NOTE(review): this function references `fragment_dist`, which is never
    defined here (only `fragment_dist1` and `fragment_dist2` are built), so
    calling it raises NameError.  The intended argument to
    OccupancyParameters needs to be confirmed before fixing.
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down))
    chunks.merge()
    maxQueueSize = max(2,int(100 * bases / np.mean([chunk.length() for chunk in chunks])))  # NOTE(review): computed but never used
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0,args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0,args.upper))
    fragment_dist2.modelNFR()
    # NOTE(review): `fragment_dist` is undefined -> NameError at runtime
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ, flank = args.flank, bam = args.bam, ci = args.confidence_interval)
    sets = chunks.split(bases = bases)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # create/truncate the output file; the writer process appends to it
    diff_handle = open(args.out + '.occdiff.bed','w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target = _writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)  # NOTE(review): accumulated nowhere in this block
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')  # sentinel for the writer process
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True)
def get_sizes(args):
    """Tally fragment-size frequencies from a BAM; save and optionally plot."""
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
    sizes = FragmentSizes(lower=args.lower, upper=args.upper, atac=args.atac)
    if args.bed:
        # restrict the tally to (merged) regions from the BED file
        regions = ChunkList.read(args.bed)
        regions.merge()
        sizes.calculateSizes(args.bam, regions)
    else:
        sizes.calculateSizes(args.bam)
    sizes.save(args.out + '.fragmentsizes.txt')
    if args.no_plot:
        return
    # plot the size distribution
    fig = plt.figure()
    xs = list(range(sizes.lower, sizes.upper))
    plt.plot(xs, sizes.get(sizes.lower, sizes.upper), label=args.out)
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.fragmentsizes.pdf')
    plt.close(fig)
def run_nuc(args):
    """Run nucleosome position calling and write tabix-indexed outputs.

    Builds NucParameters from a V-plot matrix, PWM, and fragment-size
    distribution, fans region chunks out to a worker pool, and streams each
    output type to its own writer process.  Produces .bed.gz files for
    positions and .bedgraph.gz files for signal tracks.
    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # min_offset keeps vplot width, PWM flanks, and separation inside bounds
    chunks = ChunkList.read(args.bed, chromDict=chrs, min_offset=vmat.mat.shape[1] + vmat.upper // 2 + max(pwm.up, pwm.down) + args.nuc_sep // 2, min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=args.nuc_sep // 2, down=args.nuc_sep // 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        # no sizes file given: compute the distribution from the BAM
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat=vmat, fragmentsizes=fragment_dist, bam=args.bam, fasta=args.fasta, pwm=args.pwm, occ_track=args.occ_track, sd=args.sd, nonredundant_sep=args.nuc_sep, redundant_sep=args.redundant_sep, min_z=args.min_z, min_lr=args.min_lr, atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    if args.write_all:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal', 'nucleoatac_signal.smooth', 'nucleoatac_background', 'nucleoatac_raw']
    else:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal', 'nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    # one writer process + bounded queue per output type
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize=maxQueueSize)
        write_processes[i] = mp.Process(target=_writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, list(zip(j, itertools.repeat(params))))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')  # sentinel for each writer process
    for i in outputs:
        write_processes[i].join()
        # bgzip-compress and tabix-index each output, removing plain text
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed', args.out + '.' + i + '.bed.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset="bed", force=True)
def run_occ(args):
    """Run nucleosome occupancy calling and write tabix-indexed outputs.

    NOTE(review): uses a Python 2 `print` statement and `/` integer division
    on int arguments — this function is Python-2-only as written.
    """
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down) + args.nuc_sep/2)
    chunks.slop(chrs, up = args.nuc_sep/2, down = args.nuc_sep/2)
    chunks.merge()
    maxQueueSize = args.cores*10
    # model the fragment-size mixture, from a sizes file or from the BAM
    fragment_dist = FragmentMixDistribution(0, upper = args.upper)
    if args.sizes is not None:
        tmp = FragmentSizes.open(args.sizes)
        fragment_dist.fragmentsizes = FragmentSizes(0, args.upper, vals = tmp.get(0,args.upper))
    else:
        fragment_dist.getFragmentSizes(args.bam, chunks)
    fragment_dist.modelNFR()
    fragment_dist.plotFits(args.out + '.occ_fit.eps')
    fragment_dist.fragmentsizes.save(args.out + '.fragmentsizes.txt')
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ, flank = args.flank, bam = args.bam, ci = args.confidence_interval, step = args.step)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    # create/truncate output files; writer processes append to them
    out_handle1 = open(args.out + '.occ.bedgraph','w')
    out_handle1.close()
    out_handle2 = open(args.out + '.occ.lower_bound.bedgraph','w')
    out_handle2.close()
    out_handle3 = open(args.out + '.occ.upper_bound.bedgraph','w')
    out_handle3.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeOcc, args=(write_queue, args.out))
    write_process.start()
    peaks_handle = open(args.out + '.occpeaks.bed','w')
    peaks_handle.close()
    peaks_queue = mp.JoinableQueue()
    peaks_process = mp.Process(target = _writePeaks, args=(peaks_queue, args.out))
    peaks_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            # helper returns (nucleosome size distribution, occ track, peaks)
            nuc_dist += result[0]
            write_queue.put(result[1])
            peaks_queue.put(result[2])
    pool1.close()
    pool1.join()
    write_queue.put('STOP')  # sentinels telling the writer processes to finish
    peaks_queue.put('STOP')
    write_process.join()
    peaks_process.join()
    # bgzip-compress and tabix-index outputs, removing the plain-text files
    pysam.tabix_compress(args.out + '.occpeaks.bed', args.out + '.occpeaks.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occpeaks.bed')
    pysam.tabix_index(args.out + '.occpeaks.bed.gz', preset = "bed", force = True)
    for i in ('occ','occ.lower_bound','occ.upper_bound'):
        pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.'+i+'.bedgraph.gz',force = True)
        shell_command('rm ' + args.out + '.' + i + '.bedgraph')
        pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset = "bed", force = True)
    # save the accumulated nucleosome fragment-size distribution
    dist_out = FragmentSizes(0, args.upper, vals = nuc_dist)
    dist_out.save(args.out + '.nuc_dist.txt')
    print "Making figure"
    #make figure
    fig = plt.figure()
    plt.plot(range(0,args.upper),dist_out.get(0,args.upper),label = "Nucleosome Distribution")
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out+'.nuc_dist.eps')
    plt.close(fig)
def setUp(self):
    """Load the first example region as the Test_Track fixture chunk."""
    self.chunk = ChunkList.read("example/example.bed")[0]
def setUp(self):
    """Load the first example region for the Test_Ins fragment-list tests."""
    self.chunk = ChunkList.read("example/example.bed")[0]
def run_nuc(args):
    """Run nucleosome position calling and write tabix-indexed outputs.

    Builds NucParameters from a V-plot matrix, PWM, and fragment-size
    distribution, fans region chunks out to a worker pool, and streams each
    output type to its own writer process.
    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    # min_offset keeps vplot width, PWM flanks, and separation inside bounds
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = vmat.mat.shape[1] + vmat.upper/2 + max(pwm.up,pwm.down) + args.nuc_sep/2, min_length = args.nuc_sep * 2)
    chunks.slop(chrs, up = args.nuc_sep/2, down = args.nuc_sep/2)
    chunks.merge()
    maxQueueSize = args.cores*10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        # no sizes file given: compute the distribution from the BAM
        fragment_dist = FragmentSizes(0, upper = vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat = vmat, fragmentsizes = fragment_dist, bam = args.bam, fasta = args.fasta, pwm = args.pwm, occ_track = args.occ_track, sd = args.sd, nonredundant_sep = args.nuc_sep, redundant_sep = args.redundant_sep, min_z = args.min_z, min_lr = args.min_lr , atac = args.atac)
    sets = chunks.split(items = args.cores*5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    if args.write_all:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth', 'nucleoatac_background','nucleoatac_raw']
    else:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    # one writer process + bounded queue per output type
    for i in outputs:
        if i not in ['nucpos','nucpos.redundant','nfrpos']:
            handles[i] = open(args.out + '.'+i+'.bedgraph','w')
        else:
            handles[i] = open(args.out + '.'+i+'.bed','w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize = maxQueueSize)
        write_processes[i] = mp.Process(target = _writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')  # sentinel for each writer process
    for i in outputs:
        write_processes[i].join()
        # bgzip-compress and tabix-index each output, removing plain text
        if i not in ['nucpos','nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz',force = True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset = "bed", force = True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed', args.out + '.' + i + '.bed.gz',force = True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset = "bed", force = True)
def setUp(self):
    """Load the first example region for the Test_Ins fragment-list tests."""
    regions = ChunkList.read('example/example.bed')
    self.chunk = regions[0]
def setUp(self):
    """Load the first example region as the Test_Track fixture chunk."""
    regions = ChunkList.read('example/example.bed')
    self.chunk = regions[0]