def get_sample_cdf(samples):
    """Pool SNP frequencies over all samples and build per-genome CDFs.

    ``samples`` is a mapping; only its values are used. For each sample the
    mitochondrial (``sample.mt``) and nuclear (``sample.nuc``) records are
    queried with data type ``'tot'`` and two quantities are accumulated per
    genome:

    * the number of coverage entries that are ``>= min_cov``;
    * a ``Counter`` of the entries of ``get_snps(...)`` selected through
      ``np.nonzero`` (i.e. the nonzero frequencies, assuming ``get_snps``
      returns a 1-D array -- TODO confirm).

    ``min_freq`` and ``min_cov`` are not parameters of this function; they
    are resolved from the surrounding scope at call time.

    Returns
    -------
    (mito, nuc) : tuple of dict
        One dict per genome with the keys ``'x'`` and ``'cdf'`` (the pair
        returned by ``cs.cdf`` on the pooled counter) and ``'frac'`` (the
        second value of ``cs.cdf(..., norm=False)`` divided by the pooled
        count of sufficiently covered positions).
    """
    data_kind = 'tot'
    genomes = ('mt', 'nuc')
    pooled_counts = {genome: Counter() for genome in genomes}
    covered = {genome: 0 for genome in genomes}

    for sample in samples.values():
        for genome in genomes:
            record = getattr(sample, genome)
            # Only the coverage half of the returned pair is needed here.
            depth, _ = record.get_data(data_kind)
            covered[genome] += np.count_nonzero(depth >= min_cov)
            snps = record.get_snps(min_freq, min_cov, data_kind)
            pooled_counts[genome] += Counter(snps[np.nonzero(snps)[0]])

    summaries = {}
    for genome in genomes:
        xs, cdf_vals = cs.cdf(pooled_counts[genome])
        # The unnormalised variant is rescaled by the number of covered
        # positions instead of by the number of SNPs.
        _, raw = cs.cdf(pooled_counts[genome], norm=False)
        summaries[genome] = {
            'x': xs,
            'cdf': cdf_vals,
            'frac': raw / covered[genome],
        }
    return summaries['mt'], summaries['nuc']
def snp_cdf(self, min_freq, min_cov, frac=True, norm=False):
    """Build the CDF of this object's nonzero SNP frequencies.

    Parameters
    ----------
    min_freq, min_cov
        Thresholds forwarded unchanged to ``self.get_snps``. ``min_cov`` is
        also the cut-off applied to ``self.coverage`` for the normalising
        count.
    frac : bool
        When truthy, the second returned value is divided by the number of
        entries of ``self.coverage`` that are ``>= min_cov``.
    norm : bool
        Passed through to ``cs.cdf`` as its ``norm`` keyword.

    Returns
    -------
    tuple
        ``(x, c)`` as produced by ``cs.cdf``, with ``c`` rescaled when
        ``frac`` is truthy.
    """
    n_covered = np.count_nonzero(self.coverage >= min_cov)
    snp_freqs = self.get_snps(min_freq, min_cov)
    # Keep only the entries picked out by np.nonzero before histogramming.
    nonzero_freqs = snp_freqs[np.nonzero(snp_freqs)[0]]
    x, c = cs.cdf(Counter(nonzero_freqs), norm=norm)
    if not frac:
        return x, c
    return x, c / n_covered
def snp_cdf(self, min_freq, min_cov, dtype='tot', frac=True, norm=False):
    """Build the CDF of the SNP frequencies for one data type.

    Parameters
    ----------
    min_freq, min_cov
        Thresholds forwarded unchanged to ``self.get_snps``. ``min_cov`` is
        also the cut-off applied to ``self.coverage[dtype]`` for the
        normalising count.
    dtype
        Key used to index ``self.coverage`` and passed to ``self.get_snps``
        as its ``dtype`` keyword.
    frac : bool
        When truthy, the second returned value is divided by the number of
        entries of ``self.coverage[dtype]`` that are ``>= min_cov``.
    norm : bool
        Passed through to ``cs.cdf`` as its ``norm`` keyword.

    Returns
    -------
    tuple
        ``(x, c)`` as produced by ``cs.cdf``, with ``c`` rescaled when
        ``frac`` is truthy.
    """
    n_covered = np.count_nonzero(self.coverage[dtype] >= min_cov)
    # get_snps returns a pair here; only its first element is histogrammed.
    snp_freqs, _ = self.get_snps(min_freq, min_cov, dtype=dtype)
    x, c = cs.cdf(Counter(snp_freqs), norm=norm)
    return (x, c / n_covered) if frac else (x, c)
# Global matplotlib style: LaTeX text rendering, serif font and automatic
# tight layout for every figure created below.
mpl.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': 'Computer Modern Roman',
    'font.size': 14,
    'figure.autolayout': True,
})

# Not used in this part of the script; presumably consumed further down
# when figures are written to disk -- confirm against the rest of the file.
savefigs = True
fnames = []

# Read-length data.  Previously used input:
#   'Yeast_Nanopore_Aug30_barcode07/alignment_barcode07.npz'
# A second data set ('Yeast_Nanopore_Aug30_barcode09/alignment_barcode09.npz',
# data12 / x12_* / cdf12_*) used to be overlaid as dashed semilogy curves
# with x scaled by 1e-3; that overlay is currently disabled.
# NOTE(review): if the stored entries are pickled Python objects, recent
# NumPy releases need allow_pickle=True in np.load -- confirm.
data11 = np.load('Desai/alignment_srr5406290.npz')

# ``[()]`` unwraps the stored value (presumably a 0-d array -- confirm)
# before it is handed to cs.cdf.
x11_nuc, cdf11_nuc = cs.cdf(data11['nuc_len'][()])
x11_mt, cdf11_mt = cs.cdf(data11['mt_len'][()])

# Complementary CDF (1 - CDF) of the read length for both genomes.
fig_len = plt.figure()
ax_len = fig_len.gca()
ax_len.plot(x11_mt, 1 - cdf11_mt, 'C0-', label='mitochondria')
ax_len.plot(x11_nuc, 1 - cdf11_nuc, 'C1-', label='nuclear')
ax_len.set_xlabel(r'$r$ (bp)')
ax_len.set_title(r'$\mathrm{Prob}(\mathrm{read\; length} > r)$', fontsize=14)

# Legend ingredients: the labelled curves plus a solid black proxy line.
handles, labels = ax_len.get_legend_handles_labels()
line11 = mlines.Line2D([], [], color='k', linestyle='-')