def graph(depthfile, messages, outdir=os.getcwd(), prefix="Test", uncoverthreshold=5):
    """Collect per-exon depth statistics from a tabix depth file and plot them.

    Each usable record in ``messages`` contributes one entry to four parallel
    lists (``averagedepth``, ``mediandepth``, ``coverate``, ``x_axis``) that
    are finally handed to ``plot``.

    Args:
        depthfile: Path opened with ``pysam.TabixFile``.  Every fetched row
            must have at least three tab-separated columns; the last column
            is parsed as the integer depth.
        messages: Iterable of 6-field records
            ``(trans, genes, exon_name, refchrom, refstart, refend)``.
            Records of any other length are ignored.  The remaining records
            are processed in the order given by ``_exon_vaule(exon_name)``.
        outdir: Forwarded unchanged to ``plot``.  NOTE: the default is
            evaluated once, when the function is defined, not on every call.
        prefix: Forwarded unchanged to ``plot``.
        uncoverthreshold: Threshold passed to ``DescribeArray.get_frequece``
            to obtain the coverage percentage of an exon.

    Raises:
        IOError: If a fetched depth row has fewer than three columns.
    """
    depth = pysam.TabixFile(depthfile)
    plot_data = defaultdict(list)
    # Placeholders kept from the original implementation: when no 6-field
    # record is seen, ``plot`` receives the ``str`` type itself for both.
    genes = str
    trans = str
    try:
        # Drop malformed records *before* sorting: the sort key indexes
        # field 2, so a short record would otherwise crash the sort instead
        # of being skipped.
        records = [message for message in messages if len(message) == 6]
        for exon_message in sorted(records, key=lambda x: _exon_vaule(x[2])):
            trans, genes, exon_name, refchrom, refstart, refend = exon_message
            try:
                chrom = str(refchrom)
                if chrom.startswith("chrM"):
                    chrom = 'chrM_NC_012920.1'
                start = int(refstart)
                end = int(refend)
            except Exception:
                # Deliberate best-effort: records with unusable coordinates
                # are skipped rather than aborting the whole plot.
                continue
            if chrom not in depth.contigs:
                continue
            array = []
            # NOTE(review): ``start - 1`` presumably converts 1-based
            # inclusive coordinates to the 0-based half-open interval that
            # tabix fetch expects -- confirm against the caller.
            for depth_m in depth.fetch(chrom, start - 1, end):
                rows = depth_m.strip().split('\t')
                if len(rows) < 3:
                    raise IOError("depth file format Error !")
                array.append(int(rows[-1]))
            if not array:
                # Nothing recorded for this exon: treat every base as depth 0.
                array = [0] * (end - start + 1)
            depth_message = DescribeArray(array)
            plot_data["averagedepth"].append(depth_message.average)
            plot_data["mediandepth"].append(depth_message.median)
            # ``get_frequece`` is used here as returning a string with a
            # trailing '%', which is stripped before the float conversion.
            coverate = float(depth_message.get_frequece(uncoverthreshold).strip('%'))
            plot_data["coverate"].append(coverate)
            plot_data["x_axis"].append(exon_name)
    finally:
        # Release the tabix handle even when a row is malformed or a helper
        # raises.
        depth.close()
    plot(plot_data, outdir, prefix, genes, trans)
def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15,
           count_threshold=None, uncover_threshold=5):
    """Write per-base, per-region and per-chromosome depth statistics for a BED file.

    Every usable BED line is widened by one base on each side (clamped to 0
    and to the reference length).  Per-base A/C/G/T counts come from
    ``self._reader.count_coverage`` and reference bases from
    ``self.reference.fetch``.  Contig names are normalised to carry a ``chr``
    prefix, and any name starting with ``chrM`` is reported as
    ``chrM_NC_012920.1``.

    Files written (``<name>`` is the basename of ``prefix``):

    * ``<name>.depth.tsv``   one row per position: reference base, A/C/G/T
      counts and their sum; indexed afterwards via ``CreatIndex``.
    * ``<name>.bed.stat``    average/median/max/min depth per BED region.
    * ``<name>.chrom.stat``  the same figures plus coverage per threshold,
      per contig.
    * ``<name>.stat``        overall figures, per-base coverage and the share
      of regions whose average depth reaches each threshold.
    * ``<name>.uncover.bed`` ranges with depth below ``uncover_threshold``;
      only written when ``read_filter == 'all'``.

    Args:
        bed: Path of the region file, opened with ``smart_open``.  Lines with
            fewer than three whitespace-separated fields, or whose
            coordinates / contig cannot be resolved, are skipped.
        prefix: Output path prefix.  When falsy, files go to the current
            working directory with the basename ``Rhea_Chip``.
        read_filter: Forwarded as ``read_callback`` to ``count_coverage``.
        qual_threshold: Forwarded (as ``int``) as ``quality_threshold``.
        count_threshold: ``list``/``tuple``/``set``/``frozenset`` of depth
            cut-offs.  Any other value selects the defaults
            ``{1, 4, 10, 20, 30, 100}``.  Values are converted with ``int``;
            the caller's container is never modified.
        uncover_threshold: Depth below which a base counts as uncovered.
            Coerced to ``int``, forced to be at least 1, and always included
            in the reported cut-offs.
    """
    # ``count_threshold is set`` in the previous version compared against the
    # ``set`` *type*, so caller-supplied sets were silently replaced by the
    # defaults.  Copy the input so the caller's object is left untouched.
    if isinstance(count_threshold, (list, tuple, set, frozenset)):
        thresholds = set(count_threshold)
    else:
        thresholds = {1, 4, 10, 20, 30, 100}
    uncover_threshold = max(int(uncover_threshold), 1)
    # Normalise to ints once: avoids duplicate buckets such as "5" and 5
    # (which used to be double-counted) and repeated int() calls per base.
    count_threshold = sorted({int(value) for value in thresholds} | {uncover_threshold})

    if prefix:
        outdir, name = os.path.split(os.path.abspath(prefix))
    else:
        outdir, name = os.getcwd(), 'Rhea_Chip'
    depth = os.path.join(outdir, '%s.depth.tsv' % name)
    bedstat = os.path.join(outdir, '%s.bed.stat' % name)
    stats = os.path.join(outdir, '%s.stat' % name)
    chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
    uncover = os.path.join(outdir, '%s.uncover.bed' % name)

    opened = []
    try:
        for path in (depth, bedstat, stats, chromstat):
            opened.append(smart_open(path, 'w'))
        _depth, _bedstat, _stats, _chromstat = opened

        _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
        _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
        _stats.write("##A Simple introduction about %s\n" % self.name)
        _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
        _chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

        chroms = defaultdict(dict)     # contig -> {position: [ref, A, C, G, T, depth]}
        rangedict = defaultdict(int)   # threshold -> bases with depth >= threshold
        regiondict = defaultdict(int)  # threshold -> regions with average >= threshold
        total_base = 0
        region_num = 0

        with smart_open(bed) as regions:
            for region in regions:
                rows = region.strip().split()
                if len(rows) < 3:
                    continue
                try:
                    chrom = rows[0]
                    # Pad the interval by one base on each side, clamped to
                    # the contig boundaries.
                    start = max(int(rows[1]) - 1, 0)
                    stop = min(int(rows[2]) + 1, self.reference.get_reference_length(chrom))
                except Exception:
                    # Deliberate best-effort: header lines, bad coordinates
                    # or unknown contigs are skipped.
                    continue
                cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                    chrom, start, stop,
                    read_callback=read_filter,
                    quality_threshold=int(qual_threshold))
                bases = self.reference.fetch(chrom, start, stop).upper()

                reg = []
                chrom = "chr" + re.sub("^chr", "", chrom)
                if chrom.startswith("chrM"):
                    chrom = 'chrM_NC_012920.1'
                for n in range(start, stop):
                    offset = n - start
                    dep = [bases[offset], cov_a[offset], cov_c[offset],
                           cov_g[offset], cov_t[offset]]
                    base_depth = sum(dep[1:])
                    dep.append(base_depth)
                    chroms[chrom][n] = dep
                    for num in count_threshold:
                        if base_depth >= num:
                            rangedict[num] += 1
                    total_base += 1
                    reg.append(base_depth)
                region_num += 1

                array = DescribeArray(reg)
                averages = str(round(array.average, 2))
                mediandepth = str(round(array.median, 2))
                maxdepth = str(round(array.max, 2))
                mindepth = str(round(array.min, 2))
                _bedstat.write("\t".join([chrom, rows[1], rows[2], averages,
                                          mediandepth, maxdepth, mindepth]) + '\n')
                for num in count_threshold:
                    if array.average >= num:
                        regiondict[num] += 1

        uncover_range = []
        n_array = []
        for chrom in sorted(chroms, key=_chrom_valued):
            dep = sorted(chroms[chrom].items(), key=lambda item: int(item[0]))
            array = []
            for p, d in dep:
                _depth.write("\t".join([chrom, str(p)] + [str(i) for i in d]) + '\n')
                array.append(d)
                n_array.append(d[5])  # index 5 is the summed depth
            array = DescribeArray(array, col=5)
            averages = str(round(array.average, 2))
            mediandepth = str(round(array.median, 2))
            maxdepth = str(round(array.max, 2))
            mindepth = str(round(array.min, 2))
            chromcover = [array.get_frequece(thre, col=5) for thre in count_threshold]
            _chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth]
                                       + chromcover) + '\n')
            if read_filter == 'all':
                uncover_bases = [int(p) for p, d in dep if d[5] < uncover_threshold]
                uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

        if read_filter == 'all':
            uncoverout = smart_open(uncover, 'w')
            try:
                uncoverout.write("#Chr\tStart\tStop\n")
                uncoverout.writelines(uncover_range)
            finally:
                uncoverout.close()

        array = DescribeArray(n_array)
        _stats.write("Average depth : %.2f\n" % array.average)
        _stats.write("Median depth : %.2f\n" % array.median)
        _stats.write("Max depth : %.2f\n" % array.max)
        _stats.write("Min depth : %.2f\n" % array.min)
        for number in count_threshold:
            # Guard against an empty BED: report 0% instead of dividing by 0.
            base_pct = (float(rangedict[number]) / total_base) * 100 if total_base else 0.0
            _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, base_pct))
        for number in count_threshold:
            region_pct = (float(regiondict[number]) / region_num) * 100 if region_num else 0.0
            _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, region_pct))
    finally:
        # Always release the output handles, even if a region fails midway.
        for handle in opened:
            handle.close()

    # The depth table must be closed (flushed) before it is indexed.
    dep_f = CreatIndex(depth)
    _ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)
def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15,
           count_threshold=None, uncover_threshold=5):
    """Write per-base, per-region and per-chromosome depth statistics for a BED file.

    Every usable BED line is widened by one base on each side (clamped to 0
    and to the reference length).  Per-base A/C/G/T counts come from
    ``self._reader.count_coverage`` and reference bases from
    ``self.reference.fetch``.  Contig names are normalised to carry a ``chr``
    prefix, and any name starting with ``chrM`` is reported as
    ``chrM_NC_012920.1``.

    Files written (``<name>`` is the basename of ``prefix``):

    * ``<name>.depth.tsv``   one row per position: reference base, A/C/G/T
      counts and their sum; indexed afterwards via ``CreatIndex``.
    * ``<name>.bed.stat``    average/median/max/min depth per BED region.
    * ``<name>.chrom.stat``  the same figures plus coverage per threshold,
      per contig.
    * ``<name>.stat``        overall figures, per-base coverage and the share
      of regions whose average depth reaches each threshold.
    * ``<name>.uncover.bed`` ranges with depth below ``uncover_threshold``;
      only written when ``read_filter == 'all'``.

    Args:
        bed: Path of the region file, opened with ``smart_open``.  Lines with
            fewer than three whitespace-separated fields, or whose
            coordinates / contig cannot be resolved, are skipped.
        prefix: Output path prefix.  When falsy, files go to the current
            working directory with the basename ``Rhea_Chip``.
        read_filter: Forwarded as ``read_callback`` to ``count_coverage``.
        qual_threshold: Forwarded (as ``int``) as ``quality_threshold``.
        count_threshold: ``list``/``tuple``/``set``/``frozenset`` of depth
            cut-offs.  Any other value selects the defaults
            ``{1, 4, 10, 20, 30, 100}``.  Values are converted with ``int``;
            the caller's container is never modified.
        uncover_threshold: Depth below which a base counts as uncovered.
            Coerced to ``int``, forced to be at least 1, and always included
            in the reported cut-offs.
    """
    # ``count_threshold is set`` in the previous version compared against the
    # ``set`` *type*, so caller-supplied sets were silently replaced by the
    # defaults.  Copy the input so the caller's object is left untouched.
    if isinstance(count_threshold, (list, tuple, set, frozenset)):
        thresholds = set(count_threshold)
    else:
        thresholds = {1, 4, 10, 20, 30, 100}
    uncover_threshold = max(int(uncover_threshold), 1)
    # Normalise to ints once: avoids duplicate buckets such as "5" and 5
    # (which used to be double-counted) and repeated int() calls per base.
    count_threshold = sorted({int(value) for value in thresholds} | {uncover_threshold})

    if prefix:
        outdir, name = os.path.split(os.path.abspath(prefix))
    else:
        outdir, name = os.getcwd(), 'Rhea_Chip'
    depth = os.path.join(outdir, '%s.depth.tsv' % name)
    bedstat = os.path.join(outdir, '%s.bed.stat' % name)
    stats = os.path.join(outdir, '%s.stat' % name)
    chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
    uncover = os.path.join(outdir, '%s.uncover.bed' % name)

    opened = []
    try:
        for path in (depth, bedstat, stats, chromstat):
            opened.append(smart_open(path, 'w'))
        _depth, _bedstat, _stats, _chromstat = opened

        _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
        _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
        _stats.write("##A Simple introduction about %s\n" % self.name)
        _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
        _chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

        chroms = defaultdict(dict)     # contig -> {position: [ref, A, C, G, T, depth]}
        rangedict = defaultdict(int)   # threshold -> bases with depth >= threshold
        regiondict = defaultdict(int)  # threshold -> regions with average >= threshold
        total_base = 0
        region_num = 0

        with smart_open(bed) as regions:
            for region in regions:
                rows = region.strip().split()
                if len(rows) < 3:
                    continue
                try:
                    chrom = rows[0]
                    # Pad the interval by one base on each side, clamped to
                    # the contig boundaries.
                    start = max(int(rows[1]) - 1, 0)
                    stop = min(int(rows[2]) + 1, self.reference.get_reference_length(chrom))
                except Exception:
                    # Deliberate best-effort: header lines, bad coordinates
                    # or unknown contigs are skipped.
                    continue
                cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                    chrom, start, stop,
                    read_callback=read_filter,
                    quality_threshold=int(qual_threshold))
                bases = self.reference.fetch(chrom, start, stop).upper()

                reg = []
                chrom = "chr" + re.sub("^chr", "", chrom)
                if chrom.startswith("chrM"):
                    chrom = 'chrM_NC_012920.1'
                for n in range(start, stop):
                    offset = n - start
                    dep = [bases[offset], cov_a[offset], cov_c[offset],
                           cov_g[offset], cov_t[offset]]
                    base_depth = sum(dep[1:])
                    dep.append(base_depth)
                    chroms[chrom][n] = dep
                    for num in count_threshold:
                        if base_depth >= num:
                            rangedict[num] += 1
                    total_base += 1
                    reg.append(base_depth)
                region_num += 1

                array = DescribeArray(reg)
                averages = str(round(array.average, 2))
                mediandepth = str(round(array.median, 2))
                maxdepth = str(round(array.max, 2))
                mindepth = str(round(array.min, 2))
                _bedstat.write("\t".join([chrom, rows[1], rows[2], averages,
                                          mediandepth, maxdepth, mindepth]) + '\n')
                for num in count_threshold:
                    if array.average >= num:
                        regiondict[num] += 1

        uncover_range = []
        n_array = []
        for chrom in sorted(chroms, key=_chrom_valued):
            dep = sorted(chroms[chrom].items(), key=lambda item: int(item[0]))
            array = []
            for p, d in dep:
                _depth.write("\t".join([chrom, str(p)] + [str(i) for i in d]) + '\n')
                array.append(d)
                n_array.append(d[5])  # index 5 is the summed depth
            array = DescribeArray(array, col=5)
            averages = str(round(array.average, 2))
            mediandepth = str(round(array.median, 2))
            maxdepth = str(round(array.max, 2))
            mindepth = str(round(array.min, 2))
            chromcover = [array.get_frequece(thre, col=5) for thre in count_threshold]
            _chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth]
                                       + chromcover) + '\n')
            if read_filter == 'all':
                uncover_bases = [int(p) for p, d in dep if d[5] < uncover_threshold]
                uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

        if read_filter == 'all':
            uncoverout = smart_open(uncover, 'w')
            try:
                uncoverout.write("#Chr\tStart\tStop\n")
                uncoverout.writelines(uncover_range)
            finally:
                uncoverout.close()

        array = DescribeArray(n_array)
        _stats.write("Average depth : %.2f\n" % array.average)
        _stats.write("Median depth : %.2f\n" % array.median)
        _stats.write("Max depth : %.2f\n" % array.max)
        _stats.write("Min depth : %.2f\n" % array.min)
        for number in count_threshold:
            # Guard against an empty BED: report 0% instead of dividing by 0.
            base_pct = (float(rangedict[number]) / total_base) * 100 if total_base else 0.0
            _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, base_pct))
        for number in count_threshold:
            region_pct = (float(regiondict[number]) / region_num) * 100 if region_num else 0.0
            _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, region_pct))
    finally:
        # Always release the output handles, even if a region fails midway.
        for handle in opened:
            handle.close()

    # The depth table must be closed (flushed) before it is indexed.
    dep_f = CreatIndex(depth)
    _ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)