Beispiel #1
0
def graph(depthfile, messages, outdir=None, prefix="Test", uncoverthreshold=5):
	"""
	Collect per-exon depth statistics from a tabix-indexed depth file and pass them to ``plot``.

	For every usable exon record the average depth, the median depth, the coverage
	rate at ``uncoverthreshold`` and the exon name are appended to the ``averagedepth``,
	``mediandepth``, ``coverate`` and ``x_axis`` series respectively.

	:param depthfile: file opened with ``pysam.TabixFile``; the last tab-separated
		column of each fetched record is read as an integer depth.
	:param messages: iterable of 6-item records
		``(trans, genes, exon_name, refchrom, refstart, refend)``. Records of any other
		length are ignored, as are records whose coordinates cannot be converted to
		``int`` or whose chromosome is not a contig of the depth file.
	:param outdir: output directory forwarded to ``plot``; ``None`` (the default)
		means the working directory at call time.
	:param prefix: prefix forwarded to ``plot``.
	:param uncoverthreshold: threshold handed to ``DescribeArray.get_frequece``; the
		returned value is stripped of ``%`` and stored as a float.
	:raises IOError: if a fetched depth record has fewer than three columns.
	"""
	if outdir is None:
		# Resolved per call: a default of os.getcwd() in the signature is frozen at import time.
		outdir = os.getcwd()
	plot_data = defaultdict(list)
	# NOTE(review): these placeholders are the ``str`` type itself, not empty strings, and
	# reach ``plot`` unchanged when no 6-item record exists. Left as-is; confirm what plot expects.
	genes = str
	trans = str
	# Filter before sorting so the sort key never indexes a record that is too short.
	records = [message for message in messages if len(message) == 6]
	depth = pysam.TabixFile(depthfile)
	try:
		contigs = set(depth.contigs)
		for exon_message in sorted(records, key=lambda x: _exon_vaule(x[2])):
			# trans/genes keep the values of the last 6-item record, even if it is skipped below.
			trans, genes, exon_name, refchrom, refstart, refend = exon_message
			try:
				chrom = str(refchrom)
				if chrom.startswith("chrM"):
					chrom = 'chrM_NC_012920.1'
				start = int(refstart)
				end = int(refend)
			except (TypeError, ValueError, OverflowError):
				continue
			if chrom not in contigs:
				continue
			array = list()
			for depth_m in depth.fetch(chrom, start - 1, end):
				rows = depth_m.strip().split('\t')
				if len(rows) < 3:
					raise IOError("depth file format Error !")
				array.append(int(rows[-1]))
			if not array:
				# Nothing recorded for this exon: treat every position as depth 0.
				array = [0] * (end - start + 1)
			depth_message = DescribeArray(array)
			plot_data["averagedepth"].append(depth_message.average)
			plot_data["mediandepth"].append(depth_message.median)
			coverate = float(depth_message.get_frequece(uncoverthreshold).strip('%'))
			plot_data["coverate"].append(coverate)
			plot_data["x_axis"].append(exon_name)
	finally:
		# Release the tabix handle even when a malformed record aborts the loop.
		depth.close()
	plot(plot_data, outdir, prefix, genes, trans)
Beispiel #2
0
	def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15, count_threshold=None, uncover_threshold=5):
		"""
		Write depth and coverage statistics for the regions listed in ``bed``.

		Output files share the path ``<outdir>/<name>`` taken from ``prefix`` (or
		``<cwd>/Rhea_Chip`` when ``prefix`` is empty):

		* ``<name>.depth.tsv``   - per position: chrom, pos, reference base, A/C/G/T counts, their sum.
		* ``<name>.bed.stat``    - average / median / max / min depth of every region.
		* ``<name>.chrom.stat``  - the same per chromosome, plus one coverage column per threshold.
		* ``<name>.stat``        - overall depth summary, base-level and region-level coverage.
		* ``<name>.uncover.bed`` - positions whose depth is below ``uncover_threshold``, as
		  formatted by ``formact_number_list_to_range`` (only when ``read_filter == 'all'``).

		Finally ``CreatIndex(depth).check_index`` is called on the depth file
		(presumably to build its index - confirm against ``CreatIndex``).

		:param bed: region file opened with ``smart_open``; whitespace-separated, the first
			three columns are chrom, start, stop. Lines with fewer columns, or whose
			coordinates / chromosome cannot be resolved, are skipped. Each region is widened
			by one position on both sides and clamped to ``[0, reference length]``.
		:param prefix: output path prefix, see above.
		:param read_filter: forwarded as ``read_callback`` to ``self._reader.count_coverage``.
		:param qual_threshold: forwarded as ``quality_threshold`` (converted to ``int``).
		:param count_threshold: list / tuple / set of depth thresholds; anything else selects
			the defaults ``{1, 4, 10, 20, 30, 100}``. ``uncover_threshold`` is always added.
		:param uncover_threshold: depth below which a position counts as uncovered (minimum 1).
		:return: ``None``.

		NOTE: a position shared by overlapping regions is stored once per chromosome but
		counted once per region in the base-level coverage figures. Coverage percentages
		fall back to 0 when no base / region was counted; ``DescribeArray`` on empty input
		is not guarded here.
		"""
		# Local import so this method does not depend on the file's import block.
		from contextlib import closing

		uncover_threshold = max(int(uncover_threshold), 1)
		# isinstance (not an identity test against the type) so a caller-supplied set is
		# honoured; building a fresh set also leaves the caller's collection unmodified.
		if isinstance(count_threshold, (list, tuple, set, frozenset)):
			thresholds = set(int(value) for value in count_threshold)
		else:
			thresholds = {1, 4, 10, 20, 30, 100}
		thresholds.add(uncover_threshold)
		count_threshold = sorted(thresholds)
		qual_threshold = int(qual_threshold)

		outdir, name = os.path.split(os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
		depth = os.path.join(outdir, '%s.depth.tsv' % name)
		bedstat = os.path.join(outdir, '%s.bed.stat' % name)
		stats = os.path.join(outdir, '%s.stat' % name)
		chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
		uncover = os.path.join(outdir, '%s.uncover.bed' % name)

		chroms = defaultdict(dict)  # chrom -> {position: [ref base, A, C, G, T, total depth]}
		rangedict = defaultdict(int)  # threshold -> positions with depth >= threshold
		regiondict = defaultdict(int)  # threshold -> regions with average depth >= threshold
		total_base = 0
		region_num = 0
		uncover_range = list()
		n_array = list()

		# closing() guarantees every output handle is released, also when an error aborts the run.
		with closing(smart_open(depth, 'w')) as _depth, \
				closing(smart_open(bedstat, 'w')) as _bedstat, \
				closing(smart_open(stats, 'w')) as _stats, \
				closing(smart_open(chromstat, 'w')) as _chromstat:
			_depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
			_bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
			_stats.write("##A Simple introduction about %s\n" % self.name)
			_chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
			_chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

			with smart_open(bed) as regions:
				for region in regions:
					rows = region.strip().split()
					if len(rows) < 3:
						continue
					try:
						chrom = rows[0]
						start = max(int(rows[1]) - 1, 0)
						stop = min(int(rows[2]) + 1, self.reference.get_reference_length(chrom))
					except Exception:
						# Best effort: header lines, bad coordinates or unknown contigs are skipped.
						continue
					cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
						chrom, start, stop, read_callback=read_filter, quality_threshold=qual_threshold)
					bases = self.reference.fetch(chrom, start, stop).upper()
					reg = list()
					# Output names always carry the "chr" prefix; mitochondrial contigs get one fixed name.
					chrom = "chr" + re.sub("^chr", "", chrom)
					if chrom.startswith("chrM"):
						chrom = 'chrM_NC_012920.1'
					# range() (rather than xrange) behaves the same here on Python 2 and 3.
					for offset in range(stop - start):
						n = start + offset
						dep = [bases[offset], cov_a[offset], cov_c[offset], cov_g[offset], cov_t[offset]]
						base_depth = sum(dep[1:])
						dep.append(base_depth)
						chroms[chrom][n] = dep
						for num in count_threshold:
							if base_depth >= num:
								rangedict[num] += 1
						total_base += 1
						reg.append(base_depth)
					region_num += 1
					array = DescribeArray(reg)
					averages = str(round(array.average, 2))
					mediandepth = str(round(array.median, 2))
					maxdepth = str(round(array.max, 2))
					mindepth = str(round(array.min, 2))
					_bedstat.write("\t".join([chrom, rows[1], rows[2], averages, mediandepth, maxdepth, mindepth]) + '\n')
					for num in count_threshold:
						if array.average >= num:
							regiondict[num] += 1

			for chrom in sorted(chroms, key=_chrom_valued):
				dep = sorted(chroms[chrom].items(), key=lambda x: int(x[0]))
				array = list()
				for p, d in dep:
					_depth.write("\t".join([chrom, str(p), '\t'.join([str(i) for i in d])]) + '\n')
					array.append(d)
					n_array.append(d[5])
				# Column 5 of each record is the total depth appended above.
				array = DescribeArray(array, col=5)
				averages = str(round(array.average, 2))
				mediandepth = str(round(array.median, 2))
				maxdepth = str(round(array.max, 2))
				mindepth = str(round(array.min, 2))
				chromcover = [array.get_frequece(thre, col=5) for thre in count_threshold]
				_chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth] + chromcover) + '\n')
				if read_filter == 'all':
					uncover_bases = [int(p) for p, d in dep if d[5] < uncover_threshold]
					uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

			if read_filter == 'all':
				with closing(smart_open(uncover, 'w')) as uncoverout:
					uncoverout.write("#Chr\tStart\tStop\n")
					uncoverout.writelines(uncover_range)

			array = DescribeArray(n_array)
			_stats.write("Average depth : %.2f\n" % array.average)
			_stats.write("Median depth : %.2f\n" % array.median)
			_stats.write("Max depth : %.2f\n" % array.max)
			_stats.write("Min depth : %.2f\n" % array.min)
			for number in count_threshold:
				# Guard the denominator: nothing counted means 0% rather than ZeroDivisionError.
				rate = (float(rangedict[number]) / total_base) * 100 if total_base else 0.0
				_stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, rate))
			for number in count_threshold:
				rate = (float(regiondict[number]) / region_num) * 100 if region_num else 0.0
				_stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, rate))

		# The depth file is closed at this point, so it can be handed over for index checking.
		dep_f = CreatIndex(depth)
		_ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)
Beispiel #3
0
 def depths(self,
            bed,
            prefix=None,
            read_filter='all',
            qual_threshold=15,
            count_threshold=None,
            uncover_threshold=5):
     """
     Write depth and coverage statistics for the regions listed in ``bed``.

     Output files share the path ``<outdir>/<name>`` taken from ``prefix`` (or
     ``<cwd>/Rhea_Chip`` when ``prefix`` is empty):

     * ``<name>.depth.tsv``   - per position: chrom, pos, reference base,
       A/C/G/T counts, their sum.
     * ``<name>.bed.stat``    - average / median / max / min depth of every region.
     * ``<name>.chrom.stat``  - the same per chromosome, plus one coverage
       column per threshold.
     * ``<name>.stat``        - overall depth summary, base-level and
       region-level coverage.
     * ``<name>.uncover.bed`` - positions whose depth is below
       ``uncover_threshold``, as formatted by ``formact_number_list_to_range``
       (only when ``read_filter == 'all'``).

     Finally ``CreatIndex(depth).check_index`` is called on the depth file
     (presumably to build its index - confirm against ``CreatIndex``).

     :param bed: region file opened with ``smart_open``; whitespace-separated,
         the first three columns are chrom, start, stop. Lines with fewer
         columns, or whose coordinates / chromosome cannot be resolved, are
         skipped. Each region is widened by one position on both sides and
         clamped to ``[0, reference length]``.
     :param prefix: output path prefix, see above.
     :param read_filter: forwarded as ``read_callback`` to
         ``self._reader.count_coverage``.
     :param qual_threshold: forwarded as ``quality_threshold`` (converted to
         ``int``).
     :param count_threshold: list / tuple / set of depth thresholds; anything
         else selects the defaults ``{1, 4, 10, 20, 30, 100}``.
         ``uncover_threshold`` is always added.
     :param uncover_threshold: depth below which a position counts as
         uncovered (minimum 1).
     :return: ``None``.

     NOTE: a position shared by overlapping regions is stored once per
     chromosome but counted once per region in the base-level coverage
     figures. Coverage percentages fall back to 0 when no base / region was
     counted; ``DescribeArray`` on empty input is not guarded here.
     """
     # Local import so this method does not depend on the file's import block.
     from contextlib import closing

     uncover_threshold = max(int(uncover_threshold), 1)
     # isinstance (not an identity test against the type) so a caller-supplied
     # set is honoured; building a fresh set also leaves the caller's
     # collection unmodified.
     if isinstance(count_threshold, (list, tuple, set, frozenset)):
         thresholds = set(int(value) for value in count_threshold)
     else:
         thresholds = {1, 4, 10, 20, 30, 100}
     thresholds.add(uncover_threshold)
     count_threshold = sorted(thresholds)
     qual_threshold = int(qual_threshold)

     outdir, name = os.path.split(
         os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
     depth = os.path.join(outdir, '%s.depth.tsv' % name)
     bedstat = os.path.join(outdir, '%s.bed.stat' % name)
     stats = os.path.join(outdir, '%s.stat' % name)
     chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
     uncover = os.path.join(outdir, '%s.uncover.bed' % name)

     # chrom -> {position: [ref base, A, C, G, T, total depth]}
     chroms = defaultdict(dict)
     # threshold -> positions with depth >= threshold
     rangedict = defaultdict(int)
     # threshold -> regions with average depth >= threshold
     regiondict = defaultdict(int)
     total_base = 0
     region_num = 0
     uncover_range = list()
     n_array = list()

     # closing() guarantees every output handle is released, also when an
     # error aborts the run.
     with closing(smart_open(depth, 'w')) as _depth, \
             closing(smart_open(bedstat, 'w')) as _bedstat, \
             closing(smart_open(stats, 'w')) as _stats, \
             closing(smart_open(chromstat, 'w')) as _chromstat:
         _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
         _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
         _stats.write("##A Simple introduction about %s\n" % self.name)
         _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
         _chromstat.write("\t".join(
             ["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

         with smart_open(bed) as regions:
             for region in regions:
                 rows = region.strip().split()
                 if len(rows) < 3:
                     continue
                 try:
                     chrom = rows[0]
                     start = max(int(rows[1]) - 1, 0)
                     stop = min(
                         int(rows[2]) + 1,
                         self.reference.get_reference_length(chrom))
                 except Exception:
                     # Best effort: header lines, bad coordinates or unknown
                     # contigs are skipped.
                     continue
                 cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                     chrom,
                     start,
                     stop,
                     read_callback=read_filter,
                     quality_threshold=qual_threshold)
                 bases = self.reference.fetch(chrom, start, stop).upper()
                 reg = list()
                 # Output names always carry the "chr" prefix; mitochondrial
                 # contigs get one fixed name.
                 chrom = "chr" + re.sub("^chr", "", chrom)
                 if chrom.startswith("chrM"):
                     chrom = 'chrM_NC_012920.1'
                 # range() (rather than xrange) behaves the same here on
                 # Python 2 and 3.
                 for offset in range(stop - start):
                     n = start + offset
                     dep = [
                         bases[offset], cov_a[offset], cov_c[offset],
                         cov_g[offset], cov_t[offset]
                     ]
                     base_depth = sum(dep[1:])
                     dep.append(base_depth)
                     chroms[chrom][n] = dep
                     for num in count_threshold:
                         if base_depth >= num:
                             rangedict[num] += 1
                     total_base += 1
                     reg.append(base_depth)
                 region_num += 1
                 array = DescribeArray(reg)
                 averages = str(round(array.average, 2))
                 mediandepth = str(round(array.median, 2))
                 maxdepth = str(round(array.max, 2))
                 mindepth = str(round(array.min, 2))
                 _bedstat.write("\t".join([
                     chrom, rows[1], rows[2], averages, mediandepth, maxdepth,
                     mindepth
                 ]) + '\n')
                 for num in count_threshold:
                     if array.average >= num:
                         regiondict[num] += 1

         for chrom in sorted(chroms, key=_chrom_valued):
             dep = sorted(chroms[chrom].items(), key=lambda x: int(x[0]))
             array = list()
             for p, d in dep:
                 _depth.write("\t".join(
                     [chrom, str(p), '\t'.join([str(i) for i in d])]) + '\n')
                 array.append(d)
                 n_array.append(d[5])
             # Column 5 of each record is the total depth appended above.
             array = DescribeArray(array, col=5)
             averages = str(round(array.average, 2))
             mediandepth = str(round(array.median, 2))
             maxdepth = str(round(array.max, 2))
             mindepth = str(round(array.min, 2))
             chromcover = [
                 array.get_frequece(thre, col=5) for thre in count_threshold
             ]
             _chromstat.write(
                 "\t".join([chrom, averages, mediandepth, maxdepth, mindepth] +
                           chromcover) + '\n')
             if read_filter == 'all':
                 uncover_bases = [
                     int(p) for p, d in dep if d[5] < uncover_threshold
                 ]
                 uncover_range.extend(
                     formact_number_list_to_range(uncover_bases, tag=chrom))

         if read_filter == 'all':
             with closing(smart_open(uncover, 'w')) as uncoverout:
                 uncoverout.write("#Chr\tStart\tStop\n")
                 uncoverout.writelines(uncover_range)

         array = DescribeArray(n_array)
         _stats.write("Average depth : %.2f\n" % array.average)
         _stats.write("Median depth : %.2f\n" % array.median)
         _stats.write("Max depth : %.2f\n" % array.max)
         _stats.write("Min depth : %.2f\n" % array.min)
         for number in count_threshold:
             # Guard the denominator: nothing counted means 0% rather than
             # ZeroDivisionError.
             rate = (float(rangedict[number]) /
                     total_base) * 100 if total_base else 0.0
             _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(
                 number, rate))
         for number in count_threshold:
             rate = (float(regiondict[number]) /
                     region_num) * 100 if region_num else 0.0
             _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(
                 number, rate))

     # The depth file is closed at this point, so it can be handed over for
     # index checking.
     dep_f = CreatIndex(depth)
     _ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)