コード例 #1
0
ファイル: depthQC.py プロジェクト: ZhiweiHwang/Rhea_Chip
	def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15, count_threshold=None, uncover_threshold=5):
		"""
		Compute per-base depth over the regions of a BED file and write the QC reports.

		Files are written next to ``prefix`` (or into the current directory under
		the name ``Rhea_Chip`` when no prefix is given):

		* ``<name>.depth.tsv``   - reference base, A/C/G/T counts and total depth per position
		* ``<name>.bed.stat``    - average / median / max / min depth per region
		* ``<name>.chrom.stat``  - the same statistics plus coverage columns per chromosome
		* ``<name>.stat``        - overall summary and coverage percentages
		* ``<name>.uncover.bed`` - only when ``read_filter == 'all'``: positions whose
		  depth is below ``uncover_threshold``

		Every region is widened by one base on both sides and clipped to
		``[0, reference length]``. Lines with fewer than three columns, or whose
		coordinates / chromosome cannot be resolved, are skipped.

		:param bed: path of the region file (chrom, start, stop in the first three columns)
		:param prefix: output path prefix; directory and base name are derived from it
		:param read_filter: forwarded as ``read_callback`` to ``self._reader.count_coverage``
		:param qual_threshold: forwarded (as int) as ``quality_threshold`` to ``count_coverage``
		:param count_threshold: list, tuple or set of depth cut-offs for the coverage
			columns; anything else selects the defaults ``{1, 4, 10, 20, 30, 100}``
		:param uncover_threshold: depth below which a position counts as uncovered;
			clamped to at least 1 and always added to the cut-offs
		:return: None
		"""
		# isinstance() rather than an identity test against the ``set`` type, so
		# that sets supplied by the caller are honoured. The values are copied
		# (the caller's collection is never mutated) and normalised to int so
		# that e.g. "5" and 5 cannot both end up in the cut-off list.
		if isinstance(count_threshold, (list, tuple, set, frozenset)):
			thresholds = set(int(value) for value in count_threshold)
		else:
			thresholds = {1, 4, 10, 20, 30, 100}
		uncover_threshold = max(int(uncover_threshold), 1)
		thresholds.add(uncover_threshold)
		count_threshold = sorted(thresholds)

		outdir, name = os.path.split(os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
		depth = os.path.join(outdir, '%s.depth.tsv' % name)
		bedstat = os.path.join(outdir, '%s.bed.stat' % name)
		stats = os.path.join(outdir, '%s.stat' % name)
		chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
		uncover = os.path.join(outdir, '%s.uncover.bed' % name)

		_depth = smart_open(depth, 'w')
		_bedstat = smart_open(bedstat, 'w')
		_stats = smart_open(stats, 'w')
		_chromstat = smart_open(chromstat, 'w')
		try:
			_depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
			_bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
			_stats.write("##A Simple introduction about %s\n" % self.name)
			_chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
			_chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

			chroms = defaultdict(dict)      # chrom -> {position: [ref, A, C, G, T, depth]}
			rangedict = defaultdict(int)    # cut-off -> number of bases reaching it
			regiondict = defaultdict(int)   # cut-off -> number of regions whose average reaches it
			total_base = 0
			region_num = 0
			with smart_open(bed) as regions:
				for region in regions:
					rows = region.strip().split()
					if len(rows) < 3:
						continue
					try:
						chrom = rows[0]
						start = max(int(rows[1]) - 1, 0)
						stop = min(int(rows[2]) + 1, self.reference.get_reference_length(chrom))
					except Exception:
						# Best effort: lines that cannot be interpreted are skipped.
						continue
					cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
						chrom, start, stop, read_callback=read_filter, quality_threshold=int(qual_threshold))
					bases = self.reference.fetch(chrom, start, stop).upper()
					region_depths = list()
					# Output names always carry the "chr" prefix; anything starting
					# with chrM is reported under a fixed mitochondrial name.
					chrom = "chr" + re.sub("^chr", "", chrom)
					if chrom.startswith("chrM"):
						chrom = 'chrM_NC_012920.1'
					for n in xrange(start, stop):
						offset = n - start
						dep = [bases[offset], cov_a[offset], cov_c[offset], cov_g[offset], cov_t[offset]]
						base_depth = sum(dep[1:])
						dep.append(base_depth)
						# NOTE: a position shared by overlapping regions is stored once
						# here but counted once per region in the totals below.
						chroms[chrom][n] = dep
						for num in count_threshold:
							if base_depth >= num:
								rangedict[num] += 1
						total_base += 1
						region_depths.append(base_depth)
					region_num += 1
					region_stats = DescribeArray(region_depths)
					averages = str(round(region_stats.average, 2))
					mediandepth = str(round(region_stats.median, 2))
					maxdepth = str(round(region_stats.max, 2))
					mindepth = str(round(region_stats.min, 2))
					_bedstat.write("\t".join([chrom, rows[1], rows[2], averages, mediandepth, maxdepth, mindepth]) + '\n')
					for num in count_threshold:
						if region_stats.average >= num:
							regiondict[num] += 1

			uncover_range = list()
			all_depths = list()
			for chrom in sorted(chroms, key=_chrom_valued):
				positions = sorted(chroms[chrom].iteritems(), key=lambda item: int(item[0]))
				records = list()
				for p, d in positions:
					_depth.write("\t".join([chrom, str(p), '\t'.join([str(i) for i in d])]) + '\n')
					records.append(d)
					all_depths.append(d[5])
				chrom_stats = DescribeArray(records, col=5)
				averages = str(round(chrom_stats.average, 2))
				mediandepth = str(round(chrom_stats.median, 2))
				maxdepth = str(round(chrom_stats.max, 2))
				mindepth = str(round(chrom_stats.min, 2))
				chromcover = [chrom_stats.get_frequece(thre, col=5) for thre in count_threshold]
				_chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth] + chromcover) + '\n')
				if read_filter == 'all':
					uncover_bases = [int(p) for p, d in positions if d[5] < uncover_threshold]
					uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

			if read_filter == 'all':
				uncoverout = smart_open(uncover, 'w')
				try:
					uncoverout.write("#Chr\tStart\tStop\n")
					uncoverout.writelines(uncover_range)
				finally:
					uncoverout.close()

			total_stats = DescribeArray(all_depths)
			_stats.write("Average depth : %.2f\n" % total_stats.average)
			_stats.write("Median depth : %.2f\n" % total_stats.median)
			_stats.write("Max depth : %.2f\n" % total_stats.max)
			_stats.write("Min depth : %.2f\n" % total_stats.min)
			# Guard the divisions so an empty region file reports 0.00% instead
			# of raising ZeroDivisionError.
			for number in count_threshold:
				percent = (float(rangedict[number]) / total_base) * 100 if total_base else 0.0
				_stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
			for number in count_threshold:
				percent = (float(regiondict[number]) / region_num) * 100 if region_num else 0.0
				_stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
		finally:
			# Always release the report handles, also when a step above fails.
			for handle in (_depth, _bedstat, _stats, _chromstat):
				handle.close()

		# The depth table must be closed before it is indexed.
		dep_f = CreatIndex(depth)
		dep_f.check_index(seq_col=0, start_col=1, end_col=1)
コード例 #2
0
ファイル: AnnotateVCF.py プロジェクト: youyoulove/Rhea_Chip
	def parse(self, outdir=os.getcwd(), kickout_function=None, follow_function=None):
		"""
		Annotate every sample of the parsed VCF and write one TSV per sample.

		For each record with a usable genotype, every reported allele is
		normalised through ``self.closet_anno``, combined with the snpEff
		``ANN`` entries (``self.get_snpeff_info``) and the database annotation
		from ``self.DBAnno.dbanno``. The rows are sorted by chromosome, start
		and stop, written to ``<outdir>/<sample>.anno.tsv`` and indexed with
		``pysam.tabix_index``.

		:param outdir: directory receiving the ``*.anno.tsv`` files.
			NOTE: the default is evaluated once, when the method is defined,
			not at call time - pass it explicitly if the working directory changes.
		:param kickout_function: comma-separated function names to drop
			(compared after ``_variant`` has been removed from the snpEff name)
		:param follow_function: comma-separated function names to keep; when
			given, rows with any other function are dropped
		:return: None
		"""
		logger.info("General VCF Phrasing begin !!!")
		self.vcf.parse_metainfo()
		titles = ["Chrom", "Start", "Stop", "Refer", "Call", "Zygosity", "VarType", "Filter", "ADepth", "ARatio",
		          "PL", "NeighborGID", "PhasedGID", "MutationName", "GeneSym", "EntrezGeneID", "Transcript",
		          "TransBioType", "cHGVS", "Protein", "pHGVS", "Strand", "PrimaryTag", "FunctionName", "Impact",
		          "ExInID", "cDNAPos", "AAPos", "CDSPos", "AnnoTag", "StandardMutation"]
		# The namedtuple only covers the fixed columns; database columns are
		# appended to the header (and to each row) afterwards.
		annotation = namedtuple("Annotation", titles)
		dbtitle = self.DBAnno.dbtitle.split("\t")
		titles.extend(dbtitle)
		kickouts = set(kickout_function.split(",")) if kickout_function else set()
		follows = set(follow_function.split(",")) if follow_function else None
		# Compiled once instead of once per record.
		genotype_pattern = re.compile(r"(\d)[|/](\d)")
		for name, samples in self.vcf.readlines.iteritems():
			logger.info("Start to annotate sample %s" % name)
			message_num = 0
			result_out = list()
			for messages in samples:
				chrom = messages.Chrom
				pos = int(messages.Pos)
				gt = genotype_pattern.match(messages.GT)
				if not gt:
					continue
				message_num += 1
				if message_num % 250 == 0:
					logger.info("Complete " + str(message_num) + " articles")
				l1, l2 = int(gt.group(1)), int(gt.group(2))
				refer = messages.Ref
				alter_1 = messages.Alter[l1 - 1] if l1 > 0 else refer
				alter_2 = messages.Alter[l2 - 1] if l2 > 0 else refer
				eff_info_dict = self.get_snpeff_info(messages.Info['ANN'])
				nb_id = getattr(messages, "NB", ".")
				pb_id = getattr(messages, "PB", ".")
				filter_tag = messages.Filter
				# Reported allele -> its genotype index, so that the per-allele
				# fields (AD, PL) are read for the allele of the row rather than
				# always for the second genotype allele.
				reported = dict()
				if l1 == l2:
					zygosity = "hom-ref" if l1 == 0 else "hom-alt"
					reported[alter_1] = l1
				else:
					zygosity = "het-alt"
					if l1 != 0:
						reported[alter_1] = l1
					if l2 != 0:
						reported[alter_2] = l2
				for alters, allele_no in reported.items():
					offset_s, offset_e, nor_refer, nor_alter, vartype, close_anno = \
						self.closet_anno(chrom, pos, refer, alters)
					start = pos + offset_s
					stop = pos + offset_e
					# Depth, ratio and PL do not depend on the annotation row, so
					# they are resolved once per allele. Missing or malformed
					# sample fields fall back to "." (best effort).
					try:
						alle_depth = int(messages.AD.split(",")[allele_no])
					except Exception:
						alle_depth = "."
						alle_radio = "."
					else:
						try:
							base_depth = int(messages.DP)
							alle_radio = round(float(alle_depth) / base_depth, 2)
						except Exception:
							alle_radio = "."
					try:
						phred = ",".join(messages.PL.split(",")[3 * (allele_no - 1): 3 * allele_no])
					except Exception:
						phred = "."
					for info_rows in eff_info_dict[alters]:
						if len(info_rows) == 0:
							info_rows = (".",) * 16
						genesym, gene_id, trans, trans_bio, chgvs, protein_id, phgvs, strand, primary, \
						functions, impact, exon_num, cdna_pos, aa_pos, cds_pos, anno_tag = info_rows
						functions = functions.replace("_variant", "")
						# Membership is tested against the parsed name sets; testing
						# against the raw comma-joined string would accept any substring.
						if (functions in kickouts) or (follows and functions not in follows):
							continue
						if chgvs != ".":
							mutation_name = "{0}({1}): {2}".format(trans, genesym, chgvs)
							mutation_name += " (%s)" % phgvs if phgvs != "." else ""
						else:
							mutation_name = "."
						varinfo = [chrom, start, stop, nor_refer, nor_alter, zygosity, vartype, filter_tag, alle_depth,
						           alle_radio, phred, nb_id, pb_id, mutation_name, genesym, gene_id, trans, trans_bio,
						           chgvs, protein_id, phgvs, strand, primary, functions, impact, exon_num, cdna_pos,
						           aa_pos, cds_pos, anno_tag, close_anno]
						variation = annotation._make(varinfo)
						dbinfo = self.DBAnno.dbanno(variation)
						for i in dbtitle:
							if i in dbinfo:
								varinfo.append("|".join(dbinfo[i]))
							else:
								varinfo.append(".")
						result_out.append(varinfo)
			sample_out = os.path.join(outdir, "%s.anno.tsv" % name)
			# ``with`` guarantees the table is flushed and closed before indexing,
			# also when writing fails part-way.
			with open(sample_out, "w") as f_out:
				f_out.write("#" + "\t".join(titles) + '\n')
				for anno_message in sorted(result_out, key=lambda x: (_chrom_valued(x[0]), int(x[1]), int(x[2]))):
					f_out.write("\t".join(map(str, anno_message)) + '\n')
			fileout = pysam.tabix_index(sample_out, seq_col=0, start_col=1, end_col=2, force=True)
			logger.info(
				"Sample {0} [{1}]: Annotation completed, total {2} articles !".format(name, fileout, message_num))
コード例 #3
0
 def depths(self,
            bed,
            prefix=None,
            read_filter='all',
            qual_threshold=15,
            count_threshold=None,
            uncover_threshold=5):
     """
     Compute per-base depth over the regions of a BED file and write the QC reports.

     Files are written next to ``prefix`` (or into the current directory under
     the name ``Rhea_Chip`` when no prefix is given):

     * ``<name>.depth.tsv``   - reference base, A/C/G/T counts and total depth per position
     * ``<name>.bed.stat``    - average / median / max / min depth per region
     * ``<name>.chrom.stat``  - the same statistics plus coverage columns per chromosome
     * ``<name>.stat``        - overall summary and coverage percentages
     * ``<name>.uncover.bed`` - only when ``read_filter == 'all'``: positions whose
       depth is below ``uncover_threshold``

     Every region is widened by one base on both sides and clipped to
     ``[0, reference length]``. Lines with fewer than three columns, or whose
     coordinates / chromosome cannot be resolved, are skipped.

     :param bed: path of the region file (chrom, start, stop in the first three columns)
     :param prefix: output path prefix; directory and base name are derived from it
     :param read_filter: forwarded as ``read_callback`` to ``self._reader.count_coverage``
     :param qual_threshold: forwarded (as int) as ``quality_threshold`` to ``count_coverage``
     :param count_threshold: list, tuple or set of depth cut-offs for the coverage
         columns; anything else selects the defaults ``{1, 4, 10, 20, 30, 100}``
     :param uncover_threshold: depth below which a position counts as uncovered;
         clamped to at least 1 and always added to the cut-offs
     :return: None
     """
     # isinstance() rather than an identity test against the ``set`` type, so
     # that sets supplied by the caller are honoured. The values are copied
     # (the caller's collection is never mutated) and normalised to int so
     # that e.g. "5" and 5 cannot both end up in the cut-off list.
     if isinstance(count_threshold, (list, tuple, set, frozenset)):
         thresholds = set(int(value) for value in count_threshold)
     else:
         thresholds = {1, 4, 10, 20, 30, 100}
     uncover_threshold = max(int(uncover_threshold), 1)
     thresholds.add(uncover_threshold)
     count_threshold = sorted(thresholds)

     outdir, name = os.path.split(
         os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
     depth = os.path.join(outdir, '%s.depth.tsv' % name)
     bedstat = os.path.join(outdir, '%s.bed.stat' % name)
     stats = os.path.join(outdir, '%s.stat' % name)
     chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
     uncover = os.path.join(outdir, '%s.uncover.bed' % name)

     _depth = smart_open(depth, 'w')
     _bedstat = smart_open(bedstat, 'w')
     _stats = smart_open(stats, 'w')
     _chromstat = smart_open(chromstat, 'w')
     try:
         _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
         _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
         _stats.write("##A Simple introduction about %s\n" % self.name)
         _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
         _chromstat.write("\t".join(
             ["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

         chroms = defaultdict(dict)      # chrom -> {position: [ref, A, C, G, T, depth]}
         rangedict = defaultdict(int)    # cut-off -> number of bases reaching it
         regiondict = defaultdict(int)   # cut-off -> number of regions whose average reaches it
         total_base = 0
         region_num = 0
         with smart_open(bed) as regions:
             for region in regions:
                 rows = region.strip().split()
                 if len(rows) < 3:
                     continue
                 try:
                     chrom = rows[0]
                     start = max(int(rows[1]) - 1, 0)
                     stop = min(
                         int(rows[2]) + 1,
                         self.reference.get_reference_length(chrom))
                 except Exception:
                     # Best effort: lines that cannot be interpreted are skipped.
                     continue
                 cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                     chrom,
                     start,
                     stop,
                     read_callback=read_filter,
                     quality_threshold=int(qual_threshold))
                 bases = self.reference.fetch(chrom, start, stop).upper()
                 region_depths = list()
                 # Output names always carry the "chr" prefix; anything starting
                 # with chrM is reported under a fixed mitochondrial name.
                 chrom = "chr" + re.sub("^chr", "", chrom)
                 if chrom.startswith("chrM"):
                     chrom = 'chrM_NC_012920.1'
                 for n in xrange(start, stop):
                     offset = n - start
                     dep = [
                         bases[offset], cov_a[offset], cov_c[offset],
                         cov_g[offset], cov_t[offset]
                     ]
                     base_depth = sum(dep[1:])
                     dep.append(base_depth)
                     # NOTE: a position shared by overlapping regions is stored once
                     # here but counted once per region in the totals below.
                     chroms[chrom][n] = dep
                     for num in count_threshold:
                         if base_depth >= num:
                             rangedict[num] += 1
                     total_base += 1
                     region_depths.append(base_depth)
                 region_num += 1
                 region_stats = DescribeArray(region_depths)
                 averages = str(round(region_stats.average, 2))
                 mediandepth = str(round(region_stats.median, 2))
                 maxdepth = str(round(region_stats.max, 2))
                 mindepth = str(round(region_stats.min, 2))
                 _bedstat.write("\t".join([
                     chrom, rows[1], rows[2], averages, mediandepth, maxdepth,
                     mindepth
                 ]) + '\n')
                 for num in count_threshold:
                     if region_stats.average >= num:
                         regiondict[num] += 1

         uncover_range = list()
         all_depths = list()
         for chrom in sorted(chroms, key=_chrom_valued):
             positions = sorted(
                 chroms[chrom].iteritems(), key=lambda item: int(item[0]))
             records = list()
             for p, d in positions:
                 _depth.write("\t".join(
                     [chrom, str(p), '\t'.join([str(i) for i in d])]) + '\n')
                 records.append(d)
                 all_depths.append(d[5])
             chrom_stats = DescribeArray(records, col=5)
             averages = str(round(chrom_stats.average, 2))
             mediandepth = str(round(chrom_stats.median, 2))
             maxdepth = str(round(chrom_stats.max, 2))
             mindepth = str(round(chrom_stats.min, 2))
             chromcover = [
                 chrom_stats.get_frequece(thre, col=5)
                 for thre in count_threshold
             ]
             _chromstat.write(
                 "\t".join([chrom, averages, mediandepth, maxdepth, mindepth] +
                           chromcover) + '\n')
             if read_filter == 'all':
                 uncover_bases = [
                     int(p) for p, d in positions if d[5] < uncover_threshold
                 ]
                 uncover_range.extend(
                     formact_number_list_to_range(uncover_bases, tag=chrom))

         if read_filter == 'all':
             uncoverout = smart_open(uncover, 'w')
             try:
                 uncoverout.write("#Chr\tStart\tStop\n")
                 uncoverout.writelines(uncover_range)
             finally:
                 uncoverout.close()

         total_stats = DescribeArray(all_depths)
         _stats.write("Average depth : %.2f\n" % total_stats.average)
         _stats.write("Median depth : %.2f\n" % total_stats.median)
         _stats.write("Max depth : %.2f\n" % total_stats.max)
         _stats.write("Min depth : %.2f\n" % total_stats.min)
         # Guard the divisions so an empty region file reports 0.00% instead
         # of raising ZeroDivisionError.
         for number in count_threshold:
             percent = (float(rangedict[number]) /
                        total_base) * 100 if total_base else 0.0
             _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(
                 number, percent))
         for number in count_threshold:
             percent = (float(regiondict[number]) /
                        region_num) * 100 if region_num else 0.0
             _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(
                 number, percent))
     finally:
         # Always release the report handles, also when a step above fails.
         for handle in (_depth, _bedstat, _stats, _chromstat):
             handle.close()

     # The depth table must be closed before it is indexed.
     dep_f = CreatIndex(depth)
     dep_f.check_index(seq_col=0, start_col=1, end_col=1)