Example #1
def analysis(self, chrom, start, stop, win_len=200, sift_len=20):
    # Record GC content for one region: per-position GC in pos_gc and
    # sliding-window GC in win_gc (both module-level dictionaries).
    global pos_gc, win_gc
    chrom = str(chrom)
    tmp_start = start = int(start)
    stop = int(stop)
    # Fetch sequence a little past `stop` so the last windows are full
    # length, capped at the end of the reference contig.
    flank_stop = min(int(stop) + win_len + 1,
                     self.reference.get_reference_length(chrom))
    try:
        rmtk = list(self.rmtk.fetch(chrom, start, stop))
        bases = self.reference.fetch(chrom, start, flank_stop)
    except Exception as err:
        raise ValueError(err)
    self.contigs.add(chrom)
    # Per-position GC: fraction of G/C in a ~win_len window around each
    # position, clipped at the contig ends.
    for pos in xrange(start, stop + 1):
        flank_b = max(pos - win_len / 2, 0)
        flank_e = min(flank_b + win_len + 1,
                      self.reference.get_reference_length(chrom))
        base_gc = count_gc(self.reference.fetch(chrom, flank_b, flank_e).upper())
        pos_gc[chrom][pos] = base_gc
    # Collect the parts of [start, stop] that fall between rmtk records;
    # with no records the whole region is kept. (Any tail after the last
    # record is not collected.)
    feback = list()
    if rmtk:
        for line in rmtk:
            rows = line.strip().split("\t")
            begin = int(rows[1])
            end = int(rows[2])
            if begin > start:
                feback.append([start, begin - 1])
            else:
                begin = start
            end = min(stop, end)
            if begin > end:
                continue
            start = end + 1
    else:
        feback.append([start, stop])
    # Slide win_len windows (step sift_len) over the merged intervals and
    # keep those whose GC fraction is strictly between 0.1 and 0.9.
    for s, e in list(join_ranges(feback, offset=60)):
        if e - s < sift_len:
            continue
        for win in xrange(s, e, sift_len):
            offset = win - tmp_start
            seq = bases[offset:offset + win_len].upper()
            gc_ratio = count_gc(seq)
            if 0.1 < gc_ratio < 0.9:
                win_gc[(chrom, win, win + win_len - 1)] = gc_ratio
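
The method above relies on two module-level helpers that are not shown: count_gc, which (given the 0.1 < gc_ratio < 0.9 check) appears to return the GC fraction of a sequence, and join_ranges, which merges nearby intervals before windows are slid over them. The sketches below are inferred only from how the helpers are called here; the real implementations in the source module may differ.

def count_gc(seq):
    # Fraction of G/C bases in an upper-case sequence (0.0 for an empty one).
    if not seq:
        return 0.0
    return (seq.count("G") + seq.count("C")) / float(len(seq))

def join_ranges(ranges, offset=0):
    # Merge [start, end] pairs whose gap is at most `offset`, yielding
    # (start, end) tuples in coordinate order.
    merged = None
    for start, end in sorted(ranges):
        if merged is None:
            merged = [start, end]
        elif start <= merged[1] + offset:
            merged[1] = max(merged[1], end)
        else:
            yield tuple(merged)
            merged = [start, end]
    if merged is not None:
        yield tuple(merged)

print(list(join_ranges([[100, 180], [200, 260], [400, 450]], offset=60)))
# [(100, 260), (400, 450)]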
Example #3
def hmm_cnv(dep_data, regions, best_probability, trials, ploid=2.0, output=None, contral_wins=5):
    # Call CNVs from window depths with a Viterbi-trained HMM, then merge
    # consecutive windows that share a copy-number state into segments.
    output = os.path.abspath(output) if output is not None else sys.stdout
    final_cnv = smart_open(output, 'w')
    cne = ViterbiTraining(dep_data, ploid, best_probability, trials)
    copy_est = cne.train()
    # When fewer than 10% of the windows deviate from the baseline ploidy and
    # there are enough windows, iteratively re-fit the negative binomial on
    # depths rescaled to the current copy-number estimate and re-train until
    # the state path stops changing (or after 100 iterations).
    if np.count_nonzero(copy_est != ploid) / float(len(copy_est)) < 0.1 and len(copy_est) > 800:
        iterations = 0
        last_diff = differences = len(copy_est)
        n_copy_est = np.copy(copy_est)
        tmp_arg = [[0, 0, 0], [0, 0, 0]]
        while differences > 0 and iterations < 100:
            ndep = [int(ploid * dep_data[i] / float(copy_est[i]) + 0.5)
                    for i in range(len(copy_est)) if copy_est[i]]
            nbarg = NegativeBinomial(ndep)
            nbarg.nbinom_fit()
            tmp_b = float(nbarg.best_probability)
            tmp_t = int(nbarg.trials)
            devi = float(nbarg.mindevi)
            cne = ViterbiTraining(dep_data, ploid, tmp_b, tmp_t)
            copy_est = cne.train(copy_est)
            iterations += 1
            differences = np.count_nonzero(np.array(n_copy_est) != copy_est)
            n_copy_est = np.copy(copy_est)
            # Stop early once both the state path and the fitted parameters
            # have stopped changing.
            if differences == last_diff:
                if (tmp_arg[0][2] == devi) and (tmp_arg[0][1] == tmp_t) and (tmp_arg[0][0] == tmp_b):
                    break
            last_diff = differences
            tmp_arg = [[tmp_arg[1][0], tmp_arg[1][1], tmp_arg[1][2]], [tmp_b, tmp_t, devi]]
    trans_p = cne.mlEstimate(copy_est)
    cnvs = cne.posterior_decoding(trans_p, ploid)
    chrom = str(regions[0][0])
    # Group consecutive windows by decoded copy-number state; the baseline
    # state is not reported.
    for k, g in groupby(zip(regions, cnvs), lambda x: x[1][1]):
        if k == ploid:
            continue
        elems = list(g)
        for start, stop in join_ranges([d[0][1:] for d in elems]):
            # Mean of the per-window probabilities inside the merged segment.
            p = [float(d[1][0]) for d in elems if d[0][1] >= start and d[0][2] <= stop]
            mpp = round(sum(p) / len(p), 3)
            if mpp < 0.95 or len(p) < contral_wins:
                continue
            bp_len = stop - start + 1
            mut_type = "gain" if k > ploid else "loss"
            final_cnv.write("\t".join(map(str, [chrom, ploid, start, stop, bp_len, k, mut_type, mpp])) + '\n')
    final_cnv.close()
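
The tail of hmm_cnv turns per-window states into reportable segments: consecutive windows with the same decoded copy number are grouped with itertools.groupby, their coordinates are merged with join_ranges, and a segment is kept only if its mean window probability is at least 0.95 and it spans at least contral_wins windows. The toy example below shows only the grouping-and-merging idea on invented windows and states; it uses a plain min/max merge instead of join_ranges and skips the probability and window-count cutoffs.

from itertools import groupby

ploid = 2
regions = [("chr1", 1, 200), ("chr1", 101, 300), ("chr1", 201, 400),
           ("chr1", 301, 500), ("chr1", 401, 600)]
# (probability, copy-number state) per window; invented values.
cnvs = [(0.99, 3), (0.98, 3), (0.97, 3), (0.99, 2), (0.99, 2)]

for state, group in groupby(zip(regions, cnvs), key=lambda x: x[1][1]):
    if state == ploid:  # baseline ploidy: not a CNV
        continue
    windows = list(group)
    start = min(w[0][1] for w in windows)
    stop = max(w[0][2] for w in windows)
    mean_p = sum(w[1][0] for w in windows) / float(len(windows))
    print("%s:%d-%d state=%d mean_p=%.3f" % (windows[0][0][0], start, stop, state, mean_p))
# chr1:1-400 state=3 mean_p=0.980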
Example #4
def win_correct(self, chrom):
    # Build correction windows for one chromosome: drop positions that are
    # low-depth in enough samples, compute per-window mean depths, exclude
    # poorly correlated samples, apply a cross-sample median-ratio
    # correction, and fit a negative binomial to the corrected depths.
    pos_filter = defaultdict(int)
    filterSampleInChrom = set()
    bed_chrom = self.bed[chrom]
    depth_dict = dict()
    regions = list()
    sample_dep = defaultdict(list)
    win_sift_dep = defaultdict(list)
    wsdep = defaultdict(list)
    # Only samples with a non-zero ploidy estimate on this chromosome are used.
    samples_filter = filter(lambda i: self.chrom_stat[i][chrom].ploid > 0,
                            self.samples)
    if len(samples_filter) < 2:
        return
    nbinom_data = list()
    nbinom_out = open(os.path.join(self.indir, "%s.nbinom.arg" % chrom), 'w')
    chr_cor = open(
        os.path.join(self.indir, "%s_W%dS%d.cor" %
                     (chrom, self.CorrectWinLen, self.CorrectShiftLen)), 'w')
    # Per-sample paths and handles for the corrected window depths.
    ws_deps = {
        sample: os.path.join(
            self.indir, sample, "%s.W%dS%d.fixdep" %
            (chrom, self.CorrectWinLen, self.CorrectShiftLen))
        for sample in samples_filter
    }
    ws_dep = {sample: open(ws_deps[sample], 'w') for sample in samples_filter}
    # Flag positions whose depth is low relative to the sample's chromosome
    # average; a very low depth counts twice.
    for sample in samples_filter:
        chrom_d = list()
        fixdeps = pysam.TabixFile(
            os.path.join(self.indir, "{0}/{0}.Fixdep.tsv.gz".format(sample)))
        depth_dict[sample] = fixdeps
        for line in fixdeps.fetch(chrom):
            rows = line.strip().split("\t")
            pos = int(rows[1])
            c_d = int(rows[-1])
            chrom_d.append([pos, c_d])
        depths = DescribeArray(chrom_d, col=1)
        for (pos, dep) in chrom_d:
            if dep < 0.6 * self.LowDepCut * depths.average:
                pos_filter[pos] += 2
            elif dep < self.LowDepCut * depths.average:
                pos_filter[pos] += 1
    # Positions that accumulate enough flags are removed from the BED positions.
    for pos, number in pos_filter.iteritems():
        if number >= len(samples_filter):
            try:
                bed_chrom.remove(pos)
            except ValueError:
                continue
    # Merge the remaining positions into intervals and slide correction windows
    # of CorrectWinLen with a step of CorrectShiftLen over them, recording the
    # mean depth of every sample in every window.
    bed_chrom = list(
        join_ranges(join_numbers(bed_chrom), offset=self.CorrectWinLen))
    for s, e in bed_chrom:
        if e - s < self.CorrectShiftLen:
            continue
        for win in xrange(s, e, self.CorrectShiftLen):
            end_p = min(win + self.CorrectWinLen - 1, e)
            regions.append((chrom, win, end_p))
            for sample in samples_filter:
                depth = depth_dict[sample]
                lines = list(depth.fetch(chrom, win, end_p)) or ["-1\t-1\t0\n"]
                mdep = sum([int(line.strip().split("\t")[-1])
                            for line in lines]) / float(len(lines))
                mdep = round(mdep, 2)
                sample_dep[sample].append(mdep)
                win_sift_dep[(chrom, win, end_p)].append(mdep)
    # Mean window depth per sample on this chromosome.
    mdep_chr = {i: sum(j) / len(j) for i, j in sample_dep.iteritems() if len(j)}
    # Correlation of window-depth profiles between samples; a sample whose mean
    # correlation with the others is below 0.6 is excluded on this chromosome.
    cormtx = np.corrcoef([sample_dep[i] for i in samples_filter])
    chr_cor.write(chrom + "\t" + "\t".join(samples_filter) + '\n')
    for sn in range(len(samples_filter)):
        chr_cor.write("\t".join([samples_filter[sn]] + map(str, cormtx[sn])))
        mcor = (cormtx[sn].sum() - 1.0) / (len(samples_filter) - 1.0)
        if mcor < 0.6:
            filterSampleInChrom.add(samples_filter[sn])
            chr_cor.write("\tLow correlation\n")
        else:
            chr_cor.write("\n")
    chr_cor.write("\n")
    if len(filterSampleInChrom) / float(len(samples_filter)) > 0.6:
        return
    # Median-ratio correction per window: divide every depth in the window by
    # the median depth ratio (window depth / sample chromosome mean) of the
    # well-covered, well-correlated samples.
    for r in regions:
        chrom, start, stop = r
        d = win_sift_dep[r]
        dr = list()   # all depth ratios for this window (not used further)
        ndr = list()  # ratios from samples above the low-depth cutoff
        for sn in range(len(samples_filter)):
            sample = samples_filter[sn]
            if sample in filterSampleInChrom:
                continue
            control = self.LowDepCut * mdep_chr[sample]
            if mdep_chr[sample] > 0:
                dr.append(d[sn] / mdep_chr[sample])
            if d[sn] > control:
                ndr.append(d[sn] / mdep_chr[sample])
        mdr = np.median(np.array(ndr)) if len(ndr) > 3 else 1.0
        if mdr < self.LowDepCut:
            mdr = 1.0
        for sn in range(len(samples_filter)):
            d[sn] /= mdr
            wsdep[samples_filter[sn]].append(d[sn])
    # Cap extreme depths, rescale each sample to a mean of about 60, write the
    # corrected per-window depths and pool them for the negative binomial fit.
    for s, d in wsdep.iteritems():
        if s in filterSampleInChrom:
            continue
        tmp_des = DescribeArray(d)
        d = [min(5.0 * tmp_des.average, i) for i in d]
        tmp_des = DescribeArray(d)
        d = [int(i * 60.0 / tmp_des.average + 0.5) for i in d]
        if len(regions) == len(d):
            for i in range(len(regions)):
                reg = "\t".join(map(str, regions[i]))
                ws_dep[s].write("%s\t%i\n" % (reg, d[i]))
                nbinom_data.append(d[i])
    chr_cor.close()
    # Close and tabix-index the per-sample corrected depth files.
    for s, d in ws_deps.iteritems():
        ws_dep[s].close()
        depth_dict[s].close()
        if os.path.isfile(d):
            _ = pysam.tabix_index(d, seq_col=0, start_col=1, end_col=2, force=True)
    # Fit a negative binomial to the pooled corrected depths and record the
    # parameters for this chromosome.
    nbarg = NegativeBinomial(nbinom_data)
    nbarg.nbinom_fit()
    trials = nbarg.trials
    best_probability = nbarg.best_probability
    min_devi = nbarg.mindevi
    nbinom_out.write("\t".join(map(str, [trials, best_probability, min_devi])) + '\n')
    nbinom_out.close()
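
The middle of win_correct applies a per-window median-ratio correction: each sample's window depth is divided by its chromosome-wide mean to get a ratio, the median of those ratios across well-covered samples is taken as the bias shared by the window (presumably mappability, GC and similar effects), and every depth in the window is then divided by that median. The effect is that a depth shift seen in most samples is divided out, while a change confined to a single sample survives. Below is a stripped-down, single-window numpy sketch of that idea with invented depths and cutoffs; the correlation-based sample filter is omitted.

import numpy as np

# Invented numbers: five samples, one correction window.
low_dep_cut = 0.3
mdep_chr = np.array([100.0, 80.0, 120.0, 90.0, 110.0])  # per-sample chromosome mean depth
win_dep = np.array([60.0, 44.0, 66.0, 54.0, 55.0])      # per-sample mean depth in this window

ratios = win_dep / mdep_chr                  # 0.60, 0.55, 0.55, 0.60, 0.50
usable = win_dep > low_dep_cut * mdep_chr    # samples above the low-depth cutoff
mdr = np.median(ratios[usable]) if usable.sum() > 3 else 1.0
if mdr < low_dep_cut:                        # an implausibly low median is ignored
    mdr = 1.0
corrected = win_dep / mdr                    # the dip shared by all samples is lifted out
print(corrected)                             # ~[109.1  80.  120.   98.2 100. ]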