def hmm_cnv(dep_data, regions, best_probability, trials, ploid=2.0, output=None, contral_wins=5):
    """Call CNVs from windowed depth data with an HMM (Viterbi training).

    Fits/refines negative-binomial emission parameters, Viterbi-trains the
    copy-number path, then posterior-decodes and reports merged CNV segments.

    Parameters:
        dep_data         -- per-window integer depth values (indexable sequence).
        regions          -- per-window (chrom, start, stop) tuples, same order
                            as dep_data; all windows assumed on one chromosome
                            (chrom is taken from regions[0]).
        best_probability -- initial negative-binomial success probability.
        trials           -- initial negative-binomial trial count.
        ploid            -- baseline copy number (default diploid, 2.0).
        output           -- output path, or None to write to stdout.
        contral_wins     -- minimum number of supporting windows per CNV call.

    Writes tab-separated lines: chrom, ploid, start, stop, length, copy
    number, gain/loss, mean posterior probability.
    """
    output = os.path.abspath(output) if output is not None else sys.stdout
    final_cnv = smart_open(output, 'w')
    cne = ViterbiTraining(dep_data, ploid, best_probability, trials)
    copy_est = cne.train()
    # Iteratively re-fit emission parameters only when the first pass looks
    # sane: fewer than 10% of windows deviate from baseline ploidy and there
    # are enough windows (> 800) for a stable re-fit.
    if np.count_nonzero(copy_est != ploid) / float(len(copy_est)) < 0.1 and len(copy_est) > 800:
        iterations = 0
        lastDif = differences = len(copy_est)
        n_copy_est = np.copy(copy_est)
        # Rolling window of the last two parameter sets [p, trials, deviance],
        # used to detect a two-cycle oscillation and bail out.
        tmp_arg = [[0, 0, 0], [0, 0, 0]]
        while differences > 0 and iterations < 100:
            # Normalize depth back to baseline ploidy using the current copy
            # estimate (skip zero-copy windows to avoid division by zero),
            # then re-fit the negative binomial on the normalized depths.
            ndep = [int(ploid * dep_data[i] / float(copy_est[i]) + 0.5)
                    for i in range(len(copy_est)) if copy_est[i]]
            nbarg = NegativeBinomial(ndep)
            nbarg.nbinom_fit()
            tmp_b = float(nbarg.best_probability)
            tmp_t = int(nbarg.trials)
            devi = float(nbarg.mindevi)
            cne = ViterbiTraining(dep_data, ploid, tmp_b, tmp_t)
            copy_est = cne.train(copy_est)
            iterations += 1
            differences = np.count_nonzero(np.array(n_copy_est) != copy_est)
            n_copy_est = np.copy(copy_est)
            if differences == lastDif:
                # Same difference count AND the parameters two iterations ago
                # match the current fit -> we are cycling; stop refining.
                if (tmp_arg[0][2] == devi) and (tmp_arg[0][1] == tmp_t) and (tmp_arg[0][0] == tmp_b):
                    break
            lastDif = differences
            tmp_arg = [[tmp_arg[1][0], tmp_arg[1][1], tmp_arg[1][2]], [tmp_b, tmp_t, devi]]
    trans_p = cne.mlEstimate(copy_est)
    cnvs = cne.posterior_decoding(trans_p, ploid)
    chrom = str(regions[0][0])
    # Group consecutive windows by decoded copy number; cnvs items look like
    # (posterior_probability, copy_number) given the x[1][1] key and d[1][0]
    # probability access below.
    for k, g in groupby(zip(regions, cnvs), lambda x: x[1][1]):
        if k == ploid:
            continue
        elems = list(g)
        for start, stop in join_ranges([d[0][1:] for d in elems]):
            p = [float(d[1][0]) for d in elems if d[0][1] >= start and d[0][2] <= stop]
            mpp = round(sum(p) / len(p), 3)
            # Require strong mean posterior support and a minimum number of
            # supporting windows before reporting the segment.
            if mpp < 0.95 or len(p) < contral_wins:
                continue
            bp_len = stop - start + 1
            mut_type = "gain" if k > ploid else "loss"
            final_cnv.write("\t".join(map(str, [chrom, ploid, start, stop, bp_len, k, mut_type, mpp])) + '\n')
    # BUGFIX: the original unconditionally closed final_cnv, which closes
    # sys.stdout when output is None. Only close real files.
    if final_cnv is not sys.stdout:
        final_cnv.close()
def win_correct(self, chrom):
    """Build correlation-corrected sliding-window depths for one chromosome.

    For every sample whose ploidy on `chrom` is > 0: reads per-position fixed
    depths, drops positions that are low-depth in (nearly) all samples,
    re-windows the remaining positions, median-normalizes each window across
    samples, writes per-sample "<chrom>.W<win>S<shift>.fixdep" files (tabix
    indexed), a cross-sample correlation matrix ("<chrom>_W<win>S<shift>.cor"),
    and negative-binomial fit parameters ("<chrom>.nbinom.arg").

    Returns early (writing nothing useful) when fewer than 2 samples qualify
    or when more than 60% of samples correlate poorly with the rest.
    """
    pos_filter = defaultdict(int)        # position -> low-depth "votes" across samples
    filterSampleInChrom = set()          # samples excluded for low correlation
    bed_chrom = self.bed[chrom]
    depth_dict = dict()                  # sample -> open pysam.TabixFile
    regions = list()                     # (chrom, start, end) windows
    sample_dep = defaultdict(list)       # sample -> per-window mean depth
    win_sift_dep = defaultdict(list)     # window -> depths in samples_filter order
    wsdep = defaultdict(list)            # sample -> normalized depths
    samples_filter = filter(lambda i: self.chrom_stat[i][chrom].ploid > 0, self.samples)
    if len(samples_filter) < 2:
        return
    nbinom_data = list()
    nbinom_out = open(os.path.join(self.indir, "%s.nbinom.arg" % chrom), 'w')
    chr_cor = open(
        os.path.join(
            self.indir,
            "%s_W%dS%d.cor" % (chrom, self.CorrectWinLen, self.CorrectShiftLen)), 'w')
    ws_deps = {
        sample: os.path.join(
            self.indir, sample,
            "%s.W%dS%d.fixdep" % (chrom, self.CorrectWinLen, self.CorrectShiftLen))
        for sample in samples_filter
    }
    ws_dep = {sample: open(ws_deps[sample], 'w') for sample in samples_filter}
    for sample in samples_filter:
        chrom_d = list()
        fixdeps = pysam.TabixFile(
            os.path.join(self.indir, "{0}/{0}.Fixdep.tsv.gz".format(sample)))
        depth_dict[sample] = fixdeps
        for line in fixdeps.fetch(chrom):
            rows = line.strip().split("\t")
            pos = int(rows[1])
            c_d = int(rows[-1])
            chrom_d.append([pos, c_d])
        depths = DescribeArray(chrom_d, col=1)
        # Vote against low-depth positions: very low depth counts double.
        for (pos, dep) in chrom_d:
            if dep < 0.6 * self.LowDepCut * depths.average:
                pos_filter[pos] += 2
            elif dep < self.LowDepCut * depths.average:
                pos_filter[pos] += 1
    # Drop positions that are low-depth in effectively every sample.
    # NOTE: this mutates the list shared via self.bed[chrom].
    for pos, number in pos_filter.iteritems():
        if number >= len(samples_filter):
            try:
                bed_chrom.remove(pos)
            except ValueError:
                continue
    bed_chrom = list(
        join_ranges(join_numbers(bed_chrom), offset=self.CorrectWinLen))
    # Re-window the surviving ranges and collect per-window mean depths.
    for s, e in bed_chrom:
        if e - s < self.CorrectShiftLen:
            continue
        for win in xrange(s, e, self.CorrectShiftLen):
            end_p = min(win + self.CorrectWinLen - 1, e)
            regions.append((chrom, win, end_p))
            for sample in samples_filter:
                depth = depth_dict[sample]
                # Empty fetch falls back to a single zero-depth dummy record.
                lines = list(depth.fetch(chrom, win, end_p)) or ["-1\t-1\t0\n"]
                mdep = sum(
                    [int(line.strip().split("\t")[-1]) for line in lines]) / float(len(lines))
                mdep = round(mdep, 2)
                sample_dep[sample].append(mdep)
                win_sift_dep[(chrom, win, end_p)].append(mdep)
    mdep_chr = {
        i: sum(j) / len(j)
        for i, j in sample_dep.iteritems() if len(j)
    }
    # Cross-sample Pearson correlation of window depth profiles; samples with
    # mean off-diagonal correlation < 0.6 are excluded for this chromosome.
    cormtx = np.corrcoef([sample_dep[i] for i in samples_filter])
    chr_cor.write(chrom + "\t" + "\t".join(samples_filter) + '\n')
    for sn in range(len(samples_filter)):
        chr_cor.write("\t".join([samples_filter[sn]] + map(str, cormtx[sn])))
        mcor = (cormtx[sn].sum() - 1.0) / (len(samples_filter) - 1.0)
        if mcor < 0.6:
            filterSampleInChrom.add(samples_filter[sn])
            chr_cor.write("\tLow correlation\n")
        else:
            chr_cor.write("\n")
    chr_cor.write("\n")
    if len(filterSampleInChrom) / float(len(samples_filter)) > 0.6:
        # BUGFIX: the original returned here without closing any of the open
        # handles, leaking the nbinom/cor files, every per-sample output file
        # and every TabixFile for this chromosome.
        nbinom_out.close()
        chr_cor.close()
        for sample in samples_filter:
            ws_dep[sample].close()
        for fixdeps in depth_dict.itervalues():
            fixdeps.close()
        return
    # Normalize each window by the cross-sample median of depth ratios.
    for r in regions:
        chrom, start, stop = r
        d = win_sift_dep[r]
        ndr = list()
        for sn in range(len(samples_filter)):
            sample = samples_filter[sn]
            if sample in filterSampleInChrom:
                continue
            control = self.LowDepCut * mdep_chr[sample]
            # BUGFIX: guard mdep_chr[sample] > 0 — the original divided by it
            # unconditionally here and crashed when a sample's chromosome-wide
            # mean depth was zero. (An unused 'dr' list that had this guard
            # was removed as dead code.)
            if mdep_chr[sample] > 0 and d[sn] > control:
                ndr.append(d[sn] / mdep_chr[sample])
        # Need > 3 informative samples for a trustworthy median ratio.
        mdr = np.median(np.array(ndr)) if len(ndr) > 3 else 1.0
        if mdr < self.LowDepCut:
            mdr = 1.0
        for sn in range(len(samples_filter)):
            d[sn] /= mdr
            wsdep[samples_filter[sn]].append(d[sn])
    # Cap outliers at 5x mean, rescale so the mean maps to 60, and write the
    # per-sample corrected window depths.
    for s, d in wsdep.iteritems():
        if s in filterSampleInChrom:
            continue
        tmp_des = DescribeArray(d)
        d = [min(5.0 * tmp_des.average, i) for i in d]
        tmp_des = DescribeArray(d)
        d = [int(i * 60.0 / tmp_des.average + 0.5) for i in d]
        if len(regions) == len(d):
            for i in range(len(regions)):
                reg = "\t".join(map(str, regions[i]))
                ws_dep[s].write("%s\t%i\n" % (reg, d[i]))
                nbinom_data.append(d[i])
    chr_cor.close()
    for s, d in ws_deps.iteritems():
        ws_dep[s].close()
        depth_dict[s].close()
        if os.path.isfile(d):
            _ = pysam.tabix_index(d, seq_col=0, start_col=1, end_col=2, force=True)
    # Fit a negative binomial on all retained corrected depths and persist
    # the parameters for the downstream HMM.
    nbarg = NegativeBinomial(nbinom_data)
    nbarg.nbinom_fit()
    trials = nbarg.trials
    best_probability = nbarg.best_probability
    min_devi = nbarg.mindevi
    nbinom_out.write(
        "\t".join(map(str, [trials, best_probability, min_devi])) + '\n')
    nbinom_out.close()
def win_correct(self, chrom):
    """Build correlation-corrected sliding-window depths for one chromosome.

    NOTE(review): this appears to be an exact duplicate of the win_correct
    definition earlier in this file — confirm and deduplicate.

    For every sample whose ploidy on `chrom` is > 0: reads per-position fixed
    depths, drops positions low-depth in (nearly) all samples, re-windows the
    remainder, median-normalizes each window across samples, and writes
    per-sample fixdep files (tabix indexed), a correlation matrix file, and
    negative-binomial fit parameters.
    """
    pos_filter = defaultdict(int)        # position -> low-depth "votes" across samples
    filterSampleInChrom = set()          # samples excluded for low correlation
    bed_chrom = self.bed[chrom]
    depth_dict = dict()                  # sample -> open pysam.TabixFile
    regions = list()                     # (chrom, start, end) windows
    sample_dep = defaultdict(list)       # sample -> per-window mean depth
    win_sift_dep = defaultdict(list)     # window -> depths in samples_filter order
    wsdep = defaultdict(list)            # sample -> normalized depths
    samples_filter = filter(lambda i: self.chrom_stat[i][chrom].ploid > 0, self.samples)
    if len(samples_filter) < 2:
        return
    nbinom_data = list()
    nbinom_out = open(os.path.join(self.indir, "%s.nbinom.arg" % chrom), 'w')
    chr_cor = open(os.path.join(self.indir, "%s_W%dS%d.cor" % (chrom, self.CorrectWinLen, self.CorrectShiftLen)), 'w')
    ws_deps = {sample: os.path.join(self.indir, sample, "%s.W%dS%d.fixdep" % (chrom, self.CorrectWinLen, self.CorrectShiftLen)) for sample in samples_filter}
    ws_dep = {sample: open(ws_deps[sample], 'w') for sample in samples_filter}
    for sample in samples_filter:
        chrom_d = list()
        fixdeps = pysam.TabixFile(os.path.join(self.indir, "{0}/{0}.Fixdep.tsv.gz".format(sample)))
        depth_dict[sample] = fixdeps
        for line in fixdeps.fetch(chrom):
            rows = line.strip().split("\t")
            pos = int(rows[1])
            c_d = int(rows[-1])
            chrom_d.append([pos, c_d])
        depths = DescribeArray(chrom_d, col=1)
        # Vote against low-depth positions: very low depth counts double.
        for (pos, dep) in chrom_d:
            if dep < 0.6 * self.LowDepCut * depths.average:
                pos_filter[pos] += 2
            elif dep < self.LowDepCut * depths.average:
                pos_filter[pos] += 1
    # Drop positions low-depth in effectively every sample.
    # NOTE(review): this mutates the list shared via self.bed[chrom].
    for pos, number in pos_filter.iteritems():
        if number >= len(samples_filter):
            try:
                bed_chrom.remove(pos)
            except ValueError:
                continue
    bed_chrom = list(join_ranges(join_numbers(bed_chrom), offset=self.CorrectWinLen))
    # Re-window the surviving ranges and collect per-window mean depths.
    for s, e in bed_chrom:
        if e - s < self.CorrectShiftLen:
            continue
        for win in xrange(s, e, self.CorrectShiftLen):
            end_p = min(win + self.CorrectWinLen - 1, e)
            regions.append((chrom, win, end_p))
            for sample in samples_filter:
                depth = depth_dict[sample]
                # Empty fetch falls back to a single zero-depth dummy record.
                lines = list(depth.fetch(chrom, win, end_p)) or ["-1\t-1\t0\n"]
                mdep = sum([int(line.strip().split("\t")[-1]) for line in lines]) / float(len(lines))
                mdep = round(mdep, 2)
                sample_dep[sample].append(mdep)
                win_sift_dep[(chrom, win, end_p)].append(mdep)
    mdep_chr = {i: sum(j) / len(j) for i, j in sample_dep.iteritems() if len(j)}
    # Cross-sample Pearson correlation of window depth profiles; samples with
    # mean off-diagonal correlation < 0.6 are excluded for this chromosome.
    cormtx = np.corrcoef([sample_dep[i] for i in samples_filter])
    chr_cor.write(chrom + "\t" + "\t".join(samples_filter) + '\n')
    for sn in range(len(samples_filter)):
        chr_cor.write("\t".join([samples_filter[sn]] + map(str, cormtx[sn])))
        mcor = (cormtx[sn].sum() - 1.0) / (len(samples_filter) - 1.0)
        if mcor < 0.6:
            filterSampleInChrom.add(samples_filter[sn])
            chr_cor.write("\tLow correlation\n")
        else:
            chr_cor.write("\n")
    chr_cor.write("\n")
    # NOTE(review): returning here leaks all handles opened above.
    if len(filterSampleInChrom) / float(len(samples_filter)) > 0.6:
        return
    # Normalize each window by the cross-sample median of depth ratios.
    for r in regions:
        chrom, start, stop = r
        d = win_sift_dep[r]
        dr = list()  # NOTE(review): built but never read afterwards
        ndr = list()
        for sn in range(len(samples_filter)):
            sample = samples_filter[sn]
            if sample in filterSampleInChrom:
                continue
            contorl = self.LowDepCut * mdep_chr[sample]
            # Conditional expression used as a statement: appends only when
            # the chromosome-wide mean depth is positive.
            dr.append(d[sn] / mdep_chr[sample]) if mdep_chr[sample] > 0 else 0
            # NOTE(review): no zero guard here — divides by zero if
            # mdep_chr[sample] == 0 while d[sn] > 0; confirm impossible.
            if d[sn] > contorl:
                ndr.append(d[sn] / mdep_chr[sample])
        # Need > 3 informative samples for a trustworthy median ratio.
        mdr = np.median(np.array(ndr)) if len(ndr) > 3 else 1.0
        if mdr < self.LowDepCut:
            mdr = 1.0
        for sn in range(len(samples_filter)):
            d[sn] /= mdr
            wsdep[samples_filter[sn]].append(d[sn])
    # Cap outliers at 5x mean, rescale so the mean maps to 60, and write the
    # per-sample corrected window depths.
    for s, d in wsdep.iteritems():
        if s in filterSampleInChrom:
            continue
        tmp_des = DescribeArray(d)
        d = [min(5.0 * tmp_des.average, i) for i in d]
        tmp_des = DescribeArray(d)
        d = [int(i * 60.0 / tmp_des.average + 0.5) for i in d]
        if len(regions) == len(d):
            for i in range(len(regions)):
                reg = "\t".join(map(str, regions[i]))
                ws_dep[s].write("%s\t%i\n" % (reg, d[i]))
                nbinom_data.append(d[i])
    chr_cor.close()
    for s, d in ws_deps.iteritems():
        ws_dep[s].close()
        depth_dict[s].close()
        if os.path.isfile(d):
            _ = pysam.tabix_index(d, seq_col=0, start_col=1, end_col=2, force=True)
    # Fit a negative binomial on all retained corrected depths and persist
    # the parameters for the downstream HMM.
    nbarg = NegativeBinomial(nbinom_data)
    nbarg.nbinom_fit()
    trials = nbarg.trials
    best_probability = nbarg.best_probability
    min_devi = nbarg.mindevi
    nbinom_out.write("\t".join(map(str, [trials, best_probability, min_devi])) + '\n')
    nbinom_out.close()