# Imports the code below visibly relies on.  Helpers such as SaveLoad,
# smart_open, cpu, CNVdata, DescribeArray, unique_rows, RegionAnalysis,
# CNVAnnotation, HGVS, _chrom_valued, the _AnnotationDB registry and the
# pos_gc / win_gc globals are defined elsewhere in the package.
import os
from collections import defaultdict
from glob import glob
from multiprocessing import Pool

import numpy as np
import pysam
# lowess() is assumed to be the statsmodels smoother; the call in gc_correct()
# matches its lowess(endog, exog, frac=...) signature.
from statsmodels.nonparametric.smoothers_lowess import lowess


def __init__(self, **kwargs):
    self.LowDepCut = float(kwargs["low_dep_cut"])
    self.CorrectWinLen = int(kwargs["correct_win_len"])
    self.CorrectShiftLen = int(kwargs["correct_shift_len"])
    chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
    s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
    self.chrom_stat = chroms.load()
    self.samples = sorted(self.chrom_stat.keys())
    self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(),
                          key=lambda x: _chrom_valued(x))
    if s_chrom is not None and len(s_chrom):
        # restrict the contig list to the user-supplied chromosomes
        self.contigs = filter(lambda x: x in s_chrom, self.contigs)
    self.indir = os.path.abspath(kwargs["indir"])
    self.bed = dict()
    regions = defaultdict(list)
    with open(os.path.abspath(kwargs["region"]), 'r') as bed:
        for line in bed:
            if line.startswith("#"):
                continue
            rows = line.strip().split("\t")
            if len(rows) < 3:
                continue
            chrom = str(rows[0])
            if chrom not in self.contigs:
                continue
            start = int(rows[1])
            stop = int(rows[2])
            # collect every position covered by the BED interval
            regions[chrom].extend(range(start, stop + 1))
    for chrom in self.contigs:
        self.bed[chrom] = sorted(regions[chrom])
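# The contig ordering above relies on a _chrom_valued() helper that is not
# shown in this excerpt.  A minimal sketch of such a sort key, assuming the
# usual chr1..chr22 / chrX / chrY / chrM naming, could look like this
# (hypothetical implementation, not the pipeline's own):
def _chrom_valued_sketch(chrom):
    """Map a chromosome name to a sortable numeric value."""
    name = chrom.lower().replace("chr", "")
    special = {"x": 23, "y": 24, "m": 25, "mt": 25}
    if name in special:
        return special[name]
    try:
        return int(name)
    except ValueError:
        # unknown contigs (e.g. unplaced scaffolds) sort after the canonical ones
        return 1000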
def bedAnalysis(**kwargs):
    global pos_gc, win_gc
    bed = os.path.abspath(kwargs["bed"])
    reference = os.path.abspath(kwargs["reference"])
    db = os.path.abspath(kwargs["db"])
    outdir = os.path.abspath(kwargs["outdir"])
    winlen = int(kwargs["winlen"]) if "winlen" in kwargs else 200
    siftlen = int(kwargs["siftlen"]) if "siftlen" in kwargs else 20
    depth_f = [os.path.abspath(i) for i in kwargs["depthfile"].split(",")
               if os.path.isfile(i)]
    model = RegionAnalysis(reference, db)
    bed_gc_out = SaveLoad(os.path.join(outdir, "win.gc"))
    pos_gc_out = SaveLoad(os.path.join(outdir, "pos.gc"))
    chrom_stat = SaveLoad(os.path.join(outdir, "chrom.stat"))
    with smart_open(bed) as f_in:
        for line in f_in:
            rows = line.strip().split("\t")
            chrom = str(rows[0])
            if chrom not in pos_gc:
                pos_gc[chrom] = dict()
            start = int(rows[1])
            stop = int(rows[2]) + 1
            try:
                model.analysis(chrom, start, stop, winlen, siftlen)
            except ValueError:
                continue
    bed_gc_out.save(win_gc)
    pos_gc_out.save(pos_gc)
    chrom_stat.save(model.chrom_stat(depth_f))
    model.__del__()
    return bed_gc_out.fname, pos_gc_out.fname, chrom_stat.fname
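# A hedged example of how bedAnalysis() might be invoked; the file names and
# the module-level pos_gc / win_gc dicts are illustrative assumptions, not
# part of the original excerpt:
#
#     pos_gc, win_gc = dict(), dict()
#     win_f, pos_f, stat_f = bedAnalysis(
#         bed="targets.bed",
#         reference="hg19.fa",
#         db="gc.db",
#         outdir="out",
#         depthfile="sampleA.depth.gz,sampleB.depth.gz",
#         winlen=200,
#         siftlen=20)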
def run(self, debug=False):
    # run the per-contig window correction workers in parallel
    pool = Pool(processes=cpu(use_mem=3221225472,
                              cpu_limit=len(self.contigs)))
    for chrom in self.contigs:
        pool.apply_async(self.win_correct, args=(chrom, ))
    pool.close()
    pool.join()
    for chrom in self.contigs:
        cnvdata = dict()
        nbarg = os.path.join(self.indir, "%s.nbinom.arg" % chrom)
        if os.path.isfile(nbarg):
            f_in = smart_open(nbarg)
            trials, best_probability, devi = f_in.readline().strip().split("\t")
            f_in.close()
            if not debug:
                os.remove(nbarg)
        else:
            continue
        for sample in self.samples:
            cnvdata[sample] = CNVdata()
            cnvdata[sample].trials = int(trials)
            cnvdata[sample].best_probability = float(best_probability)
            cnvdata[sample].min_devi = float(devi)
            cnvdata[sample].ploid = int(self.chrom_stat[sample][chrom].ploid)
            cnvdata[sample].regions = list()
            cnvdata[sample].data = list()
            dep_data = os.path.join(
                self.indir, sample, "%s.W%dS%d.fixdep.gz" %
                (chrom, self.CorrectWinLen, self.CorrectShiftLen))
            if not os.path.isfile(dep_data):
                continue
            with smart_open(dep_data) as f_in:
                for line in f_in:
                    if line.startswith("#"):
                        continue
                    # keep the outer contig loop variable intact
                    win_chrom, start, stop, deps = line.strip().split("\t")
                    cnvdata[sample].regions.append(
                        [win_chrom, int(start), int(stop)])
                    cnvdata[sample].data.append(int(deps))
        c_stat = SaveLoad(os.path.join(self.indir, "%s.cnv.args" % chrom))
        c_stat.save(cnvdata)
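# The per-chromosome "<chrom>.nbinom.arg" file parsed above is a single
# tab-separated line holding the fitted negative-binomial parameters, e.g.
# (illustrative values only):
#
#     120	0.35	0.012
#
# i.e. trials, best_probability and devi (stored as min_devi), which run()
# copies into every sample's CNVdata record for that chromosome.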
def __init__(self, **kwargs):
    self.outdir = os.path.abspath(kwargs["indir"])
    self.win_len = int(kwargs["correct_win_len"]) or 30
    self.shift_len = int(kwargs["correct_shift_len"]) or 25
    self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
    chroms = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
    samples = str(kwargs["sample"]).split(",") if kwargs["sample"] else None
    all_samples = set()
    contigs = list()
    self.cnvdata = defaultdict(dict)
    self.sample_win_data = defaultdict(dict)
    for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
        chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
        if chroms is not None and chrom not in chroms:
            continue
        cnvdata = SaveLoad(cnv_data)
        cnvdata = cnvdata.load()
        contigs.append(chrom)
        for sample in cnvdata.keys():
            dep_f = os.path.join(
                self.outdir, sample, "%s.W%iS%i.fixdep.gz" %
                (chrom, self.win_len, self.shift_len))
            if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
                self.sample_win_data[chrom][sample] = dep_f
            if samples is not None and sample not in samples:
                continue
            all_samples.add(sample)
            self.cnvdata[sample][chrom] = cnvdata[sample]
    self.samples = sorted(all_samples)
    self.contigs = sorted(contigs, key=lambda x: _chrom_valued(x))
    databases = os.path.abspath(kwargs["dbdir"])
    t_db = os.path.abspath(kwargs["transdb"]) if "transdb" in kwargs else \
        os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
    for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
        db = os.path.abspath(db)
        dbname = os.path.basename(os.path.dirname(db))
        _AnnotationDB[dbname].add(db)
    self.reference = os.path.abspath(kwargs["reference"]) if kwargs["reference"] else \
        os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
    self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
    self.HGVS = HGVS(t_db)
def gc_correct(**kwargs):
    depthf = os.path.abspath(kwargs["input"])
    if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
        return
    sample = str(kwargs['sample']) if kwargs['sample'] else \
        os.path.basename(depthf).split(".")[0]
    outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
    wins = SaveLoad(os.path.abspath(kwargs["wingc"]))
    wingc = wins.load()
    poss = SaveLoad(os.path.abspath(kwargs["posgc"]))
    posgc = poss.load()
    chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
    chrom_stat = chroms.load()
    f_out = smart_open(out, 'w')
    f_out.writelines("#Chrom\tPos\tFixDepth\n")
    gc_depth = list()
    dep_f = pysam.TabixFile(depthf)
    # Collect (GC content, mean depth) pairs per window, normalising the mean
    # depth to a diploid scale and capping it at 6x the chromosome average.
    for rows, gc_content in sorted(
            wingc.iteritems(),
            key=lambda x: (_chrom_valued(x[0][0]), x[0][1])):
        chrom = rows[0]
        start = rows[1]
        stop = rows[2] - 1
        # skip chromosomes without usable coverage statistics
        if chrom not in chrom_stat[sample] or \
                chrom_stat[sample][chrom].average < 1:
            continue
        try:
            depths = [int(line.strip().split("\t")[-1])
                      for line in dep_f.fetch(chrom, start, stop)]
            win_mean_dep = min(sum(depths) / float(len(depths)),
                               6.0 * chrom_stat[sample][chrom].average)
            win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
        except Exception:
            win_mean_dep = 0.0
        gc_depth.append([gc_content, win_mean_dep])
    # Fit a LOWESS curve of depth against GC and turn it into a lookup table
    # of correction factors (loe), indexed by int(GC * 10000).
    gc_depth = DescribeArray(gc_depth, col=1)
    gcdep = gc_depth.array[gc_depth.array[:, 1] > 0.05 * gc_depth.median]
    prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
    mdp = np.median(prd[:, 1])
    if mdp <= 0.0:
        raise ValueError("Sample %s depth file Error !" % depthf)
    lgc = gcl = max(10000, int(prd[:, 0].max() * 10000))
    loe = [-0.0001] * gcl
    gcj = 0
    for gc, dp in prd:
        gcj = int(round(gc, 4) * 10000)
        if gcj < gcl:
            gcl = gcj
        loe[gcj] = mdp / float(dp) if dp > 0 else 1.0
    # Fill bins below the smallest fitted GC value with that value, capped at 10x.
    for gc in xrange(gcl):
        loe[gc] = min(loe[gcl], 10.0)
    # Fill bins the fit did not cover with a weighted average of the nearest
    # fitted neighbours.
    for i in xrange(gcl + 1, gcj):
        if loe[i] < 0:
            ls = i - 1
            lv = loe[i - 1]
            rs = i + 1
            while rs < len(loe) and loe[rs] < 0:
                rs += 1
            rv = loe[rs]
            loe[i] = min((lv + (rs - float(ls)) * rv) /
                         (rs - float(ls) + 1.0), 10.0)
    # Extend the last fitted value to the high-GC end of the table.
    for i in xrange(gcj + 1, lgc):
        loe[i] = min(loe[gcj], 10.0)
    # Rescale every per-base depth by the correction factor of its GC bin.
    for line in dep_f.fetch():
        rows = line.strip().split("\t")
        chrom = str(rows[0])
        pos = int(rows[1])
        deps = int(rows[-1])
        try:
            fixdeps = int(deps * loe[int(round(posgc[chrom][pos], 4) * 10000)])
        except KeyError:
            continue
        f_out.writelines("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')
    f_out.close()
    dep_f.close()
    _ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)
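# gc_correct() consumes the three pickled tables produced by bedAnalysis()
# plus a tabix-indexed per-base depth file.  A hedged, illustrative call
# (file names are assumptions):
#
#     gc_correct(
#         input="sampleA.depth.gz",   # sampleA.depth.gz.tbi must sit next to it
#         sample="sampleA",
#         outdir="out",
#         wingc="out/win.gc",
#         posgc="out/pos.gc",
#         chromstat="out/chrom.stat")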