Ejemplo n.º 1
0
	def __init__(self, **kwargs):
		"""Load run parameters, chromosome statistics and the target regions.

		Keyword arguments (each is read with ``kwargs[...]``, so all are required):
			low_dep_cut: stored as ``self.LowDepCut`` after ``float()``.
			correct_win_len: stored as ``self.CorrectWinLen`` after ``int()``.
			correct_shift_len: stored as ``self.CorrectShiftLen`` after ``int()``.
			chromstat: path given to ``SaveLoad``; the loaded object is used as
				``chrom_stat[sample][contig]``.
			chrom: comma-separated contig names to keep, or a falsy value to
				keep every contig.
			indir: directory, stored as an absolute path in ``self.indir``.
			region: tab-separated region file with contig, start and stop in
				the first three columns.

		Afterwards ``self.bed[contig]`` is the sorted list of all positions
		``start..stop`` (both ends included) of the regions on that contig;
		overlapping regions contribute their positions more than once.
		"""
		self.LowDepCut = float(kwargs["low_dep_cut"])
		self.CorrectWinLen = int(kwargs["correct_win_len"])
		self.CorrectShiftLen = int(kwargs["correct_shift_len"])
		chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
		s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
		self.chrom_stat = chroms.load()
		self.samples = sorted(self.chrom_stat.keys())
		# Contig names are taken from the first sample (in sorted order) only.
		self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(), key=_chrom_valued)
		if s_chrom:
			# Build a real list: ``filter`` returns a one-shot iterator on
			# Python 3, but the contigs are scanned several times below.
			wanted = set(s_chrom)
			self.contigs = [contig for contig in self.contigs if contig in wanted]
		self.indir = os.path.abspath(kwargs["indir"])
		self.bed = dict()
		regions = defaultdict(list)
		# Set lookup instead of scanning the contig list for every input line.
		known_contigs = set(self.contigs)
		# ``with`` guarantees the region file is closed, even on a parse error.
		with open(os.path.abspath(kwargs["region"]), 'r') as bed:
			for line in bed:
				if line.startswith("#"):
					continue
				rows = line.strip().split("\t")
				if len(rows) < 3:
					continue
				chrom = str(rows[0])
				if chrom not in known_contigs:
					continue
				start = int(rows[1])
				stop = int(rows[2])
				# Both ends are included in the stored positions.
				regions[chrom].extend(range(start, stop + 1))
		for chrom in self.contigs:
			# Contigs without any region get an empty list.
			self.bed[chrom] = sorted(regions[chrom])
Ejemplo n.º 2
0
 def __init__(self, **kwargs):
     """Load run parameters, chromosome statistics and the target regions.

     Keyword arguments (each is read with ``kwargs[...]``, so all are
     required):
         low_dep_cut: stored as ``self.LowDepCut`` after ``float()``.
         correct_win_len: stored as ``self.CorrectWinLen`` after ``int()``.
         correct_shift_len: stored as ``self.CorrectShiftLen`` after
             ``int()``.
         chromstat: path given to ``SaveLoad``; the loaded object is used
             as ``chrom_stat[sample][contig]``.
         chrom: comma-separated contig names to keep, or a falsy value to
             keep every contig.
         indir: directory, stored as an absolute path in ``self.indir``.
         region: tab-separated region file with contig, start and stop in
             the first three columns.

     Afterwards ``self.bed[contig]`` is the sorted list of all positions
     ``start..stop`` (both ends included) of the regions on that contig;
     overlapping regions contribute their positions more than once.
     """
     self.LowDepCut = float(kwargs["low_dep_cut"])
     self.CorrectWinLen = int(kwargs["correct_win_len"])
     self.CorrectShiftLen = int(kwargs["correct_shift_len"])
     chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
     s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
     self.chrom_stat = chroms.load()
     self.samples = sorted(self.chrom_stat.keys())
     # Contig names are taken from the first sample (in sorted order) only.
     self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(),
                           key=_chrom_valued)
     if s_chrom:
         # Build a real list: ``filter`` returns a one-shot iterator on
         # Python 3, but the contigs are scanned several times below.
         wanted = set(s_chrom)
         self.contigs = [
             contig for contig in self.contigs if contig in wanted
         ]
     self.indir = os.path.abspath(kwargs["indir"])
     self.bed = dict()
     regions = defaultdict(list)
     # Set lookup instead of scanning the contig list for every input line.
     known_contigs = set(self.contigs)
     # ``with`` guarantees the region file is closed, even on a parse error.
     with open(os.path.abspath(kwargs["region"]), 'r') as bed:
         for line in bed:
             if line.startswith("#"):
                 continue
             rows = line.strip().split("\t")
             if len(rows) < 3:
                 continue
             chrom = str(rows[0])
             if chrom not in known_contigs:
                 continue
             start = int(rows[1])
             stop = int(rows[2])
             # Both ends are included in the stored positions.
             regions[chrom].extend(range(start, stop + 1))
     for chrom in self.contigs:
         # Contigs without any region get an empty list.
         self.bed[chrom] = sorted(regions[chrom])
Ejemplo n.º 3
0
def bedAnalysis(**kwargs):
    """Run the region analysis over a BED file and save the results.

    Keyword arguments:
        bed: tab-separated region file (contig, start, stop in the first
            three columns). Lines starting with ``#`` and lines with fewer
            than three columns are skipped.
        reference, db: paths passed to ``RegionAnalysis``.
        outdir: directory receiving ``win.gc``, ``pos.gc`` and
            ``chrom.stat``.
        winlen: optional; converted with ``int``, defaults to 200.
        siftlen: optional; converted with ``int``, defaults to 20.
        depthfile: comma-separated depth files; entries that are not
            existing files are dropped.

    Returns:
        The ``fname`` attributes of the three ``SaveLoad`` outputs, in the
        order (win.gc, pos.gc, chrom.stat).

    Note:
        Relies on the module-level ``pos_gc`` and ``win_gc`` containers.
        Only ``pos_gc`` is touched directly here; presumably
        ``model.analysis`` fills both -- confirm against ``RegionAnalysis``.
    """
    global pos_gc, win_gc
    bed = os.path.abspath(kwargs["bed"])
    reference = os.path.abspath(kwargs["reference"])
    db = os.path.abspath(kwargs["db"])
    outdir = os.path.abspath(kwargs["outdir"])
    winlen = int(kwargs.get("winlen", 200))
    siftlen = int(kwargs.get("siftlen", 20))
    depth_f = [
        os.path.abspath(path) for path in kwargs["depthfile"].split(",")
        if os.path.isfile(path)
    ]
    model = RegionAnalysis(reference, db)
    bed_gc_out = SaveLoad(os.path.join(outdir, "win.gc"))
    pos_gc_out = SaveLoad(os.path.join(outdir, "pos.gc"))
    chrom_stat = SaveLoad(os.path.join(outdir, "chrom.stat"))
    with smart_open(bed) as f_in:
        for line in f_in:
            # Header/comment and blank or truncated lines carry no region;
            # without this guard they crash on rows[1] / int() or leave a
            # junk key in pos_gc.
            if line.startswith("#"):
                continue
            rows = line.strip().split("\t")
            if len(rows) < 3:
                continue
            chrom = str(rows[0])
            if chrom not in pos_gc:
                pos_gc[chrom] = dict()
            start = int(rows[1])
            # The stop handed to the model is one past the BED stop column.
            stop = int(rows[2]) + 1
            try:
                model.analysis(chrom, start, stop, winlen, siftlen)
            except ValueError:
                # Regions the model rejects are skipped, not fatal.
                continue
    bed_gc_out.save(win_gc)
    pos_gc_out.save(pos_gc)
    chrom_stat.save(model.chrom_stat(depth_f))
    # Explicit finalizer call kept from the original code; presumably it
    # releases handles held by RegionAnalysis -- TODO confirm.
    model.__del__()
    return bed_gc_out.fname, pos_gc_out.fname, chrom_stat.fname
Ejemplo n.º 4
0
 def run(self, debug=False):
     """Correct every contig in parallel, then bundle the CNV inputs.

     ``self.win_correct`` is submitted once per contig to a process pool.
     Afterwards, for each contig that has a ``<contig>.nbinom.arg`` file in
     ``self.indir``, the three tab-separated values on its first line are
     read and one ``CNVdata`` record per sample is filled from them, from
     ``self.chrom_stat`` and from the sample's ``fixdep.gz`` windows. The
     records of a contig are saved together as ``<contig>.cnv.args``.

     Args:
         debug: when true, the ``.nbinom.arg`` files are kept instead of
             being removed after they have been read.
     """
     pool = Pool(
         processes=cpu(use_mem=3221225472, cpu_limit=len(self.contigs)))
     for chrom in self.contigs:
         # NOTE(review): the async result is discarded, so (assuming this is
         # multiprocessing.Pool) a worker exception is not re-raised here; a
         # failed contig only shows up as a missing .nbinom.arg file below.
         pool.apply_async(self.win_correct, args=(chrom, ))
     pool.close()
     pool.join()
     for chrom in self.contigs:
         nbarg = os.path.join(self.indir, "%s.nbinom.arg" % chrom)
         if not os.path.isfile(nbarg):
             continue
         # ``with`` closes the file even if the line does not unpack.
         with smart_open(nbarg) as f_in:
             trials, best_probability, devi = (
                 f_in.readline().strip().split("\t"))
         if not debug:
             os.remove(nbarg)
         cnvdata = dict()
         for sample in self.samples:
             record = CNVdata()
             record.trials = int(trials)
             record.best_probability = float(best_probability)
             record.min_devi = float(devi)
             record.ploid = int(self.chrom_stat[sample][chrom].ploid)
             record.regions = list()
             record.data = list()
             cnvdata[sample] = record
             dep_data = os.path.join(
                 self.indir, sample, "%s.W%dS%d.fixdep.gz" %
                 (chrom, self.CorrectWinLen, self.CorrectShiftLen))
             if not os.path.isfile(dep_data):
                 # The sample keeps a record with empty regions/data.
                 continue
             with smart_open(dep_data) as f_in:
                 for line in f_in:
                     if line.startswith("#"):
                         continue
                     # Unpack into ``win_chrom``: reusing ``chrom`` here
                     # would overwrite the contig being processed, which
                     # is still needed for the next sample's lookups and
                     # for the output file name.
                     win_chrom, start, stop, deps = line.strip().split(
                         "\t")
                     start = int(start)
                     stop = int(stop)
                     deps = int(deps)
                     record.regions.append([win_chrom, start, stop])
                     record.data.append(deps)
         c_stat = SaveLoad(os.path.join(self.indir, "%s.cnv.args" % chrom))
         c_stat.save(cnvdata)
Ejemplo n.º 5
0
	def run(self, debug=False):
		"""Correct every contig in parallel, then bundle the CNV inputs.

		``self.win_correct`` is submitted once per contig to a process pool.
		Afterwards, for each contig that has a ``<contig>.nbinom.arg`` file in
		``self.indir``, the three tab-separated values on its first line are
		read and one ``CNVdata`` record per sample is filled from them, from
		``self.chrom_stat`` and from the sample's ``fixdep.gz`` windows. The
		records of a contig are saved together as ``<contig>.cnv.args``.

		Args:
			debug: when true, the ``.nbinom.arg`` files are kept instead of
				being removed after they have been read.
		"""
		pool = Pool(processes=cpu(use_mem=3221225472, cpu_limit=len(self.contigs)))
		for chrom in self.contigs:
			# NOTE(review): the async result is discarded, so (assuming this is
			# multiprocessing.Pool) a worker exception is not re-raised here; a
			# failed contig only shows up as a missing .nbinom.arg file below.
			pool.apply_async(self.win_correct, args=(chrom,))
		pool.close()
		pool.join()
		for chrom in self.contigs:
			nbarg = os.path.join(self.indir, "%s.nbinom.arg" % chrom)
			if not os.path.isfile(nbarg):
				continue
			# ``with`` closes the file even if the line does not unpack.
			with smart_open(nbarg) as f_in:
				trials, best_probability, devi = f_in.readline().strip().split("\t")
			if not debug:
				os.remove(nbarg)
			cnvdata = dict()
			for sample in self.samples:
				record = CNVdata()
				record.trials = int(trials)
				record.best_probability = float(best_probability)
				record.min_devi = float(devi)
				record.ploid = int(self.chrom_stat[sample][chrom].ploid)
				record.regions = list()
				record.data = list()
				cnvdata[sample] = record
				dep_data = os.path.join(self.indir, sample,
				                        "%s.W%dS%d.fixdep.gz" % (chrom, self.CorrectWinLen, self.CorrectShiftLen))
				if not os.path.isfile(dep_data):
					# The sample keeps a record with empty regions/data.
					continue
				with smart_open(dep_data) as f_in:
					for line in f_in:
						if line.startswith("#"):
							continue
						# Unpack into ``win_chrom``: reusing ``chrom`` here would
						# overwrite the contig being processed, which is still
						# needed for the next sample's lookups and for the
						# output file name.
						win_chrom, start, stop, deps = line.strip().split("\t")
						start = int(start)
						stop = int(stop)
						deps = int(deps)
						record.regions.append([win_chrom, start, stop])
						record.data.append(deps)
			c_stat = SaveLoad(os.path.join(self.indir, "%s.cnv.args" % chrom))
			c_stat.save(cnvdata)
Ejemplo n.º 6
0
	def __init__(self, **kwargs):
		"""Collect the saved per-contig CNV inputs and set up annotation.

		Keyword arguments:
			indir: directory scanned for ``chr*.cnv.args`` files; stored as
				``self.outdir``.
			correct_win_len, correct_shift_len: converted with ``int``; a
				value of 0 is replaced by 30 and 25 respectively.
			chrom: comma-separated contig names to keep, or falsy for all.
			sample: comma-separated sample names to keep, or falsy for all.
			dbdir: database root; every ``*/*.cnvdb.config`` below it is
				registered in the module-level ``_AnnotationDB`` under the
				name of its parent directory.
			transdb: optional path handed to ``HGVS``; when missing or falsy,
				``<dbdir>/transdb/ncbi_anno_rel104.dbref.db`` is used.
			reference: reference path; when falsy,
				``<dbdir>/aln_db/hg19/hg19_chM_male_mask.fa`` is used.
		"""
		self.outdir = os.path.abspath(kwargs["indir"])
		# ``or`` only replaces a parsed 0; a None value still fails in int().
		self.win_len = int(kwargs["correct_win_len"]) or 30
		self.shift_len = int(kwargs["correct_shift_len"]) or 25
		# 100 / shift_len rounded to the nearest integer, plus one.
		self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
		chroms = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
		samples = str(kwargs["sample"]).split(",") if kwargs["sample"] else None
		all_samples = set()
		contigs = list()
		self.cnvdata = defaultdict(dict)
		self.sample_win_data = defaultdict(dict)
		for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
			# "<contig>.cnv.args" -> "<contig>"; dots inside the contig name are kept.
			chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
			if chroms is not None and chrom not in chroms:
				continue
			loaded = SaveLoad(cnv_data).load()
			contigs.append(chrom)
			for sample in loaded.keys():
				dep_f = os.path.join(self.outdir, sample, "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
				# Window depth files are recorded for every sample in the
				# file, before the sample filter below is applied.
				if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
					self.sample_win_data[chrom][sample] = dep_f
				if samples is not None and sample not in samples:
					continue
				all_samples.add(sample)
				self.cnvdata[sample][chrom] = loaded[sample]
		self.samples = sorted(all_samples)
		self.contigs = sorted(contigs, key=_chrom_valued)
		databases = os.path.abspath(kwargs["dbdir"])
		# Test the value, not just the key: a ``transdb`` that is present but
		# None/empty would otherwise be passed to os.path.abspath. This matches
		# how ``reference`` is handled below.
		t_db = kwargs.get("transdb")
		if t_db:
			t_db = os.path.abspath(t_db)
		else:
			t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
		for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
			db = os.path.abspath(db)
			dbname = os.path.basename(os.path.dirname(db))
			_AnnotationDB[dbname].add(db)
		if kwargs["reference"]:
			self.reference = os.path.abspath(kwargs["reference"])
		else:
			self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
		self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
		self.HGVS = HGVS(t_db)
Ejemplo n.º 7
0
def gc_correct(**kwargs):
	"""Write a GC-corrected per-position depth table for one sample.

	A LOWESS curve of window mean depth against window GC content is fitted
	and turned into a table of correction factors (median fitted depth divided
	by fitted depth), indexed by ``int(round(gc, 4) * 10000)``. Every depth in
	the input is multiplied by the factor of the GC value stored for its
	position, and the result is written to
	``<outdir>/<sample>/<sample>.Fixdep.tsv``, which is then tabix-indexed.

	Keyword arguments:
		input: tabix-indexed depth file. Nothing is done (``None`` is
			returned) unless both it and ``<input>.tbi`` exist.
		sample: sample name; when falsy, the part of the input's base name
			before the first dot is used.
		outdir: parent output directory; the sample sub-directory is created
			when missing.
		wingc, posgc, chromstat: paths loaded through ``SaveLoad``. They are
			used as ``wingc[(chrom, start, stop)]``, ``posgc[chrom][pos]``
			and ``chrom_stat[sample][chrom]``.

	Raises:
		ValueError: if the median of the fitted depths is not positive.
	"""
	depthf = os.path.abspath(kwargs["input"])
	if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
		return
	sample = str(kwargs['sample']) if kwargs['sample'] else os.path.basename(depthf).split(".")[0]
	outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
	if not os.path.exists(outdir):
		os.makedirs(outdir)
	out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
	wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
	posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
	chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()
	f_out = smart_open(out, 'w')
	dep_f = None
	# try/finally so both handles are released when the fit or a lookup fails.
	try:
		f_out.write("#Chrom\tPos\tFixDepth\n")
		dep_f = pysam.TabixFile(depthf)
		gc_depth = list()
		for rows, gc_content in sorted(wingc.iteritems(), key=lambda x: (_chrom_valued(x[0][0]), x[0][1])):
			chrom = rows[0]
			start = rows[1]
			stop = rows[2] - 1
			# NOTE(review): the second test compares the per-contig stat object
			# itself with 1; kept as is -- confirm this is the intended check.
			if chrom not in chrom_stat[sample] or chrom_stat[sample][chrom] < 1:
				continue
			try:
				depths = [int(line.strip().split("\t")[-1]) for line in dep_f.fetch(chrom, start, stop)]
				# Cap at 6x the contig average, then scale by 2 / ploid.
				win_mean_dep = min(sum(depths) / float(len(depths)), 6.0 * chrom_stat[sample][chrom].average)
				win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
			except Exception:
				# Best effort: any failure (e.g. no depth rows, which gives a
				# division by zero) records the window with depth 0.
				win_mean_dep = 0.0
			gc_depth.append([gc_content, win_mean_dep])
		gc_depth = DescribeArray(gc_depth, col=1)
		# Keep only windows whose depth exceeds 5% of the reported median.
		gcdep = gc_depth.array[gc_depth.array[:, 1] > 0.05 * gc_depth.median]
		prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
		mdp = np.median(prd[:, 1])
		if mdp <= 0.0:
			raise ValueError("Sample %s depth file Error !" % depthf)
		# One slot per GC step of 1e-4. The extra "+ 1" gives a GC value of
		# exactly 1.0 (index 10000) a slot instead of an IndexError.
		lgc = gcl = max(10000, int(prd[:, 0].max() * 10000)) + 1
		loe = [-0.0001] * lgc  # negative marks "no factor yet"
		gcj = 0
		for gc_value, dp in prd:
			gcj = int(round(gc_value, 4) * 10000)
			if gcj < gcl:
				gcl = gcj  # smallest fitted index
			loe[gcj] = mdp / float(dp) if dp > 0 else 1.0
		# Below the smallest fitted index: reuse its factor, capped at 10.
		low_fill = min(loe[gcl], 10.0)
		for i in xrange(gcl):
			loe[i] = low_fill
		# Fill the gaps between the smallest and the last fitted index.
		for i in xrange(gcl + 1, gcj):
			if loe[i] < 0:
				ls = i - 1
				lv = loe[ls]
				rs = i + 1
				# Bound check first; the scan stops at loe[gcj] at the latest,
				# which is never negative.
				while rs < len(loe) and loe[rs] < 0:
					rs += 1
				rv = loe[rs]
				# Blend of the left neighbour and the next known value on the
				# right, the latter weighted by the distance (rs - ls).
				loe[i] = min((lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0), 10.0)
		# Above the last fitted index: reuse its factor, capped at 10.
		high_fill = min(loe[gcj], 10.0)
		for i in xrange(gcj + 1, lgc):
			loe[i] = high_fill
		for line in dep_f.fetch():
			rows = line.strip().split("\t")
			chrom = str(rows[0])
			pos = int(rows[1])
			deps = int(rows[-1])
			try:
				fixdeps = int(deps * loe[int(round(posgc[chrom][pos], 4) * 10000)])
			except KeyError:
				# Positions without a stored GC value are not written.
				continue
			f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')
	finally:
		f_out.close()
		if dep_f is not None:
			dep_f.close()
	_ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)
Ejemplo n.º 8
0
def gc_correct(**kwargs):
    """Write a GC-corrected per-position depth table for one sample.

    A LOWESS curve of window mean depth against window GC content is fitted
    and turned into a table of correction factors (median fitted depth
    divided by fitted depth), indexed by ``int(round(gc, 4) * 10000)``.
    Every depth in the input is multiplied by the factor of the GC value
    stored for its position, and the result is written to
    ``<outdir>/<sample>/<sample>.Fixdep.tsv``, which is then tabix-indexed.

    Keyword arguments:
        input: tabix-indexed depth file. Nothing is done (``None`` is
            returned) unless both it and ``<input>.tbi`` exist.
        sample: sample name; when falsy, the part of the input's base name
            before the first dot is used.
        outdir: parent output directory; the sample sub-directory is
            created when missing.
        wingc, posgc, chromstat: paths loaded through ``SaveLoad``. They
            are used as ``wingc[(chrom, start, stop)]``,
            ``posgc[chrom][pos]`` and ``chrom_stat[sample][chrom]``.

    Raises:
        ValueError: if the median of the fitted depths is not positive.
    """
    depthf = os.path.abspath(kwargs["input"])
    if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
        return
    sample = str(kwargs['sample']) if kwargs['sample'] else os.path.basename(
        depthf).split(".")[0]
    outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
    wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
    posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
    chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()
    f_out = smart_open(out, 'w')
    dep_f = None
    # try/finally so both handles are released when the fit or a lookup
    # fails.
    try:
        f_out.write("#Chrom\tPos\tFixDepth\n")
        dep_f = pysam.TabixFile(depthf)
        gc_depth = list()
        for rows, gc_content in sorted(wingc.iteritems(),
                                       key=lambda x:
                                       (_chrom_valued(x[0][0]), x[0][1])):
            chrom = rows[0]
            start = rows[1]
            stop = rows[2] - 1
            # NOTE(review): the second test compares the per-contig stat
            # object itself with 1; kept as is -- confirm this is the
            # intended check.
            if (chrom not in chrom_stat[sample]
                    or chrom_stat[sample][chrom] < 1):
                continue
            try:
                depths = [
                    int(line.strip().split("\t")[-1])
                    for line in dep_f.fetch(chrom, start, stop)
                ]
                # Cap at 6x the contig average, then scale by 2 / ploid.
                win_mean_dep = min(
                    sum(depths) / float(len(depths)),
                    6.0 * chrom_stat[sample][chrom].average)
                win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
            except Exception:
                # Best effort: any failure (e.g. no depth rows, which gives
                # a division by zero) records the window with depth 0.
                win_mean_dep = 0.0
            gc_depth.append([gc_content, win_mean_dep])
        gc_depth = DescribeArray(gc_depth, col=1)
        # Keep only windows whose depth exceeds 5% of the reported median.
        gcdep = gc_depth.array[gc_depth.array[:, 1] > 0.05 * gc_depth.median]
        prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
        mdp = np.median(prd[:, 1])
        if mdp <= 0.0:
            raise ValueError("Sample %s depth file Error !" % depthf)
        # One slot per GC step of 1e-4. The extra "+ 1" gives a GC value of
        # exactly 1.0 (index 10000) a slot instead of an IndexError.
        lgc = gcl = max(10000, int(prd[:, 0].max() * 10000)) + 1
        loe = [-0.0001] * lgc  # negative marks "no factor yet"
        gcj = 0
        for gc_value, dp in prd:
            gcj = int(round(gc_value, 4) * 10000)
            if gcj < gcl:
                gcl = gcj  # smallest fitted index
            loe[gcj] = mdp / float(dp) if dp > 0 else 1.0
        # Below the smallest fitted index: reuse its factor, capped at 10.
        low_fill = min(loe[gcl], 10.0)
        for i in xrange(gcl):
            loe[i] = low_fill
        # Fill the gaps between the smallest and the last fitted index.
        for i in xrange(gcl + 1, gcj):
            if loe[i] < 0:
                ls = i - 1
                lv = loe[ls]
                rs = i + 1
                # Bound check first; the scan stops at loe[gcj] at the
                # latest, which is never negative.
                while rs < len(loe) and loe[rs] < 0:
                    rs += 1
                rv = loe[rs]
                # Blend of the left neighbour and the next known value on
                # the right, the latter weighted by the distance (rs - ls).
                loe[i] = min(
                    (lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0),
                    10.0)
        # Above the last fitted index: reuse its factor, capped at 10.
        high_fill = min(loe[gcj], 10.0)
        for i in xrange(gcj + 1, lgc):
            loe[i] = high_fill
        for line in dep_f.fetch():
            rows = line.strip().split("\t")
            chrom = str(rows[0])
            pos = int(rows[1])
            deps = int(rows[-1])
            try:
                fixdeps = int(
                    deps * loe[int(round(posgc[chrom][pos], 4) * 10000)])
            except KeyError:
                # Positions without a stored GC value are not written.
                continue
            f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')
    finally:
        f_out.close()
        if dep_f is not None:
            dep_f.close()
    _ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)
Ejemplo n.º 9
0
def bedAnalysis(**kwargs):
    """Run the region analysis over a BED file and save the results.

    Keyword arguments:
        bed: tab-separated region file (contig, start, stop in the first
            three columns). Lines starting with ``#`` and lines with fewer
            than three columns are skipped.
        reference, db: paths passed to ``RegionAnalysis``.
        outdir: directory receiving ``win.gc``, ``pos.gc`` and
            ``chrom.stat``.
        winlen: optional; converted with ``int``, defaults to 200.
        siftlen: optional; converted with ``int``, defaults to 20.
        depthfile: comma-separated depth files; entries that are not
            existing files are dropped.

    Returns:
        The ``fname`` attributes of the three ``SaveLoad`` outputs, in the
        order (win.gc, pos.gc, chrom.stat).

    Note:
        Relies on the module-level ``pos_gc`` and ``win_gc`` containers.
        Only ``pos_gc`` is touched directly here; presumably
        ``model.analysis`` fills both -- confirm against ``RegionAnalysis``.
    """
    global pos_gc, win_gc
    bed = os.path.abspath(kwargs["bed"])
    reference = os.path.abspath(kwargs["reference"])
    db = os.path.abspath(kwargs["db"])
    outdir = os.path.abspath(kwargs["outdir"])
    winlen = int(kwargs.get("winlen", 200))
    siftlen = int(kwargs.get("siftlen", 20))
    depth_f = [
        os.path.abspath(path) for path in kwargs["depthfile"].split(",")
        if os.path.isfile(path)
    ]
    model = RegionAnalysis(reference, db)
    bed_gc_out = SaveLoad(os.path.join(outdir, "win.gc"))
    pos_gc_out = SaveLoad(os.path.join(outdir, "pos.gc"))
    chrom_stat = SaveLoad(os.path.join(outdir, "chrom.stat"))
    with smart_open(bed) as f_in:
        for line in f_in:
            # Header/comment and blank or truncated lines carry no region;
            # without this guard they crash on rows[1] / int() or leave a
            # junk key in pos_gc.
            if line.startswith("#"):
                continue
            rows = line.strip().split("\t")
            if len(rows) < 3:
                continue
            chrom = str(rows[0])
            if chrom not in pos_gc:
                pos_gc[chrom] = dict()
            start = int(rows[1])
            # The stop handed to the model is one past the BED stop column.
            stop = int(rows[2]) + 1
            try:
                model.analysis(chrom, start, stop, winlen, siftlen)
            except ValueError:
                # Regions the model rejects are skipped, not fatal.
                continue
    bed_gc_out.save(win_gc)
    pos_gc_out.save(pos_gc)
    chrom_stat.save(model.chrom_stat(depth_f))
    # Explicit finalizer call kept from the original code; presumably it
    # releases handles held by RegionAnalysis -- TODO confirm.
    model.__del__()
    return bed_gc_out.fname, pos_gc_out.fname, chrom_stat.fname