Beispiel #1
0
def exon_graph(depth, refseq, prefix, outdir=None, trans=None, genes=None, threads=0):
	"""Submit one ``graph`` job per transcript to a pool of worker processes.

	:param depth: forwarded unchanged as the first argument of ``graph``.
	:param refseq: first positional argument handed to ``HGVS``.
	:param prefix: forwarded unchanged as the last argument of ``graph``.
	:param outdir: directory forwarded to ``graph``; ``None`` (the default)
		means the working directory at the time of the call.
	:param trans: forwarded to ``HGVS`` as ``trans``.
	:param genes: forwarded to ``HGVS`` as ``genes``.
	:param threads: number of worker processes; ``0`` selects ``cpu_count()``.
	"""
	if outdir is None:
		# Resolve per call: a default written as ``os.getcwd()`` in the
		# signature is evaluated only once, when the function is defined.
		outdir = os.getcwd()
	beds = HGVS(refseq, trans=trans, genes=genes)
	messages = beds.get_exons()
	threads = int(threads) or cpu_count()
	pool = multiprocessing.Pool(processes=threads)
	try:
		for _transcript, trans_messages in messages.iteritems():
			# NOTE(review): the async results are discarded, so an exception
			# raised inside ``graph`` is never reported to the caller.
			pool.apply_async(graph, (depth, trans_messages, outdir, prefix))
	finally:
		# Always release the workers, even when submitting a job failed.
		pool.close()
		pool.join()
Beispiel #2
0
 def __init__(self, reference, trans=None, genes=None):
     """Open the transcript database and every ``*.bedanno.config`` database.

     ``reference`` is made absolute and handed to ``HGVS`` together with
     ``trans`` and ``genes``.  ``self.dbtitle`` becomes the tab-joined
     titles of all opened databases, ranked by each database's ``order``.
     """
     self.refdb = os.path.join(database_dir, "transdb",
                               "ncbi_anno_rel104.dbref.db")
     self.HGVS = HGVS(self.refdb, os.path.abspath(reference), trans, genes)
     self._dbhandles = list()
     order_by_title = defaultdict(float)
     config_pattern = os.path.join(database_dir, "*", "*.bedanno.config")
     for config_path in glob(config_pattern):
         handle = AnnoVar(os.path.abspath(config_path))
         names = sorted(handle.search_value.values())
         title = "\t".join([str(name) for name in names])
         order_by_title[title] = handle.order
         self._dbhandles.append(handle)
     ranked_titles = sorted(order_by_title,
                            key=lambda title: order_by_title[title])
     self.dbtitle = "\t".join(ranked_titles)
Beispiel #3
0
	def __init__(self, reference, trans=None, genes=None):
		"""Set up the HGVS annotator and load all ``*.bedanno.config`` databases.

		The absolute form of ``reference`` plus ``trans`` and ``genes`` are
		passed to ``HGVS``.  Each config file found one directory below
		``database_dir`` yields an ``AnnoVar`` handle kept in
		``self._dbhandles``; ``self.dbtitle`` joins their titles with tabs,
		sorted by the ``order`` attribute of the handles.
		"""
		self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
		absolute_reference = os.path.abspath(reference)
		self.HGVS = HGVS(self.refdb, absolute_reference, trans, genes)
		self._dbhandles = list()
		rank_of = defaultdict(float)
		for config in glob(os.path.join(database_dir, "*", "*.bedanno.config")):
			database = AnnoVar(os.path.abspath(config))
			columns = sorted(database.search_value.values())
			heading = "\t".join(map(str, columns))
			rank_of[heading] = database.order
			self._dbhandles.append(database)
		headings = list(rank_of.keys())
		headings.sort(key=lambda heading: rank_of[heading])
		self.dbtitle = "\t".join(headings)
Beispiel #4
0
	def __init__(self, **kwargs):
		"""Load per-chromosome CNV data and set up the annotation back ends.

		Keyword arguments read here:

		* ``indir`` (required): directory scanned for ``chr*.cnv.args`` files.
		* ``dbdir`` (required): root directory of the annotation databases.
		* ``correct_win_len`` / ``correct_shift_len``: window and shift
		  lengths; a missing, ``None`` or zero value falls back to 30 / 25.
		* ``chrom`` / ``sample``: optional comma separated whitelists.
		* ``transdb``: optional path; when missing or empty
		  ``<dbdir>/transdb/ncbi_anno_rel104.dbref.db`` is used.
		* ``reference``: optional path; when missing or empty
		  ``<dbdir>/aln_db/hg19/hg19_chM_male_mask.fa`` is used.

		:raises KeyError: if ``indir`` or ``dbdir`` is not supplied.
		"""
		self.outdir = os.path.abspath(kwargs["indir"])
		# ``or 0`` keeps ``None``/missing values away from ``int()`` so the
		# documented defaults apply instead of a TypeError.
		self.win_len = int(kwargs.get("correct_win_len") or 0) or 30
		self.shift_len = int(kwargs.get("correct_shift_len") or 0) or 25
		self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
		chroms = set(str(kwargs["chrom"]).split(",")) if kwargs.get("chrom") else None
		samples = set(str(kwargs["sample"]).split(",")) if kwargs.get("sample") else None
		all_samples = set()
		contigs = list()
		self.cnvdata = defaultdict(dict)
		self.sample_win_data = defaultdict(dict)
		for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
			# "<chrom>.cnv.args" -> "<chrom>"
			chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
			if chroms is not None and chrom not in chroms:
				continue
			loaded = SaveLoad(cnv_data).load()
			contigs.append(chrom)
			for sample in loaded.keys():
				dep_f = os.path.join(self.outdir, sample, "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
				# Depth files are recorded for every sample, before the
				# sample whitelist below is applied.
				if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
					self.sample_win_data[chrom][sample] = dep_f
				if samples is not None and sample not in samples:
					continue
				all_samples.add(sample)
				self.cnvdata[sample][chrom] = loaded[sample]
		self.samples = sorted(all_samples)
		self.contigs = sorted(contigs, key=_chrom_valued)
		databases = os.path.abspath(kwargs["dbdir"])
		# Test the value, not just the key: a ``transdb=None`` entry must
		# select the default path as well.
		if kwargs.get("transdb"):
			t_db = os.path.abspath(kwargs["transdb"])
		else:
			t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
		for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
			db = os.path.abspath(db)
			dbname = os.path.basename(os.path.dirname(db))
			_AnnotationDB[dbname].add(db)
		if kwargs.get("reference"):
			self.reference = os.path.abspath(kwargs["reference"])
		else:
			self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
		self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
		self.HGVS = HGVS(t_db)
Beispiel #5
0
class BedAnno(object):
    """Annotate BED regions with transcript information and database hits.

    The transcript database is
    ``<database_dir>/transdb/ncbi_anno_rel104.dbref.db``; every
    ``*.bedanno.config`` file one directory below ``database_dir`` is opened
    as an additional ``AnnoVar`` database.
    """

    def __init__(self, reference, trans=None, genes=None):
        """Open the transcript database and all configured databases.

        :param reference: path, made absolute and passed to ``HGVS``.
        :param trans: passed through to ``HGVS``.
        :param genes: passed through to ``HGVS``.
        """
        self.refdb = os.path.join(database_dir, "transdb",
                                  "ncbi_anno_rel104.dbref.db")
        reference = os.path.abspath(reference)
        self.HGVS = HGVS(self.refdb, reference, trans, genes)
        self._dbhandles = list()
        dbtitle = defaultdict(float)
        for f in glob(os.path.join(database_dir, "*", "*.bedanno.config")):
            dbs = AnnoVar(os.path.abspath(f))
            t = "\t".join(str(i) for i in sorted(dbs.search_value.values()))
            dbtitle[t] = dbs.order
            self._dbhandles.append(dbs)
        # Titles of all databases, ranked by each database's ``order``.
        self.dbtitle = "\t".join(sorted(dbtitle, key=dbtitle.get))

    def __del__(self):
        """Release the HGVS annotator and every database handle."""
        # ``__init__`` may have failed before these attributes were set, so
        # look them up defensively instead of raising inside ``__del__``.
        hgvs = getattr(self, "HGVS", None)
        if hgvs is not None:
            hgvs.__del__()
        for dbs in getattr(self, "_dbhandles", ()):
            dbs.__del__()

    def dbanno(self, variation):
        """Collect database values for ``variation`` from every database.

        :param variation: object whose instance attributes are passed as
            keyword arguments to each database's ``fetch``.
        :return: ``defaultdict(set)`` mapping attribute name to the set of
            values found, ignoring the placeholder value ``"."``.
        """
        dbinfo = defaultdict(set)
        for dbs in self._dbhandles:
            try:
                for info in dbs.fetch(**vars(variation)):
                    # ``items()`` behaves the same on Python 2 and 3.
                    for k, v in vars(info).items():
                        if v != ".":
                            dbinfo[k].add(v)
            except Exception:
                # Deliberate best effort: a database that cannot answer this
                # query is skipped instead of aborting the annotation.
                continue
        return dbinfo

    def bedanno(self, bedfile, fileout=sys.stdout):
        """Annotate every region of ``bedfile`` and write the result.

        Lines with fewer than three columns are copied through unchanged.
        Lines whose second or third column is not an integer are treated as
        header lines and extended with the annotation column titles.  Each
        remaining line produces one output row per annotation returned by
        ``self.HGVS.annobed``.

        :param bedfile: path of the tab separated input file.
        :param fileout: output target handed to ``smart_open`` in write mode.
        :raises IOError: if ``bedfile`` is empty or not an existing file.
        """
        if not (bedfile and os.path.isfile(bedfile)):
            raise IOError("Fail to load file: %s" % bedfile)
        f_in = smart_open(bedfile)
        try:
            regions = f_in.readlines()
        finally:
            f_in.close()
        # One title per annotation column written below (six in total).
        titles = "Transcript\tgeneSym\tcHGVS\tProtein\tStand\tExonRegions"
        # An empty title must not turn into one phantom database column.
        db_columns = self.dbtitle.split("\t") if self.dbtitle else []
        f_out = smart_open(fileout, 'w')
        try:
            for line in regions:
                rows = line.strip().split("\t")
                if len(rows) < 3:
                    f_out.write(line)
                    # Nothing to annotate; indexing rows[1]/rows[2] below
                    # would fail on such a line.
                    continue
                try:
                    chrom = str(rows[0])
                    start = int(rows[1])
                    stop = int(rows[2])
                except ValueError:
                    f_out.write("\t".join(rows + [titles] + db_columns) + '\n')
                    continue
                for anno in self.HGVS.annobed(chrom, start, stop):
                    dbinfo = self.dbanno(anno)
                    rows[0] = str(anno.Chrom)
                    rows[1] = str(anno.Start)
                    rows[2] = str(anno.Stop)
                    anno_message = rows + [
                        str(anno.Transcript),
                        str(anno.geneSym),
                        str(anno.cHgvs),
                        str(anno.Protein),
                        str(anno.Stand),
                        str(anno.ExonRegions),
                    ]
                    for column in db_columns:
                        if column in dbinfo:
                            anno_message.append(";".join(dbinfo[column]))
                        else:
                            anno_message.append(".")
                    f_out.write("\t".join(anno_message) + '\n')
        finally:
            f_out.close()
Beispiel #6
0
class BedAnno(object):
	"""Add transcript and database annotation columns to BED regions.

	Uses the transcript database ``<database_dir>/transdb/ncbi_anno_rel104.dbref.db``
	and one ``AnnoVar`` database per ``*.bedanno.config`` file found one
	directory below ``database_dir``.
	"""

	def __init__(self, reference, trans=None, genes=None):
		"""Create the HGVS annotator and open the configured databases.

		:param reference: path, made absolute before it is given to ``HGVS``.
		:param trans: forwarded to ``HGVS``.
		:param genes: forwarded to ``HGVS``.
		"""
		self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
		reference = os.path.abspath(reference)
		self.HGVS = HGVS(self.refdb, reference, trans, genes)
		self._dbhandles = list()
		dbtitle = defaultdict(float)
		for f in glob(os.path.join(database_dir, "*", "*.bedanno.config")):
			dbs = AnnoVar(os.path.abspath(f))
			t = "\t".join(str(i) for i in sorted(dbs.search_value.values()))
			dbtitle[t] = dbs.order
			self._dbhandles.append(dbs)
		# Database titles joined in ascending ``order`` of their database.
		self.dbtitle = "\t".join(sorted(dbtitle, key=dbtitle.get))

	def __del__(self):
		"""Tear down the HGVS annotator and the database handles."""
		# Attributes may be missing when ``__init__`` raised part-way through.
		hgvs = getattr(self, "HGVS", None)
		if hgvs is not None:
			hgvs.__del__()
		for dbs in getattr(self, "_dbhandles", ()):
			dbs.__del__()

	def dbanno(self, variation):
		"""Query every database with the attributes of ``variation``.

		:param variation: object whose instance attributes become the keyword
			arguments of each database's ``fetch`` call.
		:return: ``defaultdict(set)`` of attribute name -> values found; the
			placeholder value ``"."`` is not recorded.
		"""
		dbinfo = defaultdict(set)
		for dbs in self._dbhandles:
			try:
				infos = dbs.fetch(**vars(variation))
				for info in infos:
					# ``items()`` is available on Python 2 and Python 3 alike.
					for k, v in vars(info).items():
						if v != ".":
							dbinfo[k].add(v)
			except Exception:
				# Intentional best effort: skip databases that fail to answer.
				continue
		return dbinfo

	def bedanno(self, bedfile, fileout=sys.stdout):
		"""Write an annotated copy of ``bedfile`` to ``fileout``.

		* Lines with fewer than three columns are echoed unchanged.
		* Lines whose start/stop columns are not integers count as headers
		  and get the annotation column titles appended.
		* Every other line yields one row per annotation returned by
		  ``self.HGVS.annobed``, followed by one value per database column
		  (``"."`` when a database has nothing for that column).

		:param bedfile: path of the tab separated input file.
		:param fileout: output target passed to ``smart_open`` in write mode.
		:raises IOError: if ``bedfile`` is empty or does not name a file.
		"""
		if not (bedfile and os.path.isfile(bedfile)):
			raise IOError("Fail to load file: %s" % bedfile)
		source = smart_open(bedfile)
		try:
			regions = source.readlines()
		finally:
			source.close()
		# Exactly one title for each of the six annotation columns.
		titles = ["Transcript", "geneSym", "cHGVS", "Protein", "Stand", "ExonRegions"]
		# ``"".split("\t")`` would give [""], i.e. one bogus database column.
		db_columns = self.dbtitle.split("\t") if self.dbtitle else []
		f_out = smart_open(fileout, 'w')
		try:
			for line in regions:
				rows = line.strip().split("\t")
				if len(rows) < 3:
					# Too short to hold chrom/start/stop: pass through only.
					f_out.write(line)
					continue
				try:
					chrom = str(rows[0])
					start = int(rows[1])
					stop = int(rows[2])
				except ValueError:
					f_out.write("\t".join(rows + titles + db_columns) + '\n')
					continue
				anno_m = self.HGVS.annobed(chrom, start, stop)
				for anno in anno_m:
					dbinfo = self.dbanno(anno)
					rows[0] = str(anno.Chrom)
					rows[1] = str(anno.Start)
					rows[2] = str(anno.Stop)
					trans = str(anno.Transcript)
					gene = str(anno.geneSym)
					chgvs = str(anno.cHgvs)
					protein = str(anno.Protein)
					stand = str(anno.Stand)
					exons = str(anno.ExonRegions)
					anno_message = rows + [trans, gene, chgvs, protein, stand, exons]
					anno_message.extend(
						";".join(dbinfo[name]) if name in dbinfo else "." for name in db_columns
					)
					f_out.write("\t".join(anno_message) + '\n')
		finally:
			f_out.close()
Beispiel #7
0
class CNVAnalysis(object):
	"""Call, plot and annotate copy number variations per sample.

	Input is the set of ``chr*.cnv.args`` files found in ``indir``; results
	are written below ``<indir>/<sample>/``.
	"""

	# NOTE(review): a class-level ``global`` only affects names assigned in the
	# class body itself; the methods below just read and mutate the
	# module-level ``_AnnotationDB``.
	global _AnnotationDB

	def __init__(self, **kwargs):
		"""Load per-chromosome CNV data and set up the annotation back ends.

		Keyword arguments read here:

		* ``indir`` (required): directory scanned for ``chr*.cnv.args`` files.
		* ``dbdir`` (required): root directory of the annotation databases.
		* ``correct_win_len`` / ``correct_shift_len``: window and shift
		  lengths; a missing, ``None`` or zero value falls back to 30 / 25.
		* ``chrom`` / ``sample``: optional comma separated whitelists.
		* ``transdb``: optional path; when missing or empty
		  ``<dbdir>/transdb/ncbi_anno_rel104.dbref.db`` is used.
		* ``reference``: optional path; when missing or empty
		  ``<dbdir>/aln_db/hg19/hg19_chM_male_mask.fa`` is used.

		:raises KeyError: if ``indir`` or ``dbdir`` is not supplied.
		"""
		self.outdir = os.path.abspath(kwargs["indir"])
		# ``or 0`` keeps ``None``/missing values away from ``int()`` so the
		# defaults apply instead of a TypeError.
		self.win_len = int(kwargs.get("correct_win_len") or 0) or 30
		self.shift_len = int(kwargs.get("correct_shift_len") or 0) or 25
		self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
		chroms = set(str(kwargs["chrom"]).split(",")) if kwargs.get("chrom") else None
		samples = set(str(kwargs["sample"]).split(",")) if kwargs.get("sample") else None
		all_samples = set()
		contigs = list()
		self.cnvdata = defaultdict(dict)
		self.sample_win_data = defaultdict(dict)
		for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
			# "<chrom>.cnv.args" -> "<chrom>"
			chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
			if chroms is not None and chrom not in chroms:
				continue
			loaded = SaveLoad(cnv_data).load()
			contigs.append(chrom)
			for sample in loaded.keys():
				dep_f = os.path.join(self.outdir, sample, "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
				# Depth files are recorded for every sample, before the
				# sample whitelist is applied; ``plot`` draws all of them.
				if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
					self.sample_win_data[chrom][sample] = dep_f
				if samples is not None and sample not in samples:
					continue
				all_samples.add(sample)
				self.cnvdata[sample][chrom] = loaded[sample]
		self.samples = sorted(all_samples)
		self.contigs = sorted(contigs, key=_chrom_valued)
		databases = os.path.abspath(kwargs["dbdir"])
		# Test the value, not just the key: ``transdb=None`` must select the
		# default path as well.
		if kwargs.get("transdb"):
			t_db = os.path.abspath(kwargs["transdb"])
		else:
			t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
		for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
			db = os.path.abspath(db)
			dbname = os.path.basename(os.path.dirname(db))
			_AnnotationDB[dbname].add(db)
		if kwargs.get("reference"):
			self.reference = os.path.abspath(kwargs["reference"])
		else:
			self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
		self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
		self.HGVS = HGVS(t_db)

	def call_cnvs(self, sample):
		"""Run ``hmm_cnv`` for every usable chromosome of ``sample``.

		Each chromosome is processed exactly once, in a worker process, and
		writes to ``<outdir>/<sample>/<chrom>.cnv``.  Chromosomes with a
		ploidy below 1 or without depth data are skipped.  An exception
		raised by a worker is re-raised here after all workers finished.

		:param sample: sample name used to look up ``self.cnvdata``.
		"""
		pool = Pool(processes=cpu(use_mem=2147483648, cpu_limit=len(self.contigs)))
		pending = list()
		try:
			for chrom in self.contigs:
				chrom_data = self.cnvdata[sample][chrom]
				dep_data = chrom_data.data
				ploid = chrom_data.ploid
				if ploid < 1 or not len(dep_data):
					continue
				best_p = chrom_data.best_probability
				trials = chrom_data.trials
				regions = chrom_data.regions
				output = os.path.join(self.outdir, sample, "%s.cnv" % chrom)
				# Submit only; calling hmm_cnv directly as well would compute
				# every chromosome twice and write the same file twice.
				job_args = (dep_data, regions, best_p, trials, ploid, output, self.contral_wins)
				pending.append(pool.apply_async(hmm_cnv, job_args))
		finally:
			pool.close()
			pool.join()
		for job in pending:
			# ``get`` re-raises whatever the worker raised, so failures are
			# not silently lost.
			job.get()

	def z_score(self, chrom, start, stop, sample):
		"""Return the mean z-score of the windows overlapping a region.

		The mean and standard deviation are taken over all windows of
		``chrom`` in the sample's depth file.

		:return: the score rounded to 4 digits; ``0`` when no window overlaps
			``start``-``stop`` and ``0.0`` when the standard deviation is zero.
		"""
		dep_data = list()
		deps = list()
		dep_handle = pysam.Tabixfile(self.sample_win_data[chrom][sample])
		try:
			for i in dep_handle.fetch(chrom):
				p_s, p_e, p_d = map(int, i.strip().split("\t")[1:4])
				dep_data.append(p_d)
				if (p_s <= start <= p_e) or (start <= p_s <= stop) or (p_s <= stop <= p_e):
					deps.append(p_d)
		finally:
			dep_handle.close()
		if len(deps) == 0:
			return 0
		d_m = np.mean(dep_data)
		s_d = np.std(dep_data)
		if s_d == 0:
			# All windows share one depth: no deviation, and dividing by the
			# zero spread would only produce NaN.
			return 0.0
		score = 0.0
		for i in deps:
			score += (i - d_m + 0.0) / s_d
		return round(score / len(deps), 4)

	def plot(self, sample, f_in):
		"""Draw one PDF page per CNV line of ``f_in``.

		Each page shows, for every sample with a depth file on that
		chromosome, log2 of the window depth divided by the per-window mean
		over samples; ``sample`` is drawn in black, the others in magenta.
		The PDF is written next to ``f_in`` as ``<sample>.cnv.out.pdf``.
		Lines that are too short or hold non-integer coordinates are skipped.
		"""
		fin = smart_open(f_in)
		try:
			lines = fin.readlines()
		finally:
			fin.close()
		f_out = os.path.join(os.path.dirname(f_in), sample + ".cnv.out.pdf")
		pdf = PdfPages(f_out)
		try:
			for line in lines:
				rows = line.strip().split("\t")
				try:
					chrom = str(rows[0])
					start = max(int(rows[2]) - self.win_len, 0)
					stop = int(rows[3]) + self.win_len
					title = "%s_%s_%s_%s" % (rows[0], rows[2], rows[3], rows[5])
				except (ValueError, IndexError):
					# Header, blank or truncated line.
					continue
				samples = sorted(self.sample_win_data[chrom].keys())
				deps = list()
				for s in samples:
					d = list()
					dep_handle = pysam.Tabixfile(self.sample_win_data[chrom][s])
					try:
						for record in dep_handle.fetch(chrom, start, stop):
							p_s, p_e, p_d = map(int, record.strip().split("\t")[1:4])
							d.append(p_d)
					finally:
						dep_handle.close()
					deps.append(d)
				deps = np.array(deps)
				deps = np.log2(deps / deps.mean(axis=0))
				# Create the figure only once the data is ready, so a failed
				# read does not leave an open figure behind.
				plt.figure()
				for idx, name in enumerate(samples):
					plt.plot(deps[idx], color="k" if name == sample else "m")
				# Booleans are understood by old and new matplotlib; the
				# string 'off' is no longer interpreted as "hide".
				plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
				plt.title(title)
				pdf.savefig()
				plt.close()
		finally:
			pdf.close()

	def annotation(self, sample, debug=False):
		"""Annotate the per-chromosome ``.cnv`` files of ``sample``.

		Writes ``<outdir>/<sample>/<sample>.cnv.anno.tsv``.  Each input row
		is extended with its z-score, GC ratio, the HGVS based mutation
		names and one column per database title.  Rows whose GC ratio lies
		outside 0.3-0.7, and rows that are too short or not numeric, are
		dropped.  The per-chromosome files are removed afterwards unless
		``debug`` is true.

		:return: path of the written annotation file.
		"""
		output = os.path.join(self.outdir, sample, "%s.cnv.anno.tsv" % sample)
		titles = ["#Chrom", "Ploid", "Start", "Stop", "length", "copyNumber", "Mtype", "meanP", "Z-score", "GCpr",
			"MutationName"]
		dbtitle = self.DBAnno.dbtitle.split("\t")
		titles.extend(dbtitle)
		f_out = smart_open(output, 'w')
		try:
			f_out.write("\t".join(titles) + '\n')
			for contig in self.contigs:
				cnvs = os.path.join(self.outdir, sample, "%s.cnv" % contig)
				if not os.path.exists(cnvs):
					continue
				f_in = smart_open(cnvs)
				try:
					for line in f_in:
						rows = line.strip().split("\t")
						try:
							chrom = str(rows[0])
							start = int(rows[2])
							stop = int(rows[3])
							mtype = str(rows[6])
						except (ValueError, IndexError):
							# Header, blank or truncated line.
							continue
						z_s = self.z_score(chrom, start, stop, sample)
						gcr = count_gc(self.DBAnno.refer.fetch(chrom, start, stop))
						if not (0.3 <= gcr <= 0.7):
							continue
						variation = {"Chrom": chrom, "Start": start, "Stop": stop, "Mtype": mtype}
						m_name = set()
						for hgvs in self.HGVS.annobed(chrom, start, stop):
							parts = [str(hgvs.Transcript), str(hgvs.Protein), str(hgvs.geneSym), str(hgvs.cHgvs),
								str(hgvs.ExonRegions)]
							m_name.add(":".join([part for part in parts if part != "."]))
						dbinfo = self.DBAnno.dbanno(variation)
						anno_message = [str(i) for i in rows]
						anno_message.append(str(z_s))
						anno_message.append(str(gcr))
						anno_message.append("|".join(m_name))
						for i in dbtitle:
							if i in dbinfo:
								anno_message.append("|".join(dbinfo[i]))
							else:
								anno_message.append(".")
						f_out.write("\t".join(anno_message) + '\n')
				finally:
					f_in.close()
				if not debug:
					os.remove(cnvs)
		finally:
			f_out.close()
		return output