def exon_graph(depth, refseq, prefix, outdir=None, trans=None, genes=None, threads=0):
    """Submit one ``graph`` job per transcript entry to a process pool.

    The entries come from ``HGVS(refseq, trans=trans, genes=genes).get_exons()``;
    each value is passed to ``graph(depth, trans_messages, outdir, prefix)``.

    :param depth: forwarded unchanged to ``graph``.
    :param refseq: first argument of the ``HGVS`` constructor.
    :param prefix: forwarded unchanged to ``graph``.
    :param outdir: output directory forwarded to ``graph``; ``None`` (the
        default) means the working directory at call time.
    :param trans: forwarded to ``HGVS`` as ``trans``.
    :param genes: forwarded to ``HGVS`` as ``genes``.
    :param threads: number of worker processes; ``0`` means ``cpu_count()``.
    """
    # Resolve the default per call: a ``os.getcwd()`` default in the signature
    # is evaluated only once, when the function is defined.
    if outdir is None:
        outdir = os.getcwd()

    beds = HGVS(refseq, trans=trans, genes=genes)
    messages = beds.get_exons()

    threads = int(threads) or cpu_count()
    pool = multiprocessing.Pool(processes=threads)
    try:
        # ``items()`` exists on both Python 2 and 3 mappings; the key is unused.
        for _transcript, trans_messages in messages.items():
            # The async result is deliberately not inspected, so an exception
            # raised inside a worker is not re-raised here.
            pool.apply_async(graph, (depth, trans_messages, outdir, prefix))
        pool.close()
    except BaseException:
        # Do not leave idle workers behind when submission fails.
        pool.terminate()
        raise
    finally:
        pool.join()
def __init__(self, reference, trans=None, genes=None):
    """Open the transcript database and every ``*.bedanno.config`` database.

    :param reference: path made absolute and handed to ``HGVS``.
    :param trans: forwarded to ``HGVS``.
    :param genes: forwarded to ``HGVS``.

    Sets ``self.refdb``, ``self.HGVS``, ``self._dbhandles`` (one ``AnnoVar``
    per config file found under ``database_dir``) and ``self.dbtitle`` (the
    tab-joined column titles, arranged by each handle's ``order``).
    """
    self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
    self.HGVS = HGVS(self.refdb, os.path.abspath(reference), trans, genes)

    self._dbhandles = list()
    order_by_title = defaultdict(float)
    config_pattern = os.path.join(database_dir, "*", "*.bedanno.config")
    for config_path in glob(config_pattern):
        handle = AnnoVar(os.path.abspath(config_path))
        columns = sorted(handle.search_value.values())
        title = "\t".join(map(str, columns))
        order_by_title[title] = handle.order
        self._dbhandles.append(handle)

    # Titles are emitted in ascending ``order`` of the handle that owns them.
    ordered_titles = sorted(order_by_title, key=order_by_title.get)
    self.dbtitle = "\t".join(ordered_titles)
def __init__(self, reference, trans=None, genes=None):
    """Open the transcript database and every ``*.bedanno.config`` database.

    :param reference: path made absolute and handed to ``HGVS``.
    :param trans: forwarded to ``HGVS``.
    :param genes: forwarded to ``HGVS``.

    Sets ``self.refdb``, ``self.HGVS``, ``self._dbhandles`` (one ``AnnoVar``
    per config file found under ``database_dir``) and ``self.dbtitle`` (the
    tab-joined column titles, arranged by each handle's ``order``).
    """
    self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
    self.HGVS = HGVS(self.refdb, os.path.abspath(reference), trans, genes)

    self._dbhandles = list()
    order_by_title = defaultdict(float)
    config_pattern = os.path.join(database_dir, "*", "*.bedanno.config")
    for config_path in glob(config_pattern):
        handle = AnnoVar(os.path.abspath(config_path))
        columns = sorted(handle.search_value.values())
        title = "\t".join(map(str, columns))
        order_by_title[title] = handle.order
        self._dbhandles.append(handle)

    # Titles are emitted in ascending ``order`` of the handle that owns them.
    ordered_titles = sorted(order_by_title, key=order_by_title.get)
    self.dbtitle = "\t".join(ordered_titles)
def __init__(self, **kwargs):
    """Load per-chromosome CNV data and open the annotation databases.

    Keyword arguments read:
        indir: directory scanned for ``chr*.cnv.args`` files (required).
        correct_win_len: value for ``self.win_len``; 0, empty, ``None`` or
            missing falls back to 30.
        correct_shift_len: value for ``self.shift_len``; 0, empty, ``None``
            or missing falls back to 25.
        chrom: optional comma-separated chromosome names to keep.
        sample: optional comma-separated sample names to keep.
        dbdir: database root directory (required).
        transdb: optional transcript database path; when absent or empty,
            ``<dbdir>/transdb/ncbi_anno_rel104.dbref.db`` is used.
        reference: optional reference path; when absent or empty,
            ``<dbdir>/aln_db/hg19/hg19_chM_male_mask.fa`` is used.

    :raises KeyError: if ``indir`` or ``dbdir`` is missing.
    """
    self.outdir = os.path.abspath(kwargs["indir"])

    # Optional settings are read with .get so that a key which is missing or
    # present-but-None (e.g. an unset command-line option) selects the default
    # instead of raising.
    self.win_len = int(kwargs.get("correct_win_len") or 0) or 30
    self.shift_len = int(kwargs.get("correct_shift_len") or 0) or 25
    self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1

    chrom_arg = kwargs.get("chrom")
    sample_arg = kwargs.get("sample")
    chroms = str(chrom_arg).split(",") if chrom_arg else None
    samples = str(sample_arg).split(",") if sample_arg else None

    all_samples = set()
    contigs = list()
    self.cnvdata = defaultdict(dict)
    self.sample_win_data = defaultdict(dict)
    for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
        # "<chrom>.cnv.args" -> "<chrom>"
        chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
        if chroms is not None and chrom not in chroms:
            continue
        cnvdata = SaveLoad(cnv_data).load()
        contigs.append(chrom)
        for sample in cnvdata.keys():
            dep_f = os.path.join(
                self.outdir, sample,
                "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
            # Recorded before the sample filter is applied, so samples that
            # are not selected are still registered here.
            if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
                self.sample_win_data[chrom][sample] = dep_f
            if samples is not None and sample not in samples:
                continue
            all_samples.add(sample)
            self.cnvdata[sample][chrom] = cnvdata[sample]
    self.samples = sorted(all_samples)
    self.contigs = sorted(contigs, key=_chrom_valued)

    databases = os.path.abspath(kwargs["dbdir"])
    transdb_arg = kwargs.get("transdb")
    if transdb_arg:
        t_db = os.path.abspath(transdb_arg)
    else:
        t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")

    # Register every CNV database config under the name of its directory.
    for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
        db = os.path.abspath(db)
        dbname = os.path.basename(os.path.dirname(db))
        _AnnotationDB[dbname].add(db)

    reference_arg = kwargs.get("reference")
    if reference_arg:
        self.reference = os.path.abspath(reference_arg)
    else:
        self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
    self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
    self.HGVS = HGVS(t_db)
class BedAnno(object):
    """Annotate BED regions with ``HGVS`` transcript data and database columns."""

    def __init__(self, reference, trans=None, genes=None):
        """Open the transcript database and every ``*.bedanno.config`` database.

        :param reference: path made absolute and handed to ``HGVS``.
        :param trans: forwarded to ``HGVS``.
        :param genes: forwarded to ``HGVS``.
        """
        self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
        reference = os.path.abspath(reference)
        self.HGVS = HGVS(self.refdb, reference, trans, genes)
        self._dbhandles = list()
        dbtitle = defaultdict(float)
        for f in glob(os.path.join(database_dir, "*", "*.bedanno.config")):
            dbs = AnnoVar(os.path.abspath(f))
            t = "\t".join([str(i) for i in sorted(dbs.search_value.values())])
            dbtitle[t] = dbs.order
            self._dbhandles.append(dbs)
        # Column titles, arranged by the ``order`` of the handle owning them.
        self.dbtitle = "\t".join(sorted(dbtitle, key=dbtitle.get))

    def __del__(self):
        """Release the ``HGVS`` handle and every database handle."""
        # __init__ may have failed part-way, so only touch attributes that
        # were actually set.
        hgvs = getattr(self, "HGVS", None)
        if hgvs is not None:
            hgvs.__del__()
        for dbs in getattr(self, "_dbhandles", ()):
            dbs.__del__()

    def dbanno(self, variation):
        """Collect database values for ``variation`` from every handle.

        :param variation: object whose ``__dict__`` is passed as keyword
            arguments to each handle's ``fetch``.
        :returns: ``defaultdict(set)`` mapping attribute name to the set of
            values seen, ignoring values equal to ``"."``.
        """
        dbinfo = defaultdict(set)
        for dbs in self._dbhandles:
            try:
                infos = dbs.fetch(**variation.__dict__)
                for info in infos:
                    # items() works on Python 2 and 3; a Python-2-only call
                    # here would raise and be swallowed by the handler below.
                    for k, v in info.__dict__.items():
                        if v != ".":
                            dbinfo[k].add(v)
            except Exception:
                # Best effort: a failing database is skipped.
                continue
        return dbinfo

    def bedanno(self, bedfile, fileout=sys.stdout):
        """Write an annotated copy of ``bedfile`` to ``fileout``.

        Lines with fewer than three columns are copied unchanged. Lines whose
        second or third column is not an integer are treated as headers and
        get the annotation titles appended. Every other line produces one
        output row per record returned by ``self.HGVS.annobed``.

        :param bedfile: path of the tab-separated input file.
        :param fileout: output target passed to ``smart_open(fileout, 'w')``.
        :raises IOError: if ``bedfile`` is empty or not an existing file.
        """
        if not (bedfile and os.path.isfile(bedfile)):
            raise IOError("Fail to load file: %s" % bedfile)
        f_in = smart_open(bedfile)
        try:
            regions = f_in.readlines()
        finally:
            f_in.close()

        # One title per appended data column.
        titles = "Transcript\tgeneSym\tcHGVS\tProtein\tStand\tExonRegions"
        # "".split("\t") is [""], which would add a stray column per row.
        db_columns = self.dbtitle.split("\t") if self.dbtitle else []

        f_out = smart_open(fileout, 'w')
        # NOTE(review): with the default ``fileout`` this closes whatever
        # smart_open returned for sys.stdout - confirm that is intended.
        try:
            for line in regions:
                rows = line.strip().split("\t")
                if len(rows) < 3:
                    # Not a region line: echo it and move on.
                    f_out.write(line)
                    continue
                try:
                    chrom = str(rows[0])
                    start = int(rows[1])
                    stop = int(rows[2])
                except ValueError:
                    header = rows + [titles]
                    if self.dbtitle:
                        header.append(self.dbtitle)
                    f_out.write("\t".join(header) + '\n')
                    continue
                for anno in self.HGVS.annobed(chrom, start, stop):
                    dbinfo = self.dbanno(anno)
                    rows[0] = str(anno.Chrom)
                    rows[1] = str(anno.Start)
                    rows[2] = str(anno.Stop)
                    anno_message = [str(i) for i in rows]
                    anno_message.extend([
                        str(anno.Transcript),
                        str(anno.geneSym),
                        str(anno.cHgvs),
                        str(anno.Protein),
                        str(anno.Stand),
                        str(anno.ExonRegions),
                    ])
                    for column in db_columns:
                        if column in dbinfo:
                            anno_message.append(";".join(dbinfo[column]))
                        else:
                            anno_message.append(".")
                    f_out.write("\t".join(anno_message) + '\n')
        finally:
            f_out.close()
class BedAnno(object):
    """Annotate BED regions with ``HGVS`` transcript data and database columns."""

    def __init__(self, reference, trans=None, genes=None):
        """Open the transcript database and every ``*.bedanno.config`` database.

        :param reference: path made absolute and handed to ``HGVS``.
        :param trans: forwarded to ``HGVS``.
        :param genes: forwarded to ``HGVS``.
        """
        self.refdb = os.path.join(database_dir, "transdb", "ncbi_anno_rel104.dbref.db")
        reference = os.path.abspath(reference)
        self.HGVS = HGVS(self.refdb, reference, trans, genes)
        self._dbhandles = list()
        dbtitle = defaultdict(float)
        for f in glob(os.path.join(database_dir, "*", "*.bedanno.config")):
            dbs = AnnoVar(os.path.abspath(f))
            t = "\t".join([str(i) for i in sorted(dbs.search_value.values())])
            dbtitle[t] = dbs.order
            self._dbhandles.append(dbs)
        # Column titles, arranged by the ``order`` of the handle owning them.
        self.dbtitle = "\t".join(sorted(dbtitle, key=dbtitle.get))

    def __del__(self):
        """Release the ``HGVS`` handle and every database handle."""
        # __init__ may have failed part-way, so only touch attributes that
        # were actually set.
        hgvs = getattr(self, "HGVS", None)
        if hgvs is not None:
            hgvs.__del__()
        for dbs in getattr(self, "_dbhandles", ()):
            dbs.__del__()

    def dbanno(self, variation):
        """Collect database values for ``variation`` from every handle.

        :param variation: object whose ``__dict__`` is passed as keyword
            arguments to each handle's ``fetch``.
        :returns: ``defaultdict(set)`` mapping attribute name to the set of
            values seen, ignoring values equal to ``"."``.
        """
        dbinfo = defaultdict(set)
        for dbs in self._dbhandles:
            try:
                infos = dbs.fetch(**variation.__dict__)
                for info in infos:
                    # items() works on Python 2 and 3; a Python-2-only call
                    # here would raise and be swallowed by the handler below.
                    for k, v in info.__dict__.items():
                        if v != ".":
                            dbinfo[k].add(v)
            except Exception:
                # Best effort: a failing database is skipped.
                continue
        return dbinfo

    def bedanno(self, bedfile, fileout=sys.stdout):
        """Write an annotated copy of ``bedfile`` to ``fileout``.

        Lines with fewer than three columns are copied unchanged. Lines whose
        second or third column is not an integer are treated as headers and
        get the annotation titles appended. Every other line produces one
        output row per record returned by ``self.HGVS.annobed``.

        :param bedfile: path of the tab-separated input file.
        :param fileout: output target passed to ``smart_open(fileout, 'w')``.
        :raises IOError: if ``bedfile`` is empty or not an existing file.
        """
        if not (bedfile and os.path.isfile(bedfile)):
            raise IOError("Fail to load file: %s" % bedfile)
        f_in = smart_open(bedfile)
        try:
            regions = f_in.readlines()
        finally:
            f_in.close()

        # One title per appended data column.
        titles = "Transcript\tgeneSym\tcHGVS\tProtein\tStand\tExonRegions"
        # "".split("\t") is [""], which would add a stray column per row.
        db_columns = self.dbtitle.split("\t") if self.dbtitle else []

        f_out = smart_open(fileout, 'w')
        # NOTE(review): with the default ``fileout`` this closes whatever
        # smart_open returned for sys.stdout - confirm that is intended.
        try:
            for line in regions:
                rows = line.strip().split("\t")
                if len(rows) < 3:
                    # Not a region line: echo it and move on.
                    f_out.write(line)
                    continue
                try:
                    chrom = str(rows[0])
                    start = int(rows[1])
                    stop = int(rows[2])
                except ValueError:
                    header = rows + [titles]
                    if self.dbtitle:
                        header.append(self.dbtitle)
                    f_out.write("\t".join(header) + '\n')
                    continue
                for anno in self.HGVS.annobed(chrom, start, stop):
                    dbinfo = self.dbanno(anno)
                    rows[0] = str(anno.Chrom)
                    rows[1] = str(anno.Start)
                    rows[2] = str(anno.Stop)
                    anno_message = [str(i) for i in rows]
                    anno_message.extend([
                        str(anno.Transcript),
                        str(anno.geneSym),
                        str(anno.cHgvs),
                        str(anno.Protein),
                        str(anno.Stand),
                        str(anno.ExonRegions),
                    ])
                    for column in db_columns:
                        if column in dbinfo:
                            anno_message.append(";".join(dbinfo[column]))
                        else:
                            anno_message.append(".")
                    f_out.write("\t".join(anno_message) + '\n')
        finally:
            f_out.close()
class CNVAnalysis(object):
    """Call, score, plot and annotate CNVs from per-chromosome ``*.cnv.args`` data."""

    def __init__(self, **kwargs):
        """Load per-chromosome CNV data and open the annotation databases.

        Keyword arguments read:
            indir: directory scanned for ``chr*.cnv.args`` files (required).
            correct_win_len: value for ``self.win_len``; 0, empty, ``None``
                or missing falls back to 30.
            correct_shift_len: value for ``self.shift_len``; 0, empty,
                ``None`` or missing falls back to 25.
            chrom: optional comma-separated chromosome names to keep.
            sample: optional comma-separated sample names to keep.
            dbdir: database root directory (required).
            transdb: optional transcript database path; when absent or empty,
                ``<dbdir>/transdb/ncbi_anno_rel104.dbref.db`` is used.
            reference: optional reference path; when absent or empty,
                ``<dbdir>/aln_db/hg19/hg19_chM_male_mask.fa`` is used.

        :raises KeyError: if ``indir`` or ``dbdir`` is missing.
        """
        self.outdir = os.path.abspath(kwargs["indir"])

        # Optional settings are read with .get so that a key which is missing
        # or present-but-None selects the default instead of raising.
        self.win_len = int(kwargs.get("correct_win_len") or 0) or 30
        self.shift_len = int(kwargs.get("correct_shift_len") or 0) or 25
        self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1

        chrom_arg = kwargs.get("chrom")
        sample_arg = kwargs.get("sample")
        chroms = str(chrom_arg).split(",") if chrom_arg else None
        samples = str(sample_arg).split(",") if sample_arg else None

        all_samples = set()
        contigs = list()
        self.cnvdata = defaultdict(dict)
        self.sample_win_data = defaultdict(dict)
        for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
            # "<chrom>.cnv.args" -> "<chrom>"
            chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
            if chroms is not None and chrom not in chroms:
                continue
            cnvdata = SaveLoad(cnv_data).load()
            contigs.append(chrom)
            for sample in cnvdata.keys():
                dep_f = os.path.join(
                    self.outdir, sample,
                    "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
                # Recorded before the sample filter: plot() draws every
                # registered sample, not only the selected ones.
                if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
                    self.sample_win_data[chrom][sample] = dep_f
                if samples is not None and sample not in samples:
                    continue
                all_samples.add(sample)
                self.cnvdata[sample][chrom] = cnvdata[sample]
        self.samples = sorted(all_samples)
        self.contigs = sorted(contigs, key=_chrom_valued)

        databases = os.path.abspath(kwargs["dbdir"])
        transdb_arg = kwargs.get("transdb")
        if transdb_arg:
            t_db = os.path.abspath(transdb_arg)
        else:
            t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")

        # Register every CNV database config under the name of its directory.
        for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
            db = os.path.abspath(db)
            dbname = os.path.basename(os.path.dirname(db))
            _AnnotationDB[dbname].add(db)

        reference_arg = kwargs.get("reference")
        if reference_arg:
            self.reference = os.path.abspath(reference_arg)
        else:
            self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
        self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
        self.HGVS = HGVS(t_db)

    def call_cnvs(self, sample):
        """Run ``hmm_cnv`` once per chromosome of ``sample`` in a process pool.

        Chromosomes with ``ploid < 1`` or empty depth data are skipped. Each
        job writes to ``<outdir>/<sample>/<chrom>.cnv``.

        :param sample: sample name used to index ``self.cnvdata``.
        :raises Exception: whatever a worker raised, re-raised here.
        """
        pool = Pool(processes=cpu(use_mem=2147483648, cpu_limit=len(self.contigs)))
        pending = []
        try:
            for chrom in self.contigs:
                record = self.cnvdata[sample][chrom]
                dep_data = record.data
                ploid = record.ploid
                if ploid < 1 or not len(dep_data):
                    continue
                output = os.path.join(self.outdir, sample, "%s.cnv" % chrom)
                # Each chromosome is submitted exactly once; running the same
                # job in this process as well would only duplicate the work.
                pending.append(pool.apply_async(
                    hmm_cnv,
                    (dep_data, record.regions, record.best_probability,
                     record.trials, ploid, output, self.contral_wins)))
            pool.close()
            # get() re-raises worker exceptions that would otherwise be lost.
            for job in pending:
                job.get()
        except BaseException:
            pool.terminate()
            raise
        finally:
            pool.join()

    def z_score(self, chrom, start, stop, sample):
        """Return the mean z-score of the windows overlapping ``[start, stop]``.

        Mean and standard deviation are taken over every window of ``chrom``
        in the sample's depth file.

        :returns: the score rounded to 4 decimals; ``0`` when no window
            overlaps, ``0.0`` when all depths on the chromosome are equal.
        """
        dep_data = list()
        deps = list()
        dep_handle = pysam.Tabixfile(self.sample_win_data[chrom][sample])
        try:
            for record in dep_handle.fetch(chrom):
                p_s, p_e, p_d = map(int, record.strip().split("\t")[1:4])
                dep_data.append(p_d)
                if (p_s <= start <= p_e) or (start <= p_s <= stop) or (p_s <= stop <= p_e):
                    deps.append(p_d)
        finally:
            dep_handle.close()
        if len(deps) == 0:
            return 0
        d_m = np.mean(dep_data)
        s_d = np.std(dep_data)
        if s_d == 0:
            # Every depth equals the mean, so no window deviates from it.
            return 0.0
        score = sum((d - d_m + 0.0) / s_d for d in deps)
        return round(score / len(deps), 4)

    def plot(self, sample, f_in):
        """Draw one log2 depth-ratio page per region of ``f_in`` into a PDF.

        The PDF is written next to ``f_in`` as ``<sample>.cnv.out.pdf``.
        ``sample`` is drawn in black, every other registered sample in
        magenta. Rows that are too short or have non-integer coordinates are
        skipped.

        :param sample: name of the sample to highlight.
        :param f_in: path of the tab-separated region file.
        """
        f_out = os.path.join(os.path.dirname(f_in), sample + ".cnv.out.pdf")
        fin = smart_open(f_in)
        try:
            lines = fin.readlines()
        finally:
            fin.close()

        pdf = PdfPages(f_out)
        try:
            for line in lines:
                rows = line.strip().split("\t")
                try:
                    chrom = str(rows[0])
                    start = max(int(rows[2]) - self.win_len, 0)
                    stop = int(rows[3]) + self.win_len
                    title = "%s_%s_%s_%s" % (rows[0], rows[2], rows[3], rows[5])
                except (ValueError, IndexError):
                    continue
                plt.figure()
                try:
                    samples = sorted(self.sample_win_data[chrom].keys())
                    deps = list()
                    for s in samples:
                        d = list()
                        dep_handle = pysam.Tabixfile(self.sample_win_data[chrom][s])
                        try:
                            for record in dep_handle.fetch(chrom, start, stop):
                                p_s, p_e, p_d = map(int, record.strip().split("\t")[1:4])
                                d.append(p_d)
                        finally:
                            dep_handle.close()
                        deps.append(d)
                    deps = np.array(deps)
                    # Depth of each sample relative to the per-window mean
                    # across all samples.
                    deps = np.log2(deps / deps.mean(axis=0))
                    for name, d in zip(samples, deps):
                        plt.plot(d, color="k" if name == sample else "m")
                    # Booleans are accepted by every matplotlib version; the
                    # legacy 'off' string is not.
                    plt.tick_params(axis='x', which='both', bottom=False,
                                    top=False, labelbottom=False)
                    plt.title(title)
                    pdf.savefig()
                finally:
                    plt.close()
        finally:
            pdf.close()

    def annotation(self, sample, debug=False):
        """Annotate the ``<chrom>.cnv`` files of ``sample`` into one TSV.

        Rows whose GC ratio is outside ``[0.3, 0.7]`` are dropped. Each kept
        row gets its z-score, GC ratio, the transcript descriptions from
        ``self.HGVS.annobed`` and one column per ``self.DBAnno`` title.

        :param sample: sample name.
        :param debug: when false, each ``<chrom>.cnv`` file is removed after
            it has been read.
        :returns: path of the written ``<sample>.cnv.anno.tsv`` file.
        """
        output = os.path.join(self.outdir, sample, "%s.cnv.anno.tsv" % sample)
        f_out = smart_open(output, 'w')
        try:
            titles = ["#Chrom", "Ploid", "Start", "Stop", "length", "copyNumber",
                      "Mtype", "meanP", "Z-score", "GCpr", "MutationName"]
            dbtitle = self.DBAnno.dbtitle.split("\t")
            titles.extend(dbtitle)
            f_out.write("\t".join(titles) + '\n')
            for contig in self.contigs:
                cnvs = os.path.join(self.outdir, sample, "%s.cnv" % contig)
                if not os.path.exists(cnvs):
                    continue
                f_in = smart_open(cnvs)
                try:
                    for line in f_in:
                        rows = line.strip().split("\t")
                        try:
                            chrom = str(rows[0])
                            start = int(rows[2])
                            stop = int(rows[3])
                            mtype = str(rows[6])
                        except (ValueError, IndexError):
                            continue
                        gcr = count_gc(self.DBAnno.refer.fetch(chrom, start, stop))
                        if not (0.3 <= gcr <= 0.7):
                            continue
                        # Scored only for rows that pass the GC filter:
                        # z_score() reads every window of the chromosome.
                        z_s = self.z_score(chrom, start, stop, sample)
                        variation = {"Chrom": chrom, "Start": start,
                                     "Stop": stop, "Mtype": mtype}
                        m_name = set()
                        for hgvs in self.HGVS.annobed(chrom, start, stop):
                            fields = [str(hgvs.Transcript), str(hgvs.Protein),
                                      str(hgvs.geneSym), str(hgvs.cHgvs),
                                      str(hgvs.ExonRegions)]
                            m_name.add(":".join([f for f in fields if f != "."]))
                        dbinfo = self.DBAnno.dbanno(variation)
                        anno_message = [str(i) for i in rows]
                        anno_message.append(str(z_s))
                        anno_message.append(str(gcr))
                        # Sorted so that the output is reproducible.
                        anno_message.append("|".join(sorted(m_name)))
                        for column in dbtitle:
                            if column in dbinfo:
                                anno_message.append("|".join(dbinfo[column]))
                            else:
                                anno_message.append(".")
                        f_out.write("\t".join(anno_message) + '\n')
                finally:
                    f_in.close()
                if not debug:
                    os.remove(cnvs)
        finally:
            f_out.close()
        return output