# Deduplicate SNV records on (chrom, locus, ref, alt) after normalizing each
# per-source label to a canonical chromosome via chrreg, register BAM labels,
# then sort the SNVs in chromosome order.
# NOTE(review): `snvdata.iteritems()` is Python 2 only (`.items()` on Python 3),
# and `snvheaders + filter(None, ...)` would raise TypeError on Python 3 where
# filter() returns an iterator — presumably this chunk predates a 2-to-3 port;
# confirm the target interpreter.  `assert (chrom)` is stripped under `-O`;
# an explicit raise would be safer for input validation.
# NOTE(review): this chunk is cut off inside the triple-quoted header-name string.
for (sf, chr, locus, ref, alt), r in snvdata.iteritems(): chrom = chrreg.label2chrom(sf, chr) assert (chrom) snvkey = (chrom, locus, ref, alt) if snvkey not in snvdata1: snvdata1[snvkey] = (chrom, locus, ref, alt, r) for bamfile in opt.alignments: chrreg.add_bamlabels(bamfile) chrreg.determine_chrom_order() snvdata = sorted(snvdata1.values(), key=lambda s: (chrreg.chrom_order(s[0]), s[1], s[2], s[3])) # extrasnvheaders = filter(lambda h: h in usedsnvheaders, extrasnvheaders) progress.message("SNVs: %d\n" % len(snvdata)) outheaders = snvheaders + filter( None, """ SNVCountForward SNVCountReverse RefCountForward RefCountReverse SNVCount RefCount GoodReads %BadRead R HomoVarSc HetSc HomoRefSc
# Build a 4-char presence signature ("1"/"0" per GDNA/SDNA/NRNA/TRNA sample type)
# and dispatch to the matching event-test class; unsupported combinations raise.
# Then, if requested, stream the (optionally gzipped) COSMIC TSV through
# csv.DictReader, splitting 'Mutation genome position' as "chr:start-end".
# NOTE(review): the COSMIC handle `f` is never closed in the visible code — a
# `with` block (or contextlib.closing) would be safer; confirm downstream usage.
# NOTE(review): chunk is cut off mid-loop after the position split.
sampsig = "".join( map(str, [1 * (len(types2files[t]) > 0) for t in "GDNA SDNA NRNA TRNA".split()])) if sampsig == "1111": events = AllSamplesEvent elif sampsig == "1100": events = DNAOnlyEvent elif sampsig == "1010": events = NormalOnlyEvent elif sampsig == "0111": events = NoGDNAEvent else: raise RuntimeError("Bad combination of sample files") progress.message("Testing for events: %s." % (", ".join(events.listall()), )) events.setCounts(GDNA, SDNA, NRNA, TRNA) cosmic_headers = [] if opt.cosmic: progress.stage("Parsing COSMIC annotation file") if opt.cosmic.endswith('.gz'): f = gzip.open(opt.cosmic, mode='rt', encoding='utf8') else: f = open(opt.cosmic, mode='rt', encoding='utf8') reader = csv.DictReader(f, delimiter='\t') for cos in reader: if cos['Mutation genome position']: chr, locus = cos['Mutation genome position'].split(':', 1) pos_st, pos_ed = locus.split('-', 1)
# Filter SNPs to simple ACGT alleles, record which optional annotation headers are
# actually populated, and deduplicate on (chr, locus, ref, alt, canonical-annotation)
# before sorting.  Then collect splice-junction gaps from BED files: each record is
# asserted to have exactly two blocks, and the gap is the
# (end-of-block-1, start-of-block-2) interval between them.
# NOTE(review): `bs = map(int, ...)` followed by `len(bs)` and `bs[0]`/`bs[1]` only
# works on Python 2 where map() returns a list; on Python 3 this raises TypeError.
# `extrasnpheaders = filter(...)` has the same heritage — confirm target interpreter.
# `assert(len(bs) == 2)` is stripped under `-O`; prefer an explicit raise for
# validating external BED input.
# NOTE(review): chunk starts mid-loop (leading `continue`) and is cut off at the end.
continue if not re.search(r'^[ACGT](,[ACGT])*$', alt): continue for h in r: if r.get(h): usedsnpheaders.add(h) cannonr = (",".join(map(lambda t: "%s:%s" % t, sorted(r.items())))) snpkey = (chr, locus, ref, alt, cannonr) if snpkey not in snpdata: snpdata[snpkey] = (chr, locus, ref, alt, r) progress.update() progress.done() snpdata = sorted(snpdata.values()) extrasnpheaders = filter(lambda h: h in usedsnpheaders, extrasnpheaders) progress.message("SNPs: %d" % len(snpdata)) progress.stage("Read splice junction data", len(opt.junctions)) juncdata = set() for juncfile in opt.junctions: junc = BEDFile(filename=juncfile) for r in junc: chr = r['chrom'] st = int(r['chromStart']) ed = int(r['chromEnd']) bs = map(int, r['blockSizes'].split(',')) assert(len(bs) == 2) gap = (st + bs[0], ed - bs[1]) key = (chr, gap) juncdata.add(key) progress.update()
# Near-duplicate of the other SNV-deduplication chunk in this file (older,
# unspaced formatting): canonicalize chromosome labels, dedupe SNVs on
# (chrom, locus, ref, alt), register BAM labels, sort in chromosome order.
# NOTE(review): `snvdata.iteritems()` is Python 2 only, and
# `snvheaders + filter(None, ...)` raises TypeError under Python 3 (filter returns
# an iterator); the sibling chunk looks like the 2-to-3 ported copy — consider
# retiring this stale variant.  `assert(chrom)` is stripped under `-O`.
# NOTE(review): chunk is cut off inside the triple-quoted output-header string.
snvdata1 = {} for (sf, chr, locus, ref, alt), r in snvdata.iteritems(): chrom = chrreg.label2chrom(sf,chr) assert(chrom) snvkey = (chrom,locus,ref,alt) if snvkey not in snvdata1: snvdata1[snvkey] = (chrom,locus,ref,alt,r) for bamfile in opt.alignments: chrreg.add_bamlabels(bamfile) chrreg.determine_chrom_order() snvdata = sorted(snvdata1.values(),key=lambda s: (chrreg.chrom_order(s[0]),s[1],s[2],s[3])) # extrasnvheaders = filter(lambda h: h in usedsnvheaders, extrasnvheaders) progress.message("SNVs: %d\n" % len(snvdata)) outheaders = snvheaders + filter(None, """ SNVCountForward SNVCountReverse RefCountForward RefCountReverse SNVCount RefCount GoodReads %BadRead R HomoVarSc HetSc HomoRefSc VarDomSc
# Log the effective readCountsMatrix options, then read each ReadCounts input
# file, dispatching on the lower-cased filename extension to a table reader;
# txtheaders must stay in sync with the readCounts .txt column order (see the
# inline NOTE in the code).
# NOTE(review): `None if not matrix else opt.matrix` reads a bare name `matrix`
# while every other operand goes through `opt.` — this looks like it should be
# `opt.matrix` (otherwise NameError unless `matrix` is bound elsewhere); verify.
# NOTE(review): the "(ignored)" hint for -m fires when opt.matrix is NOT one of
# "Ref:Var"/"Ref;Var" — double-check that condition matches the intended
# semantics before relying on the log text.
# NOTE(review): chunk is cut off at the dangling `if extn == 'csv':`.
cmdargs = " ".join(args) execution_log = """ readCounts Options: ReadCounts Files (-c): %s Matrix Output (-M): %s Min. Reads (-m): %s%s Quiet (-q): %s Outfile File (-o): %s Command-Line: readCountsMatrix %s """ % (", ".join(opt.counts), None if not matrix else opt.matrix, opt.minreads, "" if opt.matrix not in ("Ref:Var", "Ref;Var") or opt.minreads == 0 else " (ignored)", opt.quiet, opt.output, cmdargs) progress.message(execution_log) from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable progress.stage("Read ReadCounts input files", len(opt.counts)) headers = "CHROM POS REF ALT ReadGroup RefCount SNVCount GoodReads".split() # NOTE: This *MUST* correspond to the columns in the readCounts .txt file output txtheaders = "CHROM POS REF ALT ReadGroup SNVCountForward SNVCountReverse RefCountForward RefCountReverse SNVCount RefCount GoodReads".split( ) allrg = set() vafmatrix = defaultdict(dict) for filename in opt.counts: base, extn = filename.rsplit('.', 1) extn = extn.lower() if extn == 'csv':
# Older (pre-port) variant of the event-dispatch / COSMIC-parsing chunk: exit on
# fatal validation errors, pick the event-test class from the sample-presence
# signature, then parse the COSMIC TSV.
# NOTE(review): `gzip.open(opt.cosmic, 'r')` opens in BINARY mode, so on Python 3
# csv.DictReader would receive bytes and fail; the sibling chunk opening with
# mode='rt', encoding='utf8' looks like the fixed version — consider retiring
# this stale copy.  The handle `f` is also never closed in the visible code.
# NOTE(review): `from event import *` mid-script hides which names (the *Event
# classes) it injects; an explicit import list would be clearer.
# NOTE(review): chunk is cut off mid-loop after the position split.
if fatal: sys.exit(1) from event import * sampsig = "".join(map(str,map(lambda t: 1*(len(types2files[t])>0),"GDNA SDNA NRNA TRNA".split()))) if sampsig == "1111": events = AllSamplesEvent elif sampsig == "1100": events = DNAOnlyEvent elif sampsig == "1010": events = NormalOnlyEvent elif sampsig == "0111": events = NoGDNAEvent else: raise RuntimeError("Bad combination of sample files") progress.message("Testing for events: %s."%(", ".join(events.listall()),)) events.setCounts(GDNA,SDNA,NRNA,TRNA) cosmic_headers = [] if opt.cosmic: progress.stage("Parsing COSMIC annotation file") if opt.cosmic.endswith('.gz'): f = gzip.open(opt.cosmic, 'r') else: f = open(opt.cosmic, 'r') reader = csv.DictReader(f, delimiter='\t') for cos in reader: if cos['Mutation genome position']: chr,locus = cos['Mutation genome position'].split(':',1) pos_st,pos_ed = locus.split('-',1)
# Canonicalize junction chromosome labels via chrreg, rebuild the junction set on
# canonical names, sort SNPs and junction endpoints (each gap contributes both its
# start and end coordinate) in chromosome order, and begin assembling the output
# header list.  This chunk uses the 2-to-3 ported style (list comprehensions
# instead of filter(), explicit list() around dict.values()).
# NOTE(review): `snpdata = sorted(list(snvdata1.values()), ...)` mixes the
# snv/snp naming schemes — presumably snvdata1 is the dedup dict built earlier;
# verify this is not a copy-paste slip before renaming either side.
# NOTE(review): chunk is cut off inside the triple-quoted header-name string.
for juncfile in juncchroms: chrreg.add_labels(juncfile, juncchroms[juncfile]) juncdata1 = set() for jf, chr, gap in juncdata: chrom = chrreg.label2chrom(jf, chr) juncdata1.add((chrom, gap)) chrreg.determine_chrom_order() snpdata = sorted(list(snvdata1.values()), key=lambda s: (chrreg.chrom_order(s[0]), s[1], s[2], s[3])) extrasnpheaders = [h for h in extrasnpheaders if h in usedsnpheaders] progress.message("SNPs: %d" % len(snpdata)) juncdata = sorted( ((chr, gap[i], gap) for i in (0, 1) for chr, gap in juncdata1), key=lambda j: (chrreg.chrom_order(j[0]), j[1], j[2])) progress.message("Exon/Intron junctions: %d" % len(juncdata)) outheaders = snpheaders + [ _f for _f in """ NumofJuncs Distance Junctions SNPJuncIntronCount SNPJuncNoIntronCount NoSNPJuncIntronCount NoSNPJuncNoIntronCount
Advanced: Min. Reads (-m) %s (applied only to VAF matrix) Max. Reads (-M): %s Read Groups (-G): %s%s Threads per BAM (-t): %s Quiet (-q): %s Command-Line: scReadCounts %s """ % (", ".join(opt.snvs), ", ".join(opt.alignments), opt.filter, "" if readfilter == None else "\n" + indent(readfilter.tostr(), 10), opt.output, opt.minreads, opt.maxreads, None if readgroup == None else opt.readgroup, "" if readgroup == None else "\n" + indent(readgroup.tostr(), 12), opt.tpb, opt.quiet, cmdargs) progress.message(execution_log) args = [] args.extend(["-s", " ".join(opt.snvs)]) args.extend(["-r", " ".join(opt.alignments)]) args.extend(["-f", opt.filter]) args.extend(["-o", opt.output]) args.extend(["-m", 0]) if opt.maxreads != maxreads_default: args.extend(["-M", opt.maxreads]) if readgroup != None: args.extend(["-G", opt.readgroup]) args.extend(["-t", opt.tpb]) if opt.quiet: args.extend(["-q"]) args = [str(x) for x in args]
# NOTE(review): this chunk starts mid-way through the scReadCounts execution-log
# template string.  After logging it rebuilds an argv list for the underlying
# readCounts invocation: "-m" is deliberately forced to 0 here (per the log text,
# min-reads is applied only to the VAF matrix), while "-M" and "-G" are forwarded
# only when they differ from their defaults, and everything is stringified at the
# end.  Style nits: prefer `is None` / `is not None` over `== None` / `!= None`,
# and `args.append("-q")` over extending with a one-element list.
Unique Reads (-U): %s Read Groups (-G): %s%s Threads per BAM (-t): %s Full Headers (-F): %s Quiet (-q): %s Debug (-d): %s Command-Line: readCounts %s """ % (", ".join(opt.snvs), ", ".join( opt.alignments), opt.filter, "" if readfilter == None else "\n" + indent(readfilter.tostr(), 10), opt.output, opt.minreads, opt.maxreads, opt.unique, None if readgroup == None else opt.readgroup, "" if readgroup == None else "\n" + indent(readgroup.tostr(), 12), opt.tpb, opt.full, opt.quiet, opt.debug, cmdargs) progress.message(execution_log) if opt.maxreads == None: opt.maxreads = 1e+20 from dataset import XLSFileTable, CSVFileTable, TSVFileTable, XLSXFileTable, TXTFileTable, BEDFile, VCFFile progress.stage("Read SNV data", len(opt.snvs)) snvheaders = [_f for _f in """ CHROM POS REF ALT """.split() if _f] snvdata = {} # extrasnvheaders = [] # usedsnvheaders = set() snvchroms = defaultdict(set)
# NOTE(review): this chunk starts mid-way through the readCounts execution-log
# template string.  After logging, an unset --maxreads is replaced by the
# sentinel 1e+20 — a float, so downstream comparisons presumably tolerate float
# coercion (an int like 10**20 would avoid it; confirm).  The SNV header list is
# fixed to CHROM/POS/REF/ALT and snvchroms is a defaultdict(set), presumably
# keyed by SNV source file — verify against the code that fills it.  Style nit:
# prefer `opt.maxreads is None` over `== None`.