def mutscan_signature_rsq():
    """Print per-sample substitution counts for the RNA-seq mutscan results.

    Every directory in ``mysetting.rsqMutscanDirL`` is searched with ``find``
    for ``*dbsnp_flt`` files (paths containing ``bak`` are ignored).  For each
    file, single-base substitutions that reach ``MIN_MUT_N`` alternate reads
    and ``MIN_COV`` total reads are counted.  When the reference base is not
    C or T, both alleles are passed through ``mybasic.rc`` before counting.

    Input columns used (tab separated): 2 = ref, 3 = alt, 4 = n_ref,
    5 = n_alt (0-based).

    One line per observed (ref, alt) pair is written to stdout as
    ``sample_id<TAB>ref>alt<TAB>count<TAB>total``.

    Raises:
        AttributeError: if a file name does not match the expected
            ``<sid>_..._splice....mutscan.dbsnp_flt`` naming scheme.
    """
    for dirN in mysetting.rsqMutscanDirL:
        # The -name pattern is quoted so that the shell passes it to find
        # verbatim instead of expanding it against the working directory.
        foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
        fileL = [x.rstrip() for x in foundL if 'bak' not in x.rstrip()]

        for fileN in fileL:
            baseN = os.path.basename(fileN)
            if 'splice_Z' in fileN:
                sid = re.match('(.*)_splice[2]*.mutscan.dbsnp_flt', baseN).group(1)
            else:
                sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)

            cntH = {}
            total = 0

            # ``with`` guarantees the handle is released even if a malformed
            # line raises while parsing.
            with open(fileN, 'r') as inFile:
                for line in inFile:
                    colL = line.rstrip().split('\t')
                    ref = colL[2]
                    alt = colL[3]

                    # skip unknown reference bases and multi-base alternates
                    if ref == 'N' or len(alt) > 1:
                        continue

                    n_ref = int(colL[4])
                    n_alt = int(colL[5])
                    if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                        continue

                    if ref not in ('C', 'T'):
                        ref = mybasic.rc(ref)
                        alt = mybasic.rc(alt)

                    cntH[(ref, alt)] = cntH.get((ref, alt), 0) + 1
                    total += 1

            for (r, a) in cntH:
                sys.stdout.write('%s\t%s>%s\t%s\t%s\n' % (sid, r, a, cntH[(r, a)], total))
def mutation_signaturei_t(inDir, outName=''):
    """Tabulate the trinucleotide mutation signature of each ``*.mutect`` file.

    Args:
        inDir: directory holding the call files.  Every ``<inDir>/*.mutect``
            path that does not contain ``union_pos`` is processed.
        outName: path of the table to write; ``''`` writes to stdout.

    For each call file the first line is skipped, the second line is taken
    as the tab-separated header, and every remaining row is counted unless
    its ``judgement`` is ``REJECT`` or its ``contig`` is ``chrM``/``chrMT``.
    The trinucleotide is ``context[2] + ref_allele + context[4]``.  When the
    reference base is not C or T, the substitution and the trinucleotide are
    passed through ``rc`` (a name expected to be provided by the enclosing
    module).

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.
    The sample id is the file name up to the first ``.`` and then the first
    ``_``.  A ``<sample> <path>`` progress line is written to stdout for
    every file processed.
    """
    import glob  # local import: only needed by this function

    toStdout = (outName == '')
    outFile = sys.stdout if toStdout else open(outName, 'w')
    try:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")

        # Same selection as ``ls <inDir>/*.mutect | grep -v union_pos`` but
        # without a shell, so odd characters in inDir cannot break the command.
        mutFileNL = [p for p in sorted(glob.glob('%s/*.mutect' % inDir))
                     if 'union_pos' not in p]

        for mutFileN in mutFileNL:
            sampN = mutFileN.split('/')[-1].split('.')[0].split('_')[0]
            sys.stdout.write('%s %s\n' % (sampN, mutFileN))

            sigH = {}   # (substitution, trinucleotide) -> count
            cntH = {}   # substitution -> count
            total = 0

            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # first line is not part of the table
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((name, i) for (i, name) in enumerate(headerL))

                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]
                    status = colL[idxH['judgement']]

                    if status == 'REJECT' or chrom in ('chrMT', 'chrM'):
                        continue

                    total += 1
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)

                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

            for (nt_ch, tri) in sigH:
                outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                              (sampN, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
    finally:
        outFile.flush()
        # Never close sys.stdout: the rest of the program still needs it.
        if not toStdout:
            outFile.close()
def mutation_signaturei_t(inDir, outName=''):
    """Tabulate the trinucleotide mutation signature of each ``*.mutect`` file.

    Args:
        inDir: directory holding the call files.  Every ``<inDir>/*.mutect``
            path that does not contain ``union_pos`` is processed.
        outName: path of the table to write; ``''`` writes to stdout.

    For each call file the first line is skipped, the second line is taken
    as the tab-separated header, and every remaining row is counted unless
    its ``judgement`` is ``REJECT`` or its ``contig`` is ``chrM``/``chrMT``.
    The trinucleotide is ``context[2] + ref_allele + context[4]``.  When the
    reference base is not C or T, the substitution and the trinucleotide are
    passed through ``rc`` (a name expected to be provided by the enclosing
    module).

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.
    The sample id is the file name up to the first ``.`` and then the first
    ``_``.  A ``<sample> <path>`` progress line is written to stdout for
    every file processed.
    """
    import glob  # local import: only needed by this function

    toStdout = (outName == '')
    outFile = sys.stdout if toStdout else open(outName, 'w')
    try:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")

        # Same selection as ``ls <inDir>/*.mutect | grep -v union_pos`` but
        # without a shell, so odd characters in inDir cannot break the command.
        mutFileNL = [p for p in sorted(glob.glob('%s/*.mutect' % inDir))
                     if 'union_pos' not in p]

        for mutFileN in mutFileNL:
            sampN = mutFileN.split('/')[-1].split('.')[0].split('_')[0]
            sys.stdout.write('%s %s\n' % (sampN, mutFileN))

            sigH = {}   # (substitution, trinucleotide) -> count
            cntH = {}   # substitution -> count
            total = 0

            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # first line is not part of the table
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((name, i) for (i, name) in enumerate(headerL))

                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]
                    status = colL[idxH['judgement']]

                    if status == 'REJECT' or chrom in ('chrMT', 'chrM'):
                        continue

                    total += 1
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)

                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

            for (nt_ch, tri) in sigH:
                outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                              (sampN, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
    finally:
        outFile.flush()
        # Never close sys.stdout: the rest of the program still needs it.
        if not toStdout:
            outFile.close()
def process_bp(inFileName,outFileName,regionL): result = mygsnap.gsnapFile(inFileName,True) outFile = open(outFileName, 'w') outFile.write('browser full knownGene\n') outFile.write('track name="%s" visibility=2\n' % inFileName) for rL in result: if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='concordant'): raise Exception locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]), mygenome.locus(rL[1].matchL()[0].segL[0][2])] flag = False for loc in locL: for region in regionL: if loc.overlap(region) > 0: flag = True if flag: print '^%s.*%s$\n' % (rL[0].seq(),mybasic.rc(rL[1].seq())), for loc in locL: outFile.write('%s\t%s\t%s\n' % (loc.chrom,loc.chrSta,loc.chrEnd))
def process_bp(inFileName,outFileName,coordH,regionL): result = mygsnap.gsnapFile(inFileName,True) outFile = open(outFileName, 'w') outFile.write('browser full knownGene\n') outFile.write('track name="%s" visibility=2\n' % inFileName) for rL in result: if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='unpaired'): raise Exception locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]), mygenome.locus(rL[1].matchL()[0].segL[0][2])] for loc in locL: loc.chrSta += coordH[loc.chrom][1] -1 loc.chrEnd += coordH[loc.chrom][1] -1 loc.chrom = coordH[loc.chrom][0] flag = False for loc in locL: for region in regionL: if loc.overlap(region) > 0: flag = True if flag: print '^%s.*%s$\n' % (rL[0].seq(),mybasic.rc(rL[1].seq())), for loc in locL: outFile.write('%s\t%s\t%s\n' % (loc.chrom,loc.chrSta,loc.chrEnd))
def process_bp(inGsnapFileName): result = mygsnap.gsnapFile(inGsnapFileName, False) #outBpFile = open(outBpFileName, 'w') seqH = {} for r in result: match = r.matchL()[0] if not '(transloc)' in r.pairRel: raise Exception if len(match.segL) != 2: raise Exception s1 = match.segL[0][2] s2 = match.segL[1][2] direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1) bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups() bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups() # if bp1[0] == bp2[0]: # continue if direction == 'sense': seq = r.seq() offset = int(match.segL[0][1].split('..')[1]) bp12 = (bp1, bp2) else: seq = mybasic.rc(r.seq(), 'DNA') offset = len(seq) - int(match.segL[0][1].split('..')[1]) bp12 = (bp2, bp1) mybasic.addHash(seqH, bp12, (offset, seq)) seqL = seqH.items() seqL.sort(lambda x, y: cmp(len(y[1]), len(x[1]))) for ((bp1, bp2), vL) in seqL: vL.sort(lambda x, y: cmp(y[0], x[0])) maxOffset = vL[0][0] print '\n', bp1, bp2, len(vL), '\n' for (offset, seq) in vL: print '%s%s %s' % (' ' * (maxOffset - offset), seq[:offset], seq[offset:])
def process_bp(inGsnapFileName, outBpFileName):
    """Write one line per translocation junction with its supporting reads.

    Args:
        inGsnapFileName: single-end GSNAP output, read through
            ``mygsnap.gsnapFile`` with its second argument set to ``False``.
        outBpFileName: file to create.

    Every record must be a ``(transloc)`` alignment with exactly two
    segments on the same strand.  Reads are grouped by junction (swapped,
    and the read reverse-complemented, for ``-`` strand alignments).  Each
    output line is

        ``<chr>:<pos-blockSize>-<pos>,<chr>:<pos>-<pos+blockSize>,<off>:<seq>|...``

    where ``blockSize`` is a module-level name and ``off`` is
    ``blockSize - offset + 1``.

    NOTE(review): both coordinate blocks are built from the first junction
    side (``j1``); the second side is not used in the output.  Confirm that
    this is intended.

    Raises:
        Exception: on a record that is not a same-strand two-segment
            translocation.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    # ``with`` guarantees the output is flushed and closed, also when a
    # non-conforming record aborts the run.
    with open(outBpFileName, 'w') as outBpFile:
        seqH = {}

        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation alignment')
            if len(match.segL) != 2:
                raise Exception('translocation alignment does not have two segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments lie on different strands')
            strand = s1[0]

            # (chromosome, end of segment 1) and (chromosome, start of segment 2)
            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():
            vL.sort(key=lambda t: t[0])  # ascending offset, stable for ties
            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]
            chrom = j1[0].split('_')[0]
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(j1[1]) - blockSize, j1[1],
                             chrom, j1[1], int(j1[1]) + blockSize,
                             '|'.join(vL_mod)))
def mutect_weblogo_sub(sampN, inFileN, outFileN, pdfFileN):
    """Render a sequence logo of the context around C>T mutations.

    Args:
        sampN: sample name, used as the logo title.
        inFileN: call table; the first line is skipped and the second line
            is the tab-separated header (``context``, ``ref_allele``,
            ``alt_allele`` and ``judgement`` columns are used).
        outFileN: intermediate file receiving one context sequence per line.
        pdfFileN: logo file written by ``weblogolib.pdf_formatter``.

    Rows with judgement ``REJECT`` are skipped.  The sequence is the first
    three and last three characters of ``context`` with the reference base
    in between.  When the reference base is not C or T, sequence and alleles
    are passed through ``mybasic.rc``.  Only C>T changes are kept.
    """
    with open(inFileN, 'r') as inFile:
        inFile.readline()  # comment line
        headerL = inFile.readline().rstrip().split('\t')
        idxH = dict((name, i) for (i, name) in enumerate(headerL))

        with open(outFileN, 'w') as outFile:
            for line in inFile:
                colL = line.rstrip().split('\t')
                context = colL[idxH['context']]
                ref = colL[idxH['ref_allele']]
                alt = colL[idxH['alt_allele']]
                status = colL[idxH['judgement']]

                if status == 'REJECT':
                    continue

                context = context[:3] + ref + context[-3:]

                if ref not in ('C', 'T'):
                    context = mybasic.rc(context)
                    ref = mybasic.rc(ref)
                    alt = mybasic.rc(alt)

                if ref == 'C' and alt == 'T':  ## TMZ context only
                    outFile.write('%s\n' % context)

    # The intermediate file is closed (and therefore flushed) before it is
    # read back.
    with open(outFileN, 'r') as fin:
        seqs = weblogolib.read_seq_data(fin)

    data = weblogolib.LogoData.from_seqs(seqs)

    options = weblogolib.LogoOptions()
    options.show_fineprint = False
    options.first_index = -3
    options.logo_title = sampN

    logoFormat = weblogolib.LogoFormat(data, options)

    # Closing the handle makes sure the complete PDF reaches the disk.
    with open(pdfFileN, 'w') as fout:
        weblogolib.pdf_formatter(data, logoFormat, fout)
def mutect_weblogo_sub(sampN, inFileN, outFileN, pdfFileN):
    """Render a sequence logo of the context around C>T mutations.

    Args:
        sampN: sample name, used as the logo title.
        inFileN: call table; the first line is skipped and the second line
            is the tab-separated header (``context``, ``ref_allele``,
            ``alt_allele`` and ``judgement`` columns are used).
        outFileN: intermediate file receiving one context sequence per line.
        pdfFileN: logo file written by ``weblogolib.pdf_formatter``.

    Rows with judgement ``REJECT`` are skipped.  The sequence is the first
    three and last three characters of ``context`` with the reference base
    in between.  When the reference base is not C or T, sequence and alleles
    are passed through ``mybasic.rc``.  Only C>T changes are kept.
    """
    with open(inFileN, 'r') as inFile:
        inFile.readline()  # comment line
        headerL = inFile.readline().rstrip().split('\t')
        idxH = dict((name, i) for (i, name) in enumerate(headerL))

        with open(outFileN, 'w') as outFile:
            for line in inFile:
                colL = line.rstrip().split('\t')
                context = colL[idxH['context']]
                ref = colL[idxH['ref_allele']]
                alt = colL[idxH['alt_allele']]
                status = colL[idxH['judgement']]

                if status == 'REJECT':
                    continue

                context = context[:3] + ref + context[-3:]

                if ref not in ('C', 'T'):
                    context = mybasic.rc(context)
                    ref = mybasic.rc(ref)
                    alt = mybasic.rc(alt)

                if ref == 'C' and alt == 'T':  ## TMZ context only
                    outFile.write('%s\n' % context)

    # The intermediate file is closed (and therefore flushed) before it is
    # read back.
    with open(outFileN, 'r') as fin:
        seqs = weblogolib.read_seq_data(fin)

    data = weblogolib.LogoData.from_seqs(seqs)

    options = weblogolib.LogoOptions()
    options.show_fineprint = False
    options.first_index = -3
    options.logo_title = sampN

    logoFormat = weblogolib.LogoFormat(data, options)

    # Closing the handle makes sure the complete PDF reaches the disk.
    with open(pdfFileN, 'w') as fout:
        weblogolib.pdf_formatter(data, logoFormat, fout)
def process_bp(inGsnapFileName,outBpFileName):
    """Write one line per translocation junction with its supporting reads.

    Args:
        inGsnapFileName: single-end GSNAP output, read through
            ``mygsnap.gsnapFile`` with its second argument set to ``False``.
        outBpFileName: file to create.

    Every record must be a ``(transloc)`` alignment with exactly two
    segments on the same strand.  Reads are grouped by junction (swapped,
    and the read reverse-complemented, for ``-`` strand alignments).  Each
    output line is

        ``<chr1>:<pos1>,<chr2>:<pos2>,<offset>:<seq>|<offset>:<seq>|...``

    with the reads ordered by decreasing offset.

    NOTE(review): the original body contained unfinished placeholders
    (``re.match()`` called without arguments and unused ``k1_pos`` ...
    names) that raised ``TypeError`` on the first junction.  They have been
    removed; only the output line that was already complete is emitted.

    Raises:
        Exception: on a record that is not a same-strand two-segment
            translocation.
    """
    result = mygsnap.gsnapFile(inGsnapFileName,False)

    # ``with`` guarantees the output is flushed and closed, also when a
    # non-conforming record aborts the run.
    with open(outBpFileName, 'w') as outBpFile:
        seqH = {}

        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation alignment')
            if len(match.segL) != 2:
                raise Exception('translocation alignment does not have two segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments lie on different strands')
            strand = s1[0]

            # (chromosome, end of segment 1) and (chromosome, start of segment 2)
            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(),'DNA')
                offset = len(seq)-int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH,junction,(offset,seq))

        for ((k1,k2), v) in seqH.items():
            v.sort(key=lambda t: t[0], reverse=True)  # decreasing offset, stable
            readS = '|'.join(['%s:%s' % (offset,seq) for (offset,seq) in v])
            outBpFile.write('%s,%s,%s\n' % (':'.join(k1),':'.join(k2),readS))
def process_bp(inGsnapFileName): result = mygsnap.gsnapFile(inGsnapFileName,False) #outBpFile = open(outBpFileName, 'w') seqH = {} for r in result: match = r.matchL()[0] if not '(transloc)' in r.pairRel: raise Exception if len(match.segL) != 2: raise Exception s1 = match.segL[0][2] s2 = match.segL[1][2] direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1) bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups() bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups() # if bp1[0] == bp2[0]: # continue if direction == 'sense': seq = r.seq() offset = int(match.segL[0][1].split('..')[1]) bp12 = (bp1, bp2) else: seq = mybasic.rc(r.seq(),'DNA') offset = len(seq)-int(match.segL[0][1].split('..')[1]) bp12 = (bp2, bp1) mybasic.addHash(seqH,bp12,(offset,seq)) seqL = seqH.items() seqL.sort(lambda x,y: cmp(len(y[1]),len(x[1]))) for ((bp1,bp2), vL) in seqL: vL.sort(lambda x,y: cmp(y[0],x[0])) maxOffset = vL[0][0] print '\n',bp1,bp2,len(vL),'\n' for (offset,seq) in vL: print '%s%s %s' % (' ' * (maxOffset-offset),seq[:offset],seq[offset:])
def mutscan_signature_rsq():
    """Print per-sample substitution counts for the RNA-seq mutscan results.

    Every directory in ``mysetting.rsqMutscanDirL`` is searched with ``find``
    for ``*dbsnp_flt`` files (paths containing ``bak`` are ignored).  For each
    file, single-base substitutions that reach ``MIN_MUT_N`` alternate reads
    and ``MIN_COV`` total reads are counted.  When the reference base is not
    C or T, both alleles are passed through ``mybasic.rc`` before counting.

    Input columns used (tab separated): 2 = ref, 3 = alt, 4 = n_ref,
    5 = n_alt (0-based).

    One line per observed (ref, alt) pair is written to stdout as
    ``sample_id<TAB>ref>alt<TAB>count<TAB>total``.

    Raises:
        AttributeError: if a file name does not match the expected
            ``<sid>_..._splice....mutscan.dbsnp_flt`` naming scheme.
    """
    for dirN in mysetting.rsqMutscanDirL:
        # The -name pattern is quoted so that the shell passes it to find
        # verbatim instead of expanding it against the working directory.
        foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
        fileL = [x.rstrip() for x in foundL if 'bak' not in x.rstrip()]

        for fileN in fileL:
            baseN = os.path.basename(fileN)
            if 'splice_Z' in fileN:
                sid = re.match('(.*)_splice[2]*.mutscan.dbsnp_flt', baseN).group(1)
            else:
                sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)

            cntH = {}
            total = 0

            # ``with`` guarantees the handle is released even if a malformed
            # line raises while parsing.
            with open(fileN, 'r') as inFile:
                for line in inFile:
                    colL = line.rstrip().split('\t')
                    ref = colL[2]
                    alt = colL[3]

                    # skip unknown reference bases and multi-base alternates
                    if ref == 'N' or len(alt) > 1:
                        continue

                    n_ref = int(colL[4])
                    n_alt = int(colL[5])
                    if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                        continue

                    if ref not in ('C', 'T'):
                        ref = mybasic.rc(ref)
                        alt = mybasic.rc(alt)

                    cntH[(ref, alt)] = cntH.get((ref, alt), 0) + 1
                    total += 1

            for (r, a) in cntH:
                sys.stdout.write('%s\t%s>%s\t%s\t%s\n' % (sid, r, a, cntH[(r, a)], total))
def process_bp(inGsnapFileName,outBpFileName):
    """Write one line per translocation junction with its supporting reads.

    Args:
        inGsnapFileName: single-end GSNAP output, read through
            ``mygsnap.gsnapFile`` with its second argument set to ``False``.
        outBpFileName: file to create.

    Every record must be a ``(transloc)`` alignment with exactly two
    segments on the same strand.  Reads are grouped by junction (swapped,
    and the read reverse-complemented, for ``-`` strand alignments).  Each
    output line is

        ``<chr>:<pos-blockSize>-<pos>,<chr>:<pos>-<pos+blockSize>,<off>:<seq>|...``

    where ``blockSize`` is a module-level name and ``off`` is
    ``blockSize - offset + 1``.

    NOTE(review): both coordinate blocks are built from the first junction
    side (``j1``); the second side is not used in the output.  Confirm that
    this is intended.

    Raises:
        Exception: on a record that is not a same-strand two-segment
            translocation.
    """
    result = mygsnap.gsnapFile(inGsnapFileName,False)

    # ``with`` guarantees the output is flushed and closed, also when a
    # non-conforming record aborts the run.
    with open(outBpFileName, 'w') as outBpFile:
        seqH = {}

        for r in result:
            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation alignment')
            if len(match.segL) != 2:
                raise Exception('translocation alignment does not have two segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]
            if s1[0] != s2[0]:
                raise Exception('segments lie on different strands')
            strand = s1[0]

            # (chromosome, end of segment 1) and (chromosome, start of segment 2)
            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(),'DNA')
                offset = len(seq)-int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH,junction,(offset,seq))

        for ((j1,j2), vL) in seqH.items():
            vL.sort(key=lambda t: t[0])  # ascending offset, stable for ties
            vL_mod = ['%s:%s' % (blockSize-offset+1,seq) for (offset,seq) in vL]
            chrom = j1[0].split('_')[0]
            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom,int(j1[1])-blockSize,j1[1],
                             chrom,j1[1],int(j1[1])+blockSize,
                             '|'.join(vL_mod)))
def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Split each FASTQ read into a pair made of its two ends.

    Args:
        inFqFileName: FASTQ path, or the literal ``'stdin'`` to read
            ``sys.stdin``.
        outFqFilePrefix: output prefix; ``<prefix>.1.fastq`` and
            ``<prefix>.2.fastq`` are created.
        trimLen: number of bases taken from each end of the read.

    For every four-line record the read name is the header text after ``@``
    up to the first space.  Reads with an ``N`` in either end are dropped.
    File 1 gets the first ``trimLen`` bases and qualities as ``<name>/1``;
    file 2 gets ``mybasic.rc`` of the last ``trimLen`` bases and
    ``mybasic.rev`` of the matching qualities as ``<name>/2``.

    Raises:
        Exception: if a record header does not start with ``@``.
    """
    fromStdin = (inFqFileName == 'stdin')
    inFqFile = sys.stdin if fromStdin else open(inFqFileName)

    try:
        # ``with`` closes both outputs even when a malformed record raises.
        with open('%s.1.fastq' % outFqFilePrefix, 'w') as outFqFile1, \
                open('%s.2.fastq' % outFqFilePrefix, 'w') as outFqFile2:

            while True:
                line = inFqFile.readline()
                if not line:
                    break

                # Strip only the line terminator so that a final line without
                # a trailing newline does not lose its last base/quality.
                seq = inFqFile.readline().rstrip('\r\n')

                if line[0] != '@':
                    raise Exception('FASTQ record header does not start with "@"')

                seqN = line[1:].rstrip().split(' ')[0]

                inFqFile.readline()  # '+' separator line
                qual = inFqFile.readline().rstrip('\r\n')

                if 'N' in seq[:trimLen] or 'N' in seq[-trimLen:]:
                    continue

                outFqFile1.write('@%s/1\n%s\n+\n%s\n' %
                                 (seqN, seq[:trimLen], qual[:trimLen]))
                outFqFile2.write('@%s/2\n%s\n+\n%s\n' %
                                 (seqN, mybasic.rc(seq[-trimLen:]), mybasic.rev(qual[-trimLen:])))
    finally:
        # sys.stdin belongs to the caller; only close what was opened here.
        if not fromStdin:
            inFqFile.close()
def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Split each FASTQ read into a pair made of its two ends.

    Args:
        inFqFileName: FASTQ path, or the literal ``"stdin"`` to read
            ``sys.stdin``.
        outFqFilePrefix: output prefix; ``<prefix>.1.fastq`` and
            ``<prefix>.2.fastq`` are created.
        trimLen: number of bases taken from each end of the read.

    For every four-line record the read name is the header text after ``@``
    up to the first space.  Reads with an ``N`` in either end are dropped.
    File 1 gets the first ``trimLen`` bases and qualities as ``<name>/1``;
    file 2 gets ``mybasic.rc`` of the last ``trimLen`` bases and
    ``mybasic.rev`` of the matching qualities as ``<name>/2``.

    Raises:
        Exception: if a record header does not start with ``@``.
    """
    fromStdin = (inFqFileName == "stdin")
    inFqFile = sys.stdin if fromStdin else open(inFqFileName)

    try:
        # ``with`` closes both outputs even when a malformed record raises.
        with open("%s.1.fastq" % outFqFilePrefix, "w") as outFqFile1, \
                open("%s.2.fastq" % outFqFilePrefix, "w") as outFqFile2:

            while True:
                line = inFqFile.readline()
                if not line:
                    break

                # Strip only the line terminator so that a final line without
                # a trailing newline does not lose its last base/quality.
                seq = inFqFile.readline().rstrip("\r\n")

                if line[0] != "@":
                    raise Exception('FASTQ record header does not start with "@"')

                seqN = line[1:].rstrip().split(" ")[0]

                inFqFile.readline()  # '+' separator line
                qual = inFqFile.readline().rstrip("\r\n")

                if "N" in seq[:trimLen] or "N" in seq[-trimLen:]:
                    continue

                outFqFile1.write("@%s/1\n%s\n+\n%s\n" %
                                 (seqN, seq[:trimLen], qual[:trimLen]))
                outFqFile2.write("@%s/2\n%s\n+\n%s\n" %
                                 (seqN, mybasic.rc(seq[-trimLen:]), mybasic.rev(qual[-trimLen:])))
    finally:
        # sys.stdin belongs to the caller; only close what was opened here.
        if not fromStdin:
            inFqFile.close()
#!/usr/bin/python import sys import mybasic motifL = ['TAAT', 'TAATT', 'TAATTG'] motifL_rc = [mybasic.rc(m, 'DNA') for m in motifL] bed = open('/data1/IRCR/PKS/promoter_hg19.bed') fa = open('/data1/IRCR/PKS/promoter_hg19.fa') geneNameL = [x.split('\t')[3] for x in bed] sys.stdout.write('geneName') for i in range(len(motifL)): sys.stdout.write('\tm%sf\tm%sr\tm%st' % (i + 1, i + 1, i + 1)) sys.stdout.write('\n') idx = 0 while True: h = fa.readline()[:-1] s = fa.readline()[:-1].upper() sys.stdout.write('%s' % (geneNameL[idx])) countL = (s.count(motifL[i]), s.count(motifL_rc[i]))
def mutscan_signature(mode='WXS', outFileN=''):
    """Tabulate trinucleotide mutation signatures from mutscan results.

    Args:
        mode: ``'WXS'`` (directories from ``mysetting.wxsMutscanDirL``) or
            ``'RSQ'`` (``mysetting.rsqMutscanDirL``).
        outFileN: path of the table to write; ``''`` writes to stdout.

    Every ``*dbsnp_flt`` file found below the selected directories (paths
    containing ``bak`` are ignored) is scanned.  Single-base substitutions
    reaching ``MIN_MUT_N`` alternate reads and ``MIN_COV`` total reads are
    counted together with their 3-base reference context, fetched with
    ``samtools faidx`` for ``pos-1 .. pos+1`` and cached per position.  When
    the reference base is not C or T, alleles and context are passed through
    ``mybasic.rc``.

    Input columns used (tab separated, 0-based): 0 = chrom, 1 = pos,
    2 = ref, 3 = alt, 4 = n_ref, 5 = n_alt.

    Output columns: sample id, substitution, context, count for that
    (substitution, context), count for that substitution, total.

    Raises:
        KeyError: if ``mode`` is not ``'WXS'`` or ``'RSQ'``.
        AttributeError: if a file name does not match the naming scheme
            expected for ``mode``.
    """
    toStdout = (outFileN == '')
    outFile = sys.stdout if toStdout else open(outFileN, 'w')

    try:
        dirLH = {'WXS': mysetting.wxsMutscanDirL, 'RSQ': mysetting.rsqMutscanDirL}
        contextH = {}  # chrom -> {pos -> reference trinucleotide}

        for dirN in dirLH[mode]:
            # The -name pattern is quoted so that the shell passes it to find
            # verbatim instead of expanding it against the working directory.
            foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
            fileL = [x.rstrip() for x in foundL if 'bak' not in x.rstrip()]

            for fileN in fileL:
                baseN = os.path.basename(fileN)

                if mode == 'RSQ':
                    if baseN == 'S647_splice.mutscan.dbsnp_flt':  ## duplicated files
                        continue
                    if 'splice_Z' in fileN:
                        sid = re.match('(.*)_splice2.mutscan.dbsnp_flt', baseN).group(1)
                    else:
                        sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)
                else:
                    sid = re.match('(.*).mutscan.dbsnp_flt', baseN).group(1)

                sigH = {}  # (substitution, context) -> count
                cntH = {}  # substitution -> count
                total = 0

                # ``with`` releases the handle even if a malformed line raises.
                with open(fileN, 'r') as inFile:
                    for line in inFile:
                        colL = line.rstrip().split('\t')
                        chrom = colL[0]
                        pos = int(colL[1])
                        ref = colL[2]
                        alt = colL[3]

                        # skip unknown reference bases and multi-base alternates
                        if ref == 'N' or len(alt) > 1:
                            continue

                        n_ref = int(colL[4])
                        n_alt = int(colL[5])
                        if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                            continue

                        if chrom in contextH and pos in contextH[chrom]:
                            context = contextH[chrom][pos]
                        else:
                            resL = os.popen('samtools faidx /data1/Sequence/ucsc_hg19/hg19.fasta %s:%s-%s' % (chrom, pos - 1, pos + 1)).readlines()
                            # resL[0] is the first output line; the sequence is on the next
                            context = resL[1].rstrip().upper()
                            contextH.setdefault(chrom, {})[pos] = context

                        if ref not in ('C', 'T'):
                            ref = mybasic.rc(ref)
                            alt = mybasic.rc(alt)
                            context = mybasic.rc(context)

                        ch = ref + '>' + alt
                        cntH[ch] = cntH.get(ch, 0) + 1
                        sigH[(ch, context)] = sigH.get((ch, context), 0) + 1
                        total += 1

                for (ch, context) in sigH:
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                                  (sid, ch, context, sigH[(ch, context)], cntH[ch], total))
    finally:
        outFile.flush()
        # Never close sys.stdout: the rest of the program still needs it.
        if not toStdout:
            outFile.close()
def mutation_signature_ttt(inDirN, outDirN):
    """Re-plot an existing mutation-signature table for one sample.

    Args:
        inDirN: sample directory; only its base name (the sample name) is
            used.
        outDirN: directory that holds ``<sample>.mutation_signature.txt``
            and receives ``<sample>.mutation_signature.pdf``.

    Runs ``mutect_mutation_signature_plot.R`` (located below
    ``mysetting.SRC_HOME``) through ``Rscript`` with the table and PDF paths
    as arguments.  The exit status of the command is not checked.

    NOTE(review): the previous body continued after an unconditional
    ``return()`` with a tabulation loop that could never run and that wrote
    to an ``outFile`` whose creation was commented out.  That unreachable
    code, and the file listings that only fed it, have been removed.

    Returns:
        An empty tuple, as before.
    """
    sampN = os.path.basename(inDirN)

    outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)
    pdfFileN = '%s/%s.mutation_signature.pdf' % (outDirN, sampN)

    os.system('Rscript %s/NGS/mutation/mutect_mutation_signature_plot.R %s %s' %
              (mysetting.SRC_HOME, outName, pdfFileN))

    return ()
def mutation_signature(inDirN, outDirN, outName=''):
    """Tabulate the trinucleotide signature of the filtered mutations.

    Args:
        inDirN: sample directory.  Its base name is the sample name and must
            match ``<id>_<postfix>_<two letters of TKNCS>`` whenever a
            filter VCF is present.
        outDirN: output directory, used when ``outName`` is empty.
        outName: path of the table; defaults to
            ``<outDirN>/<sample>.mutation_signature.txt``.

    The set of accepted mutations is read from every
    ``<inDirN>/*mutect*filter.vcf`` (columns 0, 1, 3, 4 of the non-``#``
    lines).  The context of each accepted mutation is then looked up in
    ``<inDirN>/*mutect`` (or ``*mutect_rerun`` when there is none): the
    first line of those files is skipped, the second is the header, and the
    trinucleotide is ``context[2] + ref_allele + context[4]``.  When the
    reference base is not C or T, the substitution and trinucleotide are
    passed through ``rc`` (a name expected to be provided by the enclosing
    module).

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.  The
    sample id is ``<id>`` when the postfix is ``T`` and ``<id>_<postfix>``
    otherwise.

    Without any filter VCF no mutation is accepted and only the header line
    is written (previously this case raised ``NameError`` as soon as a call
    file existed).
    """
    import glob  # local import: only needed by this function

    def _listFiles(pattern):
        # Shell-free replacement for ``ls <inDirN>/<pattern>``.
        return sorted(glob.glob('%s/%s' % (inDirN, pattern)))

    sampN = os.path.basename(inDirN)

    if outName == '':
        outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)

    # ``with`` makes sure the table is flushed and closed on every path.
    with open(outName, 'w') as outFile:
        outFile.write('samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n')  # header

        mutH = {}   # accepted (chrom, pos, ref, alt)
        sid = None  # set as soon as a filter VCF is seen

        for vcfFileN in _listFiles('*mutect*filter.vcf'):
            (sampId, postfix) = re.search('(.*)_([A-Z0-9]{1,})_[TKNCS]{2}', sampN).groups()
            if postfix != 'T':
                sid = '%s_%s' % (sampId, postfix)
            else:
                sid = sampId

            with open(vcfFileN, 'r') as vcfFile:
                for line in vcfFile:
                    if line[0] == '#':
                        continue
                    colL = line.rstrip().split('\t')
                    mutH[(colL[0], colL[1], colL[3], colL[4])] = 1

        mutFileNL = _listFiles('*mutect')
        if not mutFileNL:
            mutFileNL = _listFiles('*mutect_rerun')

        sigH = {}  # (substitution, trinucleotide) -> count
        cntH = {}  # substitution -> count
        total = 0

        for mutFileN in mutFileNL:
            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # first line is not part of the table
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((name, i) for (i, name) in enumerate(headerL))

                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    pos = colL[idxH['position']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]

                    if (chrom, pos, ref, alt) not in mutH:
                        continue

                    total += 1
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)

                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sigH is only non-empty when a filter VCF was read, so sid is set here.
        for (nt_ch, tri) in sigH:
            outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                          (sid, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
def main(inFileName,geneList=None):
    """Collapse a COSMIC-style mutation export to one line per genomic change.

    Args:
        inFileName: tab-separated export whose first line is the header.
            The columns ``Gene name``, ``Mutation CDS``, ``Mutation AA``,
            ``Mutation Description``, ``Mutation GRCh37 genome position``,
            ``Mutation GRCh37 strand`` and ``Mutation somatic status`` must
            be present.
        geneList: optional collection of gene names; when non-empty only
            those genes are kept.  ``None`` (the default) keeps all genes.

    Rows without a genome position, or whose somatic status does not contain
    ``somatic``, are skipped.  A ``_ENST...`` suffix is removed from the gene
    name.  Reference and alternate bases are parsed from the CDS annotation
    (empty strings when it cannot be parsed) and passed through
    ``mybasic.rc`` for ``-`` strand rows.  Chromosome numbers 23/24/25 are
    written as X/Y/M.

    Rows sharing (chrom, start, end, strand, ref, alt) are merged with
    ``mybasic.pushHash``.  One line per key is written to stdout:
    ``chr<N>, start, end, strand, ref, alt, genes, cds, aa, description``
    (multi-valued fields comma-joined; gene names starting with ``ENSG`` are
    left out).

    Raises:
        ValueError: if a required column is missing from the header.
    """
    wantedGenes = [] if geneList is None else geneList

    nameL = ('Gene name','Mutation CDS','Mutation AA','Mutation Description',
             'Mutation GRCh37 genome position','Mutation GRCh37 strand',
             'Mutation somatic status')

    chromAliasH = {'23': 'X', '24': 'Y', '25': 'M'}

    # compiled once; raw strings keep the backslashes out of Python's own
    # escape processing
    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

    dataH = {}

    with open(inFileName) as inFile:
        # Strip only the line terminator: a blind [:-1] would eat a real
        # character when the last line has no trailing newline.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:
            valueL = line.rstrip('\r\n').split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                geneN = geneN.split('_ENST')[0]

            if len(wantedGenes) > 0 and geneN not in wantedGenes:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            somatic = valueL[idxH['Mutation somatic status']]
            if 'somatic' not in somatic:
                continue

            (chrNum,chrSta,chrEnd) = coordRe.search(coord).groups()

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            rm = cdsRe.match(cds)
            if rm:
                (ref,vtype,alt) = rm.groups()
            else:
                ref,alt = '',''

            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            chrNum = chromAliasH.get(chrNum, chrNum)

            # NOTE: left-normalisation of deletions against the reference
            # genome (via samtools faidx) was disabled in the original code
            # and is intentionally not performed here.

            key = (chrNum,chrSta,chrEnd,strand,ref,alt)

            if key in dataH:
                mybasic.pushHash(dataH[key],'geneN',geneN)
                mybasic.pushHash(dataH[key],'cds',cds)
                mybasic.pushHash(dataH[key],'aa',aa)
                mybasic.pushHash(dataH[key],'desc',desc)
            else:
                dataH[key] = {'geneN':set([geneN]), 'cds':set([cds]), 'aa':set([aa]), 'desc':set([desc])}

    for ((chrNum,chrSta,chrEnd,strand,ref,alt),infoH) in dataH.items():
        geneS = ','.join([x for x in infoH['geneN'] if not x.startswith('ENSG')])
        sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                         (chrNum,chrSta,chrEnd,strand,ref,alt,geneS,
                          ','.join(infoH['cds']),','.join(infoH['aa']),','.join(infoH['desc'])))
def mutation_signature_ttt(inDirN, outDirN):
    """Re-plot an existing mutation-signature table for one sample.

    Args:
        inDirN: sample directory; only its base name (the sample name) is
            used.
        outDirN: directory that holds ``<sample>.mutation_signature.txt``
            and receives ``<sample>.mutation_signature.pdf``.

    Runs ``mutect_mutation_signature_plot.R`` (located below
    ``mysetting.SRC_HOME``) through ``Rscript`` with the table and PDF paths
    as arguments.  The exit status of the command is not checked.

    NOTE(review): the previous body continued after an unconditional
    ``return ()`` with a tabulation loop that could never run and that wrote
    to an ``outFile`` whose creation was commented out.  That unreachable
    code, and the file listings that only fed it, have been removed.

    Returns:
        An empty tuple, as before.
    """
    sampN = os.path.basename(inDirN)

    outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)
    pdfFileN = '%s/%s.mutation_signature.pdf' % (outDirN, sampN)

    os.system('Rscript %s/NGS/mutation/mutect_mutation_signature_plot.R %s %s' %
              (mysetting.SRC_HOME, outName, pdfFileN))

    return ()
def mutation_signature(inDirN, outDirN, outName=''):
    """Tabulate the trinucleotide signature of the filtered mutations.

    Args:
        inDirN: sample directory.  Its base name is the sample name and must
            match ``<id>_<postfix>_<two letters of TKNCS>`` whenever a
            filter VCF is present.
        outDirN: output directory, used when ``outName`` is empty.
        outName: path of the table; defaults to
            ``<outDirN>/<sample>.mutation_signature.txt``.

    The set of accepted mutations is read from every
    ``<inDirN>/*mutect*filter.vcf`` (columns 0, 1, 3, 4 of the non-``#``
    lines).  The context of each accepted mutation is then looked up in
    ``<inDirN>/*mutect`` (or ``*mutect_rerun`` when there is none): the
    first line of those files is skipped, the second is the header, and the
    trinucleotide is ``context[2] + ref_allele + context[4]``.  When the
    reference base is not C or T, the substitution and trinucleotide are
    passed through ``rc`` (a name expected to be provided by the enclosing
    module).

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.  The
    sample id is ``<id>`` when the postfix is ``T`` and ``<id>_<postfix>``
    otherwise.

    Without any filter VCF no mutation is accepted and only the header line
    is written (previously this case raised ``NameError`` as soon as a call
    file existed).
    """
    import glob  # local import: only needed by this function

    def _listFiles(pattern):
        # Shell-free replacement for ``ls <inDirN>/<pattern>``.
        return sorted(glob.glob('%s/%s' % (inDirN, pattern)))

    sampN = os.path.basename(inDirN)

    if outName == '':
        outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)

    # ``with`` makes sure the table is flushed and closed on every path.
    with open(outName, 'w') as outFile:
        outFile.write('samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n')  # header

        mutH = {}   # accepted (chrom, pos, ref, alt)
        sid = None  # set as soon as a filter VCF is seen

        for vcfFileN in _listFiles('*mutect*filter.vcf'):
            (sampId, postfix) = re.search('(.*)_([A-Z0-9]{1,})_[TKNCS]{2}', sampN).groups()
            if postfix != 'T':
                sid = '%s_%s' % (sampId, postfix)
            else:
                sid = sampId

            with open(vcfFileN, 'r') as vcfFile:
                for line in vcfFile:
                    if line[0] == '#':
                        continue
                    colL = line.rstrip().split('\t')
                    mutH[(colL[0], colL[1], colL[3], colL[4])] = 1

        mutFileNL = _listFiles('*mutect')
        if not mutFileNL:
            mutFileNL = _listFiles('*mutect_rerun')

        sigH = {}  # (substitution, trinucleotide) -> count
        cntH = {}  # substitution -> count
        total = 0

        for mutFileN in mutFileNL:
            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # first line is not part of the table
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((name, i) for (i, name) in enumerate(headerL))

                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    pos = colL[idxH['position']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]

                    if (chrom, pos, ref, alt) not in mutH:
                        continue

                    total += 1
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)

                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sigH is only non-empty when a filter VCF was read, so sid is set here.
        for (nt_ch, tri) in sigH:
            outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                          (sid, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
def make_samse(ifileN, ofileN):
    """Print single-end GSNAP alignments as SAM-style records on stdout.

    NOTE(review): ``ifileN`` and ``ofileN`` are not used by the active code;
    the input path is hard-coded below and all output goes to stdout.
    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; verify it against the original layout.

    Header lines come from ``make_header`` (defined elsewhere).  For every
    read one tab-separated record of twelve fields is printed:
    qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual,
    extra.

    Three cases are handled:
      * ``nLoci > 1``: flag bit 0x4 is set and the CIGAR is ``*``;
      * pair relation ``(transloc)``: one record per segment, using
        ``seg.toCIGAR_trans()``;
      * otherwise: one record for the first match.  Two CIGAR strings are
        assembled by hand (``cigar`` and ``new_cigar``) but the record is
        printed with ``cigar2`` from ``match.toCIGAR()``.
    """
    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    # ofile = open(ofileN, 'w')
    for header in headerL:
        print header
        # ofile.write('%s\n' % header)
    result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)
    ## for unpaired
    for r in result:
        # per-read defaults for the SAM fields
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)
        if r.nLoci > 1:
            # more than one locus: emitted with flag bit 0x4 and no
            # position, CIGAR or extra tags
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext, tlen, seq, qual, extra))
        else:
            if r.pairRel == '(transloc)':
                # translocation: one record per segment
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()
                for seg in segL:
                    flag = 0x0
                    (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    # leftmost coordinate of the segment
                    pos = min(int(pos1), int(pos2))
                    (cigar, clip) = seg.toCIGAR_trans()
                    # clip selects the part of the read that is reported:
                    # negative -> seq[:clip], otherwise -> seq[clip:]
                    if clip < 0:  ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else:  ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]
                    if strand == '-':
                        # minus strand: flag bit 0x10, sequence through
                        # mybasic.rc and qualities through mybasic.rev
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)
                    # print qname,seg.toCIGAR_trans()
                    print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[0]  ## assume --npath=1 (maximum 1 alignment per read)
                segL = match.getSegInfo()
                (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub
                # CIGAR as computed by the match object; this is the value
                # that is finally printed
                cigar2 = match.toCIGAR()
                # print qname, match.toCIGAR()
                # --- hand-built CIGAR (``cigar``): first segment ---
                # leading 'S' operation when segL[0].start is set
                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'
                # on the minus strand operations are prepended, otherwise
                # appended
                if strand == '-':
                    cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'
                # --- segment-object CIGAR (``new_cigar``): first segment ---
                # toCIGAR(True) is used when this is also the last segment
                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()
                prev_cigar = new_cigar
                index = 0
                for seg in segL[1:]:
                    index = index + 1
                    # final is True only for the last segment
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False
                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    # NOTE: ``match`` is rebound here from the match object
                    # to the 'M' operation string of this segment
                    match = str(seg.numMatch + seg.numSub) + 'M'
                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''
                    # keep the smallest coordinate seen as the record position
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))
                    # gap between the previous segment's second coordinate
                    # and this segment's first coordinate becomes an 'N'
                    # operation when positive
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins
                    seg_nm = seg_nm + seg.numSub
                    pos1 = rm[2]
                    pos2 = rm[3]
                    cur_cigar = seg.toCIGAR(final)
                    # the 'N' gap is only inserted into new_cigar when the
                    # previous segment's CIGAR contains neither 'D' nor 'I'
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar
                    prev_cigar = cur_cigar
                # trailing 'S' operation when the last segment's end is set
                if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'
                # NM tag: sum of numSub over all segments
                extra = extra + ('\tNM:i:%s' % seg_nm)
                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)
                ## print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                # NOTE(review): cigar and new_cigar are computed above but
                # only cigar2 is printed -- confirm this is intended
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar2, rnext, pnext, tlen, seq, qual, extra))
def main(inFileName, geneList=None):
    """Collapse a tab-separated mutation table into unique genomic variants.

    Reads the file named by inFileName (header row first), keeps rows that
    have a GRCh37 coordinate and whose somatic-status column contains
    'somatic', and groups them by (chromosome, start, end, strand, ref, alt).
    One line per group is written to stdout:

        chr<N>  start  end  strand  ref  alt  genes  cds  aa  descriptions

    The last four columns are comma-joined sets; gene names starting with
    'ENSG' are left out of the gene column.

    NOTE(review): the column names look like a COSMIC export, but nothing in
    this function confirms the data source -- verify against the caller.

    Args:
        inFileName: path of the tab-separated input table.
        geneList: optional collection of gene names.  When non-empty, only
            rows whose gene name (with any '_ENST...' suffix removed) is in
            it are kept.  None or empty disables gene filtering.

    Raises:
        ValueError: if a required column is missing from the header row
            (raised by list.index).
    """
    nameL = ('Gene name', 'Mutation CDS', 'Mutation AA', 'Mutation Description',
             'Mutation GRCh37 genome position', 'Mutation GRCh37 strand',
             'Mutation somatic status')
    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')
    # Numeric chromosome codes in the input that are reported as letters.
    chromAliasH = {'23': 'X', '24': 'Y', '25': 'M'}

    dataH = {}
    with open(inFileName) as inFile:
        # Strip only the line terminator: slicing off the last character
        # would eat real data when the final line has no trailing newline.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:
            line = line.rstrip('\r\n')
            if not line:
                continue
            valueL = line.split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                # Drop the transcript suffix so all transcripts share a gene.
                geneN = geneN.split('_ENST')[0]
            if geneList and geneN not in geneList:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue
            if 'somatic' not in valueL[idxH['Mutation somatic status']]:
                continue

            coordM = coordRe.search(coord)
            if coordM is None:
                # Previously an AttributeError aborted the whole run here.
                sys.stderr.write('skipping record with unparsable coordinate: %r\n' % coord)
                continue
            (chrNum, chrSta, chrEnd) = coordM.groups()
            chrNum = chromAliasH.get(chrNum, chrNum)

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            # ref/alt are taken from the CDS notation; anything that does not
            # look like a substitution/insertion/deletion gets empty alleles.
            cdsM = cdsRe.match(cds)
            if cdsM:
                ref, alt = cdsM.group(1), cdsM.group(3)
            else:
                ref, alt = '', ''
            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            # NOTE: an earlier, disabled draft re-anchored deletions against
            # the hg19 reference via `samtools faidx`; that step is not done.

            key = (chrNum, chrSta, chrEnd, strand, ref, alt)
            if key in dataH:
                mybasic.pushHash(dataH[key], 'geneN', geneN)
                mybasic.pushHash(dataH[key], 'cds', cds)
                mybasic.pushHash(dataH[key], 'aa', aa)
                mybasic.pushHash(dataH[key], 'desc', desc)
            else:
                dataH[key] = {
                    'geneN': set([geneN]),
                    'cds': set([cds]),
                    'aa': set([aa]),
                    'desc': set([desc]),
                }

    # dict.items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for (chrNum, chrSta, chrEnd, strand, ref, alt), infoH in dataH.items():
        geneNL = [g for g in infoH['geneN'] if not g.startswith('ENSG')]
        sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            chrNum, chrSta, chrEnd, strand, ref, alt,
            ','.join(geneNL), ','.join(infoH['cds']),
            ','.join(infoH['aa']), ','.join(infoH['desc'])))
#!/usr/bin/python import sys import mybasic motifL = ["TAAT", "TAATT", "TAATTG"] motifL_rc = [mybasic.rc(m, "DNA") for m in motifL] bed = open("/data1/IRCR/PKS/promoter_hg19.bed") fa = open("/data1/IRCR/PKS/promoter_hg19.fa") geneNameL = [x.split("\t")[3] for x in bed] sys.stdout.write("geneName") for i in range(len(motifL)): sys.stdout.write("\tm%sf\tm%sr\tm%st" % (i + 1, i + 1, i + 1)) sys.stdout.write("\n") idx = 0 while True: h = fa.readline()[:-1] s = fa.readline()[:-1].upper() sys.stdout.write("%s" % (geneNameL[idx])) countL = (s.count(motifL[i]), s.count(motifL_rc[i]))
def mutation_signature(inDirN, outDirN, outName=""):
    """Write a trinucleotide mutation-signature table for one sample directory.

    Two kinds of input are looked up inside inDirN:

    * ``*mutect*filter.vcf`` -- filtered calls; every non-header record
      contributes a (chrom, pos, ref, alt) key to the set of kept mutations.
    * ``*mutect`` (or, if none exist, ``*mutect_rerun``) -- raw call tables
      whose second line is a column header containing at least ``contig``,
      ``position``, ``context``, ``ref_allele`` and ``alt_allele``.

    Raw calls that are also present in the filtered VCF are counted per
    (base change, trinucleotide context).  Changes whose reference base is
    not C or T are converted with ``rc`` (a helper defined elsewhere in this
    file -- assumed to reverse-complement a sequence).

    The output has a header line followed by one line per signature:
    ``samp_id, mutation, context, freq, n_mut, n_total`` (tab-separated),
    sorted by (mutation, context).  With no filtered VCF or no raw calls
    only the header is written.

    Args:
        inDirN: sample directory; its basename is the sample name.
        outDirN: directory for the default output file.
        outName: explicit output path; when empty the file is
            ``<outDirN>/<sample>.mutation_signature.txt``.

    Raises:
        ValueError: if a filtered VCF exists but the sample name does not
            match the ``<id>_<postfix>_<2 letters of TKNCS>`` pattern.
    """
    import glob  # local import: only this function needs it

    sampN = os.path.basename(inDirN)
    if outName == "":
        outName = "%s/%s.mutation_signature.txt" % (outDirN, sampN)

    with open(outName, "w") as outFile:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")  # header

        # --- mutations that survived filtering ------------------------------
        # Always defined, so a directory without a filtered VCF no longer
        # triggers a NameError further down.
        passedS = set()
        sid = None
        vcfFileNL = sorted(glob.glob("%s/*mutect*filter.vcf" % inDirN))
        if vcfFileNL:
            nameM = re.search("(.*)_([A-Z0-9]{1,})_[TKNCS]{2}", sampN)
            if nameM is None:
                raise ValueError("cannot derive sample id from directory name %r" % sampN)
            (prefix, postfix) = nameM.groups()
            # A plain 'T' postfix is dropped from the reported sample id.
            sid = prefix if postfix == "T" else "%s_%s" % (prefix, postfix)

            for vcfFileN in vcfFileNL:
                with open(vcfFileN, "r") as vcfFile:
                    for line in vcfFile:
                        if line.startswith("#") or not line.strip():
                            continue
                        colL = line.rstrip().split("\t")
                        passedS.add((colL[0], colL[1], colL[3], colL[4]))

        # --- raw calls, restricted to the filtered set ------------------------
        rawFileNL = sorted(glob.glob("%s/*mutect" % inDirN))
        if not rawFileNL:
            rawFileNL = sorted(glob.glob("%s/*mutect_rerun" % inDirN))

        sigH = {}   # (change, trinucleotide) -> count
        cntH = {}   # change -> count
        total = 0
        for rawFileN in rawFileNL:
            with open(rawFileN, "r") as rawFile:
                rawFile.readline()  # first line precedes the column header
                headerL = rawFile.readline().rstrip().split("\t")
                idxH = dict((colN, i) for (i, colN) in enumerate(headerL))
                for line in rawFile:
                    colL = line.rstrip().split("\t")
                    chrom = colL[idxH["contig"]]
                    pos = colL[idxH["position"]]
                    context = colL[idxH["context"]]
                    ref = colL[idxH["ref_allele"]]
                    alt = colL[idxH["alt_allele"]]
                    if (chrom, pos, ref, alt) not in passedS:
                        continue

                    total += 1
                    # Flanking bases sit at context[2] and context[4]; the
                    # reference base replaces the middle placeholder.
                    tri = context[2] + ref + context[4]
                    if ref in ("C", "T"):
                        nt_ch = ref + ">" + alt
                    else:
                        nt_ch = rc(ref) + ">" + rc(alt)
                        tri = rc(tri)
                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sid is always set here when sigH is non-empty, because counting
        # requires at least one record from a filtered VCF.
        for (nt_ch, tri) in sorted(sigH):
            outFile.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                          % (sid, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
def make_samse(ifileN, ofileN):
    """Print single-end GSNAP alignments to stdout as SAM-style records.

    Emits the header lines returned by make_header() and then, per read from
    mygsnap.gsnapFile(), one tab-separated 12-column line
    (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual,
    extra); translocation reads produce one line per segment.

    NOTE(review): as written, ifileN and ofileN are never used -- the GSNAP
    input path and the chrom.sizes path are hard-coded and all output goes
    to stdout via print.  This is Python 2 code (print statement).

    The code assumes GSNAP was run with --npath=1 (at most one alignment per
    read), per the original author's comments.
    """
    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    # ofile = open(ofileN, 'w')
    for header in headerL:
        print header
        # ofile.write('%s\n' % header)
    result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_single.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)
    ## for unpaired
    for r in result:
        # Per-read defaults; overwritten below when the read is aligned.
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*' ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0 ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0 ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1' ## assume --npath=1 (maximum 1 alignment per read)
        if r.nLoci > 1:
            # Reads reported at more than one locus are written with flag
            # 0x4 (the SAM "unmapped" bit), '*' CIGAR and no optional tags.
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext, tlen, seq, qual, extra))
        else:
            if r.pairRel == '(transloc)':
                # Translocation: each segment becomes its own record with
                # the matching part of the read sequence and qualities.
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()
                for seg in segL:
                    flag = 0x0
                    # seg.seg[2] has the form <strand><chrom>:<pos1>..<pos2>
                    (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    pos = min(int(pos1), int(pos2))
                    (cigar,clip) = seg.toCIGAR_trans()
                    # The sign of `clip` selects which part of the read
                    # belongs to this segment.
                    if clip < 0: ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else: ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]
                    if strand == '-':
                        # 0x10 = SAM reverse-strand bit; sequence and
                        # qualities are flipped via mybasic.rc / mybasic.rev.
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)
                    # print qname,seg.toCIGAR_trans()
                    print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[0] ## assume --npath=1 (maximum 1 alignment per read)
                segL = match.getSegInfo()
                (strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub
                # cigar2 is the CIGAR that is actually printed at the end.
                # The hand-built `cigar` and `new_cigar` below appear only
                # in the commented-out print.
                cigar2 = match.toCIGAR()
                # print qname, match.toCIGAR()

                # --- hand-built CIGAR for the first segment ---------------
                # Start with the soft clip, then add match and insertion
                # (prepended for '-' strand, appended for '+').
                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'
                if strand == '-':
                    cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'
                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()
                prev_cigar = new_cigar

                # --- remaining segments -----------------------------------
                index = 0
                for seg in segL[1:]:
                    index = index + 1
                    # `final` marks the last segment for seg.toCIGAR().
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False
                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
                    # NOTE: `match` is rebound to a string here, shadowing
                    # the match object used above.
                    match = str(seg.numMatch + seg.numSub) + 'M'
                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''
                    # Keep the smallest coordinate seen as the record position.
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))
                    # `dist` is the gap between the previous segment's end
                    # (pos2) and this segment's start; a positive gap is
                    # written as an 'N' operation.
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins
                    seg_nm = seg_nm + seg.numSub
                    pos1 = rm[2]
                    pos2 = rm[3]
                    cur_cigar = seg.toCIGAR(final)
                    # The 'N' gap is skipped when the previous segment's
                    # CIGAR contains a 'D' or an 'I'.
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar
                    prev_cigar = cur_cigar
                if segL[-1].end != '' and segL[-1].end != '0': ## last segment
                    # Trailing soft clip from the last segment.
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'
                # NM tag = substitutions summed over all segments.
                extra = extra + ('\tNM:i:%s' % seg_nm)
                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)
                ## print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar2, rnext, pnext, tlen, seq, qual, extra))
def mutscan_signature(mode='WXS', outFileN=''):
    """Tabulate per-sample trinucleotide mutation signatures from mutscan files.

    Every ``*dbsnp_flt`` file found (via ``find``) under the directories
    configured for `mode` is treated as one sample.  Each input line is
    tab-separated: chrom, pos, ref, alt, n_ref, n_alt.  Lines with ref 'N'
    or an alt longer than one base are skipped; the rest are counted when
    n_alt >= MIN_MUT_N and n_ref + n_alt >= MIN_COV (module-level constants
    defined elsewhere in this file).

    For each counted site the 3-base reference context (pos-1..pos+1) is
    fetched with ``samtools faidx`` and cached per (chrom, pos).  Changes
    whose reference base is not C or T are converted with mybasic.rc, as is
    their context.

    One tab-separated line is written per (sample, change, context), with no
    header row: sample id, change (e.g. ``C>T``), context, count for this
    (change, context), count for this change, total counted sites in the
    sample.

    Args:
        mode: 'WXS' or 'RSQ'; selects mysetting.wxsMutscanDirL or
            mysetting.rsqMutscanDirL and the file-name pattern used to
            derive the sample id.
        outFileN: output path; when empty, output goes to sys.stdout.

    Raises:
        KeyError: if mode is neither 'WXS' nor 'RSQ'.
        IndexError: if samtools returns no sequence line for a site.
    """
    toStdout = (outFileN == '')
    if toStdout:
        outFile = sys.stdout
    else:
        outFile = open(outFileN, 'w')

    try:
        dirLH = {'WXS': mysetting.wxsMutscanDirL, 'RSQ': mysetting.rsqMutscanDirL}
        contextH = {}  # chrom -> {pos: forward-strand 3-base context}

        for dirN in dirLH[mode]:
            # The -name pattern is quoted so the shell cannot expand it
            # against files in the current working directory.
            foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
            fileNL = [x for x in (l.rstrip() for l in foundL) if 'bak' not in x]

            for fileN in fileNL:
                baseN = os.path.basename(fileN)
                if mode == 'RSQ':
                    if baseN == 'S647_splice.mutscan.dbsnp_flt':  ## duplicated files
                        continue
                    if 'splice_Z' in fileN:
                        sid = re.match('(.*)_splice2.mutscan.dbsnp_flt', baseN).group(1)
                    else:
                        sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)
                else:
                    sid = re.match('(.*).mutscan.dbsnp_flt', baseN).group(1)

                sigH = {}   # (change, context) -> count
                cntH = {}   # change -> count
                total = 0
                with open(fileN, 'r') as inFile:
                    for line in inFile:
                        colL = line.rstrip().split('\t')
                        chrom = colL[0]
                        pos = int(colL[1])
                        ref = colL[2]
                        alt = colL[3]
                        if ref == 'N' or len(alt) > 1:
                            continue
                        n_ref = int(colL[4])
                        n_alt = int(colL[5])
                        if not (n_alt >= MIN_MUT_N and (n_alt + n_ref) >= MIN_COV):
                            continue

                        chromCtxH = contextH.setdefault(chrom, {})
                        if pos in chromCtxH:
                            context = chromCtxH[pos]
                        else:
                            resL = os.popen(
                                'samtools faidx /data1/Sequence/ucsc_hg19/hg19.fasta %s:%s-%s'
                                % (chrom, pos - 1, pos + 1)).readlines()
                            # The second output line is used as the sequence
                            # (the first is presumably the FASTA header).
                            context = resL[1].rstrip().upper()
                            chromCtxH[pos] = context

                        if ref not in ['C', 'T']:
                            ref = mybasic.rc(ref)
                            alt = mybasic.rc(alt)
                            context = mybasic.rc(context)

                        ch = ref + '>' + alt
                        cntH[ch] = cntH.get(ch, 0) + 1
                        sigH[(ch, context)] = sigH.get((ch, context), 0) + 1
                        total += 1

                for (ch, context) in sigH:
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                                  % (sid, ch, context, sigH[(ch, context)], cntH[ch], total))
    finally:
        outFile.flush()
        # Only close what this function opened; closing sys.stdout would
        # break every later write by the caller.
        if not toStdout:
            outFile.close()