Example #1
0
def mutscan_signature_rsq():
    """Print base-substitution counts for every RNA-seq mutscan result file.

    For each directory in ``mysetting.rsqMutscanDirL`` the ``*dbsnp_flt``
    files are located with ``find``; paths containing ``bak`` are skipped.
    The sample id is taken from the file's base name.  Every row is split on
    tabs and columns 2-5 (0-based) are read as ref, alt, n_ref and n_alt.
    Rows whose ref is ``N`` or whose alt is longer than one character are
    ignored.  Rows with ``n_alt >= MIN_MUT_N`` and
    ``n_alt + n_ref >= MIN_COV`` are tallied per (ref, alt); when ref is not
    C or T both alleles are passed through ``mybasic.rc`` first.

    One line per observed substitution is written to stdout:
    ``sid<TAB>ref>alt<TAB>count<TAB>total``.

    Raises:
        AttributeError: if a file name does not match the expected pattern.
    """
    for dirN in mysetting.rsqMutscanDirL:
        # Quote the pattern so the shell passes it to find unexpanded.
        foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
        fileNL = [p.rstrip() for p in foundL]
        for fileN in fileNL:
            if 'bak' in fileN:
                continue
            baseN = os.path.basename(fileN)
            if 'splice_Z' in fileN:
                sid = re.match('(.*)_splice[2]*.mutscan.dbsnp_flt', baseN).group(1)
            else:
                sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)

            cntH = {}
            total = 0
            # The context manager closes the handle even if a row is malformed.
            with open(fileN, 'r') as inFile:
                for line in inFile:
                    colL = line.rstrip().split('\t')
                    ref = colL[2]
                    alt = colL[3]
                    if ref == 'N' or len(alt) > 1:
                        continue
                    n_ref = int(colL[4])
                    n_alt = int(colL[5])
                    if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                        continue
                    if ref not in ('C', 'T'):
                        ref = mybasic.rc(ref)
                        alt = mybasic.rc(alt)
                    cntH[(ref, alt)] = cntH.get((ref, alt), 0) + 1
                    total += 1

            for (r, a) in cntH:
                sys.stdout.write('%s\t%s>%s\t%s\t%s\n' % (sid, r, a, cntH[(r, a)], total))
Example #2
0
def mutation_signaturei_t(inDir, outName=''):
    """Tabulate tri-nucleotide mutation signatures for the MuTect files in a directory.

    Every ``<inDir>/*.mutect`` file whose path does not contain ``union_pos``
    is read.  The first line is skipped, the second line is the
    tab-separated header, and the columns ``contig``, ``context``,
    ``ref_allele``, ``alt_allele`` and ``judgement`` are used.  Rows judged
    ``REJECT`` or located on ``chrM``/``chrMT`` are ignored.  When the
    reference base is not C or T, the change and its context are passed
    through ``rc`` (a helper defined outside this block).

    Args:
        inDir: directory holding the ``.mutect`` files.
        outName: output path; the empty string means standard output.

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.
    A ``<sample> <path>`` progress line is written to stdout per input file.
    """
    toStdout = (outName == '')
    outFile = sys.stdout if toStdout else open(outName, 'w')
    try:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")
        lsOutL = os.popen('ls %s/*.mutect | grep -v union_pos' % inDir).readlines()
        for mutFileN in [x.rstrip() for x in lsOutL]:
            sampN = mutFileN.split('/')[-1].split('.')[0].split('_')[0]
            sys.stdout.write('%s %s\n' % (sampN, mutFileN))

            sigH = {}
            cntH = {}
            total = 0
            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # leading comment line
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((colN, i) for (i, colN) in enumerate(headerL))
                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]
                    status = colL[idxH['judgement']]
                    if status == 'REJECT' or chrom in ('chrMT', 'chrM'):
                        continue

                    total += 1
                    # Flanking bases sit at indexes 2 and 4 of the context string.
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)
                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

            for ((nt_ch, tri), freq) in sigH.items():
                outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                              (sampN, nt_ch, tri, freq, cntH[nt_ch], total))
        outFile.flush()
    finally:
        # Never close the interpreter's stdout; only close what we opened.
        if not toStdout:
            outFile.close()
Example #3
0
def mutation_signaturei_t(inDir, outName=''):
    """Write a per-sample tri-nucleotide mutation table from MuTect call files.

    Files are found with ``ls <inDir>/*.mutect | grep -v union_pos``.  In each
    file the first line is discarded and the second line supplies the column
    names; ``contig``, ``context``, ``ref_allele``, ``alt_allele`` and
    ``judgement`` are read from every following row.  ``REJECT`` rows and
    rows on ``chrM``/``chrMT`` are dropped.  Changes whose reference base is
    not C or T are converted with ``rc`` (defined elsewhere in the module).

    Args:
        inDir: directory to scan.
        outName: file to write; ``''`` selects standard output.

    Each output row is: samp_id, mutation, context, freq, n_mut, n_total.
    One ``<sample> <path>`` progress line per input file goes to stdout.
    """
    writeToStdout = (outName == '')
    if writeToStdout:
        outFile = sys.stdout
    else:
        outFile = open(outName, 'w')

    try:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")
        listing = os.popen('ls %s/*.mutect | grep -v union_pos' % inDir).readlines()
        mutFileNL = [entry.rstrip() for entry in listing]

        for mutFileN in mutFileNL:
            sampN = mutFileN.split('/')[-1].split('.')[0].split('_')[0]
            sys.stdout.write('%s %s\n' % (sampN, mutFileN))

            sigH = {}   # (change, tri-nucleotide) -> count
            cntH = {}   # change -> count
            total = 0

            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = {}
                for (i, colN) in enumerate(headerL):
                    idxH[colN] = i

                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    if colL[idxH['judgement']] == 'REJECT':
                        continue
                    if colL[idxH['contig']] in ('chrMT', 'chrM'):
                        continue

                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]

                    total += 1
                    tri = context[2] + ref + context[4]
                    if ref in ('C', 'T'):
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)

                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

            for key in sigH:
                (change, tri) = key
                outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (sampN, change, tri, sigH[key], cntH[change], total))

        outFile.flush()
    finally:
        # Closing sys.stdout would break every later write by the caller.
        if not writeToStdout:
            outFile.close()
Example #4
0
def process_bp(inFileName,outFileName,regionL):

	result = mygsnap.gsnapFile(inFileName,True)
	outFile = open(outFileName, 'w')

	outFile.write('browser full knownGene\n')
	outFile.write('track name="%s" visibility=2\n' % inFileName)

	for rL in result:

		if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='concordant'):
			raise Exception

		locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]), mygenome.locus(rL[1].matchL()[0].segL[0][2])]

		flag = False

		for loc in locL:
			for region in regionL:
				if loc.overlap(region) > 0:
					flag = True
			
		if flag:

			print '^%s.*%s$\n' % (rL[0].seq(),mybasic.rc(rL[1].seq())),

			for loc in locL:
				outFile.write('%s\t%s\t%s\n' % (loc.chrom,loc.chrSta,loc.chrEnd))
def process_bp(inFileName,outFileName,coordH,regionL):

	result = mygsnap.gsnapFile(inFileName,True)
	outFile = open(outFileName, 'w')

	outFile.write('browser full knownGene\n')
	outFile.write('track name="%s" visibility=2\n' % inFileName)

	for rL in result:

		if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='unpaired'):
			raise Exception

		locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]), mygenome.locus(rL[1].matchL()[0].segL[0][2])]

		for loc in locL:

			loc.chrSta += coordH[loc.chrom][1] -1
			loc.chrEnd += coordH[loc.chrom][1] -1
			loc.chrom = coordH[loc.chrom][0]

		flag = False

		for loc in locL:
			for region in regionL:
				if loc.overlap(region) > 0:
					flag = True
			
		if flag:

			print '^%s.*%s$\n' % (rL[0].seq(),mybasic.rc(rL[1].seq())),

			for loc in locL:
				outFile.write('%s\t%s\t%s\n' % (loc.chrom,loc.chrSta,loc.chrEnd))
def process_bp(inGsnapFileName):

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    #outBpFile = open(outBpFileName, 'w')

    seqH = {}

    for r in result:

        match = r.matchL()[0]

        if not '(transloc)' in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

        #		if bp1[0] == bp2[0]:
        #			continue

        if direction == 'sense':
            seq = r.seq()
            offset = int(match.segL[0][1].split('..')[1])
            bp12 = (bp1, bp2)
        else:
            seq = mybasic.rc(r.seq(), 'DNA')
            offset = len(seq) - int(match.segL[0][1].split('..')[1])
            bp12 = (bp2, bp1)

        mybasic.addHash(seqH, bp12, (offset, seq))

    seqL = seqH.items()
    seqL.sort(lambda x, y: cmp(len(y[1]), len(x[1])))

    for ((bp1, bp2), vL) in seqL:

        vL.sort(lambda x, y: cmp(y[0], x[0]))

        maxOffset = vL[0][0]

        print '\n', bp1, bp2, len(vL), '\n'

        for (offset, seq) in vL:

            print '%s%s %s' % (' ' * (maxOffset - offset), seq[:offset],
                               seq[offset:])
def process_bp(inGsnapFileName, outBpFileName):
    """Write one line per junction listing the reads that span it.

    Every record of the gsnap file must have ``'(transloc)'`` in ``pairRel``,
    exactly two segments in its first match, and both segments on the same
    strand (first character of the segment locus).  Reads are grouped by
    junction; minus-strand reads are reverse-complemented and the junction
    pair is swapped.  Each output line is
    ``name:(pos-blockSize)-pos,name:pos-(pos+blockSize),off:seq|off:seq...``
    where ``off`` is ``blockSize - offset + 1``.  ``blockSize`` is a
    module-level value defined outside this block.

    Args:
        inGsnapFileName: gsnap output to read.
        outBpFileName: file to write.

    Raises:
        Exception: if a record violates one of the requirements above.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    outBpFile = open(outBpFileName, 'w')

    # try/finally guarantees the output is flushed and closed on error.
    try:
        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation: %s' % r.pairRel)

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():

            vL.sort(key=lambda t: t[0])

            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]

            # NOTE(review): both blocks are derived from j1; j2 is unused.
            # Confirm this is intended before changing it.
            name = j1[0].split('_')[0]
            pos = j1[1]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (name, int(pos) - blockSize, pos,
                             name, pos, int(pos) + blockSize,
                             '|'.join(vL_mod)))
    finally:
        outBpFile.close()
Example #8
0
def mutect_weblogo_sub(sampN, inFileN, outFileN, pdfFileN):
    """Extract C>T mutation contexts from a MuTect table and draw a sequence logo.

    The first line of ``inFileN`` is skipped and the second is the
    tab-separated header.  For every row not judged ``REJECT``, a 7-mer is
    built from the first three characters of ``context``, the reference
    base, and the last three characters of ``context``.  When the reference
    base is not C or T, the 7-mer and both alleles are passed through
    ``mybasic.rc``.  Only 7-mers of C>T changes are written, one per line,
    to ``outFileN``; that file is then read back with ``weblogolib`` and
    rendered to ``pdfFileN``.

    Args:
        sampN: logo title.
        inFileN: MuTect call-stats file.
        outFileN: intermediate file of 7-mers (overwritten).
        pdfFileN: output logo file (overwritten).
    """
    with open(inFileN, 'r') as inFile:
        inFile.readline()  # comment line
        headerL = inFile.readline().rstrip().split('\t')
        idxH = dict((colN, i) for (i, colN) in enumerate(headerL))

        with open(outFileN, 'w') as outFile:
            for line in inFile:
                colL = line.rstrip().split('\t')
                context = colL[idxH['context']]
                ref = colL[idxH['ref_allele']]
                alt = colL[idxH['alt_allele']]
                status = colL[idxH['judgement']]
                if status == 'REJECT':
                    continue

                context = context[:3] + ref + context[-3:]
                if ref not in ('C', 'T'):
                    context = mybasic.rc(context)
                    ref = mybasic.rc(ref)
                    alt = mybasic.rc(alt)

                if ref == 'C' and alt == 'T':  # TMZ context only
                    outFile.write('%s\n' % context)

    with open(outFileN, 'r') as fin:
        seqs = weblogolib.read_seq_data(fin)
        data = weblogolib.LogoData.from_seqs(seqs)

    options = weblogolib.LogoOptions()
    options.show_fineprint = False
    options.first_index = -3  # position 0 is the mutated base
    options.logo_title = sampN
    logoFormat = weblogolib.LogoFormat(data, options)

    # Close the PDF explicitly so its contents are flushed to disk.
    with open(pdfFileN, 'w') as fout:
        weblogolib.pdf_formatter(data, logoFormat, fout)
Example #9
0
def mutect_weblogo_sub(sampN, inFileN, outFileN, pdfFileN):
    """Build a sequence logo of the 7-mer contexts around C>T mutations.

    ``inFileN`` is a MuTect table whose first line is skipped and whose
    second line is the header.  Rows judged ``REJECT`` are ignored.  For the
    remaining rows the 7-mer is ``context[:3] + ref_allele + context[-3:]``;
    if the reference base is not C or T, the 7-mer and alleles are converted
    with ``mybasic.rc``.  7-mers of C>T changes are written to ``outFileN``
    (one per line), which is then loaded by ``weblogolib`` and rendered into
    ``pdfFileN`` with ``sampN`` as the title.

    Args:
        sampN: logo title.
        inFileN: MuTect call-stats file.
        outFileN: intermediate file of 7-mers (overwritten).
        pdfFileN: output logo file (overwritten).
    """
    with open(inFileN, 'r') as inFile:
        inFile.readline()  # comment line
        headerL = inFile.readline().rstrip().split('\t')
        idxH = {}
        for (i, colN) in enumerate(headerL):
            idxH[colN] = i

        with open(outFileN, 'w') as outFile:
            for line in inFile:
                colL = line.rstrip().split('\t')
                if colL[idxH['judgement']] == 'REJECT':
                    continue

                rawContext = colL[idxH['context']]
                ref = colL[idxH['ref_allele']]
                alt = colL[idxH['alt_allele']]

                kmer = rawContext[:3] + ref + rawContext[-3:]
                if ref not in ('C', 'T'):
                    kmer = mybasic.rc(kmer)
                    ref = mybasic.rc(ref)
                    alt = mybasic.rc(alt)

                if ref == 'C' and alt == 'T':  # TMZ context only
                    outFile.write('%s\n' % kmer)

    with open(outFileN, 'r') as fin:
        seqs = weblogolib.read_seq_data(fin)
        data = weblogolib.LogoData.from_seqs(seqs)

    options = weblogolib.LogoOptions()
    options.show_fineprint = False
    options.first_index = -3  # position 0 is the mutated base
    options.logo_title = sampN
    logoFormat = weblogolib.LogoFormat(data, options)

    # Close the PDF explicitly so its contents are flushed to disk.
    with open(pdfFileN, 'w') as fout:
        weblogolib.pdf_formatter(data, logoFormat, fout)
Example #10
0
def process_bp(inGsnapFileName,outBpFileName):
    """Write one line per junction with the reads that span it.

    Every record of the gsnap file must have ``'(transloc)'`` in ``pairRel``,
    exactly two segments in its first match, and both segments on the same
    strand (first character of the segment locus).  Reads are grouped by
    junction; minus-strand reads are reverse-complemented and the junction
    pair is swapped.  Each output line is
    ``name:pos,name:pos,offset:seq|offset:seq...`` with reads ordered by
    decreasing offset.

    Args:
        inGsnapFileName: gsnap output to read.
        outBpFileName: file to write.

    Raises:
        Exception: if a record violates one of the requirements above.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    outBpFile = open(outBpFileName, 'w')

    # try/finally guarantees the output is flushed and closed on error.
    try:
        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation: %s' % r.pairRel)

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((k1, k2), v) in seqH.items():

            v.sort(key=lambda t: t[0], reverse=True)

            # TODO: the original had unfinished placeholders here (k1T, k2T,
            # k1_pos, k2_pos, k1_seq, k2_seq) that were syntax errors and
            # were never used by the write below; they have been removed.
            readField = '|'.join(['%s:%s' % (offset, seq) for (offset, seq) in v])
            outBpFile.write('%s,%s,%s\n' % (':'.join(k1), ':'.join(k2), readField))
    finally:
        outBpFile.close()
Example #11
0
def process_bp(inGsnapFileName):

	result = mygsnap.gsnapFile(inGsnapFileName,False)
	#outBpFile = open(outBpFileName, 'w')

	seqH = {}

	for r in result:

		match = r.matchL()[0]

		if not '(transloc)' in r.pairRel:
			raise Exception

		if len(match.segL) != 2:
			raise Exception

		s1 = match.segL[0][2]
		s2 = match.segL[1][2]

		direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

		bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
		bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

#		if bp1[0] == bp2[0]:
#			continue

		if direction == 'sense':
			seq = r.seq()
			offset = int(match.segL[0][1].split('..')[1])
			bp12 = (bp1, bp2)
		else:
			seq = mybasic.rc(r.seq(),'DNA')
			offset = len(seq)-int(match.segL[0][1].split('..')[1])
			bp12 = (bp2, bp1)

		mybasic.addHash(seqH,bp12,(offset,seq))

	seqL = seqH.items()
	seqL.sort(lambda x,y: cmp(len(y[1]),len(x[1])))

	for ((bp1,bp2), vL) in seqL:

		vL.sort(lambda x,y: cmp(y[0],x[0]))

		maxOffset = vL[0][0]

		print '\n',bp1,bp2,len(vL),'\n'

		for (offset,seq) in vL:

			print '%s%s %s' % (' ' * (maxOffset-offset),seq[:offset],seq[offset:])
def mutscan_signature_rsq():
    """Report how often each base substitution occurs in the RNA-seq mutscan files.

    ``find`` lists the ``*dbsnp_flt`` files under every directory of
    ``mysetting.rsqMutscanDirL``; paths containing ``bak`` are ignored and
    the sample id is parsed from the base name.  Rows are tab-separated,
    with ref, alt, n_ref and n_alt in columns 2-5 (0-based).  Rows with ref
    ``N`` or a multi-character alt are skipped.  A row is counted when
    ``n_alt >= MIN_MUT_N`` and ``n_alt + n_ref >= MIN_COV``; ref and alt
    are passed through ``mybasic.rc`` when ref is not C or T.

    Writes ``sid<TAB>ref>alt<TAB>count<TAB>total`` lines to stdout.

    Raises:
        AttributeError: if a file name does not match the expected pattern.
    """
    for dirN in mysetting.rsqMutscanDirL:
        # Quote the pattern so the shell passes it to find unexpanded.
        listing = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
        fileNL = [entry.rstrip() for entry in listing]
        fileNL = [fileN for fileN in fileNL if 'bak' not in fileN]

        for fileN in fileNL:
            baseN = os.path.basename(fileN)
            if 'splice_Z' in fileN:
                sid = re.match('(.*)_splice[2]*.mutscan.dbsnp_flt',
                               baseN).group(1)
            else:
                sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt',
                               baseN).group(1)

            cntH = {}
            total = 0
            # Closed automatically, including when a row fails to parse.
            with open(fileN, 'r') as inFile:
                for line in inFile:
                    colL = line.rstrip().split('\t')
                    ref = colL[2]
                    alt = colL[3]
                    if ref == 'N' or len(alt) > 1:
                        continue
                    n_ref = int(colL[4])
                    n_alt = int(colL[5])

                    if n_alt >= MIN_MUT_N and (n_alt + n_ref) >= MIN_COV:
                        if ref not in ('C', 'T'):
                            ref = mybasic.rc(ref)
                            alt = mybasic.rc(alt)
                        cntH[(ref, alt)] = cntH.get((ref, alt), 0) + 1
                        total += 1

            for ((r, a), n) in cntH.items():
                sys.stdout.write('%s\t%s>%s\t%s\t%s\n' % (sid, r, a, n, total))
Example #13
0
def process_bp(inGsnapFileName,outBpFileName):
    """Write, for every junction, its flanking blocks and the spanning reads.

    Every record of the gsnap file must have ``'(transloc)'`` in ``pairRel``,
    exactly two segments in its first match, and both segments on the same
    strand (first character of the segment locus).  Reads are grouped by
    junction; minus-strand reads are reverse-complemented and the junction
    pair is swapped.  Each output line is
    ``name:(pos-blockSize)-pos,name:pos-(pos+blockSize),off:seq|off:seq...``
    with ``off = blockSize - offset + 1`` and reads ordered by increasing
    offset.  ``blockSize`` is a module-level value defined outside this block.

    Args:
        inGsnapFileName: gsnap output to read.
        outBpFileName: file to write.

    Raises:
        Exception: if a record violates one of the requirements above.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)
    outBpFile = open(outBpFileName, 'w')

    # try/finally guarantees the output is flushed and closed on error.
    try:
        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('record is not a translocation: %s' % r.pairRel)

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():

            vL.sort(key=lambda t: t[0])

            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]

            # NOTE(review): both blocks are derived from j1; j2 is unused.
            # Confirm this is intended before changing it.
            name = j1[0].split('_')[0]
            pos = j1[1]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' % (name, int(pos) - blockSize, pos, name, pos, int(pos) + blockSize, '|'.join(vL_mod)))
    finally:
        outBpFile.close()
Example #14
0
def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Split each FASTQ read into a pseudo pair made from its two ends.

    For every 4-line record the first ``trimLen`` bases go to
    ``<prefix>.1.fastq`` as ``@<name>/1``.  The last ``trimLen`` bases,
    passed through ``mybasic.rc`` with their qualities passed through
    ``mybasic.rev``, go to ``<prefix>.2.fastq`` as ``@<name>/2``.  The read
    name is the header up to the first space.  Records with an ``N`` in
    either end are dropped.

    Args:
        inFqFileName: FASTQ path, or the literal ``'stdin'``.
        outFqFilePrefix: prefix for the two output files.
        trimLen: number of bases taken from each end.

    Raises:
        Exception: if a record header does not start with ``@``.
    """
    fromStdin = (inFqFileName == 'stdin')
    inFqFile = sys.stdin if fromStdin else open(inFqFileName)

    try:
        with open('%s.1.fastq' % outFqFilePrefix, 'w') as outFqFile1:
            with open('%s.2.fastq' % outFqFilePrefix, 'w') as outFqFile2:

                while True:

                    line = inFqFile.readline()
                    if not line:
                        break

                    # Strip only the newline: a final line without one must
                    # not lose its last base or quality character.
                    seq = inFqFile.readline().rstrip('\n')

                    if line[0] != '@':
                        raise Exception('FASTQ header does not start with "@": %r' % line)

                    seqN = line[1:].rstrip().split(' ')[0]

                    inFqFile.readline()  # '+' separator line

                    qual = inFqFile.readline().rstrip('\n')

                    if 'N' in seq[:trimLen] or 'N' in seq[-trimLen:]:
                        continue

                    outFqFile1.write('@%s/1\n%s\n+\n%s\n' %
                                     (seqN, seq[:trimLen], qual[:trimLen]))
                    outFqFile2.write(
                        '@%s/2\n%s\n+\n%s\n' %
                        (seqN, mybasic.rc(seq[-trimLen:]),
                         mybasic.rev(qual[-trimLen:])))
    finally:
        # stdin belongs to the caller; only close a file we opened.
        if not fromStdin:
            inFqFile.close()
Example #15
0
File: trim4x.py Project: SMC1/JK1
def trim4x(inFqFileName, outFqFilePrefix, trimLen):
    """Turn single FASTQ reads into pseudo read pairs built from both read ends.

    Each 4-line record yields ``@<name>/1`` (first ``trimLen`` bases and
    qualities) in ``<prefix>.1.fastq`` and ``@<name>/2`` in
    ``<prefix>.2.fastq``.  The second read is the last ``trimLen`` bases
    passed through ``mybasic.rc``, with qualities passed through
    ``mybasic.rev``.  ``<name>`` is the header text up to the first space.
    A record is skipped when either end contains ``N``.

    Args:
        inFqFileName: FASTQ path, or the literal ``"stdin"``.
        outFqFilePrefix: prefix for the two output files.
        trimLen: number of bases taken from each end.

    Raises:
        Exception: if a record header does not start with ``@``.
    """
    readsStdin = inFqFileName == "stdin"
    if readsStdin:
        inFqFile = sys.stdin
    else:
        inFqFile = open(inFqFileName)

    try:
        with open("%s.1.fastq" % outFqFilePrefix, "w") as outFqFile1:
            with open("%s.2.fastq" % outFqFilePrefix, "w") as outFqFile2:

                # iter(..., "") stops at end of input, when readline returns "".
                for header in iter(inFqFile.readline, ""):

                    # rstrip("\n") keeps the last character of a final line
                    # that has no newline (a plain [:-1] would cut it off).
                    seq = inFqFile.readline().rstrip("\n")

                    if header[0] != "@":
                        raise Exception('FASTQ header does not start with "@": %r' % header)

                    seqN = header[1:].rstrip().split(" ")[0]

                    inFqFile.readline()  # "+" separator line

                    qual = inFqFile.readline().rstrip("\n")

                    headEnd = seq[:trimLen]
                    tailEnd = seq[-trimLen:]
                    if "N" in headEnd or "N" in tailEnd:
                        continue

                    outFqFile1.write("@%s/1\n%s\n+\n%s\n" % (seqN, headEnd, qual[:trimLen]))
                    outFqFile2.write("@%s/2\n%s\n+\n%s\n" % (seqN, mybasic.rc(tailEnd), mybasic.rev(qual[-trimLen:])))
    finally:
        # stdin belongs to the caller; only close a file we opened.
        if not readsStdin:
            inFqFile.close()
Example #16
0
#!/usr/bin/python

import sys
import mybasic

# Motifs to count in each promoter sequence, plus each motif converted with
# mybasic.rc(..., 'DNA').
motifL = ['TAAT', 'TAATT', 'TAATTG']
motifL_rc = [mybasic.rc(m, 'DNA') for m in motifL]

# NOTE(review): both handles are opened at import time and never closed in
# the visible code.
bed = open('/data1/IRCR/PKS/promoter_hg19.bed')
fa = open('/data1/IRCR/PKS/promoter_hg19.fa')

# Fourth tab-separated column of every BED line.
# presumably the gene name, and presumably in the same order as the FASTA
# records -- verify against the input files.
geneNameL = [x.split('\t')[3] for x in bed]

# Header row: 'geneName' followed by three columns (m<k>f, m<k>r, m<k>t) per motif.
sys.stdout.write('geneName')

for i in range(len(motifL)):
    sys.stdout.write('\tm%sf\tm%sr\tm%st' % (i + 1, i + 1, i + 1))

sys.stdout.write('\n')

idx = 0

# NOTE(review): in the visible code this loop has no exit condition and idx
# is never advanced; the snippet appears to be cut off before its end.
while True:

    # Two lines are read per record; the last character of each is dropped
    # and the second line is upper-cased.
    h = fa.readline()[:-1]
    s = fa.readline()[:-1].upper()

    sys.stdout.write('%s' % (geneNameL[idx]))

    # NOTE(review): `i` is the value left over from the header loop above
    # (the last motif index); no per-motif loop is visible here.
    countL = (s.count(motifL[i]), s.count(motifL_rc[i]))
Example #17
0
def mutscan_signature(mode='WXS', outFileN=''):
    """Tabulate tri-nucleotide substitution signatures from mutscan result files.

    ``find`` lists the ``*dbsnp_flt`` files under the directories configured
    for ``mode``; paths containing ``bak`` are skipped.  Rows are
    tab-separated: chrom, pos, ref, alt, n_ref, n_alt.  Rows with ref ``N``
    or a multi-character alt are ignored; a row is counted when
    ``n_alt >= MIN_MUT_N`` and ``n_alt + n_ref >= MIN_COV``.  The bases from
    ``pos-1`` to ``pos+1`` are fetched with ``samtools faidx`` and cached per
    (chrom, pos).  Ref, alt and context are passed through ``mybasic.rc``
    when ref is not C or T.

    Args:
        mode: ``'WXS'`` or ``'RSQ'``; selects the directory list and the
            file-name pattern used to derive the sample id.
        outFileN: output path; the empty string means standard output.

    Output columns: sample id, change, context, count for (change, context),
    count for change, total counted rows of the file.

    Raises:
        KeyError: if ``mode`` is not ``'WXS'`` or ``'RSQ'``.
        AttributeError: if a file name does not match the expected pattern.
    """
    toStdout = (outFileN == '')
    outFile = sys.stdout if toStdout else open(outFileN, 'w')

    try:
        dirLH = {'WXS': mysetting.wxsMutscanDirL, 'RSQ': mysetting.rsqMutscanDirL}
        contextH = {}  # chrom -> {pos: reference tri-nucleotide}

        for dirN in dirLH[mode]:
            # Quote the pattern so the shell passes it to find unexpanded.
            listing = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
            fileNL = [entry.rstrip() for entry in listing]

            for fileN in fileNL:
                if 'bak' in fileN:
                    continue
                baseN = os.path.basename(fileN)

                if mode == 'RSQ':
                    if baseN == 'S647_splice.mutscan.dbsnp_flt':  # duplicated files
                        continue
                    if 'splice_Z' in fileN:
                        sid = re.match('(.*)_splice2.mutscan.dbsnp_flt', baseN).group(1)
                    else:
                        sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt', baseN).group(1)
                else:
                    sid = re.match('(.*).mutscan.dbsnp_flt', baseN).group(1)

                sigH = {}
                cntH = {}
                total = 0

                # Closed automatically, including when a row fails to parse.
                with open(fileN, 'r') as inFile:
                    for line in inFile:
                        colL = line.rstrip().split('\t')
                        chrom = colL[0]
                        pos = int(colL[1])
                        ref = colL[2]
                        alt = colL[3]
                        if ref == 'N' or len(alt) > 1:
                            continue
                        n_ref = int(colL[4])
                        n_alt = int(colL[5])

                        if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                            continue

                        chromCtxH = contextH.setdefault(chrom, {})
                        if pos in chromCtxH:
                            context = chromCtxH[pos]
                        else:
                            resL = os.popen('samtools faidx /data1/Sequence/ucsc_hg19/hg19.fasta %s:%s-%s' % (chrom, pos - 1, pos + 1)).readlines()
                            # resL[0] is the FASTA header; the sequence follows.
                            context = resL[1].rstrip().upper()
                            chromCtxH[pos] = context

                        if ref not in ('C', 'T'):
                            ref = mybasic.rc(ref)
                            alt = mybasic.rc(alt)
                            context = mybasic.rc(context)

                        ch = ref + '>' + alt
                        cntH[ch] = cntH.get(ch, 0) + 1
                        sigH[(ch, context)] = sigH.get((ch, context), 0) + 1
                        total += 1

                for ((ch, context), n) in sigH.items():
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (sid, ch, context, n, cntH[ch], total))

        outFile.flush()
    finally:
        # Never close the interpreter's stdout; only close what we opened.
        if not toStdout:
            outFile.close()
Example #18
0
def mutation_signature_ttt(inDirN, outDirN):
    """Run the mutation-signature plotting script for one sample directory.

    The sample name is the base name of ``inDirN``.  The R script
    ``<SRC_HOME>/NGS/mutation/mutect_mutation_signature_plot.R`` is invoked
    with ``<outDirN>/<sample>.mutation_signature.txt`` as its first argument
    and ``<outDirN>/<sample>.mutation_signature.pdf`` as its second.  This
    function does not create the ``.txt`` table itself.

    The original body had table-building code after the ``return`` that
    could never run (and referenced an undefined ``outFile``), plus two
    ``ls`` calls that fed only that code.  Both were removed.

    Args:
        inDirN: sample directory; only its base name is used.
        outDirN: directory holding the table and receiving the PDF.

    Returns:
        An empty tuple, as before.
    """
    sampN = os.path.basename(inDirN)
    outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)
    pdfFileN = '%s/%s.mutation_signature.pdf' % (outDirN, sampN)

    os.system('Rscript %s/NGS/mutation/mutect_mutation_signature_plot.R %s %s' % (mysetting.SRC_HOME, outName, pdfFileN))
    return ()
Example #19
0
def mutation_signature(inDirN, outDirN, outName=''):
    """Write the tri-nucleotide mutation table for one sample directory.

    Mutations are first collected from ``<inDirN>/*mutect*filter.vcf``
    (columns 0, 1, 3 and 4 of every non-``#`` line: chrom, pos, ref, alt).
    The MuTect tables ``<inDirN>/*mutect`` (or ``*mutect_rerun`` when none
    exist) are then read, and only rows whose (contig, position, ref_allele,
    alt_allele) appear in the VCF set are counted.  When the reference base
    is not C or T, the change and its context are passed through ``rc``
    (a helper defined outside this block).

    Args:
        inDirN: sample directory; its base name is the sample name.
        outDirN: used only to build the default output path.
        outName: output path; defaults to
            ``<outDirN>/<sample>.mutation_signature.txt``.

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.

    Raises:
        AttributeError: if VCF files exist and the sample name does not
            match ``<id>_<postfix>_<two letters of TKNCS>``.
    """
    sampN = os.path.basename(inDirN)
    if outName == '':
        outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)

    outFile = open(outName, 'w')
    try:
        outFile.write('samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n')  # header

        # Defined up front: with no VCF present the filter set is simply
        # empty, where the original hit a NameError on the first table row.
        mutH = {}
        sid = None

        vcfFileNL = [x.rstrip() for x in os.popen('ls %s/*mutect*filter.vcf' % inDirN).readlines()]
        if vcfFileNL:
            (sampId, postfix) = re.search('(.*)_([A-Z0-9]{1,})_[TKNCS]{2}', sampN).groups()
            if postfix != 'T':
                sid = '%s_%s' % (sampId, postfix)
            else:
                sid = sampId

            for vcfFileN in vcfFileNL:
                with open(vcfFileN, 'r') as vcfFile:
                    for line in vcfFile:
                        if line[0] == '#':
                            continue
                        colL = line.rstrip().split('\t')
                        mutH[(colL[0], colL[1], colL[3], colL[4])] = 1

        mutFileNL = [x.rstrip() for x in os.popen('ls %s/*mutect' % inDirN).readlines()]
        if not mutFileNL:
            mutFileNL = [x.rstrip() for x in os.popen('ls %s/*mutect_rerun' % inDirN).readlines()]

        sigH = {}
        cntH = {}
        total = 0
        for mutFileN in mutFileNL:
            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # leading comment line
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((colN, i) for (i, colN) in enumerate(headerL))
                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    chrom = colL[idxH['contig']]
                    pos = colL[idxH['position']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]
                    if (chrom, pos, ref, alt) not in mutH:
                        continue

                    total += 1
                    # Flanking bases sit at indexes 2 and 4 of the context string.
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)
                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sigH is non-empty only when a VCF was read, so sid is set here.
        for ((nt_ch, tri), freq) in sigH.items():
            outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (sid, nt_ch, tri, freq, cntH[nt_ch], total))
    finally:
        outFile.close()
Example #20
0
def main(inFileName,geneList=None):
    """Collapse a tab-separated mutation export into one line per distinct variant.

    The first line of ``inFileName`` is the header; the columns named in
    ``nameL`` below are located by name.  A row is kept when its gene (with
    any ``_ENST...`` suffix removed) is in ``geneList`` (if one is given),
    its genome position is non-empty, and its somatic status contains
    ``'somatic'``.  Ref and alt alleles are parsed from the CDS annotation
    and passed through ``mybasic.rc`` for ``-`` strand rows.  Chromosome
    numbers 23, 24 and 25 are reported as X, Y and M.

    Rows sharing (chrom, start, end, strand, ref, alt) are merged.  For each
    key one line is written to stdout: ``chr<chrom>``, start, end, strand,
    ref, alt, then comma-joined gene names (names starting with ``ENSG``
    are dropped), CDS, AA and description values.

    Args:
        inFileName: path of the tab-separated input file.
        geneList: optional collection of gene names to keep; ``None`` or an
            empty collection keeps every gene.

    Raises:
        ValueError: if a required column is missing from the header.
    """
    nameL = ('Gene name','Mutation CDS','Mutation AA','Mutation Description','Mutation GRCh37 genome position','Mutation GRCh37 strand','Mutation somatic status')

    # Chromosome numbers as they appear in the position column -> letter names.
    chromNameH = {'23': 'X', '24': 'Y', '25': 'M'}

    dataH = {}

    with open(inFileName) as inFile:

        # rstrip('\n') rather than [:-1]: a final line without a newline must
        # not lose its last character.
        headerL = inFile.readline().rstrip('\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:

            valueL = line.rstrip('\n').split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                geneN = geneN.split('_ENST')[0]

            if geneList and geneN not in geneList:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            somatic = valueL[idxH['Mutation somatic status']]
            if 'somatic' not in somatic:
                continue

            (chrNum,chrSta,chrEnd) = re.search('([^:-]+):([^:-]+)-([^:-]+)', coord).groups()

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            rm = re.match(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)', cds)
            if rm:
                (ref,vtype,alt) = rm.groups()
            else:
                ref,alt = '',''

            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            chrNum = chromNameH.get(chrNum, chrNum)

            # A disabled experiment that re-anchored deletions against hg19
            # via samtools was removed from here.

            key = (chrNum,chrSta,chrEnd,strand,ref,alt)

            if key in dataH:
                mybasic.pushHash(dataH[key],'geneN',geneN)
                mybasic.pushHash(dataH[key],'cds',cds)
                mybasic.pushHash(dataH[key],'aa',aa)
                mybasic.pushHash(dataH[key],'desc',desc)
            else:
                dataH[key] = {'geneN':set([geneN]), 'cds':set([cds]), 'aa':set([aa]), 'desc':set([desc])}

    for ((chrNum,chrSta,chrEnd,strand,ref,alt),infoH) in dataH.items():

        geneNL = [g for g in infoH['geneN'] if not g.startswith('ENSG')]

        sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chrNum,chrSta,chrEnd,strand, ref,alt,
            ','.join(geneNL), ','.join(infoH['cds']), ','.join(infoH['aa']), ','.join(infoH['desc'])))
Example #21
0
def mutation_signature_ttt(inDirN, outDirN):
    """Invoke the mutation-signature R plot for a sample's existing table.

    The sample name is ``os.path.basename(inDirN)``.  The script
    ``<SRC_HOME>/NGS/mutation/mutect_mutation_signature_plot.R`` is run with
    two arguments: ``<outDirN>/<sample>.mutation_signature.txt`` and
    ``<outDirN>/<sample>.mutation_signature.pdf``.  The ``.txt`` table is
    not produced here.

    The code that followed the original ``return`` was unreachable and used
    an undefined ``outFile``; it was dropped together with the ``ls`` calls
    whose results only that code consumed.

    Args:
        inDirN: sample directory; only its base name is used.
        outDirN: directory holding the table and receiving the PDF.

    Returns:
        An empty tuple, as before.
    """
    sampN = os.path.basename(inDirN)
    outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)
    pdfFileN = '%s/%s.mutation_signature.pdf' % (outDirN, sampN)

    os.system(
        'Rscript %s/NGS/mutation/mutect_mutation_signature_plot.R %s %s' %
        (mysetting.SRC_HOME, outName, pdfFileN))
    return ()
Example #22
0
def mutation_signature(inDirN, outDirN, outName=''):
    """Tabulate the trinucleotide mutation signature of one sample directory.

    Variants listed in the ``*mutect*filter.vcf`` file(s) of ``inDirN`` form a
    whitelist.  Every row of the raw ``*mutect`` call file(s) (or, when none
    exist, the ``*mutect_rerun`` file(s)) whose (contig, position, ref, alt)
    is whitelisted is counted by substitution type and trinucleotide context.
    Substitutions whose reference base is not C or T are reverse-complemented
    with ``rc`` so that all counts are reported on the C/T strand.

    Args:
        inDirN: sample directory; its basename is used as the sample name.
        outDirN: directory for the default output file.
        outName: output path; when empty,
            ``<outDirN>/<sample>.mutation_signature.txt`` is used.

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.

    Raises:
        AttributeError: if filtered VCF files exist but the sample name does
            not match ``<id>_<postfix>_<two letters of TKNCS>``.
    """
    import glob  # function-scope import: replaces shelling out to `ls`

    sampN = os.path.basename(inDirN)
    if outName == '':
        outName = '%s/%s.mutation_signature.txt' % (outDirN, sampN)

    with open(outName, 'w') as outFile:
        outFile.write(
            'samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n')  # header

        # Whitelist of variants that passed filtering.  It is always bound,
        # so a directory without a filtered VCF yields a header-only file
        # instead of a NameError further down.
        sid = None
        mutS = set()
        vcfFileNL = sorted(glob.glob('%s/*mutect*filter.vcf' % inDirN))
        if vcfFileNL:
            (prefix, postfix) = re.search(r'(.*)_([A-Z0-9]{1,})_[TKNCS]{2}',
                                          sampN).groups()
            # A plain 'T' postfix is dropped from the reported sample id.
            if postfix != 'T':
                sid = '%s_%s' % (prefix, postfix)
            else:
                sid = prefix

            for vcfFileN in vcfFileNL:
                with open(vcfFileN, 'r') as vcfFile:
                    for line in vcfFile:
                        if line.startswith('#') or not line.strip():
                            continue
                        colL = line.rstrip().split('\t')
                        # VCF columns: CHROM, POS, ID, REF, ALT
                        mutS.add((colL[0], colL[1], colL[3], colL[4]))

        mutFileNL = sorted(glob.glob('%s/*mutect' % inDirN))
        if not mutFileNL:
            mutFileNL = sorted(glob.glob('%s/*mutect_rerun' % inDirN))

        sigH = {}  # (substitution, trinucleotide) -> count
        cntH = {}  # substitution -> count
        total = 0
        for mutFileN in mutFileNL:
            with open(mutFileN, 'r') as mutFile:
                mutFile.readline()  # first line precedes the column header
                headerL = mutFile.readline().rstrip().split('\t')
                idxH = dict((name, i) for (i, name) in enumerate(headerL))
                for line in mutFile:
                    colL = line.rstrip().split('\t')
                    contig = colL[idxH['contig']]
                    pos = colL[idxH['position']]
                    context = colL[idxH['context']]
                    ref = colL[idxH['ref_allele']]
                    alt = colL[idxH['alt_allele']]
                    if (contig, pos, ref, alt) not in mutS:
                        continue

                    total += 1
                    # The bases flanking the variant sit at offsets 2 and 4
                    # of the context string.
                    tri = context[2] + ref + context[4]
                    if ref == 'C' or ref == 'T':
                        nt_ch = ref + '>' + alt
                    else:
                        nt_ch = rc(ref) + '>' + rc(alt)
                        tri = rc(tri)
                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sigH can only be non-empty when the whitelist is non-empty, i.e.
        # when a filtered VCF existed and sid was therefore resolved.
        for (nt_ch, tri) in sigH:
            outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                          (sid, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch],
                           total))
Example #23
0
def make_samse(ifileN, ofileN):
    """Print single-end gsnap alignments to stdout as SAM-style records.

    Header lines come from make_header(); then one 12-field tab-separated
    record (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
    qual, extra tags) is printed per read, or per segment for reads whose
    pairRel is '(transloc)'.

    NOTE(review): ifileN and ofileN are never read; the gsnap input path is
    hard-coded below and all output goes to stdout.
    """

    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    #	ofile = open(ofileN, 'w')
    for header in headerL:
        print header
#		ofile.write('%s\n' % header)

    result = mygsnap.gsnapFile(
        '/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)

    ## for unpaired
    for r in result:

        # Per-read defaults; the branches below override what they can.
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)

        if r.nLoci > 1:
            # Reads with more than one locus are emitted with flag 0x4,
            # CIGAR '*' and no extra tags.
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                  (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext,
                   tlen, seq, qual, extra))
        else:
            if r.pairRel == '(transloc)':
                # Translocation: one record per segment, each carrying only
                # the part of the read selected by the clip offset.
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()
                for seg in segL:
                    flag = 0x0
                    (strand, rname, pos1,
                     pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                       seg.seg[2]).groups()
                    pos = min(int(pos1), int(pos2))
                    (cigar, clip) = seg.toCIGAR_trans()
                    if clip < 0:  ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else:  ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]
                    if strand == '-':
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)
#					print qname,seg.toCIGAR_trans()
                    print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                          (qname, flag, rname, pos, mapq, cigar, rnext, pnext,
                           tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[
                    0]  ## assume --npath=1 (maximum 1 alignment per read)

                segL = match.getSegInfo()
                (strand, rname, pos1,
                 pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                   segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub
                # cigar2 is the CIGAR that is actually printed at the end of
                # this branch; `cigar` and `new_cigar` built below only
                # appear in the commented-out print.
                cigar2 = match.toCIGAR()
                #				print qname, match.toCIGAR()

                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'

                # On the minus strand operations are prepended, on the plus
                # strand they are appended.
                if strand == '-':
                    cigar = str(segL[0].numMatch +
                                segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch +
                                        segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'

                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()
                prev_cigar = new_cigar
                index = 0
                for seg in segL[1:]:
                    index = index + 1
                    # `final` marks the last segment of the alignment.
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False
                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                   seg.seg[2]).groups()

                    # NOTE: `match` is rebound here from the alignment object
                    # to a CIGAR fragment string.
                    match = str(seg.numMatch + seg.numSub) + 'M'

                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''

                    # Keep the smallest coordinate seen as the record position.
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))
                    # `dist` is the gap between the previous segment's end and
                    # this segment's start; positive gaps become an 'N' op.
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins
                    seg_nm = seg_nm + seg.numSub
                    pos1 = rm[2]
                    pos2 = rm[3]
                    cur_cigar = seg.toCIGAR(final)
                    # The 'N' gap is only inserted when the previous segment's
                    # CIGAR contains neither 'D' nor 'I'.
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar
                    prev_cigar = cur_cigar

                if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'

                # NM tag: sum of numSub over all segments.
                extra = extra + ('\tNM:i:%s' % seg_nm)

                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)

##			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                      (qname, flag, rname, pos, mapq, cigar2, rnext, pnext,
                       tlen, seq, qual, extra))
Example #24
0
def main(inFileName, geneList=[]):
    """Collapse a tab-separated mutation export into one line per variant.

    Reads ``inFileName`` (header row first), keeps rows that have a genome
    position and whose somatic-status column contains 'somatic', groups them
    by (chromosome, start, end, strand, ref, alt) and writes one tab-separated
    line per group to stdout:

        chr<chrom>, start, end, strand, ref, alt,
        gene names (comma-joined, names starting with 'ENSG' dropped),
        CDS changes, AA changes, descriptions (each comma-joined)

    Args:
        inFileName: path of the input table.  It must contain the columns
            named in ``nameL`` below.
        geneList: optional gene names; when non-empty, rows for other genes
            are skipped.  The default list is never mutated.

    Raises:
        ValueError: if a required column is missing from the header.
        AttributeError: if a non-empty genome position is not of the form
            ``<chrom>:<start>-<end>``.
    """
    nameL = ('Gene name', 'Mutation CDS', 'Mutation AA',
             'Mutation Description', 'Mutation GRCh37 genome position',
             'Mutation GRCh37 strand', 'Mutation somatic status')

    # Numeric chromosome codes that are reported by letter instead.
    chromAliasH = {'23': 'X', '24': 'Y', '25': 'M'}

    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

    dataH = {}

    with open(inFileName) as inFile:
        # Strip only the line terminator: a final line without a trailing
        # newline must not lose the last character of its last field.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:
            valueL = line.rstrip('\r\n').split('\t')

            # Transcript-specific names look like '<gene>_ENST...'.
            geneN = valueL[idxH['Gene name']].split('_ENST')[0]
            if len(geneList) > 0 and geneN not in geneList:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            if 'somatic' not in valueL[idxH['Mutation somatic status']]:
                continue

            (chrNum, chrSta, chrEnd) = coordRe.search(coord).groups()

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            rm = cdsRe.match(cds)
            if rm:
                (ref, _, alt) = rm.groups()
            else:
                ref, alt = '', ''

            # Alleles are reverse-complemented for minus-strand entries.
            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            chrNum = chromAliasH.get(chrNum, chrNum)

            key = (chrNum, chrSta, chrEnd, strand, ref, alt)
            if key in dataH:
                mybasic.pushHash(dataH[key], 'geneN', geneN)
                mybasic.pushHash(dataH[key], 'cds', cds)
                mybasic.pushHash(dataH[key], 'aa', aa)
                mybasic.pushHash(dataH[key], 'desc', desc)
            else:
                dataH[key] = {
                    'geneN': set([geneN]),
                    'cds': set([cds]),
                    'aa': set([aa]),
                    'desc': set([desc])
                }

    for ((chrNum, chrSta, chrEnd, strand, ref, alt),
         infoH) in dataH.items():
        geneNL = [g for g in infoH['geneN'] if not g.startswith('ENSG')]
        sys.stdout.write(
            'chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
            (chrNum, chrSta, chrEnd, strand, ref, alt, ','.join(geneNL),
             ','.join(infoH['cds']), ','.join(infoH['aa']),
             ','.join(infoH['desc'])))
Example #25
0
#!/usr/bin/python

import sys
import mybasic

# Motifs to count in each promoter sequence, plus their reverse complements
# as returned by mybasic.rc(m, "DNA").
motifL = ["TAAT", "TAATT", "TAATTG"]
motifL_rc = [mybasic.rc(m, "DNA") for m in motifL]

# Promoter annotation (BED) and the matching sequences (FASTA).
# NOTE(review): neither handle is closed in the visible part of the script.
bed = open("/data1/IRCR/PKS/promoter_hg19.bed")
fa = open("/data1/IRCR/PKS/promoter_hg19.fa")

# The fourth tab-separated BED column is used as the gene name.
geneNameL = [x.split("\t")[3] for x in bed]

# Header row: for motif k the columns are m<k>f, m<k>r and m<k>t.
sys.stdout.write("geneName")

for i in range(len(motifL)):
    sys.stdout.write("\tm%sf\tm%sr\tm%st" % (i + 1, i + 1, i + 1))

sys.stdout.write("\n")

idx = 0

while True:

    # Each record is read as two lines: a header line, then the sequence
    # (upper-cased); the last character of each line is dropped.
    h = fa.readline()[:-1]
    s = fa.readline()[:-1].upper()

    sys.stdout.write("%s" % (geneNameL[idx]))

    # NOTE(review): the snippet appears to be truncated here. As shown, `i`
    # is the value left over from the header loop above (the index of the
    # last motif), `idx` is never advanced and no exit from the loop is
    # visible -- confirm against the full script.
    countL = (s.count(motifL[i]), s.count(motifL_rc[i]))
Example #26
0
def mutation_signature(inDirN, outDirN, outName=""):
    """Tabulate the trinucleotide mutation signature of one sample directory.

    Variants listed in the ``*mutect*filter.vcf`` file(s) of ``inDirN`` form a
    whitelist.  Every row of the raw ``*mutect`` call file(s) (or, when none
    exist, the ``*mutect_rerun`` file(s)) whose (contig, position, ref, alt)
    is whitelisted is counted by substitution type and trinucleotide context.
    Substitutions whose reference base is not C or T are reverse-complemented
    with ``rc`` so that all counts are reported on the C/T strand.

    Args:
        inDirN: sample directory; its basename is used as the sample name.
        outDirN: directory for the default output file.
        outName: output path; when empty,
            ``<outDirN>/<sample>.mutation_signature.txt`` is used.

    Output columns: samp_id, mutation, context, freq, n_mut, n_total.

    Raises:
        AttributeError: if filtered VCF files exist but the sample name does
            not match ``<id>_<postfix>_<two letters of TKNCS>``.
    """
    import glob  # function-scope import: replaces shelling out to `ls`

    sampN = os.path.basename(inDirN)
    if outName == "":
        outName = "%s/%s.mutation_signature.txt" % (outDirN, sampN)

    with open(outName, "w") as outFile:
        outFile.write("samp_id\tmutation\tcontext\tfreq\tn_mut\tn_total\n")  # header

        # Whitelist of variants that passed filtering.  It is always bound,
        # so a directory without a filtered VCF yields a header-only file
        # instead of a NameError further down.
        sid = None
        mutS = set()
        vcfFileNL = sorted(glob.glob("%s/*mutect*filter.vcf" % inDirN))
        if vcfFileNL:
            (prefix, postfix) = re.search(r"(.*)_([A-Z0-9]{1,})_[TKNCS]{2}", sampN).groups()
            # A plain "T" postfix is dropped from the reported sample id.
            if postfix != "T":
                sid = "%s_%s" % (prefix, postfix)
            else:
                sid = prefix

            for vcfFileN in vcfFileNL:
                with open(vcfFileN, "r") as vcfFile:
                    for line in vcfFile:
                        if line.startswith("#") or not line.strip():
                            continue
                        colL = line.rstrip().split("\t")
                        # VCF columns: CHROM, POS, ID, REF, ALT
                        mutS.add((colL[0], colL[1], colL[3], colL[4]))

        mutFileNL = sorted(glob.glob("%s/*mutect" % inDirN))
        if not mutFileNL:
            mutFileNL = sorted(glob.glob("%s/*mutect_rerun" % inDirN))

        sigH = {}  # (substitution, trinucleotide) -> count
        cntH = {}  # substitution -> count
        total = 0
        for mutFileN in mutFileNL:
            with open(mutFileN, "r") as mutFile:
                mutFile.readline()  # first line precedes the column header
                headerL = mutFile.readline().rstrip().split("\t")
                idxH = dict((name, i) for (i, name) in enumerate(headerL))
                for line in mutFile:
                    colL = line.rstrip().split("\t")
                    contig = colL[idxH["contig"]]
                    pos = colL[idxH["position"]]
                    context = colL[idxH["context"]]
                    ref = colL[idxH["ref_allele"]]
                    alt = colL[idxH["alt_allele"]]
                    if (contig, pos, ref, alt) not in mutS:
                        continue

                    total += 1
                    # The bases flanking the variant sit at offsets 2 and 4
                    # of the context string.
                    tri = context[2] + ref + context[4]
                    if ref == "C" or ref == "T":
                        nt_ch = ref + ">" + alt
                    else:
                        nt_ch = rc(ref) + ">" + rc(alt)
                        tri = rc(tri)
                    sigH[(nt_ch, tri)] = sigH.get((nt_ch, tri), 0) + 1
                    cntH[nt_ch] = cntH.get(nt_ch, 0) + 1

        # sigH can only be non-empty when the whitelist is non-empty, i.e.
        # when a filtered VCF existed and sid was therefore resolved.
        for (nt_ch, tri) in sigH:
            outFile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (sid, nt_ch, tri, sigH[(nt_ch, tri)], cntH[nt_ch], total))
        outFile.flush()
Example #27
0
def make_samse(ifileN, ofileN):
	"""Print single-end gsnap alignments to stdout as SAM-style records.

	Header lines come from make_header(); then one 12-field tab-separated
	record (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq,
	qual, extra tags) is printed per read, or per segment for reads whose
	pairRel is '(transloc)'.

	NOTE(review): ifileN and ofileN are never read; the gsnap input path is
	hard-coded below and all output goes to stdout.
	"""

	headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
#	ofile = open(ofileN, 'w')
	for header in headerL:
		print header
#		ofile.write('%s\n' % header)

	result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_single.gsnap',False)
	#result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
	#result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)

	## for unpaired
	for r in result:

		# Per-read defaults; the branches below override what they can.
		qname = r.rid()
		flag = 0x0
		rname = '*'
		pos = 0
		mapq = 0
		cigar = ''
		rnext = '*' ## assume --npath=1 (maximum 1 alignment per read)
		pnext = 0 ## assume --npath=1 (maximum 1 alignment per read)
		tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
		seq = r.seq()
		qual = r.qual()
		extra = 'NH:i:1\tHI:i:1' ## assume --npath=1 (maximum 1 alignment per read)

		if r.nLoci > 1:
			# Reads with more than one locus are emitted with flag 0x4,
			# CIGAR '*' and no extra tags.
			flag = flag | 0x4
			cigar = '*'
			new_cigar = '*'
			extra = ''
			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext, tlen, seq, qual, extra))
		else:
			if r.pairRel == '(transloc)':
				# Translocation: one record per segment, each carrying only
				# the part of the read selected by the clip offset.
				match = r.matchL()[0]
				segL = match.getSegInfo()
				mapq = segL[0].mapq
				seq = r.seq()
				qual = r.qual()
				for seg in segL:
					flag = 0x0
					(strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
					pos = min(int(pos1), int(pos2))
					(cigar,clip) = seg.toCIGAR_trans()
					if clip < 0: ## first half
						seq2 = seq[:clip]
						qual2 = qual[:clip]
					else: ## second half
						seq2 = seq[clip:]
						qual2 = qual[clip:]
					if strand == '-':
						flag = flag | 0x10
						seq2 = mybasic.rc(seq2)
						qual2 = mybasic.rev(qual2)
#					print qname,seg.toCIGAR_trans()
					print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq2, qual2, extra))
			else:
				match = r.matchL()[0] ## assume --npath=1 (maximum 1 alignment per read)

				segL = match.getSegInfo()
				(strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', segL[0].seg[2]).groups()
				pos = min(int(pos1), int(pos2))
				mapq = segL[0].mapq
				seg_nm = segL[0].numSub
				# cigar2 is the CIGAR that is actually printed at the end of
				# this branch; `cigar` and `new_cigar` built below only
				# appear in the commented-out print.
				cigar2 = match.toCIGAR()
#				print qname, match.toCIGAR()

				if segL[0].start != '' and segL[0].start != '0':
					cigar = str(segL[0].start) + 'S'

				# On the minus strand operations are prepended, on the plus
				# strand they are appended.
				if strand == '-':
					cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
					if segL[0].ins != '' and segL[0].ins != '0':
						cigar = str(segL[0].ins) + 'I' + cigar
				else:
					cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
					if segL[0].ins != '' and segL[0].ins != '0':
						cigar = cigar + str(segL[0].ins) + 'I'

				if len(segL) == 1:
					new_cigar = segL[0].toCIGAR(True)
				else:
					new_cigar = segL[0].toCIGAR()
				prev_cigar = new_cigar
				index = 0
				for seg in segL[1:]:
					index = index + 1
					# `final` marks the last segment of the alignment.
					if index == (len(segL) - 1):
						final = True
					else:
						final = False
					rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()

					# NOTE: `match` is rebound here from the alignment object
					# to a CIGAR fragment string.
					match = str(seg.numMatch + seg.numSub) + 'M'

					if seg.ins != '' and seg.ins != '0':
						ins = str(seg.ins) + 'I'
					else:
						ins = ''

					# Keep the smallest coordinate seen as the record position.
					if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
						pos = min(int(rm[2]), int(rm[3]))
					# `dist` is the gap between the previous segment's end and
					# this segment's start; positive gaps become an 'N' op.
					if strand == '-':
						dist = int(pos2) - int(rm[2]) - 1
						if dist > 0:
							cigar = match + ins + str(dist) + 'N' + cigar
						else:
							cigar = match + ins + cigar
					else:
						dist = int(rm[2]) - int(pos2) - 1
						if dist > 0:
							cigar = cigar + str(dist) + 'N' + match + ins
						else:
							cigar = cigar + match + ins
					seg_nm = seg_nm + seg.numSub
					pos1 = rm[2]
					pos2 = rm[3]
					cur_cigar = seg.toCIGAR(final)
					# The 'N' gap is only inserted when the previous segment's
					# CIGAR contains neither 'D' nor 'I'.
					if strand == '-':
						if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
							new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
						else:
							new_cigar = cur_cigar + new_cigar
					else:
						if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
							new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
						else:
							new_cigar = new_cigar + cur_cigar
					prev_cigar = cur_cigar

				if segL[-1].end != '' and segL[-1].end != '0': ## last segment
					if strand == '-':
						cigar = str(segL[-1].end) + 'S' + cigar
					else:
						cigar = cigar + str(segL[-1].end) + 'S'

				# NM tag: sum of numSub over all segments.
				extra = extra + ('\tNM:i:%s' % seg_nm)

				if strand == '-':
					flag = flag | 0x10
					seq = mybasic.rc(seq)
					qual = mybasic.rev(qual)

##			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
				print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar2, rnext, pnext, tlen, seq, qual, extra))
def mutscan_signature(mode='WXS', outFileN=''):
    """Write per-sample trinucleotide mutation signatures from mutscan output.

    Every ``*dbsnp_flt`` file found under the directories configured for
    ``mode`` (paths containing 'bak' are skipped) is treated as one sample.
    Single-base variants with at least MIN_MUT_N alternate reads and at least
    MIN_COV total reads are counted by substitution type and by the
    three-base reference context fetched with ``samtools faidx``.
    Substitutions whose reference base is not C or T are reverse-complemented
    so that all counts are reported on the C/T strand.

    Args:
        mode: 'WXS' or 'RSQ'; selects the directory list from ``mysetting``
            and the file-name pattern used to derive the sample id.
        outFileN: output path; when empty, results go to ``sys.stdout``.

    Output columns (tab-separated, no header): sample id, substitution,
    context, count for (substitution, context), count for substitution,
    total counted variants of the sample.

    Raises:
        KeyError: if ``mode`` is neither 'WXS' nor 'RSQ'.
        AttributeError: if a file name does not match the expected pattern.
        IndexError: if ``samtools faidx`` returns fewer than two lines.
    """
    ownsOutFile = outFileN != ''
    if ownsOutFile:
        outFile = open(outFileN, 'w')
    else:
        outFile = sys.stdout
    dirLH = {'WXS': mysetting.wxsMutscanDirL, 'RSQ': mysetting.rsqMutscanDirL}

    # Reference contexts are cached across files: chrom -> {pos -> context}
    contextH = {}
    try:
        for dirN in dirLH[mode]:
            # The -name pattern is quoted so that the shell hands it to find
            # unchanged instead of expanding it against the working directory.
            foundL = os.popen("find %s -name '*dbsnp_flt'" % dirN).readlines()
            fileL = [f for f in (x.rstrip() for x in foundL)
                     if 'bak' not in f]
            for fileN in fileL:
                baseN = os.path.basename(fileN)
                if mode == 'RSQ':
                    if baseN == 'S647_splice.mutscan.dbsnp_flt':  ## duplicated files
                        continue
                    if 'splice_Z' in fileN:
                        sid = re.match('(.*)_splice2.mutscan.dbsnp_flt',
                                       baseN).group(1)
                    else:
                        sid = re.match('(.*)_RSq_splice.mutscan.dbsnp_flt',
                                       baseN).group(1)
                else:
                    sid = re.match('(.*).mutscan.dbsnp_flt', baseN).group(1)

                sigH = {}  # (substitution, context) -> count
                cntH = {}  # substitution -> count
                total = 0
                with open(fileN, 'r') as inFile:
                    for line in inFile:
                        colL = line.rstrip().split('\t')
                        chrom = colL[0]
                        pos = int(colL[1])
                        ref = colL[2]
                        alt = colL[3]
                        # Only single-base changes on a known reference base.
                        if ref == 'N' or len(alt) > 1:
                            continue
                        n_ref = int(colL[4])
                        n_alt = int(colL[5])
                        if n_alt < MIN_MUT_N or (n_alt + n_ref) < MIN_COV:
                            continue

                        chromCtxH = contextH.setdefault(chrom, {})
                        if pos in chromCtxH:
                            context = chromCtxH[pos]
                        else:
                            # The first output line of samtools is the FASTA
                            # header; the second holds the three bases.
                            resL = os.popen(
                                'samtools faidx /data1/Sequence/ucsc_hg19/hg19.fasta %s:%s-%s'
                                % (chrom, pos - 1, pos + 1)).readlines()
                            context = resL[1].rstrip().upper()
                            chromCtxH[pos] = context

                        if ref not in ['C', 'T']:
                            ref = mybasic.rc(ref)
                            alt = mybasic.rc(alt)
                            context = mybasic.rc(context)

                        ch = ref + '>' + alt
                        cntH[ch] = cntH.get(ch, 0) + 1
                        sigH[(ch, context)] = sigH.get((ch, context), 0) + 1
                        total += 1
                ##for line

                for (ch, context) in sigH:
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                                  (sid, ch, context, sigH[(ch, context)],
                                   cntH[ch], total))
            ##for file
        #for dir
    finally:
        outFile.flush()
        # Only close what this function opened; closing sys.stdout would
        # break any later output of the calling script.
        if ownsOutFile:
            outFile.close()