def overallCaptureFraction(dirinput, fileList, probes, region='intron'):
    """Return, per input file, the fraction of alternate observations (AO)
    belonging to each of the six base-change classes.

    Every file ``dirinput + '/' + name`` is read line by line. Lines that
    contain '#' or lack 'chr' are skipped. A record is used only when both
    alleles are one base long and its ``af`` is below 0.1. A change that is
    not one of the six classes is relabelled with the complement of both
    bases. The record's AO is added to its class once for every probe
    location whose string form occurs in the record's location string.

    Args:
        dirinput: Directory holding the files named in ``fileList``.
        fileList: Iterable of file names to read.
        probes: Mapping of probe name to an iterable of locations. Names
            starting with 'T' are counted for ``region='intron'``; all other
            names are counted for ``region='exon'``.
        region: 'intron' or 'exon'. Any other value counts nothing.

    Returns:
        dict mapping each class ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
        to a list with one fraction per file, in ``fileList`` order. A file
        with no counted observations contributes 0.0 to every class rather
        than raising ZeroDivisionError.
    """
    print('Reading overall captured base changes...')
    from parseline import VCFObj
    from Bio.Seq import Seq

    changes = ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
    vafs = dict((change, []) for change in changes)

    for fileName in fileList:
        counts = dict((change, 0) for change in changes)
        # The context manager closes the file even if a record fails to parse.
        with open(dirinput + '/' + fileName, 'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:
                    continue  # skip the info
                vcfObj = VCFObj(line)
                # only use single substitutions and eliminate SNPs
                if not (len(vcfObj.wt) == 1 and len(vcfObj.var) == 1
                        and vcfObj.af < 0.1):
                    continue
                varType = '%s>%s' % (vcfObj.wt, vcfObj.var)
                if varType not in vafs:
                    varType = '%s>%s' % (str(Seq(vcfObj.wt).complement()),
                                         str(Seq(vcfObj.var).complement()))
                location = str(vcfObj.location)
                for probeName in probes:
                    for probeLoc in probes[probeName]:
                        # str() mirrors the other parsers in this file and
                        # tolerates non-string probe locations.
                        if str(probeLoc) not in location:
                            continue
                        isIntronProbe = probeName[0] == 'T'
                        if ((region == 'intron' and isIntronProbe)
                                or (region == 'exon' and not isIntronProbe)):
                            # add AO together
                            counts[varType] += int(vcfObj.ao)

        # calculate fractions and transfer them into vafs
        totalAO = sum(counts.values())
        for change in changes:
            if totalAO:
                fraction = float(counts[change]) / float(totalAO)
            else:
                fraction = 0.0
            vafs[change].append(fraction)
    return vafs
def readVCF(inFiles, inDir, ref, probes):
    """Summarise, per sample, the mean AO of selected C/G substitutions in
    exonic versus intronic probes, normalised so the two values sum to 1.

    For each sample the file ``inDir/<sample>/onlyProbedRegions.vcf`` is read.
    Lines that contain '#' or lack 'chr' are skipped. A record is used when
    its reference base is 'C' or 'G', its variant is one base long, its
    ``af`` is below 0.1, and characters 1-2 of the sequence returned by
    ``getRefSequence(line, 1, ref)`` are 'CG' or 'GC'. The record is counted
    once for every probe location whose string form occurs in the record's
    location string.

    Probe names starting with 'T' are tallied as intronic and all others as
    exonic, which is the convention used by the other parsers in this file.

    Args:
        inFiles: Iterable of sample directory names.
        inDir: Directory containing the sample directories.
        ref: Reference passed through to ``getRefSequence``.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        defaultdict(list) with keys 'Exonic' and 'Intronic', each holding one
        normalised value per sample. A category with no sites has mean 0.0;
        if both means are 0 both normalised values are 0.0.
    """
    from parseline import VCFObj
    from getSequence import getRefSequence
    from collections import defaultdict

    totalAO = defaultdict(list)
    header = ('TotExon', 'ExonCount', 'TotIntr', 'IntrCount',
              'Exonic', 'Intronic', 'NormExon', 'NormIntron')
    # A single joined string prints identically under Python 2 and Python 3.
    print(' '.join(header))

    for sample in inFiles:
        totalExon = 0
        totalIntron = 0
        exonCount = 0
        intronCount = 0
        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf', 'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:
                    continue  # skip the info
                lobj = VCFObj(line)
                # only C or G could be in CpG context
                if not (lobj.wt in ('C', 'G') and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue
                seq = getRefSequence(line, 1, ref)
                # NOTE(review): for a 'G' reference base, a CpG on the other
                # strand would appear as 'CG' in seq[0:2]; confirm that
                # testing seq[1:3] == 'GC' is what is intended here.
                if seq[1:3] not in ('CG', 'GC'):
                    continue
                location = str(lobj.location)
                for probeName in probes:
                    for loc in probes[probeName]:
                        if str(loc) not in location:
                            continue
                        if probeName[0] == 'T':
                            intronCount += 1
                            totalIntron += lobj.ao
                        else:
                            exonCount += 1
                            totalExon += lobj.ao

        # first normalize by total number of sites in each category;
        # float() avoids integer truncation under Python 2
        exonic = float(totalExon) / exonCount if exonCount else 0.0
        intronic = float(totalIntron) / intronCount if intronCount else 0.0
        # normalize to 100 percent
        combined = exonic + intronic
        normexonic = exonic / combined if combined else 0.0
        normintronic = intronic / combined if combined else 0.0
        stats = (totalExon, exonCount, totalIntron, intronCount,
                 exonic, intronic, normexonic, normintronic)
        print(' '.join(str(value) for value in stats))
        # append normalized results
        totalAO['Exonic'].append(normexonic)
        totalAO['Intronic'].append(normintronic)

    print(totalAO)
    return totalAO
def parseAll(inFiles, inDir, ref, probes):
    """Compute, per sample and base-change class, the normalised mean AO in
    exonic versus intronic probes.

    For each sample the file ``inDir/<sample>/onlyProbedRegions.vcf`` is read.
    Lines that contain '#' or lack 'chr' are skipped. A record is used only
    when both alleles are one base long and its ``af`` is below 0.1. A change
    that is not one of the six classes is relabelled with the complement of
    both bases. The record's AO and a site count of 1 are added to its class
    once for every probe location whose string form occurs in the record's
    location string. Probe names starting with 'T' go to the intron tally and
    all other names to the exon tally.

    Args:
        inFiles: Iterable of sample directory names.
        inDir: Directory containing the sample directories.
        ref: Unused by this function; kept for interface compatibility.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        dict mapping each class to a defaultdict(list) with keys 'exon' and
        'intron', e.g. ``{'C>A': {'intron': [0.4, 0.5], 'exon': [0.6, 0.5]}}``,
        holding one value per sample. A class with no sites in a region has
        mean 0.0 there; if both means are 0 both normalised values are 0.0.
    """
    from parseline import VCFObj
    from Bio.Seq import Seq
    from collections import defaultdict

    changes = ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
    allbases = dict((change, defaultdict(list)) for change in changes)

    for sample in inFiles:
        # per class: [summed AO, number of sites]
        exon = dict((change, [0, 0]) for change in changes)
        intron = dict((change, [0, 0]) for change in changes)
        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf', 'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:
                    continue  # skip the info
                lobj = VCFObj(line)
                # only look at substitutions and non SNPs
                if not (len(lobj.wt) == 1 and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue
                label = '%s>%s' % (lobj.wt, lobj.var)
                if label not in exon:
                    wt = str(Seq(lobj.wt).complement())
                    var = str(Seq(lobj.var).complement())
                    label = '%s>%s' % (wt, var)
                location = str(lobj.location)
                for probeName in probes:
                    for loc in probes[probeName]:
                        if str(loc) not in location:
                            continue
                        tally = intron if probeName[0] == 'T' else exon
                        tally[label][0] += lobj.ao
                        tally[label][1] += 1

        for change in changes:
            # first normalize by total number of sites in each category.
            # Each class is indexed by its own key; float() avoids integer
            # truncation under Python 2.
            exonSum, exonSites = exon[change]
            intronSum, intronSites = intron[change]
            exonMean = float(exonSum) / exonSites if exonSites else 0.0
            intronMean = float(intronSum) / intronSites if intronSites else 0.0
            # normalize to 100 percent
            combined = exonMean + intronMean
            normexonic = exonMean / combined if combined else 0.0
            normintronic = intronMean / combined if combined else 0.0
            allbases[change]['exon'].append(normexonic)
            allbases[change]['intron'].append(normintronic)
    return allbases
def populatePandasDataframe(dirinput, fileList, probes, ref, upstream=10, downstream=10):
    """Collect the single-base, low-frequency variants of all samples into
    one pandas DataFrame.

    For each sample the file ``dirinput/<sample>/onlyProbedRegions.vcf`` is
    read. Lines that contain '#' or lack 'chr' are skipped. A record is kept
    only when both alleles are one base long and its ``af`` is below 0.1.
    The filter runs before the reference lookup, so discarded records do not
    cost a ``getRefSequence`` call.

    Flanking sequence is obtained with the module-level
    ``getRefSequence(vcfLine, flankLength, ref)``, called with the raw VCF
    line and ``max(upstream, downstream)`` as the flank, then sliced around
    the centre base.
    NOTE(review): the slicing assumes the returned sequence has exactly
    ``2 * flank + 1`` characters centred on the variant; confirm behaviour
    near chromosome ends.

    Args:
        dirinput: Directory containing the sample directories.
        fileList: Iterable of sample directory names.
        probes: Mapping of probe name to an iterable of locations. The first
            probe with a location whose string form occurs in the record's
            location string sets 'IntEx' to 'TIII' (name starts with 'T') or
            'Exon'; with no match 'IntEx' is ''.
        ref: Reference passed through to ``getRefSequence``.
        upstream: Number of bases to report before the variant.
        downstream: Number of bases to report after the variant.

    Returns:
        DataFrame with columns Loc, WT, Var, Change, ConvChange, AO, DP, VAF,
        IntEx, Upstream, Downstream, Individual (1-based sample number), all
        of object dtype. With no kept records an empty frame with these
        columns is returned.
    """
    import pandas as pd
    from Bio.Seq import Seq
    from parseline import VCFObj

    print('Building data structure...')
    columns = ['Loc', 'WT', 'Var', 'Change', 'ConvChange', 'AO', 'DP', 'VAF',
               'IntEx', 'Upstream', 'Downstream', 'Individual']
    flank = max(upstream, downstream)
    rows = []

    for sampleCount, sample in enumerate(fileList, 1):
        with open(dirinput + '/' + sample + '/onlyProbedRegions.vcf', 'r') as inFile:
            for line in inFile:
                if '#' in line or 'chr' not in line:
                    continue  # skip the info
                lineobj = VCFObj(line)
                if not (len(lineobj.wt) == 1 and len(lineobj.var) == 1
                        and lineobj.af < 0.1):
                    continue

                # convert to six changes
                if lineobj.wt == 'G' or lineobj.wt == 'A':
                    wt = str(Seq(lineobj.wt).complement())
                    var = str(Seq(lineobj.var).complement())
                else:
                    wt = str(lineobj.wt)
                    var = str(lineobj.var)

                surrounding = getRefSequence(line, flank, ref)
                # Explicit bounds keep the slices correct when upstream and
                # downstream differ or when either is 0.
                up = str(surrounding[flank - upstream:flank])
                down = str(surrounding[flank + 1:flank + 1 + downstream])

                location = str(lineobj.location)
                probeRegion = ''
                for probe in probes:
                    if any(str(loc) in location for loc in probes[probe]):
                        probeRegion = 'TIII' if probe[0] == 'T' else 'Exon'
                        break  # the first matching probe decides

                rows.append([lineobj.location,
                             str(lineobj.wt),
                             str(lineobj.var),
                             str(lineobj.wt) + '>' + str(lineobj.var),
                             wt + '>' + var,
                             lineobj.ao,
                             lineobj.dp,
                             lineobj.af,
                             probeRegion,
                             up,
                             down,
                             sampleCount])

    # One construction instead of a frame per row; dtype=object keeps the
    # column dtypes produced by concatenating transposed single-row frames.
    return pd.DataFrame(rows, columns=columns, dtype=object)
def parseAllVAF(inFiles, inDir, ref, probes):
    """Collect per-sample mean allele fractions keyed by sequence context and
    base change, separately for intronic and exonic probes.

    For each sample the file ``inDir/<sample>/onlyProbedRegions.vcf`` is read.
    Lines that contain '#' or lack 'chr' are skipped. A record is used only
    when both alleles are one base long and its ``af`` is below 0.1. When the
    reference base is 'C' or 'T' the change and the sequence returned by
    ``getRefSequence(line, 1, ref)`` are used as they are; otherwise both
    bases are complemented and the sequence is reverse-complemented. The
    record's ``af`` is recorded once for every probe location whose string
    form occurs in the record's location string. Probe names starting with
    'T' go to the intron collection and all other names to the exon one.

    Args:
        inFiles: Iterable of sample directory names.
        inDir: Directory containing the sample directories.
        ref: Reference passed through to ``getRefSequence``.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        Tuple ``(intron, exon)`` of defaultdict(list) objects mapping
        ``(sequence, change)``, e.g. ``('CCG', 'C>A')``, to a list of mean
        allele fractions. A sample appends a mean only for keys it observed.
    """
    from parseline import VCFObj
    from Bio.Seq import Seq
    from getSequence import getRefSequence
    from collections import defaultdict
    import numpy as np

    intron = defaultdict(list)
    exon = defaultdict(list)

    for sample in inFiles:
        tempintron = defaultdict(list)
        tempexon = defaultdict(list)
        # The context manager closes the file even if a record fails to parse.
        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf', 'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:
                    continue  # skip the info
                lobj = VCFObj(line)
                # only look at substitutions and non SNPs
                if not (len(lobj.wt) == 1 and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue
                seq = getRefSequence(line, 1, ref)
                if lobj.wt == 'C' or lobj.wt == 'T':
                    label = '%s>%s' % (lobj.wt, lobj.var)
                else:
                    wt = str(Seq(lobj.wt).complement())
                    var = str(Seq(lobj.var).complement())
                    label = '%s>%s' % (wt, var)
                    seq = str(Seq(seq).reverse_complement())
                key = (seq, label)
                location = str(lobj.location)
                for probeName in probes:
                    for loc in probes[probeName]:
                        if str(loc) not in location:
                            continue
                        if probeName[0] == 'T':
                            tempintron[key].append(lobj.af)
                        else:
                            tempexon[key].append(lobj.af)

        for key in tempintron:
            intron[key].append(np.mean(tempintron[key]))
        for key in tempexon:
            exon[key].append(np.mean(tempexon[key]))
    return intron, exon
def getSequence(vcfLine, flankLength):
    """Fetch the hg19 sequence around a VCF record from the UCSC DAS URL.

    The VCF line is parsed with ``VCFObj``. The requested segment runs from
    ``location - flankLength`` to ``location + flankLength`` on the record's
    chromosome and is downloaded by running ``wget -qO- <url>``.

    Args:
        vcfLine: One VCF record line.
        flankLength: Number of positions to include on each side.

    Returns:
        The concatenation of every response line that does not contain '<'.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
    """
    from parseline import VCFObj
    from subprocess import check_output

    vcfObj = VCFObj(vcfLine)
    position = int(vcfObj.location)
    low = position - flankLength
    high = position + flankLength
    url = ('http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=%s:%s,%s'
           % (vcfObj.chrom, low, high))
    # An argument list avoids the shell, so the '?' in the URL is not treated
    # as a glob and VCF fields cannot inject commands. stderr is left alone
    # so diagnostics never end up inside the returned sequence.
    # universal_newlines gives text with '\n' line ends on Python 2 and 3.
    response = check_output(['wget', '-qO-', url], universal_newlines=True)
    return ''.join(line for line in response.split('\n') if '<' not in line)
def getRefSequence(vcfLine, flankLength, ref):
    """Return the upper-cased reference sequence around a VCF record.

    The VCF line is parsed with ``VCFObj``. The region
    ``chrom:(location - flankLength)-(location + flankLength)`` is extracted
    by running ``samtools faidx <ref> <region>``.

    Args:
        vcfLine: One VCF record line.
        flankLength: Number of positions to include on each side.
        ref: Reference FASTA handed to ``samtools faidx``.

    Returns:
        The concatenation, upper-cased, of every output line that does not
        contain '>'.

    Raises:
        subprocess.CalledProcessError: If samtools exits with a non-zero
            status.
    """
    from parseline import VCFObj
    from subprocess import check_output

    vcfObj = VCFObj(vcfLine)
    position = int(vcfObj.location)
    region = '%s:%s-%s' % (vcfObj.chrom,
                           position - flankLength,
                           position + flankLength)
    # An argument list avoids the shell, so reference paths with spaces or
    # metacharacters work. stderr is left alone so samtools warnings never
    # end up inside the returned sequence.
    # universal_newlines gives text with '\n' line ends on Python 2 and 3.
    fasta = check_output(['samtools', 'faidx', str(ref), region],
                         universal_newlines=True)
    return ''.join(line for line in fasta.split('\n')
                   if '>' not in line).upper()