def processVariantData(vcf_data, bam_file, strainID='', refName='', minQuality=30, minCount=1, minSize=0):
    '''
    @summary: Annotate variant (VCF) records with read support taken from a
        BAM file, filter them, and build a report of the records that pass.

        Every record in vcf_data is modified in place (even when it is later
        filtered out): the keys "STRAIN", "END", "READS" and "READFILE" are
        set on it.

    @param vcf_data: iterable of dict-like variant records providing the keys
        "CHROM", "POS", "REF", "ALT" and "QUAL".
    @param bam_file: path of the BAM file opened to count reads per variant.
    @param strainID: value stored under "STRAIN" and used as the prefix of
        the report row ID ("<strainID>_<start>").
    @param refName: accepted for backward compatibility; not used.
    @param minQuality: records whose QUAL is below this value are dropped.
    @param minCount: records with fewer fetched reads than this are dropped.
    @param minSize: records whose ALT is shorter than this are dropped.
    @return: tuple (list of records that passed the filters, DataReport with
        one row per passing record).
    '''
    vReport = DataReport()
    vcfDataResult = []
    print("Opening bam file [%s]" % (bam_file))
    samFile = pysam.Samfile(bam_file, "rb")
    try:
        # Name of the first reference in the BAM; used in the report when a
        # record carries an empty CHROM.
        refname = samFile.getrname(0)
        for vcf in vcf_data:
            vcf["STRAIN"] = strainID
            irefname = vcf["CHROM"]
            start = int(vcf["POS"])
            ref = vcf["REF"]
            alt = vcf["ALT"]
            quality = vcf["QUAL"]
            end = start + len(ref)
            vcf["END"] = end

            # NOTE(review): POS is passed to fetch() unchanged; confirm that
            # the coordinate convention of the records matches the one
            # expected by pysam.
            rCount = len(list(samFile.fetch(irefname, start, end)))
            vcf["READS"] = rCount
            vcf["READFILE"] = bam_file

            # Convert the quality *value* before comparing. The previous
            # float(quality < minQuality) converted the comparison result,
            # so a string QUAL was never compared numerically.
            try:
                lowQuality = float(quality) < minQuality
            except (TypeError, ValueError):
                # QUAL is not numeric (e.g. '.'): the quality filter cannot
                # be applied, so the record is not rejected on quality.
                lowQuality = False

            if lowQuality or rCount < minCount or len(alt) < minSize:
                continue
            vcfDataResult.append(vcf)

            # NOTE(review): this fallback only affects the reported
            # chromosome; the fetch above already used the original CHROM.
            if irefname == '':
                irefname = refname

            vID = "%s_%s" % (strainID, start)
            vReport.add(vID, "strainID", strainID)
            vReport.add(vID, "start", start)
            vReport.add(vID, "end", end)
            vReport.add(vID, "chrom", irefname)
            vReport.add(vID, "count", rCount)
            vReport.add(vID, "qual", quality)
            vReport.add(vID, "ref", ref)
            vReport.add(vID, "alt", alt)
            vReport.add(vID, "bamFile", bam_file)
    finally:
        # Release the BAM handle even if a record fails to process.
        samFile.close()
    return (vcfDataResult, vReport)
def vcfCollectionReport(self, vcfCollection, vcf, reorder=True, useCount=True, fill_blank="NA"):
    '''
    @summary: Build report objects from variant calls grouped by region (a
        "variance collection"). One cell is added per (region, strain) pair
        so the reports can be written as a delimited matrix / spreadsheet.

        Strains are processed in sorted order. The sort key is the digit run
        captured by the pattern ".*_([0-9]+).*" when the strain ID matches
        it, otherwise the strain ID itself; keys are compared as strings and
        ties are broken by the full strain ID, so strains that share the
        same digits are all reported.

    @param vcfCollection: dict mapping strain ID -> dict mapping region
        location -> dict with the keys "Count" and "vcfData" (a sequence of
        records providing "POS", "READS" and "ALT").
    @param vcf: accepted for backward compatibility; not used.
    @param reorder: accepted for backward compatibility; strains are always
        reordered.
    @param useCount: accepted for backward compatibility; the
        "Region_Count" column it used to drive is not written, so it has no
        effect on the reports.
    @param fill_blank: cell value used for a region without variant calls.
    @return: tuple (result, coverageReport). result holds, per region and
        strain, either "[<count>]:" followed by one "(POS,READS,ALT):" entry
        per call, or fill_blank. coverageReport holds the region "Count".
    '''
    result = DataReport()
    coverageReport = DataReport()

    print("reordering report")
    numberedID = re.compile(".*_([0-9]+).*")

    def orderingKey(strainID):
        # Sort on the captured digits when present, else on the ID itself.
        # Including the ID keeps strains with equal digits distinct instead
        # of letting one overwrite the other.
        match = numberedID.match(strainID)
        key = match.group(1) if match is not None else strainID
        return (key, strainID)

    for strainID in sorted(vcfCollection.keys(), key=orderingKey):
        vcfRegions = vcfCollection[strainID]
        print("Collecting [%s] regions" % (strainID))
        for (loc, vcfRegion) in vcfRegions.items():
            count = vcfRegion["Count"]
            vcfData = vcfRegion['vcfData']
            coverageReport.add(loc, strainID, count)

            if len(vcfData) != 0:
                calls = ["(%s,%s,%s):" % (call["POS"], call["READS"], call["ALT"])
                         for call in vcfData]
                result.add(loc, strainID, "[%s]:" % (count) + "".join(calls))
            else:
                result.add(loc, strainID, fill_blank)
    return (result, coverageReport)