def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,
        maxContigNumber=None):
    """
    2012.5.10
        Copy a VCF file while replacing each sample ID in the header with the
        individual code parsed out of it, and masking low-depth genotypes.

        For every record that is kept, each sample call that is missing or
        whose 'DP' value is below minDepth has its column in the output row
        overwritten with './.' (missing genotype).

    Arguments:
        inputFname: path of the input VCF file.
        outputFname: path of the VCF file to write.
        minDepth: passed to VCFFile when opening the input, and also used as
            the per-call depth threshold below which a genotype is masked.
        includeIndels: if false, a record whose refBase or altBase is not
            exactly one character long is skipped.
        maxContigNumber: if truthy, a record whose contig number (extracted
            with self.contig_number_pattern, group 'contigNumber') exceeds it
            is skipped. None or 0 disables this filter.

    Both files are closed even when parsing or writing raises.
    """
    # The message used to say "from VCF to EigenStrat" (copy-paste left-over);
    # this method writes a VCF, not an EigenStrat file.
    sys.stderr.write("Converting alignment read groups to UCLA IDs in VCF %s ...\n" % (inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    outVCFFile = None
    counter = 0         # all records read from the input
    real_counter = 0    # records that passed the filters and were written
    try:
        # replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam
        # with just monkey ID
        newSampleIDHeader = []
        for sampleID in vcfFile.sampleIDHeader:
            readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
            newSampleIDHeader.append(readGroupData.individual_code)
        # fixed (non-sample) columns are kept as-is, sample columns get the new IDs
        newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            if not includeIndels and (len(vcfRecord.refBase) != 1 or len(vcfRecord.altBase) != 1):
                continue
            if maxContigNumber:
                # NOTE(review): presumably contig_number_pattern is a compiled regex;
                # a chromosome name it does not match would raise AttributeError here.
                contigNumber = int(self.contig_number_pattern.search(vcfRecord.chr).group('contigNumber'))
                if contigNumber > maxContigNumber:
                    continue
            real_counter += 1
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i, callData in enumerate(vcfRecord.data_row):
                if i == 0:
                    # [0] is the ref base, not a sample call
                    continue
                if callData is None or callData.get('DP', 0) < minDepth:
                    # data_row[1] corresponds to the first sample column of the row
                    sampleColumnIndex = i + vcfFile.sampleStartingColumn - 1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # release both handles no matter how the loop ended
        try:
            vcfFile.close()
        finally:
            if outVCFFile is not None:
                outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci.\n" % (real_counter, counter))
def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False,
        maxContigNumber=1000):
    """
    2012.5.10
        Split one VCF file into one VCF file per contig, written to
        outputDir as '<contigID>.vcf'.

        Two things in Nam's VCF file are to be modified.
            1. extract VRC UCLAID from its sample ID
            2. replace vervet1_scaffolds_Contig137 with simply "Contig137"

        In addition, each sample call that is missing or whose 'DP' value is
        below minDepth has its column in the output row overwritten with
        './.' (missing genotype).

    Arguments:
        inputFname: path of the input VCF file.
        outputDir: directory in which the per-contig VCF files are created.
        minDepth: passed to VCFFile when opening the input, and also used as
            the per-call depth threshold below which a genotype is masked.
        includeIndels: if false, a record whose refBase or altBase is not
            exactly one character long is skipped.
        maxContigNumber: if truthy, a record whose contig number (extracted
            with self.contig_number_pattern, group 'contigNumber') exceeds it
            is skipped. None or 0 disables this filter.

    The input file and every output file opened so far are closed even when
    parsing or writing raises.
    """
    # The message used to say "from VCF to EigenStrat" (copy-paste left-over);
    # this method writes per-contig VCF files, not an EigenStrat file.
    sys.stderr.write("Splitting VCF %s into single-chromosome VCF files ...\n" % (inputFname))
    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    chr2outVCFFile = {}     # contig ID -> its opened output VCFFile
    counter = 0             # all records read from the input
    real_counter = 0        # records that passed the filters and were written
    try:
        # replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam
        # with just monkey ID
        # NOTE(review): presumably UCLAID_Pattern is a compiled regex; a sample ID
        # it does not match would raise AttributeError on .group().
        newSampleIDHeader = [self.UCLAID_Pattern.search(sampleID).group('UCLAID')
                             for sampleID in vcfFile.sampleIDHeader]
        # new header for every output contig
        newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            if not includeIndels and (len(vcfRecord.refBase) != 1 or len(vcfRecord.altBase) != 1):
                continue
            contigID = self.contig_id_pattern.search(vcfRecord.chr).group('contigID')
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(contigID).group('contigNumber'))
                if contigNumber > maxContigNumber:
                    continue
            real_counter += 1
            # the output record carries the shortened contig name
            vcfRecord.chr = contigID

            outVCFFile = chr2outVCFFile.get(contigID)
            if outVCFFile is None:
                # first record of this contig: open its output file
                outVCFFile = VCFFile(outputFname=os.path.join(outputDir, '%s.vcf' % (contigID)))
                # register before writing the header so that the cleanup below
                # also closes it if writing the header fails
                chr2outVCFFile[contigID] = outVCFFile
                outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                outVCFFile.header = newHeader
                outVCFFile.writeMetaAndHeader()

            # set genotype whose depth is below minDepth to ./. (=missing)
            for i, callData in enumerate(vcfRecord.data_row):
                if i == 0:
                    # [0] is the ref base, not a sample call
                    continue
                if callData is None or callData.get('DP', 0) < minDepth:
                    # data_row[1] corresponds to the first sample column of the row
                    sampleColumnIndex = i + vcfFile.sampleStartingColumn - 1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # release the input and all output handles no matter how the loop ended
        try:
            vcfFile.close()
        finally:
            # close all output files
            for outVCFFile in chr2outVCFFile.values():
                outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n" % (real_counter, counter, len(chr2outVCFFile)))