def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,
        maxContigNumber=None):
    """
    2012.5.10
        Copy a VCF file, renaming every sample column from its alignment
        read-group ID to the individual code parsed out of that ID.

        inputFname: path of the VCF file to read.
        outputFname: path of the VCF file to write.
        minDepth: passed to the input VCFFile; in addition, any call that is
            None or whose 'DP' is below this value is written as './.'.
        includeIndels: if false, records whose ref or alt allele is not
            exactly one base long are skipped.
        maxContigNumber: if true-ish, records whose contig number (the
            'contigNumber' group of self.contig_number_pattern applied to the
            record's chromosome) is larger than this are skipped.

        Both files are closed even if processing fails half-way.
    """
    sys.stderr.write("Converting alignment read-group sample IDs in %s into UCLA IDs ...\n"%(inputFname))
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    outVCFFile = None
    counter = 0
    real_counter = 0
    try:
        # e.g. replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam
        # with just the monkey ID
        newSampleIDHeader = []
        for sampleID in vcfFile.sampleIDHeader:
            readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
            newSampleIDHeader.append(readGroupData.individual_code)
        # columns before the samples are kept as they are
        newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            isIndel = len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1
            if isIndel and not includeIndels:
                continue
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(vcfRecord.chr).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):  # [0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP',0)<minDepth:
                    # data_row is shifted by one relative to the sample columns of row
                    sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        # release the file handles no matter how the loop ended
        vcfFile.close()
        if outVCFFile is not None:
            outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci.\n"%(real_counter, counter))
def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False, maxContigNumber=1000):
    """
    2012.5.10
        Split one VCF file into one VCF file per contig, written to
        outputDir as <contigID>.vcf.

        Two things in Nam's VCF file are to be modified.
            1. extract VRC UCLAID from its sample ID
                (the 'UCLAID' group of self.UCLAID_Pattern)
            2. replace vervet1_scaffolds_Contig137 with simply "Contig137"
                (the 'contigID' group of self.contig_id_pattern)

        inputFname: path of the VCF file to read.
        outputDir: folder that receives the per-contig VCF files.
        minDepth: passed to the input VCFFile; in addition, any call that is
            None or whose 'DP' is below this value is written as './.'.
        includeIndels: if false, records whose ref or alt allele is not
            exactly one base long are skipped.
        maxContigNumber: if true-ish, records whose contig number (the
            'contigNumber' group of self.contig_number_pattern applied to the
            shortened contig ID) is larger than this are skipped.

        The input file and every output file opened so far are closed even
        if processing fails half-way.
    """
    sys.stderr.write("Splitting %s into single-chromosome VCF files in %s ...\n"%(inputFname, outputDir))
    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    chr2outVCFFile = {}  # contig ID -> its open output VCFFile
    counter = 0
    real_counter = 0
    try:
        # e.g. replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam
        # with just the monkey ID
        newSampleIDHeader = []
        for sampleID in vcfFile.sampleIDHeader:
            search_result = self.UCLAID_Pattern.search(sampleID)
            newSampleIDHeader.append(search_result.group('UCLAID'))
        # the same new header is written to every output contig file
        newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader

        for vcfRecord in vcfFile.parseIter():
            counter += 1
            # it's an indel if refBase or altBase is not just one base
            isIndel = len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1
            if isIndel and not includeIndels:
                continue
            contigID = self.contig_id_pattern.search(vcfRecord.chr).group('contigID')
            if maxContigNumber:
                contigNumber = int(self.contig_number_pattern.search(contigID).group('contigNumber'))
                if contigNumber>maxContigNumber:
                    continue
            real_counter += 1
            vcfRecord.chr = contigID

            outVCFFile = chr2outVCFFile.get(contigID)
            if outVCFFile is None:
                # first record of this contig: open its file and write the header
                outputFname = os.path.join(outputDir, '%s.vcf'%(contigID))
                outVCFFile = VCFFile(outputFname=outputFname)
                # register it right away so that it gets closed even if
                # writing the header fails
                chr2outVCFFile[contigID] = outVCFFile
                outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                outVCFFile.header = newHeader
                outVCFFile.writeMetaAndHeader()

            # set genotype whose depth is below minDepth to ./. (=missing)
            for i in range(1, len(vcfRecord.data_row)):  # [0] is the ref base
                callData = vcfRecord.data_row[i]
                if callData is None or callData.get('DP',0)<minDepth:
                    # data_row is shifted by one relative to the sample columns of row
                    sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
                    vcfRecord.row[sampleColumnIndex] = './.'
            outVCFFile.writeVCFRecord(vcfRecord)
    finally:
        vcfFile.close()
        # close all output files
        for outVCFFile in chr2outVCFFile.values():
            outVCFFile.close()
    sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n"%(real_counter, counter, len(chr2outVCFFile)))
def extractSamples(self, db_vervet=None, inputFname=None, outputFname=None,
        tax_id_set=None, site_id_set=None, country_id_set=None,
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,
        **keywords):
    """
    2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
    2013.04.30 added argument min_coverage, max_coverage
    2012.10.10 added argument outputFormat.
    2012.10.5
        Select the samples of a VCF file whose alignment passes
        db_vervet.filterAlignments() and output them.

        db_vervet: database object providing parseAlignmentReadGroup() and
            filterAlignments().
        inputFname: path of the VCF file to read.
        outputFname: path of the output file.
        tax_id_set, site_id_set, country_id_set, min_coverage, max_coverage,
            is_contaminated: handed to db_vervet.filterAlignments().
        outputFormat:
            1: a VCF file restricted to the selected sample columns.
            2: a text file, one selected sample ID per line, with a
                "sampleID" header line.
            3: same as 2 but without the header line.
            Any other value produces no output file.

        The program exits with status 3 if a sample has no
        individualAlignment.
        For outputFormat 2 and 3 the sample IDs are written in the order of
        their column index in the input VCF.
    """
    sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
        (inputFname,\
        getattr(site_id_set, '__len__', returnZeroFunc)(),\
        getattr(country_id_set, '__len__', returnZeroFunc)(),\
        getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
        outputFormat, is_contaminated ))
    vcfFile = VCFFile(inputFname=inputFname)
    no_of_samples = 0
    no_of_snps = 0
    try:
        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        sampleStartingColumn = vcfFile.sampleStartingColumn
        newHeader = oldHeader[:sampleStartingColumn]  # anything before the samples are same
        col_index2sampleID = {}  # this structure stores the selected samples and their column index
        for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
            individualAlignment = db_vervet.parseAlignmentReadGroup(individual_name).individualAlignment
            if individualAlignment is None:
                sys.stderr.write("Warning: no individualAlignment for sample %s.\n"%(individual_name))
                sys.exit(3)
            filteredAlignmentList = db_vervet.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
                max_coverage=max_coverage, individual_site_id=None, \
                sequence_filtered=None, individual_site_id_set=site_id_set, \
                mask_genotype_method_id=None, parent_individual_alignment_id=None,\
                country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
                is_contaminated=is_contaminated, excludeTissueIDSet=None,\
                local_realigned=None, reduce_reads=None, report=False)
            if filteredAlignmentList:  # non-empty, passed the filter
                newHeader.append(individual_name)
                no_of_samples += 1
                col_index2sampleID[col_index] = individual_name

        if outputFormat==1:
            outVCFFile = VCFFile(outputFname=outputFname)
            try:
                outVCFFile.metaInfoLs = vcfFile.metaInfoLs
                outVCFFile.header = newHeader
                outVCFFile.writeMetaAndHeader()
                # the selected columns are the same for every record: find them once
                selectedColumnIndexLs = [i for i in range(sampleStartingColumn, oldHeaderLength) \
                    if i in col_index2sampleID]
                for vcfRecord in vcfFile:
                    row = vcfRecord.row
                    data_row = row[:sampleStartingColumn]
                    data_row.extend([row[i] for i in selectedColumnIndexLs])
                    outVCFFile.writer.writerow(data_row)
                    no_of_snps += 1
            finally:
                outVCFFile.close()
        elif outputFormat in (2, 3):
            with open(outputFname, 'w') as outf:
                if outputFormat==2:
                    outf.write("sampleID\n")
                # sorted by column index so that the order does not depend on
                # the iteration order of the dictionary
                for col_index in sorted(col_index2sampleID):
                    outf.write("%s\n"%(col_index2sampleID[col_index]))
    finally:
        vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))
def replicateVCFGenotypeColumns(self, inputFname, outputFname=None, replicateIndividualTag=None, sampleID2FamilyCount=None,
        minDepth=0):
    """
    2012.10.5 remove argument sampleStartingColumn
    2012.5.10 VCFFile has been changed considerably and can act as a writer now.
    2012.3.29
        Copy a VCF file and duplicate the genotype column of some samples.

        Every sample column is renamed to
        <sample_id><replicateIndividualTag>1. A sample whose count in
        sampleID2FamilyCount is N gets N-1 extra columns appended after the
        original ones, named <sample_id><replicateIndividualTag>2, 3, ...,
        each holding a copy of that sample's genotype. Samples in
        sampleID2FamilyCount that are not in the VCF header are ignored.

        inputFname: path of the VCF file to read.
        outputFname: path of the VCF file to write.
        replicateIndividualTag: string inserted between the sample ID and
            the replicate number in the column names.
        sampleID2FamilyCount: dictionary, sample ID -> number of copies
            wanted; None is treated as an empty dictionary (no extra columns).
        minDepth: passed to the input VCFFile.

        Both files are closed even if processing fails half-way.
    """
    sys.stderr.write("Replicating some genotype columns in %s ...\n"%(inputFname))
    if sampleID2FamilyCount is None:
        # the default used to crash on .iteritems(); nothing to replicate
        sampleID2FamilyCount = {}
    vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
    outVCFFile = None
    no_of_samples = 0
    no_of_snps = 0
    try:
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs

        # modify the sample-id header line
        sampleID2DataIndexLs = {}  # sample ID -> column indexes of all its copies
        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        newHeader = oldHeader[:vcfFile.sampleStartingColumn]  # anything before the samples are same
        for i in range(vcfFile.sampleStartingColumn, oldHeaderLength):
            sample_id = oldHeader[i].strip()
            newHeader.append('%s%s%s'%(sample_id, replicateIndividualTag, 1))  # 1 because it's the 1st copy
            no_of_samples += 1
            sampleID2DataIndexLs[sample_id] = [i]  # 1st copy for this sample

        # add additional column headers based on each one's occurrence
        # extraSourceIndexLs[k] is the column to copy into the k-th extra column
        extraSourceIndexLs = []
        for sample_id, familyCount in sampleID2FamilyCount.items():
            if sample_id not in sampleID2DataIndexLs:
                continue
            dataIndexLs = sampleID2DataIndexLs[sample_id]
            for _ in range(1, familyCount):  # familyCount-1 extra copies
                no_of_samples += 1
                dataIndexLs.append(len(newHeader))
                extraSourceIndexLs.append(dataIndexLs[0])
                replicate_order = len(dataIndexLs)
                newHeader.append("%s%s%s"%(sample_id, replicateIndividualTag, replicate_order))
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        for vcfRecord in vcfFile.parseIter():
            data_row = vcfRecord.row
            # 2013.09.13 replace all "./." with full NA formating i.e. "./.:.:.:.",
            # pending fields in the "format" column
            missingGenotype = None  # built on first use, the format is the same within a record
            for i in range(vcfRecord.sampleStartingColumn, len(data_row)):
                if data_row[i]!='./.':
                    continue
                if missingGenotype is None:
                    # 2013.09.15 expand this NA genotype for TrioCaller
                    field_value_ls = []
                    for format_field in vcfRecord.format_column_ls:
                        if format_field=='GT':
                            field_value_ls.append('./.')
                        elif format_field=='PL':  # for TrioCaller
                            field_value_ls.append('.,.,.')
                        else:
                            field_value_ls.append('.')
                    missingGenotype = ':'.join(field_value_ls)
                data_row[i] = missingGenotype
            # add more genotype copies for those extra columns
            data_row.extend([data_row[sourceIndex] for sourceIndex in extraSourceIndexLs])
            outVCFFile.writer.writerow(data_row)
            no_of_snps += 1
    finally:
        if outVCFFile is not None:
            outVCFFile.close()
        vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))