def run(self):
    """
    Mark the genotype of self.sampleID missing at low-quality loci of a VCF.

    For every record of the input VCF (self.inputFname), the reads returned
    by the alignment file (self.alignmentFilename) around the locus are
    summarised by self.returnLocusLowMapQualityIndicator(). A locus is
    flagged when its read depth falls outside
    [alignmentMedianDepth/alignmentDepthFold,
    alignmentMedianDepth*alignmentDepthFold] and/or when the mapping-quality
    indicator is non-zero. Flagged loci get the genotype "./." for
    self.sampleID. Every record, modified or not, is written to
    self.outputFname, and one statistics row per locus is written to
    self.missingStatFname.

    A summary line is written to stderr at the end. The reported fraction is
    -1 when the input contains no records.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Make sure the directory of the output file exists.
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    # The output VCF carries the same meta-information and header as the input.
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop',
              'occurrence', 'missingReason', 'fractionOfGoodRead',
              'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth / self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth * self.alignmentDepthFold

    try:
        for vcfRecord in reader:
            locusID = "%s_%s" % (vcfRecord.chromosome, vcfRecord.position)
            # start and end in fetch() are 0-based.
            # NOTE(review): the window [position-1, position+1) spans two
            # bases if fetch() is half-open -- confirm this is intended.
            alignedReadLs = alignmentFile.fetch(
                vcfRecord.chromosome,
                vcfRecord.position - 1,
                vcfRecord.position + 1)
            locusLowMapQData = self.returnLocusLowMapQualityIndicator(
                alignedReadLs=alignedReadLs,
                minMapQGoodRead=self.minMapQGoodRead,
                minFractionOfGoodRead=self.minFractionOfGoodRead)
            locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator

            depth = locusLowMapQData.totalNoOfReads
            if minDepth <= depth <= maxDepth:
                locusOutOfDepthIndicator = 0    # good
            else:
                locusOutOfDepthIndicator = 1

            # Sum of the two indicators; any value > 0 marks the locus as bad.
            locusLowQualityIndicator = (
                locusOutOfDepthIndicator + locusLowMapQIndicator)
            data_row = [self.sampleID, locusID, vcfRecord.chromosome,
                        vcfRecord.position, vcfRecord.position,
                        1, locusLowQualityIndicator,
                        locusLowMapQData.fractionOfGoodRead,
                        locusLowMapQData.medianMapQ,
                        locusLowMapQData.totalNoOfReads]
            statWriter.writerow(data_row)

            if locusLowQualityIndicator > 0:
                real_counter += 1
                # Modify the VCF record: set this sample's genotype missing.
                vcfRecord.setGenotypeCallForOneSample(
                    sampleID=self.sampleID, genotype="./.",
                    convertGLToPL=True)
            # 2014.1.4 output VCF record
            writer.writeVCFRecord(vcfRecord)
            counter += 1
    finally:
        # Release every handle, including the alignment file, even on error.
        reader.close()
        alignmentFile.close()
        statWriter.close()
        writer.close()

    # Avoid ZeroDivisionError on an empty input VCF; -1 means "undefined".
    if counter > 0:
        fractionMarkedMissing = real_counter / float(counter)
    else:
        fractionMarkedMissing = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n" % (
        real_counter, counter, fractionMarkedMissing))
def run(self):
    """
    Output pairwise genotype concordance among same-position SNP entries.

    self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData
    maps a SNP position key (its first two elements are used as chromosome
    and position) to a list of genotype vectors. For each key holding more
    than one vector, every pair of vectors is compared element by element
    on the 'GT' field. Pairs where either call is 'NA' are skipped, and two
    calls match when SNP.nt2number maps them to the same number.

    One row per pair (chromosome, position, noOfMatches, noOfTotal,
    concordance) is written to self.outputFname. concordance is -1 when the
    pair shares no non-NA call. A summary line goes to stderr; its fraction
    is -1 when there are no SNP positions at all.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Make sure the directory of the output file exists.
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
        self.inputFname).snp_pos2returnData

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal',
              'concordance']
    writer.writeHeader(header)

    counter = 0
    real_counter = 0
    no_of_pairs = 0
    for key in sorted(snp_pos2genotypeVectorLs):
        counter += 1
        chromosome, position = key[:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        noOfVectors = len(genotypeVectorLs)
        if noOfVectors <= 1:
            # Nothing to compare at this position.
            continue
        real_counter += 1

        # All unordered pairs (k, m) with k < m.
        for k in range(noOfVectors - 1):
            genotypeVector0 = genotypeVectorLs[k]
            for m in range(k + 1, noOfVectors):
                genotypeVector1 = genotypeVectorLs[m]
                no_of_pairs += 1
                noOfMatches = 0
                noOfTotal = 0
                for j in range(len(genotypeVector0)):
                    call1 = genotypeVector0[j]['GT']
                    call2 = genotypeVector1[j]['GT']
                    if call1 != 'NA' and call2 != 'NA':
                        noOfTotal += 1
                        if SNP.nt2number[call1] == SNP.nt2number[call2]:
                            noOfMatches += 1
                if noOfTotal > 0:
                    concordance = float(noOfMatches) / float(noOfTotal)
                else:
                    concordance = -1
                data_row = [chromosome, position, noOfMatches, noOfTotal,
                            concordance]
                writer.writerow(data_row)
    writer.close()

    # Avoid ZeroDivisionError on empty input; -1 means "undefined".
    if counter > 0:
        fractionWithDuplicates = real_counter / float(counter)
    else:
        fractionWithDuplicates = -1
    sys.stderr.write(
        "%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n" % (
            real_counter, counter, fractionWithDuplicates, no_of_pairs))
def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
    """
    Write the overlapping sites, sorted, to a tab-delimited file.

    overlapping_sites_set is a set of (chromosome, pos) tuples.
    output is tab-delimited, 3-column. Last column is always 0 to mimic
    output of CalculateSNPMismatchRateOfTwoVCF.py:

        chromosome  position  0

    The header row is ('chromosome', 'position', 'random'). Progress
    messages are written to stderr.
    """
    sys.stderr.write("Outputting overlap %s sites ..." % (
        len(overlapping_sites_set)))
    header = ['chromosome', 'position', 'random']
    overlapping_sites_list = sorted(overlapping_sites_set)

    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    try:
        writer.writerow(header)
        for chromosome, pos in overlapping_sites_list:
            writer.writerow([chromosome, pos, 0])
    finally:
        # Close explicitly so the output is flushed to disk.
        writer.close()
    sys.stderr.write("%s sites.\n" % (len(overlapping_sites_list)))
def outputFinalData(self, outputFname, key2dataLs=None, delimiter=None,
                    header=None):
    """
    Write an optional header plus the key-sorted rows of key2dataLs.

    The output file is always opened (and closed), but nothing is written
    unless delimiter is truthy. The header row no longer depends on
    key2dataLs: it is written whenever both header and delimiter are
    truthy. Each data row is the key (converted to a list) followed by the
    list stored under that key.
    """
    writer = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')

    shouldWriteHeader = header and delimiter
    if shouldWriteHeader:
        writer.writerow(header)

    shouldWriteData = key2dataLs and delimiter
    if shouldWriteData:
        for rowKey in sorted(key2dataLs):
            rowValues = key2dataLs.get(rowKey)
            writer.writerow(list(rowKey) + rowValues)

    writer.close()
def run(self):
    """
    Output remaining span/loci after cumulatively removing regions.

    self.readInStats() supplies data_matrix (rows of switchFrequency,
    regionSpan, noOfLoci), totalSpan and totalNoOfLoci. The rows are sorted
    in place in descending order. Walking down the sorted rows, each
    region's span and locus count are subtracted from the running totals,
    and one row is written to self.outputFname per region: the region's
    switchFrequency, the remaining span, its fraction of totalSpan, the
    remaining number of loci and its fraction of totalNoOfLoci.

    A fraction is reported as -1 when its denominator is zero.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Make sure the directory of the output file exists.
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    switchPointData = self.readInStats(inputFname=self.inputFname)

    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, mode='w')
    header = ["maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
              "noOfLoci", 'noOfLociFraction']
    writer.writeHeader(header)

    data_matrix = switchPointData.data_matrix
    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci

    # Sort it based on switchFrequency (first element), largest first.
    data_matrix.sort(reverse=True)

    # Running totals replace the per-row bookkeeping lists.
    cumulativeRegionSpan = totalSpan
    cumulativeNoOfLoci = totalNoOfLoci
    for switchFrequency, regionSpan, noOfLoci in data_matrix:
        cumulativeRegionSpan = cumulativeRegionSpan - regionSpan
        cumulativeNoOfLoci = cumulativeNoOfLoci - noOfLoci

        # Guard against zero totals; -1 means "undefined".
        if totalSpan:
            spanFraction = cumulativeRegionSpan / float(totalSpan)
        else:
            spanFraction = -1
        if totalNoOfLoci:
            lociFraction = cumulativeNoOfLoci / float(totalNoOfLoci)
        else:
            lociFraction = -1

        writer.writerow([switchFrequency, cumulativeRegionSpan, spanFraction,
                         cumulativeNoOfLoci, lociFraction])
    writer.close()
    sys.stderr.write(".\n")
def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,
                              chromosome=None, chrLength=None):
    """
    Count sites shared by two VCF files and write a one-row summary.

    2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.

    Locus ids are the first two fields of every row yielded by
    vcfFile1.reader and vcfFile2.reader. The summary row written to
    outputFname holds chromosome, chrLength, the number of sites in each
    input, the number of overlapping sites, the overlap as a fraction of
    the union and of each input, and the overlap count divided by
    (normalizingConstant * chrLength). Any fraction whose denominator is
    zero (or whose chrLength is missing) is reported as -1.

    If overlappingSitesOutputFname is given, the overlapping sites are also
    written there through self.outputOverlapSites(). A warning goes to
    stderr when the two files have different numbers of samples.

    Returns a PassingData with overlapping_sample_id_set and
    overlapping_sites_set.
    """
    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2',
              '#overlapping', 'overlappingOverTotal',
              'overlappingOverInput1', 'overlappingOverInput2',
              '#segregatingSitesNormalized', ]

    vcf1_locus_id_list = [(row[0], row[1]) for row in vcfFile1.reader]
    vcf2_locus_id_list = [(row[0], row[1]) for row in vcfFile2.reader]

    no_of_sites_of_input1 = len(vcf1_locus_id_list)
    no_of_sites_of_input2 = len(vcf2_locus_id_list)
    overlapping_sites_set = set(vcf1_locus_id_list) & set(vcf2_locus_id_list)
    if overlappingSitesOutputFname:
        self.outputOverlapSites(
            overlapping_sites_set=overlapping_sites_set,
            outputFname=overlappingSitesOutputFname)

    no_of_overlapping_sites = len(overlapping_sites_set)
    no_of_total_sites = (no_of_sites_of_input1 + no_of_sites_of_input2
                         - no_of_overlapping_sites)

    if no_of_total_sites > 0:
        overlapping_fraction = no_of_overlapping_sites / float(
            no_of_total_sites)
    else:
        overlapping_fraction = -1
    if no_of_sites_of_input1 > 0:
        overlappingOverInput1 = no_of_overlapping_sites / float(
            no_of_sites_of_input1)
    else:
        overlappingOverInput1 = -1
    if no_of_sites_of_input2 > 0:
        overlappingOverInput2 = no_of_overlapping_sites / float(
            no_of_sites_of_input2)
    else:
        overlappingOverInput2 = -1

    no_of_samples = len(vcfFile1.sample_id2index)
    no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
    overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(
        vcfFile2.sample_id2index.keys())

    if no_of_samples != no_of_samples_in_vcf2:
        sys.stderr.write(
            "Warning: sample size in %s is %s, in %s is %s. not matching.\n" % (
                vcfFile1.inputFname, no_of_samples,
                vcfFile2.inputFname, no_of_samples_in_vcf2))

    # exclude the ref sample in the 1st column
    if no_of_samples > 1:
        normalizingConstant = float(
            utils.sumOfReciprocals(no_of_samples * 2 - 1))
    else:
        normalizingConstant = 1

    # chrLength defaults to None and may be 0; report -1 instead of failing.
    # float() keeps the division from truncating on integer inputs.
    if chrLength:
        noOfSegregatesSitesNormalized = no_of_overlapping_sites / (
            normalizingConstant * float(chrLength))
    else:
        noOfSegregatesSitesNormalized = -1

    writer.writerow(header)
    writer.writerow([chromosome, chrLength, no_of_sites_of_input1,
                     no_of_sites_of_input2, no_of_overlapping_sites,
                     overlapping_fraction, overlappingOverInput1,
                     overlappingOverInput2, noOfSegregatesSitesNormalized])
    # Close explicitly instead of relying on garbage collection to flush.
    writer.close()
    return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,
                       overlapping_sites_set=overlapping_sites_set)
def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,
                                       NA_call_encoding_set = set(['.', 'NA'])):
    """
    Write, per shared sample, how often its calls agree between two VCFs.

    2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than
        use calculateOverlappingSites()
    2013.07.17 vcf files are no longer pre-loaded.
    2012.8.16

    Both VCF files are reset and parsed here. The overlapping loci are the
    intersection of their locus_id_ls. For every overlapping locus and
    every sample in overlapping_sample_id_set (processed in sorted order),
    the two genotype calls are compared unless either is in
    NA_call_encoding_set. Calls are compared through nt2number, so that
    unphased calls such as 'AT' and 'TA' count as the same.

    Output (tab-delimited, to outputFname) has one row per sample:
    sample_id, no_of_matches, no_of_non_NA_pairs, matchFraction.
    matchFraction is -1 when the sample has no non-NA pair.
    NA_call_encoding_set is only read, never modified.
    """
    sys.stderr.write(
        "Finding matches for each sample at overlapping sites ...")
    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs',
              'matchFraction']
    no_of_samples_to_compare = len(overlapping_sample_id_set)

    vcfFile1._resetInput()
    vcfFile1.parseFile()
    vcfFile2._resetInput()
    vcfFile2.parseFile()

    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(
        vcfFile2.locus_id_ls)
    sys.stderr.write(" %s overlapping loci, " % (len(overlapping_sites_set)))

    overlapping_sample_id_list = sorted(overlapping_sample_id_set)

    # Column indices depend only on the sample, so look them up once
    # instead of once per locus.
    col_index_pair_ls = [
        (vcfFile1.sample_id2index.get(sample_id),
         vcfFile2.sample_id2index.get(sample_id))
        for sample_id in overlapping_sample_id_list]

    no_of_matches_per_sample_ls = [0] * no_of_samples_to_compare
    no_of_non_NA_pairs_per_sample_ls = [0] * no_of_samples_to_compare

    for locus_id in overlapping_sites_set:
        row_index1 = vcfFile1.locus_id2row_index[locus_id]
        row_index2 = vcfFile2.locus_id2row_index[locus_id]
        for j, (col_index1, col_index2) in enumerate(col_index_pair_ls):
            call1 = vcfFile1.genotype_call_matrix[row_index1][col_index1]
            call2 = vcfFile2.genotype_call_matrix[row_index2][col_index2]
            if call1 in NA_call_encoding_set or call2 in NA_call_encoding_set:
                continue
            no_of_non_NA_pairs_per_sample_ls[j] += 1
            # 2012.1.17 / 2013.07.03 bugfix: 'AT' and 'TA' should be the
            # same (no phase), hence the comparison through nt2number.
            if nt2number[call1] == nt2number[call2]:
                no_of_matches_per_sample_ls[j] += 1

    matchFractionLs = [-1] * no_of_samples_to_compare
    for j in range(no_of_samples_to_compare):
        if no_of_non_NA_pairs_per_sample_ls[j] > 0:
            matchFractionLs[j] = no_of_matches_per_sample_ls[j] / float(
                no_of_non_NA_pairs_per_sample_ls[j])

    writer.writerow(header)
    for i in range(no_of_samples_to_compare):
        data_row = [overlapping_sample_id_list[i],
                    no_of_matches_per_sample_ls[i],
                    no_of_non_NA_pairs_per_sample_ls[i],
                    matchFractionLs[i]]
        writer.writerow(data_row)
    # Close explicitly instead of relying on garbage collection to flush.
    writer.close()
    sys.stderr.write("%s samples.\n" % (no_of_samples_to_compare))
def run(self):
    """
    Append pedigree family counts and per-family Mendel error rates.

    Reads the "meanMendelError", "sampled_base_count" and
    "sumOfMendelError" columns from self.inputFname and the family
    structure from the plink pedigree file self.pedigreeFname. For every
    input row, one tab-delimited row is written to self.outputFname: the
    row's first field, the locus count, the family/parent/kid/individual
    counts for two-, single- and zero-parent families, the total Mendel
    errors, and the mean and summed Mendel errors each divided by the
    number of two-parent families (-1 when there are none).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(path=self.inputFname)
    reader.constructColName2IndexFromHeader()
    meanMendelErrorIndex = reader.getColIndexGivenColHeader("meanMendelError")
    noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
    sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
        "sumOfMendelError")

    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

    # Family counts for parent-set sizes 2, 1 and 0, in that order.
    twoParentCounts, singleParentCounts, zeroParentCounts = [
        self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
            parentSetSize=parentSetSize)
        for parentSetSize in (2, 1, 0)]

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    columnNames = [
        "ID", "noOfTotalLoci",
        "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
        "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies",
        "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
        "noOfKidsInSingleParentFamilies",
        "noOfIndividualsInSingleParentFamilies",
        "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
        "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies",
        "noOfTotalMendelErrors",
        "noOfMendelErrorsPerLocusPerNuclearFamily",
        "noOfMendelErrorsPerNuclearFamily",
    ]
    writer.writeHeader(columnNames)

    for row in reader:
        meanMendelError = float(row[meanMendelErrorIndex])
        noOfLoci = int(row[noOfLociIndex])
        sumOfMendelError = int(row[sumOfMendelErrorIndex])

        # Only two-parent families are counted as nuclear families here.
        nuclearFamilyCount = twoParentCounts.noOfFamilies
        if nuclearFamilyCount > 0:
            errorsPerLocusPerFamily = meanMendelError / float(
                nuclearFamilyCount)
            errorsPerFamily = sumOfMendelError / float(nuclearFamilyCount)
        else:
            errorsPerLocusPerFamily = -1
            errorsPerFamily = -1

        outputRow = [
            row[0], noOfLoci,
            nuclearFamilyCount, twoParentCounts.noOfParents,
            twoParentCounts.noOfKids, twoParentCounts.noOfIndividuals,
        ]
        for familyCounts in (singleParentCounts, zeroParentCounts):
            outputRow.extend([
                familyCounts.noOfFamilies, familyCounts.noOfParents,
                familyCounts.noOfKids, familyCounts.noOfIndividuals,
            ])
        outputRow.extend([
            sumOfMendelError, errorsPerLocusPerFamily, errorsPerFamily,
        ])
        writer.writerow(outputRow)

    plinkPedigreeFile.close()
    reader.close()
    writer.close()