def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence',
        'missingReason', 'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth / self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth * self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s" % (vcfRecord.chromosome, vcfRecord.position)
        # fetch() coordinates are 0-based, half-open; VCF positions are 1-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome,
            vcfRecord.position - 1, vcfRecord.position + 1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(
            alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0  # good
        else:
            locusOutOfDepthIndicator = 1
        # >0 means the locus failed at least one QC criterion
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome,
            vcfRecord.position, vcfRecord.position, 1, locusLowQualityIndicator,
            locusLowMapQData.fractionOfGoodRead, locusLowMapQData.medianMapQ,
            locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator > 0:
            real_counter += 1
            # modify the VCF record: get this sample's column,
            #   then set its genotype to missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID,
                genotype="./.", convertGLToPL=True)
        # 2014.1.4 output the VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1
    reader.close()
    statWriter.close()
    writer.close()
    fractionMarkedMissing = real_counter / float(counter) if counter > 0 else 0
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n" % (
        real_counter, counter, fractionMarkedMissing))
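# A worked example of the depth window above, with illustrative numbers (the
# actual values come from whatever the caller passed in): with
# alignmentMedianDepth=30 and alignmentDepthFold=2, minDepth=15.0 and
# maxDepth=60.0, so a locus covered by 10 reads falls outside [15, 60], gets
# locusOutOfDepthIndicator=1, and that sample's genotype is set to "./.".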
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
        self.inputFname).snp_pos2returnData

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
    writer.writeHeader(header)

    counter = 0
    real_counter = 0
    no_of_pairs = 0
    snp_pos_ls = sorted(snp_pos2genotypeVectorLs)
    for i in range(len(snp_pos_ls)):
        counter += 1
        key = snp_pos_ls[i]
        chromosome, position = snp_pos_ls[i][:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        if len(genotypeVectorLs) > 1:
            real_counter += 1
            # compare every pair of genotype vectors at this position
            for k in range(0, len(genotypeVectorLs) - 1):
                for l in range(k + 1, len(genotypeVectorLs)):
                    no_of_pairs += 1
                    noOfMatches = 0
                    noOfTotal = 0
                    genotypeVector0 = genotypeVectorLs[k]
                    genotypeVector1 = genotypeVectorLs[l]
                    for j in range(len(genotypeVector0)):
                        call1 = genotypeVector0[j]['GT']
                        call2 = genotypeVector1[j]['GT']
                        if call1 != 'NA' and call2 != 'NA':
                            noOfTotal += 1
                            if SNP.nt2number[call1] == SNP.nt2number[call2]:
                                noOfMatches += 1
                    if noOfTotal > 0:
                        concordance = float(noOfMatches) / float(noOfTotal)
                    else:
                        concordance = -1
                    data_row = [chromosome, position, noOfMatches, noOfTotal,
                        concordance]
                    writer.writerow(data_row)
    writer.close()
    sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n" % (
        real_counter, counter,
        real_counter / float(counter) if counter > 0 else 0, no_of_pairs))
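# A minimal standalone sketch of the per-pair concordance loop above, assuming
# genotype calls are plain strings with 'NA' marking missing data. Direct
# string equality here stands in for the SNP.nt2number lookup, which also
# treats the two orderings of a heterozygote as the same genotype; the
# function name is illustrative, not part of this module.
def _pairwiseConcordance(genotypeVector0, genotypeVector1):
    noOfMatches = 0
    noOfTotal = 0
    for call1, call2 in zip(genotypeVector0, genotypeVector1):
        if call1 != 'NA' and call2 != 'NA':
            noOfTotal += 1
            if call1 == call2:
                noOfMatches += 1
    concordance = float(noOfMatches) / noOfTotal if noOfTotal > 0 else -1
    return noOfMatches, noOfTotal, concordance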
def outputFinalData(self, outputFname, key2dataLs=None, delimiter=None, header=None):
    """
    Header output no longer depends on key2dataLs.
    """
    writer = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')
    if header and delimiter:
        writer.writerow(header)
    if key2dataLs and delimiter:
        keyLs = sorted(key2dataLs)
        for key in keyLs:
            dataLs = key2dataLs.get(key)
            writer.writerow(list(key) + dataLs)
    writer.close()
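# A hypothetical usage sketch: key2dataLs maps tuple keys (here (chromosome,
# position)) to lists of column values, and each output row is the key columns
# followed by the data columns. All names and values below are illustrative.
#
#   self.outputFinalData(outputFname='final_stats.tsv',
#       key2dataLs={('Contig1', 100): [3, 10, 0.3],
#                   ('Contig1', 250): [5, 10, 0.5]},
#       delimiter='\t',
#       header=['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'])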
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    switchPointData = self.readInStats(inputFname=self.inputFname)

    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, mode='w')
    header = ["maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
        "noOfLoci", 'noOfLociFraction']
    writer.writeHeader(header)

    data_matrix = switchPointData.data_matrix
    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci
    # sort by switchFrequency, descending
    data_matrix.sort(reverse=True)
    maxSwitchFrequencyLs = []
    cumulativeRegionSpanLs = []
    cumulativeNoOfLociLs = []
    for i in range(len(data_matrix)):
        switchFrequency, regionSpan, noOfLoci = data_matrix[i]
        maxSwitchFrequencyLs.append(switchFrequency)
        if i == 0:
            cumulativeRegionSpan = totalSpan - regionSpan
            cumulativeNoOfLoci = totalNoOfLoci - noOfLoci
        else:
            cumulativeRegionSpan = cumulativeRegionSpanLs[i - 1] - regionSpan
            cumulativeNoOfLoci = cumulativeNoOfLociLs[i - 1] - noOfLoci
        cumulativeRegionSpanLs.append(cumulativeRegionSpan)
        cumulativeNoOfLociLs.append(cumulativeNoOfLoci)
        writer.writerow([switchFrequency, cumulativeRegionSpan,
            cumulativeRegionSpan / float(totalSpan),
            cumulativeNoOfLoci, cumulativeNoOfLoci / float(totalNoOfLoci)])
    writer.close()
    sys.stderr.write(".\n")
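# Worked example of the cumulative columns above (illustrative numbers): with
# totalSpan=100 and rows sorted descending by switchFrequency with regionSpan
# values [40, 35, 25], the genomeCovered column comes out as [60, 25, 0]:
# each row reports the genome span that remains if every region whose
# switchFrequency is at or above the current row's were dropped.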
def readInStats(self, inputFname=None):
    """
    2013.07.15
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    data_matrix = []
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    switchFrequencyIndex = reader.getColIndexGivenColHeader(
        "noOfSwitchPoints_by_noOfLociWithUniqueHit")
    regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
    noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")
    totalSpan = 0
    totalNoOfLoci = 0
    counter = 0
    for row in reader:
        counter += 1
        switchFrequency = row[switchFrequencyIndex]
        regionSpan = row[regionSpanIndex]
        noOfLoci = row[noOfLociIndex]
        if switchFrequency and regionSpan and noOfLoci:  # all three are non-empty
            switchFrequency = float(switchFrequency)
            regionSpan = int(float(regionSpan))
            noOfLoci = int(float(noOfLoci))
            data_matrix.append([switchFrequency, regionSpan, noOfLoci])
            totalSpan += regionSpan
            totalNoOfLoci += noOfLoci
    reader.close()
    sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n" % (
        len(data_matrix), counter, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan,
        totalNoOfLoci=totalNoOfLoci)
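# The input readInStats() expects: a delimited file with a header row
# containing the three column names looked up above (the delimiter is
# whatever MatrixFile detects or defaults to). Values below are illustrative.
#
#   noOfSwitchPoints_by_noOfLociWithUniqueHit  regionSpan  #sitesInInput2
#   0.0125                                     40000       1200
#   0.0003                                     250000      9000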
def setup(self, **keywords):
    """
    2012.10.15
        run before anything is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

    # read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    # read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(path=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n" % (
        len(self.inputFnameLs)))
    # read all the Beagle files: collect all sample IDs (haplotypes themselves
    #   are not loaded here)
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        #vcfFile.readInAllHaplotypes()
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
            #haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
            #individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
            #    locusIDList=vcfFile.locusIDList)
    sys.stderr.write("%s individuals total.\n" % (len(individualID2HaplotypeData)))

    # read in the pedigree or deduce it from the Beagle trio/duo genotype file (columns)
    # construct individualID2pedigreeContext; context: familySize=1/2/3,
    #   familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    # shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    # nx.connected_component_subgraphs() was removed in networkx 2.4;
    #   build each component subgraph explicitly instead
    undirectedGraph = pGraph.to_undirected()
    cc_subgraph_list = [undirectedGraph.subgraph(cc).copy()
        for cc in nx.connected_components(undirectedGraph)]

    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        # assuming each family is a two-generation trio/nuclear family
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)
        familyCoverage = 0
        for n in cc_subgraph:
            individualCoverage = self.getIndividualCoverage(individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            familyContext = PassingData(familySize=familySize,
                in_degree=in_degree, out_degree=out_degree,
                individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s already in individualID2familyContext.\n" % (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        # set the family coverage for each member, used in weighing the
        #   individual: a better-covered family => a better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" % (len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no. of
    #   offspring => probability mass for each individual
    sys.stderr.write("Weighing each individual, assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        # normalize the out-degree (number of offspring) against the
        #   distribution collected in outDegreeContainer
        outDegreeQuotient = outDegreeContainer.normalizeValue(
            familyContext.out_degree)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        #familyCoverageQuotient = familyCoverageContainer.normalizeValue(
        #    familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        individualID2probabilityMass[individualID] = representativeImportanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n" % (
        len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
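# Illustrative arithmetic for the importance score above, assuming
# NumberContainer.normalizeValue() maps a value into [0, 1] relative to the
# observed range (an assumption about that helper, not confirmed here): an
# individual with the maximum out-degree (quotient 1.0) and coverage halfway
# through the observed range (quotient 0.5) gets importanceScore = 1.5, which
# is stored as that individual's unnormalized probability mass.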
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(path=self.inputFname)
    reader.constructColName2IndexFromHeader()
    meanMendelErrorIndex = reader.getColIndexGivenColHeader("meanMendelError")
    noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
    sumOfMendelErrorIndex = reader.getColIndexGivenColHeader("sumOfMendelError")

    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

    twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
        parentSetSize=2)
    singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
        parentSetSize=1)
    zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData,
        parentSetSize=0)

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    header = ["ID", "noOfTotalLoci",
        "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
        "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies",
        "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
        "noOfKidsInSingleParentFamilies", "noOfIndividualsInSingleParentFamilies",
        "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
        "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies",
        "noOfTotalMendelErrors",
        "noOfMendelErrorsPerLocusPerNuclearFamily", "noOfMendelErrorsPerNuclearFamily"]
    writer.writeHeader(header)

    for row in reader:
        meanMendelError = float(row[meanMendelErrorIndex])
        noOfLoci = int(row[noOfLociIndex])
        sumOfMendelError = int(row[sumOfMendelErrorIndex])
        noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
        if noOfNuclearFamilies > 0:
            noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(noOfNuclearFamilies)
            noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(noOfNuclearFamilies)
        else:
            noOfMendelErrorsPerLocusPerNuclearFamily = -1
            noOfMendelErrorsPerNuclearFamily = -1
        data_row = [row[0], noOfLoci,
            noOfNuclearFamilies, twoParentFamilyCountData.noOfParents,
            twoParentFamilyCountData.noOfKids,
            twoParentFamilyCountData.noOfIndividuals,
            singleParentFamilyCountData.noOfFamilies,
            singleParentFamilyCountData.noOfParents,
            singleParentFamilyCountData.noOfKids,
            singleParentFamilyCountData.noOfIndividuals,
            zeroParentFamilyCountData.noOfFamilies,
            zeroParentFamilyCountData.noOfParents,
            zeroParentFamilyCountData.noOfKids,
            zeroParentFamilyCountData.noOfIndividuals,
            sumOfMendelError,
            noOfMendelErrorsPerLocusPerNuclearFamily,
            noOfMendelErrorsPerNuclearFamily]
        writer.writerow(data_row)
    plinkPedigreeFile.close()
    reader.close()
    writer.close()
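# Illustrative arithmetic for the two per-family columns above (values are
# made up): with sumOfMendelError=120, meanMendelError=0.004, and 40
# two-parent (nuclear) families, noOfMendelErrorsPerNuclearFamily
# = 120/40 = 3.0 and noOfMendelErrorsPerLocusPerNuclearFamily
# = 0.004/40 = 0.0001.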