def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    Open one Beagle genotype file per family size, plus one markers file.

    For each familySize in pedigreeFamilyData.familySize2SampleIDList, a
    "<outputFnamePrefix>_familySize<N>.bgl" file is opened and its header row
    written; a single "<outputFnamePrefix>.markers" file is opened as well
    (no header is written to it here).  Returns a PassingData carrying
    familySize2BeagleFileHandler (familySize -> open MatrixFile writer) and
    markersFile.

    2013.05.02
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T

        The likelihood version is
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        The markers file has this format (markerID, position, alleleA, alleleB)
            Contig791:1086 1086 C A
    """
    sys.stderr.write("Opening beagle files (outputFnamePrefix =%s) to write ..." % (outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    counter = 0    #counts files opened, reported in the log line at the end
    for familySize, sampleIDList in familySize2SampleIDList.iteritems():
        if familySize not in familySize2BeagleFileHandler:
            tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix, familySize)
            writer = MatrixFile(inputFname='%s.bgl' % (tmpOutputFnamePrefix), openMode='w', delimiter=' ')
            familySize2BeagleFileHandler[familySize] = writer
            if familySize == 1:
                #singletons use the genotype-likelihood Beagle header
                headerRow = ['marker', 'alleleA', 'alleleB']
            else:
                #families (duos/trios) use the unphased Beagle header
                headerRow = ['I', 'id']
            for sampleID in sampleIDList:
                if familySize == 1:
                    #likelihood format has sample name replicated three times, rather than 2 times
                    headerRow.extend([sampleID] * 3)
                else:
                    headerRow.extend([sampleID] * 2)
            writer.writeHeader(headerRow)
            counter += 1
    markersFile = MatrixFile(inputFname='%s.markers' % (outputFnamePrefix), openMode='w', delimiter=' ')
    counter += 1
    sys.stderr.write("%s files outputted.\n" % (counter))
    return PassingData(familySize2BeagleFileHandler=familySize2BeagleFileHandler, markersFile=markersFile)
def openOneInputFile(self, inputFname=None):
    """
    Open inputFname with the reader class matching self.inputFileFormat.

    Format codes: 2 -> YHFile (HDF5 table self.h5TableName),
    3 -> HDF5MatrixFile, 4 -> VCFFile, anything else -> plain MatrixFile.

    2013.09.05 split out of fileWalker() , added VCFFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat == 2:    #2012.12.20
        return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
    if fileFormat == 3:    #2012.11.22
        return HDF5MatrixFile(inputFname, openMode='r')
    if fileFormat == 4:
        return VCFFile(inputFname=inputFname)
    return MatrixFile(inputFname)
def outputFinalData(self, outputFname, key2dataLs=None, delimiter=None, header=None):
    """
    Write the accumulated key -> dataLs mapping to outputFname.

    Each data row is list(key) + dataLs, emitted in sorted-key order.  The
    header and the data rows are only written when `delimiter` is truthy;
    the file is always created and closed, even when there is no data
    (2012.7.30 behavior).

    2013.07.18 header output is not dependent on key2dataLs anymore
    2013.3.3 bugfix , added openMode='w' for MatrixFile()
    2013.2.12 replace csv.writer with MatrixFile
    2012.7.30 open the outputFname regardless whether there is data or not.
    2012.1.9
    """
    writer = MatrixFile(inputFname=outputFname, delimiter=delimiter, openMode='w')
    if header and delimiter:
        writer.writerow(header)
    if key2dataLs and delimiter:
        #sorted() instead of keys()+sort(): dict.keys() returns an
        #unsortable view on Python 3, so this form works on both 2 and 3
        for key in sorted(key2dataLs):
            writer.writerow(list(key) + key2dataLs[key])
    writer.close()
def setup(self, **keywords):
    """
    Open the output writer before any input is walked.

    A tab-delimited MatrixFile is opened only for output formats 1 and 4 and
    only when self.outputFname is set and is not a .png; otherwise writer
    stays None (HDF5MatrixFile cannot be created here: it needs a dtypeList).
    The writer is exposed both on self and on self.invariantPData.

    2012.11.22
    2012.10.25 do not open the file if it's a png file
    2012.10.15 run before anything is run
    """
    writer = None
    if self.outputFileFormat in [1, 4]:
        fileSuffix = os.path.splitext(self.outputFname)[1]
        if self.outputFname and fileSuffix != '.png':
            writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
    #else: HDF5MatrixFile needs a dtypeList, so nothing is opened here
    #share the writer through invariantPData as well
    self.invariantPData.writer = writer
    self.writer = writer
def setup(self, **keywords):
    """
    Load the original trio-inconsistency table and build lookup maps.

    Populates:
        self.trioID2dataList -- key column 0 of
            self.originalTrioInconsistencyFname -> value columns 1-3
        self.alignmentID2alignmentReadGroup -- alignment ID (the read
            group's first '_'-separated token) -> full read group name

    2013.09.30
    """
    parentClass.setup(self, **keywords)

    #load the trio inconsistency table keyed by its first column
    trioFile = MatrixFile(inputFname=self.originalTrioInconsistencyFname)
    trioFile.constructColName2IndexFromHeader()
    self.trioID2dataList = trioFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1, 2, 3],
        keyUniqueInInputFile=True)
    trioFile.close()

    #alignment ID = prefix of the read group before the first '_'
    self.alignmentID2alignmentReadGroup = dict(
        (readGroup.split('_')[0], readGroup)
        for readGroup in self.alignmentReadGroup2individualID)
def run(self):
    """
    Join per-row Mendel-error statistics (self.inputFname) with family
    counts derived from a plink pedigree (self.pedigreeFname) and write one
    tab-delimited summary row per input row to self.outputFname.

    Per-nuclear-family rates divide meanMendelError / sumOfMendelError by
    the number of two-parent families; -1 is the sentinel when no
    two-parent family exists.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    #input matrix must carry these three columns in its header
    reader = MatrixFile(inputFname=self.inputFname)
    reader.constructColName2IndexFromHeader()
    meanMendelErrorIndex = reader.getColIndexGivenColHeader("meanMendelError")
    noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
    sumOfMendelErrorIndex = reader.getColIndexGivenColHeader("sumOfMendelError")

    #family structure, bucketed by how many parents each family has (2/1/0)
    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

    twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
        parentSetSize=2)
    singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
        parentSetSize=1)
    zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
        parentSetSize=0)

    writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
    header = ["ID", "noOfTotalLoci", \
        "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies", "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies", \
        "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies", "noOfKidsInSingleParentFamilies", "noOfIndividualsInSingleParentFamilies", \
        "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies", "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies", \
        "noOfTotalMendelErrors", \
        "noOfMendelErrorsPerLocusPerNuclearFamily", "noOfMendelErrorsPerNuclearFamily"]
    writer.writeHeader(header)
    for row in reader:
        meanMendelError = float(row[meanMendelErrorIndex])
        noOfLoci = int(row[noOfLociIndex])
        sumOfMendelError = int(row[sumOfMendelErrorIndex])
        #the family counts are pedigree-wide constants, identical for every row
        noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
        if noOfNuclearFamilies > 0:
            noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(noOfNuclearFamilies)
            noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(noOfNuclearFamilies)
        else:
            #-1 = undefined: no two-parent family to normalize against
            noOfMendelErrorsPerLocusPerNuclearFamily = -1
            noOfMendelErrorsPerNuclearFamily = -1
        data_row = [row[0], noOfLoci, \
            noOfNuclearFamilies, twoParentFamilyCountData.noOfParents, twoParentFamilyCountData.noOfKids, \
            twoParentFamilyCountData.noOfIndividuals, \
            singleParentFamilyCountData.noOfFamilies, singleParentFamilyCountData.noOfParents, singleParentFamilyCountData.noOfKids, \
            singleParentFamilyCountData.noOfIndividuals, \
            zeroParentFamilyCountData.noOfFamilies, zeroParentFamilyCountData.noOfParents, zeroParentFamilyCountData.noOfKids, \
            zeroParentFamilyCountData.noOfIndividuals, \
            sumOfMendelError, \
            noOfMendelErrorsPerLocusPerNuclearFamily, noOfMendelErrorsPerNuclearFamily]
        writer.writerow(data_row)
    plinkPedigreeFile.close()
    reader.close()
    writer.close()
def setup(self, **keywords):
    """
    Collect everything needed to weigh individuals before the walk starts.

    Steps:
      1. read the pairwise IBD/kinship matrix (self.pedigreeKinshipFilePath)
         into self.ibdData;
      2. read per-read-group coverage from
         self.individualAlignmentCoverageFname;
      3. collect every sample ID present in the input VCFs;
      4. restrict the plink pedigree graph to those samples, walk its
         connected components (families) and record per-individual family
         context (family size, in/out degree, individual & family coverage);
      5. turn out-degree + coverage into a probability mass per individual.

    Populates self.ibdData, self.individualID2probabilityMass and
    self.individualID2HaplotypeData.

    2012.10.15
        run before anything is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')

    #read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
        rowIDHeader=None, colIDHeader=None, \
        rowIDIndex=0, colIDIndex=1, \
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
    # read all the Beagle files
    #only the sample IDs are collected here; haplotype payloads stay None
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        #vcfFile.readInAllHaplotypes()
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
            #haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
            #individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
            #    locusIDList=vcfFile.locusIDList)
        # get all haplotypes , etc.
        # get all sample IDs
    sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))

    #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
    #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    #shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    #each undirected connected component is treated as one family
    cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:    #assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            #degrees come from the directed pedigree graph, not the undirected copy
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                individualCoverage=individualCoverage,\
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
        familyCoverageContainer.addOneValue(familyCoverage)
        #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
    sys.stderr.write("Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.iteritems():
        #NOTE(review): familySize is normalized against the out-degree
        #container here; given the comment above ("no of offspring"),
        #familyContext.out_degree may have been intended -- confirm
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
        #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        individualID2probabilityMass[individualID] = representativeImportanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))
    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def setup(self, **keywords):
    """
    Read the pedigree graph and the alignment coverage table, then build the
    read-group / individual-ID lookup maps used later in the walk.

    Populates:
        self.plinkPedigreeFile, self.pedigreeGraph
        self.alignmentReadGroup2coverage     -- read group -> coverage (float)
        self.alignmentReadGroup2individualID -- read group -> individual.id
        self.individualID2alignmentReadGroup -- individual.id -> read group

    2013.09.30
    """
    parentClass.setup(self, **keywords)

    #. read in pedigree graph
    self.plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    self.pedigreeGraph = self.plinkPedigreeFile.pedigreeGraph

    #. read in alignment coverage data; one pass per dictionary,
    #  _resetInput() rewinds the file between passes
    coverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    coverageFile.constructColName2IndexFromHeader()
    self.alignmentReadGroup2coverage = coverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1],
        keyUniqueInInputFile=True, valueDataType=float)

    #. also get map from read_group to individual.id (used in graph)
    coverageFile._resetInput()
    self.alignmentReadGroup2individualID = coverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[2],
        keyUniqueInInputFile=True)

    #. also get map from individual.id to read_group (used in graph)
    coverageFile._resetInput()
    self.individualID2alignmentReadGroup = coverageFile.constructDictionary(
        keyColumnIndexList=[2], valueColumnIndexList=[0],
        keyUniqueInInputFile=True)
    coverageFile.close()
def traverse(self):
    """
    Walk every file in self.inputFnameLs and append its value columns,
    keyed by the key columns, into one combined key2dataLs mapping.

    For each file: open (gzip-aware), sniff the delimiter, consume the
    header via handleNewHeader() (which fills valueColumnLs), then feed
    each row to handleValueColumns().  Keys absent from the current file
    get '' padding for that file's value columns, so every key's dataLs
    stays column-aligned across files.  Per-file and per-row errors are
    logged to stderr and skipped, not raised.

    Returns PassingData(key2dataLs=..., delimiter=..., header=newHeader);
    header is None when self.noHeader is set.

    self.noHeader: #2012.8.10
    2012.1.9
    """
    newHeader = []
    key2dataLs = {}    #key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs
    delimiter = None
    #running width of dataLs contributed by files processed so far
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        valueColumnLs = []    #filled (in place) by handleNewHeader below
        try:
            header = reader.next()
            self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:    #2012.8.10
                #first line was data, not a header: rewind and re-open the reader
                inputFile.seek(0)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:    #in case something wrong (i.e. file is empty)
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None and valueColumnLs:
            visitedKeySet = set()    #keys seen in THIS file; filled by handleValueColumns
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs, \
                        valueColumnLs=valueColumnLs, noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles, \
                        visitedKeySet=visitedKeySet)
                except:    #in case something wrong (i.e. file is empty)
                    sys.stderr.write('Ignore this row: %s.\n'%repr(row))
                    sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
            del reader
            #append empty data to keys who are not present in this current "reader" file
            totalKeySet = set(key2dataLs.keys())
            unvisitedKeySet = totalKeySet - visitedKeySet
            for key in unvisitedKeySet:
                for i in valueColumnLs:
                    key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
    if self.noHeader:    #2012.8.10
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
    return returnData
def traverse(self):
    """
    Walk every file in self.inputFnameLs, feeding each data row to
    handleValueColumns() to accumulate key2dataLs.

    For each file: open (gzip-aware), sniff the delimiter, consume the
    header via handleNewHeader(); when self.noHeader is set, the file is
    rewound so the first line is treated as data.  Per-file and per-row
    errors are logged to stderr and skipped, not raised.

    Returns PassingData(key2dataLs=..., delimiter=..., header=newHeader);
    header is None when self.noHeader is set.

    2012.1.9
    """
    newHeader = []
    key2dataLs = {}    #key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        try:
            #if isCSVReader:
            header = reader.next()
            #else:
            #    header = inputFile.readline().strip().split()    #whatever splits them
            self.handleNewHeader(header, newHeader, self.keyColumnLs, self.valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:    #2012.8.10
                #first line was data, not a header: rewind and re-open the reader
                inputFile.seek(0)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:    #in case something wrong (i.e. file is empty)
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None:
            for row in reader:
                #if not isCSVReader:
                #    row = row.strip().split()
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs, valueColumnLs=self.valueColumnLs)
                except:    #in case something wrong (i.e. file is empty)
                    sys.stderr.write('Ignore this row: %s.\n' % repr(row))
                    sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
            del reader
    if self.noHeader:    #2012.8.10
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
    return returnData