def setup(self, **keywords):
    """
    2013.09.30
    Run the parent setup, then load the pedigree graph and the per-alignment
    coverage table.

    Attributes set on self:
        plinkPedigreeFile: PlinkPedigreeFile opened on self.pedigreeFname
            (left open; not closed here).
        pedigreeGraph: the pedigreeGraph attribute of that file.
        alignmentReadGroup2coverage: column 0 -> column 1 (as float) of
            self.individualAlignmentCoverageFname.
        alignmentReadGroup2individualID: column 0 -> column 2 of the same file.
        individualID2alignmentReadGroup: column 2 -> column 0 of the same file.

    All keyword arguments are forwarded untouched to parentClass.setup().
    """
    parentClass.setup(self, **keywords)

    #. read in pedigree graph
    self.plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    self.pedigreeGraph = self.plinkPedigreeFile.pedigreeGraph

    #. read in alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    # try/finally: the file is scanned three times; make sure the handle is
    # released even if one of the passes raises (e.g. duplicate key, bad float).
    try:
        alignmentCoverageFile.constructColName2IndexFromHeader()
        self.alignmentReadGroup2coverage = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1],
            keyUniqueInInputFile=True, valueDataType=float)

        #. also get map from read_group to individual.id (used in graph)
        # _resetInput() is called before each extra pass over the file
        # (presumably it rewinds the reader - confirm against MatrixFile).
        alignmentCoverageFile._resetInput()
        self.alignmentReadGroup2individualID = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[2],
            keyUniqueInInputFile=True)

        #. also get map from individual.id to read_group (used in graph)
        alignmentCoverageFile._resetInput()
        self.individualID2alignmentReadGroup = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[2], valueColumnIndexList=[0],
            keyUniqueInInputFile=True)
    finally:
        alignmentCoverageFile.close()
def setup(self, **keywords):
    """
    2013.09.30
    Run the parent setup, then load the original trio-inconsistency table and
    build a lookup from alignment ID to alignment read group.

    Attributes set on self:
        trioID2dataList: column 0 -> columns [1, 2, 3] of
            self.originalTrioInconsistencyFname (keys must be unique).
        alignmentID2alignmentReadGroup: for every key of
            self.alignmentReadGroup2individualID, the text before its first
            '_' mapped to the full key. If two read groups share that prefix,
            the one iterated last wins.

    self.alignmentReadGroup2individualID is not created here; it is presumably
    populated by parentClass.setup() - verify against the parent class.
    """
    parentClass.setup(self, **keywords)

    #. read in the original trio inconsistency data
    trioInconsistencyFile = MatrixFile(inputFname=self.originalTrioInconsistencyFname)
    # try/finally so the handle is released even when parsing fails.
    try:
        trioInconsistencyFile.constructColName2IndexFromHeader()
        self.trioID2dataList = trioInconsistencyFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1, 2, 3],
            keyUniqueInInputFile=True)
    finally:
        trioInconsistencyFile.close()

    #. map the leading '_'-delimited token of each read group to the read group
    self.alignmentID2alignmentReadGroup = {}
    for alignmentReadGroup in self.alignmentReadGroup2individualID:
        # only the first token is needed, so stop splitting after one '_'
        alignmentID = alignmentReadGroup.split('_', 1)[0]
        self.alignmentID2alignmentReadGroup[alignmentID] = alignmentReadGroup
def setup(self, **keywords):
    """
    2012.10.15
    Run before anything else is run: load the inputs and assign every
    individual a probability mass.

    Steps:
        1. Call AbstractMatrixFileWalker.setup() with the given keywords.
        2. Read the kinship/IBD adjacency list (self.pedigreeKinshipFilePath)
           into self.ibdData.
        3. Read the alignment coverage table
           (self.individualAlignmentCoverageFname), column 0 -> column 1.
        4. Collect the sample IDs of every VCF file in self.inputFnameLs.
        5. Restrict the pedigree graph (self.pedigreeFname) to those samples,
           split it into connected components ("families") and record a
           per-individual context (family size, in/out degree, coverage).
        6. Score each individual as
           normalized(out_degree) + normalized(individualCoverage).

    Attributes set on self:
        ibdData: result of SNP.readAdjacencyListDataIntoMatrix().
        individualID2probabilityMass: individual ID -> importance score.
        individualID2HaplotypeData: individual ID -> None (placeholder; the
            haplotypes themselves are not loaded here).
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    #read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    # try/finally so the handle is released even when parsing fails.
    try:
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
    finally:
        alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
    #. get all sample IDs; haplotype data is left as None for now
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        # NOTE(review): vcfFile is never closed; if VCFFile exposes close(),
        # it should be called once the sample IDs have been read - confirm.
        vcfFile = VCFFile(inputFname=inputFname)
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
    sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))

    #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
    #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)

    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    # try/finally so the pedigree file is closed even if a coverage lookup
    # or graph operation below raises.
    try:
        pGraph = plinkPedigreeFile.pedigreeGraph
        #shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        # NOTE(review): connected_component_subgraphs() was removed in
        # networkx 2.4; this call requires an older networkx - confirm the
        # pinned version before upgrading.
        cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
        for cc_subgraph in cc_subgraph_list:
            familySize = len(cc_subgraph)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            #assuming each family is a two-generation trio/nuclear family
            for n in cc_subgraph:
                individualCoverage = float(self.getIndividualCoverage(
                    individualID=n,
                    alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs))
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage

                # degrees come from the directed (shrunken) pedigree graph,
                # not from the undirected component
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)

                # familyCoverage is only known once the whole family has been
                # visited; it is filled in by the second loop below
                familyContext = PassingData(
                    familySize=familySize, in_degree=in_degree, out_degree=out_degree,
                    individualCoverage=individualCoverage,
                    familyCoverage=None)
                if n not in individualID2familyContext:
                    individualID2familyContext[n] = familyContext
                else:
                    sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
            familyCoverageContainer.addOneValue(familyCoverage)

            #set the family coverage for each member, used in weighing the individual.
            # better covered family => better haplotype
            for n in cc_subgraph:
                individualID2familyContext[n].familyCoverage = familyCoverage
    finally:
        plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
    sys.stderr.write("Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for individualID, familyContext in individualID2familyContext.items():
        # Fix: normalize the individual's out-degree (number of offspring in
        # the shrunken graph) against the out-degree container. The previous
        # code passed familySize, a quantity that container never saw.
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.out_degree)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        # family coverage is recorded in the context but currently not part
        # of the score
        individualID2probabilityMass[individualID] = outDegreeQuotient + individualCoverageQuotient
    sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData