def getMonkeyIBDCheckData(self, inputFname=None):
    """
    Load the pairwise PI_HAT values of a plink IBD-check output file.

    2012.8.21
        inputFname is the output of the plink IBD check, a file with a header
        line. Example:

        FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
        1 1996093 1 1995025 OT 0 1.0000 0.0000 0.0000 0.0000 -1 0.654218 0.3630 1.9764
        1 1996093 1 2001039 OT 0 0.9832 0.0000 0.0168 0.0168 -1 0.653608 0.0318 1.8792
        1 1996093 1 1984011 OT 0 1.0000 0.0000 0.0000 0.0000 -1 0.645011 0.0168 1.8624
        1 1996093 1 1987004 OT 0 0.9260 0.0628 0.0113 0.0427 -1 0.660490 0.9999 2.2805

    The IID1 column supplies the row IDs, IID2 the column IDs and PI_HAT the
    cell values; all three are located by header name, not by position.

    Returns the object produced by SNP.readAdjacencyListDataIntoMatrix().
    """
    # progress message; intentionally left without a trailing newline
    sys.stderr.write("Reading PI_hat from %s ... " % (inputFname))
    return SNP.readAdjacencyListDataIntoMatrix(
        inputFname=inputFname,
        rowIDHeader="IID1",
        colIDHeader="IID2",
        rowIDIndex=None,
        colIDIndex=None,
        dataHeader="PI_HAT",
        dataIndex=None,
        hasHeader=True,
    )
def setup(self, **keywords):
    """
    2012.10.15
        Run before anything else is run.

    Delegates to AbstractMatrixFileWalker.setup(), then prepares two
    attributes:
        self.ibdData: the PI_HAT matrix read from
            self.plinkIBDCheckOutputFname (rows keyed by IID1, columns by
            IID2), or None when no such file name is set.
        self.data_matrix: an empty list that stores all rows during
            fileWalker().
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    # read in the IBD check result (only when a file was given)
    if not self.plinkIBDCheckOutputFname:
        loadedIBDData = None
    else:
        loadedIBDData = SNP.readAdjacencyListDataIntoMatrix(
            inputFname=self.plinkIBDCheckOutputFname,
            rowIDHeader="IID1",
            colIDHeader="IID2",
            rowIDIndex=None,
            colIDIndex=None,
            dataHeader="PI_HAT",
            dataIndex=None,
            hasHeader=True,
        )
    self.ibdData = loadedIBDData

    # data structure to store all rows during fileWalker()
    self.data_matrix = []
def getMonkeyKinshipData(self, inputFname=None):
    """
    Read a header-less kinship adjacency list into a matrix.

    2012.8.22 use SNP.readAdjacencyListDataIntoMatrix(), and defaultValue=0
    2012.2.10

    Column 0 of inputFname supplies the row IDs, column 1 the column IDs and
    column 2 the kinship value; defaultValue=0 is passed to the reader.
    After loading, cell [i][i] is set to 1 for every row index i.

    Returns the object produced by SNP.readAdjacencyListDataIntoMatrix(),
    with its data_matrix modified in place.
    """
    # progress message; intentionally left without a trailing newline
    sys.stderr.write("Reading kinship from %s ... " % (inputFname))
    kinshipMatrix = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=inputFname,
        rowIDHeader=None,
        colIDHeader=None,
        rowIDIndex=0,
        colIDIndex=1,
        dataHeader=None,
        dataIndex=2,
        hasHeader=False,
        defaultValue=0,
    )

    # set kinshipData diagonal to 1
    for diagonalIndex, _rowID in enumerate(kinshipMatrix.row_id_ls):
        kinshipMatrix.data_matrix[diagonalIndex][diagonalIndex] = 1
    return kinshipMatrix
    # Old header-based parsing, kept for reference (was an unreachable string):
    #   header = reader.next()
    #   col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    #   monkey1_id_index = col_name2index.get("monkeyId1")
    #   monkey2_id_index = col_name2index.get("monkeyId2")
    #   kinship_index = col_name2index.get("kinship")
def run(self):
    """
    Rank monkeys by median |kinship - IBD| and list label-swap candidates.

    Steps:
        1. Connect to the vervet database and keep the handle in
           self.db_vervet.
        2. Read the kinship matrix (self.inputFname), the plink PI_HAT matrix
           (self.plinkIBDCheckOutputFname) and the plink sex-check result
           (self.plinkSexCheckOutputFname).
        3. Pop up to 50 source monkeys from the kinship-vs-IBD delta queue.
           For each, try every other monkey in the kinship matrix as a swap
           partner and record how the source monkey's median absolute delta
           changes.
        4. Write the 5 candidates with the smallest increase (largest drop)
           per source monkey to self.outputFname, tab-delimited, together
           with database sex and plink sex for both monkeys.

    Changes versus the earlier revision of this method:
        - targetMonkeySex is now taken from the DB entry of the candidate
          actually being written. It used to come from whichever entry the
          preceding scan over all monkeys fetched last.
        - A monkey without a DB entry gets sex None instead of raising
          AttributeError.
        - The output file is closed (and flushed) even if an error occurs.
        - The IBD reader is called with rowIDHeader/colIDHeader/rowIDIndex/
          colIDIndex, the keyword names every other caller in this file uses.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
        hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
    db_vervet.setup(create_tables=False)
    self.db_vervet = db_vervet

    # getMonkeyKinshipData() also sets the kinshipData diagonal to 1
    kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
    ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname,
        rowIDHeader="IID1", colIDHeader="IID2", rowIDIndex=None, colIDIndex=None,
        dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
    monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname,
        keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None,
        hasHeader=True, valueDataType=int)

    kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData)
    queueData = self.createKinshipIBDDeltaQueue(kinshipIBDDeltaData)
    kinshipIBDDeltaQueue = queueData.kinshipIBDDeltaQueue
    monkey_id2medianAbsDelta = queueData.monkey_id2medianAbsDelta

    header = ['sourceMonkeyID', 'medianAbsDelta', 'noOfNonMissing', 'sourceMonkeySex', 'sourceMonkeyPlinkSex',
        'sourceMonkeyMedianAbsDeltaDropAfterSwap',
        'targetMonkeyID', 'sourceMonkeyNoOfNonMissingAfterSwap',
        'targetMonkeyMedianAbsDelta', 'targetMonkeyNoOfNonMissing', 'targetMonkeySex', 'targetMonkeyPlinkSex',
        'targetMonkeyMedianAbsDeltaAfterSwap', 'targetMonkeyNoOfNonMissingAfterSwap']

    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)

        i = 0
        while i < 50 and kinshipIBDDeltaQueue:
            # the queue stores the negated median, hence the sign flip
            negativeMedianAbsDelta, sourceMonkeyID, noOfNonMissing = heapq.heappop(kinshipIBDDeltaQueue)[:3]
            medianAbsDelta = -negativeMedianAbsDelta
            sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
            # 2012.8.22 draw some histogram to check what data looks like
            #self.drawKinshipIBDDeltaVectorHistogram(kinshipIBDDeltaData=kinshipIBDDeltaData, row_id=sourceMonkeyID, \
            #	outputFnamePrefix=self.outputFnamePrefix)

            medianAbsDeltaIncreaseQueue = []
            for targetMonkeyID in kinshipData.row_id_ls:
                if targetMonkeyID == sourceMonkeyID:
                    continue
                # get the updated median delta for sourceMonkeyID
                pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData,
                    kinshipDataMonkeyID=targetMonkeyID, ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID)
                sourceMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
                sourceMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing

                # get the updated median delta for targetMonkeyID
                pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData,
                    kinshipDataMonkeyID=sourceMonkeyID, ibdData=ibdData, ibdDataMonkeyID=targetMonkeyID)
                targetMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
                targetMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing

                if sourceMonkeyMedianAbsDeltaAfterSwap is None:
                    continue
                # add the candidate monkey and how much median delta drops into the queue
                pdata = monkey_id2medianAbsDelta.get(targetMonkeyID)
                if pdata:
                    targetMonkeyMedianAbsDelta = pdata.medianAbsDelta
                    targetMonkeyNoOfNonMissing = pdata.noOfNonMissing
                else:
                    targetMonkeyMedianAbsDelta = None
                    targetMonkeyNoOfNonMissing = None
                item = [sourceMonkeyMedianAbsDeltaAfterSwap - medianAbsDelta, targetMonkeyID,
                    sourceMonkeyNoOfNonMissingAfterSwap,
                    targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap,
                    targetMonkeyNoOfNonMissingAfterSwap]
                heapq.heappush(medianAbsDeltaIncreaseQueue, item)
            # the target monkey that increases the least (or drops the most) for the median delta
            # is the prime candidate for label-swap

            # source-side sex values do not change across the candidates below
            if sourceMonkeyDBEntry:
                sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
            else:
                sourceMonkeySex = None
            sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)

            # output the top 5 candidates for each source monkey
            # output db sex for all monkeys and the plink sex check result
            j = 0
            while j < 5 and medianAbsDeltaIncreaseQueue:
                sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
                    targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, \
                    targetMonkeyNoOfNonMissingAfterSwap = heapq.heappop(medianAbsDeltaIncreaseQueue)[:7]

                # look up the DB entry of the candidate being written, not a leftover loop variable
                targetMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=targetMonkeyID)
                if targetMonkeyDBEntry:
                    targetMonkeySex = targetMonkeyDBEntry.codeSexInNumber()
                else:
                    targetMonkeySex = None
                targetMonkeyPlinkSex = monkey_id2plinkSex.get(targetMonkeyID)

                data_row = [sourceMonkeyID, medianAbsDelta, noOfNonMissing, sourceMonkeySex, sourceMonkeyPlinkSex,
                    sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap,
                    targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeySex, targetMonkeyPlinkSex,
                    targetMonkeyMedianAbsDeltaAfterSwap, targetMonkeyNoOfNonMissingAfterSwap]
                writer.writerow(data_row)
                j += 1
            i += 1
def run(self):
    """
    Rank monkeys by the chi-square statistic of their kinship-vs-IBD deltas.

    Steps:
        1. Connect to the vervet database and keep the handle in
           self.db_vervet.
        2. Read the kinship matrix (self.inputFname) and the plink PI_HAT
           matrix (self.plinkIBDCheckOutputFname).
        3. If self.minAbsDeltaForOutlier > 0, optionally build a set of
           monkey IDs from self.kinshipMonkeyIDSetFname, and, when
           self.outputFnamePrefix is set, write the cut-off outliers and the
           PCA-ordered delta matrix.
        4. Estimate the mean/std of the deltas (excluding the top 20%) and
           build a queue of monkeys keyed on the negated chi-square
           statistic.
        5. Pop up to 5000 monkeys and write rank, ID, statistic, count of
           non-missing values, p-value, database sex and plink sex to
           self.outputFname (tab-delimited). With self.iterativeAlgorithm the
           queue is updated after each pop with the popped monkey dropped.

    Changes versus the earlier revision of this method:
        - The output file is closed (and flushed) even if an error occurs.
        - The unused local monkey_id2queueData is gone.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
        hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
    db_vervet.setup(create_tables=False)
    self.db_vervet = db_vervet

    # getMonkeyKinshipData() also sets the kinshipData diagonal to 1
    kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
    ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname,
        rowIDHeader="IID1", colIDHeader="IID2",
        rowIDIndex=None, colIDIndex=None,
        dataHeader="PI_HAT", dataIndex=None, hasHeader=True)

    if self.minAbsDeltaForOutlier > 0:
        # 2012.8.23 cut data off for Sue
        kinshipMonkeyIDSet = None
        if self.kinshipMonkeyIDSetFname:
            monkeyID2dataTuple = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.kinshipMonkeyIDSetFname,
                keyHeaderLs=['monkeyID'],
                valueHeaderLs=['noOfMismatches', 'noOfNonMissing'], keyIndexLs=None, valueIndexLs=None,
                hasHeader=True, valueDataType=float)
            # keep monkeys whose first value is 0 and second value exceeds 30
            # (presumably noOfMismatches and noOfNonMissing, in valueHeaderLs order -- confirm)
            kinshipMonkeyIDSet = set(monkeyID for monkeyID, dataTuple in monkeyID2dataTuple.items()
                if dataTuple[0] == 0 and dataTuple[1] > 30)
            sys.stderr.write("%s monkeys in kinshipMonkeyIDSet.\n" % (len(kinshipMonkeyIDSet)))
        if self.outputFnamePrefix:
            self.cutOffKinshipIBDDeltaAndOutput(db_vervet=db_vervet, kinshipData=kinshipData, ibdData=ibdData,
                outputFnamePrefix=self.outputFnamePrefix, minAbsDelta=self.minAbsDeltaForOutlier,
                kinshipMonkeyIDSet=kinshipMonkeyIDSet)
            # 2012.8.24 output the delta matrix in PC1 order
            # NOTE(review): nesting of this call was reconstructed from whitespace-flattened source;
            # confirm it belongs under the outputFnamePrefix check rather than at method level.
            self.PCAOnAbsKinshipIBDDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData,
                outputFnamePrefix=self.outputFnamePrefix)

    if self.plinkSexCheckOutputFname:
        monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname,
            keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None,
            hasHeader=True, valueDataType=int)
    else:
        monkey_id2plinkSex = {}

    kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData, takeAbs=False)
    meanStdData = self.estimateAbsDeltaMeanStd(kinshipIBDDeltaData=kinshipIBDDeltaData, excludeTopFraction=0.2)
    queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData,
        mean=meanStdData.mean, std=meanStdData.std)
    queue = queueData.queue

    header = ['rank', 'monkeyID', 'chiSqStat', 'noOfNonMissing', 'chiSqPvalue', 'monkeySex', 'monkeyPlinkSex']
    #if self.iterativeAlgorithm:
    #	header.extend(['chiSqStatIter', 'noOfNonMissingIter', 'chiSqPvalueIter'])

    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)

        i = 0
        while i < 5000 and queue:
            # the queue stores the negated statistic, hence the sign flip
            minusChiSqStat, sourceMonkeyID, noOfNonMissing, chiSqPvalue = heapq.heappop(queue)[:4]
            chiSqStat = -minusChiSqStat

            sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
            if sourceMonkeyDBEntry:
                sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
            else:
                sourceMonkeySex = None
            sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)

            # the first column is the 0-based pop order
            data_row = [i, sourceMonkeyID, chiSqStat, noOfNonMissing, chiSqPvalue,
                sourceMonkeySex, sourceMonkeyPlinkSex]

            if self.iterativeAlgorithm:
                # Update the queue with the popped monkey dropped.
                # Two earlier approaches were abandoned (2012.8.23, "not very efficient"):
                #   - recomputing this monkey's statistic with calculateChiSqStatOfDeltaVector()
                #     and appending it to data_row as extra columns;
                #   - masking the monkey's row/column in ibdData.data_matrix and rebuilding the
                #     whole queue with createKinshipIBDDeltaChiSqStatQueue(given_row_id_ls=...).
                queueData = self.updateKinshipIBDDeltaChiSqStatQueue(queue=queue, kinshipData=kinshipData,
                    ibdData=ibdData, mean=meanStdData.mean, std=meanStdData.std,
                    dropMonkeyID=sourceMonkeyID)
                queue = queueData.queue

            writer.writerow(data_row)
            i += 1