def getMonkeyIBDCheckData(self, inputFname=None):
		"""
		Load the pairwise PI_HAT values of a plink IBD check into a matrix.
		
		inputFname is the output of a plink IBD check. The file has a header line;
		matrix rows are keyed by column IID1, matrix columns by column IID2 and the
		cell value is taken from column PI_HAT. Example input:
 FID1     IID1 FID2     IID2 RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO
   1  1996093   1  1995025 OT     0  1.0000  0.0000  0.0000  0.0000  -1  0.654218  0.3630  1.9764
   1  1996093   1  2001039 OT     0  0.9832  0.0000  0.0168  0.0168  -1  0.653608  0.0318  1.8792
   1  1996093   1  1984011 OT     0  1.0000  0.0000  0.0000  0.0000  -1  0.645011  0.0168  1.8624
   1  1996093   1  1987004 OT     0  0.9260  0.0628  0.0113  0.0427  -1  0.660490  0.9999  2.2805
		
		A progress message is written to stderr before the file is read.
		Returns the object produced by SNP.readAdjacencyListDataIntoMatrix().
		
		2012.8.21
		"""
		sys.stderr.write("Reading PI_hat from %s ... " % inputFname)
		#all three columns are located by header name, hence every *Index argument is None
		return SNP.readAdjacencyListDataIntoMatrix(
			inputFname=inputFname,
			hasHeader=True,
			rowIDHeader="IID1", rowIDIndex=None,
			colIDHeader="IID2", colIDIndex=None,
			dataHeader="PI_HAT", dataIndex=None)
		"""
# Example no. 2
# 0
    def setup(self, **keywords):
        """
        Run once before anything else is run.

        Calls AbstractMatrixFileWalker.setup() with the same keywords, then:
          - self.ibdData: the PI_HAT matrix read from self.plinkIBDCheckOutputFname
            (rows keyed by IID1, columns by IID2), or None when no file name is set.
          - self.data_matrix: an empty list that stores all rows during fileWalker().

        2012.10.15
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        ibdCheckFname = self.plinkIBDCheckOutputFname
        if not ibdCheckFname:
            #no IBD check result was supplied
            piHatMatrix = None
        else:
            #read in the IBD check result; all columns are found via the header line
            piHatMatrix = SNP.readAdjacencyListDataIntoMatrix(
                inputFname=ibdCheckFname,
                hasHeader=True,
                rowIDHeader="IID1", rowIDIndex=None,
                colIDHeader="IID2", colIDIndex=None,
                dataHeader="PI_HAT", dataIndex=None)
        self.ibdData = piHatMatrix
        self.data_matrix = []
	def getMonkeyKinshipData(self, inputFname=None):
		"""
		Read pairwise kinship from a header-less file and set the matrix diagonal to 1.
		
		inputFname has no header line. Column 0 holds the row ID, column 1 the column ID
		and column 2 the kinship value. Pairs that are absent from the file get
		defaultValue=0.
		
		A progress message is written to stderr before the file is read.
		Returns the object produced by SNP.readAdjacencyListDataIntoMatrix(), with
		data_matrix[i][i] overwritten by 1 for every index i of row_id_ls.
		NOTE(review): the diagonal update assumes rows and columns share the same order - confirm.
		
		2012.8.22
			use SNP.readAdjacencyListDataIntoMatrix(), and defaultValue=0
		2012.2.10
		"""
		sys.stderr.write("Reading kinship from %s ... " % inputFname)
		kinshipData = SNP.readAdjacencyListDataIntoMatrix(
			inputFname=inputFname,
			hasHeader=False, defaultValue=0,
			rowIDHeader=None, rowIDIndex=0,
			colIDHeader=None, colIDIndex=1,
			dataHeader=None, dataIndex=2)
		#set kinshipData diagonal to 1
		for diagonalIndex, _ in enumerate(kinshipData.row_id_ls):
			kinshipData.data_matrix[diagonalIndex][diagonalIndex] = 1
		return kinshipData
		"""
	def run(self):
		"""
		For the monkeys whose kinship disagrees most with the plink IBD estimate (PI_HAT),
		output the most promising label-swap partners.
		
		Procedure:
			1. connect to the vervet db; the connection is also stored in self.db_vervet.
			2. read kinship (self.inputFname), PI_HAT (self.plinkIBDCheckOutputFname) and
				plink SNPSEX (self.plinkSexCheckOutputFname).
			3. pop up to 50 source monkeys off the median-abs-delta queue. The queue holds the
				negated median, so the min-heap returns the largest median first.
			4. every other monkey in kinshipData.row_id_ls is tried as a swap partner. The
				partners are ordered by (source median abs delta after swap - before swap),
				smallest first, and up to 5 of them are written per source monkey.
		
		Output: tab-delimited self.outputFname, one row per (source, target) pair. Column
		'sourceMonkeyMedianAbsDeltaDropAfterSwap' holds (after swap - before swap), so a negative
		value is a drop. A sex column is None when the monkey has no db entry.
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
									hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
		db_vervet.setup(create_tables=False)
		self.db_vervet = db_vervet
		
		kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
		#same rowID*/colID* keyword names as every other readAdjacencyListDataIntoMatrix() call in this file
		ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
								rowIDIndex=None, colIDIndex=None, \
								dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
		monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
								keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
								hasHeader=True, valueDataType=int)
		
		kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData)
		queueData = self.createKinshipIBDDeltaQueue(kinshipIBDDeltaData)
		kinshipIBDDeltaQueue = queueData.kinshipIBDDeltaQueue
		monkey_id2medianAbsDelta = queueData.monkey_id2medianAbsDelta
		
		header = ['sourceMonkeyID', 'medianAbsDelta', 'noOfNonMissing', 'sourceMonkeySex', 'sourceMonkeyPlinkSex', \
				'sourceMonkeyMedianAbsDeltaDropAfterSwap', \
			'targetMonkeyID', 'sourceMonkeyNoOfNonMissingAfterSwap', \
			'targetMonkeyMedianAbsDelta', 'targetMonkeyNoOfNonMissing', 'targetMonkeySex', 'targetMonkeyPlinkSex', \
			'targetMonkeyMedianAbsDeltaAfterSwap', 'targetMonkeyNoOfNonMissingAfterSwap']
		
		maxNoOfSourceMonkeys = 50
		maxNoOfCandidatesPerSource = 5
		#the with-statement closes (and flushes) the output file even if an exception is raised
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(header)
			
			noOfSourceMonkeys = 0
			while noOfSourceMonkeys<maxNoOfSourceMonkeys and len(kinshipIBDDeltaQueue)>0:
				negativeMedianAbsDelta, sourceMonkeyID, noOfNonMissing = heapq.heappop(kinshipIBDDeltaQueue)[:3]
				medianAbsDelta = -negativeMedianAbsDelta
				noOfSourceMonkeys += 1
				
				#db sex and plink sex of the source monkey are the same for all of its output rows
				sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
				if sourceMonkeyDBEntry:
					sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
				else:
					sourceMonkeySex = None
				sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
				
				# 2012.8.22 self.drawKinshipIBDDeltaVectorHistogram() can be called here to check what data looks like
				
				medianAbsDeltaIncreaseQueue = []
				for targetMonkeyID in kinshipData.row_id_ls:
					if targetMonkeyID==sourceMonkeyID:
						continue
					#get the updated Median Delta for sourceMonkeyID
					pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
										kinshipDataMonkeyID=targetMonkeyID, ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID)
					sourceMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
					sourceMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
					
					#get the updated Median Delta for targetMonkeyID
					pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
										kinshipDataMonkeyID=sourceMonkeyID, ibdData=ibdData, ibdDataMonkeyID=targetMonkeyID)
					targetMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
					targetMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
					
					if sourceMonkeyMedianAbsDeltaAfterSwap is None:
						#no median could be calculated for this swap, not a candidate
						continue
					#add the candidate monkey and how much median delta changes into the queue
					pdata = monkey_id2medianAbsDelta.get(targetMonkeyID)
					if pdata:
						targetMonkeyMedianAbsDelta = pdata.medianAbsDelta
						targetMonkeyNoOfNonMissing = pdata.noOfNonMissing
					else:
						targetMonkeyMedianAbsDelta = None
						targetMonkeyNoOfNonMissing = None
					item = [sourceMonkeyMedianAbsDeltaAfterSwap-medianAbsDelta, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
						targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, \
						targetMonkeyNoOfNonMissingAfterSwap]
					heapq.heappush(medianAbsDeltaIncreaseQueue, item)
				
				#the target monkey that increase the least (or drop the most) for the median delta is the prime candidate for label-swap
				#output the top candidates for each source monkey, with db sex and the plink sex check result
				noOfCandidates = 0
				while noOfCandidates<maxNoOfCandidatesPerSource and len(medianAbsDeltaIncreaseQueue)>0:
					sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
							targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, targetMonkeyNoOfNonMissingAfterSwap =\
								heapq.heappop(medianAbsDeltaIncreaseQueue)[:7]
					
					#look up the db entry of the monkey that was just popped. Reusing the variable left
					#over from the scan above would report the sex of whichever monkey was scanned last.
					targetMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=targetMonkeyID)
					if targetMonkeyDBEntry:
						targetMonkeySex = targetMonkeyDBEntry.codeSexInNumber()
					else:
						targetMonkeySex = None
					targetMonkeyPlinkSex = monkey_id2plinkSex.get(targetMonkeyID)
					
					data_row = [sourceMonkeyID, medianAbsDelta, noOfNonMissing, sourceMonkeySex, sourceMonkeyPlinkSex,\
							sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap,\
							targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeySex, targetMonkeyPlinkSex, targetMonkeyMedianAbsDeltaAfterSwap,\
							targetMonkeyNoOfNonMissingAfterSwap]
					writer.writerow(data_row)
					noOfCandidates += 1
	def run(self):
		"""
		Rank monkeys by the chi-square statistic of their kinship-vs-IBD delta and write
		the ranking to self.outputFname.
		
		Procedure:
			1. connect to the vervet db; the connection is also stored in self.db_vervet.
			2. read kinship (self.inputFname) and PI_HAT (self.plinkIBDCheckOutputFname).
			3. if self.minAbsDeltaForOutlier>0 and self.outputFnamePrefix is set, call
				cutOffKinshipIBDDeltaAndOutput(). When self.kinshipMonkeyIDSetFname is given,
				only monkeys with noOfMismatches==0 and noOfNonMissing>30 in that file are
				passed on as kinshipMonkeyIDSet; otherwise kinshipMonkeyIDSet is None.
			4. call PCAOnAbsKinshipIBDDeltaMatrix() (2012.8.24 output the delta matrix in PC1 order).
				NOTE(review): this is called even when self.outputFnamePrefix is empty - confirm
				the callee copes with that.
			5. estimate mean and std of the delta (excludeTopFraction=0.2) and build the
				chi-square queue. The queue holds the negated statistic, so the min-heap
				returns the largest statistic first.
			6. pop up to 5000 monkeys and write one row each. If self.iterativeAlgorithm is
				set, the queue is rebuilt after every pop by
				updateKinshipIBDDeltaChiSqStatQueue(dropMonkeyID=<the popped monkey>).
		
		Output: tab-delimited self.outputFname with columns rank (0-based pop order), monkeyID,
		chiSqStat, noOfNonMissing, chiSqPvalue, monkeySex (None without a db entry) and
		monkeyPlinkSex (None when the monkey is not in the plink sex check output, or when
		self.plinkSexCheckOutputFname is not set).
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
									hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
		db_vervet.setup(create_tables=False)
		self.db_vervet = db_vervet
		
		kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
		ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
										rowIDIndex=None, colIDIndex=None, \
								dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
		
		if self.minAbsDeltaForOutlier>0:
			#2012.8.23 cut data off for Sue
			if self.kinshipMonkeyIDSetFname:
				monkeyID2dataTuple = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.kinshipMonkeyIDSetFname, keyHeaderLs=['monkeyID'], \
									valueHeaderLs=['noOfMismatches', 'noOfNonMissing'], keyIndexLs=None, valueIndexLs=None, \
									hasHeader=True, valueDataType=float)
				#dataTuple = (noOfMismatches, noOfNonMissing), in valueHeaderLs order
				kinshipMonkeyIDSet = set(monkeyID for monkeyID, dataTuple in monkeyID2dataTuple.items() \
									if dataTuple[0]==0 and dataTuple[1]>30)
				sys.stderr.write("%s monkeys in kinshipMonkeyIDSet.\n"%(len(kinshipMonkeyIDSet)))
			else:
				kinshipMonkeyIDSet = None
			if self.outputFnamePrefix:
				self.cutOffKinshipIBDDeltaAndOutput(db_vervet=db_vervet, kinshipData=kinshipData, ibdData=ibdData, \
						outputFnamePrefix=self.outputFnamePrefix, minAbsDelta=self.minAbsDeltaForOutlier, kinshipMonkeyIDSet=kinshipMonkeyIDSet)
		
		#2012.8.24 output the delta matrix in PC1 order
		self.PCAOnAbsKinshipIBDDeltaMatrix(kinshipData=kinshipData,  ibdData=ibdData, outputFnamePrefix=self.outputFnamePrefix)
		
		if self.plinkSexCheckOutputFname:
			monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
								keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
								hasHeader=True, valueDataType=int)
		else:
			monkey_id2plinkSex = {}
		
		kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData, takeAbs=False)
		
		meanStdData = self.estimateAbsDeltaMeanStd(kinshipIBDDeltaData=kinshipIBDDeltaData, excludeTopFraction=0.2)
		
		queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData, \
													mean=meanStdData.mean, std=meanStdData.std)
		queue = queueData.queue
		
		header = ['rank', 'monkeyID', 'chiSqStat', 'noOfNonMissing', 'chiSqPvalue', 'monkeySex','monkeyPlinkSex']
		maxNoOfMonkeys = 5000
		#the with-statement closes (and flushes) the output file even if an exception is raised
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(header)
			
			rank = 0
			while rank<maxNoOfMonkeys and len(queue)>0:
				minusChiSqStat, sourceMonkeyID, noOfNonMissing, chiSqPvalue = heapq.heappop(queue)[:4]
				chiSqStat = -minusChiSqStat
				sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
				if sourceMonkeyDBEntry:
					sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
				else:
					sourceMonkeySex = None
				sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
				data_row = [rank, sourceMonkeyID, chiSqStat, noOfNonMissing, chiSqPvalue, sourceMonkeySex, sourceMonkeyPlinkSex]
				
				if self.iterativeAlgorithm:
					#2012.8.23 update the remaining queue with the popped monkey dropped, instead of
					#masking its ibdData row/column and rebuilding the whole queue (old way, not very efficient)
					queueData = self.updateKinshipIBDDeltaChiSqStatQueue(queue=queue, kinshipData=kinshipData, ibdData=ibdData, \
									mean=meanStdData.mean, std=meanStdData.std, dropMonkeyID=sourceMonkeyID)
					queue = queueData.queue
				writer.writerow(data_row)
				rank += 1