Beispiel #1
0
	def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,
									newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
		"""
		Re-key a long-format genotype file onto new reference coordinates and write it
		out as a sample X SNP matrix.
		
		The input file (opened via utils.openGzipFile) must have a header containing the
		columns "Sample", "Geno" and "SNP"; the delimiter is detected by figureOutDelimiter().
		Example input:
		
			Sample  Geno    SNP
			1999010 CC      cs_primer1082_247
			1999068 CC      cs_primer1082_247
			2000022 CT      cs_primer1082_247
			2000064 CT      cs_primer1082_247
			2000117 CC      cs_primer1082_247
		
		Arguments:
			querySNPDataFname: path of the input genotype file.
			querySNPID2NewReferenceCoordinateLs: dictionary mapping a query SNP ID to a list
				of coordinate objects. Each object is read for the attributes newChr,
				newRefStart, newRefStop and queryStrand. Required (the default of None
				is not usable).
			newSNPDataOutputFname: path handed to SNPData.tofile().
			newSNPDataOutputFormat: 2 => column ID is "newChr_newRefStart";
				any other value => column ID is "newChr_newRefStart_newRefStop".
		
		Behaviour:
			- Rows whose SNP ID is absent from the dictionary, or is mapped to anything
				other than exactly one coordinate, are skipped.
			- If the coordinate's queryStrand is "-", the genotype is passed through
				SNP.reverseComplement() before being stored.
			- If a (sample, SNP column) pair appears more than once, the last row wins.
			- Genotypes are converted to numbers through SNP.nt2number; a genotype missing
				from that dictionary raises KeyError.
			- Matrix cells for which no genotype row was seen stay 0 (numpy.zeros default).
			- Row and column order follow first appearance in the input file.
		
		2013.07.03 added argument newSNPDataOutputFormat
		2012.10.14
			split out of findSNPPositionOnNewRef()
		"""
		sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
						(querySNPDataFname, newSNPDataOutputFormat))
		
		row_id2index = {}
		row_id_ls = []
		col_id_ls = []
		col_id2index = {}
		row_col_index2genotype = {}
		
		inf = utils.openGzipFile(querySNPDataFname)
		# try/finally guarantees the input handle is released even if parsing fails
		# (the handle was previously never closed).
		try:
			reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
			col_name2index = getColName2IndexFromHeader(reader.next())
			
			sampleIndex = col_name2index.get("Sample")
			genotypeIndex = col_name2index.get("Geno")
			SNPIDIndex = col_name2index.get("SNP")
			
			for row in reader:
				sampleID = row[sampleIndex]
				genotype = row[genotypeIndex]
				querySNPID = row[SNPIDIndex]
				
				# single lookup; only SNPs with one unambiguous new coordinate are kept
				newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
				if newRefCoordinateLs is None or len(newRefCoordinateLs)!=1:
					continue
				newRefCoordinate = newRefCoordinateLs[0]
				
				if newSNPDataOutputFormat==2:
					col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
				else:
					col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, \
										newRefCoordinate.newRefStop)
				
				# assign matrix indices in order of first appearance
				if col_id not in col_id2index:
					col_id2index[col_id] = len(col_id2index)
					col_id_ls.append(col_id)
				if sampleID not in row_id2index:
					row_id2index[sampleID] = len(row_id2index)
					row_id_ls.append(sampleID)
				
				if newRefCoordinate.queryStrand == "-":
					genotype = SNP.reverseComplement(genotype)
				
				# the matrix dimensions are unknown until the whole file is read,
				# so genotypes are buffered by (row_index, col_index) first.
				row_col_index2genotype[(row_id2index[sampleID], col_id2index[col_id])] = genotype
		finally:
			inf.close()
		
		data_matrix = numpy.zeros([len(row_id_ls), len(col_id_ls)], dtype=numpy.int8)
		for (row_index, col_index), genotype in row_col_index2genotype.iteritems():
			data_matrix[row_index, col_index] = SNP.nt2number[genotype]
		sys.stderr.write("\n")
		
		snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
		snpData.tofile(newSNPDataOutputFname)