def run(self):
        """
        2013.07.24
        Pass every row of the input matrix file through self.processRow().

        Reads self.inputFname with MatrixFile and writes a tab-delimited file
        to self.outputFname with the header
        SNPID, oldChromosome, Chromosome, Start, Stop, N.
        The number of processed rows is reported on stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(inputFname=self.inputFname)
        reader.constructColName2IndexFromHeader()
        writer = MatrixFile(inputFname=self.outputFname,
                            openMode='w',
                            delimiter='\t')
        writer.writeHeader(
            ["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"])

        counter = 0
        for row in reader:
            writer.writerow(self.processRow(row))
            counter += 1
        sys.stderr.write("%s lines processed.\n" % (counter))

        # Close explicitly instead of relying on "del" (i.e. on garbage
        # collection) to flush and release the output file.
        reader.close()
        writer.close()
Exemple #2
0
	def run(self):
		"""
		Pad every row of a TPED file so it has columns for all TFAM individuals.

		The number of individuals is taken from self.tfamFname. Each row of
		self.inputFname is written, tab-delimited, to self.outputFname with two
		extra 0 columns appended per individual the row is short of
		(0 is presumably the missing-allele code -- confirm).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		reader = MatrixFile(inputFname=self.inputFname)	#a TPED file
		tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
		noOfIndividuals = len(tfamIndividualData.individualID2Index)
		
		counter = 0
		noOfExtraIndividuals = None
		#"with" guarantees that the output is flushed and closed, even if a row fails
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			for row in reader:
				#first 4 columns: chromosome, snp_id, genetic_distance, physical_distance.
				#each individual occupies 2 columns after that.
				#"//" keeps the count an integer on Python 2 and 3 (a float cannot repeat a list).
				noOfExistingIndividuals = len(row[4:])//2
				noOfExtraIndividuals = noOfIndividuals - noOfExistingIndividuals
				writer.writerow(row + [0]*2*noOfExtraIndividuals)
				counter += 1
		
		reader.close()
		#noOfExtraIndividuals reported here is the value computed for the last row
		sys.stderr.write("%s rows (loci) and added %s extra individuals.\n"%(counter, noOfExtraIndividuals))
Exemple #3
0
    def getLocusID2MissingFraction(self, inputFname=None):
        """
        2014.01.08
        Map each locus to its smallest "occurrence_byFixedValue" value.

        inputFname: file with a header that contains the columns "locusID"
            and "occurrence_byFixedValue". The first two underscore-separated
            fields of locusID are taken as chromosome and start.
        Returns a dictionary keyed by (chromosome, start, start).
        """
        sys.stderr.write("Reading in the missing statistics from %s ... " %
                         (inputFname))
        matrixReader = MatrixFile(inputFname=inputFname)
        matrixReader.constructColName2IndexFromHeader()
        idColumn = matrixReader.getColIndexGivenColHeader("locusID")
        valueColumn = matrixReader.getColIndexGivenColHeader(
            "occurrence_byFixedValue")

        locusKey2MinStat = {}
        noOfRows = 0
        for fields in matrixReader:
            chromosome, startField = fields[idColumn].split('_')[:2]
            position = int(startField)
            value = float(fields[valueColumn])

            locusKey = (chromosome, position, position)
            # a locus may occur on several rows: keep the lowest value
            isNew = locusKey not in locusKey2MinStat
            if isNew or value < locusKey2MinStat[locusKey]:
                locusKey2MinStat[locusKey] = value
            noOfRows += 1
        del matrixReader

        summary = (len(locusKey2MinStat), noOfRows)
        sys.stderr.write(
            " %s unique loci with missing fraction out of %s total loci.\n" %
            summary)
        return locusKey2MinStat
	def run(self):
		"""
		Output the loci whose number of mendel errors is small enough.

		Reads the 'SNP' and 'N' columns of self.inputFname. For each row with
		N <= self.maxNoOfMendelError the SNP ID is split on '_' into chromosome
		and position (an ID with any other number of fields raises ValueError)
		and [chromosome, position, N] is written to the tab-delimited
		self.outputFname. Counts are reported on stderr.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		reader = MatrixFile(self.inputFname)
		reader.constructColName2IndexFromHeader()
		noOfMendelErrorColumnIndex = reader.getColIndexGivenColHeader(colHeader='N')
		SNPIDColumnIndex = reader.getColIndexGivenColHeader(colHeader='SNP')
		
		counter = 0
		real_counter = 0
		#"with" guarantees that the output is flushed and closed, even if a row fails
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			writer.writerow(['chromosome', 'position', 'noOfMendelErrors'])
			for row in reader:
				SNPID = row[SNPIDColumnIndex]
				noOfMendelErrors = int(row[noOfMendelErrorColumnIndex])
				if noOfMendelErrors <= self.maxNoOfMendelError:
					#"chromosome" rather than "chr", which would shadow the builtin
					chromosome, pos = SNPID.split('_')
					writer.writerow([chromosome, pos, noOfMendelErrors])
					real_counter += 1
				counter += 1
		
		reader.close()
		sys.stderr.write("%s/%s lines outputted.\n"%(real_counter, counter))
Exemple #5
0
	def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
		"""
		2013.1.29
		Collect, per locus, the indices of the individuals with a mendel error there.

		mendelErrorFname: file with a header that includes a 'KID' column.
			The locus (SNP) ID is read from the 4th column of every row.
		individualID2Index: dictionary, individual ID -> index.
		Returns a dictionary, SNP ID -> list of individual indices (one entry per row).
		Terminates the program via sys.exit(3) if a KID is not in individualID2Index.
		"""
		sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
		locus_id2individual_index_ls = {}
		reader = MatrixFile(inputFname=mendelErrorFname)
		reader.constructColName2IndexFromHeader()
		#the column position is the same for every row, so look it up only once
		kidColumnIndex = reader.getColIndexGivenColHeader('KID')
		counter = 0
		for row in reader:
			individual_id = row[kidColumnIndex]
			if individual_id not in individualID2Index:
				sys.stderr.write("Individual %s not in individualID2Index.\n"%(individual_id))
				sys.exit(3)
			index = individualID2Index.get(individual_id)
			snp_id = row[3]	#hard-coded position; presumably the SNP column -- confirm against the file format
			locus_id2individual_index_ls.setdefault(snp_id, []).append(index)
			counter += 1
		reader.close()
		sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
						(counter, len(locus_id2individual_index_ls)))
		return locus_id2individual_index_ls
Exemple #6
0
	def run(self):
		"""
		Mark low-quality genotypes of one sample as missing in a VCF file.

		For every record of self.inputFname, the reads of self.alignmentFilename
		around the locus are fetched and summarised by
		self.returnLocusLowMapQualityIndicator(). The locus is flagged when its
		read count lies outside
		[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold]
		and/or when the summary reports a non-zero low-mapping-quality indicator.
		The genotype of self.sampleID is set to "./." at flagged loci.
		Every record (modified or not) is written to self.outputFname and one
		statistics row per locus is written to self.missingStatFname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		#the output VCF carries over the meta lines and header of the input
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			depth = locusLowMapQData.totalNoOfReads
			if minDepth <= depth <= maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			#sum of the two indicators; it is written out as the "missingReason" column
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQData.locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#set the genotype of the sample to missing in the VCF record
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		alignmentFile.close()	#was left open before
		statWriter.close()
		writer.close()
		#an input without records must not crash the final report
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
Exemple #7
0
    def setup(self, **keywords):
        """
        2013.11.24
        Run before anything else: reset the counters and lookup tables and
        write the headers of the main output and of the side output.

        Attributes prepared here and how they are meant to be used:

        noOfTotalIntervals, noOfCrossChromosomeIntervals
            interval counts, both start at 0.

        targetChromosome2mapData: targetChromosome -> mapData
            intervalDeltaList   => median
            orientation (queryStrand): 0=forward, 1=backward
            mean    => using 80% of data (sort the delta list, then take
                       10% to 90% of the list)
            stddev  => if stddev is zero, use 1.

        locusKey2mapData: (oldChromosome, oldStart, oldStop) -> mapData
            targetCoordinate (newChromosome, newStart, newStop).
            leftIntervalDelta: None = boundary
            rightIntervalDelta: None = boundary, 10E10 = cross chromosome
            probability: max(P(SNP_i_left_interval), P(SNP_i_right_interval)).
                P(interval):
                    If one interval is on the same chromosome,
                        P(target-chromosome)*P(interval delta size)
                    If not, P(chromosome-cross event).

        Not implemented: for a whole genome input (rather than a window),
            an RBTree of windows should be used to counter regional effect.
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        # floats, so that later divisions are not integer divisions
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0

        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None

        # both outputs share their first seven columns
        coordinateColumns = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop'
        ]

        # main output; setting the flag avoids a second header being written
        self.writer.writerow(coordinateColumns + ['mapPvalue'])
        self.invariantPData.headerOutputted = True

        # side output with the interval deltas
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname,
                                     openMode='w',
                                     delimiter='\t')
        self.sideOutput.writeHeader(coordinateColumns + ['intervalDelta'])
Exemple #8
0
	def run(self):
		"""
		Output pairwise genotype concordance among entries sharing a SNP position.

		self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData maps
		a position key (its first two items are chromosome and position) to a
		list of genotype vectors. For every position with more than one vector,
		each pair of vectors is compared call by call (the 'GT' entry); calls
		equal to 'NA' on either side are skipped. One row
		[chromosome, position, noOfMatches, noOfTotal, concordance] per pair is
		written to self.outputFname; concordance is -1 if nothing was comparable.
		"""
		import itertools	#local import: only needed for pairing the genotype vectors
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		snp_pos2genotypeVectorLs =self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData
		
		writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
		header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
		writer.writeHeader(header)
		
		counter = 0
		real_counter = 0
		no_of_pairs = 0
		#sorted() works on Python 2 and 3, unlike keys() followed by .sort()
		for key in sorted(snp_pos2genotypeVectorLs):
			counter += 1
			chromosome, position = key[:2]
			genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
			if len(genotypeVectorLs)<2:
				continue
			real_counter += 1
			#all index pairs (first < second), in the order of a nested loop
			for first, second in itertools.combinations(range(len(genotypeVectorLs)), 2):
				no_of_pairs +=1
				noOfMatches = 0
				noOfTotal = 0
				genotypeVector0 = genotypeVectorLs[first]
				genotypeVector1 = genotypeVectorLs[second]
				for j in range(len(genotypeVector0)):
					call1 = genotypeVector0[j]['GT']
					call2 = genotypeVector1[j]['GT']
					if call1!='NA' and call2!='NA':
						noOfTotal += 1
						if SNP.nt2number[call1]==SNP.nt2number[call2]:
							noOfMatches += 1
				if noOfTotal>0:
					concordance = float(noOfMatches)/float(noOfTotal)
				else:
					concordance = -1
				writer.writerow([chromosome, position, noOfMatches, noOfTotal, concordance])
		writer.close()
		#an empty input must not crash the final report
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
												fraction, no_of_pairs))
Exemple #9
0
	def outputGenotypeMarkedMissingStat(self, outputFname=None, \
									individual_index2no_of_genotype_marked_missing=None,\
									individualIDList=None):
		"""
		2013.07.24
		Write, per individual, how many genotypes were marked missing.

		outputFname: tab-delimited output file; nothing is written if it is empty/None.
		individual_index2no_of_genotype_marked_missing: dictionary,
			individual index -> count; nothing is written if it is None.
		individualIDList: list that turns an individual index into its ID.
		"""
		#guard: both the output path and the counts are required
		if not outputFname or individual_index2no_of_genotype_marked_missing is None:
			return
		statWriter = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		statWriter.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
		for index, missingCount in individual_index2no_of_genotype_marked_missing.iteritems():
			statWriter.writerow([individualIDList[index], missingCount])
		statWriter.close()
Exemple #10
0
    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        2013.07.11
        Read a coordinate map and group the new coordinates by old coordinate.

        Columns of the input file, as listed by the original author:
            querySNPID, queryStrand, queryChromosome, queryStart, queryStop,
            queryRefBase, queryAltBase, queryAlignmentSpan,
            queryAlignmentStart, queryAlignmentStop, newChr, newRefStart,
            newRefStop, newRefBase, targetAlignmentSpan,
            targetAlignmentStart, targetAlignmentStop
        Columns are located by header name.

        Returns a dictionary, (queryChromosome, queryStart) -> list of
        PassingData(strand, oldRefBase, oldAltBase, newChromosome, newStart,
        newStop, newRefBase), one list item per input row.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        columnOf = reader.getColIndexGivenColHeader

        # old (query) side
        oldChromosomeIndex = columnOf("queryChromosome")
        oldStartIndex = columnOf("queryStart")
        strandIndex = columnOf("queryStrand")
        oldRefBaseIndex = columnOf("queryRefBase")
        oldAltBaseIndex = columnOf("queryAltBase")
        # new (target) side
        newChromosomeIndex = columnOf("newChr")
        newStartIndex = columnOf("newRefStart")
        newStopIndex = columnOf("newRefStop")
        newRefBaseIndex = columnOf("newRefBase")

        noOfRows = 0
        for fields in reader:
            oldKey = (fields[oldChromosomeIndex], int(fields[oldStartIndex]))
            mapping = PassingData(strand=fields[strandIndex],
                                  oldRefBase=fields[oldRefBaseIndex],
                                  oldAltBase=fields[oldAltBaseIndex],
                                  newChromosome=fields[newChromosomeIndex],
                                  newStart=int(fields[newStartIndex]),
                                  newStop=int(fields[newStopIndex]),
                                  newRefBase=fields[newRefBaseIndex])
            oldCoordinate2newCoordinateDataLs.setdefault(oldKey,
                                                         []).append(mapping)
            noOfRows += 1
        del reader
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), noOfRows))
        return oldCoordinate2newCoordinateDataLs
Exemple #11
0
	def run(self):
		"""
		2013.07.24
		Rewrite every row of the input file according to self.run_type.

		run_type 2: self.processRow_ChangeChromosomeIDToX(row)
		run_type 3: self.processRow_addPositionStartBase(row)
		run_type 4 (2013.2.1): self.markGenotypeMissingIfInvolvedInMendelError(),
			which needs the individuals of self.tfamFname and the mendel-error
			data of self.mendelErrorFname
		otherwise:  self.processRow(row)

		Rows are written tab-delimited to self.outputFname. Afterwards the
		per-individual counts of genotypes marked missing are handed to
		self.outputGenotypeMarkedMissingStat() together with
		self.markMissingStatFname (the counts are None unless run_type is 4).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		reader = MatrixFile(inputFname=self.inputFname)
		if self.run_type==4:	#2013.2.1
			tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
			individualID2Index = tfamIndividualData.individualID2Index
			individualIDList = tfamIndividualData.individualIDList
			locus_id2individual_index_ls = self.getMendelErrorIndividualLocusData(mendelErrorFname=self.mendelErrorFname, \
												individualID2Index=individualID2Index)
			individual_index2no_of_genotype_marked_missing = {}
		else:
			individualIDList = None
			locus_id2individual_index_ls = None
			individual_index2no_of_genotype_marked_missing = None
		
		counter = 0
		#"with" guarantees that the output is flushed and closed, even if a row fails
		with open(self.outputFname, 'w') as outf:
			writer = csv.writer(outf, delimiter='\t')
			for row in reader:
				if self.run_type==2:
					new_row = self.processRow_ChangeChromosomeIDToX(row)
				elif self.run_type==3:
					new_row = self.processRow_addPositionStartBase(row)
				elif self.run_type==4:
					new_row = self.markGenotypeMissingIfInvolvedInMendelError(row=row, \
												locus_id2individual_index_ls=locus_id2individual_index_ls,\
												individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing)
				else:
					new_row = self.processRow(row)
				writer.writerow(new_row)
				counter += 1
		sys.stderr.write("%s lines modified.\n"%(counter))
		
		reader.close()
		self.outputGenotypeMarkedMissingStat(outputFname=self.markMissingStatFname, \
								individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing, \
								individualIDList=individualIDList)
    def run(self):
        """
        Output what remains of the genome as high-switch regions are removed.

        The statistics of self.inputFname are read via self.readInStats().
        Its data_matrix (rows of [switchFrequency, regionSpan, noOfLoci]) is
        sorted in place in descending order. For each row, in that order,
        the region's span and locus count are subtracted from the running
        totals and one line is written to self.outputFname:
        switchFrequency, remaining span, remaining span / totalSpan,
        remaining loci, remaining loci / totalNoOfLoci.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, openMode='w')
        header = [
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ]
        writer.writeHeader(header)

        data_matrix = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        # sort it based on switchFrequency, highest first
        data_matrix.sort(reverse=True)

        # running totals replace the per-row lists: only the previous value
        # was ever needed
        cumulativeRegionSpan = totalSpan
        cumulativeNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in data_matrix:
            cumulativeRegionSpan -= regionSpan
            cumulativeNoOfLoci -= noOfLoci
            writer.writerow([
                switchFrequency, cumulativeRegionSpan,
                cumulativeRegionSpan / float(totalSpan), cumulativeNoOfLoci,
                cumulativeNoOfLoci / float(totalNoOfLoci)
            ])
        writer.close()
        sys.stderr.write(".\n")
Exemple #13
0
	def getIndividualID2IndexFromTFAMFile(self, tfamFname=None):
		"""
		2013.07.24 return individualIDList as well
		2013.1.29
		Read the individual IDs (2nd column of every row) of a tfam file.

		Returns PassingData with
			individualID2Index: dictionary, ID -> size of the dictionary at the
				time the ID was (last) assigned
			individualIDList: the IDs in file order, one item per row
		NOTE: if an ID occurs on more than one row, the list gets an item for
			each occurrence while the dictionary keeps one entry, so the two
			no longer line up.
		"""
		sys.stderr.write("Getting individualID2Index from tfam file %s ..."%(tfamFname))
		id2Index = {}
		idList = []
		tfamReader = MatrixFile(inputFname=tfamFname)
		for fields in tfamReader:
			sampleID = fields[1]
			id2Index[sampleID] = len(id2Index)
			idList.append(sampleID)
		del tfamReader
		sys.stderr.write(" %s individuals.\n"%(len(id2Index)))
		return PassingData(individualID2Index=id2Index, individualIDList=idList)
	def readInDataToPlot(self, input_fname, sampling_probability=1.0):
		"""
		2015.01.23 added argument sampling_probability to sub-sample data
		2013.07.11 use MatrixFile to read in the file
		2009-5-20
			add the column index into the column header for easy picking
		2009-3-13
			wrap the float conversion part into try...except to report what goes wrong
		2009-3-13
		
		Load a matrix file into self.list_2d and refresh the GUI and the plot.
		
		input_fname: file whose first row is the header. The first two columns
			of each row are kept as strings, the rest is converted to float.
		sampling_probability: a value strictly between 0 and 1 keeps each row
			with (about) that probability; values outside [0,1] are reset to 1.0;
			0 and 1 both keep every row.
		A row whose conversion fails is reported on stderr and kept unconverted.
		"""
		if sampling_probability>1 or sampling_probability<0:
			sampling_probability=1.0
		reader = MatrixFile(inputFname=input_fname)
		self.column_header=reader.next()
		#prefix each header with its column index for easy picking
		for i in range(len(self.column_header)):
			self.column_header[i] = '%s %s'%(i, self.column_header[i])
		no_of_cols = len(self.column_header)
		self.column_types = [str]*2 + [float]*(no_of_cols-2)
		self.column_editable_flag_ls = [True, True] + [False]*(no_of_cols-2)
		self.list_2d = []
		for row in reader:
			if 0<sampling_probability<1 and random.random()>sampling_probability:	#skip
				continue
			try:
				#a list (not map(), which is lazy on Python 3) so that it can be concatenated below
				float_part = [float(value) for value in row[2:]]
			except Exception:
				#best effort: report and keep the raw values.
				#"Exception" (not a bare except) lets KeyboardInterrupt/SystemExit through.
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				traceback.print_exc()
				float_part = row[2:]
			new_row = row[:2]+float_part
			self.list_2d.append(new_row)
		reader.close()
		self.setupColumns(self.treeview_matrix)
		#update status to reflect the input filename
		self.app1.set_title(os.path.basename(input_fname))
		self.app1_appbar1.push(input_fname)
		self.plotXY(self.ax, self.canvas, self.liststore, self.plot_title)
Exemple #15
0
    def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
        """
        2014.01.04
        Map each new locus to its smallest map p-value.

        Columns of the input file, as listed by the original author:
            oldChromosome, oldStart, oldStop, oldStrand, newChromosome,
            newStart, newStop, mapPvalue
        Columns are located by header name.

        Returns a dictionary, (newChromosome, newStart, newStop) -> mapPvalue.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (liftOverLocusMapPvalueFname))
        pvalueReader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
        pvalueReader.constructColName2IndexFromHeader()
        columnOf = pvalueReader.getColIndexGivenColHeader
        strandIndex = columnOf("oldStrand")
        newChromosomeIndex = columnOf("newChromosome")
        newStartIndex = columnOf("newStart")
        newStopIndex = columnOf("newStop")
        mapPvalueIndex = columnOf("mapPvalue")

        locusNewID2mapPvalue = {}
        noOfRows = 0
        for fields in pvalueReader:
            strand = fields[strandIndex]  # read, but not used any further
            locusKey = (fields[newChromosomeIndex],
                        int(fields[newStartIndex]),
                        int(fields[newStopIndex]))
            pvalue = float(fields[mapPvalueIndex])

            # a locus may occur on several rows: keep the lowest value
            isNew = locusKey not in locusNewID2mapPvalue
            if isNew or pvalue < locusNewID2mapPvalue[locusKey]:
                locusNewID2mapPvalue[locusKey] = pvalue
            noOfRows += 1
        del pvalueReader

        summary = (len(locusNewID2mapPvalue), noOfRows)
        sys.stderr.write(
            "%s unique loci with map p-value out of %s total loci.\n" %
            summary)
        return locusNewID2mapPvalue
    def readInStats(self, inputFname=None):
        """
        2013.07.15
        Read the per-region switch statistics from inputFname.

        Uses the columns "noOfSwitchPoints_by_noOfLociWithUniqueHit",
        "regionSpan" and "#sitesInInput2". A row is used only if all three
        fields are non-empty.

        Returns PassingData with
            data_matrix: list of [switchFrequency (float), regionSpan (int),
                noOfLoci (int)]
            totalSpan: sum of regionSpan over the used rows
            totalNoOfLoci: sum of noOfLoci over the used rows
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        statReader = MatrixFile(inputFname)
        statReader.constructColName2IndexFromHeader()
        columnOf = statReader.getColIndexGivenColHeader
        switchFrequencyIndex = columnOf(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        regionSpanIndex = columnOf("regionSpan")
        noOfLociIndex = columnOf("#sitesInInput2")

        data_matrix = []
        spanSum = 0
        lociSum = 0
        noOfRows = 0
        for fields in statReader:
            noOfRows += 1
            rawFrequency = fields[switchFrequencyIndex]
            rawSpan = fields[regionSpanIndex]
            rawLoci = fields[noOfLociIndex]
            if not (rawFrequency and rawSpan and rawLoci):
                # at least one of the three fields is empty
                continue
            frequency = float(rawFrequency)
            # via float(), so that values such as "12.0" are accepted
            span = int(float(rawSpan))
            loci = int(float(rawLoci))
            data_matrix.append([frequency, span, loci])
            spanSum += span
            lociSum += loci
        statReader.close()
        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
            (len(data_matrix), noOfRows, spanSum, lociSum))
        return PassingData(data_matrix=data_matrix,
                           totalSpan=spanSum,
                           totalNoOfLoci=lociSum)
Exemple #17
0
	def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
		"""
		2013.07.11
			output the switch point (adjacent sites mapped to two different chromosomes) information
		
		querySNPID2NewReferenceCoordinateLs: dictionary, query SNP ID -> list of
			new-reference coordinate objects. The attributes read here are
			queryChromosome, queryStart, queryStop, newChr, newRefStart, newRefStop.
		outputFname: tab-delimited output file, one row per old chromosome.
		
		Steps:
			1. regroup the coordinates by (queryChromosome, queryStart, queryStop).
			2. walk the old coordinates in sorted order. Within each old chromosome,
				a switch point is counted when the new chromosome differs from the one of
				the previous uniquely-mapped locus, or when, on the same new chromosome,
				the direction of movement of newRefStart flips.
				Loci with more than one new coordinate are counted but otherwise skipped.
			3. write the per-chromosome statistics.
		"""
		
		sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
		oldCoordinateKey2newCoordinateDataLs = {}
		counter = 0
		for querySNPID, newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.iteritems():
			#the key is derived from the first coordinate of each SNP and reused for the rest of its list
			oldCoordinateKey = None
			counter += len(newRefCoordinateLs)
			for newRefCoordinate in newRefCoordinateLs:
				if oldCoordinateKey is None:
					oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart, newRefCoordinate.queryStop)
				if oldCoordinateKey not in oldCoordinateKey2newCoordinateDataLs:
					oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey] = []
				oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey].append(newRefCoordinate)
		sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n"%(len(oldCoordinateKey2newCoordinateDataLs),\
																		counter))
		
		sys.stderr.write("Finding switch points ...")
		counter =0	#no of old coordinate keys visited
		real_counter = 0	#no of old coordinate keys with exactly one new coordinate
		noOfRecordsWithMultiNewCoords = 0

		oldChromosome2SwitchData = {}
		
		#keys() followed by an in-place sort() requires keys() to return a list (Python 2)
		oldCoordinateKeyLs = oldCoordinateKey2newCoordinateDataLs.keys()
		oldCoordinateKeyLs.sort()
		for oldCoordinateKey in oldCoordinateKeyLs:
			counter +=1
			newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)
			
			oldChromosome = oldCoordinateKey[0]
			
			#first locus of this old chromosome: initialise its state, the span starts at this locus
			if oldChromosome not in oldChromosome2SwitchData:
				oldChromosome2SwitchData[oldChromosome] = PassingData(noOfLociWithUniqueHit=0, noOfLoci=0, \
														spanStart=oldCoordinateKey[1], \
														spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,\
														previousNewChromosome=None, previousNewRefStart=None,\
														previousNewRefStop=None,\
														previousOrientationOnNewChromosome=None)
			
			switchData = oldChromosome2SwitchData[oldChromosome]
			switchData.noOfLoci += 1
			
			#loci with several new coordinates are skipped: they do not update the span or the "previous" state
			if len(newRefCoordinateLs)>1:
				noOfRecordsWithMultiNewCoords += 1
				continue
			
			switchData.noOfLociWithUniqueHit += 1
			newRefCoordinate = newRefCoordinateLs[0]
			
			#nothing to compare against for the first uniquely-mapped locus of a chromosome
			if switchData.previousNewChromosome is not None:
				if newRefCoordinate.newChr!=switchData.previousNewChromosome:
					switchData.noOfSwitchPoints += 1
					#reset the orientation
					switchData.previousOrientationOnNewChromosome = None
					
				else:	#on the same chromosome
					#True if newRefStart did not decrease relative to the previous locus
					currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart)>=0
					if switchData.previousOrientationOnNewChromosome is not None:
						if currentOrientation !=switchData.previousOrientationOnNewChromosome:
							switchData.noOfSwitchPoints += 1
					switchData.previousOrientationOnNewChromosome = currentOrientation
					
			#adjust the spanStop
			if newRefCoordinate.queryStop > switchData.spanStop:
				switchData.spanStop = newRefCoordinate.queryStop
					
			
			#remember this locus for the comparison with the next one
			switchData.previousNewChromosome = newRefCoordinate.newChr
			switchData.previousNewRefStart = newRefCoordinate.newRefStart
			switchData.previousNewRefStop = newRefCoordinate.newRefStop
			real_counter  += 1
		#-1 marks "undefined" when there was no old coordinate at all
		if counter >0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
																	fraction, noOfRecordsWithMultiNewCoords))
		
		
		sys.stderr.write("Outputting switch points of %s old chromosomes ..."%(len(oldChromosome2SwitchData)))
		statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit", "noOfSwitchesPerLocus", "noOfLoci"]
		statFile.writeHeader(header)
		noOfTotalSwitchPoints = 0
		noOfTotalLoci = 0
		for oldChromosome, switchData in oldChromosome2SwitchData.iteritems():
			#-1 marks "undefined" when the chromosome has no uniquely-mapped locus
			if switchData.noOfLociWithUniqueHit>0:
				switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
			else:
				switchPointFraction = -1
			#NOTE(review): the last column ("noOfLoci") is the number of old coordinate keys over ALL chromosomes;
			#	the per-chromosome switchData.noOfLoci is counted above but never read -- confirm which one is intended.
			data_row = [oldChromosome, switchData.noOfSwitchPoints, switchData.spanStop-switchData.spanStart+1, \
					switchData.noOfLociWithUniqueHit, switchPointFraction, len(oldCoordinateKey2newCoordinateDataLs)]
			statFile.writerow(data_row)
			noOfTotalSwitchPoints += switchData.noOfSwitchPoints
			noOfTotalLoci += switchData.noOfLociWithUniqueHit
		statFile.close()
		sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n'%(noOfTotalSwitchPoints, noOfTotalLoci))
Exemple #18
0
#!/usr/bin/env python

import os, sys

# Sum the span of all intervals in a tab-delimited interval (e.g. BED) file.
# Usage: <this script> inputFname
if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s inputFname\n" % (sys.argv[0]))
    sys.exit(1)
inputFname = sys.argv[1]
sys.path.insert(0, os.path.expanduser('~/lib/python'))
sys.path.insert(0, os.path.join(os.path.expanduser('~/script')))
from pymodule import utils
from pymodule import MatrixFile
reader = MatrixFile(inputFname=inputFname, openMode='r', delimiter='\t')
span=0

for row in reader:
    # skip empty rows and comment lines; startswith() is also safe on an
    # empty first field
    if not row or row[0].startswith('#'):
        continue
    # NOTE(review): BED intervals are conventionally 0-based, half-open, in
    # which case the "+ 1" over-counts each interval by one -- confirm.
    subSpan = int(row[2])-int(row[1]) + 1
    span += subSpan
reader.close()

print("span is %s \n"%(span))