def run(self):
    """
    2013.07.24
    Pass every row of self.inputFname through self.processRow() and write the
    result to self.outputFname as a tab-delimited matrix.

    The output starts with the header
    SNPID, oldChromosome, Chromosome, Start, Stop, N.
    The number of processed rows is reported on stderr.

    Both files are closed explicitly (also when a row fails to convert), so the
    output is flushed without relying on garbage collection of the writer.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(inputFname=self.inputFname)
    # called before iterating the rows; judging by its name it indexes the header
    # row by column name -- confirm in MatrixFile
    reader.constructColName2IndexFromHeader()

    # the output path is handed to MatrixFile through its inputFname argument
    writer = MatrixFile(inputFname=self.outputFname, openMode='w', delimiter='\t')
    try:
        writer.writeHeader(["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"])

        counter = 0
        for row in reader:
            writer.writerow(self.processRow(row))
            counter += 1
        sys.stderr.write("%s lines processed.\n" % (counter))
    finally:
        reader.close()
        writer.close()
def run(self):
    """
    Pad every row of a TPED file with missing genotypes so that each row covers
    all individuals listed in the TFAM file.

    A TPED row is: chromosome, snp_id, genetic_distance, physical_distance,
    followed by two allele columns per individual. For each row, the number of
    individuals already present is (number of columns after the 4th) // 2; the
    shortfall relative to the TFAM individual count is filled with two 0 alleles
    per missing individual. If a row already has more individuals than the TFAM
    file, nothing is appended (a negative repeat count yields an empty list).

    Output is tab-delimited and written to self.outputFname. The number of rows
    and the number of individuals added to the LAST row are reported on stderr
    (None if the input had no rows).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(inputFname=self.inputFname)    # a TPED file
    # "with" guarantees the output handle is flushed and closed
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        counter = 0

        tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
        noOfIndividuals = len(tfamIndividualData.individualID2Index)

        noOfExtraIndividuals = None
        for row in reader:
            # floor division keeps the count an integer under true division as well,
            # which the list repetition below requires
            noOfExistingIndividuals = len(row[4:]) // 2
            noOfExtraIndividuals = noOfIndividuals - noOfExistingIndividuals
            writer.writerow(row + [0] * (2 * noOfExtraIndividuals))
            counter += 1
    reader.close()
    sys.stderr.write("%s rows (loci) and added %s extra individuals.\n"%(counter, noOfExtraIndividuals))
def getLocusID2MissingFraction(self, inputFname=None):
    """
    2014.01.08
    Read inputFname and return a dictionary
        (chromosome, start, start) -> lowest "occurrence_byFixedValue" seen for that locus.

    chromosome and start are the first two "_"-separated fields of the "locusID"
    column; start is converted to int and the statistic to float.
    Progress and summary counts go to stderr.
    """
    sys.stderr.write("Reading in the missing statistics from %s ... " % (inputFname))
    reader = MatrixFile(inputFname=inputFname)
    reader.constructColName2IndexFromHeader()
    idColumn = reader.getColIndexGivenColHeader("locusID")
    statColumn = reader.getColIndexGivenColHeader("occurrence_byFixedValue")

    locusID2Stat = {}
    noOfRows = 0
    for row in reader:
        chromosome, start = row[idColumn].split('_')[:2]
        start = int(start)
        stat = float(row[statColumn])
        key = (chromosome, start, start)
        # first sighting, or a lower value than the one stored: keep it
        if key not in locusID2Stat or stat < locusID2Stat[key]:
            locusID2Stat[key] = stat
        noOfRows += 1
    del reader
    sys.stderr.write(
        " %s unique loci with missing fraction out of %s total loci.\n" %
        (len(locusID2Stat), noOfRows))
    return locusID2Stat
def run(self):
    """
    Keep the loci whose number of Mendel errors does not exceed
    self.maxNoOfMendelError.

    Input (self.inputFname): a matrix file with a header; the columns "SNP" and
    "N" are located by name. "N" is parsed as int; "SNP" must split on "_" into
    exactly two parts (chromosome, position), otherwise ValueError is raised.

    Output (self.outputFname): tab-delimited, header
    chromosome, position, noOfMendelErrors, one row per retained locus.
    The retained/total row counts are reported on stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(self.inputFname)
    reader.constructColName2IndexFromHeader()
    noOfMendelErrorColumnIndex = reader.getColIndexGivenColHeader(colHeader='N')
    SNPIDColumnIndex = reader.getColIndexGivenColHeader(colHeader='SNP')

    counter = 0
    real_counter = 0
    # "with" guarantees the output handle is flushed and closed
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(['chromosome', 'position', 'noOfMendelErrors'])
        for row in reader:
            SNPID = row[SNPIDColumnIndex]
            noOfMendelErrors = int(row[noOfMendelErrorColumnIndex])
            if noOfMendelErrors <= self.maxNoOfMendelError:
                # named "chromosome" rather than "chr" to avoid shadowing the builtin
                chromosome, position = SNPID.split('_')
                writer.writerow([chromosome, position, noOfMendelErrors])
                real_counter += 1
            counter += 1
    reader.close()
    sys.stderr.write("%s/%s lines outputted.\n"%(real_counter, counter))
def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
    """
    2013.1.29
    Read the mendel-error file and return a dictionary
        locus ID -> list of individual indices involved in a mendel error at that locus.

    mendelErrorFname: matrix file with a header containing a "KID" column.
        The locus ID is taken from the fourth column by position (row[3]).
    individualID2Index: dictionary mapping an individual ID to its index.

    An individual may appear several times in a locus' list if the file lists it
    several times. If a "KID" value is missing from individualID2Index, an error
    is written to stderr and the process exits with status 3.
    """
    sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
    locus_id2individual_index_ls = {}
    reader = MatrixFile(inputFname=mendelErrorFname)
    reader.constructColName2IndexFromHeader()
    # resolve the column once instead of once per row
    individualIDColumnIndex = reader.getColIndexGivenColHeader('KID')
    counter = 0
    for row in reader:
        individual_id = row[individualIDColumnIndex]
        if individual_id not in individualID2Index:
            sys.stderr.write("Individual %s not in individualID2Index.\n"%(individual_id))
            sys.exit(3)
        index = individualID2Index.get(individual_id)
        snp_id = row[3]
        locus_id2individual_index_ls.setdefault(snp_id, []).append(index)
        counter += 1
    del reader
    sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
        (counter, len(locus_id2individual_index_ls)))
    return locus_id2individual_index_ls
def run(self):
    """
    Mark the genotype of self.sampleID missing ("./.") at every VCF locus whose
    alignment evidence is poor, and write per-locus statistics.

    For each record of self.inputFname the reads overlapping the locus are
    fetched from self.alignmentFilename and summarised by
    self.returnLocusLowMapQualityIndicator(). A locus is flagged when
      - its read count is outside
        [alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold], and/or
      - the helper reports a low-mapping-quality indicator.
    The sum of the two indicators is written as "missingReason"; any value > 0
    causes the genotype to be set missing.

    Outputs:
      self.outputFname       every VCF record (modified or not), same meta info and header as the input.
      self.missingStatFname  tab-delimited statistics, one row per locus.
    A summary goes to stderr; the fraction is reported as -1 when the input
    has no records (instead of dividing by zero).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)

    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        # start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
            minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if minDepth <= depth <= maxDepth:
            locusOutOfDepthIndicator = 0    # good
        else:
            locusOutOfDepthIndicator = 1

        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator > 0:
            real_counter += 1
            # modify the VCF record: set this sample's genotype missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
        # 2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    statWriter.close()
    writer.close()
    alignmentFile.close()

    # same convention as outputSwitchPointInfo(): -1 when there is nothing to divide by
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
def setup(self, **keywords):
    """
    2013.11.24
    Run before anything else is run: initialise the bookkeeping attributes and
    write the headers of the main output (self.writer) and of the side output
    (self.locusIntervalDeltaOutputFname).

    Design notes on the data structures:
        noOfTotalIntervals = 0
        noOfCrossChromosomeIntervals = 0
        targetChromosome 2 mapData
            intervalDeltaList => median
            orientation (queryStrand)
                0=forward
                1=backward
            mean => using 80% of data (sort the delta list, then take 10% to 90% of the list)
            stddev => if stddev is zero, use 1.
        locusKey (oldChromosome, oldStart, oldStop) 2 mapData
            targetCoordinate (newChromosome, newStart, newStop).
            leftIntervalDelta: None = boundary
            rightIntervalDelta: None = boundary, 10E10 = cross chromosome
            probability: max( P(SNP_i_left_interval), P(SNP_i_right_interval)).
                P(interval):
                    If one interval is on the same chromosome, P(target-chromosome)*P(interval delta size)
                    If not, P(chromosome-cross event).

    Not implemented: for a whole genome input (rather than a window), an RBTree
    of windows should be used to counter regional effect.
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    # floats, so that divisions on these counters are not integer divisions
    self.noOfTotalIntervals = 0.0
    self.noOfCrossChromosomeIntervals = 0.0

    self.targetChromosome2mapData = {}
    self.locusKey2mapData = {}
    self.previousLocusData = None

    # both outputs share these columns and differ only in the last one
    coordinateColumns = ['oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
        'newChromosome', 'newStart', 'newStop']

    # main output
    self.writer.writerow(coordinateColumns + ['mapPvalue'])
    self.invariantPData.headerOutputted = True    # avoid double header output

    # side output
    self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname, openMode='w', delimiter='\t')
    self.sideOutput.writeHeader(coordinateColumns + ['intervalDelta'])
def run(self):
    """
    Report genotype concordance between entries that share the same SNP position.

    self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData maps a
    position key (whose first two items are chromosome and position) to a list
    of genotype vectors. For every position with more than one vector, every
    unordered pair of vectors is compared call by call (the 'GT' entry):
      - calls where either side is 'NA' are skipped,
      - two calls match when SNP.nt2number maps them to the same number.
    One output row is written per pair:
        chromosome, position, noOfMatches, noOfTotal, concordance
    with concordance = noOfMatches/noOfTotal, or -1 when no call was comparable.

    Positions are processed in sorted key order. A summary goes to stderr; the
    fraction is reported as -1 when there are no positions at all (instead of
    dividing by zero).
    """
    # function-scope import: this block cannot see the file's import section
    from itertools import combinations

    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData

    writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
    writer.writeHeader(['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'])

    counter = 0
    real_counter = 0
    no_of_pairs = 0
    # sorted() works whether keys() returns a list or a view
    for key in sorted(snp_pos2genotypeVectorLs):
        counter += 1
        chromosome, position = key[:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        if len(genotypeVectorLs) <= 1:
            continue
        real_counter += 1
        # combinations() yields the pairs (k, l) with k < l in the same order as a nested index loop
        for genotypeVector0, genotypeVector1 in combinations(genotypeVectorLs, 2):
            no_of_pairs += 1
            noOfMatches = 0
            noOfTotal = 0
            for j, callData0 in enumerate(genotypeVector0):
                call1 = callData0['GT']
                # indexed (not zipped) so that a shorter second vector still raises IndexError
                call2 = genotypeVector1[j]['GT']
                if call1 != 'NA' and call2 != 'NA':
                    noOfTotal += 1
                    if SNP.nt2number[call1] == SNP.nt2number[call2]:
                        noOfMatches += 1
            if noOfTotal > 0:
                concordance = float(noOfMatches)/float(noOfTotal)
            else:
                concordance = -1
            writer.writerow([chromosome, position, noOfMatches, noOfTotal, concordance])
    writer.close()

    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
        fraction, no_of_pairs))
def outputGenotypeMarkedMissingStat(self, outputFname=None, \
    individual_index2no_of_genotype_marked_missing=None,\
    individualIDList=None):
    """
    2013.07.24
    Write, for every individual index in
    individual_index2no_of_genotype_marked_missing, a tab-delimited row
        individualID, noOfGenotypesMarkedMissing
    to outputFname. The ID is looked up in individualIDList by index.

    Nothing is written when outputFname is empty/None or when the dictionary
    is None.
    """
    if not outputFname or individual_index2no_of_genotype_marked_missing is None:
        return

    writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    writer.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
    for individualIndex, noOfMissing in individual_index2no_of_genotype_marked_missing.iteritems():
        writer.writerow([individualIDList[individualIndex], noOfMissing])
    writer.close()
def readInCoordinateMap(self, coordinateMapFname=None):
    """
    2013.07.11
    Read the coordinate map file and return a dictionary
        (queryChromosome, queryStart) -> list of PassingData
    where each PassingData carries strand, oldRefBase, oldAltBase,
    newChromosome, newStart, newStop and newRefBase of one input row.
    queryStart, newRefStart and newRefStop are converted to int.

    Columns named in the notes for this file:
        querySNPID queryStrand queryChromosome queryStart queryStop queryRefBase queryAltBase
        queryAlignmentSpan queryAlignmentStart queryAlignmentStop
        newChr newRefStart newRefStop newRefBase
        targetAlignmentSpan targetAlignmentStart targetAlignmentStop
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (coordinateMapFname))
    reader = MatrixFile(inputFname=coordinateMapFname)
    reader.constructColName2IndexFromHeader()

    # column positions, resolved once from the header
    oldChromosomeIndex = reader.getColIndexGivenColHeader("queryChromosome")
    oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
    strandIndex = reader.getColIndexGivenColHeader("queryStrand")
    oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
    oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")
    newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
    newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
    newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
    newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")

    oldCoordinate2newCoordinateDataLs = {}
    noOfRows = 0
    for row in reader:
        key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
        newCoordinateData = PassingData(strand=row[strandIndex],
            oldRefBase=row[oldRefBaseIndex],
            oldAltBase=row[oldAltBaseIndex],
            newChromosome=row[newChromosomeIndex],
            newStart=int(row[newStartIndex]),
            newStop=int(row[newStopIndex]),
            newRefBase=row[newRefBaseIndex])
        oldCoordinate2newCoordinateDataLs.setdefault(key, []).append(newCoordinateData)
        noOfRows += 1
    del reader
    sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
        (len(oldCoordinate2newCoordinateDataLs), noOfRows))
    return oldCoordinate2newCoordinateDataLs
def run(self):
    """
    2013.07.24
    Transform every row of self.inputFname and write it, tab-delimited, to
    self.outputFname. The transformation depends on self.run_type:
        2     self.processRow_ChangeChromosomeIDToX(row)
        3     self.processRow_addPositionStartBase(row)
        4     self.markGenotypeMissingIfInvolvedInMendelError(...) (2013.2.1)
        else  self.processRow(row)

    For run_type 4 the TFAM file (self.tfamFname) and the mendel-error file
    (self.mendelErrorFname) are read first, and the per-individual count of
    genotypes marked missing is handed to self.outputGenotypeMarkedMissingStat()
    together with self.markMissingStatFname. For other run types that call
    receives None for the counts.

    The output file is opened in a "with" block so it is flushed and closed
    deterministically. The number of rows written is reported on stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(inputFname=self.inputFname)
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        counter = 0
        if self.run_type == 4:    # 2013.2.1
            tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
            individualID2Index = tfamIndividualData.individualID2Index
            individualIDList = tfamIndividualData.individualIDList
            locus_id2individual_index_ls = self.getMendelErrorIndividualLocusData(mendelErrorFname=self.mendelErrorFname, \
                individualID2Index=individualID2Index)
            individual_index2no_of_genotype_marked_missing = {}
        else:
            individualID2Index = None
            individualIDList = None
            locus_id2individual_index_ls = None
            individual_index2no_of_genotype_marked_missing = None

        for row in reader:
            if self.run_type == 2:
                new_row = self.processRow_ChangeChromosomeIDToX(row)
            elif self.run_type == 3:
                new_row = self.processRow_addPositionStartBase(row)
            elif self.run_type == 4:
                new_row = self.markGenotypeMissingIfInvolvedInMendelError(row=row, \
                    locus_id2individual_index_ls=locus_id2individual_index_ls,\
                    individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing)
            else:
                new_row = self.processRow(row)
            writer.writerow(new_row)
            counter += 1
    sys.stderr.write("%s lines modified.\n"%(counter))
    reader.close()

    self.outputGenotypeMarkedMissingStat(outputFname=self.markMissingStatFname, \
        individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing, \
        individualIDList=individualIDList)
def run(self):
    """
    Write, for decreasing switch frequency, how much of the genome and how many
    loci remain once all regions up to and including the current one are removed.

    Input statistics come from self.readInStats(inputFname=self.inputFname),
    which supplies data_matrix (rows of [switchFrequency, regionSpan, noOfLoci]),
    totalSpan and totalNoOfLoci. The rows are sorted in place in descending
    order. Each output row is
        maxSwitchFrequency, genomeCovered, genomeCoveredFraction, noOfLoci, noOfLociFraction
    where genomeCovered/noOfLoci are the totals minus everything seen so far.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    switchPointData = self.readInStats(inputFname=self.inputFname)

    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, openMode='w')
    writer.writeHeader([
        "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
        "noOfLoci", 'noOfLociFraction'
    ])

    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci
    data_matrix = switchPointData.data_matrix
    # highest switch frequency first
    data_matrix.sort(reverse=True)

    # running remainders replace the per-row cumulative lists
    remainingSpan = totalSpan
    remainingNoOfLoci = totalNoOfLoci
    for switchFrequency, regionSpan, noOfLoci in data_matrix:
        remainingSpan = remainingSpan - regionSpan
        remainingNoOfLoci = remainingNoOfLoci - noOfLoci
        writer.writerow([switchFrequency, remainingSpan, remainingSpan/float(totalSpan),\
            remainingNoOfLoci, remainingNoOfLoci/float(totalNoOfLoci)])
    writer.close()
    sys.stderr.write(".\n")
def getIndividualID2IndexFromTFAMFile(self, tfamFname=None):
    """
    2013.07.24 return individualIDList as well
    2013.1.29
    Read a TFAM file and return PassingData with
        individualID2Index: individual ID (second column, row[1]) -> 0-based row index
        individualIDList:   individual IDs in file order (one entry per row)

    The index is the row position, so individualIDList[index] is always the ID
    that was stored with that index. Deriving the index from the dictionary size
    drifts from the row position as soon as an ID occurs twice, after which
    every later individual is assigned the wrong index. If an ID is repeated,
    the dictionary keeps the index of its last row.

    The number of distinct IDs is reported on stderr.
    """
    sys.stderr.write("Getting individualID2Index from tfam file %s ..."%(tfamFname))
    individualID2Index = {}
    individualIDList = []
    reader = MatrixFile(inputFname=tfamFname)
    for rowIndex, row in enumerate(reader):
        individualID = row[1]
        individualID2Index[individualID] = rowIndex
        individualIDList.append(individualID)
    del reader
    sys.stderr.write(" %s individuals.\n"%(len(individualID2Index)))
    return PassingData(individualID2Index=individualID2Index, individualIDList=individualIDList)
def readInDataToPlot(self, input_fname, sampling_probability=1.0):
    """
    2015.01.23 added argument sampling_probability to sub-sample data
    2013.07.11 use MatrixFile to read in the file
    2009-5-20 add the column index into the column header for easy picking
    2009-3-13 wrap the float conversion part into try...except to report what goes wrong
    2009-3-13

    Load input_fname into self.list_2d and refresh the view and plot.

    input_fname: matrix file; the first row is the header. The first two columns
        are kept as strings, all remaining columns are converted to float.
    sampling_probability: probability of keeping each data row. Sub-sampling
        only happens for values strictly between 0 and 1; values outside [0, 1]
        are reset to 1.0, and 0 as well as 1 keep every row.

    Side effects: sets self.column_header (each label prefixed with its column
    index), self.column_types, self.column_editable_flag_ls and self.list_2d,
    then calls self.setupColumns(), updates the window title and app bar and
    calls self.plotXY().

    A row whose numeric part cannot be converted is reported on stderr and
    still appended with its unconverted values. Only Exception subclasses are
    caught, so KeyboardInterrupt and SystemExit propagate.
    """
    if sampling_probability > 1 or sampling_probability < 0:
        sampling_probability = 1.0
    reader = MatrixFile(inputFname=input_fname)
    self.column_header = reader.next()
    # prefix each label with its index, in place
    for i, label in enumerate(self.column_header):
        self.column_header[i] = '%s %s'%(i, label)
    no_of_cols = len(self.column_header)
    self.column_types = [str]*2 + [float]*(no_of_cols-2)
    self.column_editable_flag_ls = [True, True] + [False]*(no_of_cols-2)
    self.list_2d = []
    for row in reader:
        if sampling_probability > 0 and sampling_probability < 1:
            if random.random() > sampling_probability:    # skip
                continue
        float_part = row[2:]
        try:
            # an eager list: conversion errors surface inside the try, and the
            # result can be concatenated to a list below
            float_part = [float(value) for value in float_part]
        except Exception:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
        new_row = row[:2] + float_part
        self.list_2d.append(new_row)
    reader.close()
    self.setupColumns(self.treeview_matrix)
    # update status to reflect the input filename
    self.app1.set_title(os.path.basename(input_fname))
    self.app1_appbar1.push(input_fname)
    self.plotXY(self.ax, self.canvas, self.liststore, self.plot_title)
def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
    """
    2014.01.04
    Read the lift-over map p-value file and return a dictionary
        (newChromosome, newStart, newStop) -> lowest mapPvalue seen for that key.

    File columns:
        oldChromosome, oldStart, oldStop, oldStrand, newChromosome, newStart, newStop, mapPvalue
    newStart/newStop are converted to int, mapPvalue to float. The oldStrand
    column is read but does not take part in the key or the value.
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (liftOverLocusMapPvalueFname))
    reader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
    reader.constructColName2IndexFromHeader()
    strandIndex = reader.getColIndexGivenColHeader("oldStrand")
    newChromosomeIndex = reader.getColIndexGivenColHeader("newChromosome")
    newStartIndex = reader.getColIndexGivenColHeader("newStart")
    newStopIndex = reader.getColIndexGivenColHeader("newStop")
    mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")

    locusNewID2mapPvalue = {}
    noOfRows = 0
    for row in reader:
        strand = row[strandIndex]    # read only; not used below
        key = (row[newChromosomeIndex], int(row[newStartIndex]), int(row[newStopIndex]))
        mapPvalue = float(row[mapPvalueIndex])
        # first sighting, or a lower value than the one stored: keep it
        if key not in locusNewID2mapPvalue or mapPvalue < locusNewID2mapPvalue[key]:
            locusNewID2mapPvalue[key] = mapPvalue
        noOfRows += 1
    del reader
    sys.stderr.write(
        "%s unique loci with map p-value out of %s total loci.\n" %
        (len(locusNewID2mapPvalue), noOfRows))
    return locusNewID2mapPvalue
def readInStats(self, inputFname=None):
    """
    2013.07.15
    Read the switch-point statistics file and return PassingData with
        data_matrix:   list of [switchFrequency (float), regionSpan (int), noOfLoci (int)]
        totalSpan:     sum of regionSpan over the retained rows
        totalNoOfLoci: sum of noOfLoci over the retained rows

    The values come from the columns
    "noOfSwitchPoints_by_noOfLociWithUniqueHit", "regionSpan" and "#sitesInInput2".
    Rows where any of the three cells is empty are skipped.
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    switchFrequencyIndex = reader.getColIndexGivenColHeader(
        "noOfSwitchPoints_by_noOfLociWithUniqueHit")
    regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
    noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")

    data_matrix = []
    totalSpan = 0
    totalNoOfLoci = 0
    noOfRows = 0
    for row in reader:
        noOfRows += 1
        rawSwitchFrequency = row[switchFrequencyIndex]
        rawRegionSpan = row[regionSpanIndex]
        rawNoOfLoci = row[noOfLociIndex]
        if not (rawSwitchFrequency and rawRegionSpan and rawNoOfLoci):
            # at least one cell is empty
            continue
        switchFrequency = float(rawSwitchFrequency)
        # via float first, so values written like "12.0" are accepted
        regionSpan = int(float(rawRegionSpan))
        noOfLoci = int(float(rawNoOfLoci))
        data_matrix.append([switchFrequency, regionSpan, noOfLoci])
        totalSpan += regionSpan
        totalNoOfLoci += noOfLoci
    reader.close()
    sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
        (len(data_matrix), noOfRows, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan, totalNoOfLoci=totalNoOfLoci)
def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
    """
    2013.07.11
    Output the switch point (adjacent sites mapped to two different chromosomes,
    or changing direction on the same chromosome) information.

    querySNPID2NewReferenceCoordinateLs: dictionary, query SNP ID -> list of
        coordinate objects exposing queryChromosome, queryStart, queryStop,
        newChr, newRefStart and newRefStop.
    outputFname: tab-delimited output, one row per old chromosome:
        oldChromosome, noOfSwitchPoints, regionSpan, noOfLociWithUniqueHit,
        noOfSwitchesPerLocus, noOfLoci

    Steps:
      1. Re-key the input by old coordinate (queryChromosome, queryStart,
         queryStop); all coordinates of one query SNP go under the key of its
         first coordinate.
      2. Walk the old coordinates in sorted order. Loci with more than one new
         coordinate are counted and skipped. For uniquely mapped loci a switch
         point is counted when the new chromosome differs from the previous
         uniquely mapped locus on the same old chromosome, or when the
         direction of travel (sign of the newRefStart difference) flips.
      3. Write the per-chromosome statistics. noOfSwitchesPerLocus is
         noOfSwitchPoints/noOfLociWithUniqueHit, or -1 without unique hits.

    The "noOfLoci" column holds the per-chromosome locus count
    (switchData.noOfLoci), not the total over all chromosomes. The two agree
    when the input covers a single old chromosome.
    """
    sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
    oldCoordinateKey2newCoordinateDataLs = {}
    counter = 0
    for newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.values():
        oldCoordinateKey = None
        counter += len(newRefCoordinateLs)
        for newRefCoordinate in newRefCoordinateLs:
            if oldCoordinateKey is None:
                # the first coordinate of a query SNP defines the key for all of them
                oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart, newRefCoordinate.queryStop)
            oldCoordinateKey2newCoordinateDataLs.setdefault(oldCoordinateKey, []).append(newRefCoordinate)
    sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n"%(len(oldCoordinateKey2newCoordinateDataLs),\
        counter))

    sys.stderr.write("Finding switch points ...")
    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0
    oldChromosome2SwitchData = {}
    # sorted() works whether keys() returns a list or a view
    for oldCoordinateKey in sorted(oldCoordinateKey2newCoordinateDataLs):
        counter += 1
        newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)
        oldChromosome = oldCoordinateKey[0]
        if oldChromosome not in oldChromosome2SwitchData:
            oldChromosome2SwitchData[oldChromosome] = PassingData(noOfLociWithUniqueHit=0, noOfLoci=0, \
                spanStart=oldCoordinateKey[1], \
                spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,\
                previousNewChromosome=None, previousNewRefStart=None,\
                previousNewRefStop=None,\
                previousOrientationOnNewChromosome=None)
        switchData = oldChromosome2SwitchData[oldChromosome]
        switchData.noOfLoci += 1

        if len(newRefCoordinateLs) > 1:
            # ambiguous mapping: counted, but not used for switch detection or span
            noOfRecordsWithMultiNewCoords += 1
            continue

        switchData.noOfLociWithUniqueHit += 1
        newRefCoordinate = newRefCoordinateLs[0]
        if switchData.previousNewChromosome is not None:
            if newRefCoordinate.newChr != switchData.previousNewChromosome:
                switchData.noOfSwitchPoints += 1
                # reset the orientation
                switchData.previousOrientationOnNewChromosome = None
            else:
                # on the same chromosome
                currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart) >= 0
                if switchData.previousOrientationOnNewChromosome is not None:
                    if currentOrientation != switchData.previousOrientationOnNewChromosome:
                        switchData.noOfSwitchPoints += 1
                switchData.previousOrientationOnNewChromosome = currentOrientation

        # adjust the spanStop
        if newRefCoordinate.queryStop > switchData.spanStop:
            switchData.spanStop = newRefCoordinate.queryStop

        switchData.previousNewChromosome = newRefCoordinate.newChr
        switchData.previousNewRefStart = newRefCoordinate.newRefStart
        switchData.previousNewRefStop = newRefCoordinate.newRefStop
        real_counter += 1

    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))

    sys.stderr.write("Outputting switch points of %s old chromosomes ..."%(len(oldChromosome2SwitchData)))
    statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit", "noOfSwitchesPerLocus", "noOfLoci"]
    statFile.writeHeader(header)
    noOfTotalSwitchPoints = 0
    noOfTotalLoci = 0
    for oldChromosome, switchData in oldChromosome2SwitchData.items():
        if switchData.noOfLociWithUniqueHit > 0:
            switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
        else:
            switchPointFraction = -1
        data_row = [oldChromosome, switchData.noOfSwitchPoints, switchData.spanStop-switchData.spanStart+1, \
            switchData.noOfLociWithUniqueHit, switchPointFraction, switchData.noOfLoci]
        statFile.writerow(data_row)
        noOfTotalSwitchPoints += switchData.noOfSwitchPoints
        noOfTotalLoci += switchData.noOfLociWithUniqueHit
    statFile.close()
    sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n'%(noOfTotalSwitchPoints, noOfTotalLoci))
#!/usr/bin/env python
"""
Sum the spans of all intervals in a tab-delimited interval file.

Usage: <this script> inputFname

Each data row contributes int(column 3) - int(column 2) + 1. Rows whose first
field starts with "#" and empty rows are skipped. The total is printed to stdout.

Inputs this script has been pointed at before:
    ~/RefGenomes/dustPlus10_M1-22XY.bed.gz
    ~/script/varcmp/scripts/LCR-hs37d5.bed.gz
    ~/RefGenomes/dust_M1-22XY.bed.gz
    /illumina/scratch/CompetitiveAnalysis/CAG/Data/AnnotDB/Repeats/SegDups/genomicSuperDups_hg19.bed
    ~/RefGenomes/dustPlus10_M1-22XY.overlap.genomicSuperDups_hg19.merged.bed
"""
import os, sys

if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s inputFname\n" % (sys.argv[0]))
    sys.exit(2)
inputFname = sys.argv[1]

# make the project packages importable; ~/script ends up ahead of ~/lib/python
sys.path.insert(0, os.path.expanduser('~/lib/python'))
sys.path.insert(0, os.path.expanduser('~/script'))

from pymodule import utils
from pymodule import MatrixFile

reader = MatrixFile(inputFname=inputFname, openMode='r', delimiter='\t')
span = 0
for row in reader:
    # startswith() also copes with an empty first field
    if not row or row[0].startswith('#'):
        continue
    # NOTE(review): "+ 1" treats start/stop as inclusive coordinates; standard BED
    # intervals are half-open, where the length is stop - start -- confirm the
    # convention of the input files.
    subSpan = int(row[2]) - int(row[1]) + 1
    span += subSpan
reader.close()
print("span is %s \n"%(span))