def scanSequence(seqList, dirName):
    '''Given a sequence --> get all reads that have that sequence

    seqList : the query handed to the index search.  Despite the name it is
              used as ONE query value; it is not iterated over.
    dirName : directory searched (via cg.recurseDir) for the '.sequence' file.

    Returns the list of stripped lines for which the index check function
    returns 0, read consecutively from the position the index search ends at.
    Returns 1 (after printing a message) when more than one '.sequence' file
    is found.  If no file is found, fileNames[0] raises IndexError.
    '''
    fileNames = cg.recurseDir(dirName, end = '.sequence')

    if len(fileNames) > 1:
        #single-argument print(...) emits the same text under python 2 and 3
        print(fileNames)
        print('there is more than one sequence file in this directory')
        return 1
    fN = fileNames[0]

    seq = seqList
    fIndex = cgIndex.lineIndex(fN, header = False)
    try:
        fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
        fIndex.binarySearch(seq) #places file pointer at beginning of sequence line

        #extend and report
        fIndex.extendUp(seq) #presumably rewinds to the first matching line -- confirm in cgIndex

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(seq, line) != 0:
                break
            finalReads.append(line.strip())

        #returned here as well when the matches run to the end of the file;
        #previously that case fell off the end of the function and gave None
        return finalReads
    finally:
        #always release the handle the index is reading from
        fIndex.file.close()
def scanVectorsFile(fN, tccList):
    '''Given tcc list --> scan one wig file and return coord:value...

    fN      : path of the wig file handed to cgIndex.lineIndex.
    tccList : iterable of tcc strings; each is split with cg.tccSplit into
              chrom, strand, start, end.

    Returns a dict {coordinate: value}.  For every tcc the file is read from
    the position the index search ends at; each line's span (columns 1 and 2)
    is clipped to the tcc and every coordinate in range(lBeg, lEnd) is given
    the integer part of column 3.  Reading stops at the first line that
    extends past the end of the tcc.
    '''
    coordDict = {} # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

            stop = False
            for line in fIndex.file:
                fields = cg.ss(line) #split the line once instead of once per column
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0]) #keep only the integer part

                #clip the line's span to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True #this line reaches past the tcc: it is the last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            #one handle is opened per tcc; release it even if parsing fails
            fIndex.file.close()

    return coordDict
def scanSequence(seqList, dirName):
    '''Given a sequence --> get all reads that have that sequence

    seqList : the query handed to the index search.  Despite the name it is
              used as ONE query value; it is not iterated over.
    dirName : directory searched (via cg.recurseDir) for the '.sequence' file.

    Returns the list of stripped lines for which the index check function
    returns 0, read consecutively from the position the index search ends at.
    Returns 1 (after printing a message) when more than one '.sequence' file
    is found.  If no file is found, fileNames[0] raises IndexError.
    '''
    fileNames = cg.recurseDir(dirName, end='.sequence')

    if len(fileNames) > 1:
        #single-argument print(...) emits the same text under python 2 and 3
        print(fileNames)
        print('there is more than one sequence file in this directory')
        return 1
    fN = fileNames[0]

    seq = seqList
    fIndex = cgIndex.lineIndex(fN, header=False)
    try:
        fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
        fIndex.binarySearch(seq)  #places file pointer at beginning of sequence line

        #extend and report
        fIndex.extendUp(seq)  #presumably rewinds to the first matching line -- confirm in cgIndex

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(seq, line) != 0:
                break
            finalReads.append(line.strip())

        #returned here as well when the matches run to the end of the file;
        #previously that case fell off the end of the function and gave None
        return finalReads
    finally:
        #always release the handle the index is reading from
        fIndex.file.close()
def svCoord(tccList, config = None):
    '''Given tcc list --> scan Organism wig files and return coord:value...

    tccList : iterable of tcc strings; each is split with cg.tccSplit into
              chrom, strand, start, end.
    config  : passed through c.getConfig; the resulting object supplies
              'organism', 'wigSetDir', 'wigSetName' and 'wigChromSplit'.

    When 'wigChromSplit' is the string 'True' the wig file is
    <wigSetDir>/<wigSetName>.<chrom>.<strand>.wig, otherwise
    <wigSetDir>/Merge.<organism lowercased>.<strand>.wig.

    Returns a dict {coordinate: value}.  Each line's start (column 1) is
    shifted by +1, the span is clipped to the tcc, and every coordinate from
    lBeg to lEnd INCLUSIVE is given the integer part of column 3.  Reading
    stops at the first line that extends past the end of the tcc.
    '''
    #init
    config = c.getConfig(config)
    org = config.conf['organism']
    wigDir = config.conf['wigSetDir']
    wigSetName = config.conf['wigSetName']
    #the flag is stored as text; only the exact string 'True' enables splitting
    splitIntoChroms = (config.conf['wigChromSplit'] == 'True')

    coordDict = {} # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        if splitIntoChroms:
            fN = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            fN = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand)

        fIndex = cgIndex.lineIndex(fN, header = True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

            stop = False
            for line in fIndex.file:
                fields = cg.ss(line) #split the line once instead of once per column
                lBeg = int(fields[1]) + 1
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0]) #keep only the integer part

                #clip the line's span to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True #this line reaches past the tcc: it is the last one needed

                for i in range(lBeg, lEnd + 1):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            #close the file and the index after use, also when a line fails to parse
            fIndex.close()

    return coordDict
def save(self, outFN=None):
    '''Write the data file back out with the selected attributes updated.

    outFN : output path; defaults to self._dataFileName (overwrite in place).
            When a range is specified, '.range.<first>.<last>' is appended
            to the output name.

    Every line of the data file (or, with a range, every line from the
    range start up to the last id in range) is rewritten: for each name in
    self._selectedAttNames the stored value is cast with
    self._attName_casteToFxn and placed at its column via lineUpdate.
    With a range, an extra '<outFN>.exitSignal' file containing 'DONE' is
    written afterwards as the exit signal for parallel processes.
    '''
    if outFN is None:
        outFN = self._dataFileName

    if self._rangeSpecified:
        outFN += '.range.%s.%s' % (self._rangeSpecified[0],
                                   self._rangeSpecified[1])

    #skip to start of specified range
    if self._rangeSpecified:
        fIndex = cgIndex.lineIndex(self._dataFileName)
        fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
        fIndex.binarySearch(self._rangeSpecified[0])
        f = fIndex.file
    else:
        f = open(self._dataFileName, 'r')

    #create new file contents (fully in memory, so outFN may be the input file)
    newLines = []
    try:
        for line in f:
            ls = line.strip().split('\t')
            rowID = int(ls[0])  #id is always first slot

            #NOTE: rows are written whether or not they passed the load
            #conditions; restricting to self._selectedIDs was considered
            #by the original author and left disabled.

            #stop checking for ids once out of range
            if self._rangeSpecified and rowID > self._rangeSpecified[1]:
                break

            #save the rest
            for attName in self._selectedAttNames:
                newVal = self._attName_casteToFxn[attName](
                    self._attName_id_value[attName][rowID])
                ls = lineUpdate(ls, newVal,
                                self._attName_columnPosition[attName])

            newLines.append('%s\n' % '\t'.join(ls))
    finally:
        #release the input handle even if a row fails to parse
        f.close()

    #output file
    outFile = open(outFN, 'w')
    try:
        outFile.writelines(newLines)
    finally:
        outFile.close()

    #exit signal for parallel processes
    if self._rangeSpecified:
        signalFile = open(outFN + '.exitSignal', 'w')
        try:
            signalFile.write('DONE')
        finally:
            signalFile.close()
def save(self, outFN = None):
    '''Write the data file back out with the selected attributes updated.

    outFN : output path; defaults to self._dataFileName (overwrite in place).
            When a range is specified, '.range.<first>.<last>' is appended
            to the output name.

    Every line of the data file (or, with a range, every line from the
    range start up to the last id in range) is rewritten: for each name in
    self._selectedAttNames the stored value is cast with
    self._attName_casteToFxn and placed at its column via lineUpdate.
    With a range, an extra '<outFN>.exitSignal' file containing 'DONE' is
    written afterwards as the exit signal for parallel processes.
    '''
    if outFN is None:
        outFN = self._dataFileName

    if self._rangeSpecified:
        outFN += '.range.%s.%s' % (self._rangeSpecified[0], self._rangeSpecified[1])

    #skip to start of specified range
    if self._rangeSpecified:
        fIndex = cgIndex.lineIndex(self._dataFileName)
        fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
        fIndex.binarySearch(self._rangeSpecified[0])
        f = fIndex.file
    else:
        f = open(self._dataFileName, 'r')

    #create new file contents (fully in memory, so outFN may be the input file)
    newLines = []
    try:
        for line in f:
            ls = line.strip().split('\t')
            rowID = int(ls[0]) #id is always first slot

            #NOTE: rows are written whether or not they passed the load
            #conditions; restricting to self._selectedIDs was considered
            #by the original author and left disabled.

            #stop checking for ids once out of range
            if self._rangeSpecified and rowID > self._rangeSpecified[1]:
                break

            #save the rest
            for attName in self._selectedAttNames:
                newVal = self._attName_casteToFxn[attName](self._attName_id_value[attName][rowID])
                ls = lineUpdate(ls, newVal, self._attName_columnPosition[attName])

            newLines.append('%s\n' % '\t'.join(ls))
    finally:
        #release the input handle even if a row fails to parse
        f.close()

    #output file
    outFile = open(outFN, 'w')
    try:
        outFile.writelines(newLines)
    finally:
        outFile.close()

    #exit signal for parallel processes
    if self._rangeSpecified:
        signalFile = open(outFN + '.exitSignal', 'w')
        try:
            signalFile.write('DONE')
        finally:
            signalFile.close()
def scanVectorsOrganism(tccList, config=None):
    '''Given tcc list --> scan Organism wig files and return coord:value...

    tccList : iterable of tcc strings; each is split with cg.tccSplit into
              chrom, strand, start, end.
    config  : passed through c.getConfig; supplies 'organism'.  The wig
              directory comes from the 'wig<organism>' entry of 'Main.conf'.

    The file scanned for a tcc is
    <wigDir>/Merge.<organism lowercased>.<strand>.wig.<chrom>.wig.

    Returns a dict {coordinate: value}.  Each line's span (columns 1 and 2)
    is clipped to the tcc and every coordinate in range(lBeg, lEnd) is given
    the integer part of column 3.  Reading stops at the first line that
    extends past the end of the tcc.
    '''
    config = c.getConfig(config)

    #loop-invariant: resolve the wig directory once instead of re-reading
    #Main.conf for every tcc
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header=True)  #!!!there actually is a header...have to deal with this...
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  #places file pointer at beginning of tcc as beginning

            stop = False
            for line in fIndex.file:
                fields = cg.ss(line)  #split the line once instead of once per column
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])  #keep only the integer part

                #clip the line's span to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  #this line reaches past the tcc: it is the last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            #one handle is opened per tcc; release it even if parsing fails
            fIndex.file.close()

    return coordDict
def scanVectorsOrganism(tccList, config = None):
    '''Given tcc list --> scan Organism wig files and return coord:value...

    tccList : iterable of tcc strings; each is split with cg.tccSplit into
              chrom, strand, start, end.
    config  : passed through c.getConfig; supplies 'organism'.  The wig
              directory comes from the 'wig<organism>' entry of 'Main.conf'.

    The file scanned for a tcc is
    <wigDir>/Merge.<organism lowercased>.<strand>.wig.<chrom>.wig.

    Returns a dict {coordinate: value}.  Each line's span (columns 1 and 2)
    is clipped to the tcc and every coordinate in range(lBeg, lEnd) is given
    the integer part of column 3.  Reading stops at the first line that
    extends past the end of the tcc.
    '''
    config = c.getConfig(config)

    #loop-invariant: resolve the wig directory once instead of re-reading
    #Main.conf for every tcc
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {} # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

            stop = False
            for line in fIndex.file:
                fields = cg.ss(line) #split the line once instead of once per column
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0]) #keep only the integer part

                #clip the line's span to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True #this line reaches past the tcc: it is the last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            #one handle is opened per tcc; release it even if parsing fails
            fIndex.file.close()

    return coordDict
def scanCoord(tcc, dirName):
    '''Given a tcc --> get the reads from the matching '.starts' index that are in range

    tcc     : coordinate string; split with cg.tccSplit into
              chrom, strand, start, end.
    dirName : directory searched (via cg.recurseDir) for '.starts' files.

    The index file used is the LAST listed file whose name contains
    '<chrom>.<strand>'.

    Returns the list of stripped lines for which the range check function
    returns 0, read consecutively after the search/extend step.
    Returns 0 (after printing a message) when no index file matches.
    '''
    fileNames = cg.recurseDir(dirName, end='.starts')

    #get name of file for index
    chrom, strand, start, end = cg.tccSplit(tcc)
    nameCheck = '%s.%s' % (chrom, strand)
    fN = None
    for fileName in fileNames:
        if nameCheck in fileName:
            fN = fileName  #no break: the last match wins

    if fN is None:
        #prints exactly what "print 'No Index file for', nameCheck" printed,
        #in a form that also parses under python 3
        print('No Index file for %s' % nameCheck)
        return 0

    fIndex = cgIndex.lineIndex(fN, header=False)
    try:
        fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
        fIndex.binarySearch(tcc, skipEnd=True)  #places file pointer at beginning of sequence line

        #Check if you need to move down one line
        checkLine = fIndex.getLineFromByte(fIndex.currentByte)
        #the range check replaces the start check from here on; it is also
        #the function used while extending
        fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction)
        if fIndex.checkFunction(tcc, checkLine) != 0:
            fIndex.file.readline()
            fIndex.currentByte = fIndex.file.tell()

        #Now extend up until in range, down until in range --> return reads.
        fIndex.extendUp(tcc)

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(tcc, line) != 0:
                break
            finalReads.append(line.strip())

        #returned here as well when the in-range reads run to the end of the
        #file; previously that case fell off the end and gave None
        return finalReads
    finally:
        #always release the handle the index is reading from
        fIndex.file.close()
def scanCoord(tcc, dirName):
    '''Given a tcc --> get the reads from the matching '.starts' index that are in range

    tcc     : coordinate string; split with cg.tccSplit into
              chrom, strand, start, end.
    dirName : directory searched (via cg.recurseDir) for '.starts' files.

    The index file used is the LAST listed file whose name contains
    '<chrom>.<strand>'.

    Returns the list of stripped lines for which the range check function
    returns 0, read consecutively after the search/extend step.
    Returns 0 (after printing a message) when no index file matches.
    '''
    fileNames = cg.recurseDir(dirName, end = '.starts')

    #get name of file for index
    chrom, strand, start, end = cg.tccSplit(tcc)
    nameCheck = '%s.%s' % (chrom, strand)
    fN = None
    for fileName in fileNames:
        if nameCheck in fileName:
            fN = fileName #no break: the last match wins

    if fN is None:
        #prints exactly what "print 'No Index file for', nameCheck" printed,
        #in a form that also parses under python 3
        print('No Index file for %s' % nameCheck)
        return 0

    fIndex = cgIndex.lineIndex(fN, header = False)
    try:
        fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
        fIndex.binarySearch(tcc, skipEnd = True) #places file pointer at beginning of sequence line

        #Check if you need to move down one line
        checkLine = fIndex.getLineFromByte(fIndex.currentByte)
        #the range check replaces the start check from here on; it is also
        #the function used while extending
        fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction)
        if fIndex.checkFunction(tcc, checkLine) != 0:
            fIndex.file.readline()
            fIndex.currentByte = fIndex.file.tell()

        #Now extend up until in range, down until in range --> return reads.
        fIndex.extendUp(tcc)

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(tcc, line) != 0:
                break
            finalReads.append(line.strip())

        #returned here as well when the in-range reads run to the end of the
        #file; previously that case fell off the end and gave None
        return finalReads
    finally:
        #always release the handle the index is reading from
        fIndex.file.close()
def load(self, attNames, paraInfo = None, idRange = None, conditions = None):
    '''Read the requested attribute columns of the data file into memory.

    attNames   : attribute names to load.
    paraInfo   : [runNumber, numberOfRuns]; None or [None, None] means not
                 running in parallel.  First parallel is checked, and then
                 idRange: a parallel run replaces idRange with
                 getIDRange(paraInfo, dataFileName).
    idRange    : ids to restrict to; the first element is the lower bound
                 and the LAST element the upper bound.  None/empty = all.
    conditions : {attName or 'ID': predicate}.  A row is kept only when every
                 predicate accepts it; kept ids go into self._selectedIDs and
                 rejected rows are removed from the loaded values.  A
                 condition on an attribute that is not in attNames raises
                 KeyError.

    Values equal to '.' and columns at or beyond the width of the file's
    first line get a copy of the attribute's default value.  After loading,
    bindAttribute is called for every name in attNames.
    '''
    #fresh objects per call: the old mutable defaults were shared between
    #calls, and the default dict ended up stored on the instance
    if paraInfo is None:
        paraInfo = [None, None]
    if idRange is None:
        idRange = []
    if conditions is None:
        conditions = {}

    self._conditions = conditions

    #if running a parallel job, split it into the right ids...
    if paraInfo != [None, None]:
        idRange = getIDRange(paraInfo, self._dataFileName)

    #if running parallel or specific range, mark range info
    self._selectedAttNames = attNames
    if idRange:
        self._rangeSpecified = [idRange[0], idRange[-1]]

    #get casting and column info
    self.loadTranscriptionInfo(attNames)

    #initialize master dict
    for attName in attNames:
        self._attName_id_value[attName] = {}

    #get number of slots
    f = open(self._dataFileName, 'r')
    try:
        numSlots = len(f.readline().split('\t'))
    finally:
        f.close()

    #skip to start of specified range
    if self._rangeSpecified:
        fIndex = cgIndex.lineIndex(self._dataFileName)
        fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
        fIndex.binarySearch(self._rangeSpecified[0])
        f = fIndex.file
    else:
        f = open(self._dataFileName, 'r')

    #transcribe values
    try:
        for line in f:
            ls = line.strip().split('\t')
            rowID = int(ls[0]) #id is always first slot

            #only transcribe selected range!
            #upper bound is the LAST element, matching _rangeSpecified above
            #(was idRange[1], which only agrees for two-element ranges)
            if idRange and rowID > idRange[-1]:
                break

            #transcribe
            for attName in attNames:
                colPos = self._attName_columnPosition[attName]
                if colPos < numSlots and ls[colPos] != '.':
                    value = self._attName_casteFromFxn[attName](ls[colPos])
                else:
                    #missing column or '.' placeholder --> private copy of the default
                    value = copy(self._attName_defaultValue[attName])
                self._attName_id_value[attName][rowID] = value

            #do conditions
            if conditions:
                #decide once per row.  Deleting inside the per-condition loop
                #made the next condition hit an already-deleted row (KeyError)
                #and could leave a rejected id in _selectedIDs.
                keep = True
                for attName in conditions:
                    if attName == 'ID':
                        accepted = conditions['ID'](rowID)
                    else:
                        accepted = conditions[attName](self._attName_id_value[attName][rowID])
                    if not accepted:
                        keep = False
                        break

                if keep:
                    self._selectedIDs.add(rowID)
                else:
                    for aName in attNames:
                        del self._attName_id_value[aName][rowID]
    finally:
        #release the handle even if a row fails to parse
        f.close()

    #bind attribute names to dictionaries
    for attName in attNames:
        self.bindAttribute(attName)
def load(self, attNames, paraInfo=None, idRange=None, conditions=None):
    '''Read the requested attribute columns of the data file into memory.

    attNames   : attribute names to load.
    paraInfo   : [runNumber, numberOfRuns]; None or [None, None] means not
                 running in parallel.  First parallel is checked, and then
                 idRange: a parallel run replaces idRange with
                 getIDRange(paraInfo, dataFileName).
    idRange    : ids to restrict to; the first element is the lower bound
                 and the LAST element the upper bound.  None/empty = all.
    conditions : {attName or 'ID': predicate}.  A row is kept only when every
                 predicate accepts it; kept ids go into self._selectedIDs and
                 rejected rows are removed from the loaded values.  A
                 condition on an attribute that is not in attNames raises
                 KeyError.

    Values equal to '.' and columns at or beyond the width of the file's
    first line get a copy of the attribute's default value.  After loading,
    bindAttribute is called for every name in attNames.
    '''
    #fresh objects per call: the old mutable defaults were shared between
    #calls, and the default dict ended up stored on the instance
    if paraInfo is None:
        paraInfo = [None, None]
    if idRange is None:
        idRange = []
    if conditions is None:
        conditions = {}

    self._conditions = conditions

    #if running a parallel job, split it into the right ids...
    if paraInfo != [None, None]:
        idRange = getIDRange(paraInfo, self._dataFileName)

    #if running parallel or specific range, mark range info
    self._selectedAttNames = attNames
    if idRange:
        self._rangeSpecified = [idRange[0], idRange[-1]]

    #get casting and column info
    self.loadTranscriptionInfo(attNames)

    #initialize master dict
    for attName in attNames:
        self._attName_id_value[attName] = {}

    #get number of slots
    f = open(self._dataFileName, 'r')
    try:
        numSlots = len(f.readline().split('\t'))
    finally:
        f.close()

    #skip to start of specified range
    if self._rangeSpecified:
        fIndex = cgIndex.lineIndex(self._dataFileName)
        fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
        fIndex.binarySearch(self._rangeSpecified[0])
        f = fIndex.file
    else:
        f = open(self._dataFileName, 'r')

    #transcribe values
    try:
        for line in f:
            ls = line.strip().split('\t')
            rowID = int(ls[0])  #id is always first slot

            #only transcribe selected range!
            #upper bound is the LAST element, matching _rangeSpecified above
            #(was idRange[1], which only agrees for two-element ranges)
            if idRange and rowID > idRange[-1]:
                break

            #transcribe
            for attName in attNames:
                colPos = self._attName_columnPosition[attName]
                if colPos < numSlots and ls[colPos] != '.':
                    value = self._attName_casteFromFxn[attName](ls[colPos])
                else:
                    #missing column or '.' placeholder --> private copy of the default
                    value = copy(self._attName_defaultValue[attName])
                self._attName_id_value[attName][rowID] = value

            #do conditions
            if conditions:
                #decide once per row.  Deleting inside the per-condition loop
                #made the next condition hit an already-deleted row (KeyError)
                #and could leave a rejected id in _selectedIDs.
                keep = True
                for attName in conditions:
                    if attName == 'ID':
                        accepted = conditions['ID'](rowID)
                    else:
                        accepted = conditions[attName](
                            self._attName_id_value[attName][rowID])
                    if not accepted:
                        keep = False
                        break

                if keep:
                    self._selectedIDs.add(rowID)
                else:
                    for aName in attNames:
                        del self._attName_id_value[aName][rowID]
    finally:
        #release the handle even if a row fails to parse
        f.close()

    #bind attribute names to dictionaries
    for attName in attNames:
        self.bindAttribute(attName)
import sys
import cgIndex

#Manual check of the line index: binary-search the file named by the first
#command line argument for ID 4000000 (using the primary ID check function)
#and print the next line read from the index's file.
dataFN = sys.argv[1]

fIndex = cgIndex.lineIndex(dataFN)
fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
fIndex.binarySearch(4000000)

foundLine = fIndex.file.readline()
#single-argument print(...) emits the same text under python 2 and 3
print(foundLine)