def getLongestRepeat(seq, minTimes, kmerLength):
    """Return the copy count of the longest tandem k-mer run in ``seq``.

    Each distinct frame reported by ``bioLibCG.returnFrames`` is compared
    against consecutive, non-overlapping windows of ``seq`` for every one of
    the ``kmerLength`` starting offsets. A run is the number of back-to-back
    windows equal to that frame.

    Args:
        seq: sequence to scan; it is sliced as ``seq[i:i + kmerLength]``.
        minTimes: minimum number of consecutive copies for a run to count.
        kmerLength: width of the repeated unit.

    Returns:
        The length (in copies) of the longest run with at least ``minTimes``
        copies, or 0 when no run qualifies. The value is also printed.
    """
    # presumably returnFrames yields every kmerLength-wide window of seq;
    # verify against bioLibCG
    kmers = set(bioLibCG.returnFrames(seq, kmerLength))
    seqLength = len(seq)
    longest = 0
    for kmer in kmers:
        # try every reading offset for this unit
        for offset in range(kmerLength):
            run = 0
            for start in range(offset, seqLength, kmerLength):
                if seq[start:start + kmerLength] == kmer:
                    run += 1
                    continue
                # run was interrupted: keep it if it is long enough
                if run >= minTimes and run > longest:
                    longest = run
                run = 0
            # Slicing never raises IndexError, so a run that reaches the end
            # of the sequence must be recorded here, after the scan.
            if run >= minTimes and run > longest:
                longest = run
    print(longest)
    return longest
def plotInfoNTFrameEnrichment(dir, frameWidth, outFN):
    """Tabulate per-frame-position enrichment counts across all primeSeqs files.

    For every chromosome in ``bioLibCG.humanChromosomes`` and both strands
    ('1' and '-1'), reads ``<dir>/<chrom>.<strand>.primeSeqs``, takes the first
    tab-separated column of each line as a sequence and groups its frames by
    frame index. ``getSeqEnrichment`` is then applied to the frames collected
    at each index (in ascending index order) and the results are written to
    ``outFN`` as ``<key>\\t<comma-separated counts>`` lines.

    Args:
        dir: directory containing the ``.primeSeqs`` files.
        frameWidth: frame width passed to ``bioLibCG.returnFrames``.
        outFN: path of the output table (overwritten).
    """
    # frame index -> all frames seen at that index, over every input file
    frameNum_seqs = {}
    for fChrom in bioLibCG.humanChromosomes:
        for fStrand in ('1', '-1'):
            # progress output
            print('%s %s' % (fChrom, fStrand))
            fN = '%s/%s.%s.primeSeqs' % (dir, fChrom, fStrand)
            # context manager closes the input even if parsing fails
            with open(fN, 'r') as f:
                for line in f:
                    seq = line.strip().split('\t')[0]
                    for i, frame in enumerate(bioLibCG.returnFrames(seq, frameWidth)):
                        frameNum_seqs.setdefault(i, []).append(frame)

    # key -> one count per frame index, in ascending frame order
    # NOTE(review): if getSeqEnrichment omits a key for some frame index the
    # per-key lists will not line up by position -- confirm its contract
    let_frameCounts = {}
    for fNum in sorted(frameNum_seqs):
        let_count = getSeqEnrichment(frameNum_seqs[fNum])
        for let, count in let_count.items():
            let_frameCounts.setdefault(let, []).append(count)

    with open(outFN, 'w') as fOut:
        for let, fCounts in let_frameCounts.items():
            joined = ','.join([str(x) for x in fCounts])
            fOut.write('%s\t%s\n' % (let, joined))
def getLongestRepeat(seq, minTimes, kmerLength):
    """Return the copy count of the longest tandem k-mer run in ``seq``.

    Every distinct frame from ``bioLibCG.returnFrames`` is matched against
    consecutive, non-overlapping full-width windows of ``seq`` for each of the
    ``kmerLength`` starting offsets.

    Args:
        seq: sequence to scan.
        minTimes: minimum number of consecutive copies for a run to count.
        kmerLength: width of the repeated unit.

    Returns:
        The longest run (in copies) that has at least ``minTimes`` copies,
        or 0 if there is none.
    """
    units = set(bioLibCG.returnFrames(seq, kmerLength))
    # exclusive upper bound on window starts: only full-width windows are scanned
    stop = len(seq) - (kmerLength - 1)
    best = 0
    for unit in units:
        for offset in range(kmerLength):
            run = 0
            for pos in range(offset, stop, kmerLength):
                if seq[pos:pos + kmerLength] == unit:
                    run += 1
                    continue
                # mismatch ends the current run
                if run >= minTimes:
                    best = max(best, run)
                run = 0
            # a run still open at the last window is settled here
            if run >= minTimes:
                best = max(best, run)
    return best
def kMask(seq, mask, minTimes, kmerLength):
    """Flag the positions of ``seq`` covered by tandem k-mer repeats.

    Each distinct frame from ``bioLibCG.returnFrames`` is matched against
    consecutive, non-overlapping windows of ``seq`` for every starting offset.
    Whenever at least ``minTimes`` copies occur back to back, every position
    of that stretch is set to True in the mask.

    Args:
        seq: sequence to scan.
        mask: list of booleans, one per position, or a falsy value to start
            from an all-False mask. A supplied list is modified in place.
        minTimes: minimum number of consecutive copies to trigger masking.
        kmerLength: width of the repeated unit.

    Returns:
        The mask list (the same object that was passed in, when one was).
    """
    if not mask:
        mask = [False] * len(seq)

    kmers = set(bioLibCG.returnFrames(seq, kmerLength))
    seqLength = len(seq)
    for kmer in kmers:
        # try every reading offset for this unit
        for offset in range(kmerLength):
            run = 0
            for start in range(offset, seqLength, kmerLength):
                if seq[start:start + kmerLength] == kmer:
                    run += 1
                    continue
                # the run ended just before `start`
                if run >= minTimes:
                    for pos in range(start - kmerLength * run, start):
                        mask[pos] = True
                run = 0
            # Slicing never raises IndexError, so a run that reaches the end
            # of the sequence has to be masked here; `start` is then the
            # first position of the run's last copy.
            if run and run >= minTimes:
                runStart = start - kmerLength * (run - 1)
                runEnd = min(start + kmerLength, seqLength)
                for pos in range(runStart, runEnd):
                    mask[pos] = True
    return mask
def plotInfoNTFrameEnrichment(dir, frameWidth, outFN):
    """Tabulate per-frame-position enrichment counts across all primeSeqs files.

    For every chromosome in ``bioLibCG.humanChromosomes`` and both strands
    ('1' and '-1'), reads ``<dir>/<chrom>.<strand>.primeSeqs``, takes the first
    tab-separated column of each line as a sequence and groups its frames by
    frame index. ``getSeqEnrichment`` is then applied to the frames collected
    at each index (in ascending index order) and the results are written to
    ``outFN`` as ``<key>\\t<comma-separated counts>`` lines.

    Args:
        dir: directory containing the ``.primeSeqs`` files.
        frameWidth: frame width passed to ``bioLibCG.returnFrames``.
        outFN: path of the output table (overwritten).
    """
    # frame index -> all frames seen at that index, over every input file
    frameNum_seqs = {}
    for fChrom in bioLibCG.humanChromosomes:
        for fStrand in ('1', '-1'):
            # progress output
            print('%s %s' % (fChrom, fStrand))
            fN = '%s/%s.%s.primeSeqs' % (dir, fChrom, fStrand)
            # context manager closes the input even if parsing fails
            with open(fN, 'r') as f:
                for line in f:
                    seq = line.strip().split('\t')[0]
                    for i, frame in enumerate(bioLibCG.returnFrames(seq, frameWidth)):
                        frameNum_seqs.setdefault(i, []).append(frame)

    # key -> one count per frame index, in ascending frame order
    # NOTE(review): if getSeqEnrichment omits a key for some frame index the
    # per-key lists will not line up by position -- confirm its contract
    let_frameCounts = {}
    for fNum in sorted(frameNum_seqs):
        let_count = getSeqEnrichment(frameNum_seqs[fNum])
        for let, count in let_count.items():
            let_frameCounts.setdefault(let, []).append(count)

    with open(outFN, 'w') as fOut:
        for let, fCounts in let_frameCounts.items():
            joined = ','.join([str(x) for x in fCounts])
            fOut.write('%s\t%s\n' % (let, joined))
def getLongestRepeat(seq, minTimes, kmerLength):
    """Return the copy count of the longest tandem k-mer run in ``seq``.

    Every distinct frame from ``bioLibCG.returnFrames`` is matched against
    consecutive, non-overlapping full-width windows of ``seq`` for each of the
    ``kmerLength`` starting offsets.

    Args:
        seq: sequence to scan.
        minTimes: minimum number of consecutive copies for a run to count.
        kmerLength: width of the repeated unit.

    Returns:
        The longest run (in copies) that has at least ``minTimes`` copies,
        or 0 if there is none.
    """
    units = set(bioLibCG.returnFrames(seq, kmerLength))
    # exclusive upper bound on window starts: only full-width windows are scanned
    stop = len(seq) - (kmerLength - 1)
    best = 0
    for unit in units:
        for offset in range(kmerLength):
            run = 0
            for pos in range(offset, stop, kmerLength):
                if seq[pos:pos + kmerLength] == unit:
                    run += 1
                    continue
                # mismatch ends the current run
                if run >= minTimes:
                    best = max(best, run)
                run = 0
            # a run still open at the last window is settled here
            if run >= minTimes:
                best = max(best, run)
    return best
def createWordDatabase(seqList, wordSize):
    """Index every word of every sequence by the word's text.

    Args:
        seqList: iterable of objects exposing ``.id`` and ``.sequence``
            (the original comment says these should be cgSeqs).
        wordSize: word width passed to ``bioLibCG.returnFrames``.

    Returns:
        A dict mapping each word to a list of ``[id, wordStart, wordEnd]``
        entries, where wordStart is the word's frame index and wordEnd is
        ``wordStart + wordSize - 1``.
    """
    index = {}
    for target in seqList:
        frames = bioLibCG.returnFrames(target.sequence, wordSize)
        for startPos, word in enumerate(frames):
            hit = [target.id, startPos, startPos + wordSize - 1]
            # create the bucket on first sight of this word
            index.setdefault(word, []).append(hit)
    return index
def createWordDatabase(seqList, wordSize):
    """Index every word of every sequence by the word's text.

    Args:
        seqList: iterable of objects exposing ``.id`` and ``.sequence``
            (the original comment says these should be cgSeqs).
        wordSize: word width passed to ``bioLibCG.returnFrames``.

    Returns:
        A dict mapping each word to a list of ``[id, wordStart, wordEnd]``
        entries, where wordStart is the word's frame index and wordEnd is
        ``wordStart + wordSize - 1``.
    """
    index = {}
    for target in seqList:
        frames = bioLibCG.returnFrames(target.sequence, wordSize)
        for startPos, word in enumerate(frames):
            hit = [target.id, startPos, startPos + wordSize - 1]
            # create the bucket on first sight of this word
            index.setdefault(word, []).append(hit)
    return index
def returnDiFreq(seqs):
    """Return the relative frequency of each width-2 frame over ``seqs``.

    Args:
        seqs: iterable of sequences; each is split with
            ``bioLibCG.returnFrames(seq, 2)``.

    Returns:
        A dict mapping each frame to its count divided by the total number
        of frames seen across all sequences (floats).
    """
    counts = {}
    nFrames = 0.0
    for seq in seqs:
        frames = bioLibCG.returnFrames(seq, 2)
        nFrames += len(frames)
        for frame in frames:
            if frame in counts:
                counts[frame] += 1.0
            else:
                counts[frame] = 1.0

    # convert raw counts to frequencies
    freqs = {}
    for frame in counts:
        freqs[frame] = counts[frame] / nFrames
    return freqs
def alignQuery(qSeq, wordDatabase, sequenceDatabase, wordSize, maxNumMismatches, fOut):
    """Align a query against every target that shares a word with it.

    Each word of ``qSeq.sequence`` that is present in ``wordDatabase`` seeds
    one ``alignSeqs`` call per database entry for that word; every truthy
    alignment result is written to ``fOut`` as one line.

    Args:
        qSeq: query object exposing ``.sequence``.
        wordDatabase: mapping word -> list of ``[id, wordStart, wordEnd]``.
        sequenceDatabase: mapping id -> sequence, used to build the target.
        wordSize: word width passed to ``bioLibCG.returnFrames``.
        maxNumMismatches: forwarded unchanged to ``alignSeqs``.
        fOut: writable file-like object receiving the alignment lines.
    """
    queryFrames = bioLibCG.returnFrames(qSeq.sequence, wordSize)
    for qStart, word in enumerate(queryFrames):
        if word not in wordDatabase:
            continue
        for hit in wordDatabase[word]:
            # a fresh position list per hit, as alignSeqs receives it directly
            queryWord = [qStart, qStart + wordSize - 1]
            targetID = hit[0]
            target = cgSeq(targetID, sequenceDatabase[targetID])
            targetWord = [hit[1], hit[2]]
            alignment = alignSeqs(qSeq, target, queryWord, targetWord, maxNumMismatches)
            if not alignment:
                continue
            fOut.write(alignment.alignmentOutput() + '\n')
def returnDiFreq(seqs):
    """Return the relative frequency of each width-2 frame over ``seqs``.

    Args:
        seqs: iterable of sequences; each is split with
            ``bioLibCG.returnFrames(seq, 2)``.

    Returns:
        A dict mapping each frame to its count divided by the total number
        of frames seen across all sequences (floats).
    """
    counts = {}
    nFrames = 0.0
    for seq in seqs:
        frames = bioLibCG.returnFrames(seq, 2)
        nFrames += len(frames)
        for frame in frames:
            if frame in counts:
                counts[frame] += 1.0
            else:
                counts[frame] = 1.0

    # convert raw counts to frequencies
    freqs = {}
    for frame in counts:
        freqs[frame] = counts[frame] / nFrames
    return freqs
def alignQuery(qSeq, wordDatabase, sequenceDatabase, wordSize, maxNumMismatches, fOut):
    """Align a query against every target that shares a word with it.

    Each word of ``qSeq.sequence`` that is present in ``wordDatabase`` seeds
    one ``alignSeqs`` call per database entry for that word; every truthy
    alignment result is written to ``fOut`` as one line.

    Args:
        qSeq: query object exposing ``.sequence``.
        wordDatabase: mapping word -> list of ``[id, wordStart, wordEnd]``.
        sequenceDatabase: mapping id -> sequence, used to build the target.
        wordSize: word width passed to ``bioLibCG.returnFrames``.
        maxNumMismatches: forwarded unchanged to ``alignSeqs``.
        fOut: writable file-like object receiving the alignment lines.
    """
    queryFrames = bioLibCG.returnFrames(qSeq.sequence, wordSize)
    for qStart, word in enumerate(queryFrames):
        if word not in wordDatabase:
            continue
        for hit in wordDatabase[word]:
            # a fresh position list per hit, as alignSeqs receives it directly
            queryWord = [qStart, qStart + wordSize - 1]
            targetID = hit[0]
            target = cgSeq(targetID, sequenceDatabase[targetID])
            targetWord = [hit[1], hit[2]]
            alignment = alignSeqs(qSeq, target, queryWord, targetWord, maxNumMismatches)
            if not alignment:
                continue
            fOut.write(alignment.alignmentOutput() + '\n')
def locateASignals(dataFN, outFN, rn = None, tn = None):
    """Write the coordinates of every 'AATAAA' hexamer found in the loaded sequences.

    Loads the 'coord' and 'sequence' attributes through ``cgNexusFlat.Nexus``
    and, for each id whose sequence has at least 10 characters, writes one
    coordinate line to ``outFN`` per width-6 frame equal to 'AATAAA'. Each
    scanned sequence is also printed to stdout.

    Args:
        dataFN: data file handed to ``cgNexusFlat.Nexus``.
        outFN: output path (overwritten), one ``makeTcc`` string per line.
        rn: forwarded to ``NX.load`` as the first element of its second argument.
        tn: forwarded to ``NX.load`` as the second element of its second argument.
    """
    # load data
    NX = cgNexusFlat.Nexus(dataFN, ASite)
    NX.load(['coord', 'sequence'], [rn, tn])

    # context manager closes the output even if a record fails to parse
    with open(outFN, 'w') as f:
        for siteID in NX.ids:
            chrom, strand, start, end = bioLibCG.tccSplit(NX.coord[siteID])
            sequence = NX.sequence[siteID]
            if len(sequence) < 10:
                continue
            print('%s \n' % (sequence,))
            for i, frame in enumerate(bioLibCG.returnFrames(sequence, 6)):
                if frame != 'AATAAA':
                    continue
                # Frame index is added to `start` as a 0-based offset; the
                # original author flagged this as an assumption.
                # NOTE(review): the same offset is used for both strands -- confirm
                siteStart, siteEnd = start + i, start + i + 5
                f.write('%s\n' % bioLibCG.makeTcc(chrom, strand, siteStart, siteEnd))
def get20mers(aluFN):
    """Count every width-20 frame across the regions listed in ``aluFN``.

    Each input line is tab-separated: column 0 is a ``chrom:start-end``
    coordinate and column 2 a strand that is converted with
    ``bioLibCG.switchStrandFormat``. The region's sequence is fetched from
    hg19 and split into frames of width 20; the total count of each distinct
    frame is printed as ``<frame>\\t<count>``.

    Args:
        aluFN: path of the tab-separated region file.
    """
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}

    # the original never closed this file; the context manager does
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            span = coordParts[1].split('-')
            # start and end stay strings, exactly as they are passed to makeTcc
            start, end = span[0], span[1]
            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            seq = gf.getSequence(tcc)
            frames = bioLibCG.returnFrames(seq, 20)
            # presumably returnFrames returns 1 when no frame can be made
            # (e.g. sequence too short) -- verify against bioLibCG
            if frames == 1:
                continue
            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
def get20mers(aluFN):
    """Count every width-20 frame across the regions listed in ``aluFN``.

    Each input line is tab-separated: column 0 is a ``chrom:start-end``
    coordinate and column 2 a strand that is converted with
    ``bioLibCG.switchStrandFormat``. The region's sequence is fetched from
    hg19 and split into frames of width 20; the total count of each distinct
    frame is printed as ``<frame>\\t<count>``.

    Args:
        aluFN: path of the tab-separated region file.
    """
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}

    # the original never closed this file; the context manager does
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            span = coordParts[1].split('-')
            # start and end stay strings, exactly as they are passed to makeTcc
            start, end = span[0], span[1]
            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            seq = gf.getSequence(tcc)
            frames = bioLibCG.returnFrames(seq, 20)
            # presumably returnFrames returns 1 when no frame can be made
            # (e.g. sequence too short) -- verify against bioLibCG
            if frames == 1:
                continue
            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
def createSubSequences(oID_sequence, frameLength):
    """Map every id to the set of distinct frames of its sequence.

    Args:
        oID_sequence: mapping id -> sequence.
        frameLength: frame width passed to ``bioLibCG.returnFrames``.

    Returns:
        A dict with the same keys, each holding the set of frames returned
        for that id's sequence.
    """
    oID_frames = {}
    for oID in oID_sequence:
        frames = bioLibCG.returnFrames(oID_sequence[oID], frameLength)
        # a set drops duplicate frames
        oID_frames[oID] = set(frames)
    return oID_frames