Esempio n. 1
0
def getLongestRepeat(seq, minTimes, kmerLength):
    """Return the longest run of one k-mer repeated back-to-back in seq.

    Every distinct frame returned by bioLibCG.returnFrames(seq, kmerLength)
    is compared against consecutive, non-overlapping windows of seq, once
    for each starting offset in range(kmerLength).  A run is only considered
    when it contains at least minTimes consecutive matching windows.

    Args:
        seq: sequence to scan (it is sliced and measured with len()).
        minTimes: minimum number of consecutive matches for a run to count.
        kmerLength: window width, also used as the step between windows.

    Returns:
        The largest qualifying run length in windows, or 0 if none
        qualifies.  The value is also printed to stdout before returning.
    """
    # NOTE(review): presumably returnFrames yields the kmerLength-wide
    # substrings of seq -- confirm against bioLibCG.
    kSeqs = set(bioLibCG.returnFrames(seq, kmerLength))

    highestSLen = 0
    for kmer in kSeqs:
        # Try every phase so repeats that do not start at index 0 are seen.
        for slide in range(0, kmerLength):
            sLen = 0
            for i in range(slide, len(seq), kmerLength):
                if seq[i:i + kmerLength] == kmer:
                    sLen += 1
                    continue
                # The run was interrupted: record it if it is long enough.
                if sLen >= minTimes and sLen > highestSLen:
                    highestSLen = sLen
                sLen = 0

            # Slicing never raises IndexError, so a run that is still open
            # when the windows are exhausted has to be recorded here;
            # otherwise repeats touching the end of seq are lost.
            if sLen >= minTimes and sLen > highestSLen:
                highestSLen = sLen

    print(highestSLen)
    return highestSLen
def plotInfoNTFrameEnrichment(dir, frameWidth, outFN):
    """Write per-frame enrichment values for all '*.primeSeqs' files.

    For every chromosome in bioLibCG.humanChromosomes and each strand
    ('1', '-1') the file '<dir>/<chrom>.<strand>.primeSeqs' is read.  The
    first tab-separated column of each line is taken as a sequence, and the
    frames returned by bioLibCG.returnFrames(seq, frameWidth) are grouped by
    their index.  getSeqEnrichment is then called on each group, in
    ascending frame index order, and the value it reports for each key is
    appended to that key's list.

    Args:
        dir: directory holding the '.primeSeqs' input files.
        frameWidth: width passed to bioLibCG.returnFrames.
        outFN: path of the output file; one line per key in the form
            '<key>\\t<comma separated values>'.
    """
    frameNum_seqs = {}
    for fChrom in bioLibCG.humanChromosomes:
        for fStrand in ('1', '-1'):
            print('%s %s' % (fChrom, fStrand))
            fN = '%s/%s.%s.primeSeqs' % (dir, fChrom, fStrand)
            # 'with' guarantees the handle is released even if parsing fails.
            with open(fN, 'r') as f:
                for line in f:
                    seq = line.strip().split('\t')[0]
                    frames = bioLibCG.returnFrames(seq, frameWidth)
                    for i, frame in enumerate(frames):
                        frameNum_seqs.setdefault(i, []).append(frame)

    # NOTE(review): if getSeqEnrichment omits a key for some frame index, the
    # per-key lists no longer line up with frame numbers -- confirm it
    # always reports every key.
    let_frameCounts = {}
    for fNum in sorted(frameNum_seqs):
        let_count = getSeqEnrichment(frameNum_seqs[fNum])
        for let, count in let_count.items():
            let_frameCounts.setdefault(let, []).append(count)

    with open(outFN, 'w') as fOut:
        for let, fCounts in let_frameCounts.items():
            joined = ','.join([str(x) for x in fCounts])
            fOut.write('%s\t%s\n' % (let, joined))
Esempio n. 3
0
def getLongestRepeat(seq, minTimes, kmerLength):
    """Return the longest run of one k-mer repeated back-to-back in seq.

    Each distinct frame from bioLibCG.returnFrames(seq, kmerLength) is
    matched against consecutive, non-overlapping full-width windows of seq
    for every starting offset in range(kmerLength).

    Args:
        seq: sequence to scan.
        minTimes: minimum number of consecutive matches for a run to count.
        kmerLength: window width and step between windows.

    Returns:
        The largest run length (in windows) that reached minTimes, else 0.
    """
    candidates = set(bioLibCG.returnFrames(seq, kmerLength))
    # Exclusive upper bound for window starts, so only full windows are read.
    stopIndex = len(seq) - (kmerLength - 1)

    best = 0
    for kmer in candidates:
        for offset in range(0, kmerLength):
            run = 0
            for pos in range(offset, stopIndex, kmerLength):
                if seq[pos:pos + kmerLength] == kmer:
                    run += 1
                else:
                    # Run interrupted: keep it if it qualifies, then restart.
                    if run >= minTimes and run > best:
                        best = run
                    run = 0

            # A run that includes the final window is settled here.
            if run >= minTimes and run > best:
                best = run

    return best
Esempio n. 4
0
def kMask(seq, mask, minTimes, kmerLength):
    """Flag positions of seq covered by back-to-back repeats of one k-mer.

    Each distinct frame from bioLibCG.returnFrames(seq, kmerLength) is
    matched against consecutive, non-overlapping windows of seq for every
    starting offset in range(kmerLength).  Whenever at least minTimes
    windows in a row match, every position of that run is set to True.

    Args:
        seq: sequence to scan.
        mask: list of booleans updated in place; when empty or None a new
            all-False list of len(seq) is created.  A supplied mask is
            expected to be at least as long as seq.
        minTimes: minimum number of consecutive matches required to mask.
        kmerLength: window width and step between windows.

    Returns:
        The mask list (the same object that was passed in, when one was).
    """
    seqLen = len(seq)
    if not mask:
        mask = [False] * seqLen

    # NOTE(review): presumably returnFrames yields the kmerLength-wide
    # substrings of seq -- confirm against bioLibCG.
    kSeqs = set(bioLibCG.returnFrames(seq, kmerLength))

    for kmer in kSeqs:
        # Try every phase so repeats that do not start at index 0 are seen.
        for slide in range(0, kmerLength):
            sLen = 0
            runEnd = slide  # index just past the most recent matching window
            for i in range(slide, seqLen, kmerLength):
                if seq[i:i + kmerLength] == kmer:
                    sLen += 1
                    runEnd = i + kmerLength
                    continue
                # Run interrupted at i: mask it if it is long enough.
                if sLen >= minTimes:
                    for j in range(i - kmerLength * sLen, i):
                        mask[j] = True
                sLen = 0

            # Slicing never raises IndexError, so a run that is still open
            # when the windows are exhausted must be masked here; otherwise
            # repeats touching the end of seq stay unmasked.
            if sLen >= minTimes:
                maskStart = runEnd - kmerLength * sLen
                for j in range(maskStart, min(runEnd, seqLen)):
                    mask[j] = True

    return mask
def plotInfoNTFrameEnrichment(dir, frameWidth, outFN):
    """Collect frames from every '*.primeSeqs' file and write enrichment rows.

    Input files are named '<dir>/<chrom>.<strand>.primeSeqs' for each
    chromosome in bioLibCG.humanChromosomes and each strand in ('1', '-1').
    The first tab-separated field of every line is split into frames with
    bioLibCG.returnFrames(seq, frameWidth), and frames are pooled by their
    index.  getSeqEnrichment is applied to each pool in ascending index
    order and its values are accumulated per key.

    Args:
        dir: directory containing the input files.
        frameWidth: frame width handed to bioLibCG.returnFrames.
        outFN: output path; each line is '<key>\\t<v0>,<v1>,...'.
    """
    frameNum_seqs = {}
    for fChrom in bioLibCG.humanChromosomes:
        for fStrand in ('1', '-1'):
            print('%s %s' % (fChrom, fStrand))
            fN = '%s/%s.%s.primeSeqs' % (dir, fChrom, fStrand)
            # Context manager closes the file even when a line fails to parse.
            with open(fN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    seq = ls[0]
                    for i, frame in enumerate(
                            bioLibCG.returnFrames(seq, frameWidth)):
                        frameNum_seqs.setdefault(i, []).append(frame)

    # NOTE(review): the per-key lists only align with frame indices if
    # getSeqEnrichment returns the same keys for every pool -- confirm.
    let_frameCounts = {}
    for fNum in sorted(frameNum_seqs):
        seqs = frameNum_seqs[fNum]
        for let, count in getSeqEnrichment(seqs).items():
            let_frameCounts.setdefault(let, []).append(count)

    with open(outFN, 'w') as fOut:
        for let, fCounts in let_frameCounts.items():
            fCounts = ','.join([str(x) for x in fCounts])
            fOut.write('%s\t%s\n' % (let, fCounts))
Esempio n. 6
0
def getLongestRepeat(seq, minTimes, kmerLength):
    """Find the longest stretch of identical, adjacent k-mers in seq.

    The distinct frames from bioLibCG.returnFrames(seq, kmerLength) are each
    compared with successive non-overlapping full-width windows of seq,
    starting from every offset in range(kmerLength).

    Args:
        seq: sequence to scan.
        minTimes: shortest streak (in windows) that is allowed to count.
        kmerLength: window width and step between windows.

    Returns:
        Length in windows of the longest streak of at least minTimes
        matches, or 0 when there is none.
    """
    uniqueKmers = set(bioLibCG.returnFrames(seq, kmerLength))
    # Window starts beyond this bound would not leave room for a full k-mer.
    limit = len(seq) - (kmerLength - 1)

    longest = 0
    for kmer in uniqueKmers:
        for slide in range(0, kmerLength):
            streak = 0
            for start in range(slide, limit, kmerLength):
                if seq[start:start + kmerLength] != kmer:
                    # Streak broken: record it when long enough, then reset.
                    if streak >= minTimes:
                        longest = max(longest, streak)
                    streak = 0
                    continue
                streak += 1

            # Account for a streak that ran through the last window.
            if streak >= minTimes:
                longest = max(longest, streak)

    return longest
Esempio n. 7
0
def kMask(seq, mask, minTimes, kmerLength):
    """Mark the positions of seq that lie inside adjacent k-mer repeats.

    For each distinct frame from bioLibCG.returnFrames(seq, kmerLength) and
    each starting offset in range(kmerLength), seq is read in consecutive
    non-overlapping windows.  Runs of at least minTimes matching windows
    have all of their positions set to True in mask.

    Args:
        seq: sequence to scan.
        mask: list of booleans modified in place; an empty or None value is
            replaced by a fresh all-False list of len(seq).  A supplied mask
            is expected to cover every index of seq.
        minTimes: minimum number of consecutive matches required to mask.
        kmerLength: window width and step between windows.

    Returns:
        The (possibly newly created) mask list.
    """
    seqLen = len(seq)
    if not mask:
        mask = [False] * seqLen

    # NOTE(review): presumably returnFrames yields the kmerLength-wide
    # substrings of seq -- confirm against bioLibCG.
    kSeqs = set(bioLibCG.returnFrames(seq, kmerLength))

    for kmer in kSeqs:
        for slide in range(0, kmerLength):
            sLen = 0
            runEnd = slide  # index just past the latest matching window
            for i in range(slide, seqLen, kmerLength):
                if seq[i:i + kmerLength] != kmer:
                    # Run interrupted at i: mask it if it is long enough.
                    if sLen >= minTimes:
                        for j in range(i - kmerLength * sLen, i):
                            mask[j] = True
                    sLen = 0
                else:
                    sLen += 1
                    runEnd = i + kmerLength

            # The original relied on an IndexError that slicing never
            # raises, so runs reaching the end of seq were left unmasked.
            # Close any still-open run explicitly.
            if sLen >= minTimes:
                for j in range(runEnd - kmerLength * sLen,
                               min(runEnd, seqLen)):
                    mask[j] = True

    return mask
Esempio n. 8
0
def createWordDatabase(seqList, wordSize):
    """Index the frames of every sequence by their text.

    Args:
        seqList: iterable of objects exposing .id and .sequence
            (the original comment says these should be cgSeqs).
        wordSize: frame width passed to bioLibCG.returnFrames.

    Returns:
        dict mapping each word to a list of [id, start, end] entries, where
        start is the frame's index in the returnFrames result and end is
        start + wordSize - 1.
    """
    wordDict = {}
    for tSeq in seqList:
        frames = bioLibCG.returnFrames(tSeq.sequence, wordSize)
        for start, word in enumerate(frames):
            location = [tSeq.id, start, start + wordSize - 1]
            # setdefault creates the bucket the first time a word is seen.
            wordDict.setdefault(word, []).append(location)
    return wordDict
Esempio n. 9
0
def createWordDatabase(seqList, wordSize):
    """Build a lookup from word to every place it occurs in seqList.

    Args:
        seqList: iterable of objects with .id and .sequence attributes
            (described as cgSeqs by the original comment).
        wordSize: frame width passed to bioLibCG.returnFrames.

    Returns:
        dict of word -> list of [id, word start, word end]; word end is
        word start + wordSize - 1.
    """
    index = {}
    for record in seqList:
        words = bioLibCG.returnFrames(record.sequence, wordSize)
        for pos, word in enumerate(words):
            entry = [record.id, pos, pos + wordSize - 1]
            if word in index:
                index[word].append(entry)
            else:
                index[word] = [entry]
    return index
Esempio n. 10
0
    def returnDiFreq(seqs):
        """Return the relative frequency of each width-2 frame in seqs.

        Frames come from bioLibCG.returnFrames(seq, 2); each frame's count
        is divided by the total number of frames over all sequences.

        Returns:
            dict mapping frame -> float frequency (empty if nothing was
            counted).
        """
        counts = {}
        nFrames = 0.0
        for seq in seqs:
            frames = bioLibCG.returnFrames(seq, 2)
            nFrames += len(frames)
            for frame in frames:
                if frame in counts:
                    counts[frame] += 1.0
                else:
                    counts[frame] = 1.0

        # Normalise the counts into frequencies.
        freqs = {}
        for frame in counts:
            freqs[frame] = counts[frame] / nFrames
        return freqs
Esempio n. 11
0
def alignQuery(qSeq, wordDatabase, sequenceDatabase, wordSize, maxNumMismatches, fOut):
    """Align qSeq against every database sequence that shares a word with it.

    Args:
        qSeq: query object exposing .sequence.
        wordDatabase: mapping of word -> list of [id, wordStart, wordEnd].
        sequenceDatabase: mapping of id -> value handed to cgSeq.
        wordSize: frame width passed to bioLibCG.returnFrames.
        maxNumMismatches: forwarded unchanged to alignSeqs.
        fOut: writable object; one line is written per truthy alignment.
    """
    for qStart, word in enumerate(bioLibCG.returnFrames(qSeq.sequence, wordSize)):
        if word not in wordDatabase:
            continue

        for hit in wordDatabase[word]:
            # Positions of the shared word in the query and in the target.
            queryPos = [qStart, qStart + wordSize - 1]
            targetID = hit[0]
            target = cgSeq(targetID, sequenceDatabase[targetID])
            targetPos = [hit[1], hit[2]]

            alignment = alignSeqs(qSeq, target, queryPos, targetPos, maxNumMismatches)
            if alignment:
                fOut.write(alignment.alignmentOutput() + '\n')
Esempio n. 12
0
    def returnDiFreq(seqs):
        """Compute the fraction of all width-2 frames taken by each frame.

        Every sequence is split with bioLibCG.returnFrames(seq, 2); the
        tallies are then divided by the overall number of frames.

        Returns:
            dict of frame -> float frequency; empty when no frame was seen.
        """
        tally = {}
        total = 0.0
        for seq in seqs:
            pairs = bioLibCG.returnFrames(seq, 2)
            total = total + len(pairs)
            for pair in pairs:
                tally.setdefault(pair, 0)
                tally[pair] += 1.0

        # Turn the raw tallies into frequencies.
        normalised = [(pair, tally[pair] / total) for pair in tally]
        return dict(normalised)
Esempio n. 13
0
def alignQuery(qSeq, wordDatabase, sequenceDatabase, wordSize,
               maxNumMismatches, fOut):
    """Write an alignment for each database word hit found in qSeq.

    Args:
        qSeq: query object exposing .sequence.
        wordDatabase: mapping of word -> list of [id, wordStart, wordEnd].
        sequenceDatabase: mapping of id -> value passed to cgSeq.
        wordSize: frame width passed to bioLibCG.returnFrames.
        maxNumMismatches: passed through to alignSeqs.
        fOut: writable object receiving alignmentOutput() plus a newline
            for every truthy result of alignSeqs.
    """
    queryWords = bioLibCG.returnFrames(qSeq.sequence, wordSize)
    for offset, word in enumerate(queryWords):
        if word in wordDatabase:
            for seed in wordDatabase[word]:
                # Inclusive start/end of the seed word in the query.
                qWordPos = [offset, offset + wordSize - 1]
                seedID = seed[0]
                tSeq = cgSeq(seedID, sequenceDatabase[seedID])
                # Inclusive start/end of the seed word in the target.
                tWordPos = [seed[1], seed[2]]
                result = alignSeqs(qSeq, tSeq, qWordPos, tWordPos,
                                   maxNumMismatches)
                if not result:
                    continue
                fOut.write(result.alignmentOutput() + '\n')
Esempio n. 14
0
def locateASignals(dataFN, outFN, rn = None, tn = None):
    """Write the coordinates of every 'AATAAA' frame found in the loaded data.

    The 'coord' and 'sequence' attributes are loaded through
    cgNexusFlat.Nexus(dataFN, ASite).  Entries whose sequence is shorter
    than 10 are skipped.  For each width-6 frame equal to 'AATAAA' a
    coordinate built with bioLibCG.makeTcc is written to outFN, one per
    line.  Each scanned sequence is also printed to stdout.

    Args:
        dataFN: data file handed to cgNexusFlat.Nexus.
        outFN: path of the output file.
        rn, tn: forwarded to NX.load as [rn, tn].
    """
    #load data
    NX = cgNexusFlat.Nexus(dataFN, ASite)
    NX.load(['coord', 'sequence'], [rn, tn])

    # 'with' closes the output even if an entry fails part-way through.
    with open(outFN, 'w') as f:
        for siteID in NX.ids:
            chrom, strand, start, end = bioLibCG.tccSplit(NX.coord[siteID])
            sequence = NX.sequence[siteID]
            if len(sequence) < 10:
                continue
            print('%s \n' % sequence)
            checkFrames = bioLibCG.returnFrames(sequence, 6)
            for i, frame in enumerate(checkFrames):
                if frame != 'AATAAA':
                    continue
                # NOTE(review): offsets assume start is 0-based and that
                # frame i begins i positions after start -- unverified.
                siteStart, siteEnd = start + i, start + i + 5
                f.write('%s\n' % bioLibCG.makeTcc(chrom, strand, siteStart, siteEnd))
Esempio n. 15
0
def get20mers(aluFN):
    """Count every width-20 frame in the regions listed in aluFN and print them.

    Each line of aluFN is tab-separated: column 0 is a coordinate of the
    form '<chrom>:<start>-<end>' and column 2 is a strand converted with
    bioLibCG.switchStrandFormat.  The region's sequence is fetched with
    GenomeFetch('hg19') and split with bioLibCG.returnFrames(seq, 20).
    Finally one '<frame>\\t<count>' line per distinct frame is printed.

    Args:
        aluFN: path of the input file.
    """
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}
    # The original never closed this handle; 'with' releases it reliably.
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            span = coordParts[1].split('-')
            start, end = span[0], span[1]
            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            seq = gf.getSequence(tcc)
            frames = bioLibCG.returnFrames(seq, 20)
            # NOTE(review): returnFrames apparently returns 1 when it cannot
            # produce frames -- confirm against bioLibCG.
            if frames == 1:
                continue
            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
Esempio n. 16
0
def get20mers(aluFN):
    """Print how often each width-20 frame occurs across the listed regions.

    Every line of aluFN is split on tabs.  Column 0 holds
    '<chrom>:<start>-<end>' and column 2 a strand, which is converted with
    bioLibCG.switchStrandFormat.  The sequence for each region comes from
    GenomeFetch('hg19') and is cut into frames with
    bioLibCG.returnFrames(seq, 20).  Output is one '<frame>\\t<count>' line
    per distinct frame on stdout.

    Args:
        aluFN: path of the input file.
    """
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}
    # Opened with a context manager: the original leaked the file handle.
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            pieces = ls[0].split(':')
            chrom = pieces[0]
            bounds = pieces[1].split('-')
            start = bounds[0]
            end = bounds[1]
            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            seq = gf.getSequence(tcc)
            frames = bioLibCG.returnFrames(seq, 20)
            # NOTE(review): a return value of 1 looks like returnFrames'
            # failure signal -- confirm against bioLibCG.
            if frames == 1:
                continue
            for smallSeq in frames:
                count = seq_count.get(smallSeq, 0)
                seq_count[smallSeq] = count + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
Esempio n. 17
0
def createSubSequences(oID_sequence, frameLength):
    """Map every ID to the set of distinct frames of its sequence.

    Args:
        oID_sequence: mapping of ID -> sequence.
        frameLength: frame width passed to bioLibCG.returnFrames.

    Returns:
        dict of ID -> set of the frames returned for that sequence.
    """
    subSequences = {}
    for oID in oID_sequence:
        frames = bioLibCG.returnFrames(oID_sequence[oID], frameLength)
        subSequences[oID] = set(frames)
    return subSequences
Esempio n. 18
0
def createSubSequences(oID_sequence, frameLength):
    """Return, for each ID, the unique frames found in its sequence.

    Args:
        oID_sequence: mapping of ID -> sequence.
        frameLength: frame width passed to bioLibCG.returnFrames.

    Returns:
        dict of ID -> set of frames.
    """
    pairs = []
    for oID in oID_sequence:
        uniqueFrames = set(
            bioLibCG.returnFrames(oID_sequence[oID], frameLength))
        pairs.append((oID, uniqueFrames))
    return dict(pairs)