コード例 #1
0
 def encodeSeq(self, bucketIndex, seq, pos):
     """Encode the bases of ``seq`` lying on either side of the bucket index.

     The bases before ``pos`` are processed as the reverse complement of
     ``seq[:pos]``, starting from the reverse complement of ``bucketIndex``;
     the bases from ``pos + self.indexLen`` onwards are processed in reading
     order, starting from ``bucketIndex`` itself.
     (Assumes the index k-mer occupies ``seq[pos:pos + self.indexLen]`` --
     TODO confirm against the caller.)

     For every base the current k-mer ``K`` is looked up with
     ``self.Successors_in_graph(K)``:

     * exactly one successor: ``'0b0'`` is appended to ``self.numFlag`` when
       the base equals the successor's last character, ``'0b1'`` otherwise.
       On a mismatch the base is also written with ``self.freq3``, using its
       ``self.dna2num`` code minus one when the base compares greater than
       the successor's last character.
     * no successor: the base code is written with ``self.freq4``.
     * several successors: ``self.getFreqs(K)`` is called and the base code
       is written with ``self.freqs``.

     Left-side symbols are written to ``self.encodeSeqPathL`` and right-side
     symbols to ``self.encodeSeqPathR``.  The counters ``newNodeNum``,
     ``simpleNodeNum``, ``tipNodeNum`` and ``bifurNodeNum`` are incremented
     for the corresponding cases.

     Returns:
         None.
     """
     # Encode the left part, walking the reverse-complement strand.
     K = dna.reverse_complement(str(bucketIndex))
     for base in dna.reverse_complement(seq[:pos]):
         pred = self.Successors_in_graph(K)
         if len(pred) == 1:
             if pred[0][-1] != base:
                 # The read disagrees with the only successor: flag it and
                 # write the actual base.
                 self.newNodeNum += 1
                 self.numFlag.append('0b1')
                 symbol = self.dna2num[base]
                 # Codes above the successor's base are shifted down by one;
                 # presumably so the three remaining bases map onto the
                 # freq3 alphabet -- confirm against how freq3 is built.
                 if base > pred[0][-1]:
                     symbol = self.dna2num[base] - 1
                 self.encodeSeqPathL.write(
                     self.freq3,
                     symbol)  # left path stores reverse-complement bases
             else:
                 # The read follows the only successor: a flag is enough.
                 self.simpleNodeNum += 1
                 self.numFlag.append('0b0')
             # In both cases the walk continues from the graph successor.
             K = pred[0]
         else:
             if len(pred) == 0:
                 # No successor: write the base with freq4.
                 self.tipNodeNum += 1
                 self.encodeSeqPathL.write(self.freq4, self.dna2num[base])
             else:
                 # Several successors: getFreqs(K) is called before writing
                 # with self.freqs (presumably it refreshes self.freqs for K
                 # -- confirm).
                 self.bifurNodeNum += 1
                 self.getFreqs(K)
                 self.encodeSeqPathL.write(self.freqs, self.dna2num[base])
             # Advance by appending the read's base to the suffix of K.
             K = self.Suffix(K) + base
     # Encode the right part, walking the read in its own orientation.
     K = str(bucketIndex)
     for base in seq[pos + self.indexLen:]:
         succ = self.Successors_in_graph(K)
         if len(succ) == 1:
             if succ[0][-1] != base:
                 # Mismatch with the only successor: flag + explicit base.
                 self.newNodeNum += 1
                 self.numFlag.append('0b1')
                 symbol = self.dna2num[base]
                 # Same code shift as on the left side.
                 if base > succ[0][-1]:
                     symbol = self.dna2num[base] - 1
                 self.encodeSeqPathR.write(self.freq3, symbol)
             else:
                 self.simpleNodeNum += 1
                 self.numFlag.append('0b0')
             K = succ[0]
         else:
             if len(succ) == 0:
                 self.tipNodeNum += 1
                 self.encodeSeqPathR.write(self.freq4, self.dna2num[base])
             else:
                 self.bifurNodeNum += 1
                 self.getFreqs(K)
                 self.encodeSeqPathR.write(self.freqs, self.dna2num[base])
             K = self.Suffix(K) + base
     return
コード例 #2
0
 def addtoGraph(self, seq, pos):
     """Record the k-mers of ``seq`` around ``pos`` in ``self.bucketDict``.

     ``self.bucketDict`` maps a k-mer prefix to a list of four counters
     (created as ``[1, 1, 1, 1]`` on first sight).

     * For each k-mer from ``self.getKmerR(seq, pos)`` the key is the k-mer
       without its last character and the counter selected by
       ``self.dna2num`` of that last character is incremented.
     * For each k-mer from ``self.getKmerL(seq, pos)`` the key is the reverse
       complement of the k-mer without its first character and the counter
       at ``3 - self.dna2num[first character]`` is incremented.

     Returns:
         None.
     """
     for kmer in self.getKmerR(seq, pos):
         node = str(kmer[:-1])
         counters = self.bucketDict.setdefault(node, [1, 1, 1, 1])
         counters[self.dna2num[str(kmer[-1])]] += 1

     for kmer in self.getKmerL(seq, pos):
         node = dna.reverse_complement(kmer[1:])
         counters = self.bucketDict.setdefault(node, [1, 1, 1, 1])
         # 3 - code: presumably the code of the complementary base, given the
         # key is stored on the reverse-complement strand -- confirm dna2num.
         counters[3 - self.dna2num[kmer[0]]] += 1
     return
コード例 #3
0
 def decompressGraph(self, bucketIndex, indexPos):
     """Rebuild one sequence around ``bucketIndex`` from the encoded streams.

     ``indexPos`` bases are decoded to the left of the index and
     ``self.seqLen - indexPos - self.indexLen`` bases to its right.

     For every base the current k-mer ``K`` is looked up with
     ``self.Successors_in_graph(K)``:

     * exactly one successor: one boolean is read from ``self.numFlag``.
       When it is true a symbol is read with ``self.freq3`` and mapped
       through ``self.num2dna``; if that base compares greater than or equal
       to the successor's last character, ``symbol + 1`` is used instead.
       When it is false the base is the successor's last character.
     * no successor: a symbol is read with ``self.freq4``.
     * several successors: ``self.getFreqs(K)`` is called and a symbol is
       read with ``self.freqs``.

     Left-side symbols come from ``self.decodeSeqPathL`` and each decoded
     base is prepended as ``self.recdna[base]``; right-side symbols come
     from ``self.decodeSeqPathR`` and each decoded base is appended as is.
     The counters ``newNodeNum``, ``simpleNodeNum``, ``tipNodeNum`` and
     ``bifurNodeNum`` are incremented for the corresponding cases.

     Returns:
         The reconstructed sequence: decoded left part + ``bucketIndex`` +
         decoded right part.
     """
     sequence = bucketIndex
     # Decode the left path, walking from the reverse complement of the index.
     K = dna.reverse_complement(bucketIndex)
     base = ""
     for i in range(indexPos):
         pred = self.Successors_in_graph(K)
         if len(pred) == 1:
             if self.numFlag.read(bool, 1)[0]:  # flag set: base differs from the successor
                 self.newNodeNum += 1
                 symbol = self.decodeSeqPathL.read(self.freq3)
                 base = self.num2dna[symbol]
                 # Codes at or above the successor's base are shifted up by
                 # one (looks like the inverse of a shift applied when
                 # encoding -- confirm against the encoder).
                 if base >= pred[0][-1]:
                     base = self.num2dna[symbol + 1]
             else:
                 # Flag clear: the base is the successor's last character.
                 self.simpleNodeNum += 1
                 base = pred[0][-1]
             # In both cases the walk continues from the graph successor.
             K = pred[0]
         else:
             if len(pred) == 0:
                 # No successor: read the base with freq4.
                 self.tipNodeNum += 1
                 symbol = self.decodeSeqPathL.read(self.freq4)
                 base = self.num2dna[symbol]
             else:
                 # Several successors: getFreqs(K) is called before reading
                 # with self.freqs (presumably it refreshes self.freqs for K
                 # -- confirm).
                 self.bifurNodeNum += 1
                 self.getFreqs(K)
                 symbol = self.decodeSeqPathL.read(self.freqs)
                 base = self.num2dna[symbol]
             K = self.Suffix(K) + base
         # Prepend the recdna-mapped base (presumably the complement, since
         # this side is walked on the reverse-complement strand -- confirm).
         sequence = self.recdna[base] + sequence
     # Decode the right path, walking from the index in reading order.
     K = bucketIndex
     for i in range(self.seqLen - indexPos - self.indexLen):
         succ = self.Successors_in_graph(K)
         if len(succ) == 1:
             if self.numFlag.read(bool, 1)[0]:  # flag set: base differs from the successor
                 self.newNodeNum += 1
                 symbol = self.decodeSeqPathR.read(self.freq3)
                 base = self.num2dna[symbol]
                 # Same code shift as on the left side.
                 if base >= succ[0][-1]:
                     base = self.num2dna[symbol + 1]
             else:
                 self.simpleNodeNum += 1
                 base = succ[0][-1]
             K = succ[0]
         else:
             if len(succ) == 0:
                 self.tipNodeNum += 1
                 symbol = self.decodeSeqPathR.read(self.freq4)
                 base = self.num2dna[symbol]
             else:
                 self.bifurNodeNum += 1
                 self.getFreqs(K)
                 symbol = self.decodeSeqPathR.read(self.freqs)
                 base = self.num2dna[symbol]
             K = self.Suffix(K) + base
         sequence += base
     return sequence
コード例 #4
0
def extract_windows_score(N, H, bg, location, w, params):
    """Score the windows returned by ``score()`` for the word ``w``.

    Arguments:
    N         -- ``N[len(w)]`` is indexed by position relative to
                 ``location[0]`` and summed over a window to get ``n``
    H         -- mapping word -> list of occurrence positions
    bg        -- background model providing ``mu(w, location)`` and
                 ``freq(w, (a, b))``
    location  -- sequence; only ``location[0]`` is used here as the origin
    w         -- the word to analyse
    params    -- dict with ``'ratio'``, ``'occ'`` (min, max) and
                 ``'width'`` (min, max)

    Returns a list of rows
    ``[w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev), a, b,
    width, number_of_rows, n, 0, 0]`` where ``ev = pv * len(H) * len(rows)``.
    An empty list is returned when the total number of occurrences of ``w``
    is outside ``params['occ']``.

    Windows whose scanned-position total ``n`` is zero are skipped with a
    warning: no observed frequency can be computed for them.
    """
    R = []
    ratio = params['ratio']
    # convert positions to coordinates relative to the start of location
    l = [p - location[0] for p in H[w]]

    if len(l) < params['occ'][0] or len(l) > params['occ'][1]:
        return []

    # NOTE(review): for len(w) == 1 this slice is [:0] and yields an empty
    # list -- confirm single-letter words never reach this function.
    mu = bg.mu(w, location)[:-len(w) + 1]

    scanned = N[len(w)]
    mu = [m * scanned[i] for i, m in enumerate(mu)]
    alpha = [x * ratio for x in mu]

    for a, b, obsOcc in score(l, alpha, mu):
        a, b = a + location[0], b + location[0]
        width = b - a + 1
        if width < params['width'][0] or width > params['width'][1]:
            continue

        if obsOcc < params['occ'][0] or obsOcc > params['occ'][1]:
            continue

        n = sum(scanned[a - location[0]:b - location[0] + 1])

        try:
            obsFreq = obsOcc / float(n)
        except ZeroDivisionError:
            # Previously the error was swallowed and the row was built with
            # an unbound (or stale, from the previous window) obsFreq.
            cli.warning('n error')
            continue

        expFreq = bg.freq(w, (a, b))
        expOcc = expFreq * n

        pv = Stats.dist.ppois(obsOcc, expOcc)
        ev = 1.0  # placeholder; the real value is filled in below

        label = '%s|%s' % (w, reverse_complement(w))
        R.append([
            w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev), a,
            b, width, 0, n, 0, 0
        ])

    # The e-value correction needs the final number of retained windows.
    for r in R:
        r[7] = r[6] * len(H) * len(R)
        r[8] = -log10(r[7])
        r[12] = len(R)

    return R
コード例 #5
0
def oligo2MM(filename):
    """Load a Markov model from an oligo-analysis formatted file.

    The file may be gzipped (detected from a ``.gz`` suffix).  Lines starting
    with ``#`` or ``;`` are comments; a comment containing
    ``grouped by pairs of reverse complements`` switches on reverse-complement
    handling.  Blank lines are ignored.  On data lines column 0 is the word,
    column 2 its frequency and column 3 its count (parsed but not used).

    The Markov order is the length of the first word minus one.  For each
    word the frequency is added to ``mm.S[prefix]`` and stored in
    ``mm.T[prefix][last letter]``.  In reverse-complement mode a word that
    differs from its reverse complement has its frequency halved and the
    reverse complement is registered with the same halved frequency.

    ``mm.priori`` is set to the per-letter frequency totals normalised over
    ``ALPHABET`` and ``mm.freq()`` is called before returning.

    Returns:
        The populated ``MM`` instance.
    """
    # Text mode in both cases: gzip.open() defaults to binary and would yield
    # bytes lines, which the str-based parsing below cannot handle.
    opener = gzip.open if filename.endswith('.gz') else open

    rc = 0
    priori = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    mm = None
    # The context manager guarantees the handle is closed, even on error.
    with opener(filename, 'rt') as f:
        for line in f:
            if line.startswith(('#', ';')):
                if line.find('grouped by pairs of reverse complements') > 0:
                    rc = 1
                continue
            elements = line.strip().split()
            if not elements:
                # Blank line: nothing to parse.
                continue
            # count is parsed for validation only; it is not used.
            w, freq, count = elements[0], float(elements[2]), int(elements[3])
            w = w.upper()

            # The first word fixes the Markov order.
            if mm is None:
                mm = MM(len(w) - 1, pseudo=0.0)

            if rc:
                wrc = reverse_complement(w)
                if w != wrc:
                    # Split the grouped frequency between the two strands.
                    freq = freq / 2.0
                    prefix = wrc[:-1]
                    mm.S[prefix] = mm.S.get(prefix, 0) + freq
                    mm.T[prefix] = mm.T.get(prefix, {})
                    mm.T[prefix][wrc[-1]] = freq
                    for letter in wrc:
                        priori[letter] += freq

            prefix = w[:-1]
            mm.S[prefix] = mm.S.get(prefix, 0) + freq
            mm.T[prefix] = mm.T.get(prefix, {})
            mm.T[prefix][w[-1]] = freq
            for letter in w:
                priori[letter] += freq

    S = float(sum(priori.values()))
    mm.priori = [priori[b] / S for b in ALPHABET]
    mm.freq()
    return mm
コード例 #6
0
def count_words_hash(sequences, l, searchLocation, strand='+-', overlap=False):
    """Index every word of length ``l`` found inside ``searchLocation``.

    Arguments:
    sequences       -- sized iterable of objects exposing ``sequence`` and
                       ``location``
    l               -- oligonucleotide length
    searchLocation  -- location tuple, for example (-200, -1)
    strand          -- '+' keeps the word as read; '+-' keeps the smaller of
                       the word and its reverse complement
    overlap         -- when false, an occurrence is dropped if it starts less
                       than ``l`` positions after the previously kept
                       occurrence of the same word in the same sequence

    Returns a dict with the keys ``N`` (``N[l][k]`` counts the words without
    'N' scanned at offset ``k`` from ``searchLocation[0]``), ``H``
    (word -> list of kept positions), ``scannedPositions`` and
    ``scannedWords``.
    """
    location = find_location(sequences)  # result is not used further down
    region_start, region_end = searchLocation[0], searchLocation[1]

    per_position = [0] * (region_end - region_start + 1)
    N = {l: per_position}
    H = {}
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)

    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (region_start, region_end))
        bases = s.sequence
        seq_start = s.location[0]
        kept = {}  # word -> positions kept in the current sequence

        # Restrict the scan to the part of the region this sequence covers.
        a = max(region_start, seq_start)
        b = min(region_end, s.location[1] - l + 1)

        for I in range(a, b + 1):
            scannedPositions += 1
            offset = I - seq_start
            window = bases[offset:offset + l]

            if window.find('N') != -1:
                continue

            if strand == '+':
                w = window
            elif strand == '+-':
                w = min(window, reverse_complement(window))

            per_position[I - region_start] += 1

            # Without overlap, skip a hit that starts before the previous
            # kept hit of the same word has ended.
            if not overlap and kept.get(w, [a - l])[-1] + l > I:
                continue

            H.setdefault(w, []).append(I)
            kept.setdefault(w, []).append(I)
            scannedWords += 1

    return dict(N=N,
                H=H,
                scannedPositions=scannedPositions,
                scannedWords=scannedWords)
コード例 #7
0
ファイル: bucket.py プロジェクト: rongjiewang/BdBG
 def reSortSeqence(self):  # re-bucket a single read
     """Assign the current record to a bucket and store a copy of the read.

     The bucket index, strand flag and index position come from
     ``self.getMostCommenIndex()``.  The bucket's count in
     ``self.bucketTable`` is incremented, ``self.read`` receives the sequence
     (reverse-complemented when the flag is set), the flag and the index
     position, and a deep copy of ``self.read`` is appended to
     ``self.sequenceTable[index]``.

     Returns:
         None.
     """
     index, reverseFlag, indexPos = self.getMostCommenIndex()
     self.bucketTable[index] = 1 + self.bucketTable.get(index, 0)

     oriented = str(self.record['sequence'])
     if reverseFlag:
         oriented = dna.reverse_complement(oriented)

     self.read['sequence'] = oriented
     self.read['reverse'] = reverseFlag
     self.read['indexPos'] = indexPos

     # Deep copy: self.read is reused for the next record.
     snapshot = copy.deepcopy(self.read)
     self.sequenceTable.setdefault(index, []).append(snapshot)
     return
コード例 #8
0
ファイル: cloning.py プロジェクト: Kortemme-Lab/klab
def sanitize_codon_list(codon_list, forbidden_seqs=()):
    """
    Make silent mutations to the given codon list to remove any undesirable
    sequences that are present within it.  Undesirable sequences include
    restriction sites, which may be optionally specified as a second argument,
    and homopolymers above a pre-defined length.

    Arguments:
    codon_list      -- list of 3-base codons; passed to
                       ``remove_bad_sequence`` for each correction
    forbidden_seqs  -- iterable of sequences or restriction-site names (names
                       are looked up in ``restriction_sites``; anything else
                       is used verbatim)

    Returns the number of corrections made to the codon list.

    Raises ValueError if any codon is not exactly 3 bases long.
    """

    # Unit test missing for:
    #   Homopolymer fixing

    for codon in codon_list:
        if len(codon) != 3:
            raise ValueError("Codons must have exactly 3 bases: '{}'".format(codon))

    # Compile a collection of all the sequences we don't want to appear in the
    # gene.  This includes the given restriction sites and their reverse
    # complements, plus any homopolymers above a pre-defined length.
    #
    # The set is built with in-place updates: set.union() returns a new set
    # and leaves the receiver untouched, which previously left this
    # collection empty.  Each addition is materialized first so the set is
    # never modified while it is being iterated.

    bad_seqs = {
            restriction_sites.get(seq, seq)
            for seq in forbidden_seqs}

    bad_seqs |= {
            dna.reverse_complement(seq)
            for seq in bad_seqs}

    bad_seqs |= {
            base * (gen9.homopolymer_max_lengths[base] + 1)
            for base in dna.dna_bases}

    bad_seqs = [
            dna.dna_to_re(bs)
            for bs in bad_seqs]

    # Remove every bad sequence from the gene by making silent mutations to the
    # codon list.

    num_corrections = 0

    for bad_seq in bad_seqs:
        while remove_bad_sequence(codon_list, bad_seq, bad_seqs):
            num_corrections += 1

    return num_corrections
コード例 #9
0
def group_rc(c):
    """Merge the counts of each word with those of its reverse complement.

    The result is keyed by the smaller of a word and its reverse complement.
    When both strands of a non-palindromic word are present in ``c`` their
    counts are added; otherwise (palindrome, or reverse complement absent)
    the word's own count is doubled.

    TODO : grouprc and overlapping on both strands
    """
    groupedc = {}
    for w in c:
        wrc = dna.reverse_complement(w)
        representative = min(w, wrc)
        if representative in groupedc:
            # The pair was already handled through the other strand.
            continue
        groupedc[representative] = c[w]
        if w == wrc or wrc not in c:
            groupedc[representative] *= 2
        else:
            groupedc[representative] += c[wrc]
    return groupedc
コード例 #10
0
ファイル: bucket.py プロジェクト: rongjiewang/BdBG
    def getMostCommenIndex(self):
        """Choose the bucket index k-mer for the current record.

        Every k-mer from ``self.getBucketKmers`` of the read, then of its
        reverse complement, is scored with ``self.getCommenOverlapKmer``; the
        first k-mer reaching the strictly highest positive score wins.  When
        no k-mer scores above zero the choice is delegated to
        ``self.getReadMinKmers()``.

        The chosen k-mer is registered through ``self.addTomutiDict`` together
        with the read in the chosen orientation.

        Returns:
            ``(index k-mer, reverseFlag, indexPos)`` where ``reverseFlag`` is
            true when the k-mer was taken from the reverse complement and
            ``indexPos`` is its rank in the ``getBucketKmers`` iteration.
        """
        read = self.record['sequence']
        rcread = dna.reverse_complement(str(read))

        freq_kmer = 0
        mostIndex = 0
        indexPos = 0
        reverseFlag = False
        # Forward strand first, so it wins ties against the reverse strand.
        for on_reverse, text in ((False, read), (True, rcread)):
            for position, kmer in enumerate(self.getBucketKmers(text)):
                m = self.getCommenOverlapKmer(kmer, text)
                if m > freq_kmer:
                    freq_kmer = m
                    mostIndex = kmer
                    reverseFlag = on_reverse
                    indexPos = position

        if freq_kmer > 0:
            # An index shared with earlier reads was found.
            chosen = mostIndex
        else:
            # No shared k-mer: fall back to the minimal k-mer of the read.
            chosen, reverseFlag, indexPos = self.getReadMinKmers()

        # Register the chosen k-mer with the read in the matching orientation.
        self.addTomutiDict(chosen, rcread if reverseFlag else read)
        return chosen, reverseFlag, indexPos
コード例 #11
0
 def __getitem__(self, key):
     """Return the probability of the word ``key``.

     In dyad mode (``self.dyad``) a word's probability is the product of
     ``self.P`` over its first and last ``self.monad`` letters; otherwise it
     is ``self.P`` of the whole word.  With ``self.strand == '+-'`` the
     probability of the reverse complement is added unless the word equals
     its own reverse complement.
     """
     def word_probability(word):
         # Dyads are scored from their two monads only.
         if self.dyad:
             return self.P(word[:self.monad]) * self.P(word[-self.monad:])
         return self.P(word)

     if self.strand != '+-':
         return word_probability(key)

     wrc = reverse_complement(key)
     if key == wrc:
         # A palindromic word is counted once.
         return word_probability(key)
     return word_probability(key) + word_probability(wrc)
コード例 #12
0
ファイル: bucket.py プロジェクト: rongjiewang/BdBG
 def reassigned(self):
     """Re-bucket every read that is alone in its bucket.

     For each bucket whose count in ``self.bucketTable`` is 1, the stored
     read is restored into ``self.record`` / ``self.read`` (the sequence is
     reverse-complemented back when it was stored reversed), the bucket is
     removed from ``self.mutiDict``, ``self.bucketTable`` and
     ``self.sequenceTable``, and ``self.reSortSeqence()`` is called to assign
     the read again.

     Returns:
         None.
     """
     # Iterate over a snapshot of the keys: buckets are deleted below and
     # reSortSeqence() may insert new ones, and mutating a dict while
     # iterating its live key view raises RuntimeError.
     for countIndex in list(self.bucketTable):
         if self.bucketTable[countIndex] != 1:
             continue

         stored = self.sequenceTable[countIndex][0]
         if stored['reverse']:
             # Recover the raw sequence.
             self.record['sequence'] = dna.reverse_complement(
                 str(stored['sequence']))
         else:
             self.record['sequence'] = stored['sequence']

         # Carry the N information over for the reassignment.
         self.read['N'] = stored['N']
         self.read['order'] = stored['order']
         self.read['len'] = stored['len']
         self.seqLen = len(self.record['sequence'])

         del self.mutiDict[countIndex]
         del self.bucketTable[countIndex]
         del self.sequenceTable[countIndex]
         self.reSortSeqence()
     return
コード例 #13
0
ファイル: bucket.py プロジェクト: rongjiewang/BdBG
    def getReadMinKmers(self):
        """Find the smallest bucket k-mer of the current record.

        K-mers from ``self.getBucketKmers`` of the read, then of its reverse
        complement, are compared against a running minimum that starts at
        ``self.maxKmer``; only a strictly smaller k-mer replaces it.

        Returns:
            ``(minKmer, reverseFlag, indexPos)`` where ``reverseFlag`` is true
            when the minimum came from the reverse complement and
            ``indexPos`` is its rank in the ``getBucketKmers`` iteration
            (0 when no k-mer was below ``self.maxKmer``).
        """
        read = self.record['sequence']
        rcread = dna.reverse_complement(str(read))

        minKmer = self.maxKmer
        reverseFlag = False
        indexPos = 0
        # Forward strand first, so it wins ties against the reverse strand.
        for on_reverse, text in ((False, read), (True, rcread)):
            for position, kmer in enumerate(self.getBucketKmers(text)):
                if kmer < minKmer:
                    minKmer = kmer
                    reverseFlag = on_reverse
                    indexPos = position
        return minKmer, reverseFlag, indexPos
コード例 #14
0
def get_positions_two_strands(c, overlap=False):
    """Collect the sorted positions of each word over both strands.

    ``c`` maps a word to a mapping whose values are iterables of items; the
    first element of each item is taken as a position.  Words are grouped
    under the smaller of the word and its reverse complement, and the
    positions of both are merged (a word equal to its own reverse complement
    is read once).

    ``overlap`` is accepted but not used.

    Returns:
        dict mapping the representative word to its sorted position list.
    """
    positions = {}
    for wf in c:
        wrc = dna.reverse_complement(wf)
        w = min(wf, wrc)
        if w in positions:
            # Already filled in through the other strand.
            continue

        per_strand = [c[wf]]
        if wf != wrc:
            per_strand.append(c.get(wrc, {}))

        merged = []
        for occurrences in per_strand:
            for group in occurrences:
                merged.extend(hit[0] for hit in occurrences[group])

        positions[w] = sorted(merged)

    return positions
コード例 #15
0
    def __next(self, a, b, obsOcc):
        """Score the window ``[a, b]`` and append a result row to ``self.R``.

        The window is ignored when its width is outside
        ``[self.MIN_WIDTH, self.MAX_WIDTH]``, when ``obsOcc`` is outside
        ``[self.MIN_OCC, self.MAX_OCC]``, or when no position was scanned in
        it (``n == 0``, reported with a warning).

        The p-value is ``pbinom_left`` when ``self.params['under']`` is set,
        ``pbinom`` otherwise.  The appended row is
        ``[w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev), a,
        b, width, 0, n, 0, 0]`` with ``ev`` fixed at 1.0 here; runs of ``N``
        in the word are rendered as ``n{count}`` in ``w`` and ``label``.

        Returns:
            None.
        """
        width = b - a + 1
        if width < self.MIN_WIDTH or width > self.MAX_WIDTH:
            return
        # The upper bound on occurrences is MAX_OCC; it used to be compared
        # against MAX_WIDTH by mistake.
        if obsOcc < self.MIN_OCC or obsOcc > self.MAX_OCC:
            return

        origin = self.location[0]
        n = sum(self.N[len(self.w)][a - origin:b - origin + 1])
        try:
            obsFreq = obsOcc / float(n)
        except ZeroDivisionError:
            # Without scanned positions there is no frequency to report;
            # previously execution continued with obsFreq unbound.
            cli.warning('n error')
            return

        expFreq = self.bg.freq(self.w, (a, b))
        expOcc = expFreq * n

        if self.params['under']:
            pv = pbinom_left(obsOcc, n, expFreq)
        else:
            pv = pbinom(obsOcc, n, expFreq)

        ev = 1.0
        label = '%s|%s' % (self.w, reverse_complement(self.w))
        w = self.w

        # Render the spacer of a dyad as n{count}.
        spaces = self.w.count('N')
        if spaces >= 1:
            label = label.replace('N' * spaces, 'n{%d}' % spaces)
            w = self.w.replace('N' * spaces, 'n{%d}' % spaces)
        self.R.append([
            w, label, obsFreq, expFreq, obsOcc, expOcc, pv, ev, -log10(ev), a,
            b, width, 0, n, 0, 0
        ])
コード例 #16
0
def count_dyads_hash(sequences,
                     l,
                     spacing,
                     searchLocation,
                     strand='+-',
                     overlap=False):
    """Index every dyad (two monads separated by a spacer) in the region.

    Arguments:
    sequences       -- sized iterable of objects exposing ``sequence`` and
                       ``location``
    l               -- monad (oligonucleotide) length
    spacing         -- number of positions between the two monads
    searchLocation  -- location tuple, for example (-200, -1)
    strand          -- '+' keeps the dyad as read; '+-' keeps the smaller of
                       the dyad and its reverse complement
    overlap         -- when false, an occurrence is dropped if it starts less
                       than the full dyad length after the previously kept
                       occurrence of the same dyad in the same sequence

    A dyad is written as monad + 'N' * spacing + monad.  Any window that
    contains an 'N' anywhere over the full dyad length is skipped.

    Returns a dict with the keys ``N`` (keyed by the full dyad length
    ``2 * l + spacing``), ``H`` (dyad -> list of kept positions),
    ``scannedPositions`` and ``scannedWords``.
    """
    lmonad = l
    l = 2 * lmonad + spacing  # from here on, l is the full dyad length

    location = find_location(sequences)  # result is not used further down
    region_start, region_end = searchLocation[0], searchLocation[1]

    per_position = [0] * (region_end - region_start + 1)
    N = {l: per_position}
    H = {}
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)
    spacer = 'N' * spacing

    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (region_start, region_end))
        bases = s.sequence
        seq_start = s.location[0]
        kept = {}  # dyad -> positions kept in the current sequence

        # Restrict the scan to the part of the region this sequence covers.
        a = max(region_start, seq_start)
        b = min(region_end, s.location[1] - l + 1)

        for I in range(a, b + 1):
            scannedPositions += 1
            offset = I - seq_start
            if bases[offset:offset + l].find('N') != -1:
                continue

            first_monad = bases[offset:offset + lmonad]
            second_monad = bases[offset + lmonad + spacing:offset + l]
            dyad = first_monad + spacer + second_monad

            if strand == '+':
                w = dyad
            elif strand == '+-':
                w = min(dyad, reverse_complement(dyad))

            per_position[I - region_start] += 1

            # Without overlap, skip a hit that starts before the previous
            # kept hit of the same dyad has ended.
            if not overlap and kept.get(w, [a - l])[-1] + l > I:
                continue

            H.setdefault(w, []).append(I)
            kept.setdefault(w, []).append(I)
            scannedWords += 1

    return dict(N=N,
                H=H,
                scannedPositions=scannedPositions,
                scannedWords=scannedWords)
コード例 #17
0
ファイル: bucket.py プロジェクト: rongjiewang/BdBG
 def mergePairRead(self, record1, record2):
     """Store both mates of a read pair as one sequence in ``self.record``.

     The result is the first mate's sequence followed by the reverse
     complement of the second mate's sequence.

     Returns:
         None.
     """
     first_mate = str(record1['sequence'])
     second_mate_rc = dna.reverse_complement(str(record2['sequence']))
     self.record['sequence'] = first_mate + second_mate_rc
     return
コード例 #18
0
def print_count(c):
    """Print one ``word|reverse_complement count`` line per entry of ``c``.

    Entries are printed in sorted ``(word, count)`` order.
    """
    for w, wcount in sorted(c.items()):
        print('%s|%s %4d' % (w, dna.reverse_complement(w), wcount))
コード例 #19
0
 def outPutSeqence(self):
     """Write the current sequence to ``self.outFile`` as one line.

     One boolean is read from ``self.readrc``; when it is true
     ``self.sequence`` is replaced by its reverse complement.
     ``self.replaceN()`` is then called before the sequence is written,
     followed by a newline.

     Returns:
         None.
     """
     stored_reversed = self.readrc.read(bool, 1)[0]
     if stored_reversed:
         self.sequence = dna.reverse_complement(self.sequence)
     self.replaceN()
     line = self.sequence + "\n"
     self.outFile.write(line)
     return
コード例 #20
0
 def ajustSeqDir(self):
     """Restore the orientation of ``self.sequence``.

     One boolean is read from ``self.readrc``; when it is true
     ``self.sequence`` is replaced by its reverse complement.

     Returns:
         None.
     """
     flag_bits = self.readrc.read(bool, 1)
     if not flag_bits[0]:
         return
     self.sequence = dna.reverse_complement(self.sequence)
     return