Example #1
0
def extract_windows(sequences, bg, location=None, wl=None, params=None):
    """Count words in sequences, then run an Extractor on each word.

    sequences  -- sequences handed to count_words
    bg         -- background object; its l, strand, overlap (and location,
                  when no location is given) attributes are read
    location   -- location to search; falls back to bg.location when falsy
    wl         -- words to process; falls back to every counted word when falsy
    params     -- passed through to count_words and Extractor

    return a dict with R (concatenated results of every Extractor.run())
    and scannedWords (as reported by count_words)
    """
    if not location:
        location = bg.location

    counts = count_words(sequences,
                         bg.l,
                         location,
                         strand=bg.strand,
                         overlap=bg.overlap,
                         params=params)
    H = counts['H']
    N = counts['N']

    if not wl:
        wl = list(H.keys())

    progress = cli.Info(len(wl), 1, 1)
    windows = []
    for word in wl:
        progress('Extracting windows')
        windows.extend(
            Extractor(N, H, bg, location=location, w=word,
                      params=params).run())

    return {'R': windows, 'scannedWords': counts['scannedWords']}
Example #2
0
def count_words_hash(sequences, l, searchLocation, strand='+-', overlap=False):
    """Count each word of length l in sequences using a hash table
    sequences       -- objects exposing .sequence and .location
    l               -- oligonucleotide length
    searchLocation  -- location tuple example (-200,-1)
    strand          -- + or +- (with +- a word and its reverse complement
                       are counted under min(word, reverse complement))
    overlap         -- allow auto-overlapping

    return a dict with keys
    N                -- {l: list of scanned N-free windows per position}
    H                -- {word: list of counted occurrence positions}
    scannedPositions -- number of positions visited
    scannedWords     -- number of occurrences stored in H
    """
    # The result is not used here; the call is kept exactly as in the
    # original in case find_location validates its input.
    find_location(sequences)

    first, last = searchLocation[0], searchLocation[1]
    per_position = [0] * (last - first + 1)
    N = {l: per_position}
    H = {}
    scannedPositions = 0
    scannedWords = 0
    progress = cli.Info(len(sequences), 1, 1)

    for seq in sequences:
        progress('Counting words in [%+05d:%+05d]' % (first, last))
        bases = seq.sequence
        seq_start = seq.location[0]

        # Clip the search interval to the positions where a full word fits.
        lo = max(first, seq_start)
        hi = min(last, seq.location[1] - l + 1)

        # word -> position of its last counted occurrence in this sequence
        last_hit = {}

        for pos in range(lo, hi + 1):
            scannedPositions += 1
            offset = pos - seq_start
            window = bases[offset:offset + l]

            if window.find('N') != -1:
                continue

            if strand == '+':
                w = window
            elif strand == '+-':
                w = min(window, reverse_complement(window))

            per_position[pos - first] += 1

            # Without overlap, skip an occurrence that starts before the
            # previously counted occurrence of the same word has ended.
            if not overlap and last_hit.get(w, lo - l) + l > pos:
                continue

            H.setdefault(w, []).append(pos)
            last_hit[w] = pos
            scannedWords += 1

    return {
        'N': N,
        'H': H,
        'scannedPositions': scannedPositions,
        'scannedWords': scannedWords,
    }
Example #3
0
def realtest(N=2):
    """Load Test/MM0.fa into a SuffixTree and return the tree.

    N -- used both as the tree's maxDepth and as its maxIUPAC

    The parsed sequences are printed before the tree is filled.
    """
    fasta_path = 'Test/MM0.fa'
    records = dna.fasta2sequences(fasta_path)
    print(records)

    tree = SuffixTree(maxDepth=N,
                      overlapping=True,
                      maxIUPAC=N,
                      NExtension=(1, 1),
                      storePosition=0)

    progress = cli.Info(len(records))
    for record in records:
        progress('Processing sequence %s' % record.id)
        tree.add_dna(record.sequence)

    return tree
Example #4
0
    def build_markov(self, sequences, order=1):
        """Learn one Markov model per window and store it in self.

        sequences  -- objects exposing get_dna(location)
        order      -- order passed to the MM constructor

        For every window start i in range(location[0], location[1] - l + 1,
        step) a model is trained on the window (i, i + W - 1) of each
        sequence and stored as self[(i, i + W - 1)]. The sorted window keys
        are finally stored in self.intervals.
        """
        start = self.location[0]
        stop = self.location[1] - self.l + 1

        # Floor division keeps the progress total an integer: the operands
        # are ints (they also feed range() below) and a plain `/` yields a
        # float under Python 3.
        info = cli.Info((stop - start) // self.step + 1)

        for i in range(start, stop, self.step):
            info('building Markov BG pos=%i' % i)
            loc = (i, i + self.W - 1)
            mm = MM(order)
            mm.dyad = self.params['dyad']
            mm.monad = self.l
            mm.overlap = self.overlap
            mm.strand = self.strand
            mm.set_NExtension(self.params['spacing'])
            mm.learn([s.get_dna(loc) for s in sequences])
            self[loc] = mm

        self.intervals = sorted(self.keys())
Example #5
0
    def build(self, sequences):
        """Compute word frequencies per window and store them in self.

        sequences  -- sequences handed to count_words

        For every window start i in range(location[0], location[1] - l + 1,
        step) the words of the window (i, i + W - 1) are counted and each
        word's occurrence list is replaced by its frequency: number of
        occurrences divided by the total of the N counts for that word
        length. The result is stored as self[(i, i + W - 1)] and the sorted
        window keys are finally stored in self.intervals.
        """
        start = self.location[0]
        stop = self.location[1] - self.l + 1

        # Floor division keeps the progress total an integer: the operands
        # are ints (they also feed range() below) and a plain `/` yields a
        # float under Python 3.
        info = cli.Info((stop - start) // self.step + 1)

        for i in range(start, stop, self.step):
            loc = (i, i + self.W - 1)
            info('building BG pos=%i' % i)
            r = count_words(sequences,
                            self.l,
                            loc,
                            strand=self.strand,
                            overlap=self.overlap,
                            params=self.params)
            H = r['H']
            N = r['N']

            # The denominator only depends on the word length, so it is
            # summed once per length instead of once per word.
            totals = {}
            for k in H:
                size = len(k)
                if size not in totals:
                    totals[size] = float(sum(N[size]))
                H[k] = len(H[k]) / totals[size]
            self[loc] = H

        self.intervals = sorted(self.keys())
Example #6
0
 def add_sequences(self, sequences):
     """Feed the .sequence of every item to self.add_dna, reporting progress.

     sequences  -- sized collection of objects exposing .id and .sequence
     """
     progress = cli.Info(len(sequences))
     for seq in sequences:
         progress('Proccessing sequence %s' % seq.id)
         self.add_dna(seq.sequence)
Example #7
0
def count_words_tree(sequences,
                     l,
                     searchLocation,
                     strand='+-',
                     overlap=False,
                     error=0,
                     spacing=(1, 1)):
    """Count each word of length l in sequences using a suffix tree
    sequences       -- objects exposing .location and get_dna(location)
    l               -- oligonucleotide length (also the tree's maxDepth)
    searchLocation  -- location tuple example (-200,-1)
    strand          -- +- extracts positions on two strands, any other
                       value extracts single-strand positions
    overlap         -- allow auto-overlapping
    error           -- passed to the suffix tree as maxIUPAC
    spacing         -- passed to the suffix tree as NExtension

    return a dict with keys
    N                -- {l: list of scanned N-free windows per position}
    H                -- positions extracted from the suffix tree
    scannedPositions -- always 0 in this implementation
    scannedWords     -- always 0 in this implementation
    """
    # The result is not used here; the call is kept exactly as in the
    # original in case find_location validates its input.
    find_location(sequences)

    first, last = searchLocation[0], searchLocation[1]
    per_position = [0] * (last - first + 1)
    N = {l: per_position}
    progress = cli.Info(len(sequences), 1, 1)

    tree = ST.SuffixTree(maxDepth=l,
                         overlapping=overlap,
                         maxIUPAC=error,
                         NExtension=spacing,
                         storePosition=True)

    for seq in sequences:
        progress('Counting words in [%+05d:%+05d]' % (first, last))
        seq_start = seq.location[0]

        # Clip the search interval to the positions where a full word fits.
        lo = max(first, seq_start)
        hi = min(last, seq.location[1] - l + 1)

        fragment = seq.get_dna((lo, hi + l + 1))
        tree.add_dna(fragment, shift=lo)

        for pos in range(lo, hi + 1):
            # NOTE(review): the offset is relative to the sequence start
            # while fragment was fetched starting at lo -- confirm against
            # get_dna that this indexes the intended window when
            # lo > seq_start.
            offset = pos - seq_start
            if fragment[offset:offset + l].find('N') >= 0:
                continue
            per_position[pos - first] += 1

    extracted = tree.extract(minLength=l, maxLength=l)
    if strand == '+-':
        H = ST.get_positions_two_strands(extracted, overlap)
    else:
        H = ST.get_positions(extracted)

    # NOTE(review): unlike the hash-based counters, the scanned counters
    # are never updated here -- confirm whether callers rely on them.
    return {
        'N': N,
        'H': H,
        'scannedPositions': 0,
        'scannedWords': 0,
    }
Example #8
0
def count_dyads_hash(sequences,
                     l,
                     spacing,
                     searchLocation,
                     strand='+-',
                     overlap=False):
    """Count each dyad (two words of length l separated by spacing) in sequences
    sequences       -- objects exposing .sequence and .location
    l               -- oligonucleotide (monad) length
    spacing         -- number of bases between the two monads; they are
                       written as N in the dyad key
    searchLocation  -- location tuple example (-200,-1)
    strand          -- + or +- (with +- a dyad and its reverse complement
                       are counted under min(dyad, reverse complement))
    overlap         -- allow auto-overlapping

    return a dict with keys
    N                -- {2 * l + spacing: list of scanned N-free windows
                        per position}
    H                -- {dyad: list of counted occurrence positions}
    scannedPositions -- number of positions visited
    scannedWords     -- number of occurrences stored in H
    """
    monad = l
    span = 2 * monad + spacing  # full dyad length, spacer included

    # The result is not used here; the call is kept exactly as in the
    # original in case find_location validates its input.
    find_location(sequences)

    first, last = searchLocation[0], searchLocation[1]
    per_position = [0] * (last - first + 1)
    N = {span: per_position}
    H = {}
    scannedPositions = 0
    scannedWords = 0
    progress = cli.Info(len(sequences), 1, 1)

    for seq in sequences:
        progress('Counting words in [%+05d:%+05d]' % (first, last))
        bases = seq.sequence
        seq_start = seq.location[0]

        # Clip the search interval to the positions where a full dyad fits.
        lo = max(first, seq_start)
        hi = min(last, seq.location[1] - span + 1)

        # dyad -> position of its last counted occurrence in this sequence
        last_hit = {}

        for pos in range(lo, hi + 1):
            scannedPositions += 1
            offset = pos - seq_start

            # Any N inside the whole span (spacer included) skips the position.
            if bases[offset:offset + span].find('N') != -1:
                continue

            left = bases[offset:offset + monad]
            right = bases[offset + monad + spacing:offset + span]
            dyad = left + 'N' * spacing + right

            if strand == '+':
                w = dyad
            elif strand == '+-':
                w = min(dyad, reverse_complement(dyad))

            per_position[pos - first] += 1

            # Without overlap, skip an occurrence that starts before the
            # previously counted occurrence of the same dyad has ended.
            if not overlap and last_hit.get(w, lo - span) + span > pos:
                continue

            H.setdefault(w, []).append(pos)
            last_hit[w] = pos
            scannedWords += 1

    return {
        'N': N,
        'H': H,
        'scannedPositions': scannedPositions,
        'scannedWords': scannedWords,
    }