def extract_windows(sequences, bg, location=None, wl=None, params=None):
    """Run an Extractor over every requested word and pool the results.

    sequences -- sequences handed to count_words
    bg -- background object; its l, strand and overlap attributes (and its
          location, as a fallback) are read here, and it is passed on to
          every Extractor
    location -- search location; bg.location is used when this is falsy
    wl -- words to process; every key of the counted H table when falsy
    params -- forwarded unchanged to count_words and to Extractor

    return a dict with
        R -- concatenation of what each Extractor.run() returned
        scannedWords -- the scannedWords entry of the count_words result
    """
    location = location or bg.location
    counts = count_words(sequences, bg.l, location, strand=bg.strand,
                         overlap=bg.overlap, params=params)
    occurrences, scanned = counts['H'], counts['N']
    words = wl or list(occurrences.keys())

    progress = cli.Info(len(words), 1, 1)
    windows = []
    for word in words:
        progress('Extracting windows')
        windows.extend(
            Extractor(scanned, occurrences, bg, location=location, w=word, params=params).run()
        )
    return {'R': windows, 'scannedWords': counts['scannedWords']}
def count_words_hash(sequences, l, searchLocation, strand='+-', overlap=False):
    """Count each word of length l in sequences using plain dictionaries.

    sequences -- sized collection of objects exposing .sequence and .location
    l -- oligonucleotide length
    searchLocation -- location tuple example (-200,-1)
    strand -- '+' (word as read) or '+-' (the word and its reverse
              complement are pooled under whichever of the two sorts first)
    overlap -- allow auto-overlapping; when False, an occurrence starting
               fewer than l positions after the previously recorded
               occurrence of the same word in the same sequence is skipped

    return a dict with
        N -- {l: list} holding one counter per position of searchLocation;
             each counter counts the N-free words read at that position
        H -- {word: list of the positions at which it was recorded}
        scannedPositions -- positions visited, N-containing ones included
        scannedWords -- number of occurrences recorded in H

    raise ValueError as soon as a word must be keyed while strand is
    neither '+' nor '+-' (previously this surfaced as an UnboundLocalError)
    """
    # The result is not used here; the call is kept as in the original in
    # case it validates its input -- TODO confirm and drop if it is a pure query.
    find_location(sequences)

    start, end = searchLocation[0], searchLocation[1]
    H = {}
    N = {l: [0] * (end - start + 1)}
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)
    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (start, end))
        dna = s.sequence
        origin = s.location[0]
        # word -> position of its last recorded occurrence in this sequence
        last_kept = {}
        a = max(start, origin)
        b = min(end, s.location[1] - l + 1)
        for I in range(a, b + 1):
            scannedPositions += 1
            i = I - origin
            word = dna[i:i + l]
            if 'N' in word:
                continue
            if strand == '+':
                w = word
            elif strand == '+-':
                w = min(word, reverse_complement(word))
            else:
                raise ValueError("strand must be '+' or '+-', got %r" % (strand,))
            N[l][I - start] += 1
            # a - l as the default makes the very first position always acceptable
            if not overlap and last_kept.get(w, a - l) + l > I:
                continue
            H.setdefault(w, []).append(I)
            last_kept[w] = I
            scannedWords += 1
    return dict(N=N, H=H, scannedPositions=scannedPositions, scannedWords=scannedWords)
def realtest(N=2):
    """Build a SuffixTree from the sequences read from 'Test/MM0.fa'.

    N -- value used for both maxDepth and maxIUPAC of the tree

    The parsed sequences are printed, then each one is added to the tree
    with add_dna while progress is reported through cli.Info.

    return the populated SuffixTree
    """
    sequences = dna.fasta2sequences('Test/MM0.fa')
    print(sequences)
    tree = SuffixTree(maxDepth=N, overlapping=True, maxIUPAC=N,
                      NExtension=(1, 1), storePosition=0)
    progress = cli.Info(len(sequences))
    for seq in sequences:
        progress('Processing sequence %s' % seq.id)
        tree.add_dna(seq.sequence)
    return tree
def build_markov(self, sequences, order=1):
    """Learn one MM model per window and store it in self keyed by window.

    sequences -- objects exposing get_dna(location)
    order -- passed to the MM constructor

    Window starts run from self.location[0] up to (but excluding)
    self.location[1] - self.l + 1 in steps of self.step; each window is the
    tuple (start, start + self.W - 1). Once every window is stored,
    self.intervals is set to the sorted list of self's keys.
    """
    first = self.location[0]
    stop = self.location[1] - self.l + 1
    # NOTE(review): '/' is true division under Python 3, so Info is given a
    # float here -- confirm that cli.Info expects that.
    info = cli.Info((stop - first) / self.step + 1)
    for start in range(first, stop, self.step):
        info('building Markov BG pos=%i' % start)
        loc = (start, start + self.W - 1)
        model = MM(order)
        model.dyad = self.params['dyad']
        model.monad = self.l
        model.overlap = self.overlap
        model.strand = self.strand
        model.set_NExtension(self.params['spacing'])
        model.learn([s.get_dna(loc) for s in sequences])
        self[loc] = model
    self.intervals = sorted(self.keys())
def build(self, sequences):
    """Store, for each window, the frequency of every word counted in it.

    sequences -- sequences handed to count_words

    Window starts run from self.location[0] up to (but excluding)
    self.location[1] - self.l + 1 in steps of self.step; each window is the
    tuple (start, start + self.W - 1). For a window, the H table returned
    by count_words is converted in place: every word's occurrence list is
    replaced by its length divided by the sum of the N counters registered
    for that word's length. Once every window is stored, self.intervals is
    set to the sorted list of self's keys.
    """
    first = self.location[0]
    stop = self.location[1] - self.l + 1
    # NOTE(review): '/' is true division under Python 3, so Info is given a
    # float here -- confirm that cli.Info expects that.
    info = cli.Info((stop - first) / self.step + 1)
    for start in range(first, stop, self.step):
        loc = (start, start + self.W - 1)
        info('building BG pos=%i' % start)
        counts = count_words(sequences, self.l, loc, strand=self.strand,
                             overlap=self.overlap, params=self.params)
        table = counts['H']
        scanned = counts['N']
        # Only values are reassigned, so iterating the dict meanwhile is safe.
        for word, positions in table.items():
            table[word] = len(positions) / float(sum(scanned[len(word)]))
        self[loc] = table
    self.intervals = sorted(self.keys())
def add_sequences(self, sequences):
    """Feed the DNA of every sequence to self.add_dna, reporting progress.

    sequences -- sized collection of objects exposing .id and .sequence
    """
    progress = cli.Info(len(sequences))
    for seq in sequences:
        progress('Proccessing sequence %s' % seq.id)
        self.add_dna(seq.sequence)
def count_words_tree(sequences, l, searchLocation, strand='+-', overlap=False, error=0, spacing=(1, 1)):
    """Count each word of length l in sequences through an ST.SuffixTree.

    sequences -- sized collection of objects exposing .location and get_dna
    l -- oligonucleotide length (also the tree's maxDepth)
    searchLocation -- location tuple example (-200,-1)
    strand -- '+-' selects ST.get_positions_two_strands; any other value
              selects ST.get_positions
    overlap -- passed to the tree as overlapping and to the two-strand lookup
    error -- passed to the tree as maxIUPAC
    spacing -- passed to the tree as NExtension

    return a dict with
        N -- {l: list} holding one counter per position of searchLocation
        H -- whatever the selected ST position lookup returns
        scannedPositions, scannedWords -- both 0 (see note in the body)
    """
    # The result is not used here; the call is kept as is in case it
    # validates its input -- TODO confirm.
    find_location(sequences)

    lo, hi = searchLocation[0], searchLocation[1]
    N = {l: [0] * (hi - lo + 1)}
    # NOTE(review): neither counter is ever incremented in this function, so
    # both are always reported as 0 -- confirm whether callers rely on them.
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)

    st = ST.SuffixTree(maxDepth=l, overlapping=overlap, maxIUPAC=error,
                       NExtension=spacing, storePosition=True)
    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (lo, hi))
        a = max(lo, s.location[0])
        b = min(hi, s.location[1] - l + 1)
        dna = s.get_dna((a, b + l + 1))
        st.add_dna(dna, shift=a)
        # NOTE(review): dna was requested starting at a, yet it is indexed
        # relative to s.location[0] below -- check get_dna's coordinates.
        origin = s.location[0]
        for I in range(a, b + 1):
            i = I - origin
            if 'N' in dna[i:i + l]:
                continue
            N[l][I - lo] += 1

    C = st.extract(minLength=l, maxLength=l)
    if strand == '+-':
        H = ST.get_positions_two_strands(C, overlap)
    else:
        H = ST.get_positions(C)
    return dict(N=N, H=H, scannedPositions=scannedPositions, scannedWords=scannedWords)
def count_dyads_hash(sequences, l, spacing, searchLocation, strand='+-', overlap=False):
    """Count each dyad (monad, spacer, monad) in sequences.

    sequences -- sized collection of objects exposing .sequence and .location
    l -- oligonucleotide (monad) length
    spacing -- number of spacer positions between the two monads
    searchLocation -- location tuple example (-200,-1)
    strand -- '+' (dyad as read) or '+-' (the dyad and its reverse
              complement are pooled under whichever of the two sorts first)
    overlap -- allow auto-overlapping; when False, an occurrence starting
               fewer than 2*l + spacing positions after the previously
               recorded occurrence of the same dyad in the same sequence
               is skipped

    A dyad is keyed as first monad + 'N' * spacing + second monad. A stretch
    containing an 'N' anywhere in its 2*l + spacing letters is skipped.

    return a dict with
        N -- {2*l + spacing: list} holding one counter per position of
             searchLocation; each counter counts the N-free stretches read there
        H -- {dyad: list of the positions at which it was recorded}
        scannedPositions -- positions visited, N-containing ones included
        scannedWords -- number of occurrences recorded in H

    raise ValueError as soon as a dyad must be keyed while strand is
    neither '+' nor '+-' (previously this surfaced as an UnboundLocalError)
    """
    lmonad = l
    l = 2 * lmonad + spacing  # full dyad width; used as the key of N
    spacer = 'N' * spacing

    # The result is not used here; the call is kept as in the original in
    # case it validates its input -- TODO confirm and drop if it is a pure query.
    find_location(sequences)

    start, end = searchLocation[0], searchLocation[1]
    H = {}
    N = {l: [0] * (end - start + 1)}
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)
    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (start, end))
        dna = s.sequence
        origin = s.location[0]
        # dyad -> position of its last recorded occurrence in this sequence
        last_kept = {}
        a = max(start, origin)
        b = min(end, s.location[1] - l + 1)
        for I in range(a, b + 1):
            scannedPositions += 1
            i = I - origin
            if 'N' in dna[i:i + l]:
                continue
            second = i + lmonad + spacing
            wf = dna[i:i + lmonad] + spacer + dna[second:second + lmonad]
            if strand == '+':
                w = wf
            elif strand == '+-':
                w = min(wf, reverse_complement(wf))
            else:
                raise ValueError("strand must be '+' or '+-', got %r" % (strand,))
            N[l][I - start] += 1
            # a - l as the default makes the very first position always acceptable
            if not overlap and last_kept.get(w, a - l) + l > I:
                continue
            H.setdefault(w, []).append(I)
            last_kept[w] = I
            scannedWords += 1
    return dict(N=N, H=H, scannedPositions=scannedPositions, scannedWords=scannedWords)