def count_words_tree(sequences, l, searchLocation, strand='+-', overlap=False, error=0, spacing=(1, 1)):
    """Index every word of length ``l`` found in ``sequences`` with a suffix tree.

    Arguments:
    sequences -- iterable of sequence objects; each must expose ``location``
                 (indexable, [0] and [1] are read) and ``get_dna(location)``
    l -- oligonucleotide (word) length
    searchLocation -- (start, end) location tuple, for example (-200, -1)
    strand -- '+-' collects positions through ST.get_positions_two_strands,
              any other value through ST.get_positions
    overlap -- forwarded to the suffix tree as ``overlapping`` and to
               ST.get_positions_two_strands
    error -- forwarded to the suffix tree as ``maxIUPAC``
    spacing -- forwarded to the suffix tree as ``NExtension``

    Returns a dict with the keys:
    N -- {l: list}, one counter per position of searchLocation, holding the
         number of scanned windows at that position whose word has no 'N'
    H -- positions structure built by the ST helper selected by ``strand``
    scannedPositions -- always 0 (never updated by this function)
    scannedWords -- always 0 (never updated by this function)
    """
    # The result is not used below; the call is kept as the original made it.
    find_location(sequences)

    search_start = searchLocation[0]
    search_end = searchLocation[1]

    # Per-position counters for the single word size handled here.
    per_position = [0] * (search_end - search_start + 1)
    N = {l: per_position}
    scannedPositions = 0
    scannedWords = 0
    info = cli.Info(len(sequences), 1, 1)

    #
    # Build the suffix tree
    #
    tree = ST.SuffixTree(
        maxDepth=l,
        overlapping=overlap,
        maxIUPAC=error,
        NExtension=spacing,
        storePosition=True,
    )

    for s in sequences:
        info('Counting words in [%+05d:%+05d]' % (searchLocation[0], searchLocation[1]))

        # Clip the search window to this sequence; `last` is the last
        # position at which a full word of length l can still start.
        first = max(search_start, s.location[0])
        last = min(search_end, s.location[1] - l + 1)

        dna = s.get_dna((first, last + l + 1))
        tree.add_dna(dna, shift=first)

        # NOTE(review): the index into `dna` is taken relative to
        # s.location[0] although `dna` was requested starting at `first`;
        # the two differ when searchLocation[0] > s.location[0] -- confirm
        # against get_dna that this is intended.
        origin = s.location[0]
        for position in range(first, last + 1):
            begin = position - origin
            word = dna[begin:begin + l]
            if word.find('N') < 0:
                per_position[position - search_start] += 1

    # @DEBUG: ST.display(tree.root, maxDepth=6, full=1)

    #
    # Collect the positions of every word of length l
    #
    words = tree.extract(minLength=l, maxLength=l)
    if strand == '+-':
        H = ST.get_positions_two_strands(words, overlap)
    else:
        H = ST.get_positions(words)

    return dict(N=N, H=H, scannedPositions=scannedPositions, scannedWords=scannedWords)