def search2(P, sStr):
    '''
    Find every occurrence of the substring P in sStr.

    A suffix array of sStr is built with tks.simple_kark_sort and
    binary-searched twice: once for the first suffix whose leading len(P)
    characters are >= P (lower bound) and once for the first suffix whose
    leading len(P) characters are > P (upper bound). Every suffix between
    the two bounds starts with P.

    Returns the start offsets of all matches in increasing order, or an
    empty list when P does not occur in sStr.
    '''
    sa = tks.simple_kark_sort(sStr)
    m = len(P)
    n = len(sStr)

    # Lower bound over the n suffix entries sa[0..n-1].
    left, right = 0, n
    while left < right:
        mid = (left + right) >> 1
        # Plain comparison operators replace the Python-2-only cmp() builtin.
        if sStr[sa[mid]:sa[mid] + m] >= P:
            right = mid
        else:
            left = mid + 1
    start = left
    # start == n means every suffix sorts before P; stop here instead of
    # reading sa[n], which lies past the n real suffix entries.
    if start >= n or sStr[sa[start]:sa[start] + m] != P:
        return []

    # Upper bound. No index below start can compare greater than P, so the
    # search may begin at start.
    left, right = start, n
    while left < right:
        mid = (left + right) >> 1
        if sStr[sa[mid]:sa[mid] + m] > P:
            right = mid
        else:
            left = mid + 1
    end = left

    # Suffix-array order is lexicographic; callers get text order.
    return sorted(sa[start:end])
Beispiel #2
0
Datei: bwt.py Projekt: altitu/BIF
def getBWTAndSA(s, psa):
    """Return the Burrows-Wheeler transform of s and a subsampled suffix array.

    The suffix array comes from tks.simple_kark_sort. The BWT holds, for each
    suffix-array entry, the character of s that precedes that suffix.

    :param s: the text to index
    :param psa: passed through to subsampleArray (presumably the sampling
        step -- confirm against that helper)
    :return: tuple (bwt as a list of characters, subsampled suffix array)
    """
    sa = tks.simple_kark_sort(s)
    # The modulo wraps the suffix starting at 0 around to the last character.
    bwt = [s[(position - 1) % len(s)] for position in sa]
    return (bwt, subsampleArray(sa, psa))
Beispiel #3
0
def get_pair_longest_overlap(fragments, min_overlap):
    '''
    Generator yielding pairs of fragment labels, largest overlap first.

    Algorithm:
        Concatenate fragments + labels into a single string
        Build a suffix array from that string
        Compute the longest common prefix (LCP) array for the suffix array
        Rank the LCP entries from largest to smallest
        For each ranked entry whose LCP is at least min_overlap:
            Read the label that follows the suffix at that entry and the
            label that follows the suffix at the next entry.
            Yield the two labels when both exist and they differ.

    Iteration stops at the first LCP value below min_overlap.
    '''

    # One string holding every fragment together with its label.
    concat_frags = build_fragment_str(fragments)

    # Suffix array via the Karkkainen-Sanders algorithm, then its LCP array.
    sa = tks.simple_kark_sort(concat_frags)
    lcp = tks.LCP(concat_frags, sa)

    # (position, lcp value) pairs, largest LCP value first.
    ranked = sorted(enumerate(lcp), key=lambda x: x[1], reverse=True)

    for position, overlap in ranked:
        # Everything after this entry is smaller still, so stop entirely.
        if overlap < min_overlap:
            break

        # Collect the labels of this suffix-array entry and of its successor.
        pair = []
        for sa_index in (position, position + 1):
            # Labels are integers prefixed with "$$$" and followed by "!!!".
            marker = concat_frags.find('$$$', sa[sa_index])
            if marker < 0:
                break

            begin = marker + 3
            finish = concat_frags.find('!!!', begin)

            # Convert the label text to int.
            pair.append(int(concat_frags[begin:finish]))

        # Skip entries without two labels, or whose two suffixes come from
        # the same fragment.
        if len(pair) == 2 and pair[0] != pair[1]:
            yield pair[0], pair[1]
Beispiel #4
0
def get_sa_lcp(s):
  """Return the suffix array and LCP array of s, each cut to len(s) entries.

  s is first decoded to unicode as UTF-8, with undecodable bytes replaced.
  """
  text = unicode(s, 'utf-8', 'replace')
  length = len(text)
  suffix_array = tks.simple_kark_sort(text)
  lcp_array = tks.LCP(text, suffix_array)
  # The arrays coming back from tks are not exactly len(text) long, so both
  # are sliced to that length before being returned.
  return (suffix_array[:length], lcp_array[:length])
  def __init__(self, s):
    """Store the text s and build its suffix array and BWT string.

    After the arrays are set, init_bwt() and init_isa() are called to finish
    the setup.
    """
    self.s = s
    self.n = len(s)
    # simple_kark_sort returns a pair here; only its second item is used.
    _unused, suffix_array = tks.simple_kark_sort(s)
    self.sa = suffix_array[:self.n]
    # The LCP array is currently not computed:
    #    self.lcp = tks.LCP(s,self.sa)
    # BWT: the character preceding each suffix. For the suffix starting at 0
    # the index -1 wraps around to the last character of s.
    preceding = []
    for i in xrange(self.n):
      preceding.append(s[self.sa[i] - 1])
    self.bwt = ''.join(preceding)
    self.init_bwt()
    self.init_isa()
def longest(s):
    '''
    Return the longest repeating substring of s.

    The suffix array and LCP array of s are built with tks; the first LCP
    entry holding the maximum value identifies the suffix and the length of
    the substring to return.
    '''
    sa = tks.simple_kark_sort(s)
    lcp = tks.LCP(s, sa)
    # (index, value) of the best LCP entry so far; strict ">" keeps the
    # first entry among equal maxima.
    best = (-1, -1)
    for entry in enumerate(lcp):
        if entry[1] > best[1]:
            best = entry
    begin = sa[best[0]]
    return s[begin:begin + best[1]]
def longest(s):
    '''
    Return (count, substring) for the longest repeating substring of s.

    The substring is taken from the first LCP entry holding the maximum
    value. count starts at 1 and grows by one for every LCP entry that
    equals the maximum and whose suffix begins with the same character as
    the selected suffix.
    '''
    # simple_kark_sort returns a pair here; only the suffix array is used.
    _, sa = tks.simple_kark_sort(s)
    lcp = tks.LCP(s, sa)

    best_rank, best_len = -1, -1
    for rank, length in enumerate(lcp):
        if length > best_len:
            best_rank, best_len = rank, length

    count = 1 + sum(
        1
        for rank, length in enumerate(lcp)
        if s[sa[rank]] == s[sa[best_rank]] and length == best_len
    )

    begin = sa[best_rank]
    return count, s[begin:begin + best_len]
Beispiel #8
0
def get_seq(fasta: str):
    """
    Read the reference sequence from a FASTA file and build its suffix array.

    Lines starting with ">" are skipped. The first other line is stripped,
    terminated with "$" and handed to tks.simple_kark_sort; later lines are
    never read.

    :param fasta: path of the reference FASTA file
    :return: tuple (sequence ending in "$", suffix array of that sequence),
        or None when the file holds no line other than ">" lines
    """
    with open(fasta) as handle:
        for raw_line in handle:
            # Header lines start with ">" and carry no sequence data.
            if raw_line[0] == ">":
                continue
            s = raw_line.strip() + "$"
            return s, tks.simple_kark_sort(s)
    return None
def search(P, sStr):
    '''
    Find one occurrence of the substring P in sStr.

    A suffix array of sStr is built with tks.simple_kark_sort and
    binary-searched for the first suffix (in sorted order) whose leading
    len(P) characters are >= P.

    Returns the start offset of that suffix when it begins with P, else -1.
    When P occurs several times, the offset returned is the one whose
    suffix sorts first, which is not necessarily the leftmost occurrence
    in sStr.
    '''
    sa = tks.simple_kark_sort(sStr)
    m = len(P)
    n = len(sStr)

    # Lower bound over the n suffix entries sa[0..n-1].
    left, right = 0, n
    while left < right:
        mid = (left + right) >> 1
        # Plain comparison operators replace the Python-2-only cmp() builtin.
        if sStr[sa[mid]:sa[mid] + m] >= P:
            right = mid
        else:
            left = mid + 1

    # left == n means every suffix sorts before P; answer "not found"
    # instead of reading sa[n], which lies past the n real suffix entries.
    if left < n and sStr[sa[left]:sa[left] + m] == P:
        return sa[left]
    return -1
Beispiel #10
0
#Initialization of reads and readsBioPalind (described in the original
#notes as the reverse complementary of each read).
reads, readsBioPalind = [], []
# The context manager closes the reads file as soon as parsing is done.
with open(readsfile, "r") as readsHandle:
    for line in readsHandle:
        if line[0] != ">":  #lines with > do not contain sequences, but merely comments about the sequences.
            # Strip only the newline: slicing off the last character would
            # also cut a real base from a final line that has no trailing
            # newline. Lower case for practical reasons when calling posdict.
            read = line.rstrip("\n").lower()
            reads.append(read)
            #We also stock the biological palyndromes
            readsBioPalind.append(biologicalPalyndrome(read))
# Build SA, BWT, Rank and F from the reference, one after the other.
# Each step is bracketed by startChrono()/endChrono() and its result is
# printed as the step's duration.

print("generating SA")
startChrono()
refSA = tks.simple_kark_sort(reference)
elapsed = endChrono()
print(" done in " + str(elapsed) + " s")

print("generating BWT")
startChrono()
refBWT = getBWT(reference, refSA)
elapsed = endChrono()
print(" done in " + str(elapsed) + " s")

print("generating ranks")
startChrono()
refRank = getRank(refBWT)
elapsed = endChrono()
print(" done in " + str(elapsed) + " s")

print("generating F")
startChrono()
refF = getF(refBWT)
elapsed = endChrono()
print(" done in " + str(elapsed) + " s")
"""""" """""" """
useful functions
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import tools_karkkainen_sanders as tks


# Sanity check of the suffix array from tks.simple_kark_sort: every pair of
# neighbouring entries whose suffixes are out of lexicographic order is
# printed (first 40 characters of each), followed by a separator line.
# Other inputs tried: open('ooo.txt').read(), open('Python.htm').read(), 'abab'
s = unicode('ab' * 10000, 'utf-8', 'replace')
n = len(s)
ns, sa = tks.simple_kark_sort(s)
lcp = tks.LCP(s, sa)

for i in xrange(n - 1):
  current, following = s[sa[i]:], s[sa[i + 1]:]
  # Correctly ordered neighbours produce no output.
  if current <= following:
    continue
  print(current[:40])
  print(following[:40])
  print('=' * 50)
Beispiel #12
0
import tools_karkkainen_sanders as tks
import sys

# Input is read from input.txt: first the number of test cases, then one
# line per case.
sys.stdin = open('input.txt')
numTest = int(input())
for itertest in range(numTest):
    line = raw_input().strip()
    m = len(line)
    # Double the line before building the suffix array.
    line += line
    SA = tks.simple_kark_sort(line)
    # Print (1-based) the first suffix-array entry that starts inside the
    # original, undoubled line, then move on to the next case.
    for v in SA:
        if v >= m:
            continue
        print(v + 1)
        break
 def build(self):
     """Build the suffix array and LCP array of self.corpus_str.

     The results are stored in self.sa and self.lcp.
     """
     text = self.corpus_str
     self.sa = tks.simple_kark_sort(text)
     self.lcp = tks.LCP(text, self.sa)
def construct_suffix_array(x):
    """Return the suffix array of x, cut to len(x) entries.

    x is first decoded to unicode as UTF-8, with undecodable bytes replaced;
    the length used for the cut is that of the decoded text.
    """
    text = unicode(x, 'utf-8', 'replace')
    suffix_array = tks.simple_kark_sort(text)
    return suffix_array[0:len(text)]
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import tools_karkkainen_sanders as tks

# Demo: build the suffix array and LCP array of a short sample string and
# print both.
s = unicode('ababbbaaabbaddavvabba', 'utf-8', 'replace')
n = len(s)
sa = tks.simple_kark_sort(s)
lcp = tks.LCP(s, sa)
print(sa)
print(lcp)

# Print the first 40 characters of the suffix at each of the first n - 1
# suffix-array entries.
for i in xrange(n - 1):
    suffix = s[sa[i]:]
    print(suffix[:40])
	def build(self):
		"""Build the suffix array and LCP array of self.corpus_str.

		The results are stored in self.sa and self.lcp.
		"""
		text = self.corpus_str
		self.sa = tks.simple_kark_sort(text)
		self.lcp = tks.LCP(text, self.sa)
Beispiel #17
0
        line = raw_input().strip()
        if line == 'END TDP CODEBASE':
            break
        tdpStr.append(line + '\n')
    jcnStr = []
    raw_input()
    while True:
        line = raw_input()
        if line.strip() == 'END JCN CODEBASE':
            break
        jcnStr.append(line + '\n')
    N = len(''.join(jcnStr))
    M = len(''.join(tdpStr))
    S = ''.join(jcnStr) + '$' + ''.join(tdpStr)

    SA = tks.simple_kark_sort(S)
    LCP = tks.LCP(S, SA)

    sortedLCP = []
    L = N + M + 1
    for i in range(L):
        for j in range(i, L):
            begin = SA[i]
            end = SA[j + 1]
            if (begin > N and end < N) or (begin < N and end > N):
                sortedLCP.append((-1 * min(LCP[i:j + 1]), min(begin, end)))

    sortedLCP.sort()
    used = []
    for dup in sortedLCP:
        if K == 0:
Beispiel #18
0
if format(args) != 'Namespace()':
    ################READ AND KEEPBACK SEQUENCE OF INPUT FILE################
    # To use the Burrows Wheeler algorithm a "$" has to be added at the end
    # of the sequence so that it is usable by the other functions.
    #
    # FASTA sequences may be wrapped over several lines, so every non-empty
    # line without ">" is collected and joined. Keeping only the last such
    # line would truncate a wrapped sequence, and a trailing blank line
    # would reduce the sequence to a bare "$".
    sequence_parts = []
    # The context manager closes the reference file after reading.
    with open(str(args.ref), 'r') as ref:
        for line in ref:
            line = str(line).replace('\n', '')  ##delete the line break
            if line and '>' not in line:
                sequence_parts.append(line)
    if sequence_parts:
        sequence = ''.join(sequence_parts) + "$"
    else:
        # No sequence line found: keep the empty default.
        sequence = ''

    ################SA[i] KEEP BACK################
    ##SA[i] is calculated thanks to tools_karkkainen_sanders
    sa = tks.simple_kark_sort(sequence)
    # The BWT is computed once and reused below; a second, discarded call
    # to get_BWT is not needed.
    bwt = get_BWT(sequence, sa)

    ################CREATION OF OUT FILE################
    # Save the index to a file in dataframe (CSV) format; the output path is
    # given in the arguments.
    d = {'SA[i]': sa, 'BWT': bwt}
    df = pd.DataFrame(data=d)
    df.to_csv(str(args.out),
              encoding='utf-8',
              index=False,
              mode='w',
              header=True)
        line = raw_input().strip()
        if line == 'END TDP CODEBASE':
            break
        tdpStr.append(line + '\n')
    jcnStr = []
    raw_input()
    while True:
        line = raw_input()
        if line.strip() == 'END JCN CODEBASE':
            break
        jcnStr.append(line + '\n')
    N = len(''.join(jcnStr))
    M = len(''.join(tdpStr))
    S = ''.join(jcnStr) + '$' + ''.join(tdpStr)

    SA = tks.simple_kark_sort(S)
    LCP = tks.LCP(S, SA)

    sortedLCP = []
    L = N + M + 1
    for i in range(L):
        for j in range(i, L):
            begin = SA[i]
            end = SA[j + 1]
            if (begin > N and end < N) or (begin < N and end > N):
                sortedLCP.append((-1 * min(LCP[i:j + 1]), min(begin, end)))

    sortedLCP.sort()
    used = []
    for dup in sortedLCP:
        if K == 0: