Beispiel #1
0
def string_search(P, T):
    """Return the start index of every occurrence of pattern P in text T.

    The pattern is compared right-to-left against each alignment and is
    shifted by the larger of the bad-character and good-suffix shifts.
    Galil's rule is used to stop a phase early when the left part of the
    alignment is already known to match.

    Args:
        P: the pattern; each mismatching text character is passed to
            ``alphabet_index``.
        T: the text to search in.

    Returns:
        A list of indices ``s`` (ascending) at which an occurrence of P was
        found in T.  The list is empty when P is empty or longer than T.

    NOTE(review): the soundness of the shifts assumes ``good_suffix_table``
    and ``full_shift_table`` follow the usual Boyer-Moore definitions; they
    are defined outside this block -- confirm against their implementation.
    """
    matches = []
    if len(P) == 0 or len(T) < len(P):
        return matches

    # Preprocessing
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)

    k = len(P) - 1      # Represents alignment of end of P relative to T
    # Galil bound: when previous_k != -1, every text position of the current
    # alignment that is <= previous_k is already known to match P, so the
    # right-to-left scan may stop as soon as it reaches previous_k.
    previous_k = -1
    while k < len(T):
        i = len(P) - 1  # Character to compare in P
        h = k           # Character to compare in T
        while i >= 0 and h > previous_k and P[i] == T[h]:   # Matches starting from end of P
            i -= 1
            h -= 1
        if i == -1 or h == previous_k:  # Match has been found (Galil's rule)
            matches.append(k - len(P) + 1)
            # previous_k is kept: after a full match the overlap between the
            # old and the new alignment is a border of P, so any bound that
            # was valid before still lies inside known-matching text.
            k += len(P)-F[1] if len(P) > 1 else 1
        else:   # No match, shift by max of bad character and good suffix rules
            char_shift = i - R[alphabet_index(T[h])][i]
            if i+1 == len(P):   # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i+1] == -1:   # Matched suffix does not appear anywhere in P
                suffix_shift = len(P) - F[i+1]
            else:               # Matched suffix appears in P
                suffix_shift = len(P) - L[i+1]
            shift = max(char_shift, suffix_shift)
            # Galil's rule may only be armed when the text that the shifted
            # pattern's prefix will overlap is guaranteed to match it:
            #   * the shift moves P completely past the old alignment, or
            #   * the shift moves P's left end past the mismatch AND it is the
            #     good-suffix shift (a bad-character shift alone says nothing
            #     about the overlapped text).
            # In every other case a bound inherited from an earlier phase no
            # longer describes the new alignment, so it is dropped; keeping it
            # could report occurrences whose prefix was never compared.
            galil_safe = shift >= len(P) or (
                shift >= i+1 and suffix_shift >= char_shift
            )
            previous_k = k if galil_safe else -1
            k += shift
    return matches
Beispiel #2
0
def bad_character_table(S):
    """Build the bad-character lookup table for S.

    The result has 26 rows, one per value returned by ``alphabet_index``.
    For non-empty S each row starts with -1 and gains one entry per
    character of S, so ``table[c][i + 1]`` is the largest position ``<= i``
    at which a character with alphabet index ``c`` occurs in S, or -1 if
    there is none.  For empty S every row is an empty list.
    """
    alphabet_size = 26
    if len(S) == 0:
        return [[] for _ in range(alphabet_size)]

    table = [[-1] for _ in range(alphabet_size)]
    last_seen = [-1] * alphabet_size
    for position, symbol in enumerate(S):
        last_seen[alphabet_index(symbol)] = position
        # Snapshot the most recent position of every letter after this step.
        for row, latest in zip(table, last_seen):
            row.append(latest)
    return table
 def test_single_char_string(self):
     """A one-letter pattern yields [-1, 0] for its own row and [-1, -1] elsewhere."""
     for letter in 'abcdefghijklmnopqrstuvwxyz':
         own_row = alphabet_index(letter)
         wanted = [
             [-1, 0] if row == own_row else [-1, -1]
             for row in range(26)
         ]
         self.assertEqual(wanted, bad_character_table(letter))
def string_search(P, T):
    """Return the start index of every occurrence of pattern P in text T.

    Boyer-Moore style search that additionally remembers, for each alignment
    end position ``k`` already processed, how many text characters were
    walked over in that phase (``M[k]``).  Together with the pattern's own
    suffix-match lengths (``N``) this lets later phases skip over text that
    was compared before.  The branch labels ("Case 1".."Case 5") follow the
    original comments; the structure appears to be the Apostolico-Giancarlo
    variant of Boyer-Moore.

    Args:
        P: the pattern; each mismatching text character is passed to
            ``alphabet_index``.
        T: the text to search in.

    Returns:
        A list of indices (ascending) at which an occurrence of P was found
        in T.  The list is empty when P is empty or longer than T.
    """
    if len(P) == 0 or len(T) < len(P):
        return []

    matches = []

    # Preprocessing
    # N is indexed by a pattern position i everywhere below and is compared
    # against i+1 (the length of P[0..i]), so it must be computed from the
    # pattern, not from the text.  Reversing P, preprocessing and reversing
    # the result turns prefix-match lengths into suffix-match lengths.
    # NOTE(review): assumes fundamental_preprocess returns Z-style
    # prefix-match lengths -- it is defined outside this block; confirm.
    N = fundamental_preprocess(P[::-1]) # S[::-1] reverses S
    N.reverse()
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)
    M = [-1 for c in T]     # M[k] = k - h recorded when the phase ending at k finished; -1 = none yet

    k = len(P) - 1      # Represents alignment of end of P relative to T
    i = len(P) - 1      # Character to compare in P
    h = k               # Character to compare in T
    match = False       # Indicates whether an exact match has been found in this phase
    mismatch = False    # Indicates whether a mismatch has occurred

    while k < len(T):
        if M[h] == -1 or M[h] == 0 or N[i] == 0:    # Phase case 1
            # Nothing usable is recorded for this position: compare directly.
            if T[h] == P[i]:
                if i == 0:  # Case 1a
                    match = True
                    mismatch = False
                else:       # Case 1b
                    i -= 1
                    h -= 1
                    match = False
                    mismatch = False
            else:           # Case 1c
                match = False
                mismatch = True
        elif (M[h] < N[i] and M[h] != -1) or (M[h] == N[i] and 0 < N[i] < i+1): # Case 2 & 5
            # Skip M[h] positions in both P and T without comparing them.
            i -= M[h]
            h -= M[h]
            match = False
            mismatch = False
        elif M[h] >= N[i] and N[i] == i+1 > 0:  # Phase case 3
            # N[i] covers all of P[0..i], so the remainder is treated as matching.
            match = True
            mismatch = False
        elif M[h] > N[i] and N[i] < i+1:    # Phase case 4
            # Skip N[i] positions and treat the next one as a mismatch.
            i -= N[i]
            h -= N[i]
            match = False
            mismatch = True
        if match:
            matches.append(k - len(P) + 1)
            M[k] = k - h
            k += len(P)-F[1] if len(P) > 1 else 1
            # Start a new phase at the new alignment.
            i = len(P) - 1
            h = k
            match = False
            mismatch = False
        if mismatch:
            char_shift = i - R[alphabet_index(T[h])][i]
            if i+1 == len(P):   # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i+1] == -1:   # Matched suffix does not appear anywhere in P
                suffix_shift = len(P) - F[i+1]
            else:               # Matched suffix appears in P
                suffix_shift = len(P) - L[i+1]
            M[k] = k - h
            k += max(char_shift, suffix_shift)
            # Start a new phase at the new alignment.
            i = len(P) - 1
            h = k
            match = False
            mismatch = False
    return matches
 def test_uppercase(self):
     """The upper-cased letter at position p of self.alpha must map to index p."""
     for position in range(26):
         letter = self.alpha[position].upper()
         self.assertEqual(position, alphabet_index(letter))
 def test_lowercase(self):
     """The letter at position p of self.alpha must map to index p."""
     for position in range(26):
         letter = self.alpha[position]
         self.assertEqual(position, alphabet_index(letter))