Example #1
0
def multiple_pattern_matching(text, patterns):
    """
    CODE CHALLENGE: Solve the Multiple Pattern Matching Problem.
    Input: A string Text followed by a collection of strings Patterns.
    Output: All starting positions in Text where a string from Patterns appears as a substring.

    Each pattern is located with a backward search over ``bwt(text)`` and the
    matching rows are translated to text positions through
    ``suffix_array(text)``.

    :param text: the text to search; a ``'$'`` sentinel is appended when the
        text does not already end with one (an empty text becomes ``'$'``).
    :param patterns: iterable of pattern strings.
    :return: sorted list of starting positions, concatenated over all
        patterns (a position is repeated if several patterns start there).
    """
    # to cope with not $-ending text (endswith also copes with empty text)
    if not text.endswith('$'):
        text += '$'
    # NOTE(review): bwt / suffix_array are defined outside this block; this
    # code assumes s is the last column of the sorted rotations and that a[i]
    # is the text position of the i-th row -- confirm against the helpers.
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] == number of occurrences of c in s[:i]
    running_counts = {}
    count_symbol = [{}]
    for symbol in s:
        running_counts[symbol] = running_counts.get(symbol, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurrence[c] == index of the first c in the sorted symbols of s
    first_occurrence = {}
    for index, symbol in enumerate(sorted(s)):
        first_occurrence.setdefault(symbol, index)

    def pattern_positions(pattern):
        # Narrow the [top, bottom] row range one symbol at a time, reading
        # the pattern from its last symbol to its first.
        top = 0
        bottom = len(s) - 1
        for symbol in reversed(pattern):
            seen_before_top = count_symbol[top].get(symbol, 0)
            seen_upto_bottom = count_symbol[bottom + 1].get(symbol, 0)
            if seen_upto_bottom <= seen_before_top:
                # symbol does not occur in s[top:bottom + 1]: no match
                return []
            top = first_occurrence[symbol] + seen_before_top
            bottom = first_occurrence[symbol] + seen_upto_bottom - 1
        return [a[i] for i in range(top, bottom + 1)]

    pos = []
    for pattern in patterns:
        pos += pattern_positions(pattern)
    pos.sort()
    return pos
def multiple_pattern_matching(text, patterns):
    """
    CODE CHALLENGE: Solve the Multiple Pattern Matching Problem.
    Input: A string Text followed by a collection of strings Patterns.
    Output: All starting positions in Text where a string from Patterns appears as a substring.

    Works by backward search over ``bwt(text)``; matching rows are mapped to
    text positions with ``suffix_array(text)``.

    :param text: the text to search; ``'$'`` is appended when it is not
        already the last character (an empty text becomes ``'$'``).
    :param patterns: iterable of pattern strings.
    :return: sorted list with the starting positions found for every
        pattern (positions shared by several patterns appear several times).
    """
    # to cope with not $-ending text; endswith is also safe on empty text
    if not text.endswith('$'):
        text += '$'
    # NOTE(review): both helpers live outside this block; s is assumed to be
    # the Burrows-Wheeler transform of text and a its suffix array -- confirm.
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i] maps each symbol to its number of occurrences in s[:i]
    count_symbol = [{}]
    for c in s:
        snapshot = dict(count_symbol[-1])
        snapshot[c] = snapshot.get(c, 0) + 1
        count_symbol.append(snapshot)

    # first_occurrence maps each symbol to its first index in sorted(s)
    first_occurrence = {}
    for i, c in enumerate(sorted(s)):
        if c not in first_occurrence:
            first_occurrence[c] = i

    def pattern_positions(pattern):
        top = 0
        bottom = len(s) - 1
        # consume the pattern from its last symbol to its first
        for symbol in pattern[::-1]:
            low = count_symbol[top].get(symbol, 0)
            high = count_symbol[bottom + 1].get(symbol, 0)
            if high <= low:
                # no row in the current range is preceded by this symbol
                return []
            base = first_occurrence[symbol]
            top = base + low
            bottom = base + high - 1
        return [a[row] for row in range(top, bottom + 1)]

    pos = []
    for pattern in patterns:
        pos.extend(pattern_positions(pattern))
    pos.sort()
    return pos
Example #3
0
def multiple_approximate_pattern_matching(text, patterns, d):
    """
    CODE CHALLENGE: Solve the Multiple Approximate Pattern Matching Problem.
    Input: A string Text, followed by a collection of strings Patterns, and an integer d.
    Output: All positions where one of the strings in Patterns appears as a substring of Text with
    at most d mismatches.

    Each pattern is cut into seeds; every exact seed hit (found by backward
    search over ``bwt(text)``) yields a candidate position that is then
    verified character by character against the text.

    :param text: the text to search; a ``'$'`` sentinel is appended when the
        text does not already end with one (an empty text becomes ``'$'``).
    :param patterns: iterable of pattern strings, each longer than ``d``.
    :param d: maximum number of mismatches allowed per occurrence.
    :return: sorted list of starting positions; positions are de-duplicated
        per pattern but not across patterns.
    :raises AssertionError: if a pattern is not longer than ``d``.
    """
    # to cope with not $-ending text (endswith also copes with empty text)
    if not text.endswith('$'):
        text += '$'
    # The sentinel is not part of the searchable text: occurrences must end
    # before it, otherwise '$' would merely be counted as one mismatch.
    searchable_length = len(text) - 1

    # NOTE(review): bwt / suffix_array are defined outside this block; this
    # code assumes s is the last column of the sorted rotations and that a[i]
    # is the text position of the i-th row -- confirm against the helpers.
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] == number of occurrences of c in s[:i]
    running_counts = {}
    count_symbol = [{}]
    for symbol in s:
        running_counts[symbol] = running_counts.get(symbol, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurrence[c] == index of the first c in the sorted symbols of s
    first_occurrence = {}
    for index, symbol in enumerate(sorted(s)):
        first_occurrence.setdefault(symbol, index)

    # move from patterns to seeds
    def pattern_to_seeds(p):
        # Returns (seed, offset of the seed inside p) pairs that tile p.
        length = len(p)
        assert length > d
        # floor division keeps minsize an int on Python 2 and Python 3
        minsize = length // (d + 1)
        cut = list(range(0, length - minsize + 1, minsize))
        cut.append(length)
        return [(p[start:stop], start) for start, stop in zip(cut, cut[1:])]

    def seed_positions(seed):
        # Exact backward search, reading the seed from last symbol to first.
        top = 0
        bottom = len(s) - 1
        for symbol in reversed(seed):
            seen_before_top = count_symbol[top].get(symbol, 0)
            seen_upto_bottom = count_symbol[bottom + 1].get(symbol, 0)
            if seen_upto_bottom <= seen_before_top:
                return []
            top = first_occurrence[symbol] + seen_before_top
            bottom = first_occurrence[symbol] + seen_upto_bottom - 1
        return [a[i] for i in range(top, bottom + 1)]

    def is_approximately_matching(offset, p):
        mismatches = 0
        for i, c in enumerate(p):
            if c != text[offset + i]:
                mismatches += 1
                if mismatches > d:
                    return False
        return True

    def approximate_pattern_positions(p):
        found = set()
        for seed, offset in pattern_to_seeds(p):
            for candidate_position in seed_positions(seed):
                pattern_position = candidate_position - offset
                if pattern_position < 0:
                    # candidate matching before text starts ....
                    continue
                if pattern_position + len(p) > searchable_length:
                    # candidate matching after text stops ....
                    continue
                if is_approximately_matching(pattern_position, p):
                    # candidate matching with at most d mismatches
                    found.add(pattern_position)
        return list(found)

    pos = []
    for pattern in patterns:
        pos += approximate_pattern_positions(pattern)
    pos.sort()
    return pos
def multiple_approximate_pattern_matching(text, patterns, d):
    """
    CODE CHALLENGE: Solve the Multiple Approximate Pattern Matching Problem.
    Input: A string Text, followed by a collection of strings Patterns, and an integer d.
    Output: All positions where one of the strings in Patterns appears as a substring of Text with
    at most d mismatches.

    Seed-and-verify: a pattern is split into seeds, the seeds are located
    exactly by backward search over ``bwt(text)``, and each resulting
    candidate position is checked against the text with a mismatch budget
    of ``d``.

    :param text: the text to search; ``'$'`` is appended when it is not
        already the last character (an empty text becomes ``'$'``).
    :param patterns: iterable of pattern strings, each longer than ``d``.
    :param d: maximum number of mismatches allowed per occurrence.
    :return: sorted list of starting positions (unique per pattern, but a
        position found for several patterns is listed once per pattern).
    :raises AssertionError: if a pattern is not longer than ``d``.
    """
    # to cope with not $-ending text; endswith is also safe on empty text
    if not text.endswith('$'):
        text += '$'
    # Last index a match may reach: the trailing '$' is a sentinel, not text,
    # so it must not be usable as a "mismatched" pattern character.
    text_end = len(text) - 1

    # NOTE(review): both helpers live outside this block; s is assumed to be
    # the Burrows-Wheeler transform of text and a its suffix array -- confirm.
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i] maps each symbol to its number of occurrences in s[:i]
    count_symbol = [{}]
    for c in s:
        snapshot = dict(count_symbol[-1])
        snapshot[c] = snapshot.get(c, 0) + 1
        count_symbol.append(snapshot)

    # first_occurrence maps each symbol to its first index in sorted(s)
    first_occurrence = {}
    for i, c in enumerate(sorted(s)):
        if c not in first_occurrence:
            first_occurrence[c] = i

    # move from patterns to seeds
    def pattern_to_seeds(p):
        l = len(p)
        assert l > d
        # integer division on every Python version; l > d makes minsize >= 1
        minsize = l // (d + 1)
        cut = list(range(0, l - minsize + 1, minsize))
        cut.append(l)
        # each seed is paired with its offset inside the pattern
        return [(p[cut[i - 1]:cut[i]], cut[i - 1])
                for i in range(1, len(cut))]

    def seed_positions(seed):
        top = 0
        bottom = len(s) - 1
        # consume the seed from its last symbol to its first
        for symbol in seed[::-1]:
            low = count_symbol[top].get(symbol, 0)
            high = count_symbol[bottom + 1].get(symbol, 0)
            if high <= low:
                # no row in the current range is preceded by this symbol
                return []
            base = first_occurrence[symbol]
            top = base + low
            bottom = base + high - 1
        return [a[row] for row in range(top, bottom + 1)]

    def is_approximately_matching(offset, p):
        # True when p differs from text[offset:offset + len(p)] in <= d places
        budget = d
        for i, c in enumerate(p):
            if c != text[offset + i]:
                budget -= 1
                if budget < 0:
                    return False
        return True

    def approximate_pattern_positions(p):
        pattern_positions = set()
        for (seed, offset) in pattern_to_seeds(p):
            for candidate_position in seed_positions(seed):
                pattern_position = candidate_position - offset
                # skip candidates starting before the text or running past
                # its last real character
                if pattern_position < 0:
                    continue
                if pattern_position + len(p) > text_end:
                    continue
                if is_approximately_matching(pattern_position, p):
                    # candidate matching with at most d mismatches
                    pattern_positions.add(pattern_position)
        return list(pattern_positions)

    pos = []
    for pattern in patterns:
        pos.extend(approximate_pattern_positions(pattern))
    pos.sort()
    return pos