Exemple #1
0
def suffix_tree_edges(word):
    '''Return the edge substrings of the suffix tree built for ``word``.

    The tree is built with GeneralizedSuffixTree. Any edge substring that
    contains '$' has its final character dropped; all other edge substrings
    are returned unchanged. The result is a list, in the iteration order of
    the tree's ``edges`` mapping.
    '''
    tree = GeneralizedSuffixTree(word)

    labels = []
    for edge in tree.edges.values():
        label = tree.edge_substring(edge)
        # The generalized tree marks the end of word k with "$k" (so "$0" for
        # a single word); trimming the last character turns "$0" into the
        # plain suffix-tree terminator "$".
        if '$' in label:
            label = label[:-1]
        labels.append(label)

    return labels
Exemple #2
0
def suffix_tree_edges(word):
    '''Return the list of edge substrings of the suffix tree for ``word``.

    Edge substrings are read from a GeneralizedSuffixTree built over ``word``.
    Substrings containing '$' lose their last character; the rest are passed
    through untouched.
    '''
    def to_plain_terminator(label):
        # Generalized trees end word k with "$k"; for a lone word that is
        # "$0", and cutting the trailing character leaves the usual "$".
        if '$' in label:
            return label[:-1]
        return label

    gst = GeneralizedSuffixTree(word)
    raw_labels = [gst.edge_substring(edge) for edge in gst.edges.values()]
    return [to_plain_terminator(label) for label in raw_labels]
Exemple #3
0
def longest_common_substring(string_list):
    '''Return the longest substring common to all strings in string_list.

    A GeneralizedSuffixTree is built over ``string_list``. Nodes whose
    ``words`` collection has as many entries as ``string_list`` are treated
    as shared by every input string, and the substring of the deepest such
    node is returned. On ties, the node with the lowest index wins.

    Raises:
        ValueError: propagated from max() if no node qualifies.
    '''
    gst = GeneralizedSuffixTree(string_list)
    word_count = len(string_list)

    # Use range + a comprehension instead of xrange + filter so the function
    # runs unchanged on both Python 2 and Python 3 (xrange no longer exists
    # in Python 3).
    candidate_nodes = [
        i for i in range(len(gst.nodes))
        if len(gst.nodes[i].words) == word_count
    ]

    # Depth corresponds to substring length, so the deepest shared node
    # spells the longest common substring.
    deepest_node = max(candidate_nodes, key=lambda i: gst.node_depth(i))

    return gst.node_substring(deepest_node)
Exemple #4
0
def longest_repeat_substring(word, n):
    '''Return the longest substring found to appear at least n times in word.

    A GeneralizedSuffixTree is built over ``word``. Nodes with at least ``n``
    children are kept as candidates, and the substring of the deepest
    candidate is returned. On ties, the node with the lowest index wins.

    Raises:
        ValueError: propagated from max() if no node has at least n children.
    '''
    gst = GeneralizedSuffixTree(word)

    # NOTE(review): a node's direct child count is used as a proxy for how
    # often its substring occurs. That looks like a sufficient but not a
    # necessary condition (occurrences usually correspond to leaves below the
    # node, not direct children) -- confirm against GeneralizedSuffixTree.
    #
    # range + a comprehension replaces xrange + filter so the function runs on
    # both Python 2 and Python 3.
    candidate_nodes = [
        i for i in range(len(gst.nodes))
        if len(gst.nodes[i].children) >= n
    ]

    # Node depth is the proper substring length, i.e. the length without the
    # out-of-alphabet terminator characters.
    best_node = max(candidate_nodes, key=lambda i: gst.node_depth(i))

    return gst.node_substring(best_node)
Exemple #5
0
def longest_common_substring(string_list):
    '''Return the longest substring that occurs in every string of string_list.

    Builds a GeneralizedSuffixTree over ``string_list``, keeps the nodes whose
    ``words`` collection has one entry per input string, and returns the
    substring of the deepest of those nodes. The first such node in index
    order is chosen when several share the maximum depth.

    Raises:
        ValueError: propagated from max() when no node qualifies.
    '''
    gst = GeneralizedSuffixTree(string_list)
    required = len(string_list)

    # xrange exists only on Python 2; range with a list comprehension gives
    # the same candidate list on Python 2 and also works on Python 3.
    shared_nodes = [
        index for index in range(len(gst.nodes))
        if len(gst.nodes[index].words) == required
    ]

    # Greater depth means a longer substring, so take the deepest shared node.
    deepest_node = max(shared_nodes, key=gst.node_depth)

    # The answer is the substring spelled on the way to that node.
    return gst.node_substring(deepest_node)
Exemple #6
0
def longest_repeat_substring(word, n):
    '''Return the longest substring found to occur at least n times in word.

    Builds a GeneralizedSuffixTree over ``word``, keeps every node that has at
    least ``n`` children, and returns the substring of the deepest such node.
    The first such node in index order is chosen on ties.

    Raises:
        ValueError: propagated from max() when no node has n or more children.
    '''
    gst = GeneralizedSuffixTree(word)

    # NOTE(review): the child count stands in for the occurrence count here.
    # Presumably a substring can occur n times while its node has fewer than
    # n direct children (the occurrences being deeper leaves) -- verify
    # whether counting descendants' leaves is what is actually wanted.
    #
    # xrange is Python 2-only; range plus a comprehension behaves the same
    # there and keeps the function usable on Python 3.
    repeated_nodes = [
        index for index in range(len(gst.nodes))
        if len(gst.nodes[index].children) >= n
    ]

    # Depth is the substring length excluding out-of-alphabet characters, so
    # the deepest candidate gives the longest repeated substring.
    best_node = max(repeated_nodes, key=gst.node_depth)

    return gst.node_substring(best_node)
Exemple #7
0
def shortest_nonshared_substring(string_list):
    '''Return the shortest substring unique to the first word in string_list.

    A GeneralizedSuffixTree is built over ``string_list``. Candidate nodes are
    those whose ``words`` equals ``{0}`` (reached only by the first word) and
    whose incoming edge is not just the terminator '$0'. The result is the
    substring of the chosen node's parent plus the first character of the
    edge leading to the node.

    Raises:
        ValueError: propagated from min() if there is no candidate node.
    '''
    gst = GeneralizedSuffixTree(string_list)

    def incoming_edge(i):
        # Substring on the edge from node i's parent down to node i.
        return gst.edge_substring(gst.edges[gst.nodes[i].parent, i])

    # range + a comprehension replaces xrange + nested filter calls so this
    # runs on both Python 2 and Python 3. The '$0' check is evaluated only
    # for nodes that pass the words test, as in the two-stage filter.
    #
    # An edge consisting solely of '$0' is trivially unique to the first word
    # and does not represent a real substring, so it is excluded.
    candidate_nodes = [
        i for i in range(len(gst.nodes))
        if gst.nodes[i].words == {0} and incoming_edge(i) != '$0'
    ]

    # Only the first character of the last edge is needed to become unique,
    # so a candidate's substring length is parent depth + 1.
    shortest = min(candidate_nodes,
                   key=lambda i: gst.node_depth(gst.nodes[i].parent) + 1)

    parent = gst.nodes[shortest].parent
    return gst.node_substring(parent) + incoming_edge(shortest)[0]
Exemple #8
0
def shortest_nonshared_substring(string_list):
    '''Return the shortest substring found only in the first word of string_list.

    Builds a GeneralizedSuffixTree over ``string_list`` and considers nodes
    whose ``words`` equals ``{0}``, skipping those whose incoming edge is
    exactly '$0'. For the best node, returns its parent's substring extended
    by the first character of the edge into the node.

    Raises:
        ValueError: propagated from min() when no node qualifies.
    '''
    gst = GeneralizedSuffixTree(string_list)
    nodes = gst.nodes

    def edge_label(node_index):
        # Substring on the edge that connects the node to its parent.
        parent_index = nodes[node_index].parent
        return gst.edge_substring(gst.edges[parent_index, node_index])

    # Nodes traversed only by the first word. xrange is Python 2-only, so use
    # range with a comprehension, which works on Python 2 and Python 3 alike.
    first_word_only = [
        index for index in range(len(nodes)) if nodes[index].words == {0}
    ]

    # Drop nodes reached by the bare terminator '$0': that edge is trivially
    # exclusive to the first word and carries no real substring.
    candidates = [
        index for index in first_word_only if edge_label(index) != '$0'
    ]

    # Taking just the first character of the final edge already makes the
    # substring unique, so each candidate costs parent depth + 1 characters.
    shortest = min(
        candidates,
        key=lambda index: gst.node_depth(nodes[index].parent) + 1)

    prefix = gst.node_substring(nodes[shortest].parent)
    return prefix + edge_label(shortest)[0]