def compute_rank_successors(string, ranks):
    """Compute, for every position, its successor position of the next rank.

    For each rank ``r`` in the rank lookup for which rank ``r + 1`` is also
    present, every position of rank ``r`` is assigned one position of rank
    ``r + 1``: either the last one that lies before it or the first one that
    lies after it, whichever has the smaller ``compute_rank_distance``
    (ties go to the one before it).

    NOTE(review): the two-pointer sweep below presumes that every list in
    ``rank_lookup`` holds positions in increasing order -- confirm against
    ``build_rank_table_and_lookup``.

    Args:
        string: The input string; only its length is used directly here.
        ranks: Rank information forwarded to ``build_rank_table_and_lookup``.

    Returns:
        A list with one entry per position of ``string``. An entry is the
        successor position, or ``None`` when the position has no successor
        (its rank is infinite or no position has the next rank).

    Raises:
        RuntimeError: If the lookup contains rank ``r + 1`` but lists no
            positions for it.
    """
    # Build rank table and rank lookup.
    rank_table, rank_lookup = build_rank_table_and_lookup(string, ranks)

    # Build the successors table.
    succ = [None] * len(string)
    rmqtable = rangemaxq.rmq_pre(rank_table)

    for rank in rank_lookup:
        # inf + 1 == inf, so the infinite rank would otherwise be treated as
        # its own successor rank.
        if (rank + 1) not in rank_lookup or rank == float('inf'):
            continue

        cur_rank = rank_lookup[rank]
        next_rank = rank_lookup[rank + 1]
        if len(next_rank) < 1:
            # The input string must be complete. It is rank + 1 that has no
            # characters, so that is the rank reported.
            raise RuntimeError(
                'There cannot be 0 chars with rank %d.' % (rank + 1))

        # j1 indexes the nearest next-rank position before `pos`, j2 the
        # nearest one after it. Both only ever move forward across the sweep.
        j1 = j2 = 0
        for pos in cur_rank:
            while j2 < len(next_rank) and pos > next_rank[j2]:
                j1 = j2
                j2 += 1
            if j2 == len(next_rank):
                # No next-rank position after `pos`: fall back to the last
                # one before it so both candidates coincide.
                j2 = j1

            dist1 = compute_rank_distance(next_rank[j1], pos, rmqtable,
                                          rank_table, absolute=False)
            dist2 = compute_rank_distance(pos, next_rank[j2], rmqtable,
                                          rank_table, absolute=False)
            succ[pos] = next_rank[j1] if dist1 <= dist2 else next_rank[j2]

    return succ
def maximal_substrings(string):
    """Collect the rank intervals of maximal substrings of ``string``.

    For every start position ``i`` the function takes the longest slice
    starting at ``i`` that does not contain the character ``string[i - 1]``
    (the whole string for ``i == 0``), builds the rank structures for that
    slice, and then follows "paths" level by level: paths start at the
    positions of rank 0 and are repeatedly extended to the rank successor of
    their last position. At each level, rank intervals of paths whose bounds
    fit inside the rank interval of their last position are reported.

    Args:
        string: The input string.

    Returns:
        A list of the entries of ``compute_rank_intervals`` that were
        reported, in the order they were found.
    """
    def fits_inside(inner, outer):
        # True when interval `inner` is contained within interval `outer`.
        return inner[0] >= outer[0] and inner[1] <= outer[1]

    length = len(string)
    alphabet = list(set(string))

    # Collects the intervals of maximal substrings.
    intervals = []

    for i in range(length):
        # Determine the maximal substring starting at i: it stops at the
        # first occurrence (at or after i) of the character preceding i.
        end = length
        if i > 0:
            end = i
            while end < length and string[i - 1] != string[end]:
                end += 1
        substring = string[i:end]

        # Set up all required data structures for this start position.
        occurlist = compute_occurlist(substring)
        ranks = compute_ranks(occurlist, alphabet)
        rank_int = compute_rank_intervals(string, ranks)
        rank_succ = compute_rank_successors(string, ranks)
        rank_table, rank_lookup = build_rank_table_and_lookup(string, ranks)
        rmqtable = rangemaxq.rmq_pre(rank_table)

        # Paths of rank 0, in the order the lookup lists them. Each path is a
        # tuple (last, (left, right)): `last` is the final position on the
        # path, and left/right bound the minimal interval containing it.
        if 0 in rank_lookup:
            paths = [(pos, (pos, pos + 1)) for pos in rank_lookup[0]]
        else:
            paths = []

        while paths:
            # ----------------------------------------------------------------
            # Report intervals for the paths of the current level.
            # ----------------------------------------------------------------
            # Index of the first path whose bounds fit inside the rank
            # interval of its last position (len(paths) if there is none).
            start = next(
                (idx for idx, (last, bounds) in enumerate(paths)
                 if fits_inside(bounds, rank_int[last])),
                len(paths))

            if start < len(paths):
                first_int = rank_int[paths[start][0]]
                if first_int[0] >= i:
                    intervals.append(first_int)
                    previous_int = first_int
                    # Report every further fitting path, skipping runs that
                    # repeat the interval reported just before.
                    for last, bounds in paths[start:]:
                        current_int = rank_int[last]
                        if (fits_inside(bounds, current_int)
                                and current_int != previous_int):
                            intervals.append(current_int)
                            previous_int = current_int

            # ----------------------------------------------------------------
            # Compute the next level.
            # ----------------------------------------------------------------
            # Consecutive paths sharing a successor form a batch. Only the
            # path nearest (by rank distance) to that successor survives;
            # paths without a successor are always dropped.
            batch_succ = None
            batch_start = 0
            nearest = None
            best_dist = float('inf')
            doomed = []

            for idx, (last, _bounds) in enumerate(paths):
                successor = rank_succ[last]
                if successor is None:
                    dist = float('inf')
                else:
                    dist = compute_rank_distance(last, successor, rmqtable,
                                                 rank_table)

                if batch_succ != successor:
                    # The previous batch ended: drop all but its nearest.
                    doomed.extend(
                        member for member in range(batch_start, idx)
                        if rank_succ[paths[member][0]] is None
                        or member != nearest)
                    # Open a new batch at the current path.
                    batch_succ = successor
                    batch_start = nearest = idx
                    best_dist = dist
                elif dist < best_dist:
                    # Still inside the batch: remember the closer path.
                    best_dist = dist
                    nearest = idx

            # Close the final batch the same way.
            doomed.extend(
                member for member in range(batch_start, len(paths))
                if rank_succ[paths[member][0]] is None or member != nearest)

            # Remove from the back so earlier indices stay valid.
            for member in reversed(doomed):
                del paths[member]

            # Extend each surviving path by its successor and widen its
            # bounds to include the new position.
            extended = []
            for last, (left, right) in paths:
                successor = rank_succ[last]
                extended.append(
                    (successor,
                     (min(left, successor), max(right, successor + 1))))
            paths = extended

    return intervals