Ejemplo n.º 1
def compute_rank_successors(string, ranks):
  """Return, for every position of `string`, its successor position.

  For a position `pos` listed under rank r, two candidates are taken from
  the positions listed under rank r + 1: the last one that lies before
  `pos` and the first one that lies at or after `pos` (when only one side
  exists, both candidates are the same position). The candidate with the
  smaller value of compute_rank_distance (called with absolute = False) is
  stored as the successor; on a tie the left candidate wins.

  The scan assumes that the position lists in rank_lookup are sorted in
  increasing order -- TODO confirm against build_rank_table_and_lookup.

  Args:
    string: the input sequence; only its length is used directly here.
    ranks: rank information, forwarded to build_rank_table_and_lookup.

  Returns:
    A list with len(string) entries. succ[pos] is the chosen successor
    position, or None for positions that were never assigned one (rank
    float('inf'), or no entry for the next rank in rank_lookup).

  Raises:
    RuntimeError: if rank_lookup has an entry for the next rank that
      contains no positions.
  """
  # Build rank table and rank lookup.
  (rank_table, rank_lookup) = build_rank_table_and_lookup(string, ranks)

  # Build the successors table.
  succ = [None] * len(string)
  rmqtable = rangemaxq.rmq_pre(rank_table)

  for rank in rank_lookup:
    # float('inf') + 1 == float('inf'), so the membership test alone would
    # not skip the infinite rank; it has to be excluded explicitly.
    if (rank + 1) not in rank_lookup or rank == float('inf'): continue
    cur_rank = rank_lookup[rank]
    next_rank = rank_lookup[rank + 1]

    if len(next_rank) < 1:
      # The input string must be complete. The empty list is the one for
      # rank + 1, so that is the rank reported in the message.
      raise RuntimeError('Thre cannot be 0 chars with rank %d.' % (rank + 1))

    # j1 / j2 index the left and right candidate in next_rank. They only
    # ever move forward, so they are kept across the positions of cur_rank.
    j1 = j2 = 0
    for pos in cur_rank:
      while j2 < len(next_rank) and pos > next_rank[j2]:
        j1 = j2
        j2 += 1

      # No position of the next rank at or after pos: fall back to the left
      # candidate on both sides.
      if j2 == len(next_rank): j2 = j1

      dist1 = compute_rank_distance(next_rank[j1], pos,
        rmqtable, rank_table, absolute = False)

      dist2 = compute_rank_distance(pos, next_rank[j2],
        rmqtable, rank_table, absolute = False)

      # Keep the nearer candidate. Without the else branch the right
      # candidate would unconditionally overwrite the left one.
      if dist1 <= dist2:
        succ[pos] = next_rank[j1]
      else:
        succ[pos] = next_rank[j2]

  return succ
Ejemplo n.º 2
def maximal_substrings(string):
  """Collect intervals of maximal substrings of `string`.

  For every start index i the function takes the substring that begins at
  i and ends just before the first position j >= i whose character equals
  string[i-1] (the whole string when i == 0, or up to the end when no such
  position exists). From that substring it derives ranks, rank intervals
  and rank successors through the helper functions, then follows "paths":
  each path starts at a position of rank 0 and is repeatedly extended to
  the rank successor of its last position. Whenever the bounds of a path
  fit inside the rank interval of its last position, and that interval
  does not start before i, the interval is reported.

  NOTE(review): the statements that fill `intervals`, `mark_delete` and
  that prune `path_list` were missing from the source this was restored
  from (three statements had no body at all). The pruning statements
  follow directly from the surrounding comments; the exact value that is
  appended to `intervals` is a reconstruction -- confirm against the
  original implementation.

  Args:
    string: the input sequence (indexable, sliceable, hashable items).

  Returns:
    The list of collected intervals.
  """
  alphabet = list(set(string))

  # Collects the intervals of maximal substrings.
  intervals = []

  # Tests whether interval1 is contained within interval2.
  def is_subset(interval1, interval2):
    return interval1[0] >= interval2[0] and interval1[1] <= interval2[1]

  for i in range(len(string)):
    # Determine the maximal substring starting at i.
    j = len(string)
    if i > 0:
      j = i
      while j < len(string) and string[i-1] != string[j]: j += 1
    substring = string[i:j]

    # Initialize all required data structures.
    occurlist = compute_occurlist(substring)
    ranks = compute_ranks(occurlist, alphabet)
    rank_int = compute_rank_intervals(string, ranks)
    rank_succ = compute_rank_successors(string, ranks)
    (rank_table, rank_lookup) = build_rank_table_and_lookup(string, ranks)
    rmqtable = rangemaxq.rmq_pre(rank_table)

    # Initialize the list with the paths of rank 0 in increasing order.
    # Each element is a tuple (position, (left, right)) where position is
    # the last position of the path, while left and right are the bounds of
    # the minimal interval that contains the path.
    path_list = []
    if 0 in rank_lookup:
      for pos in rank_lookup[0]:
        path_list.append((pos, (pos, pos + 1)))

    while len(path_list) > 0:
      # ------------------------------------------------------------------------
      # Test if some of the paths in the list can result in locations to output.
      # ------------------------------------------------------------------------

      # Find first path with bounds contained within the rank interval of its
      # last position.
      first_idx = 0
      while first_idx < len(path_list) and not is_subset(
          path_list[first_idx][1], rank_int[path_list[first_idx][0]]):
        first_idx += 1
      first = None if first_idx == len(path_list) else path_list[first_idx]

      if first is not None and rank_int[first[0]][0] >= i:
        # NOTE(review): reconstructed output step. Consecutive paths that
        # share a rank interval are reported once, which is why the first
        # hit is emitted here and the loop only emits on a change.
        prev_rank_int = rank_int[first[0]]
        intervals.append(prev_rank_int)
        for cur in path_list[first_idx:]:
          cur_rank_int = rank_int[cur[0]]
          if is_subset(cur[1], cur_rank_int) \
            and cur_rank_int != prev_rank_int:
            intervals.append(cur_rank_int)
            prev_rank_int = cur_rank_int

      # ------------------------------------------------------------------------
      # Compute the next level.
      # ------------------------------------------------------------------------

      # Build a list of paths that should be deleted, either because they don't
      # have a successor, or because their last position doesn't have the
      # smallest rank distance to the successor among all the last positions
      # that share the same successor.
      batch_succ = None
      batch_begin = 0
      nearest_in_batch = None
      min_dist = float('inf')
      mark_delete = []

      for k in range(len(path_list)):
        cur_path = path_list[k]
        cur_dist = float('inf') if rank_succ[cur_path[0]] is None \
          else compute_rank_distance(cur_path[0], rank_succ[cur_path[0]], rmqtable, rank_table)

        # If we've reached the end of a batch of paths with the same
        # successor, we take the one that's nearest to the successor and
        # discard the rest.
        if batch_succ != rank_succ[cur_path[0]]:

          # Go through the batch and mark for deletion all except the nearest.
          for m in range(batch_begin, k):
            if rank_succ[path_list[m][0]] is None or m != nearest_in_batch:
              mark_delete.append(m)

          # Begin a new batch.
          batch_succ = rank_succ[cur_path[0]]
          batch_begin = nearest_in_batch = k
          min_dist = cur_dist

        elif cur_dist < min_dist:
          # If we are inside a batch, update the minimum.
          min_dist = cur_dist
          nearest_in_batch = k

      # The last batch is still open when the list ends: mark for deletion
      # all of its paths except the nearest.
      for m in range(batch_begin, len(path_list)):
        if rank_succ[path_list[m][0]] is None or m != nearest_in_batch:
          mark_delete.append(m)

      # Delete all marked paths. mark_delete is in increasing order, so going
      # backwards keeps the indices that are still pending valid.
      for m in reversed(mark_delete):
        del path_list[m]

      # Extend all the remaining paths with their successors and update their
      # bounds. Every surviving path has a successor, because paths without
      # one were marked for deletion above.
      for k in range(len(path_list)):
        cur_path = path_list[k]
        succ = rank_succ[cur_path[0]]
        left_bound = min(cur_path[1][0], succ)
        right_bound = max(cur_path[1][1], succ + 1)
        path_list[k] = (succ, (left_bound, right_bound))

  return intervals