Ejemplo n.º 1
0
 def find_best(self, look_for, start=0, end=-1):
     """Find the best alignment of ``look_for`` between ``start`` and ``end``.

     When the searched range is shorter than twice the query length the
     whole range is handed straight to ``self.sw_align``.  Otherwise the
     range is pre-filtered: each 3-gram of the space-padded query is looked
     up in ``self.ngrams``, the hits are tallied per window of
     ``len(look_for)`` positions, and only the most frequently hit windows
     are passed to ``self.sw_align``.

     Args:
         look_for: The string to search for.
         start: Lower bound of the search range.
         end: Upper bound of the search range; a negative value means
             ``len(self.text)``.

     Returns:
         In the short-range case, the result of ``self.sw_align`` as is.
         Otherwise the ``self.sw_align`` result whose element at index 2 is
         largest (presumably the alignment score -- confirm against
         ``sw_align``), or ``(-1, -1, 0, None)`` if no candidate has a
         value greater than 0 at index 2.
     """
     if end < 0:
         end = len(self.text)
     pattern_len = len(look_for)

     # Range is small relative to the query: align against all of it.
     if end - start < 2 * pattern_len:
         return self.sw_align(look_for, start, end)

     # Tally, per window of ``pattern_len`` positions, how many indexed
     # occurrences of the query's 3-grams fall inside [start, end].
     hits_per_window = {}
     for trigram in ngrams(" " + look_for + " ", 3):
         for position in self.ngrams.get(trigram, ()):
             if start <= position <= end:
                 bucket = position // pattern_len
                 hits_per_window[bucket] = hits_per_window.get(bucket, 0) + 1

     # Most-hit windows first; the sort is stable, so ties keep the order
     # in which the windows were first seen.
     ranked = sorted(hits_per_window, key=hits_per_window.get, reverse=True)

     best = (-1, -1, 0, None)
     # Tiny seed so the first candidate always passes the ratio check.
     previous_hits = 0.1
     for bucket in ranked[:self.max_candidates]:
         hits = hits_per_window[bucket]
         # Stop once the hit count drops too far below the previous window.
         if hits / previous_hits < self.candidate_threshold:
             break
         previous_hits = hits
         # Align over the window plus one neighbouring window on each side,
         # clamped to the requested range.
         lo = max(start, int((bucket - 1) * pattern_len))
         hi = min(end, int((bucket + 2) * pattern_len))
         candidate = self.sw_align(look_for, lo, hi)
         if candidate[2] > best[2]:
             best = candidate
     return best
Ejemplo n.º 2
0
 def __init__(
     self,
     text,
     max_candidates=10,
     candidate_threshold=0.92,
     match_score=100,
     mismatch_score=-100,
     gap_score=-100,
     char_similarities=None,
 ):
     """Store the search configuration and index the 3-grams of ``text``.

     Args:
         text: The text that will be searched; stored as ``self.text``.
         max_candidates: Stored as ``self.max_candidates``.
         candidate_threshold: Stored as ``self.candidate_threshold``.
         match_score: Stored as ``self.match_score``.
         mismatch_score: Stored as ``self.mismatch_score``.
         gap_score: Stored as ``self.gap_score``.
         char_similarities: Must be ``None``; custom character
             similarities are not supported.

     Raises:
         AssertionError: If ``char_similarities`` is not ``None``.
     """
     self.text = text
     self.max_candidates = max_candidates
     self.candidate_threshold = candidate_threshold
     self.match_score = match_score
     self.mismatch_score = mismatch_score
     self.gap_score = gap_score
     self.char_similarities = char_similarities
     # Explicit raise instead of ``assert``: assert statements are removed
     # under ``python -O``, which would let unsupported input through
     # silently.  AssertionError is kept so existing callers see the same
     # exception type and message.
     if self.char_similarities is not None:
         raise AssertionError(
             "Custom character similarities not supported at this time")
     # Inverted index: n-gram -> list of positions at which it occurs in
     # the sequence produced by ``ngrams`` for the space-padded text.
     self.ngrams = {}
     for position, ngram in enumerate(ngrams(" " + text + " ", 3)):
         self.ngrams.setdefault(ngram, []).append(position)
Ejemplo n.º 3
0
 def __init__(self,
              text,
              max_candidates=10,
              candidate_threshold=0.92,
              match_score=100,
              mismatch_score=-100,
              gap_score=-100,
              char_similarities=None):
     """Store the search configuration and index the 3-grams of ``text``.

     Every argument is stored unchanged on the instance under the same
     name.  ``self.ngrams`` is then filled with a mapping from each n-gram
     returned by ``ngrams(' ' + text + ' ', 3)`` to the list of positions
     at which it appears in that sequence.
     """
     # Text and candidate-selection settings.
     self.text = text
     self.max_candidates = max_candidates
     self.candidate_threshold = candidate_threshold

     # Scoring settings.
     self.match_score = match_score
     self.mismatch_score = mismatch_score
     self.gap_score = gap_score
     self.char_similarities = char_similarities

     # Inverted index over the space-padded text: n-gram -> positions.
     index = self.ngrams = {}
     padded = ' ' + text + ' '
     for position, trigram in enumerate(ngrams(padded, 3)):
         index.setdefault(trigram, []).append(position)