Example #1
0
 def get_snippet(self,
                 hit,
                 normal=lambda x: x,
                 highlight=lambda x: x,
                 maxsents=3,
                 maxchars=100,
                 maxlr=20,
                 default_snippet_sentid=0):
     """Build the (loc, mtime, title, snippet) tuple for a search hit.

     hit is an (idx, docid) pair.  The snippet is assembled from the
     sentence ids stored in self.snippets for that hit; when nothing is
     stored, default_snippet_sentid is the only starting sentence.
     normal() is applied to plain text and highlight() to the pieces
     that self.matched_range() flags with a true state.

     maxsents and maxchars bound the text gathered per starting
     sentence, maxlr bounds the context kept to the left and right of
     the highlighted pieces, and the loop stops once fewer than maxlr
     characters of the maxchars budget remain.

     A KeyError raised while looking up the document info propagates to
     the caller; a KeyError while fetching sentence 0 makes title None.
     """
     (idx, docid) = hit
     # No handler on purpose: a KeyError from this lookup goes straight
     # to the caller.
     (loc, mtime) = idx_docid2info(idx, docid)
     try:
         title = idx_sent(idx, docid, 0)
     except KeyError:
         title = None
     # self.snippets is expected to have been filled in by an earlier
     # iteration over the results; otherwise fall back to the default
     # sentence.
     starts = sorted(self.snippets.get(hit, [default_snippet_sentid]))
     snippet = u''
     previous = None
     for sentid in starts:
         if previous == sentid:
             # Same starting sentence as the one just emitted.
             continue
         # Gather sentences from this position, separated by spaces,
         # until maxsents sentences or maxchars characters are reached.
         text = ''
         try:
             for (count, sent) in enumerate(idx_sents(idx, docid, sentid), 1):
                 text = text + ' ' + sent if text else text + sent
                 if count >= maxsents or len(text) >= maxchars:
                     break
         except KeyError:
             # Keep whatever was gathered before the lookup failed.
             pass
         pieces = self.matched_range(text)
         if len(pieces) != 1:
             # There are pieces to highlight.
             assert 3 <= len(pieces)
             # Leading context: only its last maxlr characters are kept.
             (is_match, lead) = pieces[0]
             if not is_match:
                 snippet += u'... ' + normal(lead[-maxlr:])
             for (is_match, chunk) in pieces[1:-1]:
                 if not chunk:
                     continue
                 render = highlight if is_match else normal
                 snippet += render(chunk)
             # Trailing context: only its first maxlr characters are kept.
             (is_match, tail) = pieces[-1]
             if not is_match:
                 snippet += normal(tail[:maxlr]) + u'...'
         else:
             # A single piece means there is nothing to highlight.
             snippet += normal(text[:maxchars]) + u'...'
         # Stop once fewer than maxlr characters of the budget remain.
         if len(snippet) + maxlr > maxchars:
             break
         previous = sentid
     return (loc, mtime, title, snippet)
Example #2
0
 def index_lastloc(self):
   """Return the last location recorded in the first index.

   Only the first (_, idx) pair yielded by self.iteridxs() is used: the
   result is the first element of idx_docid2info() for document id
   ndocs-1 of that index.  Returns None when iteridxs() yields nothing.
   """
   for (_, newest) in self.iteridxs():
     # The first index is taken to be the newest one, so the answer
     # comes from it alone and later indexes are never consulted.
     (count, _) = idx_info(newest)
     (location, _) = idx_docid2info(newest, count-1)
     return location
   return None
Example #3
0
 def index_lastloc(self):
     """Return the last location recorded in the first index, or None.

     Takes the first (_, idx) pair from self.iteridxs(), reads the
     document count with idx_info(), and returns the first element of
     idx_docid2info() for document id ndocs - 1.  When iteridxs()
     yields nothing the result is None.
     """
     entries = iter(self.iteridxs())
     try:
         first = next(entries)
     except StopIteration:
         # There are no indexes at all.
         return None
     # The first index is taken to be the newest, so the remaining
     # entries are never looked at.
     (_, idx) = first
     (ndocs, _) = idx_info(idx)
     (lastloc, _) = idx_docid2info(idx, ndocs - 1)
     return lastloc
Example #4
0
 def get_snippet(self, hit,
                 normal=lambda x:x, highlight=lambda x:x,
                 maxsents=3, maxchars=100, maxlr=20,
                 default_snippet_sentid=0):
   """Return (loc, mtime, title, snippet) for the hit (idx, docid).

   The snippet is built from the sentence ids stored in self.snippets
   for this hit, or from default_snippet_sentid alone when none are
   stored.  Plain text goes through normal() and the pieces that
   self.matched_range() flags with a true state go through highlight().

   maxsents / maxchars bound the text gathered per starting sentence,
   maxlr bounds the context kept around the highlighted pieces, and
   snippet building stops when fewer than maxlr characters of the
   maxchars budget remain.

   A KeyError from the document info lookup is not handled here and
   reaches the caller; a KeyError for sentence 0 gives title None.
   """
   (idx, docid) = hit
   (loc, mtime) = idx_docid2info(idx, docid)
   try:
     title = idx_sent(idx, docid, 0)
   except KeyError:
     title = None

   def collect(sentid):
     # Join the sentences starting at sentid with single spaces until
     # maxsents sentences or maxchars characters have been gathered.
     # A KeyError ends the gathering and keeps what was collected.
     joined = ''
     taken = 0
     try:
       for sentence in idx_sents(idx, docid, sentid):
         if joined:
           joined = joined + ' '
         joined = joined + sentence
         taken += 1
         if taken >= maxsents or len(joined) >= maxchars:
           break
     except KeyError:
       pass
     return joined

   def extend(sofar, text):
     # Return sofar with the rendered form of text appended.
     ranges = self.matched_range(text)
     if len(ranges) == 1:
       # Single range: nothing to highlight, emit truncated plain text.
       sofar += normal(text[:maxchars]) + u'...'
       return sofar
     assert 3 <= len(ranges)
     # Left context, cut down to its last maxlr characters.
     (flag, before) = ranges[0]
     if not flag:
       sofar += u'... ' + normal(before[-maxlr:])
     # Everything between the two outer ranges; empty pieces are skipped.
     for (flag, piece) in ranges[1:-1]:
       if piece:
         if flag:
           sofar += highlight(piece)
         else:
           sofar += normal(piece)
     # Right context, cut down to its first maxlr characters.
     (flag, after) = ranges[-1]
     if not flag:
       sofar += normal(after[:maxlr]) + u'...'
     return sofar

   # self.snippets is expected to have been populated by an earlier
   # iteration over the results; if not, use the default sentence id.
   snippet = u''
   last = None
   for sentid in sorted(self.snippets.get(hit, [default_snippet_sentid])):
     if last == sentid:
       # Do not emit the same starting sentence twice in a row.
       continue
     snippet = extend(snippet, collect(sentid))
     remaining = maxchars - len(snippet)
     if remaining < maxlr:
       break
     last = sentid
   return (loc, mtime, title, snippet)
Example #5
0
 def start_iter(self):
   """Iterate over the search results, yielding each accepted hit.

   Every (idx, docid, contexts) triple from self.get_docids() is counted
   in self.narrowed and then judged.  A hit (idx, docid) is accepted
   when a document predicate returns a positive value, or when the
   sentence patterns in contexts match; accepted hits are appended to
   self.found_docs and yielded, and the matching sentence ids are stored
   in self.snippets[hit].

   Raises SearchTimeout when self.timeout is set and at least that much
   time has passed since work on the current document began.  The clock
   restarts for every document, and the measured span includes any time
   the generator spent suspended at the yield.
   """
   from time import time
   for (idx, docid, contexts) in self.get_docids():
     started = time() if self.timeout else 0
     hit = (idx, docid)
     # verdict < 0: rejected, verdict > 0: accepted,
     # verdict == 0: undecided, the sentences must be examined.
     verdict = 0
     if self.doc_preds:
       # The first predicate returning a true value decides.  A KeyError
       # from the lookup or from a predicate ends the predicate phase
       # and keeps the verdict reached so far.
       try:
         (loc, _) = idx_docid2info(idx, docid)
         for pred in self.doc_preds:
           verdict = pred(loc)
           if verdict:
             break
       except KeyError:
         pass
     self.narrowed += 1
     if verdict == 0:
       # contexts is a list of ([sentid, ...], regpat) pairs.  For each
       # pair the first sentence matching the pattern is collected.
       matched_ids = []
       rejected = False
       for (sentids, pat) in contexts:
         # The ids are said to be stored in descending order in the
         # index file; reverse them in place to scan in ascending order.
         sentids.reverse()
         found = False
         for sentid in sentids:
           try:
             sent = idx_sent(idx, docid, sentid)
           except KeyError:
             continue
           if not pat or pat.search(sent):
             matched_ids.append(sentid)
             found = True
             break
         # Unless the search is disjunctive, one pattern without a
         # matching sentence rejects the whole document.
         if not (found or self.disjunctive):
           rejected = True
           break
       if rejected:
         verdict = -1
       elif matched_ids:
         verdict = 1
         self.snippets[hit] = matched_ids
     if verdict > 0:
       self.found_docs.append(hit)
       yield hit
     # Abort if the specified time is passed.
     if self.timeout:
       if started + self.timeout <= time():
         raise SearchTimeout(self)
Example #6
0
 def start_iter(self):
     """Iterate over the search results, yielding each accepted hit.

     For every (idx, docid, contexts) triple from self.get_docids() the
     counter self.narrowed is incremented and the document is judged:
     first by self.doc_preds, then, if still undecided, by searching its
     sentences with the patterns in contexts.  Accepted hits
     (idx, docid) are appended to self.found_docs and yielded; the
     sentence ids that matched are stored in self.snippets[hit].

     Raises SearchTimeout when self.timeout is set and at least that
     much time has passed since the current document was started.  The
     clock is reset per document and keeps running while the generator
     is suspended at the yield.
     """
     from time import time

     def first_match(idx, docid, sentids, pat):
         # Return a one-element list with the first sentence id whose
         # text satisfies pat (any readable sentence when pat is
         # empty), or an empty list when none does.  The ids are said
         # to be stored in descending order in the index file, so they
         # are reversed in place to scan in ascending order.
         sentids.reverse()
         for sentid in sentids:
             try:
                 sent = idx_sent(idx, docid, sentid)
             except KeyError:
                 continue
             if not pat or pat.search(sent):
                 return [sentid]
         return []

     for (idx, docid, contexts) in self.get_docids():
         begun = 0
         if self.timeout:
             begun = time()
         # score < 0: rejected immediately.
         # score > 0: accepted immediately.
         # score == 0: undecided (further examination required).
         score = 0
         if self.doc_preds:
             # Apply the document predicates; the first true result
             # wins.  A KeyError from the lookup or from a predicate
             # stops this phase and keeps the score reached so far.
             try:
                 (loc, _) = idx_docid2info(idx, docid)
                 for pred in self.doc_preds:
                     score = pred(loc)
                     if score:
                         break
             except KeyError:
                 pass
         hit = (idx, docid)
         self.narrowed += 1
         if score == 0:
             # contexts is a list of ([sentid, ...], regpat) pairs.
             kept = []
             for (sentids, pat) in contexts:
                 picked = first_match(idx, docid, sentids, pat)
                 if picked:
                     kept.extend(picked)
                 elif not self.disjunctive:
                     # Unless the search is disjunctive, every pattern
                     # has to match.
                     score = -1
                     break
             else:
                 # Reached only when no pattern rejected the document.
                 if kept:
                     score = 1
                     self.snippets[hit] = kept
         if score > 0:
             self.found_docs.append(hit)
             yield hit
         # Abort if the specified time is passed.
         if self.timeout:
             if begun + self.timeout <= time():
                 raise SearchTimeout(self)