def highlight(self, line, color):
    """
    Return a copy of `line` in which every match of the search pattern
    is wrapped in terminal colour escape sequences.

    The pattern is taken from self.pattern (passed through
    normalizer.normalize_pattern) and is compiled case-insensitively
    when self.icase is true.  `color` is handed to self.highlightcolor,
    whose result is used as the colour digit of the escape sequence.
    Text outside the matches is copied through unchanged.
    """
    import re

    normalized = normalizer.normalize_pattern(self.pattern)
    regex = re.compile(normalized, re.I if self.icase else 0)

    pieces = []
    cursor = 0  # index in `line` just past the previous match
    for found in regex.finditer(line):
        # highlight for bash
        # a leading 3 selects the forecolor, a leading 4 the backcolor
        # black:0,red:1,green:2,orange:3,blue:4,purple:5,bluegreen:6,white:7+
        opener = '\x1b[0;3{0}m'.format(self.highlightcolor(color))
        # plain text between the previous match and this one
        pieces.append(line[cursor:found.start()])
        # the match itself, followed by the reset sequence
        pieces.append(opener + found.group(0) + '\x1b[0m')
        cursor = found.end()
    # whatever follows the last match (the whole line if nothing matched)
    pieces.append(line[cursor:])
    return ''.join(pieces)
def domatch(self):
    """
    Search every line of the pdf's txt content for the pattern and
    collect the matches.

    Each entry of self.lines is indexed as (pageno, lineno, content).
    The pattern comes from self.pattern (passed through
    normalizer.normalize_pattern) and is compiled case-insensitively
    when self.icase is true.

    For every match a (pageno, lineno, location, context) tuple is built:

    * context is a slice of the line content around the match.  When
      self.context > 0 the slice runs from
      self.startindex(match.start(), self.context) to
      match.end() + self.context; otherwise its bounds come from
      self.startposition / self.endposition.
    * location is self.matchlocation(lineno, toc) when self.location
      is set, otherwise the string 'All'.

    Returns a list holding one inner list per line that has at least
    one match; each inner list holds one tuple per match on that line:

        [[(1, 1, location1, ...match...)],
         [(1, 20, location2, ...match...), (1, 20, location2, ...match...)]]

    The total number of matches is stored in self.count.
    """
    import re

    pattern = normalizer.normalize_pattern(self.pattern)
    pattern = re.compile(pattern, re.I if self.icase else 0)

    # Filtered table of contents, built on first use only.  Nothing in
    # this method changes its inputs (self.file, self.dictionary,
    # self.lines), so it is not rebuilt for every match.
    # NOTE(review): assumes self.matchlocation does not modify the
    # mapping it is given -- confirm.
    toc_dictionary = None

    count = 0
    results = []
    for line in self.lines:
        # line: (pageno, lineno, content)
        pageno = line[0]
        lineno = line[1]
        linecontent = line[2]

        res = []
        for match in pattern.finditer(linecontent):
            count += 1

            # context
            if self.context > 0:
                s = self.startindex(match.start(), self.context)
                # an end index past the end of the line is clipped by slicing
                e = match.end() + self.context
            else:
                s = self.startposition(match.start(), linecontent)
                e = self.endposition(match.end(), linecontent)

            # location
            if self.location:
                if toc_dictionary is None:
                    if self.dictionary:
                        toc = TOC(self.file, path=self.dictionary)
                    else:
                        toc = TOC(self.file)
                    toc_dictionary = toc.gettoc_filter_by_dictionary(self.lines)
                location = self.matchlocation(lineno, toc_dictionary)
            else:
                location = 'All'

            res.append((pageno, lineno, location, linecontent[s:e]))

        # only lines with at least one match contribute an entry
        if res:
            results.append(res)

    self.count = count
    return results