def setup(self):
    """Compile the URL-matching regex and store it on ``grab`` as ``pattern``.

    The pattern is built from a leading boundary, the expression returned by
    ``url_regex()`` (captured as group 1), and a trailing boundary. It is
    compiled case-insensitively with ``re.DOTALL``.
    """
    url_pattern = (
        # Match a boundary, but not on an e-mail address
        r'(?:[^@./]\b(?!\.)|\A)('
        + url_regex()
        # End boundary
        + r')[\[>)\]"\'.,;:]*(?:\s|\Z)'
    )

    # Attributes cannot be assigned on a bound method object, so the pattern
    # is stored on the underlying function. ``__func__`` is used instead of
    # ``im_func`` because it exists on both Python 2.6+ and Python 3, whereas
    # ``im_func`` is Python 2 only.
    self.grab.__func__.pattern = re.compile(url_pattern, re.I | re.DOTALL)
def _match_sub_selectors(regex):
    """Expand selector placeholders in *regex* into a fully anchored pattern.

    Placeholders have the form ``{selector}`` or ``{name:selector}``, where
    ``selector`` is one of the keys of ``selector_patterns`` below. An unnamed
    placeholder becomes a plain capturing group; a named one becomes a named
    group called ``<name>__<n>_``, where ``n`` counts how many times that name
    has been used so far in this template.

    Every literal space in the template is replaced by ``(?:\\s+)``, and the
    result is anchored with ``^`` and ``$`` when those are not already there.

    Returns the expanded regular expression as a string.
    """
    selector_patterns = {
        'alpha': r'[a-zA-Z]+',
        'any': r'.+',
        'chunk': r'\S+',
        'digits': r'\d+',
        'number': r'\d*\.?\d+',
        'url': url_regex(),
        'word': r'\w+',
    }

    # Done before expanding selectors so that spaces inside the selector
    # patterns themselves (e.g. from url_regex()) are left untouched.
    regex = regex.replace(' ', r'(?:\s+)')

    name_count = defaultdict(int)

    def selector_to_re(match):
        name = match.group(1)
        pattern = match.group(2)

        if name is None:
            return '(%s)' % selector_patterns[pattern]

        # Prevent conflicts when reusing a name
        name_count[name] += 1
        name = '%s__%d_' % (name, name_count[name])

        return '(?P<%s>%s)' % (name, selector_patterns[pattern])

    regex = re.sub(r'{(?:(\w+):)?(%s)}' % '|'.join(selector_patterns.keys()),
                   selector_to_re, regex)

    if not regex.startswith('^'):
        regex = '^' + regex

    # A trailing '$' is only an anchor when it is not escaped: an odd number
    # of backslashes directly before it makes it a literal dollar sign, in
    # which case the pattern still needs its end anchor.
    if regex.endswith('$'):
        before_dollar = regex[:-1]
        backslashes = len(before_dollar) - len(before_dollar.rstrip('\\'))
        end_anchored = backslashes % 2 == 0
    else:
        end_anchored = False

    if not end_anchored:
        regex = regex + '$'

    return regex
def _match_sub_selectors(regex):
    """Turn a selector template into an anchored regular expression string.

    ``{selector}`` and ``{name:selector}`` placeholders are replaced by the
    matching entry of ``selector_patterns``: unnamed placeholders become plain
    capturing groups, named ones become named groups ``<name>__<n>_`` (``n``
    is a per-name counter, so the same name can appear more than once).
    Literal spaces are replaced by ``(?:\\s+)``, and ``^`` / ``$`` anchors are
    added when missing.
    """
    selector_patterns = {
        'alpha'  : r'[a-zA-Z]+',
        'any'    : r'.+',
        'chunk'  : r'\S+',
        'digits' : r'\d+',
        'number' : r'\d*\.?\d+',
        'url'    : url_regex(),
        'word'   : r'\w+',
    }

    # Replace spaces first, so only spaces written in the template are
    # affected and not any that appear inside the selector patterns.
    regex = regex.replace(' ', r'(?:\s+)')

    name_count = defaultdict(int)

    def selector_to_re(match):
        name = match.group(1)
        pattern = match.group(2)

        if name is None:
            return '(%s)' % selector_patterns[pattern]

        # Prevent conflicts when reusing a name
        name_count[name] += 1
        name = '%s__%d_' % (name, name_count[name])

        return '(?P<%s>%s)' % (name, selector_patterns[pattern])

    placeholder_re = r'{(?:(\w+):)?(%s)}' % '|'.join(selector_patterns.keys())
    regex = re.sub(placeholder_re, selector_to_re, regex)

    if not regex.startswith('^'):
        regex = '^' + regex

    # Only treat a final '$' as an anchor when it is unescaped. With an odd
    # number of backslashes in front of it the '$' is a literal character and
    # the end anchor must still be appended.
    needs_end_anchor = True
    if regex.endswith('$'):
        prefix = regex[:-1]
        escape_run = len(prefix) - len(prefix.rstrip('\\'))
        needs_end_anchor = escape_run % 2 == 1

    if needs_end_anchor:
        regex = regex + '$'

    return regex
def setup(self):
    """Build the URL regex and attach the compiled pattern to ``grab``.

    The expression returned by ``url_regex()`` is wrapped in a capturing
    group between a leading and a trailing boundary, then compiled with
    ``re.I | re.DOTALL`` and stored as the ``pattern`` attribute of the
    function behind ``self.grab``.
    """
    # Match a boundary, but not on an e-mail address
    leading_boundary = r'(?:[^@./]\b(?!\.)|\A)('
    # End boundary
    trailing_boundary = r')[\[>)\]"\'.,;:]*(?:\s|\Z)'

    compiled = re.compile(
        leading_boundary + url_regex() + trailing_boundary,
        re.I | re.DOTALL,
    )

    # The attribute has to live on the plain function, since bound method
    # objects do not accept attribute assignment. ``__func__`` is available
    # on Python 2.6+ and Python 3; the original ``im_func`` spelling only
    # exists on Python 2.
    self.grab.__func__.pattern = compiled