def parse_file(self, val):
    """Parse *val* into a (param, file) pair.

    The file part is run through the site's file-link regex; if it
    cannot be parsed, file is set to None and a message is printed.
    """
    param, file = parse_param(val)
    if param:
        file_regex = textlib._get_regexes(['file'], self.pwb_site)[0]
        found = file_regex.match(file)
        file = found[1] if found else None
    if not file:
        print(f'Unparsable {val}')
    return param, file
def removeEmptySections(self, text): """Cleanup empty sections.""" # comments, categories, and interwikis skippings = ['comment', 'category', 'interwiki'] skip_regexes = _get_regexes(skippings, self.site) # we want only interwikis, not interlanguage links skip_regexes[1] = re.compile( skip_regexes[1].pattern.replace(':?', '')) # site defined templates skip_templates = { 'cs': ('Pahýl[ _]část',), # stub section } if self.site.code in skip_templates: for template in skip_templates[self.site.code]: skip_regexes.append( re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I)) # empty lists skip_regexes.append(re.compile(r'(?m)^[\*#] *$')) # get stripped sections stripped_text = text for reg in skip_regexes: stripped_text = reg.sub(r'', stripped_text) strip_sections = textlib.extract_sections( stripped_text, self.site)[1] # get proper sections header, sections, footer = textlib.extract_sections(text, self.site) # iterate stripped sections and create a new page body new_body = [] for i, strip_section in enumerate(strip_sections): current_heading = sections[i][0] try: next_heading = sections[i + 1][0] except IndexError: next_heading = '' current_dep = (len(current_heading) - len(current_heading.lstrip('='))) next_dep = len(next_heading) - len(next_heading.lstrip('=')) if strip_section[1].strip() or current_dep < next_dep: new_body.extend(sections[i]) return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections.

    Sections whose body consists only of comments, category links,
    site-specific stub templates or bare list bullets are removed,
    unless a deeper subsection follows them. Userspace pages are
    returned untouched.
    """
    # userspace contains article stubs without nobots/in use templates
    if self.namespace == 2:
        return text

    # regexes matching content that does not make a section non-empty
    cleanup_patterns = _get_regexes(['comment', 'category'], self.site)
    # site defined templates
    stub_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    for tmpl in stub_templates.get(self.site.code, ()):
        cleanup_patterns.append(
            re.compile(r'\{\{\s*%s\s*\}\}' % tmpl, re.I))
    # empty lists
    cleanup_patterns.append(re.compile(r'(?m)^[\*#] *$'))

    # strip ignorable content, then split the result into sections
    stripped = textlib.removeLanguageLinks(text, self.site, '\n')
    for pattern in cleanup_patterns:
        stripped = pattern.sub(r'', stripped)
    stripped_sections = textlib.extract_sections(stripped, self.site)[1]

    # sections of the untouched text, used to rebuild the page body
    header, sections, footer = textlib.extract_sections(text, self.site)

    def heading_depth(heading):
        # depth == number of leading '=' characters
        return len(heading) - len(heading.lstrip('='))

    kept = []
    for index, stripped_section in enumerate(stripped_sections):
        depth = heading_depth(sections[index][0])
        if index + 1 < len(sections):
            next_depth = heading_depth(sections[index + 1][0])
        else:
            # last section: treat the following heading as top level
            next_depth = 0
        # keep sections with real content, or those owning subsections
        if stripped_section[1].strip() or depth < next_depth:
            kept.extend(sections[index])
    return header + ''.join(kept) + footer
def pattern(self):
    """Return the compiled category regex for the current site."""
    # _get_regexes returns a fresh one-element list here; take its
    # last entry (same element .pop() would have returned).
    return textlib._get_regexes(['category'], self.site)[-1]