Example #1
    def parse_file(self, val):
        """Parse a parameter value into a (param, file) pair."""
        param, file = parse_param(val)  # helper defined elsewhere in the module
        if param:
            # validate the value against the site's file-link regex
            m = textlib._get_regexes(['file'], self.pwb_site)[0].match(file)
            file = m.group(1) if m else None
        if not file:
            print(f'Unparsable {val}')
        return param, file
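A minimal usage sketch for the regex lookup above, assuming a configured pywikibot environment with network access to resolve the file-namespace aliases; the site and sample wikitext are illustrative, not from the original:

import pywikibot
from pywikibot import textlib

site = pywikibot.Site('en', 'wikipedia')
file_regex = textlib._get_regexes(['file'], site)[0]
m = file_regex.match('[[File:Example.jpg|thumb|A caption]]')
print(m.group(1) if m else None)  # first capture group, as used in parse_file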
Example #2
    def removeEmptySections(self, text):
        """Cleanup empty sections."""
        # comments, categories, and interwikis
        skippings = ['comment', 'category', 'interwiki']
        skip_regexes = _get_regexes(skippings, self.site)
        # strip only interlanguage links ([[en:Foo]]), not inline
        # interwiki links ([[:en:Foo]]): drop the optional colon from
        # the interwiki pattern (index 2 in skippings)
        skip_regexes[2] = re.compile(
            skip_regexes[2].pattern.replace(':?', ''))
        # site defined templates
        skip_templates = {
            'cs': ('Pahýl[ _]část',),  # stub section
        }
        if self.site.code in skip_templates:
            for template in skip_templates[self.site.code]:
                skip_regexes.append(
                    re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
        # empty lists
        skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

        # get stripped sections
        stripped_text = text
        for reg in skip_regexes:
            stripped_text = reg.sub(r'', stripped_text)
        strip_sections = textlib.extract_sections(
            stripped_text, self.site)[1]

        # get proper sections
        header, sections, footer = textlib.extract_sections(text, self.site)

        # iterate stripped sections and create a new page body
        new_body = []
        for i, strip_section in enumerate(strip_sections):
            current_heading = sections[i][0]
            try:
                next_heading = sections[i + 1][0]
            except IndexError:
                next_heading = ''
            current_dep = (len(current_heading)
                           - len(current_heading.lstrip('=')))
            next_dep = len(next_heading) - len(next_heading.lstrip('='))
            if strip_section[1].strip() or current_dep < next_dep:
                new_body.extend(sections[i])
        return header + ''.join(new_body) + footer
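The final loop hinges on heading depth: an empty section is kept when the next heading is deeper, because its subsections still carry content. A self-contained sketch of that comparison (hypothetical headings, standard library only):

def heading_depth(heading):
    """Count leading '=' signs: '== Foo ==' -> 2, '=== Bar ===' -> 3."""
    return len(heading) - len(heading.lstrip('='))

current_heading = '== History =='
next_heading = '=== Early years ==='
# depth 2 < depth 3: the empty '== History ==' is kept, its subsection has content
print(heading_depth(current_heading) < heading_depth(next_heading))  # True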
Example #3
    def removeEmptySections(self, text):
        """Cleanup empty sections."""
        # userspace contains article stubs without nobots/in use templates
        if self.namespace == 2:
            return text

        skippings = ['comment', 'category']
        skip_regexes = _get_regexes(skippings, self.site)
        # site defined templates
        skip_templates = {
            'cs': ('Pahýl[ _]část',),  # stub section
        }
        if self.site.code in skip_templates:
            for template in skip_templates[self.site.code]:
                skip_regexes.append(
                    re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
        # empty lists
        skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

        # get stripped sections
        stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
        for reg in skip_regexes:
            stripped_text = reg.sub(r'', stripped_text)
        strip_sections = textlib.extract_sections(
            stripped_text, self.site)[1]

        # get proper sections
        header, sections, footer = textlib.extract_sections(text, self.site)

        # iterate stripped sections and create a new page body
        new_body = []
        for i, strip_section in enumerate(strip_sections):
            current_heading = sections[i][0]
            try:
                next_heading = sections[i + 1][0]
            except IndexError:
                next_heading = ''
            current_dep = (len(current_heading)
                           - len(current_heading.lstrip('=')))
            next_dep = len(next_heading) - len(next_heading.lstrip('='))
            if strip_section[1].strip() or current_dep < next_dep:
                new_body.extend(sections[i])
        return header + ''.join(new_body) + footer
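The site-independent skip patterns can be exercised in isolation. A small sketch with made-up wikitext, showing how a stub template and an empty list item are stripped so the remaining body is whitespace-only and fails the strip_section[1].strip() test:

import re

skip_regexes = [
    re.compile(r'\{\{\s*%s\s*\}\}' % 'Pahýl[ _]část', re.I),  # cs stub-section template
    re.compile(r'(?m)^[\*#] *$'),  # empty list items
]
text = '== Odkazy ==\n{{Pahýl část}}\n* \n'
for reg in skip_regexes:
    text = reg.sub('', text)
print(repr(text))  # '== Odkazy ==\n\n\n' -- only blank lines left in the body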
Example #4
    def pattern(self):
        """Return the compiled category regex for the current site."""
        return textlib._get_regexes(['category'], self.site).pop()
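A hedged usage sketch for pattern(), assuming a configured pywikibot environment with network access; the site and sample wikitext are illustrative:

import pywikibot
from pywikibot import textlib

site = pywikibot.Site('en', 'wikipedia')
category_regex = textlib._get_regexes(['category'], site).pop()
print(bool(category_regex.search('[[Category:Examples]]')))  # True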