Exemple #1
0
    def transform(self, ispdf=False):
        """Normalize the title."""
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', ']')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}}')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\''')
        # avoid multiple | being interpreted as a template parameter
        self.title = self.title.replace('|', '|')

        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
    def transform(self, ispdf=False):
        """Normalize the title"""
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', ']')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}}')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\''')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
    def transform(self, ispdf=False):
        """Normalize the title."""
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r"-+", "-", self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r"(?u)\s", " ", self.title)
        self.title = re.sub(r"[\n\r\t]", " ", self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace("]", "]")
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace("}}", "}}")
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "''")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())