Example #1
    def transform(self, ispdf=False):
        """Normalize the title"""
        #convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        #remove formatting, i.e. long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        #remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        #remove extra whitespaces
        #remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        #avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        #avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        #prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
Example #2
    def transform(self, ispdf=False):
        """Normalize the title"""
        #convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        #remove formatting, i.e. long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        #remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        #remove extra whitespaces
        #remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        #avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        #avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        #prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
Example #3
    def transform(self, ispdf=False):
        """Normalize the title"""
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r"-+", "-", self.title)
        # remove formatting, i.e. long useless strings
        self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r"(?u)\s", " ", self.title)
        self.title = re.sub(r"[\n\r\t]", " ", self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace("]", "]")
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace("}}", "}}")
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "''")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
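The escaping at the end of transform only matters once the title is dropped into wikitext, where ], }} and '' carry markup meaning. As a rough standalone sketch of the same pipeline (assuming a plain str input in Python 3, no pywikibot dependency, and a hypothetical normalize_title helper), the regex steps can be tested in isolation:

    import re

    def normalize_title(title):
        # collapse runs of dashes and strip long decorative runs of .+-=
        title = re.sub(r'-+', '-', title)
        title = re.sub(r'[.+\-=]{4,}', ' ', title)
        # map newlines, tabs and Unicode spaces to ordinary spaces
        title = re.sub(r'\s', ' ', title, flags=re.UNICODE)
        # trim leading/trailing punctuation and collapse repeated spaces
        title = re.sub(r' +', ' ', title.strip('=.;,-+_ '))
        # escape characters that would break the surrounding wikitext
        title = title.replace(']', '&#93;')
        title = title.replace('}}', '}&#125;')
        title = title.replace("''", "'&#39;")
        return title

    print(normalize_title("Some   title --- with [brackets] and {{braces}}\n"))

Skipping html2unicode/unicode2html keeps the sketch dependency-free; the real method also calls avoid_uppercase(), which is not reproduced here.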
Example #4
 else:
     donow = todo
     # If there was more to do, the 'if len(todo)<61' part would have extended
     # todo beyond this size.
     cont = False
 try:
     wikipedia.getall(mysite, donow)
 except wikipedia.SaxError:
     # Ignore this error, and get the pages the traditional way.
     pass
 checked += len(donow)
 for pl in donow:
     R = re.compile(r"http://[^\s}<\]]+[^\s.,:;)\?!\]}<]")
     try:
         for url in R.findall(pl.get()):
             url = wikipedia.unicode2html(url, "ascii")
             try:
                 error = URLerrorFinder().open(url)
             except IOError:
                 error = -1
             if error in allowederrorcodes:
                 working += 1
             else:
                 nonworking += 1
                 print
                 wikipedia.output(u'Page "%s" links to:' % pl.title())
                 wikipedia.output(url)
                 wikipedia.output(u"Which gave error: %s %s" % (error, errorname(error)))
     # If anything is wrong with the Wikipedia page, just ignore
     except (wikipedia.NoPage, wikipedia.IsRedirectPage, wikipedia.LockedPage):
         pass
Example #5
 else:
     donow = todo
     # If there was more to do, the 'if len(todo)<61' part would have extended
     # todo beyond this size.
     cont = False
 try:
     wikipedia.getall(mysite, donow)
 except wikipedia.SaxError:
     # Ignore this error, and get the pages the traditional way.
     pass
 checked += len(donow)
 for pl in donow:
     R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
     try:
         for url in R.findall(pl.get()):
             url = wikipedia.unicode2html(url, 'ascii')
             try:
                 error = URLerrorFinder().open(url)
             except IOError:
                 error = -1
             if error in allowederrorcodes:
                 working += 1
             else:
                 nonworking += 1
                 print
                 wikipedia.output(u'Page "%s" links to:' % pl.title())
                 wikipedia.output(url)
                 wikipedia.output(u'Which gave error: %s %s' %
                                  (error, errorname(error)))
     # If anything is wrong with the Wikipedia page, just ignore
     except (wikipedia.NoPage, wikipedia.IsRedirectPage,
             wikipedia.LockedPage):
         pass
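For comparison, a minimal standalone sketch of the same extract-and-check loop in Python 3, assuming plain page text and the standard-library urllib in place of the old wikipedia/URLerrorFinder helpers (check_links and the 2xx/3xx "allowed" test are illustrative assumptions, not the script's actual allowederrorcodes logic):

    import re
    import urllib.request
    import urllib.error

    URL_RE = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')

    def check_links(page_title, page_text):
        working = nonworking = 0
        for url in URL_RE.findall(page_text):
            try:
                # any response that comes back at all counts as "working"
                code = urllib.request.urlopen(url, timeout=10).getcode()
            except urllib.error.HTTPError as err:
                code = err.code
            except (urllib.error.URLError, OSError):
                code = -1
            if 200 <= code < 400:
                working += 1
            else:
                nonworking += 1
                print('Page "%s" links to:' % page_title)
                print(url)
                print('Which gave error: %s' % code)
        return working, nonworking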