def transform(self, ispdf=False):
    """Normalize self.title into wikitext-safe link text.

    Decodes HTML entities (unless the source is a PDF, whose extracted
    text is already plain), collapses noise characters and whitespace,
    then re-escapes the characters that would break a wiki external
    link: ``]``, ``}}`` and doubled quotes.

    @param ispdf: skip HTML-entity decoding for PDF-extracted titles
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r'-+', '-', self.title)
    # remove formatting, i.e long useless strings
    self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r'(?u)\s', ' ', self.title)
    self.title = re.sub(r'[\n\r\t]', ' ', self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
    self.avoid_uppercase()
    # avoid closing the link before the end: ']' ends an external link,
    # so emit it as a numeric character reference instead
    self.title = self.title.replace(']', '&#93;')
    # avoid multiple } being interpreted as a template inclusion:
    # break up '}}' by escaping the second brace
    self.title = self.title.replace('}}', '}&#125;')
    # prevent multiple quotes being interpreted as '' or ''' (wiki
    # italic/bold markup): escape the second quote of every pair
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def transform(self, ispdf=False):
    """Normalize the title for safe use as external-link text.

    HTML entities are decoded first (skipped for PDFs, whose extracted
    text contains none), long runs of filler characters and all
    whitespace variants are collapsed, and finally the wikitext
    metacharacters ']', '}}' and doubled quotes are escaped as numeric
    character references so the title cannot terminate the link or be
    read as template/quote markup.

    @param ispdf: True when the title came from a PDF; skips entity decoding
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r'-+', '-', self.title)
    # remove formatting, i.e long useless strings
    self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r'(?u)\s', ' ', self.title)
    self.title = re.sub(r'[\n\r\t]', ' ', self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
    self.avoid_uppercase()
    # avoid closing the link before the end: escape ']' as &#93;
    self.title = self.title.replace(']', '&#93;')
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace('}}', '}&#125;')
    # prevent multiple quotes being interpreted as '' or '''
    # (the original "replace('', '\''')" here was a syntax error)
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def transform(self, ispdf=False):
    """Normalize the title.

    Decode HTML entities (not for PDFs), squash filler characters and
    whitespace, trim junk punctuation, then escape the characters that
    would corrupt the surrounding wikitext: "]" (closes the external
    link), "}}" (template inclusion) and "''" (italic/bold markup).

    @param ispdf: skip HTML-entity decoding when the title was
        extracted from a PDF
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r"-+", "-", self.title)
    # remove formatting, i.e long useless strings
    self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r"(?u)\s", " ", self.title)
    self.title = re.sub(r"[\n\r\t]", " ", self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))
    self.avoid_uppercase()
    # avoid closing the link before the end: the plain-"]" replacement
    # was a no-op; escape it as a numeric character reference
    self.title = self.title.replace("]", "&#93;")
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace("}}", "}&#125;")
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
else: donow = todo # If there was more to do, the 'if len(todo)<61' part would have extended # todo beyond this size. cont = False try: wikipedia.getall(mysite, donow) except wikipedia.SaxError: # Ignore this error, and get the pages the traditional way. pass checked += len(donow) for pl in donow: R = re.compile(r"http://[^\s}<\]]+[^\s.,:;)\?!\]}<]") try: for url in R.findall(pl.get()): url = wikipedia.unicode2html(url, "ascii") try: error = URLerrorFinder().open(url) except IOError: error = -1 if error in allowederrorcodes: working += 1 else: nonworking += 1 print wikipedia.output(u'Page "%s" links to:' % pl.title()) wikipedia.output(url) wikipedia.output(u"Which gave error: %s %s" % (error, errorname(error))) # If anything is wrong with the Wikipedia page, just ignore except (wikipedia.NoPage, wikipedia.IsRedirectPage, wikipedia.LockedPage): pass
else: donow = todo # If there was more to do, the 'if len(todo)<61' part would have extended # todo beyond this size. cont = False try: wikipedia.getall(mysite, donow) except wikipedia.SaxError: # Ignore this error, and get the pages the traditional way. pass checked += len(donow) for pl in donow: R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]') try: for url in R.findall(pl.get()): url = wikipedia.unicode2html(url, 'ascii') try: error = URLerrorFinder().open(url) except IOError: error = -1 if error in allowederrorcodes: working += 1 else: nonworking += 1 print wikipedia.output(u'Page "%s" links to:' % pl.title()) wikipedia.output(url) wikipedia.output(u'Which gave error: %s %s' % (error, errorname(error))) # If anything is wrong with the Wikipedia page, just ignore except (wikipedia.NoPage, wikipedia.IsRedirectPage,