def fix_HTML(self, terms, claims, data):
    """Unescape HTML entities in terms and monolingual-text claims.

    Every string is rewritten by replacing '^|^' with '&' and passing the
    result through ``html2unicode``; the pass is repeated until the string
    no longer changes.

    :param terms: mapping with the keys 'labels' and 'descriptions'
        (language -> string) and 'aliases' (language -> list of strings);
        changed strings are written back in place
    :param claims: mapping whose values are iterables of claims; only
        claims whose ``type`` is 'monolingualtext' are looked at, and
        their ``target.text`` is updated in place
    :param data: list that receives ``claim.toJSON()`` for every claim
        whose text was changed
    :return: True if at least one string was changed, False otherwise
    """
    def decode_once(text):
        # A single unescaping pass over one string.
        # NOTE(review): '^|^' is presumably a stand-in for an ampersand
        # in the stored data - confirm against the data source.
        return html2unicode(text.replace('^|^', '&'))

    modified = False

    # Labels and descriptions: one string per language.
    for section in ('labels', 'descriptions'):
        for lang, text in terms[section].items():
            decoded = decode_once(text)
            while decoded != text:
                terms[section][lang] = text = decoded
                modified = True
                decoded = decode_once(text)

    # Aliases: a list of strings per language, patched by position.
    for alias_list in terms['aliases'].values():
        for index, text in enumerate(alias_list):
            decoded = decode_once(text)
            while decoded != text:
                alias_list[index] = text = decoded
                modified = True
                decoded = decode_once(text)

    # Claims: only monolingual text carries a string to unescape.
    for claim_group in claims.values():
        for claim in claim_group:
            if claim.type != 'monolingualtext':
                continue
            if claim.target:
                text = claim.target.text
            else:
                text = None
            touched = False
            # An empty or missing text is never passed to the decoder.
            while text:
                decoded = decode_once(text)
                if decoded == text:
                    break
                claim.target.text = text = decoded
                touched = True
            if touched:
                data.append(claim.toJSON())
                modified = True

    return modified
def resolveHtmlEntities(self, text):
    """Replace HTML entities in *text* with the corresponding characters.

    The code points collected below are handed to
    ``pywikibot.html2unicode`` as ``ignore``, and 'comment' and 'source'
    are handed over as ``exceptions``.

    :param text: the text to process
    :return: whatever ``pywikibot.html2unicode`` returns for *text*
    """
    # Code points passed as ``ignore``.
    keep_encoded = [
        38,    # ampersand (&)
        39,    # apostrophe (') per T26093
        60,    # less-than sign (<)
        62,    # greater-than sign (>)
        91,    # opening square bracket ([)
               # - sometimes used intentionally inside links
        93,    # closing square bracket (])
               # - used intentionally inside links
        124,   # vertical bar (|)
               # - used intentionally in navigation bar templates on w:de
        160,   # non-breaking space
               # - not supported by Firefox textareas
        173,   # soft hyphen - enable editing
        8206,  # left-to-right mark
        8207,  # right-to-left mark
    ]
    if self.template:
        # When self.template is set, space (32) and colon (58) are
        # added to the list as well.
        keep_encoded.extend([32, 58])

    # TODO: T254350 - what other extension tags should be avoided?
    # (graph, math, score, timeline, etc.)
    return pywikibot.html2unicode(
        text, ignore=keep_encoded, exceptions=['comment', 'source'])
def transform(self, ispdf=False):
    """Normalize the title.

    Works on ``self.title`` in place: optionally decodes HTML entities,
    collapses runs of dashes and whitespace, strips leading/trailing
    punctuation, calls ``self.avoid_uppercase()``, escapes characters
    that would otherwise be read as wiki markup, and finally passes the
    title through ``pywikibot.unicode2html`` with the site's encoding.

    :param ispdf: if true, the initial HTML entity decoding is skipped
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r'-+', '-', self.title)
    # remove formatting, i.e long useless strings
    self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r'(?u)\s', ' ', self.title)
    self.title = re.sub(r'[\n\r\t]', ' ', self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

    self.avoid_uppercase()

    # Escape markup-significant characters with numeric character
    # references. Replacing a character with itself would be a no-op,
    # so the replacement text has to be the entity form.
    markup_escapes = (
        # avoid closing the link before the end
        (']', '&#93;'),
        # avoid multiple } being interpreted as a template inclusion
        ('}}', '}&#125;'),
        # prevent multiple quotes being interpreted as '' or '''
        ("''", "'&#39;"),
        # avoid multiple | being interpreted as a template parameter
        ('|', '&#124;'),
    )
    for raw, escaped in markup_escapes:
        self.title = self.title.replace(raw, escaped)

    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def resolveHtmlEntities(self, text):
    """Run *text* through ``pywikibot.html2unicode``.

    The code points listed below are passed as ``ignore``.

    :param text: the text to process
    :return: whatever ``pywikibot.html2unicode`` returns for *text*
    """
    untouched = [
        38,   # ampersand (&)
        39,   # apostrophe (') - Bugzilla 24093; see also
              # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
        60,   # less than (<)
        62,   # greater than (>)
        91,   # opening bracket - sometimes used intentionally inside links
        93,   # closing bracket - sometimes used intentionally inside links
        124,  # vertical bar (|)
              # - used intentionally in navigation bar templates on de:
        160,  # non-breaking space - not supported by Firefox textareas
    ]
    return pywikibot.html2unicode(text, ignore=untouched)
def resolveHtmlEntities(self, text):
    """Run *text* through ``pywikibot.html2unicode``.

    The code points listed below are passed as ``ignore``; the colon
    (58) is added to them when ``self.template`` is set.

    :param text: the text to process
    :return: whatever ``pywikibot.html2unicode`` returns for *text*
    """
    skipped = [
        38,    # ampersand (&)
        39,    # apostrophe (') - Bugzilla 24093; see also
               # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
        60,    # less than (<)
        62,    # greater than (>)
        91,    # opening bracket - sometimes used intentionally inside links
        93,    # closing bracket - sometimes used intentionally inside links
        124,   # vertical bar (|)
               # - used intentionally in navigation bar templates on de:
        160,   # non-breaking space - not supported by Firefox textareas
        173,   # soft hyphen - enable editing
        8206,  # left-to-right mark
        8207,  # right-to-left mark
    ]
    if self.template:
        skipped.append(58)  # colon (:)
    return pywikibot.html2unicode(text, ignore=skipped)
def resolveHtmlEntities(self, text):
    """Run *text* through ``pywikibot.html2unicode``.

    A fixed set of code points is passed as ``ignore``; the colon (58)
    joins that set when ``self.template`` is set.

    :param text: the text to process
    :return: whatever ``pywikibot.html2unicode`` returns for *text*
    """
    exempt = [
        38,    # ampersand (&)
        39,    # apostrophe (') - Bugzilla 24093
        60,    # less than (<)
        62,    # greater than (>)
        91,    # opening square bracket ([)
               # - sometimes used intentionally inside links
        93,    # closing square bracket (])
               # - used intentionally inside links
        124,   # vertical bar (|)
               # - used intentionally in navigation bar templates on w:de
        160,   # non-breaking space
               # - not supported by Firefox textareas
        173,   # soft hyphen - enable editing
        8206,  # left-to-right mark
        8207,  # right-to-left mark
    ]
    if self.template:
        exempt = exempt + [58]  # colon (:)
    result = pywikibot.html2unicode(text, ignore=exempt)
    return result
def transform(self, ispdf=False):
    """Normalize the title.

    Rewrites ``self.title`` in place: HTML entities are decoded unless
    *ispdf* is true, dash and whitespace runs are collapsed, surrounding
    punctuation is stripped, ``self.avoid_uppercase()`` is called,
    markup-significant characters are escaped, and the result is passed
    through ``pywikibot.unicode2html`` with the site's encoding.

    :param ispdf: if true, the initial HTML entity decoding is skipped
    """
    title = self.title
    # convert html entities
    if not ispdf:
        title = pywikibot.html2unicode(title)
    title = re.sub(r'-+', '-', title)
    # remove formatting, i.e long useless strings
    title = re.sub(r'[\.+\-=]{4,}', ' ', title)
    # remove \n and \r and Unicode spaces from titles
    title = re.sub(r'(?u)\s', ' ', title)
    title = re.sub(r'[\n\r\t]', ' ', title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r' +', ' ', title.strip(r'=.;,-+_ '))

    # avoid_uppercase() is defined elsewhere; self.title is stored
    # before the call and read back after it.
    self.avoid_uppercase()

    # The replacements below must insert numeric character references;
    # replacing a character with itself would leave the markup intact.
    # avoid closing the link before the end
    self.title = self.title.replace(']', '&#93;')
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace('}}', '}&#125;')
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def transform(self, ispdf=False):
    """Normalize the title.

    Updates ``self.title`` step by step: decode HTML entities (skipped
    when *ispdf* is true), collapse dash and whitespace runs, strip
    surrounding punctuation, call ``self.avoid_uppercase()``, escape
    characters that would be read as wiki markup, and pass the result
    through ``pywikibot.unicode2html`` with the site's encoding.

    :param ispdf: if true, the initial HTML entity decoding is skipped
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r"-+", "-", self.title)
    # remove formatting, i.e long useless strings
    self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r"(?u)\s", " ", self.title)
    self.title = re.sub(r"[\n\r\t]", " ", self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

    self.avoid_uppercase()

    # Each replacement inserts a numeric character reference; replacing
    # a sequence with itself would not change the title at all.
    # avoid closing the link before the end
    self.title = self.title.replace("]", "&#93;")
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace("}}", "}&#125;")
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())