def fix_HTML(self, terms, claims, data):
    """Decode HTML entities in terms and in monolingual-text claims.

    Each label, description and alias in ``terms`` and the text of each
    claim of type ``'monolingualtext'`` is run through ``html2unicode``
    (after replacing the ``'^|^'`` marker with ``'&'``) over and over until
    the result no longer changes. Every changed value is written back in
    place.

    :param terms: mapping with the keys ``'labels'`` and ``'descriptions'``
        (language -> text) and ``'aliases'`` (language -> list of texts)
    :param claims: mapping whose values are iterables of claim objects
    :param data: object with ``append``; receives ``claim.toJSON()`` for
        every claim whose text was changed
    :return: True if at least one value was changed, otherwise False
    """
    def decode_once(text):
        # Replace the '^|^' marker with '&' before decoding entities.
        return html2unicode(text.replace('^|^', '&'))

    modified = False

    for key in ('labels', 'descriptions'):
        by_lang = terms[key]
        for lang, current in by_lang.items():
            decoded = decode_once(current)
            # Repeat until decoding is a no-op, so that text which was
            # escaped more than once ends up fully decoded.
            while decoded != current:
                by_lang[lang] = current = decoded
                modified = True
                decoded = decode_once(current)

    for alias_list in terms['aliases'].values():
        for index, current in enumerate(alias_list):
            decoded = decode_once(current)
            while decoded != current:
                alias_list[index] = current = decoded
                modified = True
                decoded = decode_once(current)

    for claim_group in claims.values():
        for claim in claim_group:
            if claim.type != 'monolingualtext':
                continue
            current = claim.target.text if claim.target else None
            touched = False
            # Claims without a target or with empty text are left alone.
            while current:
                decoded = decode_once(current)
                if decoded == current:
                    break
                claim.target.text = current = decoded
                touched = True
            if touched:
                # Record the updated claim only once, after full decoding.
                data.append(claim.toJSON())
                modified = True

    return modified
 def fix_HTML(self, terms, claims, data):
     """Decode HTML entities in terms and in monolingual-text claims.

     Each label, description and alias in ``terms`` and the text of each
     claim of type ``'monolingualtext'`` is run through ``html2unicode``
     (after replacing the ``'^|^'`` marker with ``'&'``) over and over until
     the result no longer changes. Every changed value is written back in
     place.

     :param terms: mapping with the keys ``'labels'`` and ``'descriptions'``
         (language -> text) and ``'aliases'`` (language -> list of texts)
     :param claims: mapping whose values are iterables of claim objects
     :param data: object with ``append``; receives ``claim.toJSON()`` for
         every claim whose text was changed
     :return: True if at least one value was changed, otherwise False
     """
     def decode_once(text):
         # Replace the '^|^' marker with '&' before decoding entities.
         return html2unicode(text.replace('^|^', '&'))

     modified = False

     for key in ('labels', 'descriptions'):
         by_lang = terms[key]
         for lang, current in by_lang.items():
             decoded = decode_once(current)
             # Repeat until decoding is a no-op, so that text which was
             # escaped more than once ends up fully decoded.
             while decoded != current:
                 by_lang[lang] = current = decoded
                 modified = True
                 decoded = decode_once(current)

     for alias_list in terms['aliases'].values():
         for index, current in enumerate(alias_list):
             decoded = decode_once(current)
             while decoded != current:
                 alias_list[index] = current = decoded
                 modified = True
                 decoded = decode_once(current)

     for claim_group in claims.values():
         for claim in claim_group:
             if claim.type != 'monolingualtext':
                 continue
             current = claim.target.text if claim.target else None
             touched = False
             # Claims without a target or with empty text are left alone.
             while current:
                 decoded = decode_once(current)
                 if decoded == current:
                     break
                 claim.target.text = current = decoded
                 touched = True
             if touched:
                 # Record the updated claim only once, after full decoding.
                 data.append(claim.toJSON())
                 modified = True

     return modified
Example #3
0
 def resolveHtmlEntities(self, text):
     """Convert HTML entities in the text to the characters they stand for.

     Entities for the code points listed below are passed to
     ``pywikibot.html2unicode`` as ``ignore``; ``'comment'`` and
     ``'source'`` are passed as its ``exceptions``.

     :param text: the text to process
     :return: the value returned by ``pywikibot.html2unicode``
     """
     # Code points whose entities are not decoded.
     preserved = [
         38,  # Ampersand (&)
         39,  # Single quotation mark (') per T26093
         60,  # Less than (<)
         62,  # Greater than (>)
         91,  # Opening square bracket ([)
         # - sometimes used intentionally inside links
         93,  # Closing square bracket (])
         # - used intentionally inside links
         124,  # Vertical bar (|)
         # - used intentionally in navigation bar templates on w:de
         160,  # Non-breaking space (&nbsp;)
         # - not supported by Firefox textareas
         173,  # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     if self.template:
         # Also keep the space (32) and the colon (58) escaped.
         preserved.extend([32, 58])
     # TODO: T254350 - what other extension tags should be avoided?
     # (graph, math, score, timeline, etc.)
     return pywikibot.html2unicode(
         text, ignore=preserved, exceptions=['comment', 'source'])
Example #4
0
    def transform(self, ispdf=False):
        """Normalize the title.

        HTML entities are decoded (unless ``ispdf`` is set), runs of dashes
        and of decoration characters are collapsed, whitespace is
        normalized, and characters that would interfere with the
        surrounding wiki markup are replaced by numeric character
        references. The result is stored back in ``self.title``.

        :param ispdf: if true, skip the initial HTML entity decoding
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # The replacements below have to insert numeric character
        # references: replacing a character by itself would leave the
        # markup character active in the resulting wikitext.
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        # avoid multiple | being interpreted as a template parameter
        self.title = self.title.replace('|', '&#124;')

        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
Example #5
0
 def resolveHtmlEntities(self, text):
     """Decode HTML entities in text, except for a few protected ones.

     :param text: the text to process
     :return: the value returned by ``pywikibot.html2unicode``
     """
     # Regarding the apostrophe (39), see also
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     return pywikibot.html2unicode(text, ignore=[
         38,   # Ampersand (&)
         39,   # Apostrophe (') - Bugzilla 24093
         60,   # Less than (<)
         62,   # Greater than (>)
         91,   # Opening bracket ([)
               # - sometimes used intentionally inside links
         93,   # Closing bracket (])
               # - sometimes used intentionally inside links
         124,  # Vertical bar (|)
               # - used intentionally in navigation bar templates on de:
         160,  # Non-breaking space (&nbsp;)
               # - not supported by Firefox textareas
     ])
Example #6
0
 def resolveHtmlEntities(self, text):
     """Decode HTML entities in text while keeping selected ones escaped.

     The code points listed below are passed to ``pywikibot.html2unicode``
     as its ``ignore`` argument; code point 58 (the colon) is added when
     ``self.template`` is truthy.

     :param text: the text to process
     :return: the value returned by ``pywikibot.html2unicode``
     """
     always_kept = (
         38,    # Ampersand (&)
         39,    # Apostrophe (') - Bugzilla 24093
         60,    # Less than (<)
         62,    # Greater than (>)
         91,    # Opening bracket ([)
                # - sometimes used intentionally inside links
         93,    # Closing bracket (])
                # - sometimes used intentionally inside links
         124,   # Vertical bar (|)
                # - used intentionally in navigation bar templates on de:
         160,   # Non-breaking space (&nbsp;)
                # - not supported by Firefox textareas
         173,   # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     )
     # Regarding the apostrophe (39), see also
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     ignore = list(always_kept)
     if self.template:
         ignore.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=ignore)
Example #7
0
 def resolveHtmlEntities(self, text):
     """Return text with HTML entities decoded, apart from protected ones.

     :param text: the text to process
     :return: the value returned by ``pywikibot.html2unicode``
     """
     # Code points handed to html2unicode as ``ignore``.
     protected = [
         38,    # Ampersand (&)
         39,    # Single quotation mark (') - Bugzilla 24093
         60,    # Less than (<)
         62,    # Greater than (>)
         91,    # Opening square bracket ([)
                # - sometimes used intentionally inside links
         93,    # Closing square bracket (])
                # - used intentionally inside links
         124,   # Vertical bar (|)
                # - used intentionally in navigation bar templates on w:de
         160,   # Non-breaking space (&nbsp;)
                # - not supported by Firefox textareas
         173,   # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     # Code point 58 is the colon (:); it is protected only when
     # self.template is truthy.
     template_only = [58] if self.template else []
     return pywikibot.html2unicode(text, ignore=protected + template_only)
 def resolveHtmlEntities(self, text):
     """Decode the HTML entities found in text, skipping a fixed set.

     The skipped code points are listed below; the colon (58) joins them
     when ``self.template`` is truthy.

     :param text: the text to process
     :return: the value returned by ``pywikibot.html2unicode``
     """
     skipped = [
         38,    # Ampersand (&)
         39,    # Single quotation mark (') - Bugzilla 24093
         60,    # Less than (<)
         62,    # Greater than (>)
         91,    # Opening square bracket ([)
                # - sometimes used intentionally inside links
         93,    # Closing square bracket (])
                # - used intentionally inside links
         124,   # Vertical bar (|)
                # - used intentionally in navigation bar templates on w:de
         160,   # Non-breaking space (&nbsp;)
                # - not supported by Firefox textareas
         173,   # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     if self.template:
         skipped.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=skipped)
Example #9
0
    def transform(self, ispdf=False):
        """Normalize the title.

        HTML entities are decoded (unless ``ispdf`` is set), runs of dashes
        and of decoration characters are collapsed, whitespace is
        normalized, and characters that would interfere with the
        surrounding wiki markup are replaced by numeric character
        references. The result is stored back in ``self.title``.

        :param ispdf: if true, skip the initial HTML entity decoding
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # The replacements below have to insert numeric character
        # references: replacing a character by itself would leave the
        # markup character active in the resulting wikitext.
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
Example #10
0
    def transform(self, ispdf=False):
        """Normalize the title.

        HTML entities are decoded (unless ``ispdf`` is set), runs of dashes
        and of decoration characters are collapsed, whitespace is
        normalized, and characters that would interfere with the
        surrounding wiki markup are replaced by numeric character
        references. The result is stored back in ``self.title``.

        :param ispdf: if true, skip the initial HTML entity decoding
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r"-+", "-", self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r"(?u)\s", " ", self.title)
        self.title = re.sub(r"[\n\r\t]", " ", self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

        self.avoid_uppercase()
        # The replacements below have to insert numeric character
        # references: replacing a character by itself would leave the
        # markup character active in the resulting wikitext.
        # avoid closing the link before the end
        self.title = self.title.replace("]", "&#93;")
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace("}}", "}&#125;")
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())