Python html2unicode Examples, pywikibot.html2unicode Python Examples

Example #1

0

Show file

File: wikidata_cleanup_toolkit.py Project: matejsuchanek/pywikibot-scripts

 def fix_HTML(self, terms, claims, data):
     """Decode HTML entities in labels, descriptions, aliases and claims.

     Every string is decoded repeatedly until decoding no longer changes
     it, so nested escapes are fully resolved.  Before each decoding pass
     the sequence ``^|^`` is replaced by ``&`` (presumably a mangled
     ampersand - confirm against the data this is run on).

     :param terms: mapping with the keys ``'labels'`` and
         ``'descriptions'`` (language -> string) and ``'aliases'``
         (language -> list of strings); updated in place
     :param claims: mapping whose values are iterables of claims; only
         claims of type ``'monolingualtext'`` are examined and their
         ``target.text`` is updated in place
     :param data: list that receives ``claim.toJSON()`` for each claim
         whose text was changed
     :return: True if anything was changed, False otherwise
     """
     def decode(text):
         # Iterate to a fixed point: one pass may uncover further entities.
         while True:
             decoded = html2unicode(text.replace('^|^', '&'))
             if decoded == text:
                 return text
             text = decoded

     modified = False

     # Labels and descriptions: one string per language.
     for key in ('labels', 'descriptions'):
         for lang, original in terms[key].items():
             cleaned = decode(original)
             if cleaned != original:
                 terms[key][lang] = cleaned
                 modified = True

     # Aliases: a list of strings per language, fixed position by position.
     for alias_list in terms['aliases'].values():
         for index, original in enumerate(alias_list):
             cleaned = decode(original)
             if cleaned != original:
                 alias_list[index] = cleaned
                 modified = True

     # Monolingual text claims: changed ones are also recorded in ``data``.
     for claim_group in claims.values():
         for claim in claim_group:
             if claim.type != 'monolingualtext':
                 continue
             original = claim.target.text if claim.target else None
             if not original:
                 # No target or empty text: nothing to decode.
                 continue
             cleaned = decode(original)
             if cleaned != original:
                 claim.target.text = cleaned
                 data.append(claim.toJSON())
                 modified = True

     return modified

Example #2

0

Show file

File: wikidata_cleanup_toolkit.py Project: KaleemBhatti/pywikibot-scripts

 def fix_HTML(self, terms, claims, data):
     """Resolve HTML entities found in an item's terms and text claims.

     Each string is run through ``html2unicode`` until the result stops
     changing; before every pass ``^|^`` is turned into ``&`` (presumably
     a damaged ampersand - confirm against real data).

     :param terms: mapping with ``'labels'`` and ``'descriptions'``
         (language -> string) plus ``'aliases'`` (language -> list of
         strings); modified in place
     :param claims: mapping whose values are iterables of claims; only
         ``'monolingualtext'`` claims are touched, via ``target.text``
     :param data: list to which ``claim.toJSON()`` is appended for every
         claim whose text changed
     :return: True when at least one string was changed, else False
     """
     def fully_decoded(source):
         # Keep decoding until stable so nested escapes are unwrapped too.
         current = source
         while True:
             candidate = html2unicode(current.replace('^|^', '&'))
             if candidate == current:
                 return current
             current = candidate

     touched = False

     # Single-valued terms (one string per language).
     for section in ('labels', 'descriptions'):
         for lang, before in terms[section].items():
             after = fully_decoded(before)
             if after != before:
                 terms[section][lang] = after
                 touched = True

     # Multi-valued terms (list of aliases per language).
     for names in terms['aliases'].values():
         for pos, before in enumerate(names):
             after = fully_decoded(before)
             if after != before:
                 names[pos] = after
                 touched = True

     # Claims holding monolingual text.
     for group in claims.values():
         for claim in group:
             if claim.type != 'monolingualtext':
                 continue
             before = claim.target.text if claim.target else None
             if not before:
                 # Missing target or empty text is left alone.
                 continue
             after = fully_decoded(before)
             if after != before:
                 claim.target.text = after
                 # Report the updated claim to the caller.
                 data.append(claim.toJSON())
                 touched = True

     return touched

Example #3

0

Show file

 def resolveHtmlEntities(self, text):
     """Replace HTML entities in *text* with the characters they encode.

     The code points collected below are handed to
     ``pywikibot.html2unicode`` as ``ignore`` (presumably so their
     entities are left untouched); ``'comment'`` and ``'source'`` are
     passed as ``exceptions``.

     :param text: the text to process
     :return: the result of ``pywikibot.html2unicode``
     """
     skipped = [
         38,  # Ampersand (&amp;)
         39,  # Apostrophe (&#39;) per T26093
         60,  # Less than (&lt;)
         62,  # Greater than (&gt;)
         91,  # Opening square bracket ([)
         # - sometimes used intentionally inside links
         93,  # Closing square bracket (])
         # - used intentionally inside links
         124,  # Vertical bar (|)
         # - used intentionally in navigation bar templates on w:de
         160,  # Non-breaking space (&nbsp;)
         # - not supported by Firefox textareas
         173,  # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     if self.template:
         # Additionally skip Space ( ) and Colon (:)
         skipped.extend((32, 58))
     # TODO: T254350 - what other extension tags should be avoided?
     # (graph, math, score, timeline, etc.)
     return pywikibot.html2unicode(
         text, ignore=skipped, exceptions=['comment', 'source'])

Example #4

0

Show file

    def transform(self, ispdf=False):
        """Normalize the title.

        Cleans up ``self.title`` in place: optional HTML entity decoding,
        whitespace and punctuation clean-up, ``avoid_uppercase()``, then
        escaping of characters that are special in wikitext.

        :param ispdf: when true, the HTML entity decoding step is skipped
        """
        title = self.title
        if not ispdf:
            # convert html entities
            title = pywikibot.html2unicode(title)

        # Applied strictly in this order.
        substitutions = (
            # collapse runs of dashes into a single dash
            (r'-+', '-'),
            # remove formatting, i.e. long useless strings
            (r'[\.+\-=]{4,}', ' '),
            # turn every whitespace character into a plain space
            (r'(?u)\s', ' '),
            (r'[\n\r\t]', ' '),
        )
        for pattern, replacement in substitutions:
            title = re.sub(pattern, replacement, title)

        # strip leading and trailing = . ; , - + _ and spaces,
        # then collapse repeated spaces
        title = title.strip(r'=.;,-+_ ')
        self.title = re.sub(r' +', ' ', title)

        # operates on self.title, so it must be stored before this call
        self.avoid_uppercase()

        escapes = (
            # avoid closing the link before the end
            (']', '&#93;'),
            # avoid multiple } being interpreted as a template inclusion
            ('}}', '}&#125;'),
            # prevent multiple quotes being interpreted as '' or '''
            ('\'\'', '\'&#39;'),
            # avoid multiple | being interpreted as a template parameter
            ('|', '&#124;'),
        )
        title = self.title
        for needle, entity in escapes:
            title = title.replace(needle, entity)

        self.title = pywikibot.unicode2html(title, self.site.encoding())

Example #5

0

Show file

File: cosmetic_changes.py Project: azatoth/pywikipedia

 def resolveHtmlEntities(self, text):
     """Replace HTML entities in *text* via ``pywikibot.html2unicode``.

     The code points listed below are passed as ``ignore`` (presumably so
     that their entities are kept as they are).

     :param text: the text to process
     :return: the result of ``pywikibot.html2unicode``
     """
     keep_as_entities = [
         38,   # Ampersand (&amp;)
         39,   # Apostrophe (&#39;) - Bugzilla 24093
         60,   # Less than (&lt;)
         62,   # Greater than (&gt;)
         91,   # Opening bracket - sometimes used intentionally inside links
         93,   # Closing bracket - sometimes used intentionally inside links
         124,  # Vertical bar (|) - used intentionally in navigation bar
               # templates on de:
         160,  # Non-breaking space (&nbsp;) - not supported by Firefox
               # textareas
     ]
     # The apostrophe (39) is ignored on every site, which covers the
     # former eo-only special case; see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     return pywikibot.html2unicode(text, ignore=keep_as_entities)

Example #6

0

Show file

 def resolveHtmlEntities(self, text):
     """Replace HTML entities in *text* via ``pywikibot.html2unicode``.

     The code points listed below are passed as ``ignore`` (presumably so
     that their entities are kept as they are).  When ``self.template``
     is true, the colon is added to that list.

     :param text: the text to process
     :return: the result of ``pywikibot.html2unicode``
     """
     keep_as_entities = [
         38,    # Ampersand (&amp;)
         39,    # Apostrophe (&#39;) - Bugzilla 24093
         60,    # Less than (&lt;)
         62,    # Greater than (&gt;)
         91,    # Opening bracket - sometimes used intentionally inside links
         93,    # Closing bracket - sometimes used intentionally inside links
         124,   # Vertical bar (|) - used intentionally in navigation bar
                # templates on de:
         160,   # Non-breaking space (&nbsp;) - not supported by Firefox
                # textareas
         173,   # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     # The apostrophe (39) is ignored on every site, which covers the
     # former eo-only special case; see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     if self.template:
         keep_as_entities.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=keep_as_entities)

Example #7

0

Show file

 def resolveHtmlEntities(self, text):
     """Replace HTML entities in *text* via ``pywikibot.html2unicode``.

     The code points listed below are passed as ``ignore`` (presumably so
     that their entities are kept as they are).  When ``self.template``
     is true, the colon is added to that list.

     :param text: the text to process
     :return: the result of ``pywikibot.html2unicode``
     """
     untouched = [
         38,     # Ampersand (&amp;)
         39,     # Apostrophe (&#39;) - Bugzilla 24093
         60,     # Less than (&lt;)
         62,     # Greater than (&gt;)
         91,     # Opening square bracket ([)
                 # - sometimes used intentionally inside links
         93,     # Closing square bracket (])
                 # - used intentionally inside links
         124,    # Vertical bar (|)
                 # - used intentionally in navigation bar templates on w:de
         160,    # Non-breaking space (&nbsp;)
                 # - not supported by Firefox textareas
         173,    # Soft hyphen (&shy;) - enable editing
         8206,   # Left-to-right mark (&lrm;)
         8207,   # Right-to-left mark (&rlm;)
     ]
     if self.template:
         untouched.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=untouched)

Example #8

0

Show file

File: cosmetic_changes.py Project: skamithi/pywikibot-core

 def resolveHtmlEntities(self, text):
     """Convert HTML entities in *text* using ``pywikibot.html2unicode``.

     A list of code points is supplied as ``ignore`` (presumably so the
     matching entities stay in entity form); the colon joins that list
     when ``self.template`` is true.

     :param text: the text to process
     :return: the result of ``pywikibot.html2unicode``
     """
     exempt = [
         38,     # Ampersand (&amp;)
         39,     # Apostrophe (&#39;) - Bugzilla 24093
         60,     # Less than (&lt;)
         62,     # Greater than (&gt;)
         91,     # Opening square bracket ([)
                 # - sometimes used intentionally inside links
         93,     # Closing square bracket (])
                 # - used intentionally inside links
         124,    # Vertical bar (|)
                 # - used intentionally in navigation bar templates on w:de
         160,    # Non-breaking space (&nbsp;)
                 # - not supported by Firefox textareas
         173,    # Soft hyphen (&shy;) - enable editing
         8206,   # Left-to-right mark (&lrm;)
         8207,   # Right-to-left mark (&rlm;)
     ]
     if self.template:
         exempt.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=exempt)

Example #9

0

Show file

File: reflinks.py Project: APerson241/pywikibot-core

    def transform(self, ispdf=False):
        """Normalize the title.

        Cleans up ``self.title`` in place: optional HTML entity decoding,
        whitespace and punctuation clean-up, ``avoid_uppercase()``, then
        escaping of characters that are special in wikitext.

        :param ispdf: when true, the HTML entity decoding step is skipped
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # replace \n, \r and Unicode spaces in titles with a plain space
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing =/./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        # avoid | being interpreted as a template parameter separator;
        # without this escape a title containing a pipe can break the
        # surrounding wikitext
        self.title = self.title.replace('|', '&#124;')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())

Example #10

0

Show file

File: reflinks.py Project: hasteur/pywikibot_scripts

    def transform(self, ispdf=False):
        """Normalize the title.

        Cleans up ``self.title`` in place: optional HTML entity decoding,
        whitespace and punctuation clean-up, ``avoid_uppercase()``, then
        escaping of characters that are special in wikitext.

        :param ispdf: when true, the HTML entity decoding step is skipped
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r"-+", "-", self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
        # replace \n, \r and Unicode spaces in titles with a plain space
        self.title = re.sub(r"(?u)\s", " ", self.title)
        self.title = re.sub(r"[\n\r\t]", " ", self.title)
        # remove extra whitespaces
        # remove leading and trailing =/./;/,/-/_/+/ /
        self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace("]", "&#93;")
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace("}}", "}&#125;")
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
        # avoid | being interpreted as a template parameter separator;
        # without this escape a title containing a pipe can break the
        # surrounding wikitext
        self.title = self.title.replace("|", "&#124;")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())