Python HTMLParser.__init__ Examples, htmllib.HTMLParser.__init__ Python Examples

Example #1

0

Show file

File: encode.py Project: jaryhjz/cga-worldmap

    def __init__(self, fmt=AbstractFormatter):
        """Start the base parser and build the sanitizer's lookup tables.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Forbidden tags.
        self.forbidden_tags = "script embed iframe frame".split()

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Per-tag attribute whitelist.  A tag that is missing from this
        # mapping is allowed no attributes at all.  Never add "on*" event
        # attributes (e.g. "onhover"), and treat "background" and "style"
        # with great care.
        box_attrs = ["border", "width", "height", "style", "align", "bgcolor"]
        whitelist = {
            "a": ["href", "title", "target", "style"],
            "img": ["src", "alt", "border", "style"],
            "blockquote": ["type", "style"],
            "font": ["size", "face"],
        }
        for heading in ("h5", "h4", "h3", "h2", "h1"):
            whitelist[heading] = ["style"]
        for container in ("table", "tbody", "tr", "td", "div", "span"):
            # Every container gets its own copy of the attribute list.
            whitelist[container] = list(box_attrs)
        self.allowed_attributes = whitelist

        # URL schemes accepted in href/src values.  "javascript" and
        # "vbscript" must never be added here.
        self.allowed_schemes = "http https ftp".split()

Example #2

0

Show file

File: xss.py Project: codartX/Coconut

    def __init__(self, fmt=AbstractFormatter):
        """Initialise the base parser and the XSS whitelist tables.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Whitelist of tags.  Extend it with care: "script", for instance,
        # must never be added.  'img' is left out by default because of the
        # danger of IMG embedded commands and/or web bugs.
        self.permitted_tags = "a b blockquote br i li ol ul p cite".split()

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Per-tag attribute whitelist; a tag absent from this mapping is
        # allowed no attributes.  Never add "on*" attributes such as
        # "onhover", and be very careful with "background" and "style".
        self.allowed_attributes = {
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        }

        # URL schemes accepted in href/src values.  Do not add "javascript"
        # or "vbscript".
        self.allowed_schemes = "http https ftp".split()

Example #3

0

Show file

File: xss.py Project: dftaiwo/oneroom

    def __init__(self, fmt=AbstractFormatter):
        """Create the parser with its tag, attribute and scheme whitelists.

        fmt is passed straight through to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # The only tags let through.  Think twice before extending this
        # list ("script" would be a very bad idea).  'img' is deliberately
        # absent: IMG embedded commands and web bugs make it dangerous.
        tag_whitelist = [
            'a', 'b', 'blockquote', 'br', 'i', 'li', 'ol', 'ul', 'p', 'cite',
        ]
        self.permitted_tags = tag_whitelist

        # Tags written without a closing tag.
        no_close = ['img', 'br']
        self.requires_no_close = no_close

        # Attributes allowed per tag.  Tags not listed get no attributes.
        # Event handlers ("on..." such as "onhover") do not belong here;
        # "background" and "style" need great care as well.
        attribute_whitelist = {}
        attribute_whitelist['a'] = ['href', 'title']
        attribute_whitelist['img'] = ['src', 'alt']
        attribute_whitelist['blockquote'] = ['type']
        self.allowed_attributes = attribute_whitelist

        # Schemes allowed in URLs (href and src).  Never add "javascript"
        # or "vbscript".
        scheme_whitelist = ['http', 'https', 'ftp']
        self.allowed_schemes = scheme_whitelist

Example #4

0

Show file

File: encode.py Project: alejandra2013/cga-worldmap

    def __init__(self, fmt = AbstractFormatter):
        """Initialise the base parser and the tag/attribute/scheme tables.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Forbidden tags.
        self.forbidden_tags = ['script', 'embed', 'iframe', 'frame']

        # Tags that need no closing tag.
        self.requires_no_close = ['img', 'br']

        # Per-tag attribute whitelist; unlisted tags get no attributes.
        # Keep "on*" handlers (like "onhover") out of it and be very careful
        # with "background" and "style".
        layout_attrs = ('border', 'width', 'height', 'style', 'align', 'bgcolor')
        permitted = {
            'a': ['href', 'title', 'target', 'style'],
            'img': ['src', 'alt', 'border', 'style'],
            'blockquote': ['type', 'style'],
        }
        for tag in ('table', 'tbody', 'tr', 'td', 'div', 'span'):
            # list() gives each tag an independent list.
            permitted[tag] = list(layout_attrs)
        self.allowed_attributes = permitted

        # URL schemes allowed for href and src.  Adding "javascript" or
        # "vbscript" would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

Example #5

0

Show file

File: sanitizer.py Project: twocngdagz/weppy

    def __init__(
        self,
        permitted_tags=None,
        allowed_attributes=None,
        fmt=AbstractFormatter,
        strip_disallowed=False
    ):
        """Configure the sanitizer's tag, attribute and URL-scheme whitelists.

        permitted_tags: sequence of tag names.  A trailing '/' (as in 'br/')
            marks a tag that requires no closing tag; the slash is removed
            before the name is stored.  None selects the built-in default
            list (a, b, blockquote, br/, i, li, ol, ul, p, cite, code, pre,
            img/).
        allowed_attributes: dict mapping a tag name to the list of attribute
            names allowed on it.  None selects the built-in default for
            'a', 'img' and 'blockquote'.  A dict supplied by the caller is
            stored as-is (not copied).
        fmt: forwarded to HTMLParser.__init__ as the formatter.
        strip_disallowed: stored on the instance; chooses between stripping
            and escaping disallowed tags.
        """
        # The defaults are built per call.  The previous list/dict literals
        # in the signature were created once at definition time, so the
        # default allowed_attributes dict was shared by every instance.
        if permitted_tags is None:
            permitted_tags = ['a', 'b', 'blockquote', 'br/', 'i',
                              'li', 'ol', 'ul', 'p', 'cite',
                              'code', 'pre', 'img/']
        if allowed_attributes is None:
            allowed_attributes = {
                'a': ['href', 'title'],
                'img': ['src', 'alt'],
                'blockquote': ['type'],
            }

        HTMLParser.__init__(self, fmt)
        self.result = ''
        self.open_tags = []
        # endswith() (rather than indexing [-1]) keeps an empty tag name
        # from raising IndexError.
        self.permitted_tags = [tag for tag in permitted_tags
                               if not tag.endswith('/')]
        self.requires_no_close = [tag[:-1] for tag in permitted_tags
                                  if tag.endswith('/')]
        # Tags that need no closing tag are permitted tags too.
        self.permitted_tags += self.requires_no_close
        self.allowed_attributes = allowed_attributes

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

        # Whether to strip (True) or escape (False) disallowed tags.
        self.strip_disallowed = strip_disallowed
        self.in_disallowed = False

Example #6

0

Show file

File: MetaHelp.py Project: CoachCoen/ECL

    def __init__(self, funFormatter, objHere):
        """Reset the objH1..objH3 slots, load the help data, start the parser.

        funFormatter is passed on to HTMLParser.__init__; objHere is handed
        to GetDOD together with the key "E3Help".
        """
        # Nothing is stored in the three slots yet.
        self.objH1 = self.objH2 = self.objH3 = None
        help_data = GetDOD(objHere, "E3Help")
        self.dodHelp = help_data

        # The base class is initialised last, as in the original code.
        HTMLParser.__init__(self, funFormatter)

Example #7

0

Show file

File: sanitizer.py Project: Viper525/sonospy

    def __init__(
        self,
        permitted_tags=None,
        allowed_attributes=None,
        fmt=AbstractFormatter,
        ):
        """Configure the sanitizer's tag, attribute and URL-scheme whitelists.

        permitted_tags: sequence of tag names.  A trailing '/' (as in 'br/')
            marks a tag that requires no closing tag; the slash is removed
            before the name is stored.  None selects the built-in default
            list (a, b, blockquote, br/, i, li, ol, ul, p, cite, code, pre,
            img/).
        allowed_attributes: dict mapping a tag name to the list of attribute
            names allowed on it.  None selects the built-in default for
            'a', 'img' and 'blockquote'.  A dict supplied by the caller is
            stored as-is (not copied).
        fmt: forwarded to HTMLParser.__init__ as the formatter.
        """
        # Build the defaults per call: mutable literals in the signature are
        # created once and would be shared by every instance.
        if permitted_tags is None:
            permitted_tags = [
                'a',
                'b',
                'blockquote',
                'br/',
                'i',
                'li',
                'ol',
                'ul',
                'p',
                'cite',
                'code',
                'pre',
                'img/',
                ]
        if allowed_attributes is None:
            allowed_attributes = {
                'a': ['href', 'title'],
                'img': ['src', 'alt'],
                'blockquote': ['type'],
                }

        HTMLParser.__init__(self, fmt)
        self.result = ''
        self.open_tags = []
        # endswith() (rather than indexing [-1]) keeps an empty tag name
        # from raising IndexError.
        self.permitted_tags = [tag for tag in permitted_tags
                               if not tag.endswith('/')]
        self.requires_no_close = [tag[:-1] for tag in permitted_tags
                                  if tag.endswith('/')]
        # Tags that need no closing tag are permitted tags too.
        self.permitted_tags += self.requires_no_close
        self.allowed_attributes = allowed_attributes

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

Example #8

0

Show file

File: prechm.py Project: Alisa-lisa/uni-stuff

 def __init__(self, formatter, path, output):
     """Initialise the base parser and the per-file processing state.

     formatter is forwarded to HTMLParser.__init__; path is a relative
     path and output is the output file.
     """
     HTMLParser.__init__(self, formatter)

     # Where we read from (relative path) and where we write to.
     self.path, self.ft = path, output

     # Number of tabs used when pretty-printing files.
     self.indent = 0

     # True only while actively processing (i.e. not inside headers,
     # footers, etc.).
     self.proc = False

     # Stack of hrefs taken from anchor begins.
     # XXX This shouldn't need to be a stack -- anchors shouldn't nest.
     # XXX See SF bug <http://www.python.org/sf/546579>.
     self.hrefstack = []

Example #9

0

Show file

File: prechm.py Project: vsajip/htmlhelp

 def __init__(self, formatter, path, output):
     """Set up the parser plus the bookkeeping used while processing a file.

     formatter goes to HTMLParser.__init__; path (a relative path) and
     output (the output file) are kept on the instance.
     """
     HTMLParser.__init__(self, formatter)
     # Relative path of the input.
     self.path = path
     # Output file.
     self.ft = output
     # Pretty-printing depth, counted in tabs.
     self.indent = 0
     # Set to True while actively processing; False for headers, footers
     # and the like.
     self.proc = False
     # hrefs collected from anchor begins, kept as a stack.
     # XXX This shouldn't need to be a stack -- anchors shouldn't nest.
     # XXX See SF bug <http://www.python.org/sf/546579>.
     href_stack = []
     self.hrefstack = href_stack

Example #10

0

Show file

File: cli.py Project: gruppobeatrice/rss-generator

   def __init__(self, formatter=None):
      """Initialise the base parser and the flags that track parsing state.

      formatter: formatter handed to HTMLParser.__init__.  When omitted
         (None), a fresh AbstractFormatter(DumbWriter()) is created for
         this instance.
      """
      # A default built in the signature is created once at definition
      # time, so every parser would share one formatter/writer pair.
      if formatter is None:
         formatter = AbstractFormatter(DumbWriter())

      HTMLParser.__init__(self,formatter)
      self.intoTheBox=False      # entering/leaving the box environment
      self.intoTheTitle=None     # entering/leaving the title line
      self.intoTheNews=None      # entering/leaving the news text
      self.effectiveTitle=False  # <span> tag of the title
      self.effectiveDate=False   # <span> tag of the date
      self.effectiveAuth=False   # <div> tag of the author
      self.writable=False        # text may be written
      self.cookedText=""         # resulting XML text

Example #11

0

Show file

File: di.py Project: gruppobeatrice/rss-generator

   def __init__(self, formatter=None):
      """Initialise the base parser and the flags that track parsing state.

      formatter: formatter handed to HTMLParser.__init__.  When omitted
         (None), a fresh AbstractFormatter(DumbWriter()) is created for
         this instance.
      """
      # A default built in the signature is created once at definition
      # time, so every parser would share one formatter/writer pair.
      if formatter is None:
         formatter = AbstractFormatter(DumbWriter())

      HTMLParser.__init__(self,formatter)
      self.intoTheAnchor=False   # entering/leaving anchor tags
      self.intoTheDate=False     # entering/leaving the date box
      self.intoTheTitle=False    # entering/leaving the title box
      self.intoTheNews=False     # entering/leaving the news box
      self.writable=False        # text may be written
      self.maxNews=10            # total number of news items displayed
      self.cookedText=""         # resulting XML text
      self.gotTitle=False        # flag: a title is actually present
      self.pseudoTitle=""        # fake title (used if one is missing)

Example #12

0

Show file

File: encode.py Project: neojjang/cga-worldmap

    def __init__(self, fmt=AbstractFormatter):
        """Initialise the base parser and the sanitizer whitelists.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Whitelist of tags.  Be careful when extending it: adding "script",
        # for example, would not be smart.
        self.permitted_tags = [
            "a", "b", "blockquote", "br", "i",
            "sup", "sub", "strike", "hr", "u",
            "li", "ol", "ul", "p", "cite",
            "img", "style", "font",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "pre", "div",
        ]

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Per-tag attribute whitelist; a tag that is not listed is allowed
        # no attributes.  "on*" attributes such as "onhover" must stay out,
        # and "background" and "style" deserve great care.
        self.allowed_attributes = {
            "a": ["href", "title"],
            "img": ["src", "alt"],
            "blockquote": ["type"],
        }

        # Schemes allowed in URLs (href and src attributes).  Do not add
        # "javascript" or "vbscript".
        self.allowed_schemes = "http https ftp".split()

Example #13

0

Show file

File: encode.py Project: Niran814804102/cga-worldmap

    def __init__(self, fmt=AbstractFormatter):
        """Initialise the base parser and the tag/attribute/scheme tables.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Forbidden tags.
        self.forbidden_tags = "script embed iframe frame".split()

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Per-tag attribute whitelist; unlisted tags are allowed no
        # attributes.  Keep "on*" handlers (like "onhover") out and be very
        # careful with "background" and "style".
        #
        # Sample markup kept from the original notes:
        #         <h5 style="text-align: center;"><b><i><u><font size="5" face="impact">THIS IS A TEST</font></u></i></b></h5>
        #         <blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><p style="text-align: center;">
        #         <font size="5" face="arial" color="#cc3333">of the EBS</font></p><p style="text-align: center;">
        #         <font size="5" face="arial"><br></font></p><p style="text-align: center;"><font size="5" face="arial">
        #         <sup>reddit</sup><sub>2</sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial"><sub><br></sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial">fiiiiiii<sub>4</sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial"><sub><br></sub></font></p>
        #         <p style="text-align: center;"><hr><br></p><p style="text-align: center;">
        #         <strike>strike</strike></p></blockquote>
        box_attrs = ['border', 'width', 'height', 'style', 'align', 'bgcolor']
        whitelist = {
            'a': ['href', 'title', 'target', 'style'],
            'p': ['style'],
            'img': ['src', 'alt', 'border', 'style', 'align'],
            'blockquote': ['type', 'style', 'align'],
            'font': ['size', 'face', 'align'],
        }
        for heading in ('h5', 'h4', 'h3', 'h2', 'h1'):
            whitelist[heading] = ['style']
        for container in ('table', 'tbody', 'tr', 'td', 'div', 'span'):
            # Each container gets an independent copy of the list.
            whitelist[container] = list(box_attrs)
        self.allowed_attributes = whitelist

        # Schemes allowed in URLs (href and src attributes).  Never add
        # "javascript" or "vbscript".
        self.allowed_schemes = "http https ftp".split()

Example #14

0

Show file

File: encode.py Project: GeoNode/geonode

    def __init__(self, fmt=AbstractFormatter):
        """Create the parser and its blacklist/whitelist configuration.

        fmt is passed straight through to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []

        # Forbidden tags.
        banned = ['script', 'embed', 'iframe', 'frame']
        self.forbidden_tags = banned

        # Tags written without a closing tag.
        self.requires_no_close = ['img', 'br']

        # Attributes allowed per tag.  Tags not listed get no attributes.
        # Event handlers ("on..." such as "onhover") do not belong here;
        # "background" and "style" need great care as well.
        #
        # Sample markup kept from the original notes:
        #         <h5 style="text-align: center;"><b><i><u><font size="5" face="impact">THIS IS A TEST</font></u></i></b></h5>
        #         <blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><p style="text-align: center;">
        #         <font size="5" face="arial" color="#cc3333">of the EBS</font></p><p style="text-align: center;">
        #         <font size="5" face="arial"><br></font></p><p style="text-align: center;"><font size="5" face="arial">
        #         <sup>reddit</sup><sub>2</sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial"><sub><br></sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial">fiiiiiii<sub>4</sub></font></p>
        #         <p style="text-align: center;"><font size="5" face="arial"><sub><br></sub></font></p>
        #         <p style="text-align: center;"><hr><br></p><p style="text-align: center;">
        #         <strike>strike</strike></p></blockquote>
        attrs = {}
        attrs['a'] = ['href', 'title', 'target', 'style']
        attrs['p'] = ['style']
        attrs['img'] = ['src', 'alt', 'border', 'style', 'align']
        attrs['blockquote'] = ['type', 'style', 'align']
        attrs['font'] = ['size', 'face', 'align']
        for level in (5, 4, 3, 2, 1):
            attrs['h%d' % level] = ['style']
        for tag in ('table', 'tbody', 'tr', 'td', 'div', 'span'):
            attrs[tag] = ['border', 'width', 'height', 'style', 'align', 'bgcolor']
        self.allowed_attributes = attrs

        # Schemes allowed in URLs (href and src).  Adding "javascript" or
        # "vbscript" would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

Example #15

0

Show file

File: xssParser.py Project: kingmk/motss

    def __init__(self, fmt = AbstractFormatter):
        """Initialise the base parser and the sanitizer configuration.

        fmt is forwarded unchanged to HTMLParser.__init__.
        """
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []
        self.at_users = []

        # Tag whitelist.
        self.permitted_tags = [
            'a', 'b', 'blockquote', 'br', 'i',
            'li', 'ol', 'ul', 'p', 'cite', 'span',
            'font', 'strike', 'div',
        ]

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Attribute whitelist, keyed by tag name.
        inline_style = 'style'
        self.allowed_attributes = {
            'a': ['href', 'title', inline_style],
            'img': ['src', 'width', 'height'],
            'blockquote': ['type'],
            'span': [inline_style],
            'font': ['color'],
            'div': [inline_style],
        }

        # URL scheme whitelist.
        self.allowed_schemes = "http https ftp".split()

Example #16

0

Show file

File: save_image.py Project: a-yasui/atysPy

 def __init__(self):
     """Start the base parser with a NullFormatter and prepare link capture.

     self.reg is a compiled pattern whose "name" group captures the last
     path segment ending in one of the extensions from _IMAGE_EXT;
     self.links starts out empty.
     """
     HTMLParser.__init__(self, NullFormatter())
     # Alternation of the extensions listed in _IMAGE_EXT.
     extension_alternatives = "|".join(_IMAGE_EXT)
     pattern = r".+/(?P<name>.+\.(?:" + extension_alternatives + ")).*$"
     self.reg = re.compile(pattern)
     self.links = []

Example #17

0

Show file

File: TCEQadaptor.py Project: twdb/txhis

 def __init__(self, formatter):
     """Initialise the base parser and the select-tracking state.

     formatter is forwarded to HTMLParser.__init__.
     """
     HTMLParser.__init__(self, formatter)  # base class constructor

     # Empty lists for the BasinIDs and for the recorded years.
     self.selectBasins, self.selectYears = [], []

     # Neither select is being processed yet.
     self.insideBasinIdSelect = self.insideYearSelect = False

Example #18

0

Show file

 def __init__(self):
     """Start the base parser on a null writer and reset the output state."""
     discard_formatter = AbstractFormatter(NullWriter())
     HTMLParser.__init__(self, discard_formatter)
     # Output pieces are collected in a list.
     self.result = []
     # Tags that need no closing tag.
     self.requires_no_close = "img br".split()

Example #19

0

Show file

File: manga_fetch.py Project: louxiu/Manga_downloader

 def __init__(self):
     """Initialise the base parser and zero the script/image state."""
     # The base class is given an empty string as its formatter argument.
     placeholder_formatter = ''
     HTMLParser.__init__(self, placeholder_formatter)
     self.script_tag_count, self.images_items = 0, ''

Example #20

0

Show file

File: BaseParser.py Project: kawasaki2013/sangiin-votes

 def __init__(self):
     """Initialise the base HTMLParser with a NullFormatter."""
     null_formatter = NullFormatter()
     HTMLParser.__init__(self, null_formatter)

Example #21

0

Show file

 def __init__ (self):
     """Set up the base parser, the image-name pattern and the link list.

     The pattern's "name" group captures the last path segment ending in
     one of the extensions listed in _IMAGE_EXT.
     """
     HTMLParser.__init__(self, NullFormatter())
     # Build the pattern from its fixed prefix/suffix and the joined
     # extension list.
     pieces = [r'.+/(?P<name>.+\.(?:', "|".join(_IMAGE_EXT), ')).*$']
     self.reg = re.compile(pieces[0] + pieces[1] + pieces[2])
     self.links = []

Example #22

0

Show file

 def __init__(self, given_formatter):
   """Initialise the base parser and start with an empty URL list.

   given_formatter is forwarded to HTMLParser.__init__.
   """
   HTMLParser.__init__(self, given_formatter)
   collected_urls = []
   self.urls = collected_urls

Example #23

0

Show file

File: parser.py Project: nitindhar7/pycrawler

 def __init__(self):
     """Initialise the private crawler state and the base HTMLParser.

     Sets the private link list and markup buffer, runs the base-class
     initialiser with a NullFormatter, and creates a lib.PyURLOpener.
     """
     self.__links = []
     self.__markup = ''
     # NOTE(review): __init__ methods return None, so __parser ends up as
     # None here rather than as a parser object -- confirm whether any other
     # method relies on self.__parser.
     self.__parser = HTMLParser.__init__(self, formatter.NullFormatter())
     self.__pyurlopener = lib.PyURLOpener()

Example #24

0

Show file

 def __init__(self, formatter, verbose=0):
     """Initialise the base parser and the table-tracking state.

     formatter and verbose are both forwarded to HTMLParser.__init__.
     """
     HTMLParser.__init__(self, formatter, verbose)
     # Fresh, independent lists for the table stack and the found tables.
     self._tableStack = list()
     # No table or cell is current yet.
     self._currentTable = self._currentCell = None
     self._tablesFound = list()

Example #25

0

Show file

File: crawler_ex.py Project: Johnson-wu/python

 def __init__(self, wait_parse, url):
     """Initialise the base parser and remember wait_parse and url."""
     HTMLParser.__init__(self)
     # Both arguments are stored unchanged on the instance.
     self.wait_parse, self.url = wait_parse, url

Example #26

0

Show file

    def __init__(self, require_link_target=False):
        """Initialise the base parser and the cleaner's whitelists.

        require_link_target is stored on the instance; it states whether
        links need to have a target attribute.
        """
        HTMLParser.__init__(self, AbstractFormatter(NullWriter()))
        self.result = []
        self.open_tags = []

        # Whitelist of tags.  Be careful when extending it: adding "script",
        # for example, would not be smart.  The order and the repeated
        # 'ul'/'ol'/'li' entries match the original list exactly.
        self.permitted_tags = [
            # basic text and blocks
            'a', 'b', 'br', 'em', 'i', 'li', 'ol', 'ul', 'p', 'strong', 'u',
            'div',
            # headings
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            # phrase-level markup
            'blockquote', 'q', 'cite', 'code', 'samp', 'kbd', 'var', 'dfn',
            'address', 'big', 'small', 'ins', 'del', 'acronym', 'abbr',
            'strike', 's', 'sub', 'sup', 'tt', 'pre', 'center',
            # font and layer tags
            'font', 'basefont', 'multicol', 'spacer', 'layer', 'ilayer',
            'nolayer',
            # images, maps and breaks
            'img', 'map', 'area', 'param', 'hr', 'nobr', 'wbr',
            # lists
            'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'menu', 'dir',
            # forms
            'form', 'input', 'button', 'label', 'select', 'option',
            'optgroup', 'textarea', 'fieldset', 'legend',
            # tables
            'table', 'tr', 'td', 'th', 'tbody', 'tfoot', 'thead', 'caption',
            'col', 'colgroup',
            'span',
        ]

        # Tags that need no closing tag.
        self.requires_no_close = "img br".split()

        # Per-tag attribute whitelist; a tag that is not listed is allowed
        # no attributes.  Keep "on*" attributes such as "onhover" out, and
        # be very careful with "background" and "style".
        cell_attrs = ['rowspan', 'colspan', 'width', 'height']
        self.allowed_attributes = {
            'a': ['href', 'target', 'rel'],
            'p': ['align'],
            'img': ['src', 'alt', 'border', 'title', "class"],
            'table': ['cellpadding', 'cellspacing', 'border', 'width', 'height'],
            'font': ['size', 'face', 'color', 'style', 'class'],
            'span': ['style'],
            'h3': ['style'],
            # td and th each get their own copy of the cell attributes.
            'td': list(cell_attrs),
            'th': list(cell_attrs),
        }

        # Schemes allowed in URLs (href and src attributes); the empty
        # string is part of the list.  Do not add "javascript" or
        # "vbscript".
        self.allowed_schemes = ['http', 'https', 'ftp', 'irc', 'mailto', '']

        # Whether links need to have a target attribute.
        self.require_link_target = require_link_target

Example #27

0

Show file

 def __init__(self, fmt, base):
     """Initialise the base parser with fmt and remember base."""
     HTMLParser.__init__(self, fmt)
     # Stored unchanged on the instance.
     setattr(self, 'base', base)

Example #28

0

Show file

 def __init__(self):
     """Initialise the base parser and reset the vote counters."""
     # The base class is given an empty string as its formatter argument.
     placeholder_formatter = ''
     HTMLParser.__init__(self, placeholder_formatter)
     self.vote_tag_count, self.votes_items = 0, []

Example #29

0

Show file

 def __init__(self):
     """Initialise the base parser, then call self.clear_serv()."""
     null_formatter = NullFormatter()
     HTMLParser.__init__(self, null_formatter)
     self.clear_serv()

Example #30

0

Show file

File: html2oo.py Project: MaxTyutyunnikov/lino

 def __init__(self, formatter, verbose=0):
     """Set up the base parser and empty table bookkeeping.

     Both formatter and verbose are handed to HTMLParser.__init__.
     """
     HTMLParser.__init__(self, formatter, verbose)
     table_stack, tables_found = [], []
     self._tableStack = table_stack
     # Nothing is current until parsing sets these.
     self._currentTable = None
     self._currentCell = None
     self._tablesFound = tables_found

Example #31

0

Show file

File: manga_fetch.py Project: louxiu/Manga_downloader

 def __init__(self):
     """Initialise the base parser and reset the script count and title."""
     # An empty string is passed where the base class expects a formatter.
     HTMLParser.__init__(self, '')
     self.script_tag_count, self.title = 0, ''

Example #32

0

Show file

File: ex11_12_13_14.py Project: omelhoro/python-coreapps

 def __init__(self, formatter, verbose=0):
     """Initialise the base parser and start with an empty image list.

     formatter and verbose are forwarded to HTMLParser.__init__.
     """
     HTMLParser.__init__(self, formatter, verbose)
     images = []
     self.imagelist = images

Example #33

0

Show file

File: slurp.py Project: Web5design/overplot

 def __init__(self):
   """Initialise the base HTMLParser with a NullFormatter."""
   formatter_instance = NullFormatter()
   HTMLParser.__init__(self, formatter_instance)

Example #34

0

Show file

File: pocket_sendmail.py Project: louxiu/pocket-push

 def __init__(self):
     """Initialise the base parser and clear the vote state."""
     # An empty string is passed where the base class expects a formatter.
     HTMLParser.__init__(self, '')
     # No vote tags seen and no items collected yet.
     self.vote_tag_count = 0
     collected = []
     self.votes_items = collected

Example #35

0

Show file

 def __init__(self, allow_refs=False):
     """Start the base parser on a null writer and store allow_refs."""
     discard_formatter = AbstractFormatter(NullWriter())
     HTMLParser.__init__(self, discard_formatter)
     # Output pieces are collected in a list.
     self.result = []
     self.allow_refs = allow_refs

Example #36

0

Show file

 def __init__(self, name, data):
     """Parse data as HTML and write the formatted text to the file name.

     name: path of the output file; it is opened in 'w' mode.
     data: HTML text passed to self.feed().

     All parsing happens here: the data is fed and the parser is closed
     before __init__ returns.
     """
     # The with-block closes the file once parsing is done (and also if
     # parsing raises); the original never closed the handle it opened.
     with open(name, 'w') as out:
         # 100 is DumbWriter's maxcol argument.
         f = AbstractFormatter(DumbWriter(out, 100))
         HTMLParser.__init__(self, f)
         self.feed(data)
         self.close()

Example #37

0

Show file

File: Html_cleaner.py Project: osborne6/luminotes

  def __init__( self, require_link_target = False ):
    """Create the parser and the whitelists used for cleaning.

    require_link_target is kept on the instance; it states whether links
    need to have a target attribute.
    """
    HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )
    self.result = []
    self.open_tags = []

    # The only tags allowed.  Think twice before adding to this ("script"
    # would be a very bad idea).  split() turns the text below into a list
    # with the same entries, order and repeated 'ul'/'ol'/'li' items as
    # the original list.
    self.permitted_tags = """
      a b br em i li ol ul p strong u div
      h1 h2 h3 h4 h5 h6
      blockquote q cite code samp kbd var dfn address big small ins del
      acronym abbr strike s sub sup tt pre center
      font basefont multicol spacer layer ilayer nolayer
      img map area param hr nobr wbr
      ul ol li dl dt dd menu dir
      form input button label select option optgroup textarea fieldset legend
      table tr td th tbody tfoot thead caption col colgroup
      span
    """.split()

    # Tags written without a closing tag.
    self.requires_no_close = [ 'img', 'br' ]

    # Attributes allowed per tag.  Tags not listed get no attributes.
    # Event handlers ("on..." such as "onhover") do not belong here;
    # "background" and "style" need great care as well.
    attribute_whitelist = {}
    attribute_whitelist[ 'a' ] = [ 'href', 'target', 'rel' ]
    attribute_whitelist[ 'p' ] = [ 'align' ]
    attribute_whitelist[ 'img' ] = [ 'src', 'alt', 'border', 'title', "class" ]
    attribute_whitelist[ 'table' ] = [ 'cellpadding', 'cellspacing', 'border', 'width', 'height' ]
    attribute_whitelist[ 'font' ] = [ 'size', 'face', 'color', 'style', 'class' ]
    attribute_whitelist[ 'span' ] = [ 'style' ]
    attribute_whitelist[ 'h3' ] = [ 'style' ]
    for cell_tag in ( 'td', 'th' ):
      attribute_whitelist[ cell_tag ] = [ 'rowspan', 'colspan', 'width', 'height' ]
    self.allowed_attributes = attribute_whitelist

    # Schemes allowed in URLs (href and src); the empty string is part of
    # the list.  Adding "javascript" or "vbscript" would not be smart.
    self.allowed_schemes = [ 'http', 'https', 'ftp', 'irc', 'mailto', '' ]

    # Whether links need to have a target attribute.
    self.require_link_target = require_link_target

Example #38

0

Show file

File: sniffer_html_parser.py Project: jtauber/redfoot-orig

 def __init__(self, adder):
     """Initialise the base parser (NullFormatter, verbose 0), store adder."""
     null_formatter = formatter.NullFormatter()
     HTMLParser.__init__(self, null_formatter, 0)
     self.adder = adder

Example #39

0

Show file

File: Html_differ.py Project: osborne6/luminotes

 def __init__( self ):
   """Start the base parser on a null writer and reset the output state."""
   null_writer = NullWriter()
   HTMLParser.__init__( self, AbstractFormatter( null_writer ) )
   # Output pieces are collected in a list.
   self.result = []
   # Tags that need no closing tag.
   no_close_tags = [ 'img', 'br' ]
   self.requires_no_close = no_close_tags

Example #40

0

Show file

File: Html_nuker.py Project: osborne6/luminotes

 def __init__( self, allow_refs = False ):
   """Start the base parser on a null writer and remember allow_refs."""
   null_writer = NullWriter()
   HTMLParser.__init__( self, AbstractFormatter( null_writer ) )
   # Output pieces are collected in a list; allow_refs is stored unchanged.
   self.result, self.allow_refs = [], allow_refs