def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
             lowercaseElementName=False, lowercaseAttrName=False):
    # Change the case-matching defaults, as we only output lowercase HTML anyway.
    # This solution doesn't seem ideal...
    HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                           lowercaseElementName, lowercaseAttrName)
    # Counter (not just a flag) indicating whether stripping is going on.
    self.stripping = 0
Example #2
def __init__(self,
             stream,
             encoding=None,
             parseMeta=True,
             lowercaseElementName=False,
             lowercaseAttrName=False):
    # Change the case-matching defaults, as we only output lowercase HTML anyway.
    # This solution doesn't seem ideal...
    # Pass the options by keyword so they cannot silently shift position if the
    # base __init__ also takes useChardet (as in the example above).
    HTMLTokenizer.__init__(self, stream, encoding=encoding, parseMeta=parseMeta,
                           lowercaseElementName=lowercaseElementName,
                           lowercaseAttrName=lowercaseAttrName)
def __iter__(self):
    for token in HTMLTokenizer.__iter__(self):
        # If it's a start or end tag of a risky block element (e.g. script),
        # update the stripping counter; a counter, rather than a flag, lets us
        # handle nested risky block elements.
        if self.strip_tokens and token["type"] in ["StartTag", "EndTag"] \
                and token["name"].lower() in HTMLSanitizerMixin.unacceptable_block_elements:
            if token["type"] == "StartTag":
                self.stripping += 1
            elif token["type"] == "EndTag":
                self.stripping -= 1

        # Only yield tokens if we are not in stripping mode.
        if self.stripping < 1:
            token = self.sanitize_token(token, self.strip_tokens)
            if token:
                yield token
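Taken together with the first __init__ above (which initialises self.stripping), this __iter__ describes a tokenizer subclass that drops everything inside risky block elements. The sketch below is one way the pieces could be assembled. It is a minimal sketch only: it assumes the legacy (pre-1.0) html5lib import paths html5lib.tokenizer.HTMLTokenizer and html5lib.sanitizer.HTMLSanitizerMixin, plus a project-specific mixin that, as the snippet implies, defines unacceptable_block_elements and a two-argument sanitize_token; the class name StrippingSanitizer and the strip_tokens keyword are illustrative, not part of html5lib.

# Minimal sketch; assumes pre-1.0 html5lib import paths and a customised
# HTMLSanitizerMixin that provides unacceptable_block_elements and a
# sanitize_token(token, strip) variant, as the snippet above implies.
from html5lib.tokenizer import HTMLTokenizer
from html5lib.sanitizer import HTMLSanitizerMixin


class StrippingSanitizer(HTMLTokenizer, HTMLSanitizerMixin):  # hypothetical name
    def __init__(self, stream, strip_tokens=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, **kwargs)
        self.strip_tokens = strip_tokens
        # Counter, not a boolean, so nested risky elements are handled.
        self.stripping = 0

    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
            # Track how deeply we are nested inside risky block elements.
            if self.strip_tokens and token["type"] in ["StartTag", "EndTag"] \
                    and token["name"].lower() in self.unacceptable_block_elements:
                if token["type"] == "StartTag":
                    self.stripping += 1
                else:
                    self.stripping -= 1
            # Emit (sanitized) tokens only while outside every risky block.
            if self.stripping < 1:
                token = self.sanitize_token(token, self.strip_tokens)
                if token:
                    yield token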
Example #4
def parse(self, stream, output=True):
    # Tokenize the stream and optionally print each token dict.
    tokenizer = HTMLTokenizer(stream)
    for token in tokenizer:
        if output:
            print(token)
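A hedged usage sketch for the parse() helper above: the wrapper class TokenDumper is purely illustrative, the import path assumes pre-1.0 html5lib, and each token yielded by the tokenizer is a dict whose "type" field identifies the token kind (the other examples here compare it against "StartTag"/"EndTag").

# Illustrative wrapper around the parse() helper shown above; the class name
# TokenDumper is an assumption, and the import matches pre-1.0 html5lib.
from html5lib.tokenizer import HTMLTokenizer


class TokenDumper(object):
    def parse(self, stream, output=True):
        # Tokenize the stream and optionally print each token dict.
        tokenizer = HTMLTokenizer(stream)
        for token in tokenizer:
            if output:
                print(token)


TokenDumper().parse("<p class='greeting'>hello</p>")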
Example #5
def __iter__(self):
    for token in HTMLTokenizer.__iter__(self):
        # sanitize_token() may rewrite the token or suppress it entirely.
        token = self.sanitize_token(token)
        if token:
            yield token
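This is essentially the __iter__ of html5lib's own pre-1.0 HTMLSanitizer, which combined HTMLTokenizer with HTMLSanitizerMixin. The sketch below shows how such a sanitizing tokenizer was typically wired into the parser in those releases via the tokenizer keyword argument; that argument was later removed, and html5lib 1.0+ performs sanitization with a tree-walker filter instead, so this applies to the legacy API only.

# Legacy (pre-1.0) html5lib only: the tokenizer keyword argument no longer
# exists in current releases, where sanitization is handled by a filter.
import html5lib
from html5lib.sanitizer import HTMLSanitizer

parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
fragment = parser.parseFragment("<p onclick='evil()'>hi</p><script>bad()</script>")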