Esempio n. 1
0
    def add_symbol(self):
        """Pull the next symbol from the tokenizer and store it if relevant.

        The symbol is appended to ``self.symbols`` unless it is empty.

        * While ``self.symbols`` is still empty, only a ``DOCTYPE`` symbol is
          accepted; any other symbol is replaced by ``Symbol.empty()`` and is
          therefore subject to the same emptiness check before storing.
        * When the most recently stored symbol opened a script or a style
          block, symbols are consumed from the tokenizer until one is found
          that is neither a script/style start or end nor plain text.
        """
        symbol = self.tokenizer.get_next_symbol()

        if self.symbols:
            last = self.symbols[-1]

            # if the previous tag has started a script or a style,
            # do not include the contents of those tags as they
            # are large and unnecessary
            # NOTE: this does not skip inline tags
            if last.type in (SymbolType.SCRIPT_START, SymbolType.STYLE_START):
                skipped_types = (
                    SymbolType.SCRIPT_START,
                    SymbolType.STYLE_START,
                    SymbolType.SCRIPT_END,
                    SymbolType.STYLE_END,
                    SymbolType.PLAIN_TEXT,
                )
                while symbol.type in skipped_types:
                    symbol = self.tokenizer.get_next_symbol()
        elif symbol.type != SymbolType.DOCTYPE:
            # do not add symbols until the start of the document has been
            # reached; the start is represented as a DOCTYPE tag
            symbol = Symbol.empty()

        if not symbol.is_empty():
            # might be empty when the end of the HTML has been reached
            self.symbols.append(symbol)