Beispiel #1
0
    def parseHTML(self, htmlString):
        """
            Tokenize htmlString and rebuild this object's render list.

            Every character is fed to a fresh TokenizerStateMachine. Each
            token the state machine emits is either passed to
            self.handleLinkToken (start tags named "link") or to a
            TokenHandler that was given self.rootUrl. Afterwards
            self.strList, self.renderList and self.renderObjects are
            cleared, self.renderList is refilled from the TokenHandler's
            elementTreeRoot via self.fillRenderList, and every entry of
            the render list is printed.

            htmlString: the raw HTML text to process.

            Nothing is returned; the results live in self.renderList.
        """
        tsm = TokenizerStateMachine()
        tokenHandler = TokenHandler()
        tokenHandler.rootUrl = self.rootUrl

        length = len(htmlString)
        i = 0
        while i < length:
            # The state machine decides how far the read position moves.
            # NOTE(review): a return value of 0 would feed the same
            # character again -- presumably intended for re-consuming a
            # character in a new state; confirm against handleCharacter.
            i += tsm.handleCharacter(htmlString[i])

            token = tsm.currentEmittedToken
            # Identity check: "== None" would go through the token's __eq__.
            if token is None:
                continue

            if isinstance(token, StartTagToken) and token.name == "link":
                self.handleLinkToken(token)
            else:
                tokenHandler.processToken(token)
            # Reset so the same token is not handled again on the next step.
            tsm.currentEmittedToken = None

        self.strList.clear()
        self.renderList.clear()
        self.renderObjects.clear()
        self.fillRenderList(tokenHandler.elementTreeRoot, self.renderList)
        for s in self.renderList:
            print(s)
Beispiel #2
0
    def convertTokenListToHTMLElementTree(self, url, tokenList):
        """
            Feed every token of tokenList to a new TokenHandler and return
            the elementTreeRoot of that handler.

            url: passed to self.extractRootUrl; the result is printed and
                 handed to the TokenHandler.
            tokenList: iterable of tokens, processed in order.
        """
        # The TokenHandler receives the root url so that it can deal with
        # possible relative links.
        baseUrl = self.extractRootUrl(url)
        print(baseUrl)

        handler = TokenHandler(baseUrl)
        for currentToken in tokenList:
            handler.processToken(currentToken)
        return handler.elementTreeRoot
Beispiel #3
0
def test_processTokenHandlesClosingTags():
    """A closed "p" followed by a closed "a" yields two children of the root."""
    # Produces: <p>, </p>, <a>, </a> -- each tag is closed before the next opens.
    tokenSequence = [
        makeToken(tagName)
        for tagName in ("p", "a")
        for makeToken in (StartTagToken, EndTagToken)
    ]

    handler = TokenHandler("TestRootUrl/")
    for currentToken in tokenSequence:
        handler.processToken(currentToken)

    root = handler.elementTreeRoot
    assert root.name == "#root#"
    assert len(root.children) == 2
    assert [child.name for child in root.children] == ["p", "a"]
Beispiel #4
0
def test_processTokenProcessesIndependentTags():
    """
    An unclosed "br", an "img" flagged as self-closing and a closed "html"
    must all end up as direct children of the root, in that order.
    """
    lineBreak = StartTagToken("br")
    image = StartTagToken("img")
    htmlOpen = StartTagToken("html")
    htmlClose = EndTagToken("html")
    # Only the img token is explicitly marked as self-closing.
    image.isSelfClosing = True

    handler = TokenHandler("TestRootUrl/")
    for currentToken in (lineBreak, image, htmlOpen, htmlClose):
        handler.processToken(currentToken)

    children = handler.elementTreeRoot.children
    assert len(children) == 3
    assert [child.name for child in children] == ["br", "img", "html"]
Beispiel #5
0
def test_processTokenHandlesUnacceptableTags():
    """
    A "p" wrapping a made-up tag, a "body" and an "a" must leave the root
    with exactly one child, the "p" element.
    """
    # Tags are opened in this order and closed in the reverse order, so the
    # token stream is properly nested.
    nestedNames = [
        "p",
        "NoHTMLTagShouldEverHaveThisName",
        "body",
        "a",
    ]
    openingTokens = [StartTagToken(tagName) for tagName in nestedNames]
    closingTokens = [EndTagToken(tagName) for tagName in reversed(nestedNames)]

    handler = TokenHandler("TestRootUrl/")
    for currentToken in openingTokens + closingTokens:
        handler.processToken(currentToken)

    rootChildren = handler.elementTreeRoot.children
    assert len(rootChildren) == 1
    assert rootChildren[0].name == "p"