コード例 #1
0
    def handle_data(self, data):
        """Record a chunk of text according to the tag at the front of the tag stack.

        Nothing happens while the tag stack is empty.  Data seen under a
        ``title`` tag is converted with ``htmltools.htmlToText`` and stored
        as the title, replacing any previously stored value.  Data under
        ``style`` or ``script`` is dropped.  Any other data is passed through
        ``texttools.stripWhiteSpaces``, stripped, converted with
        ``htmltools.htmlToText`` and written to the text buffer, preceded by
        a single space.
        """
        stack = self.__tagStack
        if len(stack) == 0:
            return

        # Tag names are compared case-insensitively.
        tag = stack[0].lower()

        if tag == 'title':
            self.__title = htmltools.htmlToText(data)
            return

        if tag in ('style', 'script'):
            # Stylesheets and scripts are not part of the readable text.
            return

        # The separator is written first so consecutive fragments never
        # run into each other.
        self.__text.write(" ")
        cleaned = (texttools.stripWhiteSpaces(data) or "").strip()
        self.__text.write(htmltools.htmlToText(cleaned))
コード例 #2
0
    def handle_data(self, data):
        """Record a chunk of text according to the tag at the front of the tag stack.

        Nothing happens while the tag stack is empty.  Data seen under a
        ``title`` tag is converted with ``htmltools.htmlToText`` and stored
        as the title, replacing any previously stored value.  Data under
        ``style`` or ``script`` is dropped.  Any other data is passed through
        ``texttools.stripWhiteSpaces``, stripped, converted with
        ``htmltools.htmlToText`` and written to the text buffer, preceded by
        a single space.
        """
        stack = self.__tagStack
        if len(stack) == 0:
            return

        # Tag names are compared case-insensitively.
        tag = stack[0].lower()

        if tag == 'title':
            self.__title = htmltools.htmlToText(data)
            return

        if tag in ('style', 'script'):
            # Stylesheets and scripts are not part of the readable text.
            return

        # The separator is written first so consecutive fragments never
        # run into each other.
        self.__text.write(" ")
        cleaned = (texttools.stripWhiteSpaces(data) or "").strip()
        self.__text.write(htmltools.htmlToText(cleaned))
コード例 #3
0
def htmlToText(data=None, input=None):
    """Parse HTML and return its title, plain text and links.

    The markup is handed to a ``MyHTMLParser`` via ``feed(data=..., input=...)``;
    how the two arguments are consumed is decided by that parser.

    When the parser reports no title, one is derived from the extracted
    text: text of at most 60 characters is used unchanged, longer text is
    cut at the last space found at or before index 60 and suffixed with
    "...".  If no usable space exists, the text is cut hard at 60
    characters.

    Returns:
        A ``(title, text, links)`` tuple, where ``text`` is the result of
        ``texttools.stripWhiteSpaces`` applied to the parser's text and
        ``links`` is whatever the parser's ``getLinks()`` returns.
    """
    h = MyHTMLParser()
    h.feed(data=data, input=input)
    h.close()

    text = texttools.stripWhiteSpaces(h.getText())

    title = h.getTitle()
    if not title:
        maxLen = 60
        # Guard against a falsy result (e.g. None) so len()/slicing is safe.
        source = text or ""
        if len(source) <= maxLen:
            title = source
        else:
            # Look for the word boundary in the extracted text, not in the
            # raw markup: ``data`` may be None and its offsets are unrelated
            # to positions in ``text``.
            end = source.rfind(' ', 0, maxLen + 1)
            if end <= 0:
                # No space to break on: fall back to a hard cut instead of
                # producing a title that is only "...".
                end = maxLen
            title = source[:end] + "..."

    return (title, text, h.getLinks())
コード例 #4
0
def htmlToText(data=None, input=None):
    """Parse HTML and return its title, plain text and links.

    The markup is handed to a ``MyHTMLParser`` via ``feed(data=..., input=...)``;
    how the two arguments are consumed is decided by that parser.

    When the parser reports no title, one is derived from the extracted
    text: text of at most 60 characters is used unchanged, longer text is
    cut at the last space found at or before index 60 and suffixed with
    "...".  If no usable space exists, the text is cut hard at 60
    characters.

    Returns:
        A ``(title, text, links)`` tuple, where ``text`` is the result of
        ``texttools.stripWhiteSpaces`` applied to the parser's text and
        ``links`` is whatever the parser's ``getLinks()`` returns.
    """
    h = MyHTMLParser()
    h.feed(data=data, input=input)
    h.close()

    text = texttools.stripWhiteSpaces(h.getText())

    title = h.getTitle()
    if not title:
        maxLen = 60
        # Guard against a falsy result (e.g. None) so len()/slicing is safe.
        source = text or ""
        if len(source) <= maxLen:
            title = source
        else:
            # Look for the word boundary in the extracted text, not in the
            # raw markup: ``data`` may be None and its offsets are unrelated
            # to positions in ``text``.
            end = source.rfind(' ', 0, maxLen + 1)
            if end <= 0:
                # No space to break on: fall back to a hard cut instead of
                # producing a title that is only "...".
                end = maxLen
            title = source[:end] + "..."

    return (title, text, h.getLinks())