def handle_data(self, data):
    """Route a run of character data to the title or to the text buffer.

    The tag name stored at index 0 of the tag stack decides what happens:

    * empty stack: the data is dropped;
    * ``title``: the data, passed through ``htmltools.htmlToText``,
      replaces the stored title;
    * ``style`` / ``script``: the data is dropped;
    * any other tag: a single space followed by the cleaned-up data is
      written to the text buffer.

    NOTE(review): index 0 is presumably the innermost open tag - confirm
    against the code that pushes onto the tag stack.
    """
    if not self.__tagStack:
        return

    currentTag = self.__tagStack[0].lower()
    if currentTag == 'title':
        self.__title = htmltools.htmlToText(data)
    elif currentTag not in ('style', 'script'):
        # A separator is written before every chunk, even an empty one.
        self.__text.write(" ")
        # stripWhiteSpaces may hand back a falsy value; fall back to "".
        data = texttools.stripWhiteSpaces(data) or ""
        data = htmltools.htmlToText(data.strip())
        self.__text.write(data)
def htmlToText(data=None, input=None):
    """Parse HTML and return a ``(title, text, links)`` tuple.

    ``data`` and ``input`` are forwarded unchanged, as keyword arguments,
    to ``MyHTMLParser.feed``. (``input`` shadows the builtin, but it is
    part of the public signature and therefore kept.)

    ``text`` is the parser's text after ``texttools.stripWhiteSpaces``.
    ``title`` is the parser's title when it has one; otherwise it is
    derived from ``text``: the whole text when it is at most 60 characters
    long, else the text cut at the last space at or before index 60 (or
    hard-cut at 60 characters when there is no such space) with "..."
    appended. ``links`` is whatever ``getLinks()`` returns.
    """
    h = MyHTMLParser()
    h.feed(data=data, input=input)
    h.close()
    text = texttools.stripWhiteSpaces(h.getText())

    title = h.getTitle()
    if not title:
        maxTitleLen = 60
        # Guard against a None text so title derivation cannot crash.
        body = text or ""
        if len(body) <= maxTitleLen:
            title = body
        else:
            # Search the extracted text (not the raw input) for a word
            # boundary so the title does not end in the middle of a word.
            end = body.rfind(' ', 0, maxTitleLen + 1)
            if end <= 0:
                # No usable space: fall back to a hard cut.
                end = maxTitleLen
            title = body[:end] + "..."
    return (title, text, h.getLinks())
def htmlToText(data=None, input=None):
    """Parse HTML and return a ``(title, text, links)`` tuple.

    ``data`` and ``input`` are forwarded unchanged, as keyword arguments,
    to ``MyHTMLParser.feed``. (``input`` shadows the builtin, but it is
    part of the public signature and therefore kept.)

    ``text`` is the parser's text after ``texttools.stripWhiteSpaces``.
    ``title`` is the parser's title when it has one; otherwise it is
    derived from ``text``: the whole text when it is at most 60 characters
    long, else the text cut at the last space at or before index 60 (or
    hard-cut at 60 characters when there is no such space) with "..."
    appended. ``links`` is whatever ``getLinks()`` returns.
    """
    h = MyHTMLParser()
    h.feed(data=data, input=input)
    h.close()
    text = texttools.stripWhiteSpaces(h.getText())

    title = h.getTitle()
    if not title:
        maxTitleLen = 60
        # Guard against a None text so title derivation cannot crash.
        body = text or ""
        if len(body) <= maxTitleLen:
            title = body
        else:
            # Search the extracted text (not the raw input) for a word
            # boundary so the title does not end in the middle of a word.
            end = body.rfind(' ', 0, maxTitleLen + 1)
            if end <= 0:
                # No usable space: fall back to a hard cut.
                end = maxTitleLen
            title = body[:end] + "..."
    return (title, text, h.getLinks())