class HTMLSequenceWrapperRecord(object):
    """One record of an HTML sequence, broken down into text chunks.

    Wraps a single element, keeps its cleaned full text and builds a flat
    list of ``TextChunk`` objects from the element and its descendants
    (pre-order: an element first, then its children).
    """

    # NOTE(review): a class with this same name is defined again right after
    # this one in the file; the later definition rebinds the name.

    def __init__(self, element, url, mintextlen=10):
        """Store the element and eagerly extract its text and chunks.

        Args:
            element: Root element of the record. The code relies on it
                providing ``text_content()``, ``iterchildren()``, ``get()``,
                ``text``, ``tag`` and ``style``.
            url: URL associated with the record; stored unchanged.
            mintextlen: Length the cleaned text has to exceed for
                ``has_value()`` to return True.
        """
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url
        # The whole text of the record, run through the cleaner.
        self.text = self.cleaner.clean(self.elem.text_content())
        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        """Tell whether the record carries enough text.

        Returns:
            True when the cleaner reports text in the record and the cleaned
            text is longer than ``mintextlen`` characters, False otherwise.
        """
        # Any falsy answer from the cleaner means "no text", the same way
        # _handle_elem() interprets it.
        if not self.cleaner.contains_text(self.text):
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        """Return the list of chunks extracted from the record."""
        return self.chunks

    def get_text(self):
        """Return the cleaned text of the whole record."""
        return self.text

    def _handle_elem(self, elem):
        """Build a ``TextChunk`` describing ``elem``.

        Args:
            elem: Element to convert.

        Returns:
            The new chunk, or None when ``elem.text`` is None or the cleaner
            reports that the element contains no text.
        """
        if elem.text is None:
            return None
        # NOTE(review): the element itself is passed here, whereas
        # has_value() passes a string -- confirm contains_text() accepts both.
        if not self.cleaner.contains_text(elem):
            return None

        chunk = TextChunk()

        # Any element carrying an 'href' contributes a link, not only anchors.
        href = elem.get('href')
        if href is not None:
            chunk.set_link(href)

        # The 'title' attribute is used as a comment for anchors only.
        if elem.tag == 'a':
            title = elem.get('title')
            if title is not None:
                chunk.set_comment(title)

        chunk.set_text(self.cleaner.clean(elem.text_content()))
        # NOTE(review): 'style' is read as a plain attribute of the element;
        # presumably the parser supplies a custom element class -- confirm.
        chunk.set_style(elem.style)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        """Collect chunks from ``elem`` and, recursively, its children.

        Elements for which ``_handle_elem()`` returns None add no chunk, but
        their children are still visited.
        """
        chunk = self._handle_elem(elem)
        if chunk is not None:
            self.chunks.append(chunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        """Return a short description containing the record's text."""
        # __modulename__ is expected to be a module-level name defined
        # elsewhere in this file.
        return ("<" + __modulename__ + ".HTMLSequenceWrapperRecord instance "
                + self.text + " >")
class HTMLSequenceWrapperRecord(object):
    """A single record of an HTML sequence together with its text chunks.

    The record wraps one element, stores the cleaned text of that element
    and a flat list of ``TextChunk`` objects created from the element and
    its descendants (an element is handled before its children).
    """

    # NOTE(review): the file defines a class with this name twice; this is
    # the later definition, i.e. the one the name ends up bound to.

    def __init__(self, element, url, mintextlen=10):
        """Remember the element and extract its text and chunks right away.

        Args:
            element: Root element of the record. The code relies on it
                providing ``text_content()``, ``iterchildren()``, ``get()``,
                ``text``, ``tag`` and ``style``.
            url: URL associated with the record; stored unchanged.
            mintextlen: Length the cleaned text must exceed for
                ``has_value()`` to return True.
        """
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url
        # Full text of the record after cleaning.
        raw_text = self.elem.text_content()
        self.text = self.cleaner.clean(raw_text)
        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        """Report whether the record holds enough text.

        Returns:
            True when the cleaner finds text in the record and the cleaned
            text is longer than ``mintextlen`` characters, False otherwise.
        """
        # Treat every falsy cleaner result as "no text", consistent with the
        # check performed in _handle_elem().
        if not self.cleaner.contains_text(self.text):
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        """Return the list of chunks extracted from the record."""
        return self.chunks

    def get_text(self):
        """Return the cleaned text of the whole record."""
        return self.text

    def _handle_elem(self, elem):
        """Create a ``TextChunk`` for ``elem``.

        Args:
            elem: Element to convert.

        Returns:
            The new chunk, or None when ``elem.text`` is None or the cleaner
            reports that the element contains no text.
        """
        if elem.text is None:
            return None
        # NOTE(review): contains_text() receives the element here but a
        # string in has_value() -- confirm that both are supported.
        if not self.cleaner.contains_text(elem):
            return None

        chunk = TextChunk()

        # A link is recorded for any element that has an 'href' attribute.
        link = elem.get('href')
        if link is not None:
            chunk.set_link(link)

        # Only anchors contribute their 'title' attribute as a comment.
        if elem.tag == 'a':
            comment = elem.get('title')
            if comment is not None:
                chunk.set_comment(comment)

        chunk.set_text(self.cleaner.clean(elem.text_content()))
        # NOTE(review): 'style' is read as a plain attribute of the element;
        # presumably a custom element class provides it -- confirm.
        chunk.set_style(elem.style)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        """Append the chunk of ``elem`` and then recurse into its children.

        An element that yields no chunk is skipped, yet its children are
        still visited.
        """
        chunk = self._handle_elem(elem)
        if chunk is not None:
            self.chunks.append(chunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        """Return a short description containing the record's text."""
        # __modulename__ is expected to be a module-level name defined
        # elsewhere in this file.
        return ("<" + __modulename__ + ".HTMLSequenceWrapperRecord instance "
                + self.text + " >")