def postproc(file):
    """Share ``file`` with everybody and, for web pages, lift the first
    ``h1`` heading out of the document into the ``title`` property.

    Every file gets the ``share`` value ``["everybody"]``.  Nothing else
    happens unless ``file.class_id`` is ``"webpage"`` and ``get_element``
    finds an ``h1`` in the handler's events.  In that case the heading's
    TEXT items are decoded from UTF-8, joined, and stored as ``title``
    (under the module-level ``language``), and the heading's events are
    removed from the handler.

    Returns None.
    """
    # Share
    file.set_value("share", ["everybody"])

    # Title
    if file.class_id != "webpage":
        return

    handler = file.get_handler()
    events = handler.events
    elem = get_element(events, "h1")
    if not elem:
        return

    title = [
        unicode(x[1], "utf8")
        for x in elem.get_content_elements()
        if x[0] == TEXT
    ]
    # Drop a trailing pilcrow (presumably a permalink marker emitted by the
    # documentation generator -- confirm).  The list is empty when the
    # heading holds no TEXT items, so check it before indexing; an unguarded
    # ``title[-1]`` raised IndexError in that case.
    if title and title[-1] == u"¶":
        title.pop()
    title = u"".join(title)
    file.set_property("title", title, language)

    # Cut the heading out: everything from elem.start through elem.end
    # (inclusive) is removed from the event list.
    handler.events = events[: elem.start] + events[elem.end + 1 :]
def postproc(file):
    """Post-process an imported file: share it, and give web pages a title.

    The ``share`` value is always set to ``['everybody']``.  For files whose
    ``class_id`` is ``'webpage'``, the first ``h1`` found by ``get_element``
    in the handler's events supplies the ``title`` property (its TEXT items,
    decoded from UTF-8 and concatenated, stored under the module-level
    ``language``); the heading itself is then removed from the handler's
    events.

    Returns None.
    """
    # Share
    file.set_value('share', ['everybody'])

    # Title
    if file.class_id != 'webpage':
        return

    handler = file.get_handler()
    events = handler.events
    elem = get_element(events, 'h1')
    if elem:
        title = [
            unicode(x[1], 'utf8')
            for x in elem.get_content_elements()
            if x[0] == TEXT
        ]
        # Strip a trailing pilcrow if there is one.  Comparing a slice
        # rather than ``title[-1]`` keeps this safe when the heading has no
        # TEXT items at all (the empty list used to raise IndexError here).
        if title[-1:] == [u'¶']:
            title.pop()
        title = u''.join(title)
        file.set_property('title', title, language)
        # Remove the heading's events, elem.start through elem.end inclusive.
        handler.events = events[:elem.start] + events[elem.end + 1:]
def filter(path, mimetype, body):
    """Decide what to do with one resource based on its MIME type.

    - ``text/html``: parse ``body`` as an ``XHTMLFile``, take the content of
      its ``<div class="body">``, pass it through ``rewrite_uris`` with the
      module-level ``rewrite``, and return ``to_str()`` of a fresh
      ``XHTMLFile`` whose body is that content.  If no such ``div`` is
      found, print ``E <path>`` and return None.
    - a type listed in the module-level ``skip``: return None.
    - a type listed in the module-level ``keep``: return ``body`` unchanged.
    - anything else: print ``X <path> <mimetype>`` and return ``body``
      unchanged.
    """
    # HTML
    if mimetype == 'text/html':
        source = XHTMLFile(string=body)
        target = XHTMLFile()
        # "class" is a reserved word, hence the ** dict for the attribute.
        elem = get_element(source.events, 'div', **{'class': 'body'})
        if not elem:
            # Single-argument print(...) parses on both Python 2 and 3 and
            # emits the same text as the old ``print 'E', path`` statement.
            print('E %s' % (path,))
            return None
        elements = elem.get_content_elements()
        elements = rewrite_uris(elements, rewrite)
        elements = list(elements)
        target.set_body(elements)
        return target.to_str()
    # Skip
    elif mimetype in skip:
        return None
    # Keep
    elif mimetype in keep:
        return body
    # Unknown
    else:
        print('X %s %s' % (path, mimetype))
        return body
def filter(path, mimetype, body):
    """Return the converted form of a resource, or None to drop it.

    HTML (``text/html``) is reduced to the content of its
    ``<div class="body">`` with URIs passed through ``rewrite_uris`` (using
    the module-level ``rewrite``) and re-serialised via a new ``XHTMLFile``;
    when that ``div`` is missing, ``E <path>`` is printed and None is
    returned.  MIME types in the module-level ``skip`` yield None, those in
    ``keep`` yield ``body`` untouched, and any other type is reported as
    ``X <path> <mimetype>`` and also returned untouched.
    """
    # HTML
    if mimetype == "text/html":
        source = XHTMLFile(string=body)
        target = XHTMLFile()
        # "class" is a reserved word, hence the ** dict for the attribute.
        elem = get_element(source.events, "div", **{"class": "body"})
        if not elem:
            # One pre-formatted argument keeps the output identical to the
            # old ``print "E", path`` statement while also being valid
            # Python 3 syntax.
            print("E %s" % (path,))
            return None
        elements = elem.get_content_elements()
        elements = rewrite_uris(elements, rewrite)
        elements = list(elements)
        target.set_body(elements)
        return target.to_str()

    # Skip
    if mimetype in skip:
        return None

    # Keep
    if mimetype in keep:
        return body

    # Unknown
    print("X %s %s" % (path, mimetype))
    return body
def get_body(self):
    """Look up the ``body`` element in this object's events.

    Returns whatever ``get_element`` yields for ``self.events``.
    """
    body = get_element(self.events, 'body')
    return body
def get_head(self):
    """Look up the ``head`` element in this object's events.

    Returns whatever ``get_element`` yields for ``self.events``.
    """
    head = get_element(self.events, 'head')
    return head