Exemple #1
0
 def postproc(file):
     """Share ``file`` with everybody and, for web pages, promote the h1 to the title.

     The "share" value is always set to ``["everybody"]``.  Nothing else
     happens unless ``file.class_id`` is ``"webpage"``.  For a web page, the
     first "h1" element found by ``get_element`` in the handler's events is
     turned into the "title" property (stored under ``language``, a name
     taken from the enclosing scope) and the element is then cut out of
     ``handler.events``.

     Returns None in every case.
     """
     # Share
     file.set_value("share", ["everybody"])
     # Title
     if file.class_id != "webpage":
         return
     handler = file.get_handler()
     events = handler.events
     elem = get_element(events, "h1")
     if not elem:
         return
     # Keep only the TEXT events of the heading, decoded from UTF-8.
     title = [
         unicode(x[1], "utf8")
         for x in elem.get_content_elements()
         if x[0] == TEXT
     ]
     # A heading with no text gives an empty list; check it is non-empty
     # before looking at the last item, otherwise title[-1] raises IndexError.
     if title and title[-1] == u"¶":
         # Drop the trailing pilcrow marker.
         title.pop()
     title = u"".join(title)
     file.set_property("title", title, language)
     # Remove the heading (start through end, inclusive) from the events.
     handler.events = events[: elem.start] + events[elem.end + 1 :]
Exemple #2
0
 def postproc(file):
     """Share ``file`` with everybody and, for web pages, promote the h1 to the title.

     The 'share' value is always set to ``['everybody']``.  Nothing else
     happens unless ``file.class_id`` is ``'webpage'``.  For a web page, the
     first 'h1' element found by ``get_element`` in the handler's events is
     turned into the 'title' property (stored under ``language``, a name
     taken from the enclosing scope) and the element is then cut out of
     ``handler.events``.

     Returns None in every case.
     """
     # Share
     file.set_value('share', ['everybody'])
     # Title
     if file.class_id != 'webpage':
         return
     handler = file.get_handler()
     events = handler.events
     elem = get_element(events, 'h1')
     if not elem:
         return
     # Keep only the TEXT events of the heading, decoded from UTF-8.
     title = [
         unicode(x[1], 'utf8')
         for x in elem.get_content_elements()
         if x[0] == TEXT
     ]
     # A heading with no text gives an empty list; check it is non-empty
     # before looking at the last item, otherwise title[-1] raises IndexError.
     if title and title[-1] == u'¶':
         # Drop the trailing pilcrow marker.
         title.pop()
     title = u''.join(title)
     file.set_property('title', title, language)
     # Remove the heading (start through end, inclusive) from the events.
     handler.events = events[:elem.start] + events[elem.end+1:]
Exemple #3
0
 def postproc(file):
     """Share ``file`` with everybody and, for web pages, promote the h1 to the title.

     The 'share' value is always set to ``['everybody']``.  Nothing else
     happens unless ``file.class_id`` is ``'webpage'``.  For a web page, the
     first 'h1' element found by ``get_element`` in the handler's events is
     turned into the 'title' property (stored under ``language``, a name
     taken from the enclosing scope) and the element is then cut out of
     ``handler.events``.

     Returns None in every case.
     """
     # Share
     file.set_value('share', ['everybody'])
     # Title
     if file.class_id != 'webpage':
         return
     handler = file.get_handler()
     events = handler.events
     elem = get_element(events, 'h1')
     if not elem:
         return
     # Keep only the TEXT events of the heading, decoded from UTF-8.
     title = [
         unicode(x[1], 'utf8')
         for x in elem.get_content_elements()
         if x[0] == TEXT
     ]
     # A heading with no text gives an empty list; check it is non-empty
     # before looking at the last item, otherwise title[-1] raises IndexError.
     if title and title[-1] == u'¶':
         # Drop the trailing pilcrow marker.
         title.pop()
     title = u''.join(title)
     file.set_property('title', title, language)
     # Remove the heading (start through end, inclusive) from the events.
     handler.events = events[:elem.start] + events[elem.end+1:]
Exemple #4
0
 def filter(path, mimetype, body):
     # HTML
     if mimetype == 'text/html':
         source = XHTMLFile(string=body)
         target = XHTMLFile()
         elem = get_element(source.events, 'div', **{'class': 'body'})
         if not elem:
             print 'E', path
             return None
         elements = elem.get_content_elements()
         elements = rewrite_uris(elements, rewrite)
         elements = list(elements)
         target.set_body(elements)
         return target.to_str()
     # Skip
     elif mimetype in skip:
         return None
     # Keep
     elif mimetype in keep:
         return body
     # Unknown
     else:
         print 'X', path, mimetype
         return body
Exemple #5
0
 def filter(path, mimetype, body):
     # HTML
     if mimetype == 'text/html':
         source = XHTMLFile(string=body)
         target = XHTMLFile()
         elem = get_element(source.events, 'div', **{'class': 'body'})
         if not elem:
             print 'E', path
             return None
         elements = elem.get_content_elements()
         elements = rewrite_uris(elements, rewrite)
         elements = list(elements)
         target.set_body(elements)
         return target.to_str()
     # Skip
     elif mimetype in skip:
         return None
     # Keep
     elif mimetype in keep:
         return body
     # Unknown
     else:
         print 'X', path, mimetype
         return body
Exemple #6
0
 def filter(path, mimetype, body):
     # HTML
     if mimetype == "text/html":
         source = XHTMLFile(string=body)
         target = XHTMLFile()
         elem = get_element(source.events, "div", **{"class": "body"})
         if not elem:
             print "E", path
             return None
         elements = elem.get_content_elements()
         elements = rewrite_uris(elements, rewrite)
         elements = list(elements)
         target.set_body(elements)
         return target.to_str()
     # Skip
     elif mimetype in skip:
         return None
     # Keep
     elif mimetype in keep:
         return body
     # Unknown
     else:
         print "X", path, mimetype
         return body
Exemple #7
0
 def get_body(self):
     """Look up the 'body' element in ``self.events``.

     Returns whatever ``get_element`` yields for that lookup.
     """
     body = get_element(self.events, 'body')
     return body
Exemple #8
0
 def get_head(self):
     """Look up the 'head' element in ``self.events``.

     Returns whatever ``get_element`` yields for that lookup.
     """
     head = get_element(self.events, 'head')
     return head