Esempio n. 1
0
 def parse(self, ui):
     """Parse one window of the blob referenced by ``ui`` and queue its entries.

     Reads up to 10000 bytes from the blob identified by ``ui.delicious``,
     starting at ``ui.cursor``, parses them with BeautifulSoup and defers one
     ``utils.delicious`` call per ``<dt>`` element found, passing the tag,
     its comment text and ``ui.user``.  ``ui.cursor`` is advanced by 9500 and
     ``ui`` is put asynchronously before the read starts.

     NOTE(review): the window read is 500 bytes longer than the cursor step,
     so a later read from the stored cursor overlaps this one -- presumably
     so entries cut at a window boundary are not lost; confirm with caller.
     """
     reader = blobstore.BlobReader(ui.delicious)
     reader.seek(ui.cursor)

     # The asynchronous put is started before the blob is read and is only
     # waited on once the parse has finished.
     ui.cursor += 9500
     pending_put = ui.put_async()

     soup = BeautifulSoup(reader.read(10000))
     pending_put.get_result()

     for tag in soup.findAll('dt'):
         sibling = tag.nextSibling
         # A <dd> directly following the <dt> supplies the comment text;
         # otherwise the comment is empty.
         comment = sibling.text if sibling and sibling.name == 'dd' else ""
         deferred.defer(utils.delicious, tag, comment, ui.user,
                        _target="worker", _queue="admin")
Esempio n. 2
0
def write_chapter_html(num, chapter):
    """Download one chapter as HTML and write it out as a ``.tex`` file.

    The file is created in the current working directory and is named
    ``NNN-<title>.tex``, where ``NNN`` is ``num`` zero-padded to three digits
    and ``<title>`` is the lower-cased chapter title with every run of
    non-alphanumeric characters replaced by a single underscore.

    Args:
        num: Integer chapter number used as the file-name prefix.
        chapter: Mapping providing the chapter's ``"title"`` and ``"id"``.

    Returns:
        The name of the written file without its ``.tex`` extension.
    """
    from contextlib import closing

    from libs.bs4 import BeautifulSoup as BeautifulSoup
    from libs.bs4.builder import _html5lib as html5lib

    safe_title = re.sub("[^0-9a-zA-Z]+", "_", chapter["title"].lower())
    file_name = "{0:03d}-{1}.tex".format(num, safe_title)

    chapter_url = FIMF_CHAPTERDL_HTML.format(chapter["id"])

    request = urllib2.Request(chapter_url, headers={"User-Agent": USER_AGENT})
    # closing() guarantees the HTTP response is closed (and its connection
    # released) even when read() raises.
    with closing(urllib2.urlopen(request)) as response:
        chapter_html = response.read()

    # Use BeautifulSoup to parse it. html5lib is used because we want valid HTML
    bs = BeautifulSoup(chapter_html, ["html5lib"], html5lib.HTML5TreeBuilder())

    with codecs.open(file_name, "wb", encoding="utf-8") as f:
        f.write(u"\\chapter{{{0}}}\n\n".format(tex_escape(chapter["title"])))

        # Start at the first <p> and walk across its following siblings.
        current_tag = bs.find("p")

        # Compare against None rather than relying on truthiness, so that an
        # empty text node among the siblings cannot end the walk early.
        while current_tag is not None:
            write_tag(f, current_tag)
            current_tag = current_tag.next_sibling

    # Drop the trailing ".tex" extension.
    return file_name[:-4]