def parse(self, ui):
    """Parse one window of the blob behind ``ui.delicious`` and queue its entries.

    Reads a window of the blob starting at ``ui.cursor``, advances the cursor
    and saves ``ui`` asynchronously, then defers one ``utils.delicious`` task
    for every ``<dt>`` tag found in that window.

    Args:
        ui: Object exposing ``delicious`` (handed to ``blobstore.BlobReader``),
            ``cursor`` (the seek position), ``user`` and ``put_async()``.
    """
    reader = blobstore.BlobReader(ui.delicious)
    reader.seek(ui.cursor)

    # The cursor advances by 9500 while 10000 is requested from the reader,
    # so consecutive windows overlap by 500.
    # NOTE(review): presumably the overlap keeps an entry that straddles a
    # window boundary intact in one of the two windows -- confirm. It also
    # looks like entries inside the overlap could be queued twice -- confirm
    # how utils.delicious deals with duplicates.
    ui.cursor = ui.cursor + 9500
    pending_put = ui.put_async()

    # Read and parse while the asynchronous put is in flight, then wait for it.
    chunk = reader.read(10000)
    soup = BeautifulSoup(chunk)
    pending_put.get_result()

    for entry in soup.findAll('dt'):
        # The comment is the text of a <dd> that directly follows the <dt>;
        # otherwise it stays empty.
        sibling = entry.nextSibling
        note = ""
        if sibling and sibling.name == 'dd':
            note = sibling.text
        deferred.defer(utils.delicious, entry, note, ui.user, _target="worker", _queue="admin")
def write_chapter_html(num, chapter):
    """Download one chapter as HTML and write it out as a LaTeX file.

    The chapter HTML is fetched from ``FIMF_CHAPTERDL_HTML`` formatted with
    ``chapter["id"]``, parsed with BeautifulSoup using the html5lib tree
    builder, and written to ``NNN-<safe_title>.tex`` where ``NNN`` is ``num``
    zero-padded to three digits and ``<safe_title>`` is the lower-cased title
    with every run of non-alphanumeric characters replaced by ``_``.

    Args:
        num: Integer chapter number used as the file name prefix.
        chapter: Mapping providing ``"title"`` (a string) and ``"id"``.

    Returns:
        The output file name without its ``.tex`` extension.

    Errors raised by ``urllib2.urlopen`` or by opening the output file are
    not caught here and propagate to the caller.
    """
    from libs.bs4 import BeautifulSoup
    from libs.bs4.builder import _html5lib as html5lib

    title = chapter["title"]
    safe_title = re.sub("[^0-9a-zA-Z]+", "_", title.lower())
    file_name = "{0:03d}-{1}.tex".format(num, safe_title)

    request = urllib2.Request(
        FIMF_CHAPTERDL_HTML.format(chapter["id"]),
        headers={"User-Agent": USER_AGENT},
    )
    response = urllib2.urlopen(request)
    try:
        chapter_html = response.read()
    finally:
        # Always release the HTTP connection, even if the read fails.
        response.close()

    # Use BeautifulSoup to parse it. html5lib is used because we want valid HTML
    bs = BeautifulSoup(chapter_html, ["html5lib"], html5lib.HTML5TreeBuilder())

    with codecs.open(file_name, "wb", encoding="utf-8") as f:
        f.write(u"\\chapter{{{0}}}\n\n".format(tex_escape(title)))

        # Start at the first <p> and walk every following sibling. Compare
        # against None explicitly: a falsy node (e.g. an empty string node)
        # must not end the walk before the last sibling is written.
        current_tag = bs.find("p")
        while current_tag is not None:
            write_tag(f, current_tag)
            current_tag = current_tag.next_sibling

    # Drop the trailing ".tex" (4 characters).
    return file_name[:-4]