Beispiel #1
0
    def queue_transform(self, job, page, *args, **kwds):
        """Pick the next queue for ``page`` and store it in ``page.out_queue``.

        ``page.out_queue`` is first reset to ``None`` and then set as follows:

        * image page whose URL is not in the module-level
          ``download_records``: ``"download"``, followed by
          ``db.save("img_download", page.url)``;
        * image page whose URL is in ``download_records``: ``"store"``;
        * non-empty page without a ``parsed`` attribute: ``"parse"``;
        * non-empty, non-image page that has a ``parsed`` attribute:
          ``"download"``; then ``db_save("parse", page.url)`` is called,
          ``page.find_all_child()`` is invoked and the URL of every entry
          in ``page.childs`` is passed to ``db_save("childs", ...)``.

        Any other page keeps ``out_queue`` set to ``None``.

        ``job``, ``*args`` and ``**kwds`` are accepted but not used here.

        Returns:
            The same ``page`` object. If anything in the routing step
            raises, the exception is logged, the call sleeps for 10 seconds
            and ``page`` is still returned.
        """
        page.out_queue = None

        try:
            # Each predicate is evaluated once; assumes is_img()/is_empty()
            # are side-effect free -- TODO confirm.
            if page.is_img():
                if page.url in download_records:
                    page.out_queue = "store"
                else:
                    page.out_queue = "download"
                    # NOTE(review): this is the only write that goes through
                    # `db.save`; every other write below uses `db_save`.
                    # Confirm both helpers exist and that this is intended.
                    db.save("img_download", page.url)
            elif not page.is_empty():
                if hasattr(page, "parsed"):
                    logger.info("add page child into download_queue")
                    page.out_queue = "download"
                    db_save("parse", page.url)
                    page.find_all_child()
                    for sub_page in page.childs:
                        db_save("childs", sub_page.url)
                else:
                    page.out_queue = "parse"
        except Exception as err:
            logger.exception(err)
            # Pause before handing the page back after a failure.
            time.sleep(10)

        return page
Beispiel #2
0
 def parse(*args, **kwds):
     """Mark the page given as keyword argument ``page`` as parsed and fill it.

     The page's ``parsed`` attribute is set to ``True`` before anything
     else happens. Positional arguments are ignored. If no ``page`` keyword
     is supplied, ``kwds.get`` yields ``None`` and setting ``parsed`` on it
     raises ``AttributeError``.

     Returns:
         The page when ``page.fill_html()`` is truthy (after
         ``page.find_all_child()`` has been called) or when an exception
         was raised and logged; ``None`` when ``page.fill_html()`` is falsy.
     """
     target = kwds.get("page")
     target.parsed = True
     try:
         message = "at parsing %r url deepth:%r" % (target.url, target.deepth)
         logger.info(message)
         if not target.fill_html():
             # Nothing was filled in: the caller gets None, not the page.
             return None
         target.find_all_child()
     except Exception as err:
         logger.exception(err)
     return target