def queue_transform(self, job, page, *args, **kwds):
    """Choose the next queue for *page* and record it on ``page.out_queue``.

    ``page.out_queue`` is reset to ``None`` and then set according to the
    first rule that matches:

    * image whose URL is not in ``download_records`` -> ``"download"``
    * image whose URL is in ``download_records``     -> ``"store"``
    * non-empty page without a ``parsed`` attribute  -> ``"parse"``
    * non-empty, non-image page with ``parsed``      -> ``"download"``;
      the page URL and the URL of every entry in ``page.childs`` are
      also persisted.

    If no rule matches (for example an empty, non-image page),
    ``page.out_queue`` stays ``None``.

    ``self``, ``job``, ``*args`` and ``**kwds`` are accepted but not used
    by this function.

    Any exception raised while routing is logged, followed by a
    10-second sleep; the page is returned in every case.
    """
    global download_records

    to_download, to_parse, to_store = "download", "parse", "store"
    page.out_queue = None

    try:
        # Image that has not been recorded as downloaded yet.
        if page.is_img() and page.url not in download_records:
            page.out_queue = to_download
            # NOTE(review): this branch calls `db.save` while every other
            # branch calls `db_save` -- confirm both names exist and that
            # the difference is intentional.
            db.save("img_download", page.url)
            return page

        # Image already present in the download records.
        if page.is_img() and page.url in download_records:
            page.out_queue = to_store
            return page

        # Page with content that has not been marked as parsed.
        if not page.is_empty() and not hasattr(page, 'parsed'):
            page.out_queue = to_parse
            return page

        # Parsed page: images are excluded explicitly here (an earlier
        # variant of this rule did not check `is_img()`).
        if not page.is_empty() and not page.is_img() and hasattr(page, "parsed"):
            logger.info("add page child into download_queue")
            page.out_queue = to_download
            db_save("parse", page.url)
            page.find_all_child()
            for child in page.childs:
                db_save("childs", child.url)
    except Exception as exc:
        logger.exception(exc)
        # Pause before handing the page back after a failure.
        time.sleep(10)

    return page
def parse(*args, **kwds):
    """Mark the page passed as ``page=`` as parsed and collect its children.

    Positional arguments are ignored; the page is read from the ``page``
    keyword argument. If that keyword is missing, ``kwds.get`` yields
    ``None`` and setting ``parsed`` on it raises ``AttributeError``
    before the ``try`` block is entered.

    ``page.find_all_child()`` is called only when ``page.fill_html()``
    returns a truthy value. Exceptions raised while logging, filling or
    collecting children are logged and swallowed.

    Returns the same page object in every case.
    """
    page = kwds.get("page")

    # Set before any work is attempted, so the attribute exists even when
    # the steps below fail; queue_transform routes on its presence.
    page.parsed = True

    try:
        banner = "at parsing %r url deepth:%r" % (page.url, page.deepth)
        logger.info(banner)
        filled = page.fill_html()
        if filled:
            page.find_all_child()
    except Exception as e:
        logger.exception(e)

    return page