def _attempt_download(img, save_dir, file_name):
    """Try to download *img* once.

    On success, mark the image record SUCC and store the final save
    directory on it. Returns True on success, False on failure.
    """
    if Downloader.download(img.url, save_dir, file_name):
        img.request_state = REQUEST_STATE.SUCC
        img.save_path = save_dir
        return True
    return False


def procMain(pid, states):
    """Downloader worker loop.

    Repeatedly fetches pending image records, downloads each one (with
    retries scaled down by the site's historical failure count kept in the
    black-site list), writes the per-image result state back via
    updateImages(), and sleeps when there is no work. Runs until
    interrupted with Ctrl-C.

    Args:
        pid:    worker id, used as a key into the shared *states* map.
        states: shared dict mapping worker id -> STATE_* constant, used to
                report this worker's current activity.
    """
    states[pid] = STATE_IDLE
    # init black site list (persistent per-site failure counters)
    bSite = BlackSiteList(BLACK_SITE_FILE)
    try:
        while True:
            states[pid] = STATE_CONNECTING
            images = fetchImages()
            if images:
                states[pid] = STATE_BUSY
                for img in images:
                    # keep save_path within a 255-character limit
                    # (presumably a filesystem/DB column limit — TODO confirm)
                    if len(img.save_path) > 255:
                        img.save_path = img.save_path[:255]
                    save_dir = getDir() + '/' + img.save_path.replace('/', '\\')
                    file_name = img.name if img.name != "" else None
                    Log.d("{procMain} downloading image (%s) ", img.url)
                    if not _attempt_download(img, save_dir, file_name):
                        site = img.url.split('/')[2]
                        fail_count = bSite.getFaileSiteCount(site)
                        img.request_state = REQUEST_STATE.FAIL
                        # retry download: sites that failed before get
                        # proportionally fewer retries, down to zero
                        retry_times = max(0, MAX_REQUEST_RETRY_TIME - fail_count)
                        while retry_times > 0:
                            Log.i(
                                "{procMain} retry download image(%s) times(%d)",
                                img.url, retry_times)
                            retry_times -= 1
                            if _attempt_download(img, save_dir, file_name):
                                break
                        # still failed after all retries: record the site in
                        # the black list and persist it
                        if img.request_state != REQUEST_STATE.SUCC:
                            bSite.addFaileSite(site)
                            bSite.save()
                updateImages(images)
            else:
                time.sleep(3)  # sleep for a while to wait for the database update
    except KeyboardInterrupt:
        Log.i("{procMain} downloader process exit for a KeyboardInterrupt")
def _crawl_once(spider, web, wbs, images):
    """Fetch web.url with *spider* and, on success, harvest its results.

    On success: marks the website record SUCC, adds every discovered href
    to *wbs* as a DBWebsite (priority from calcPriority), every discovered
    image URL to *images* as a DBImage (save_path = page title), and stores
    the page title on the record. Returns True on success, False otherwise.
    """
    if not spider.fetchForUrl(web.url):
        return False
    web.request_state = REQUEST_STATE.SUCC
    for url in spider.hrefs:
        wbs.add(DBWebsite(url = url, from_url = web.id, priority = calcPriority(url, web.url)))
    for img in spider.imgs:
        images.add(DBImage(url = img, from_website = web.id, save_path = spider.title))
    web.title = spider.title
    return True


def procMain(pid, states):
    """Spider worker loop.

    Repeatedly fetches unvisited websites, crawls each one (retrying up to
    MAX_REQUEST_RETRY_TIME times on failure), then uploads the newly
    discovered websites and images and writes the per-site result state
    back. Sleeps when there is no work. Runs until interrupted with Ctrl-C.

    Args:
        pid:    worker id, used as a key into the shared *states* map.
        states: shared dict mapping worker id -> STATE_* constant, used to
                report this worker's current activity.
    """
    states[pid] = STATE_IDLE
    try:
        while True:
            states[pid] = STATE_CONNECTING
            Log.d("{procMain} fetching unvisited websites ...")
            websites = fetchWebsite()
            Log.d("{procMain} fetched websites(%d)", len(websites))
            if websites:
                states[pid] = STATE_BUSY
                wbs = set()     # newly discovered websites to upload
                images = set()  # newly discovered images to upload
                for web in websites:
                    spider = Spider()
                    if not _crawl_once(spider, web, wbs, images):
                        web.request_state = REQUEST_STATE.FAIL
                        retry_times = MAX_REQUEST_RETRY_TIME
                        while retry_times > 0:
                            Log.i(
                                "{procMain} retry fetch url(%s) id(%d) times(%d)",
                                web.url, web.id, retry_times)
                            retry_times -= 1
                            if _crawl_once(spider, web, wbs, images):
                                break
                        if web.request_state != REQUEST_STATE.SUCC:
                            Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)
                updateWebsiteStates(websites)
                # NOTE(review): external API name has a typo ("Wesites");
                # kept as-is to match its definition elsewhere
                uploadWesites(wbs)
                uploadImages(images)
            else:
                sleep(3)  # sleep for a while to wait for the database update
    except KeyboardInterrupt:
        Log.i("{procMain} spider process exit for a KeyboardInterrupt")