def fetchForUrl(self, url):
    ret = False
    self.__feedingUrl = url
    try:
        Log.d("{Spider.fetchForUrl} requesting url(%s)", url)
        rs = requests.session()
        resp = rs.get(url, timeout=CONN_TIME_OUT, proxies=PROXY_CONFIG)
        if resp.status_code == 200:
            # decode the body with the encoding declared by the page itself
            resp.encoding = self.__getHtmlEncode(resp.text)
            self.feed(resp.text)
            ret = True
        else:
            Log.e("{Spider.fetchForUrl} address(%s) can't be reached!", url)
    except requests.exceptions.ConnectionError as err:
        Log.e("{Spider.fetchForUrl} connect to address(%s) failed, exception<%s>", url, str(err))
    except requests.exceptions.ReadTimeout:
        Log.e("{Spider.fetchForUrl} connect to address(%s) timed out", url)
    finally:
        self.__feedingUrl = None
    return ret
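# Spider.__getHtmlEncode is not shown in this listing. A minimal sketch of
# what it plausibly does -- scan the page for a meta charset declaration and
# fall back to UTF-8 -- purely for illustration; the standalone name and the
# regex are assumptions, not the project's actual implementation.
import re

def _get_html_encode_sketch(html):
    # matches <meta charset="gbk"> as well as
    # <meta http-equiv="Content-Type" content="text/html; charset=gbk">
    match = re.search(r'charset=["\']?([\w-]+)', html, re.IGNORECASE)
    return match.group(1) if match else 'utf-8'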
def procMain(pid, states):
    states[pid] = STATE_IDLE
    # init the blacklisted-site list
    bSite = BlackSiteList(BLACK_SITE_FILE)
    try:
        while True:
            states[pid] = STATE_CONNECTING
            images = fetchImages()
            if images:
                states[pid] = STATE_BUSY
                for img in images:
                    if len(img.save_path) > 255:
                        img.save_path = img.save_path[:255]
                    save_dir = getDir() + '/' + img.save_path.replace('/', '\\')
                    file_name = img.name if img.name != "" else None
                    Log.d("{procMain} downloading image (%s)", img.url)
                    if Downloader.download(img.url, save_dir, file_name):
                        img.request_state = REQUEST_STATE.SUCC
                        img.save_path = save_dir
                    else:
                        site = img.url.split('/')[2]
                        fail_count = bSite.getFaileSiteCount(site)
                        img.request_state = REQUEST_STATE.FAIL
                        # retry, allowing fewer attempts for sites that failed before
                        retry_times = max(MAX_REQUEST_RETRY_TIME - fail_count, 0)
                        while retry_times > 0:
                            Log.i("{procMain} retry download image(%s) times(%d)", img.url, retry_times)
                            retry_times -= 1
                            if Downloader.download(img.url, save_dir, file_name):
                                img.request_state = REQUEST_STATE.SUCC
                                img.save_path = save_dir
                                break
                        if img.request_state != REQUEST_STATE.SUCC:
                            bSite.addFaileSite(site)
                            bSite.save()
                updateImages(images)
            else:
                time.sleep(3)  # sleep for a while to wait for the database update
    except KeyboardInterrupt:
        Log.i("{procMain} downloader process exits on KeyboardInterrupt")
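# BlackSiteList is only referenced above. A minimal sketch, assuming it
# persists a host -> failure-count map as JSON; the class below (and its
# file format) is an illustration, not the project's actual implementation.
import json
import os

class BlackSiteListSketch(object):
    def __init__(self, path):
        self.path = path
        self.counts = {}
        if os.path.exists(path):
            with open(path) as f:
                self.counts = json.load(f)

    def getFaileSiteCount(self, site):
        return self.counts.get(site, 0)

    def addFaileSite(self, site):
        self.counts[site] = self.counts.get(site, 0) + 1

    def save(self):
        with open(self.path, 'w') as f:
            json.dump(self.counts, f)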
def worker(pid, states):
    from time import sleep
    try:
        states[pid] = STATE_IDLE
        sleep(3)
        states[pid] = STATE_BUSY
        sleep(30)
        states[pid] = STATE_TERMINATE
    except KeyboardInterrupt:
        Log.d("worker end!")
def uploadImages(images):
    Log.d("{uploadImages} uploading images(%d) ...", len(images))
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr)
        client.send(CONFIG.UPLOAD_IMAGE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{uploadImages} upload fetched images failed!")
        Log.d("{uploadImages} upload images done!")
        client.close()
    except EOFError:
        Log.e("{uploadImages} server has been closed")
def updateWebsiteStates(websites):
    Log.d("{updateWebsiteStates} updating websites ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr)
        if websites:  # only talk to the server when there is something to send
            client.send(CONFIG.UPDATE_WESITE_STATE)
            client.send(websites)
            if client.recv() == CONFIG.ACTION_FAILED:
                Log.e("{updateWebsiteStates} tell server to update website state failed!")
        Log.d("{updateWebsiteStates} updating websites done!")
        client.close()
    except EOFError:
        Log.e("{updateWebsiteStates} server has been closed")
def updateImages(images):
    Log.d("{updateImages} updating images ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr, means='Socket')
        client.send(CONFIG.UPDATE_IMAGE_STATE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{updateImages} tell server to update images state failed!")
        client.close()
    except EOFError:
        Log.e("{updateImages} server has been closed")
    Log.d("{updateImages} update images done")
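# uploadImages, updateWebsiteStates, and updateImages all rely on the same
# get_client(addr) channel: send/recv/close plus EOFError when the peer goes
# away, which matches multiprocessing.connection. A minimal sketch of
# get_client under that assumption (the `means` handling is made up; this
# sketch always opens a socket connection):
from multiprocessing.connection import Client

def _get_client_sketch(addr, means='Socket'):
    # a (host, port) tuple makes Client pick the socket transport
    return Client(addr)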
def procMain(pid, states):
    states[pid] = STATE_IDLE
    try:
        while True:
            states[pid] = STATE_CONNECTING
            Log.d("{procMain} fetching unvisited websites ...")
            websites = fetchWebsite()
            Log.d("{procMain} fetched websites(%d)", len(websites))
            if websites:
                states[pid] = STATE_BUSY
                wbs = set()
                images = set()
                for web in websites:
                    spider = Spider()
                    if spider.fetchForUrl(web.url):
                        web.request_state = REQUEST_STATE.SUCC
                        for url in spider.hrefs:
                            wbs.add(DBWebsite(url=url, from_url=web.id, priority=calcPriority(url, web.url)))
                        for img in spider.imgs:
                            images.add(DBImage(url=img, from_website=web.id, save_path=spider.title))
                        web.title = spider.title
                    else:
                        web.request_state = REQUEST_STATE.FAIL
                        retry_times = MAX_REQUEST_RETRY_TIME
                        while retry_times > 0:
                            Log.i("{procMain} retry fetch url(%s) id(%d) times(%d)", web.url, web.id, retry_times)
                            retry_times -= 1
                            if spider.fetchForUrl(web.url):
                                web.request_state = REQUEST_STATE.SUCC
                                for url in spider.hrefs:
                                    wbs.add(DBWebsite(url=url, from_url=web.id, priority=calcPriority(url, web.url)))
                                for img in spider.imgs:
                                    images.add(DBImage(url=img, from_website=web.id, save_path=spider.title))
                                web.title = spider.title
                                break
                        if web.request_state != REQUEST_STATE.SUCC:
                            Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)
                updateWebsiteStates(websites)
                uploadWesites(wbs)
                uploadImages(images)
            else:
                sleep(3)  # sleep for a while to wait for the database update
    except KeyboardInterrupt:
        Log.i("{procMain} spider process exits on KeyboardInterrupt")
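# calcPriority is defined elsewhere in the repo. A minimal sketch under one
# plausible assumption -- links staying on the current host get a smaller
# (better) priority than links that leave it; the policy and name are
# illustrative only:
def _calc_priority_sketch(url, from_url):
    # reuse the repo's own host-extraction idiom: scheme://host/...
    host = url.split('/')[2] if url.count('/') >= 2 else ''
    from_host = from_url.split('/')[2] if from_url.count('/') >= 2 else ''
    return 0 if host == from_host else 1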
def run(self):
    while not self.isTerminate:
        try:
            # block for a new task, giving up after the idle timeout
            task = self.__threadpool.tasks.get(block=True, timeout=Worker.THREAD_IDEL_TIME)
        except Exception:
            # no task arrived within the idle window: let this worker retire
            break
        task.value = task.callback(*task.param)
        task.done = True
        if self.__threadpool.returns is not None:
            self.__threadpool.returns.put(task)
    self.isTerminate = True
    self.__thread = None
    self.__threadpool.removeThread(self)
    Log.d("{Worker.run} work thread terminated!")
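# run() assumes task objects exposing callback, param, value, and done. A
# minimal sketch of such a task and of queueing one (the class name is made
# up; the project's actual task type is not shown in this listing):
import queue

class TaskSketch(object):
    def __init__(self, callback, param=()):
        self.callback = callback   # callable executed by the worker
        self.param = param         # positional arguments for the callable
        self.value = None          # filled with the callable's return value
        self.done = False          # flipped once the worker has finished

# usage: tasks = queue.Queue(); tasks.put(TaskSketch(pow, (2, 10)))
# -> task.value == 1024 once a worker has processed it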
def handle_starttag(self, tag, attrs):
    if tag == 'img':
        attrs = dict(attrs)
        if 'src' in attrs:
            url = self.__adjustUrl(attrs['src'])
            if self.__isValidateImage(url):
                Log.d("image: %s", url)
                self.imgs.add(url)
    if tag == 'input':
        attrs = dict(attrs)
        if attrs.get('type') == 'image' and 'src' in attrs:
            url = self.__adjustUrl(attrs['src'])
            if self.__isValidateImage(url):
                Log.d("image: %s", url)
                self.imgs.add(url)
    if tag == 'a':
        attrs = dict(attrs)
        if 'href' in attrs:
            url = self.__adjustUrl(attrs['href'])
            if self.__isValidateUrl(url):
                Log.d("link: %s", url)
                self.hrefs.add(url)
    if self.title is None:
        # the title lives inside <div class="t t2"> ... <h4>title</h4>
        if not self.findTitle:
            if tag == 'div':
                attrs = dict(attrs)
                if attrs.get('class') == 't t2':
                    self.findTitle = True
        elif tag == 'h4':
            self.canGetTitle = True
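# __adjustUrl resolves raw attribute values against the page currently being
# parsed (self.__feedingUrl in fetchForUrl). A sketch of that behavior,
# assuming standard relative-URL resolution; the standalone name is made up:
try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

def _adjust_url_sketch(base_url, raw):
    # 'img/a.png' -> 'http://host/path/img/a.png'; '//cdn/x' -> 'http://cdn/x'
    return urljoin(base_url, raw.strip())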
def removeThread(self, thread):
    with self.lockObj:
        self.threads.remove(thread)
    Log.d("{Worker.removeThread} pool thread count(%d)", len(self.threads))
def addThread(self, thread):
    with self.lockObj:
        self.threads.append(thread)
    Log.d("{Worker.addThread} pool thread count(%d)", len(self.threads))
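# run(), addThread(), and removeThread() together assume a pool object with
# task/result queues, a thread list, and a lock. A minimal skeleton under
# those assumptions (field names are taken from the code above; everything
# else is illustrative):
import threading
import queue

class ThreadPoolSketch(object):
    def __init__(self, with_returns=True):
        self.tasks = queue.Queue()    # pending tasks pulled by Worker.run
        self.returns = queue.Queue() if with_returns else None  # finished tasks
        self.threads = []             # live workers, guarded by lockObj
        self.lockObj = threading.Lock()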
def handle_data(self, data):
    if self.title is None and self.findTitle and self.canGetTitle:
        self.title = data
        Log.d("title: %s", self.title)
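# Together the two handlers form a small state machine: <div class="t t2">
# arms findTitle, a following <h4> arms canGetTitle, and the next text node
# becomes the title. A usage sketch against markup of that shape (the HTML
# snippet is invented, and Spider is assumed to initialize its flags in
# __init__):
def _title_parse_sketch():
    spider = Spider()
    spider.feed('<div class="t t2"><h4>Example title</h4></div>')
    return spider.title  # 'Example title'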
if __name__ == '__main__':
    Log.setup('spider')
    Log.d('setting up spider......')
    # procMain(1, {})
    num = 1
    if len(os.sys.argv) > 1:
        num = int(os.sys.argv[1])
    if num < 1:
        num = 1
    pm = ProcessManager(procMain, maxWorker=num)
    pm.run()
if __name__ == '__main__':
    Log.setup("download")
    Log.d('setting up downloader......')
    # procMain(1, {})
    num = 1
    if len(os.sys.argv) > 1:
        num = int(os.sys.argv[1])
    if num < 1:
        num = 1
    pm = ProcessManager(procMain, maxWorker=num)
    pm.run()
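# ProcessManager is imported from elsewhere in the repo. A minimal sketch of
# the contract both entry points rely on -- spawn maxWorker processes, each
# running target(pid, states) over a shared state dict -- with all internals
# assumed:
from multiprocessing import Manager, Process

class ProcessManagerSketch(object):
    def __init__(self, target, maxWorker=1):
        self.target = target
        self.maxWorker = maxWorker

    def run(self):
        states = Manager().dict()
        procs = [Process(target=self.target, args=(pid, states))
                 for pid in range(self.maxWorker)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()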