def __adjustUrl(self, src_url):
    """Resolve src_url against the page currently being parsed."""
    if not self.__feedingUrl:
        Log.e("{Spider.__adjustUrl} self.__feedingUrl is None")
        return src_url
    if src_url == '#':
        return self.__feedingUrl
    if src_url.startswith('//'):
        # Protocol-relative URL: inherit a scheme.
        return "http:" + src_url
    if src_url.startswith('http://') or src_url.startswith('https://'):
        return src_url
    if src_url.startswith('javascript:'):
        return src_url
    feeding = self.__feedingUrl.split('/')
    if src_url.startswith('/'):
        # Root-relative path: keep only scheme and host from the feeding URL.
        return '/'.join(feeding[:3]) + src_url
    # Relative path: resolve '.' and '..' segments against the feeding URL.
    elems = [e for e in src_url.split('/') if e != '.']
    while True:
        feeding.pop()  # drop the file name, then one parent per '..'
        if not elems or elems[0] != '..':
            break
        elems.pop(0)
    return '/'.join(feeding + elems)
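For reference, the standard library's urljoin covers the same cases handled above (protocol-relative URLs, root-relative paths, '.' and '..' segments); a minimal sketch with illustrative URLs, not part of the original Spider class:

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

base = "http://example.com/a/b/page.html"  # hypothetical feeding URL
assert urljoin(base, "../img/x.png") == "http://example.com/a/img/x.png"
assert urljoin(base, "//cdn.example.com/y.png") == "http://cdn.example.com/y.png"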
def uploadImages(images):
    Log.d("{uploadImages} uploading images(%d) ...", len(images))
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr)
        client.send(CONFIG.UPLOAD_IMAGE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{uploadImages} failed to upload fetched images!")
        else:
            Log.d("{uploadImages} upload images done!")
        client.close()
    except EOFError:
        Log.e("{uploadImages} server has been closed")
def updateWebsiteStates(websites):
    Log.d("{updateWebsiteStates} updating websites ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr)
        if websites:
            client.send(CONFIG.UPDATE_WESITE_STATE)
            client.send(websites)
            if client.recv() == CONFIG.ACTION_FAILED:
                Log.e("{updateWebsiteStates} failed to tell server to update website states!")
            else:
                Log.d("{updateWebsiteStates} updating websites done!")
        client.close()
    except EOFError:
        Log.e("{updateWebsiteStates} server has been closed")
def download(src, save_dir=DIR_PATH, file_name=None):
    ext = ""
    reg = IMGREG.search(src)
    if reg:
        ext = '.' + reg.group(1)
    if file_name is None:
        # Name the file after the SHA-1 of its source URL.
        sha = hashlib.sha1()
        sha.update(src.encode('utf-8'))
        file_name = sha.hexdigest()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    path = os.path.join(save_dir, file_name + ext)
    ret = False
    try:
        response = requests.get(src, stream=True, verify=False,
                                timeout=CONNECT_TIME_OUT, proxies=PROXY_CONFIG)
        try:
            with open(path, 'wb') as fp:
                if response.status_code == 200:
                    shutil.copyfileobj(response.raw, fp)
                    ret = True
                else:
                    Log.e("{Downloader.download} HTTP response state(%d) while downloading (%s)",
                          response.status_code, src)
        except requests.packages.urllib3.exceptions.ReadTimeoutError as e:
            Log.e("{Downloader.download} download file (%s) failed! Exception(%s)", src, str(e))
        except IOError:
            Log.e("{Downloader.download} can't write file to path (%s)", path)
    except requests.exceptions.ConnectionError as err:
        Log.e("{Downloader.download} connect error: Exception<%s>", str(err))
    except Exception as ex:
        Log.e("{Downloader.download} download (%s): raised exception<%s>", src, str(ex))
    if not ret and os.path.exists(path):
        os.remove(path)  # delete the partially downloaded image
    return ret
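A usage sketch for download(); the URL is illustrative, and IMGREG is assumed to capture the bare extension in group(1), e.g. re.compile(r'\.(jpe?g|png|gif)', re.I):

# Hypothetical call: fetches the image and stores it as
# <DIR_PATH>/<sha1-of-url>.jpg, returning True on success.
if download("http://example.com/pics/cat.jpg"):
    Log.d("{demo} image saved")
else:
    Log.e("{demo} download failed")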
def updateImages(images):
    Log.d("{updateImages} updating images ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr, means='Socket')
        client.send(CONFIG.UPDATE_IMAGE_STATE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{updateImages} failed to tell server to update image states!")
        client.close()
    except EOFError:
        Log.e("{updateImages} server has been closed")
    Log.d("{updateImages} update images done")
def procMain(pid, states):
    states[pid] = STATE_IDLE

    def recordPage(web, spider, wbs, images):
        # Record a successfully fetched page: queue its links and images.
        web.request_state = REQUEST_STATE.SUCC
        for url in spider.hrefs:
            wbs.add(DBWebsite(url=url, from_url=web.id,
                              priority=calcPriority(url, web.url)))
        for img in spider.imgs:
            images.add(DBImage(url=img, from_website=web.id,
                               save_path=spider.title))
        web.title = spider.title

    try:
        while True:
            states[pid] = STATE_CONNECTING
            Log.d("{procMain} fetching unvisited websites ...")
            websites = fetchWebsite()
            Log.d("{procMain} fetched websites(%d)", len(websites))
            if websites:
                states[pid] = STATE_BUSY
                wbs = set()
                images = set()
                for web in websites:
                    spider = Spider()
                    if spider.fetchForUrl(web.url):
                        recordPage(web, spider, wbs, images)
                    else:
                        web.request_state = REQUEST_STATE.FAIL
                        retry_times = MAX_REQUEST_RETRY_TIME
                        while retry_times > 0:
                            Log.i("{procMain} retry fetch url(%s) id(%d) times(%d)",
                                  web.url, web.id, retry_times)
                            retry_times -= 1
                            if spider.fetchForUrl(web.url):
                                recordPage(web, spider, wbs, images)
                                break
                        if web.request_state != REQUEST_STATE.SUCC:
                            Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)
                updateWebsiteStates(websites)
                uploadWesites(wbs)
                uploadImages(images)
            else:
                sleep(3)  # wait a while for the database to be updated
    except KeyboardInterrupt:
        Log.i("{procMain} spider process exit for a KeyboardInterrupt")
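procMain is written to run as a worker process that reports its state through a shared mapping; a minimal launch sketch, assuming the STATE_* constants are importable (the function name and worker count are illustrative):

from multiprocessing import Manager, Process

def startSpiders(worker_count=4):
    # A managed dict lets the parent observe each worker's STATE_* value.
    states = Manager().dict()
    procs = [Process(target=procMain, args=(pid, states))
             for pid in range(worker_count)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()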
def fetchForUrl(self, url):
    ret = False
    self.__feedingUrl = url
    try:
        Log.d("{Spider.fetchForUrl} requesting url(%s)", url)
        rs = requests.Session()
        resp = rs.get(url, timeout=CONN_TIME_OUT, proxies=PROXY_CONFIG)
        if resp.status_code == 200:
            # Re-decode the body with the detected charset; Response.text is
            # recomputed from the raw bytes on each access.
            resp.encoding = self.__getHtmlEncode(resp.text)
            self.feed(resp.text)
            ret = True
        else:
            Log.e("{Spider.fetchForUrl} address(%s) can't be reached!", url)
    except requests.exceptions.ConnectionError as err:
        Log.e("{Spider.fetchForUrl} connect to address(%s) failed, exception<%s>", url, str(err))
    except requests.exceptions.ReadTimeout:
        Log.e("{Spider.fetchForUrl} connect to address(%s) timed out", url)
    finally:
        self.__feedingUrl = None
    return ret
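__getHtmlEncode is defined elsewhere in the class; a common implementation sniffs the <meta> charset declaration, along these lines (a sketch with a hypothetical name, not the original helper):

import re

META_CHARSET = re.compile(r'<meta[^>]+charset\s*=\s*["\']?([\w-]+)', re.IGNORECASE)

def guessHtmlEncode(html, default='utf-8'):
    # Matches <meta charset="..."> as well as
    # <meta http-equiv="Content-Type" content="...; charset=...">.
    m = META_CHARSET.search(html)
    return m.group(1) if m else default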
def fetchWebsite():
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    websites = []
    try:
        client = get_client(addr, means=CONFIG.COMMUNICATE_MEANS)
        client.send(CONFIG.FETCH_WEBSITE)
        result = client.recv()
        if result == CONFIG.ACTION_FAILED:
            Log.e("{Spider.fetchWebsite} failed to get unvisited websites")
        else:
            websites = client.recv()
        client.close()
    except EOFError:
        Log.e("{Spider.fetchWebsite} server has been closed")
    except Exception as e:
        Log.e("{Spider.fetchWebsite} raised exception<%s>", str(e))
    return websites
def fetchImages():
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    images = None
    try:
        client = get_client(addr, means=CONFIG.COMMUNICATE_MEANS)
        client.send(CONFIG.FETCH_IMAGE)
        result = client.recv()
        if result == CONFIG.ACTION_FAILED:
            Log.e("{fetchImages} failed to get images to download")
        else:
            images = client.recv()
        client.close()
    except EOFError:
        Log.e("{fetchImages} server side has been closed")
    except Exception as e:
        Log.e("{fetchImages} raised exception: %s", str(e))
    return images
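fetchImages, download, and updateImages compose into a downloader loop; a sketch assuming DBImage exposes the url and save_path attributes seen above (the downloaded field and loop name are assumptions):

def downloadLoop():
    # Pull pending images from the RPC server, download each, report back.
    images = fetchImages()
    while images:
        for img in images:
            # download() returns False on failure; record the result on the
            # record so the server can decide whether to retry.
            img.downloaded = download(img.url,
                                      save_dir=os.path.join(DIR_PATH, img.save_path))
        updateImages(images)
        images = fetchImages()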