def addUrl(self, url):
    with self._lock:
        if url not in self.visited:
            self.count += 1
            self.visit.append(url)
            Log4Spider.debugLog(self, "add a url[[[", url, "]]]", "current size:[[[", self.count, "]]]")
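# A minimal, stand-alone sketch of the locking idiom addUrl relies on: threading.Lock
# used as a context manager so the counter and the pending list are updated atomically
# when several threads add URLs. The UrlBook class below is a made-up illustration,
# not part of the spider.
import threading

class UrlBook:
    def __init__(self):
        self._lock = threading.Lock()
        self.visited = set()
        self.visit = []
        self.count = 0

    def add(self, url):
        with self._lock:  # only one thread at a time may touch the shared state
            if url not in self.visited:
                self.count += 1
                self.visit.append(url)

book = UrlBook()
book.add("http://www.jd.com")
print(book.count)  # -> 1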
def _urlWork(self):
    AsyncHTTPClient.configure(CurlAsyncHTTPClient)
    httpCli = AsyncHTTPClient()
    try:
        response = yield httpCli.fetch(self.env['url'])
    except Exception as e:
        Log4Spider.errLog("urlWork fetch url: ", self.env['url'], "error exception: ", e)
        return
    soup = BeautifulSoup(response.body, "html.parser")
    a_tags = soup.find_all()  # every tag on the page; each one is checked for link-like attributes
    url_parse = self.env['urlparse']  # parsed URL of the current page, used to absolutize relative paths
    for a_tag in a_tags:
        attrs = a_tag.attrs
        for attr in attrs:
            Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
            if attr in ('href', 'src', '#src', '#src2'):
                # Found a URL; note that links such as javascript:void(null) are not filtered out.
                url = url_path = a_tag[attr]
                url_path = url_path.replace("//", "/")
                if url_path.startswith("/"):
                    # Relative path: rebuild an absolute URL from the page's scheme and host.
                    url = urlunparse([url_parse.scheme, url_parse.netloc, url_path, "", "", ""])
                if url.startswith("http"):
                    # Optionally restrict the crawl to the current site.
                    if not self.parse_url_own or url_parse.netloc in url:
                        self._url_lists.append(url)
                else:
                    Log4Spider.errLog("Found an unknown url:[[[", url, "]]]")
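# A minimal sketch of how the urlunparse call above turns a relative path into an
# absolute URL using the parsed URL of the current page. The example page and path
# are made up for illustration.
from urllib.parse import urlparse, urlunparse

page = urlparse("http://item.jd.com/12345.html")
relative = "/list/67890.html"
print(urlunparse([page.scheme, page.netloc, relative, "", "", ""]))
# -> http://item.jd.com/list/67890.html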
def run(self):
    while True:
        url = yield self.queue.get()
        Log4Spider.debugLog(self, "get url:", url)
        try:
            env = yield SpiderEnv(url).gen_env()
        except Exception as e:
            Log4Spider.errLog(self, "spider env failed url:", url, "exception:", e)
            continue
        self._find_url_handler(url)
        Log4Spider.infoLog(self, "url: ", url, " --- class: ", self.handler_class)
        spider = self.handler_class(env, self.application, **self.handler_kwargs)
        yield spider.work()
        for url in spider.urlLists:
            Log4Spider.debugLog(self, "put url:", url)
            yield self.queue.put(url)
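# A minimal, self-contained sketch of the consumer loop run() implements above, built
# on tornado.queues.Queue instead of the project's RedisQueue (an assumed stand-in,
# just to show the yield get()/put()/join() flow).
from tornado import gen, ioloop, queues

@gen.coroutine
def consumer(queue):
    while True:
        item = yield queue.get()   # suspends until an item is available
        try:
            print("processing", item)
        finally:
            queue.task_done()      # lets queue.join() know this item is finished

@gen.coroutine
def demo():
    queue = queues.Queue()
    for n in range(3):
        yield queue.put(n)
    consumer(queue)                # start the consumer; its Future is not awaited, like worker.run() in main()
    yield queue.join()             # resolves once every queued item is marked done

ioloop.IOLoop.current().run_sync(demo)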
def main(): settings = { "host": "localhost", "port": 6379, "db": 0 } try: client = motor.MotorClient("mongodb://localhost:27017/") database = client['jd_db'] except: print("mongodb init failed") sys.exit(0) app_settings = { "static_path":"static", "database":database } app = Application([ (r"^http://www\.jd\.com", "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"), (r"^http://list\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://channel\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://search\.jd\.com/search?.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://item\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"), ],**app_settings) cocurrency = 20 from anger6Spider.spiderQueue.redisqueue import RedisQueue queue = RedisQueue(**settings) queue._create_redis_cli() #yield queue.put("http://www.jianshu.com") #yield queue.put("http://www.jd.com") #yield queue.put("http://www.ivsky.com") #yield queue.put("http://www.jd.com") workers = [] for _ in range(cocurrency): workers.append(Worker(app,queue)) for worker in workers: Log4Spider.debugLog("worker begin:",worker) worker.run() Log4Spider.debugLog("waitiing for spiderQueue empty:") yield queue.join(None) Log4Spider.debugLog("main done!")
def main(): settings = {"host": "localhost", "port": 6379, "db": 0} try: client = motor.MotorClient("mongodb://localhost:27017/") database = client['jd_db'] except: print("mongodb init failed") sys.exit(0) app_settings = {"static_path": "static", "database": database} app = Application([ (r"^http://www\.jd\.com", "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"), (r"^http://list\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://channel\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://search\.jd\.com/search?.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"), (r"^http://item\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"), ], **app_settings) cocurrency = 20 from anger6Spider.spiderQueue.redisqueue import RedisQueue queue = RedisQueue(**settings) queue._create_redis_cli() #yield queue.put("http://www.jianshu.com") #yield queue.put("http://www.jd.com") #yield queue.put("http://www.ivsky.com") #yield queue.put("http://www.jd.com") workers = [] for _ in range(cocurrency): workers.append(Worker(app, queue)) for worker in workers: Log4Spider.debugLog("worker begin:", worker) worker.run() Log4Spider.debugLog("waitiing for spiderQueue empty:") yield queue.join(None) Log4Spider.debugLog("main done!")