def addUrl(self, url):
    """Enqueue ``url`` on ``self.visit`` unless it is already in ``self.visited``.

    When the URL is enqueued, the URL and the queue's current ``qsize()`` are
    passed to ``Log4Spider.infoLog``. A URL that is already in
    ``self.visited`` is ignored silently.

    Args:
        url: The candidate URL to schedule.
    """
    if url in self.visited:
        # Already seen: nothing to enqueue and nothing to log.
        return
    self.visit.put(url)
    pending = self.visit.qsize()
    Log4Spider.infoLog(self, "add a url[[[", url, "]]]", "current size:[[[", pending, "]]]")
def fetch_one_url(url):
    """Crawl one URL with ``Jd_Home_Spider`` and log every URL it collected.

    Generator-style coroutine: it yields the result of ``gen_env()`` and then
    the result of ``work()``, and expects the driver to send the resolved
    environment back in.
    NOTE(review): presumably wrapped by a coroutine decorator that is not
    visible in this view -- confirm.

    Args:
        url: The address handed to ``SpiderEnv``.
    """
    environment = yield SpiderEnv(url).gen_env()
    spider = Jd_Home_Spider(environment, app)
    yield spider.work()

    # Log each collected link, then the total count.
    for link in spider.urlLists:
        Log4Spider.infoLog(link)
    Log4Spider.infoLog(len(spider.urlLists))
def fetch_one_url(url):
    """Run ``BaseSpider`` against a single URL and log what it found.

    Written as a generator: the value sent back after the first ``yield`` is
    used as the spider environment, and the second ``yield`` hands the result
    of ``work()`` to the driver.
    NOTE(review): the runner/decorator driving this generator is outside this
    view -- confirm.

    Args:
        url: The address used to build the ``SpiderEnv``.
    """
    spider_env = SpiderEnv(url)
    resolved_env = yield spider_env.gen_env()

    crawler = BaseSpider(resolved_env, app)
    yield crawler.work()

    for discovered in crawler.urlLists:
        Log4Spider.infoLog(discovered)
    # Final line of output is the number of collected URLs.
    Log4Spider.infoLog(len(crawler.urlLists))
def fetch_one_url(url):
    """Crawl one URL with ``Jd_Item_Spider``, log its URLs, then count down.

    Generator-style coroutine: yields the result of ``gen_env()`` (receiving
    the environment back from the driver) and then the result of ``work()``.
    After logging, it decrements the module-level counter ``num`` and calls
    ``event.set()`` when that counter reaches zero.
    NOTE(review): the decorator/runner that drives this generator is not
    visible here -- confirm.

    Args:
        url: The address handed to ``SpiderEnv``.
    """
    global num

    environment = yield SpiderEnv(url).gen_env()
    spider = Jd_Item_Spider(environment, app)
    yield spider.work()

    for link in spider.urlLists:
        Log4Spider.infoLog(link)
    Log4Spider.infoLog(len(spider.urlLists))

    # One fewer outstanding fetch; the last one to finish sets the event.
    num -= 1
    if num == 0:
        event.set()
def fetch_one_url(url):
    """Process a single URL through ``Jd_Item_Spider`` and update the countdown.

    This is a generator: the first ``yield`` passes out ``gen_env()``'s result
    and takes the environment back in; the second passes out ``work()``'s
    result. Each collected URL and then the URL count go to
    ``Log4Spider.infoLog``. Finally the global ``num`` is decremented, and
    ``event.set()`` is called once it hits zero.
    NOTE(review): presumably run under a coroutine decorator that is outside
    this view -- confirm.

    Args:
        url: The address used to build the ``SpiderEnv``.
    """
    global num

    spider_env = SpiderEnv(url)
    resolved_env = yield spider_env.gen_env()
    item_spider = Jd_Item_Spider(resolved_env, app)
    yield item_spider.work()

    for discovered in item_spider.urlLists:
        Log4Spider.infoLog(discovered)
    Log4Spider.infoLog(len(item_spider.urlLists))

    num -= 1
    if num == 0:
        # No fetches remain outstanding.
        event.set()
def main():
    """Run ``UrlSeekSpider`` over two hard-coded seed URLs and log the results.

    Generator-style coroutine: for each seed it yields the result of
    ``gen_env()`` (receiving the environment back) and the result of
    ``work()``, then logs every collected URL followed by the count.
    NOTE(review): the runner that drives this generator is not visible in this
    view -- confirm.
    """
    seed_urls = (
        "http://www.jianshu.com",
        "http://upload-images.jianshu.io/upload_images/1679702-7e810a34f3ef8d18.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300",
    )
    for seed in seed_urls:
        environment = yield SpiderEnv(seed).gen_env()
        seeker = UrlSeekSpider(environment, None)
        yield seeker.work()

        for link in seeker.urlLists:
            Log4Spider.infoLog(link)
        Log4Spider.infoLog(len(seeker.urlLists))
def run(self):
    """Worker loop: take URLs from ``self.queue``, crawl them, enqueue new ones.

    Generator-style coroutine that never returns on its own. Each pass:

    1. yields ``self.queue.get()`` and receives a URL back;
    2. builds the environment via ``SpiderEnv(url).gen_env()``; on any
       ``Exception`` the failure is logged and the URL is dropped;
    3. calls ``self._find_url_handler(url)`` and then instantiates
       ``self.handler_class`` with the environment, ``self.application`` and
       ``self.handler_kwargs``;
    4. yields ``spider.work()`` and puts every entry of ``spider.urlLists``
       back on the queue.

    NOTE(review): ``_find_url_handler`` presumably sets ``handler_class`` /
    ``handler_kwargs`` for the URL -- its body is not visible here; confirm.
    """
    while True:
        target = yield self.queue.get()
        Log4Spider.debugLog(self,"get url:",target)
        try:
            environment = yield SpiderEnv(target).gen_env()
        except Exception as exc:
            # Environment could not be built: log it and move to the next URL.
            Log4Spider.errLog(self,"spider env failed url:",target,"exception:",exc)
        else:
            self._find_url_handler(target)
            Log4Spider.infoLog(self,"url: ",target," --- class: ",self.handler_class)
            spider = self.handler_class(environment,self.application,**self.handler_kwargs)
            yield spider.work()
            for discovered in spider.urlLists:
                Log4Spider.debugLog(self,"put url:",discovered)
                yield self.queue.put(discovered)
def run(self):
    """Consume URLs from ``self.queue`` forever, crawling each one.

    Written as a generator: every queue operation, the environment setup and
    the spider's ``work()`` call are yielded to whatever drives this method.
    A URL whose environment setup raises ``Exception`` is logged via
    ``Log4Spider.errLog`` and skipped. URLs collected by the spider
    (``spider.urlLists``) are put back on the same queue.

    NOTE(review): the handler lookup in ``_find_url_handler`` is assumed to
    populate ``self.handler_class`` and ``self.handler_kwargs`` -- not visible
    in this view; confirm.
    """
    while True:
        current_url = yield self.queue.get()
        Log4Spider.debugLog(self, "get url:", current_url)

        try:
            env = yield SpiderEnv(current_url).gen_env()
        except Exception as err:
            Log4Spider.errLog(self, "spider env failed url:", current_url, "exception:", err)
            # Skip this URL; the worker keeps running.
            continue

        self._find_url_handler(current_url)
        handler_cls = self.handler_class
        Log4Spider.infoLog(self, "url: ", current_url, " --- class: ", handler_cls)

        spider = handler_cls(env, self.application, **self.handler_kwargs)
        yield spider.work()

        for next_url in spider.urlLists:
            Log4Spider.debugLog(self, "put url:", next_url)
            yield self.queue.put(next_url)
def addUrl(self,url):
    """Schedule ``url`` for a visit if it has not been visited yet.

    A URL missing from ``self.visited`` is put on ``self.visit``; the URL and
    the queue size reported by ``qsize()`` are then handed to
    ``Log4Spider.infoLog``. URLs already in ``self.visited`` are skipped
    without logging.

    Args:
        url: The URL to consider.
    """
    if url not in self.visited:
        self.visit.put(url)
        Log4Spider.infoLog(self,"add a url[[[",url,"]]]","current size:[[[",self.visit.qsize(),"]]]")
def main():
    """Fetch ``base_url`` through ``HtmlFetch`` and log the fetched result.

    Generator-style coroutine: it yields the result of ``fetch()`` and logs
    whatever value the driver sends back.
    NOTE(review): the runner that drives this generator is not visible in this
    view -- confirm.
    """
    fetched = yield HtmlFetch(base_url).fetch()
    Log4Spider.infoLog(fetched)