Ejemplo n.º 1
0
 def addUrl(self, url):
     """Queue ``url`` for a later visit unless it is already in ``self.visited``.

     A URL found in ``self.visited`` is ignored.  Any other URL is put on
     ``self.visit`` and the resulting queue size is reported through
     ``Log4Spider.infoLog``.
     """
     if url in self.visited:
         return
     self.visit.put(url)
     pending = self.visit.qsize()
     Log4Spider.infoLog(self, "add a url[[[", url, "]]]",
                        "current size:[[[", pending, "]]]")
 def fetch_one_url(url):
     """Run a ``Jd_Home_Spider`` over ``url`` and log every URL it collected.

     This is a generator: it yields the result of ``SpiderEnv(url).gen_env()``
     and expects the environment to be sent back in, then yields the result
     of the spider's ``work()``.  Presumably it is driven by a coroutine
     decorator outside this view -- confirm.
     """
     env = yield SpiderEnv(url).gen_env()
     spider = Jd_Home_Spider(env, app)
     yield spider.work()
     # Log each collected URL, then the total count.
     for link in spider.urlLists:
         Log4Spider.infoLog(link)
     Log4Spider.infoLog(len(spider.urlLists))
Ejemplo n.º 3
0
 def fetch_one_url(url):
     """Crawl ``url`` with a ``BaseSpider`` and log the URLs it gathered.

     Generator function: the first ``yield`` hands out the pending
     environment from ``SpiderEnv(url).gen_env()`` and receives the ready
     environment back; the second ``yield`` hands out ``work()``'s result.
     Presumably wrapped by a coroutine decorator elsewhere -- confirm.
     """
     pending_env = SpiderEnv(url).gen_env()
     env = yield pending_env
     crawler = BaseSpider(env, app)
     yield crawler.work()
     # One log line per gathered URL, followed by the number gathered.
     for found_url in crawler.urlLists:
         Log4Spider.infoLog(found_url)
     Log4Spider.infoLog(len(crawler.urlLists))
Ejemplo n.º 4
0
 def fetch_one_url(url):
     """Run a ``Jd_Item_Spider`` over ``url``, log its URLs, and count down.

     Generator function: yields the result of ``SpiderEnv(url).gen_env()``
     (the environment is sent back in) and then the result of the spider's
     ``work()``.  Presumably driven by a coroutine decorator outside this
     view -- confirm.

     When the fetch ends, the module-level counter ``num`` is decremented
     and ``event`` is set once it reaches zero.  The countdown sits in a
     ``finally`` block so a fetch that raises is still counted; otherwise
     ``num`` would never reach zero and ``event`` would never be set.  The
     exception itself still propagates to the caller.
     """
     global num
     try:
         env = yield SpiderEnv(url).gen_env()
         spider = Jd_Item_Spider(env, app)
         yield spider.work()
         # Log each collected URL, then the total count.
         for link in spider.urlLists:
             Log4Spider.infoLog(link)
         Log4Spider.infoLog(len(spider.urlLists))
     finally:
         # Always count this fetch as finished, whether it succeeded or not.
         num -= 1
         if num == 0:
             event.set()
Ejemplo n.º 5
0
 def fetch_one_url(url):
     """Crawl ``url`` with a ``Jd_Item_Spider``, log the result, count down.

     Generator function: the first ``yield`` hands out
     ``SpiderEnv(url).gen_env()`` and receives the environment back; the
     second hands out the spider's ``work()`` result.  Presumably wrapped by
     a coroutine decorator elsewhere -- confirm.

     The module-level counter ``num`` is decremented when the fetch ends,
     and ``event`` is set when the counter reaches zero.  This is done in
     ``finally`` so that a failing fetch still counts down; without it a
     single exception would leave ``num`` above zero and ``event`` unset.
     Exceptions are not swallowed -- they propagate after the countdown.
     """
     global num
     try:
         env = yield SpiderEnv(url).gen_env()
         crawler = Jd_Item_Spider(env, app)
         yield crawler.work()
         # One log line per gathered URL, followed by the number gathered.
         for found_url in crawler.urlLists:
             Log4Spider.infoLog(found_url)
         Log4Spider.infoLog(len(crawler.urlLists))
     finally:
         # Runs on success and on failure so the counter always reaches zero.
         num -= 1
         if num == 0:
             event.set()
Ejemplo n.º 6
0
 def main():
     """Run a ``UrlSeekSpider`` over two fixed seed URLs and log the results.

     Generator function: for each seed it yields ``SpiderEnv(seed).gen_env()``
     (receiving the environment back) and then the spider's ``work()``
     result.  Afterwards it logs every URL in the spider's ``urlLists`` and
     the number of URLs.  Presumably driven by a coroutine decorator outside
     this view -- confirm.
     """
     seeds = (
         "http://www.jianshu.com",
         "http://upload-images.jianshu.io/upload_images/1679702-7e810a34f3ef8d18.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300",
     )
     for seed in seeds:
         env = yield SpiderEnv(seed).gen_env()
         # The spider is built with ``None`` as its second argument.
         seeker = UrlSeekSpider(env, None)
         yield seeker.work()
         for link in seeker.urlLists:
             Log4Spider.infoLog(link)
         Log4Spider.infoLog(len(seeker.urlLists))
Ejemplo n.º 7
0
    def run(self):
        """Endlessly take URLs from ``self.queue`` and crawl them.

        Generator method.  For every URL taken from the queue it builds a
        ``SpiderEnv``.  If that fails, the error is logged and the URL is
        skipped.  Otherwise it calls ``self._find_url_handler`` and
        instantiates ``self.handler_class`` with ``self.application`` and
        ``self.handler_kwargs``.  It then runs the spider's ``work()`` and
        puts each URL from ``spider.urlLists`` back on the queue.

        NOTE(review): only environment creation is guarded.  An exception
        from the handler lookup, ``work()`` or ``queue.put`` leaves the loop.
        """
        while True:
            current_url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", current_url)
            try:
                env = yield SpiderEnv(current_url).gen_env()
            except Exception as exc:
                Log4Spider.errLog(self, "spider env failed url:", current_url,
                                  "exception:", exc)
            else:
                # Presumably sets handler_class / handler_kwargs -- confirm.
                self._find_url_handler(current_url)
                Log4Spider.infoLog(self, "url: ", current_url,
                                   " --- class: ", self.handler_class)
                spider = self.handler_class(env, self.application,
                                            **self.handler_kwargs)
                yield spider.work()
                for next_url in spider.urlLists:
                    Log4Spider.debugLog(self, "put url:", next_url)
                    yield self.queue.put(next_url)
Ejemplo n.º 8
0
    def run(self):
        """Worker loop: pull a URL from ``self.queue``, crawl it, queue its links.

        Generator method that never returns on its own.  Each iteration:

        1. takes a URL from ``self.queue``;
        2. builds its ``SpiderEnv``; on any ``Exception`` the failure is
           logged and the iteration is abandoned;
        3. calls ``self._find_url_handler(url)`` and builds a spider from
           ``self.handler_class``;
        4. runs ``spider.work()`` and puts every entry of
           ``spider.urlLists`` back on the queue.

        NOTE(review): steps 3 and 4 are not wrapped in ``try``, so an
        exception there ends the loop.
        """
        while True:
            target = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", target)

            try:
                env = yield SpiderEnv(target).gen_env()
            except Exception as err:
                Log4Spider.errLog(self, "spider env failed url:", target,
                                  "exception:", err)
                continue

            # Presumably selects handler_class / handler_kwargs -- confirm.
            self._find_url_handler(target)
            Log4Spider.infoLog(self, "url: ", target, " --- class: ",
                               self.handler_class)

            spider = self.handler_class(
                env, self.application, **self.handler_kwargs
            )
            yield spider.work()

            for discovered in spider.urlLists:
                Log4Spider.debugLog(self, "put url:", discovered)
                yield self.queue.put(discovered)
Ejemplo n.º 9
0
 def addUrl(self, url):
     """Put ``url`` on ``self.visit`` if it is not in ``self.visited``.

     URLs already present in ``self.visited`` are silently skipped.  For a
     new URL the queue size after the insert is passed to
     ``Log4Spider.infoLog`` together with the URL.
     """
     if url not in self.visited:
         self.visit.put(url)
         Log4Spider.infoLog(
             self,
             "add a url[[[", url, "]]]",
             "current size:[[[", self.visit.qsize(), "]]]",
         )
Ejemplo n.º 10
0
 def main():
     """Fetch ``base_url`` through ``HtmlFetch`` and log what comes back.

     Generator function: yields the result of ``fetch()`` and logs the value
     sent back in.  Presumably driven by a coroutine decorator outside this
     view -- confirm.
     """
     fetcher = HtmlFetch(base_url)
     page = yield fetcher.fetch()
     Log4Spider.infoLog(page)
Ejemplo n.º 11
0
 def main():
     """Log the outcome of an ``HtmlFetch`` request for ``base_url``.

     Generator function: the single ``yield`` hands out ``fetch()``'s result
     and receives the value to log.  Presumably wrapped by a coroutine
     decorator elsewhere -- confirm.
     """
     pending_fetch = HtmlFetch(base_url).fetch()
     fetched = yield pending_fetch
     Log4Spider.infoLog(fetched)