예제 #1
0
 def addUrl(self, url):
     """Record *url* as pending work if it has not been seen before.

     The membership check and the bookkeeping run under ``self._lock``;
     the debug log line is deliberately emitted outside the critical
     section.  NOTE(review): membership is tested against ``visited`` but
     new urls are appended to ``visit`` — presumably reconciled elsewhere;
     confirm against the rest of the class.
     """
     with self._lock:
         if url not in self.visited:
             self.visit.append(url)
             self.count += 1
     Log4Spider.debugLog(self, "add a url[[[", url, "]]]",
                         "current size:[[[", self.count, "]]]")
예제 #2
0
 def _urlWork(self):
     """Fetch ``self.env['url']`` and harvest candidate links from the page.

     Tornado coroutine (note the ``yield`` on the fetch).  Absolute
     http(s) links — optionally restricted to the start page's netloc when
     ``self.parse_url_own`` is set — are appended to ``self._url_lists``;
     anything else is logged as unknown.  Fetch failures are logged and
     abort the scan.
     """
     AsyncHTTPClient.configure(CurlAsyncHTTPClient)
     http_client = AsyncHTTPClient()
     try:
         response = yield http_client.fetch(self.env['url'])
     except Exception as e:
         Log4Spider.errLog("urlWork fetch url: ", self.env['url'],
                           "error exception: ", e)
         return
     soup = BeautifulSoup(response.body)
     # NOTE(review): find_all() with no argument matches EVERY tag, not
     # just <a>; the variable name suggests find_all('a') was intended —
     # confirm before narrowing.
     a_tags = soup.find_all()
     # Bug fix: previously ``url_parse`` was assigned only inside the
     # startswith("/") branch, so an absolute URL encountered first with
     # ``parse_url_own`` set raised UnboundLocalError at the netloc check.
     # Hoist the (loop-invariant) lookup above the loop instead.
     url_parse = self.env['urlparse']
     for a_tag in a_tags:
         for attr in a_tag.attrs:
             Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
             if attr in (
                     'href', 'src', '#src', '#src2'
             ):  #find a url,some url likes javascript:void(null) are not filter
                 url = url_path = a_tag[attr]
                 # Collapse doubled slashes so protocol-relative links
                 # ("//host/path") fall into the root-relative branch.
                 url_path = url_path.replace("//", "/")
                 if url_path.startswith("/"):
                     # Rebuild a site-relative path into an absolute url
                     # using the start page's scheme and host.
                     url = urlunparse([
                         url_parse.scheme, url_parse.netloc, url_path, "",
                         "", ""
                     ])
                 if url.startswith("http"):
                     if not self.parse_url_own or url_parse.netloc in url:
                         self._url_lists.append(url)
                 else:
                     Log4Spider.errLog("Find a unknown url:[[[", url, "]]]")
예제 #3
0
    def run(self):
        """Worker loop: pull urls off the shared queue and spider them.

        Tornado coroutine; loops forever.  Urls whose environment cannot
        be built are logged and skipped.  Every url the spider discovers
        is fed back into the queue.
        """
        while True:
            url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", url)
            try:
                env = yield SpiderEnv(url).gen_env()
            except Exception as e:
                Log4Spider.errLog(self, "spider env failed url:", url,
                                  "exception:", e)
                continue
            self._find_url_handler(url)
            Log4Spider.infoLog(self, "url: ", url, " --- class: ",
                               self.handler_class)
            spider = self.handler_class(env, self.application,
                                        **self.handler_kwargs)
            yield spider.work()
            for found in spider.urlLists:
                Log4Spider.debugLog(self, "put url:", found)
                yield self.queue.put(found)
예제 #4
0
    def run(self):
        """Consume urls from the queue forever (Tornado coroutine).

        For each url: build its SpiderEnv, resolve the handler class,
        run the spider, then enqueue every url the spider found.
        Environment failures are logged and the url is skipped.
        """
        while True:
            url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", url)
            try:
                env = yield SpiderEnv(url).gen_env()
            except Exception as e:
                Log4Spider.errLog(self, "spider env failed url:", url,
                                  "exception:", e)
                continue

            self._find_url_handler(url)
            Log4Spider.infoLog(self, "url: ", url, " --- class: ",
                               self.handler_class)
            spider = self.handler_class(env, self.application,
                                        **self.handler_kwargs)
            yield spider.work()
            for discovered in spider.urlLists:
                Log4Spider.debugLog(self, "put url:", discovered)
                yield self.queue.put(discovered)
예제 #5
0
def main():
    """Entry point: wire up the redis queue, mongo sink, routes and workers.

    Tornado-style generator (the trailing ``yield queue.join``) — expected
    to be driven by an IOLoop (e.g. ``IOLoop.run_sync(main)``).
    """
    # Redis connection settings for the shared url queue.
    settings = {
        "host": "localhost",
        "port": 6379,
        "db": 0,
    }

    try:
        client = motor.MotorClient("mongodb://localhost:27017/")
        database = client['jd_db']
    except Exception:
        # Narrowed from a bare ``except:`` (which also swallowed
        # SystemExit/KeyboardInterrupt), and exit non-zero so callers
        # can see the failure.
        print("mongodb init failed")
        sys.exit(1)

    app_settings = {
        "static_path": "static",
        "database": database,
    }

    # Route url regexes to spider handler classes (dotted import paths).
    # NOTE(review): in the search pattern, the unescaped ``?`` makes the
    # preceding ``h`` optional instead of matching a literal '?';
    # ``search\?`` may have been intended — confirm before changing.
    app = Application([
        (r"^http://www\.jd\.com", "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"),
        (r"^http://list\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://channel\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://search\.jd\.com/search?.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://item\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"),
    ], **app_settings)

    cocurrency = 20  # number of concurrent Worker coroutines

    from anger6Spider.spiderQueue.redisqueue import RedisQueue
    queue = RedisQueue(**settings)
    queue._create_redis_cli()
    # Seed urls (enable as needed):
    #yield queue.put("http://www.jianshu.com")
    #yield queue.put("http://www.jd.com")
    #yield queue.put("http://www.ivsky.com")
    #yield queue.put("http://www.jd.com")

    workers = [Worker(app, queue) for _ in range(cocurrency)]
    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()

    Log4Spider.debugLog("waiting for spiderQueue empty:")
    yield queue.join(None)
    Log4Spider.debugLog("main done!")
예제 #6
0
def main():
    """Entry point: wire up the redis queue, mongo sink, routes and workers.

    Tornado-style generator (the trailing ``yield queue.join``) — expected
    to be driven by an IOLoop (e.g. ``IOLoop.run_sync(main)``).
    """
    # Redis connection settings for the shared url queue.
    settings = {"host": "localhost", "port": 6379, "db": 0}

    try:
        client = motor.MotorClient("mongodb://localhost:27017/")
        database = client['jd_db']
    except Exception:
        # Narrowed from a bare ``except:`` (which also swallowed
        # SystemExit/KeyboardInterrupt), and exit non-zero so callers
        # can see the failure.
        print("mongodb init failed")
        sys.exit(1)

    app_settings = {"static_path": "static", "database": database}

    # Route url regexes to spider handler classes (dotted import paths).
    # NOTE(review): in the search pattern, the unescaped ``?`` makes the
    # preceding ``h`` optional instead of matching a literal '?';
    # ``search\?`` may have been intended — confirm before changing.
    app = Application([
        (r"^http://www\.jd\.com",
         "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"),
        (r"^http://list\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://channel\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://search\.jd\.com/search?.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://item\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"),
    ], **app_settings)

    cocurrency = 20  # number of concurrent Worker coroutines

    from anger6Spider.spiderQueue.redisqueue import RedisQueue
    queue = RedisQueue(**settings)
    queue._create_redis_cli()
    # Seed urls (enable as needed):
    #yield queue.put("http://www.jianshu.com")
    #yield queue.put("http://www.jd.com")
    #yield queue.put("http://www.ivsky.com")
    #yield queue.put("http://www.jd.com")

    workers = [Worker(app, queue) for _ in range(cocurrency)]
    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()

    Log4Spider.debugLog("waiting for spiderQueue empty:")
    yield queue.join(None)
    Log4Spider.debugLog("main done!")
예제 #7
0
 def addUrl(self, url):
     """Queue *url* for visiting unless it was already seen (lock-guarded).

     NOTE(review): checks ``visited`` but appends to ``visit`` —
     presumably reconciled by the caller; confirm.
     """
     with self._lock:
         if url not in self.visited:
             self.visit.append(url)
             self.count += 1
     Log4Spider.debugLog(self, "add a url[[[", url, "]]]",
                         "current size:[[[", self.count, "]]]")