# Tornado coroutine: fetch the page, then scan every tag for link-bearing attributes.
@gen.coroutine
def _urlWork(self):
    AsyncHTTPClient.configure(CurlAsyncHTTPClient)
    httpCli = AsyncHTTPClient()
    try:
        response = yield httpCli.fetch(self.env['url'])
    except Exception as e:
        Log4Spider.errLog("urlWork fetch url: ", self.env['url'], "error exception: ", e)
        return
    soup = BeautifulSoup(response.body)
    a_tags = soup.find_all()  # no arguments: matches every tag, not only <a>
    for a_tag in a_tags:
        attrs = a_tag.attrs
        for attr in attrs:
            Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
            if attr in ('href', 'src', '#src', '#src2'):
                # found a URL; non-http values such as javascript:void(null)
                # fall through to the error branch below
                url_parse = self.env['urlparse']  # parsed page URL; also needed for the same-site check
                url = url_path = a_tag[attr]
                url_path = url_path.replace("//", "/")
                if url_path.startswith("/"):
                    # relative path: rebuild an absolute URL from the page's scheme and host
                    url = urlunparse([url_parse.scheme, url_parse.netloc, url_path, "", "", ""])
                if url.startswith("http"):
                    if not self.parse_url_own or url_parse.netloc in url:
                        self._url_lists.append(url)
                else:
                    Log4Spider.errLog("Find an unknown url:[[[", url, "]]]")
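# The replace("//", "/") above also mangles absolute URLs ("http://host/x" becomes
# "http:/host/x"). A minimal sketch of a safer alternative using urljoin, assuming
# Python 3's urllib.parse (on Python 2 the same names live in the urlparse module);
# resolve_url is a hypothetical helper, not part of the spider:

from urllib.parse import urljoin, urlparse

def resolve_url(base_url, raw):
    """Return an absolute http(s) URL, or None for schemes like javascript:."""
    absolute = urljoin(base_url, raw)  # handles relative, protocol-relative and absolute forms
    if urlparse(absolute).scheme in ("http", "https"):
        return absolute
    return None

# resolve_url("http://example.com/a/", "../img.png") -> "http://example.com/img.png"
# resolve_url("http://example.com/", "javascript:void(null)") -> None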
# Tornado coroutine: render the page with PhantomJS, offloading the blocking
# Selenium call to the app's executor so the IOLoop stays responsive.
@gen.coroutine
def work(self):
    executor = self.app.executor  # renamed from 'exec', which shadows the builtin
    try:
        driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
        yield executor.submit(driver.get, self.env['url'])
        yield self.scrapy(driver)
    except Exception as e:
        Log4Spider.errLog(self, "webdriver.PhantomJS failed: ", e)
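# A self-contained sketch of the executor pattern used above, with slow_fetch as a
# hypothetical stand-in for the blocking driver.get call:

from concurrent.futures import ThreadPoolExecutor
from tornado import gen, ioloop
import time

executor = ThreadPoolExecutor(max_workers=4)

def slow_fetch(url):
    time.sleep(1)  # blocking work, e.g. Selenium driving PhantomJS
    return "rendered " + url

@gen.coroutine
def render(url):
    # submit() returns a Future; yielding it frees the IOLoop while the thread runs
    html = yield executor.submit(slow_fetch, url)
    raise gen.Return(html)

# ioloop.IOLoop.current().run_sync(lambda: render("http://example.com"))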
def getUrlBySoup(self, soup):
    a_tags = soup.find_all()  # no arguments: matches every tag, not only <a>
    for a_tag in a_tags:
        attrs = a_tag.attrs
        for attr in attrs:
            if attr in ('href', 'src', '#src', '#src2'):
                # found a URL; non-http values such as javascript:void(null)
                # fall through to the error branch below
                url = url_path = a_tag[attr]
                if url_path.startswith("//"):
                    url_path = "http:" + url_path  # protocol-relative URL
                if url_path.startswith("http"):  # accepts both http: and https:
                    self._url_lists.append(url_path)
                else:
                    Log4Spider.errLog("Find an unknown url:[[[", url, "]]]")
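# A quick usage sketch of the attribute scan above; the HTML snippet is made up,
# and bs4 must be installed (pip install beautifulsoup4):

from bs4 import BeautifulSoup

html = '''<a href="http://example.com/page">link</a>
<img src="//cdn.example.com/pic.png">
<a href="javascript:void(null)">noop</a>'''

soup = BeautifulSoup(html, "html.parser")
for tag in soup.find_all():
    for attr in tag.attrs:
        if attr in ('href', 'src'):
            print(attr, "->", tag[attr])
# href -> http://example.com/page
# src -> //cdn.example.com/pic.png
# href -> javascript:void(null)   (rejected by the startswith("http") check)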
# Worker loop: pull a URL off the queue, build its environment, dispatch it to the
# matching handler class, then enqueue every URL the handler discovered.
@gen.coroutine
def run(self):
    while True:
        url = yield self.queue.get()
        Log4Spider.debugLog(self, "get url:", url)
        try:
            env = yield SpiderEnv(url).gen_env()
        except Exception as e:
            Log4Spider.errLog(self, "spider env failed url:", url, "exception:", e)
            continue
        self._find_url_handler(url)
        Log4Spider.infoLog(self, "url: ", url, " --- class: ", self.handler_class)
        spider = self.handler_class(env, self.application, **self.handler_kwargs)
        yield spider.work()
        for url in spider.urlLists:
            Log4Spider.debugLog(self, "put url:", url)
            yield self.queue.put(url)
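# A minimal, runnable sketch of this producer/consumer loop with tornado.queues;
# handle() is a hypothetical stand-in for the per-URL spider. Note the task_done()
# call: the loop above never calls it, so a queue.join() there would never resolve.

from tornado import gen, ioloop, queues

queue = queues.Queue()

@gen.coroutine
def handle(url):
    if url.count("/") < 4:             # pretend each page links to one deeper page
        yield queue.put(url + "/next")

@gen.coroutine
def worker():
    while True:
        url = yield queue.get()
        try:
            yield handle(url)
        finally:
            queue.task_done()          # lets queue.join() detect completion

@gen.coroutine
def main():
    yield queue.put("http://example.com")
    for _ in range(2):                 # small worker pool, like run() above
        worker()
    yield queue.join()                 # resolves once every queued URL is handled

# ioloop.IOLoop.current().run_sync(main)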
# Tornado coroutine: if the resource looks like an image, stream it straight to the
# static directory via libcurl's write callback instead of buffering it in memory.
@gen.coroutine
def realWork(self):
    if self.env['mine'][1] in ('jpg', 'jpeg', 'png', 'gif'):  # 'mine' holds the parsed MIME type pair
        AsyncHTTPClient.configure(CurlAsyncHTTPClient)

        def prepare_curl_opts(obj):
            parse = urlparse(self.env['url'])
            pic_name = parse.netloc + parse.path.replace("/", "-")
            static_path = self.app.settings["static_path"]
            if not os.path.exists(static_path):
                os.mkdir(static_path)
            pic_path = "%s/%s" % (static_path, pic_name)
            Log4Spider.warnLog("PicDown path: ", pic_path)
            # stream the body to disk; note the file object is never closed,
            # so the final buffered chunk may be lost (see the sketch below)
            obj.setopt(pycurl.WRITEFUNCTION, open(pic_path, "wb").write)

        httpCli = AsyncHTTPClient()
        try:
            response = yield httpCli.fetch(self.env['url'],
                                           prepare_curl_callback=prepare_curl_opts)
        except Exception as e:
            Log4Spider.errLog("PicDown failed url: ", self.env['url'], "error exception: ", e)
            return
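# A minimal sketch of the same streaming download with the file handle fixed,
# using synchronous pycurl for brevity; download() is a hypothetical helper:

import pycurl

def download(url, dest_path):
    fp = open(dest_path, "wb")
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, fp.write)
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    try:
        curl.perform()
    finally:
        curl.close()
        fp.close()  # closing flushes the last buffered chunk to disk

# download("http://example.com/pic.jpg", "/tmp/pic.jpg")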