def addUrl(self, url):
    with self._lock:
        if url not in self.visited:
            self.count += 1
            self.visit.append(url)
            Log4Spider.debugLog(self, "add a url[[[", url, "]]]", "current size:[[[", self.count, "]]]")
def addUrl(self, url):
    if url not in self.visited:
        self.visit.put(url)
        Log4Spider.infoLog(self, "add a url[[[", url, "]]]", "current size:[[[", self.visit.qsize(), "]]]")
def add_handler(self, url_handler):
    """Appends the given url_handler to our handler list."""
    print(url_handler)
    url_pattern = url_handler[0]
    if not url_pattern.endswith("$"):
        url_pattern += "$"
    handlers = []
    # The wildcard pattern .*$ must keep the lowest priority, so new handlers
    # are inserted in front of it; the empty handlers list is a placeholder
    # that is filled in below.
    if self.handlers and self.handlers[-1][0].pattern == '.*$':
        self.handlers.insert(-1, (re.compile(url_pattern), handlers))
    else:
        self.handlers.append((re.compile(url_pattern), handlers))
    spec = url_handler
    if isinstance(spec, (tuple, list)):
        # the url_handler should be initialised with some args
        assert len(spec) in (2, 3, 4)
        spec = URLSpec(*spec)
    handlers.append(spec)
    if spec.name:
        if spec.name in self.named_handlers:
            Log4Spider.warnLog(
                "Multiple handlers named %s; replacing previous value", spec.name)
        self.named_handlers[spec.name] = spec
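The lookup counterpart of add_handler is not shown in these snippets; below is a minimal sketch (an assumption, not the project's actual code) of how _find_url_handler, which run() calls further down, could walk self.handlers and expose handler_class and handler_kwargs. The URLSpec attribute names are inferred from how run() uses them.

def _find_url_handler(self, url):
    # Sketch only: scan the (pattern, handlers) pairs built by add_handler and
    # take the first URLSpec whose pattern matches the url.
    for pattern, handlers in self.handlers:
        if pattern.match(url) and handlers:
            spec = handlers[0]
            self.handler_class = spec.handler_class   # assumed attribute names
            self.handler_kwargs = spec.kwargs
            return
    # nothing matched; leave harmless defaults
    self.handler_class = None
    self.handler_kwargs = {}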
def fetch_one_url(url):
    env_obj = SpiderEnv(url)
    env = yield env_obj.gen_env()
    urlSeek = Jd_Home_Spider(env, app)
    yield urlSeek.work()
    for url in urlSeek.urlLists:
        Log4Spider.infoLog(url)
    Log4Spider.infoLog(len(urlSeek.urlLists))
def work(self):
    exec = self.app.executor
    try:
        driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
        # run the blocking page load on the executor, off the IOLoop thread
        yield exec.submit(driver.get, self.env['url'])
        yield self.scrapy(driver)
    except Exception as e:
        Log4Spider.errLog(self, "webdriver.PhantomJS failed: ", e)
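work() submits the blocking driver.get call to self.app.executor and yields the resulting future; a minimal, self-contained sketch of that pattern (the module-level executor here is an assumption about what the application object provides):

from concurrent.futures import ThreadPoolExecutor
import time

from tornado import gen, ioloop

executor = ThreadPoolExecutor(max_workers=2)   # stand-in for self.app.executor

@gen.coroutine
def blocking_off_loop():
    # a concurrent.futures.Future can be yielded directly inside a Tornado coroutine
    yield executor.submit(time.sleep, 0.1)

ioloop.IOLoop.current().run_sync(blocking_off_loop)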
def fetch_one_url(url):
    env_obj = SpiderEnv(url)
    env = yield env_obj.gen_env()
    urlSeek = BaseSpider(env, app)
    yield urlSeek.work()
    for url in urlSeek.urlLists:
        Log4Spider.infoLog(url)
    Log4Spider.infoLog(len(urlSeek.urlLists))
def fetch_one_url(url):
    env_obj = SpiderEnv(url)
    env = yield env_obj.gen_env()
    urlSeek = Jd_Item_Spider(env, app)
    yield urlSeek.work()
    for url in urlSeek.urlLists:
        Log4Spider.infoLog(url)
    Log4Spider.infoLog(len(urlSeek.urlLists))
    global num
    num -= 1
    if num == 0:
        event.set()
def main():
    for url in [
            "http://www.jianshu.com",
            "http://upload-images.jianshu.io/upload_images/1679702-7e810a34f3ef8d18.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300"
    ]:
        env_obj = SpiderEnv(url)
        env = yield env_obj.gen_env()
        urlSeek = UrlSeekSpider(env, None)
        yield urlSeek.work()
        for url in urlSeek.urlLists:
            Log4Spider.infoLog(url)
        Log4Spider.infoLog(len(urlSeek.urlLists))
def scrapy(self, driver):
    bs = BeautifulSoup(driver.page_source)
    item = bs.find("div", id="product-intro").find("div", id="itemInfo")
    name = item.find("div", id="name").find("h1").text
    price = item.find("div", id="summary-price").find("strong", class_="p-price").text
    #print(price)
    price = re_price.search(price).group()
    discount = item.find("div", id="summary-price").find("span", class_="p-discount").text
    #print(discount)
    discount = re_price.search(discount).group()
    #print(name, price, discount)
    Log4Spider.dataLog("insert a shop", name, price, discount)
    self.db.shops.insert({"name": name, "price": price, "discount": discount})
    super().scrapy(driver)
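re_price is used above but not defined in this snippet; a plausible definition (an assumption) that pulls the numeric part out of a price string such as "¥5999.00" would be:

import re

# assumed pattern: digits with an optional decimal part
re_price = re.compile(r"\d+(?:\.\d+)?")

print(re_price.search("¥5999.00").group())  # 5999.00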
def getUrlBySoup(self, soup):
    a_tags = soup.find_all()
    for a_tag in a_tags:
        attrs = a_tag.attrs
        for attr in attrs:
            if attr in ('href', 'src', '#src', '#src2'):
                # found a url; urls like javascript:void(null) are filtered out below
                url = url_path = a_tag[attr]
                if url_path.startswith("//"):
                    url_path = "http:" + url_path
                if url_path.startswith("http:"):
                    self._url_lists.append(url_path)
                else:
                    Log4Spider.errLog("Find an unknown url:[[[", url, "]]]")
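For reference, the same attribute scan re-expressed as a standalone, runnable sketch (names here are illustrative, not part of the project):

from bs4 import BeautifulSoup

def extract_urls(html):
    """Collect href/src-style attributes, normalising protocol-relative // links."""
    urls = []
    for tag in BeautifulSoup(html, "html.parser").find_all():
        for attr in ('href', 'src', '#src', '#src2'):
            if attr in tag.attrs:
                url = tag[attr]
                if url.startswith("//"):
                    url = "http:" + url
                if url.startswith("http:"):
                    urls.append(url)
    return urls

print(extract_urls('<img src="//img.example.com/a.png">'))
# ['http://img.example.com/a.png']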
def scrapy(self, driver):
    exec = self.app.executor
    #css_div = 'div[class="%s"] h3' % ('item-inner')
    #div_book = yield exec.submit(driver.find_elements_by_css_selector, css_div)
    #for h in div_book:
    #    ActionChains(driver).move_to_element(h).perform()
    pagesource = driver.page_source
    bs = BeautifulSoup(pagesource)
    eles = bs.findAll("div", {"class": "p-name"})
    prices = bs.findAll("div", {"class": "p-price"})
    for price in prices:
        i = price.find("i")
        if i:
            print(i.text)
    for ele, price in zip(eles, prices):
        Log4Spider.dataLog(ele.text, price.text)
    super().scrapy(driver)
def _urlWork(self):
    AsyncHTTPClient.configure(CurlAsyncHTTPClient)
    httpCli = AsyncHTTPClient()
    try:
        response = yield httpCli.fetch(self.env['url'])
    except Exception as e:
        Log4Spider.errLog("urlWork fetch url: ", self.env['url'], "error exception: ", e)
        return
    soup = BeautifulSoup(response.body)
    a_tags = soup.find_all()
    for a_tag in a_tags:
        attrs = a_tag.attrs
        for attr in attrs:
            Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
            if attr in ('href', 'src', '#src', '#src2'):
                # found a url; urls like javascript:void(null) are filtered out below
                url = url_path = a_tag[attr]
                url_path = url_path.replace("//", "/")
                url_parse = self.env['urlparse']
                if url_path.startswith("/"):
                    # relative path: rebuild an absolute url from the page's scheme and host
                    url = urlunparse([url_parse.scheme, url_parse.netloc, url_path, "", "", ""])
                if url.startswith("http"):
                    if not self.parse_url_own or url_parse.netloc in url:
                        self._url_lists.append(url)
                else:
                    Log4Spider.errLog("Find an unknown url:[[[", url, "]]]")
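For comparison only (this is not what _urlWork does), the standard library's urljoin resolves relative and protocol-relative links against the page URL:

from urllib.parse import urljoin

base = "http://www.jd.com/index.html"
print(urljoin(base, "/list/1.html"))        # http://www.jd.com/list/1.html
print(urljoin(base, "//img.jd.com/a.png"))  # http://img.jd.com/a.png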
def realWork(self):
    if self.env['mine'][1] in ('jpg', 'jpeg', 'png', 'gif'):
        AsyncHTTPClient.configure(CurlAsyncHTTPClient)

        def prepare_cul_opts(obj):
            parse = urlparse(self.env['url'])
            path = parse.path
            pic_name = parse.netloc + path.replace("/", "-")
            static_path = self.app.settings["static_path"]
            if not os.path.exists(static_path):
                os.mkdir(static_path)
            pic_path = "%s/%s" % (static_path, pic_name)
            Log4Spider.warnLog("PicDown path: ", pic_path)
            # stream the picture straight to disk via pycurl's write callback
            obj.setopt(pycurl.WRITEFUNCTION, open(pic_path, "wb").write)

        httpCli = AsyncHTTPClient()
        try:
            response = yield httpCli.fetch(
                self.env['url'], prepare_curl_callback=prepare_cul_opts)
        except Exception as e:
            Log4Spider.errLog("PicDown failed url: ", self.env['url'], "error exception: ", e)
            return
def run(self):
    while True:
        url = yield self.queue.get()
        Log4Spider.debugLog(self, "get url:", url)
        try:
            env = yield SpiderEnv(url).gen_env()
        except Exception as e:
            Log4Spider.errLog(self, "spider env failed url:", url, "exception:", e)
            continue
        self._find_url_handler(url)
        Log4Spider.infoLog(self, "url: ", url, " --- class: ", self.handler_class)
        spider = self.handler_class(env, self.application, **self.handler_kwargs)
        yield spider.work()
        for url in spider.urlLists:
            Log4Spider.debugLog(self, "put url:", url)
            yield self.queue.put(url)
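run() only needs coroutine-style get and put from its queue; a minimal stand-in using Tornado's own queue (the project's RedisQueue is assumed to expose the same interface):

from tornado import gen, ioloop
from tornado.queues import Queue

@gen.coroutine
def demo():
    queue = Queue()
    yield queue.put("http://www.jd.com")
    url = yield queue.get()
    print(url)

ioloop.IOLoop.current().run_sync(demo)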
def main():
    settings = {"host": "localhost", "port": 6379, "db": 0}
    try:
        client = motor.MotorClient("mongodb://localhost:27017/")
        database = client['jd_db']
    except:
        print("mongodb init failed")
        sys.exit(0)
    app_settings = {"static_path": "static", "database": database}
    app = Application([
        (r"^http://www\.jd\.com", "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"),
        (r"^http://list\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://channel\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://search\.jd\.com/search?.*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://item\.jd\.com.*", "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"),
    ], **app_settings)
    concurrency = 20
    from anger6Spider.spiderQueue.redisqueue import RedisQueue
    queue = RedisQueue(**settings)
    queue._create_redis_cli()
    #yield queue.put("http://www.jianshu.com")
    #yield queue.put("http://www.jd.com")
    #yield queue.put("http://www.ivsky.com")
    #yield queue.put("http://www.jd.com")
    workers = []
    for _ in range(concurrency):
        workers.append(Worker(app, queue))
    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()
    Log4Spider.debugLog("waiting for spiderQueue empty:")
    yield queue.join(None)
    Log4Spider.debugLog("main done!")
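main() is written in coroutine style (it yields futures); a hedged launch sketch, assuming the gen.coroutine decorator is applied to it in the original module:

from tornado import ioloop

if __name__ == "__main__":
    # drive the coroutine to completion and exit when it finishes
    ioloop.IOLoop.current().run_sync(main)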
def main():
    parse = HtmlFetch(base_url)
    result = yield parse.fetch()
    Log4Spider.infoLog(result)