def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # Skip anything that is not a 2xx response.
    if response.status / 100 != 2:
        return

    # Delegate to a site-specific parser when one is registered for this host.
    site = get_url_site(response.url)
    if site in self.parses:
        parser = self.parses[site]
        # self.log("Parser %s %s" % (response.url, parser.name), level=scrapy.log.INFO)
        for item in parser.parse(response):
            yield item
        return

    # Otherwise extract every outgoing link and submit it.
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
def parse_all(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # Skip anything that is not a 2xx response.
    if response.status / 100 != 2:
        return

    base_url = get_base_url(response)
    base_site = get_url_site(base_url)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        # Skip non-navigable links.
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)

        # Guess the content type from the file extension in the path.
        filename = abs_url.split("?")[0].split("/")[-1]
        if filename:
            ctype = filename.split(".")[-1].lower()
        else:
            ctype = None
        # Do not submit obvious binary/media files.
        if ctype in ["jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov", "png", "bmp",
                     "exe", "pps", "db", "txt", "pptx", "xls", "ppt", "xlsx"]:
            continue
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

        # Only keep crawling links that stay on the same site and look like HTML pages.
        site = get_url_site(abs_url)
        if site != base_site:
            continue
        if ctype in ["pdf", "doc", "docx", "rtf"]:
            continue
        yield scrapy.Request(url=abs_url, callback=self.parse_all)
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        # Non-2xx response: re-queue the URL and give up on this response.
        yield scrapy.Request(response.url)
        return
    # Only HTML responses are worth parsing for links.
    if response.__class__ != scrapy.http.HtmlResponse:
        return

    base_site = get_url_site(response.url)
    base_url = response.url
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if not self.is_valid_url(relative_url):
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        # yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        # Only follow links on the same site or on explicitly allowed sites.
        if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
            continue
        self.log("SendCrawl %s" % abs_url, level=scrapy.log.INFO)
        yield scrapy.Request(abs_url)
def parse_unit(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return

    site = get_url_site(response.url)
    base_url = get_base_url(response)
    for href in response.xpath("//a[@class='zt_name']/@href").extract():
        if href == "#":
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4}, furl=response.url)
        yield scrapy.Request(url=abs_url, callback=self.parse_cdmd)
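# The callbacks above assume module-level imports and project helpers roughly like the
# sketch below. This is an illustrative assumption, not code from this project:
# get_url_site, get_url_scheme, self.parses, self.is_valid_url and self.baidu_rpc_request
# are project-specific and are not defined in this section. Only the scrapy/w3lib imports
# are standard, and they target the old Scrapy 0.x API that still exposed scrapy.log
# and urljoin_rfc.
import scrapy
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc      # deprecated in later Scrapy releases
from w3lib.url import safe_url_string
from urlparse import urlparse                 # Python 2 stdlib

def get_url_site(url):
    # Hypothetical helper: scheme + host of a URL, used to group pages by site.
    parts = urlparse(url)
    return "%s://%s" % (parts.scheme, parts.netloc)

def get_url_scheme(url):
    # Hypothetical helper: just the URL scheme ("http", "https", ...).
    return urlparse(url).scheme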