Example #1
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # Skip anything that is not a 2xx response (integer division keeps the
        # check correct on both Python 2 and Python 3).
        if response.status // 100 != 2:
            return

        site = get_url_site(response.url)

        # Delegate to a site-specific parser when one is registered.
        if site in self.parses:
            parser = self.parses[site]
            for item in parser.parse(response):
                yield item
            return

        # Otherwise extract every link, keep only http(s) URLs, and submit them.
        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()

            abs_url = urljoin_rfc(base_url, relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http", "https"]:
                continue
            site = get_url_site(abs_url)
            yield NimeiItem(url=abs_url, furl=response.url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
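
The helpers get_url_site and get_url_scheme are project-local and their bodies are not shown in these examples. A minimal sketch of how they might be implemented on top of the standard library's URL parser, assuming they simply split out the host and the scheme:

    # Hypothetical implementations of the URL helpers used above; the real
    # project versions are not included in these examples.
    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse       # Python 2

    def get_url_scheme(url):
        # e.g. "http" for "http://example.com/a"; used to skip non-http links.
        return urlparse(url).scheme.lower()

    def get_url_site(url):
        # e.g. "example.com" for "http://example.com/a"; used to group URLs by host.
        return urlparse(url).netloc.lower()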
Example #2
    def parse_all(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # Skip anything that is not a 2xx response.
        if response.status // 100 != 2:
            return
        base_url = get_base_url(response)
        base_site = get_url_site(base_url)

        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            # Ignore pseudo-links that cannot be fetched.
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)

            # Guess the content type from the extension of the path component.
            filename = abs_url.split("?")[0].split("/")[-1]
            if filename:
                ctype = filename.split(".")[-1].lower()
            else:
                ctype = None
            # Binary and media files are neither submitted nor crawled.
            if ctype in ["jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov", "png", "bmp",
                         "exe", "pps", "db", "txt", "pptx", "xls", "ppt", "xlsx"]:
                continue

            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

            # Only follow links that stay on the same site and look like HTML pages.
            site = get_url_site(abs_url)
            if site != base_site:
                continue
            if ctype in ["pdf", "doc", "docx", "rtf"]:
                continue
            yield scrapy.Request(url=abs_url, callback=self.parse_all)
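
urljoin_rfc (from scrapy.utils.url) was deprecated in later Scrapy releases in favor of the standard library's urljoin, which is a drop-in replacement for the join pattern used above. A small illustration with placeholder URLs:

    try:
        from urllib.parse import urljoin    # Python 3
    except ImportError:
        from urlparse import urljoin        # Python 2

    # Illustrative values only, not taken from the spider.
    base_url = "http://example.com/a/index.html"
    print(urljoin(base_url, "../b/page.html"))  # -> http://example.com/b/page.html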
Example #3
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # Re-queue anything that is not a 2xx response for another attempt.
        if response.status // 100 != 2:
            yield scrapy.Request(response.url)
            return
        # Only HTML responses carry links worth extracting.
        if response.__class__ != scrapy.http.HtmlResponse:
            return

        base_site = get_url_site(response.url)
        base_url = response.url
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if not self.is_valid_url(relative_url):
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http", "https"]:
                continue
            site = get_url_site(abs_url)

            # Every discovered URL is submitted upstream, but only on-site
            # links (or explicitly whitelisted sites) are crawled further.
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
            if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
                continue
            self.log("SendCrawl %s" % abs_url, level=scrapy.log.INFO)
            yield scrapy.Request(abs_url)
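
is_valid_url is another helper whose body is not shown here. A hypothetical version, mirroring the javascript:/mailto:/"#" filters spelled out inline in Example #2:

    def is_valid_url(self, url):
        # Hypothetical sketch: reject empty hrefs, bare fragments, and
        # non-navigational pseudo-links before the join/parse work.
        if not url or url == "#":
            return False
        lowered = url.strip().lower()
        return not lowered.startswith(("javascript:", "mailto:"))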
Example #4
    def parse_unit(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # Skip anything that is not a 2xx response.
        if response.status // 100 != 2:
            return
        base_url = get_base_url(response)

        # Follow only the unit links and hand each one to parse_cdmd.
        for href in response.xpath("//a[@class='zt_name']/@href").extract():
            if href == "#":
                continue
            abs_url = urljoin_rfc(base_url, href)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 4}, furl=response.url)
            yield scrapy.Request(url=abs_url, callback=self.parse_cdmd)
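
For context, a minimal sketch of how a callback such as parse_unit could be wired into a spider; the class name, seed URL, and the parse_cdmd stub are placeholders, since none of them appear in these examples:

    import scrapy

    class UnitSpider(scrapy.Spider):
        name = "unit_example"  # placeholder name

        # parse_unit from the example above would be defined on this class.

        def start_requests(self):
            # Illustrative seed URL; the original spider's start URLs are not shown.
            yield scrapy.Request("http://example.com/units", callback=self.parse_unit)

        def parse_cdmd(self, response):
            # Detail-page callback that parse_unit schedules; its body is not
            # included in these examples.
            pass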