Esempio n. 1
0
    def process_listpage(self):
        """Schedule one follow-up request per product entry on a list page.

        For every ``<li>`` under ``div#proShow/ul`` the link title, the link
        target and the ``inforBg/p/img`` source are extracted.  A request for
        the image URL is built with the product data attached as keyword
        arguments and handed to ``self.crawl``.  Pagination is then delegated
        to ``self.next_page``.

        Returns the number of product entries handled.
        """
        selector = HtmlXPathSelector(self.response)
        handled = 0
        for entry in selector.select('//div[@id="proShow"]/ul/li'):
            title = extract_value(
                entry.select('div[@class="inforBg"]/span/a/@title'))
            link = extract_value(
                entry.select('div[@class="inforBg"]/span/a/@href'))
            image_src = extract_value(
                entry.select('div[@class="inforBg"]/p/img/@src'))

            # The request targets the image URL; the product details ride
            # along as extra keyword arguments.
            extras = dict(
                cur_idepth=self.basic_link_info.cur_idepth,
                prod_url=self.urljoin(link),
                prod_name=title,
                cat=self.response.meta['cat'],
            )
            self.crawl(self.make_request_from_response(image_src, **extras))
            handled += 1

        self.next_page(selector)
        return handled
Esempio n. 2
0
    def next_page(self, hxs):
        """Schedule a crawl of the page that follows the current list page.

        The displayed page number and the last page number are read from the
        pager markup (the ``a.current`` link inside ``div.snPages`` and
        ``a#pageLast``).  Both are converted with ``int``, so non-numeric
        pager text propagates the resulting conversion error.

        * On page 1 the response URL is the static category page, e.g.
          ``.../emall/pcd_10052_10051_-7_N_20089_20002_.html``.  Its
          underscore-separated fields are used to build the
          ``secondPointSearchNewCmd`` query URL with ``currentPage=1``.
        * On later pages the ``currentPage`` parameter of the response URL,
          which the code expects to be one less than the displayed page
          number, is bumped by one.

        Nothing is scheduled once the displayed page is the last one.  This
        also holds for page 1, so a single-page category does not queue a
        request for a second page that does not exist.

        :param hxs: selector over the HTML of the current response.
        """
        last_page = int(extract_value(
            hxs.select('//a[@id="pageLast"]/text()')))
        current_page = int(extract_value(hxs.select(
            '//div[@class="snPages"]/a[@class="current"]/text()')))

        # Already on the final page: there is nothing left to follow.
        if current_page >= last_page:
            return

        if current_page == 1:
            # http://www.suning.com/emall/pcd_10052_10051_-7_N_20089_20002_.html
            # splits into
            # ['pcd', '10052', '10051', '-7', 'N', '20089', '20002', '.html']
            fs = self.response.url.split('/')[-1].split('_')
            url = "http://www.suning.com/emall/secondPointSearchNewCmd?"\
                "storeId=%s&catalogId=%s&categoryId=%s&topBrandName="\
                "&top=N&top_category=%s&sortIndex=5&currentPage=1&isList=0"\
                % (fs[1], fs[2], fs[-2], fs[-3],)
        else:
            url = self.response.url.replace(
                'currentPage=%s' % (current_page - 1),
                'currentPage=%s' % current_page)

        request = self.make_request_from_response(url=url)
        self.crawl(request)
        self.log('next page:%s' % request.url)
Esempio n. 3
0
    def process_listpage(self):
        """Collect goods ids and names from a list page and crawl one URL.

        Each ``<li>`` under ``div.floorConn/div.goodsList/ul`` contributes the
        ``title`` of its ``p.productName`` and the ``id`` of its
        ``span[name=price]``.  One request is then built for the URL formed by
        appending the comma-joined ids to ``self.PRICE_REQUST_URL``; the
        id-to-name mapping is passed along as ``gname_dict``.  Pagination is
        delegated to ``self.next_page``.

        Returns the number of list entries seen.
        """
        selector = HtmlXPathSelector(self.response)
        rows = selector.select(
            '//div[@class="floorConn"]/div[@class="goodsList"]/ul/li')

        names_by_id = {}
        ids = []
        for row in rows:
            title = extract_value(
                row.select('.//p[@class="productName"]/@title'))
            goods_id = extract_value(
                row.select('.//span[@name="price"]/@id'))
            ids.append(goods_id)
            names_by_id[goods_id] = title

        target = "%s%s" % (self.PRICE_REQUST_URL, ",".join(ids))
        self.crawl(self.make_request_from_response(
            url=target,
            cur_idepth=self.basic_link_info.cur_idepth,
            gname_dict=names_by_id,
        ))

        self.next_page(selector)
        # Exactly one id is appended per entry, so the list length is the count.
        return len(ids)
Esempio n. 4
0
    def process_listpage(self):
        """Walk the product list and crawl the image URL of every product.

        Products are the ``<li>`` nodes below ``div#proShow/ul``.  For each
        one the anchor title, anchor href and image ``src`` inside
        ``div.inforBg`` are extracted.  The image URL becomes the target of a
        new request that also receives the current depth, the joined product
        URL, the product name and the ``cat`` value of the current response's
        meta.  ``self.next_page`` is called once all products are queued.

        Returns how many products were queued.
        """
        page = HtmlXPathSelector(self.response)
        queued = 0

        for node in page.select('//div[@id="proShow"]/ul/li'):
            label = extract_value(
                node.select('div[@class="inforBg"]/span/a/@title'))
            href = extract_value(
                node.select('div[@class="inforBg"]/span/a/@href'))
            picture = extract_value(
                node.select('div[@class="inforBg"]/p/img/@src'))

            depth = self.basic_link_info.cur_idepth
            absolute_href = self.urljoin(href)
            category = self.response.meta['cat']

            follow_up = self.make_request_from_response(
                picture,
                cur_idepth=depth,
                prod_url=absolute_href,
                prod_name=label,
                cat=category)
            self.crawl(follow_up)
            queued += 1

        self.next_page(page)
        return queued
Esempio n. 5
0
 def next_page(self, hxs):
     """Follow the pager anchor carrying the "next page" label, if any.

     The anchors under ``div#bottom_pagenum/span`` are scanned in document
     order.  The first one whose text contains the label is turned into a
     request for ``self.BASE_URL`` + its ``href``, crawled and logged; the
     scan stops there.

     :param hxs: selector over the HTML of the current response.
     """
     for anchor in hxs.select('//div[@id="bottom_pagenum"]/span/a'):
         label = extract_value(anchor.select('text()'))
         if label.find('下一页') == -1:
             # Not the "next page" anchor; keep looking.
             continue

         href = extract_value(anchor.select('@href'))
         follow_up = self.make_request_from_response(
             url="%s%s" % (self.BASE_URL, href))
         self.crawl(follow_up)
         self.log('next page:%s' % follow_up.url)
         # Only the first matching anchor is followed.
         break
Esempio n. 6
0
 def next_page(self, hxs):
     """Queue the page behind the pager's "next page" anchor.

     Looks at every anchor in ``div#bottom_pagenum/span``; as soon as one
     has the "next page" label in its text, a request for
     ``self.BASE_URL`` followed by the anchor's ``href`` is crawled and
     logged, and the method returns.  Without such an anchor nothing
     happens.

     :param hxs: selector over the HTML of the current response.
     """
     pager_links = hxs.select('//div[@id="bottom_pagenum"]/span/a')
     for pager_link in pager_links:
         if extract_value(pager_link.select('text()')).find('下一页') != -1:
             target = "%s%s" % (
                 self.BASE_URL, extract_value(pager_link.select('@href')))
             next_request = self.make_request_from_response(url=target)
             self.crawl(next_request)
             self.log('next page:%s' % next_request.url)
             # Stop after the first match.
             return
Esempio n. 7
0
    def process(self):
        """Save every product found on the page, then follow pagination.

        Each ``<li>`` under ``div#prodlist`` yields a URL (``self.BASE_URL``
        plus the anchor ``href``), a price (the ``i.ltprice`` text passed
        through ``canonicalize_price``) and a name (the ``span.pname`` anchor
        text).  These are stored via ``self.save`` with an empty tuple as the
        third argument.

        Returns the number of products saved.
        """
        selector = HtmlXPathSelector(self.response)
        saved = 0

        for entry in selector.select('//div[@id="prodlist"]/li'):
            base = self.BASE_URL
            link = "%s%s" % (base, extract_value(entry.select('a/@href')))

            raw_price = extract_value(entry.select(
                'p[@class="pimg"]/span[@class="pinfo"]/i[@class="ltprice"]/text()'))
            amount = canonicalize_price(raw_price)

            title = extract_value(entry.select(
                'p[@class="pimg"]/span[@class="pname"]/a/text()'))

            self.save(link, title, (), amount)
            saved += 1

        self.next_page(selector)
        return saved
Esempio n. 8
0
    def process_entrypage(self):
        """Queue one request per sub-category listed on the entry page.

        The page is organised in "floors" (``div.sFloor.clearfix`` inside
        ``div.sFloors.l``).  The heading anchor of a floor gives the
        first-level category; each ``ul/li/dl/dt`` anchor below it gives a
        second-level category and its URL.  Every sub-category URL is joined
        with ``self.urljoin`` and crawled with ``cat=[cat1, cat2]``.

        Returns the number of sub-category requests queued.
        """
        selector = HtmlXPathSelector(self.response)
        all_floors = selector.select(
            '//div[@class="sFloors l"]/div[@class="sFloor clearfix"]')

        queued = 0
        # The first floor is the book section, which is not wanted.
        for floor in all_floors[1:]:
            top_category = extract_value(floor.select('h3/a/text()'))

            for entry in floor.select('ul/li/dl/dt'):
                sub_category = extract_value(entry.select('a/text()'))
                href = extract_value(entry.select('a/@href'))
                self.crawl(self.make_request_from_response(
                    self.urljoin(href),
                    cur_idepth=self.basic_link_info.cur_idepth,
                    cat=[top_category, sub_category]))
                queued += 1

        return queued
Esempio n. 9
0
    def process_entrypage(self):
        """Discover category pages from the entry page and crawl them.

        Every floor block (``div.sFloor.clearfix`` under ``div.sFloors.l``)
        except the first is processed: its ``h3`` anchor text is the parent
        category, and each ``ul/li/dl/dt`` anchor supplies a child category
        name plus the link to crawl.  Requests carry the current depth and
        ``cat=[parent, child]``.

        Returns the count of requests handed to ``self.crawl``.
        """
        page = HtmlXPathSelector(self.response)
        floor_nodes = page.select(
            '//div[@class="sFloors l"]/div[@class="sFloor clearfix"]')

        total = 0
        for position, floor_node in enumerate(floor_nodes):
            if position == 0:
                # Floor 0 is the book floor; it is deliberately skipped.
                continue

            parent_name = extract_value(floor_node.select('h3/a/text()'))
            child_nodes = floor_node.select('ul/li/dl/dt')

            for child_node in child_nodes:
                child_name = extract_value(child_node.select('a/text()'))
                child_href = extract_value(child_node.select('a/@href'))
                category_request = self.make_request_from_response(
                    self.urljoin(child_href),
                    cur_idepth=self.basic_link_info.cur_idepth,
                    cat=[parent_name, child_name])
                self.crawl(category_request)
                total += 1

        return total
Esempio n. 10
0
    def next_page(self, hxs):
        """Queue the next list page unless the current one is the last.

        Reads the last page number from ``a#pageLast`` and the displayed page
        number from the ``a.current`` link in ``div.snPages`` (both converted
        with ``int``; conversion errors propagate).

        The follow-up URL depends on where we are:

        * page 1 is served from a static URL such as
          ``http://www.suning.com/emall/pcd_10052_10051_-7_N_20089_20002_.html``,
          so the ``secondPointSearchNewCmd`` query URL is assembled from the
          underscore-separated fields of that file name, with
          ``currentPage=1``;
        * later pages already use the query URL, whose ``currentPage`` value
          is expected to be the displayed page number minus one, and that
          value is replaced by the displayed page number.

        The last-page check is applied to page 1 as well; otherwise a
        category with a single page would still trigger a request for a
        second page.

        :param hxs: selector over the HTML of the current response.
        """
        last_page = int(
            extract_value(hxs.select('//a[@id="pageLast"]/text()')))
        current_page = int(extract_value(
            hxs.select('//div[@class="snPages"]/a[@class="current"]/text()')))

        url = None
        has_next = current_page < last_page
        if has_next and current_page == 1:
            # File name fields for the example URL above:
            # ['pcd', '10052', '10051', '-7', 'N', '20089', '20002', '.html']
            fs = self.response.url.split('/')[-1].split('_')
            url = "http://www.suning.com/emall/secondPointSearchNewCmd?"\
                "storeId=%s&catalogId=%s&categoryId=%s&topBrandName="\
                "&top=N&top_category=%s&sortIndex=5&currentPage=1&isList=0"\
                % (fs[1], fs[2], fs[-2], fs[-3],)
        elif has_next:
            url = self.response.url.replace(
                'currentPage=%s' % (current_page - 1),
                'currentPage=%s' % current_page)

        if url:
            request = self.make_request_from_response(url=url)
            self.crawl(request)
            self.log('next page:%s' % request.url)
Esempio n. 11
0
    def process_entrypage(self):
        """Queue a request for every third-level category on the entry page.

        Category blocks are the ``div`` elements whose ``id`` contains
        ``JDS_``.  Within a block, ``div.mt/h2/a`` names the first level,
        each ``div.mc/dl.fore`` names a second level through ``dt/a``, and
        each ``dd/em`` anchor names a third level and provides the link.
        The link is appended to ``http://www.360buy.com/`` and crawled with
        the three names as the ``cat`` tuple.

        Raises ``IndexError`` if a ``dd/em`` node has no anchor ``href``.

        Returns the number of requests queued.
        """
        selector = HtmlXPathSelector(self.response)
        queued = 0

        for block in selector.select('//div[contains(@id, "JDS_")]'):
            level1 = extract_value(block.select('div[@class="mt"]/h2/a/text()'))

            for group in block.select('div[@class="mc"]/dl[@class="fore"]'):
                level2 = extract_value(group.select('dt/a/text()'))

                for leaf in group.select('dd/em'):
                    level3 = extract_value(leaf.select('a/text()'))
                    categories = (level1, level2, level3)
                    href = leaf.select('a/@href').extract()[0]
                    self.crawl(self.make_request_from_response(
                        "http://www.360buy.com/%s" % href,
                        cur_idepth=self.basic_link_info.cur_idepth,
                        cat=categories))
                    queued += 1

        return queued
Esempio n. 12
0
    def process_entrypage(self):
        """Crawl each leaf category linked from the entry page.

        The page is scanned for ``div`` nodes with ``JDS_`` in their ``id``.
        Each provides a top category (``div.mt/h2/a`` text), a set of middle
        categories (``dt/a`` text of every ``div.mc/dl.fore``) and, per
        middle category, leaf categories (``a`` inside ``dd/em``).  A request
        for ``http://www.360buy.com/`` plus the leaf ``href`` is crawled with
        ``cat=(top, middle, leaf)``.

        A leaf without an ``href`` makes ``extract()[0]`` raise
        ``IndexError``.

        Returns how many leaf requests were crawled.
        """
        page = HtmlXPathSelector(self.response)
        sections = page.select('//div[contains(@id, "JDS_")]')

        leaf_count = 0
        for section in sections:
            top_name = extract_value(
                section.select('div[@class="mt"]/h2/a/text()'))
            middle_nodes = section.select('div[@class="mc"]/dl[@class="fore"]')

            for middle_node in middle_nodes:
                middle_name = extract_value(middle_node.select('dt/a/text()'))
                leaf_nodes = middle_node.select('dd/em')

                for leaf_node in leaf_nodes:
                    leaf_name = extract_value(leaf_node.select('a/text()'))
                    path = (top_name, middle_name, leaf_name)
                    leaf_href = leaf_node.select('a/@href').extract()[0]
                    target = "http://www.360buy.com/%s" % leaf_href
                    leaf_request = self.make_request_from_response(
                        target,
                        cur_idepth=self.basic_link_info.cur_idepth,
                        cat=path)
                    self.crawl(leaf_request)
                    leaf_count += 1

        return leaf_count
Esempio n. 13
0
    def process_listpage(self):
        """Gather goods ids from the list page and crawl one combined URL.

        For each ``<li>`` in ``div.floorConn/div.goodsList/ul`` the product
        name (``p.productName`` ``title``) and the goods id
        (``span[name=price]`` ``id``) are extracted.  All ids are joined with
        commas and appended to ``self.PRICE_REQUST_URL``; the resulting URL is
        crawled with the id-to-name dictionary attached as ``gname_dict``.
        ``self.next_page`` is called afterwards.

        Returns the number of ``<li>`` entries processed.
        """
        page = HtmlXPathSelector(self.response)
        entries = page.select(
            '//div[@class="floorConn"]/div[@class="goodsList"]/ul/li')

        id_to_name = {}
        ordered_ids = []
        seen = 0
        # Counting from 1 leaves ``seen`` at the number of entries handled.
        for seen, entry in enumerate(entries, 1):
            product_name = extract_value(
                entry.select('.//p[@class="productName"]/@title'))
            product_id = extract_value(
                entry.select('.//span[@name="price"]/@id'))
            ordered_ids.append(product_id)
            id_to_name[product_id] = product_name

        combined_request = self.make_request_from_response(
            url="%s%s" % (self.PRICE_REQUST_URL, ",".join(ordered_ids)),
            cur_idepth=self.basic_link_info.cur_idepth,
            gname_dict=id_to_name)
        self.crawl(combined_request)

        self.next_page(page)
        return seen
Esempio n. 14
0
    def process_listpage(self):
        """Queue one request per ``<li sku=...>`` item on the list page.

        For each item the ``sku`` attribute, the ``div.p-price`` image
        ``src``, the ``div.p-name`` anchor ``href`` and the anchor text are
        read.  The image URL is the request target; the item URL, name and
        sku are forwarded as ``gurl``, ``name`` and ``sku``.  Pagination is
        handled by ``self.next_page``.

        Raises ``IndexError`` when an item lacks the image ``src`` or the
        anchor ``href``.

        Returns the number of items queued.
        """
        selector = HtmlXPathSelector(self.response)
        queued = 0

        for node in selector.select('//li[@sku]'):
            sku_id = ''.join(node.select('@sku').extract())
            image_src = node.select(
                'div[@class="p-price"]/img/@src').extract()[0]
            link = node.select('div[@class="p-name"]/a/@href').extract()[0]
            title = extract_value(node.select('div[@class="p-name"]/a/text()'))

            self.crawl(self.make_request_from_response(
                image_src,
                cur_idepth=self.basic_link_info.cur_idepth,
                gurl=link, name=title, sku=sku_id))
            queued += 1

        self.next_page(selector)
        return queued
Esempio n. 15
0
    def process_listpage(self):
        """Crawl the ``p-price`` image of every sku item on the page.

        Items are the ``<li>`` elements that carry a ``sku`` attribute.  Per
        item, a request is created for the ``src`` of the image inside
        ``div.p-price`` and annotated with the current depth, the item link
        (``gurl``), the item name (``name``) and the sku value (``sku``).
        After all items, ``self.next_page`` is invoked with the page selector.

        ``extract()[0]`` raises ``IndexError`` if the image or the link is
        missing from an item.

        Returns the count of items for which a request was crawled.
        """
        page = HtmlXPathSelector(self.response)
        sku_nodes = page.select('//li[@sku]')

        crawled = 0
        for sku_node in sku_nodes:
            sku_text = ''.join(sku_node.select('@sku').extract())
            image_urls = sku_node.select(
                'div[@class="p-price"]/img/@src').extract()
            image_url = image_urls[0]
            item_urls = sku_node.select(
                'div[@class="p-name"]/a/@href').extract()
            item_url = item_urls[0]
            item_name = extract_value(
                sku_node.select('div[@class="p-name"]/a/text()'))

            image_request = self.make_request_from_response(
                image_url,
                cur_idepth=self.basic_link_info.cur_idepth,
                gurl=item_url,
                name=item_name,
                sku=sku_text)
            self.crawl(image_request)
            crawled += 1

        self.next_page(page)
        return crawled