Example #1
File: myspd3.py Project: hanyaya/myspider
 def parse(self, response):
     item = AutopjtItem()
     item['link'] = response.xpath("//li/a[@class='img']/@href").extract()
     # Request's first argument must be a single URL string, so the
     # item['link'] list cannot be passed directly; iterate over it instead.
     for url in item['link']:
         # Pass the url (not the whole item) to the callback via meta,
         # since the item's other fields are essentially unused here.
         yield scrapy.Request(url, meta={'url': url}, callback=self.parse2)
Example #2
File: myspd2.py Project: hanyaya/myspider
 def parse(self, response):
     # Strictly, the item would not need to be instantiated here if
     # item['link'] below were not used to hold the new URLs. If the parent
     # page also had data to extract (beyond the URLs passed down), it would
     # be better to instantiate the item inside the for loop and pass the
     # whole item along; Example #3 below does exactly that.
     item = AutopjtItem()
     item['link'] = response.xpath("//li/a[@class='img']/@href").extract()
     # Request's first argument must be a single URL string, so the
     # item['link'] list cannot be passed directly; iterate over it instead.
     for url in item['link']:
         # Pass the url (not the whole item) to the callback via meta,
         # since the item's other fields are essentially unused here.
         yield scrapy.Request(url, meta={'url': url}, callback=self.parse2)
Example #3
File: autospd.py Project: hanyaya/myspider
 def parse(self, response):
     new_url = response.xpath("//li/a[@class='img']/@href").extract()
     # Each URL extracted above is crawled separately (the smallest unit of
     # crawling), so a fresh item is initialized for every request.
     for url in new_url:
         item = AutopjtItem()
         item['link'] = url  # the next-level URL to follow
         yield scrapy.Request(item['link'],
                              meta={'item': item},
                              callback=self.parse2)
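
Example #3 passes the whole item down via meta={'item': item}, but no snippet in this listing shows the matching retrieval. What follows is a minimal sketch of such a parse2, not taken from the project; the detail-page XPaths are borrowed from Example #5 and the pairing is an assumption:

 def parse2(self, response):
     # retrieve the item instantiated in parse (see Example #3)
     item = response.meta['item']
     # fill in the detail-page fields; XPaths borrowed from Example #5
     item['name'] = response.xpath(
         "//div[@class='name_info']/h1/text()").extract()
     item['price'] = response.xpath("//p[@id='dd-price']/text()").extract()
     return item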
Example #4
 def parse(self, response):
     print("work")
     item = AutopjtItem()
     # Note: without a trailing /@title this extracts the serialized <a>
     # elements themselves, not the product names.
     item["name"] = response.xpath("//a[@class='pic']").extract()
     #item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
     #item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     #item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
     for name in item["name"]:
         print(name)
     #print(item["name"])
     #yield item
Example #5
File: myspd3.py Project: hanyaya/myspider
    def parse2(self, response):
        # a fresh item is created here; only the url comes in via meta
        item = AutopjtItem()
        item['name'] = response.xpath(
            "//div[@class='name_info']/h1/text()").extract()
        item['price'] = response.xpath("//p[@id='dd-price']/text()").extract()
        item['comnum'] = response.xpath(
            "//div[@class='describe_detail']/span/text()").extract()
        # store the url passed down from parse into item['link']
        item['link'] = response.meta['url']

        return item
Example #6
    def parse(self, response):
        item = AutopjtItem()
        item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
        item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
        item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
        item['comnum'] = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
        yield item

        for i in range(1, 5):
            url = "http://category.dangdang.com/pg" + str(i) + "-cid4011034.html"
            yield Request(url, callback=self.parse)
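
A side note that applies to this and the later pagination examples: each crawled page re-yields the full range of listing URLs, including pages already visited. The crawl still terminates because Scrapy's scheduler filters duplicate requests by default. A minimal sketch of the flag involved (dont_filter=False is the default, shown here only for illustration):

    def parse(self, response):
        for i in range(1, 5):
            url = "http://category.dangdang.com/pg" + str(i) + "-cid4011034.html"
            # requests whose fingerprint has already been seen are dropped
            # unless dont_filter=True is passed, so this cannot loop forever
            yield Request(url, callback=self.parse, dont_filter=False)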
Example #7
 def parse(self, response):
     item = AutopjtItem()
     item['name'] = response.xpath("//a[@class='pic']/@title").extract()
     item['price'] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item['link'] = response.xpath("//a[@class='pic']/@href").extract()
     item['comnum'] = response.xpath(
         "//a[@name='itemlist-review']/text()").extract()
     yield item
     for i in range(1, 6):
         url = f'http://category.dangdang.com/pg{i}-cp01.41.00.00.00.00-shbig.html'
         yield Request(url, callback=self.parse)
Example #8
 def parse(self, response):
     item = AutopjtItem()
     item["name"] = response.xpath('//a[@class="pic"]/@title').extract()
     item["price"] = response.xpath(
         '//span[@class="price_n"]/text()').extract()
     item["link"] = response.xpath('//a[@class="pic"]/@href').extract()
     item["comnum"] = response.xpath(
         '//a[@name="itemlist-review"]/text()').extract()
     yield item
     for i in range(1, 46):
         url = "http://category.dangdang.com/pg" + str(
             i) + "-cid4002203.html"
         yield Request(url, callback=self.parse)
Example #9
 def parse(self, response):
     item = AutopjtItem()
     item["name"] = response.xpath("//a[@class='pic']/@title").extract()
     item["price"] = response.xpath(
         "//span[@class='search_now_price']/text()").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comnum"] = response.xpath(
         "//a[@class='search_comment_num']/text()").extract()
     yield item
     for i in range(1, 76):
         url = "http://category.dangdang.com/pg" + str(
             i) + "-cp01.19.10.00.00.00.html"
         yield Request(url, callback=self.parse)
Example #10
 def parse(self, response):
     item = AutopjtItem()
     item["name"] = response.xpath("//a[@class='pic']/@title").extract()
     item["price"] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
     yield item
     for i in range(1, 76):
         url = "http://category.dangdang.com/pg" + str(
             i) + "-cid4002203.html"
         yield Request(url, callback=self.parse)
Example #11
    def parse(self, response):
        item = AutopjtItem()
        item['name'] = response.xpath("//a[@class = 'pic']/@title").extract()
        item['price'] = response.xpath(
            "//span[@class = 'price_n']/text()").extract()
        item['link'] = response.xpath("//a[@class = 'pic']/@href").extract()
        item['comnum'] = response.xpath(
            "//a[@name = 'itemlist-review']/text()").extract()

        yield item

        for i in range(1, 18):
            url = "http://category.dangdang.com/pg" + str(
                i) + "-cid4011029.html"
            yield Request(url, callback=self.parse)
Example #12
 def parse(self, response):
     item = AutopjtItem()
     item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
     print(item['name'])
     item['price'] = response.xpath(
         '//span[@class="price_n"]/text()').extract()
     print(item['price'])
     item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
     print(item['link'])
     # item['comnum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
     # print(item['comnum'])
     yield item
     for i in range(1, 5):
         url = 'http://category.dangdang.com/pg' + str(
             i) + '-cid4011029.html'
         yield Request(url, callback=self.parse)
Example #13
    def parse(self, response):
        item = AutopjtItem()
        # Extract the product name, price, link and comment count via the
        # XPath expressions below
        item["name"] = response.xpath("//a[@class='pic']/@title").extract()
        item["price"] = response.xpath(
            "//span[@class='price_n']/text()").extract()
        item["link"] = response.xpath("//a[@class='pic']/@href").extract()
        item["comnum"] = response.xpath(
            "//a[@name='itemlist-review']/text()").extract()

        yield item

        for i in range(1, 27):
            url = 'http://category.dangdang.com/pg' + str(
                i) + '-cid4011029.html'
            yield Request(url, callback=self.parse)
Example #14
    def parse(self, response):
        item = AutopjtItem()
        # Extract the product name, price, link and comment count via the
        # XPath expressions below
        item["name"] = response.xpath("//a[@class='pic']/@title").extract()
        item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
        item["link"] = response.xpath("//a[@class='pic']/@href").extract()
        item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
        # Return the item once extraction is done
        yield item
        # The key step: loop to crawl all 75 pages automatically
        for i in range(1, 76):
            # Build each page URL from the pattern observed above
            url = "http://category.dangdang.com/pg" + str(i) + "-cid4002203.html"
            # Yield a Request with the URL and this method as callback,
            # so the crawl continues automatically
            yield Request(url, callback=self.parse)
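
For context, every parse method in this listing assumes spider scaffolding roughly like the following sketch; the items module path, spider name, and start URL are assumptions inferred from the snippets rather than taken from the projects:

import scrapy
from scrapy.http import Request
from autopjt.items import AutopjtItem  # assumed items module path


class AutospdSpider(scrapy.Spider):
    name = 'autospd'  # assumed spider name
    allowed_domains = ['dangdang.com']
    # the first listing page; parse() generates the remaining page URLs
    start_urls = ['http://category.dangdang.com/pg1-cid4002203.html']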
Example #15
 def parse(self, response):
     item = AutopjtItem()
     # Extract the product name, price, link and comment count via the
     # XPath expressions below
     item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
     item['price'] = response.xpath(
         '//span[@class="price_n"]/text()').extract()
     item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
     item['comnum'] = response.xpath(
         '//a[@id="comm_num_down"]/text()').extract()
     # Return the item once extraction is done
     yield item
     # The key step: loop to crawl all 75 pages automatically
     for i in range(1, 76):
         # Build each page URL from the pattern observed above
         url = 'http://category.dangdang.com/pg%s-cid10010584.html' % i
         # Yield a Request with the URL and this callback to continue
         # the crawl automatically
         yield Request(url, callback=self.parse)
Example #16
 def parse(self, response):
     item = AutopjtItem()
     # Extract the product name, price, link and comment count via the
     # XPath expressions below
     item["name"] = response.xpath("//a[@class='pic']/@title").extract()
     item["price"] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comnum"] = response.xpath(
         "//a[@dd_name='单品评论']/text()").extract()
     # Return the item once extraction is done
     yield item
     # Loop to crawl all 30 listing pages automatically
     for i in range(1, 31):
         url = "http://category.dangdang.com/pg" + str(
             i) + "-cid10010056.html"
         # Yield a Request with the URL and this callback,
         # so the crawl continues automatically
         yield Request(url, callback=self.parse)
Example #17
    def parse(self, response):
        item = AutopjtItem()
        # Pinyin field names: mingcheng = tour name, yibaomingrenshu =
        # number of sign-ups, remark = short description
        item["mingcheng"] = response.xpath(
            '//h3[@class="yxk_dataList_title"]/a/text()').extract()
        item["yibaomingrenshu"] = response.xpath(
            '//div[@class="yxk_dataList_info_bottom fn-clear"]/span[1]/text()'
        ).extract()
        item["price"] = response.xpath(
            '//a[@class="yxk_dataList_info_price"]/text()[2]').extract()
        item["link"] = response.xpath(
            '//a[@class="yxk_dataList_info_price"]/@href').extract()
        item["remark"] = response.xpath(
            '//p[@class="yxk_dataList_msg"]/text()').extract()

        yield item
        for i in range(2, 8):
            url = "http://www.youxiake.com/lines/list?code=around&place_id=2&days=1&month=0&holiday=0&price=0&tag=0&class_id=0&gts=0&gte=0&sdel=1&p=" + str(
                i)
            yield Request(url, callback=self.parse)
Example #18
    def parse(self, response):
        item = AutopjtItem()
        # Extract the product name, price, link and comment count via the
        # XPath expressions below
        item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
        item['price'] = response.xpath(
            '//span[@class="price_n"]/text()').extract()
        item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
        item['comnum'] = response.xpath(
            '//a[@name="itemlist-review"]/text()').extract()
        # Return the item once extraction is done
        yield item

        # The key step: loop to crawl pages 1 and 2 automatically
        for i in range(1, 3):
            url = "http://category.dangdang.com/pg" + str(
                i) + "-cid4003599.html"
            # Yield a Request with the URL and this method as callback,
            # so the crawl continues automatically
            yield Request(url, callback=self.parse)
Example #19
    def parse(self, response):
        item = AutopjtItem()
        # Pinyin field names: mingcheng = title, chuanyuefangshi = crossing
        # method, tubujuli = hiking distance, leijipashen = cumulative ascent,
        # fengjingzhishu = scenery index, huodongqiangdu = activity intensity,
        # lianjiedizhi = link, yibaomingrenshu = number of sign-ups
        item["mingcheng"] = response.xpath('//title/text()').extract()
        item["chuanyuefangshi"] = response.xpath(
            u'//*[contains(text(), "穿越方式:")]/text()').extract()
        item["tubujuli"] = response.xpath(
            u'//*[contains(text(), "徒步距离:")]/text()').extract()
        item["leijipashen"] = response.xpath(
            u'//*[contains(text(), "累计爬升:")]/text()').extract()
        item["fengjingzhishu"] = response.xpath(
            u'//*[contains(text(), "风景指数:")]/text()').extract()
        item["huodongqiangdu"] = response.xpath(
            u'//*[contains(text(), "活动强度:")]/text()').extract()
        item["lianjiedizhi"] = response.xpath('//link[1]/@href').extract()
        item["yibaomingrenshu"] = response.xpath(
            u'//dl[@class="nums mtw"]/dd/em/text()').extract()

        yield item
        for i in range(3391, 3470):
            url = "http://www.kxhuwai.com/thread-" + str(i) + "-1-1.html"
            yield Request(url, callback=self.parse)
Example #20
 def parse(self, response):
     item = AutopjtItem()
     # Extract the product name, price, link and comment count via the
     # XPath expressions below
     item['name'] = response.xpath("//a[@class='pic']/@title").extract()
     item['price'] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item['link'] = response.xpath("//a[@class='pic']/@href").extract()
     item['comnum'] = response.xpath(
         "//a[@name='itemlist-review']/text()").extract()
     #print(item['name'])
     #print(item['price'])
     #print(item['link'])
     #print(item['comnum'])
     # Return the item once extraction is done
     yield item
     # The key part: loop over listing pages 2-9
     for i in range(2, 10):
         # Build each page URL from the observed pattern
         url = 'http://category.dangdang.com/pg' + str(
             i) + '-cid4002203.html'
         # Yield a Request with the URL and callback function,
         # so the crawl continues automatically
         yield Request(url, callback=self.parse)
Example #21
File: autospd.py Project: cp9648/my_item
 def parse(self, response):
     for _list in response.xpath('//div[@id="search_nature_rg"]/ul/li'):
         item = AutopjtItem()
         # product name
         item['name'] = _list.xpath('./a/@title').extract_first()
         # product link
         item['link'] = _list.xpath('./a/@href').extract_first()
         # product comment count; strip the trailing "条评论" label
         item['comnum'] = _list.xpath(
             './p[@class="star"]/a/text()').extract_first().strip('条评论')
         # product price; strip the leading yuan sign
         item['price'] = _list.xpath(
             './p/span/text()').extract_first().strip('¥')
         yield item
         print(item['name'])
     # check whether there is a next-page link ("下一页")
     page = response.xpath(
         '//a[text()[contains(., "下一页")]]/@href').extract_first()
     if page:
         fu = furl(response.url)
         fu_base = fu.copy().remove(path=True, args=True)
         next_page = fu_base.add(path=page).url  # build the full URL
         yield scrapy.Request(url=next_page, callback=self.parse)
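
As a usage note, the furl-based reconstruction above can usually be replaced by Scrapy's built-in response.urljoin, which resolves a relative href against the current page URL; a minimal equivalent sketch:

     # equivalent next-page handling without the furl dependency
     if page:
         yield scrapy.Request(url=response.urljoin(page), callback=self.parse)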