Example #1
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        Books = selector.xpath('/html/body/div[8]/div[2]/div[3]/div/ul/li')
        for each in Books:
            num = each.xpath('div[@class="p-num"]/text()').extract()
            bookName = each.xpath('div[@class="p-detail"]/a/text()').extract()
            author = each.xpath('div[@class="p-detail"]/dl[1]/dd/a[1]/text()').extract()
            press = each.xpath('div[@class="p-detail"]/dl[2]/dd/a/text()').extract()

            temphref = each.xpath('div[@class="p-detail"]/a/@href').extract()
            temphref = str(temphref)
            BookID = re.search(r'com/(.*?)\.html', temphref).group(1)

            json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + BookID
            r = requests.get(json_url).text
            data = json.loads(r)[0]
            price = data['m']
            PreferentialPrice = data['p']

            item['number'] = num
            item['bookName'] = bookName
            item['author'] = author
            item['press'] = press
            item['BookID'] = BookID
            item['price'] = price
            item['PreferentialPrice'] = PreferentialPrice

            yield item

        nextLink = selector.xpath('/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(nextLink,callback=self.parse)
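Note: like several later examples, this parse() calls requests.get() synchronously, which blocks Scrapy's event loop while p.3.cn responds. A minimal non-blocking sketch, assuming the same mgets endpoint and JSON shape ([{"m": ..., "p": ...}]) as above; parse_price is a hypothetical callback name:

    import json
    import scrapy

    def parse_price(self, response):
        # Finish the item started in parse(); the price API returns a JSON
        # list whose first element carries the list price "m" and the
        # discounted price "p" (shape taken from Example #1).
        item = response.meta['item']
        data = json.loads(response.text)[0]
        item['price'] = data['m']
        item['PreferentialPrice'] = data['p']
        yield item

    # inside parse(), instead of requests.get(json_url):
    # yield Request(json_url, callback=self.parse_price, meta={'item': item})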
Example #2
 def parse_detail(self, response):
     item = JdspiderItem()
     item['collect_date'] = time.strftime("%Y-%m-%d %H:%M:%S")
     item['url'] = response.request.url
     item['cat'] = response.meta['cat']
     item['skuId'] = response.meta['skuId']
     item['title'] = response.xpath("//div[@class='item ellipsis']/text()").extract_first()
     print(item['title'])
     lis = response.xpath("//div[@class='p-parameter']/ul/li")
     item['publish'],item['ISBN'],item['edition'],item['brand'],item['series_name'],item['publish_date'] = "","","","","",""
     for li in lis:
         desc = re.sub(r'\s', '', li.xpath("string(.)").extract_first())
         item['publish'] = desc.split(':')[1] if '出版社' in desc else item['publish']
         item['ISBN'] = desc.split(':')[1] if 'ISBN' in desc else item['ISBN']
         item['edition'] = desc.split(':')[1] if '版次' in desc else item['edition']
         item['brand'] = desc.split(':')[1] if '品牌' in desc else item['brand']
         item['series_name'] = desc.split(':')[1] if '丛书名' in desc else item['series_name']
         item['publish_date'] = desc.split(':')[1] if '出版时间' in desc else item['publish_date']
     
     yield scrapy.Request(
         "https://c0.3.cn/stock?skuId={}&cat={}&area=1_72_4137_0".format(item['skuId'], item['cat']),
         callback=self.get_other_info,
         meta={'item': item},
         dont_filter=False
     )
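The six ternaries above repeat one split; a hedged alternative sketch that drives the same extraction from a label-to-field table (labels and item fields are the ones used in Example #2, and the 'label:value' line format is assumed unchanged):

    import re

    # Mapping from the Chinese parameter label to the JdspiderItem field.
    FIELD_LABELS = {
        '出版社': 'publish',
        'ISBN': 'ISBN',
        '版次': 'edition',
        '品牌': 'brand',
        '丛书名': 'series_name',
        '出版时间': 'publish_date',
    }

    for li in lis:
        desc = re.sub(r'\s', '', li.xpath('string(.)').extract_first())
        for label, field in FIELD_LABELS.items():
            if label in desc:
                # split once so colons inside the value survive
                item[field] = desc.split(':', 1)[1]
                break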
Example #3
    def parse(self, response):
        global count
        global tot_item
        global trg_item
        tot_item = 0
        trg_item = 0

        item = JdspiderItem()
        selector = Selector(response)
        Pages = selector.xpath('/html/body/li')
        for each in Pages:
            product_id = each.xpath('@data-sku').extract()
            name = each.xpath('div/div[3]/a/em/text()').extract()
            price = each.xpath('div/div[2]/strong/i/text()').extract()
            item['name'] = name
            item['product_id'] = product_id
            item['price'] = price

            tot_item += 1
            if '$$query$$' in name:
                trg_item += 1
            yield item
        count += 1
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        print(count)
        print('http://search.jd.com/s_new.php?keyword=$$query$$&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&page='
              + str(count) + '&s=26&scrolling=y&pos=30')
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        if (tot_item * 0.5 > trg_item) and (count <= 10):
            yield Request(
                'http://search.jd.com/s_new.php?keyword=$$query$$&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&page='
                + str(count) + '&s=26&scrolling=y&pos=30',
                callback=self.parse)
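Example #3 keeps its counters in module-level globals, which leak state between runs and across spiders. A minimal sketch of the same bookkeeping as instance attributes (the spider skeleton is hypothetical):

    import scrapy

    class SearchSpider(scrapy.Spider):
        name = 'jd_search'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.count = 1      # current results page
            self.tot_item = 0   # items seen on the current page
            self.trg_item = 0   # items whose name matches the query

        def parse(self, response):
            self.tot_item = 0
            self.trg_item = 0
            # ... extract items and update the two counters as in Example #3 ...
            self.count += 1
            if self.tot_item * 0.5 > self.trg_item and self.count <= 10:
                # next_page_url would be built exactly as in Example #3
                # yield scrapy.Request(next_page_url, callback=self.parse)
                pass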
Example #4
    def parse(self, response):
        pre_item = copy.copy(response.meta)
        html = json.loads(response.text)

        for price in eval(html['datePrice']):
            item = JdspiderItem()
            item['product_id'] = pre_item['product_id']
            item['date'] = price[0]
            item['price'] = price[1]

            yield item
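Examples #4 and #5 pass server-returned text straight to eval(), which will execute any expression the server sends back. A hedged sketch using ast.literal_eval instead, assuming datePrice is a comma-separated run of literals such as ["2019-01-01",99.0],["2019-01-02",89.0] (the shape implied by Example #5):

    import ast

    def parse_date_prices(date_price_str):
        # literal_eval only accepts Python literals, so unlike eval() it
        # cannot run arbitrary code embedded in the response.
        return ast.literal_eval('[' + date_price_str + ']')

    # usage:
    # for date, price, *rest in parse_date_prices(content['datePrice']):
    #     ...  # rest holds the campaign field when present (Example #5)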
Example #5
 def parse(self, response):
     meta = copy.copy(response.meta)
     page = requests.get("http://tool.manmanbuy.com/history.aspx",
                         headers=headers,
                         params=meta['data'])
     content = json.loads(page.text)
     datePrices = eval('[' + content['datePrice'] + ']')
     for datePrice in datePrices:
         item = JdspiderItem()
         item['date'] = self._time_process(datePrice[0])
         item['product_id'] = meta['product_id']
         item['price'] = datePrice[1]
         item['campaign'] = datePrice[2]
         yield item
Example #6
    def detail_parse(self, response):
        # item object
        item = JdspiderItem()
        try:
            # extracted field: ID
            item['ID'] = response.xpath('//div[@class="p-parameter"]/ul[3]/li[2]/@title').extract()[0]
            # extracted field: name
            item['name'] = response.xpath('//div[@class="p-parameter"]/ul[3]/li[1]/@title').extract()[0]
            # extracted field: brand
            item['brand'] = response.xpath('//ul[@id="parameter-brand"]/li/@title').extract()[0]
            # extracted field: resolution
            item['resolution'] = response.xpath('//div[@class="p-parameter"]//li[@class="fore0"]//div[@class="detail"]/p/@title').extract()[0]
            # extracted field: weight
            item['weight'] = response.xpath('//div[@class="p-parameter"]//ul[3]/li[4]/@title').extract()[0]
            # extracted field: image_url
            item['image_url'] = 'https:' + str(response.xpath('//img[@id="spec-img"]/@data-origin').extract()[0])
            # extracted field: store
            item['store'] = response.xpath('//div[@class="p-parameter"]//ul[3]/li[3]/@title').extract()[0]
            # yield the item
            yield item

            # The cumulative comment count does not appear in the page source,
            # so the request that loads it has to be located in the browser's
            # network panel to recover its URL.

            try:
                # full URL for the cumulative comment count.
                comment_count_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + str(item['ID'])
                # yield the request
                yield scrapy.Request(
                    comment_count_url,
                    callback=self.comment_count_parse,
                    meta={'item': item}
                )
            except Exception:
                print("Failed to fetch the cumulative comment count!")


            # For the same reason, the price is loaded dynamically and its URL
            # also has to be found among the loaded resources.
            try:
                # full URL for the price.
                price_url = 'https://pm.3.cn/prices/pcpmgets?callback=jQuery&skuids={}&origin=2'.format(str(item['ID']))
                # yield the request
                yield scrapy.Request(
                    price_url,
                    callback=self.price_parse,
                    meta={'item': item}
                )
            except Exception:
                print("Failed to fetch the price!")
        except Exception:
            print("ERROR")
Example #7
 def parse(self, response):
     #print(response.text)
     html = json.loads(response.text)
     pre_item = copy.copy(response.meta)
     category_id = pre_item['category_id']
     shop_id = pre_item['shop_id']
     for data in html['data']:
         item = JdspiderItem()
         item['product_id'] = data['itemid']
         item['category_id'] = category_id
         item['product_name'] = data['t']
         item['shop_id'] = shop_id
         item['price'] = data['jp']
         item['sales_volume'] = data['w']
         item['date'] = datetime.datetime.now().strftime(
             '%Y-%m-%d %H:%M:%S')
         yield item
Example #8
    def parse(self, response):
        for link in LinkExtractor(allow=()).extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

        product_id = self.get_product_id(response.url)
        if product_id:
            loader = JdItemLoader(item=JdspiderItem(), response=response)
            loader.add_xpath(
                'name',
                '//div[@id="crumb-wrap"]//div[@class="item ellipsis"]/text()'
            )  #normal page
            loader.add_xpath('name',
                             '//div[@class="breadcrumb"]/span[2]/a[2]/text()'
                             )  #eg:https://item.jd.com/2386353.html
            loader.add_xpath('title', \
            '//div[@class="w"]/div[@class="product-intro clearfix"]//div[@class="sku-name"]/text()') #normal page
            loader.add_xpath('title',
                             '//div[@id="itemInfo"]/div[@id="name"]/h1/text()'
                             )  #eg: https://item.jd.com/2386353.html
            loader.add_value('product_id', product_id)
            loader.add_xpath(
                'merchant',
                '//div[@class="J-hove-wrap EDropdown fr"]/div[@class="item"]/div[@class="name"]/a/text()'
            )
            loader.add_xpath('merchant',
                             '//div[@class="seller-infor"]/a/text()'
                             )  #eg: https://item.jd.com/2386353.html
            loader.add_xpath('merchant_grade', \
            '//div[@class="J-hove-wrap EDropdown fr"]/div[@class="item"]/div[@class="name"]/em/text()') #jd self
            loader.add_xpath('merchant_grade',
                             '//em[@class="evaluate-grade"]/span/a/text()'
                             )  #third part merchant score
            loader.add_xpath('merchant_grade',
                             '//div[@class="seller-infor"]/em/text()'
                             )  #eg:https://item.jd.com/2386353.html
            loader.add_xpath('merchant_grade',
                             '//div[@class="score-sum"]/span/text()'
                             )  #eg:https://item.jd.com/10605700987.html
            loader.add_value('utc_timestamp',
                             int(datetime.utcnow().timestamp()))
            item = loader.load_item()
            request = scrapy.Request('https://p.3.cn/prices/mgets?skuIds=J_' + str(product_id), \
                                    callback=self.parse_price)
            request.meta['item'] = item
            yield request
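Stacking several add_xpath() calls per field, as above, only behaves as a fallback chain if the loader keeps a single value per field. A sketch of a JdItemLoader with that behavior, assuming the project does not already define one (on older Scrapy the processors live in scrapy.loader.processors instead of itemloaders.processors):

    from scrapy.loader import ItemLoader
    from itemloaders.processors import MapCompose, TakeFirst

    class JdItemLoader(ItemLoader):
        # Strip every extracted string and keep only the first non-empty
        # value, so the earliest matching XPath wins for each field.
        default_input_processor = MapCompose(str.strip)
        default_output_processor = TakeFirst()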
Example #9
File: text.py Project: rudyshine/JD
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        Products = selector.xpath('//*[@id="plist"]/ul/li')
        for each in Products:
            temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
            temphref = str(temphref)
            ProductID = re.search(r'com/(.*?)\.html', temphref).group(1)

            product_typ_url = "https://item.jd.com/" + ProductID + ".html"
            print("====product_typ_url:", product_typ_url)
            # product_typ=Selector(response).xpath('//html/body/div[9]/div[2]/div[1]/div[2]/div[1]/div[1]/ul[2]/li[11]/text()').extract()
            # NOTE: this XPath runs against the current list-page response, not
            # product_typ_url; reaching the product page would need a follow-up Request.
            product_typ = Selector(response).xpath(
                '//*[@class="parameter2 p-parameter-list"]/ul[2]/li[11]/text()'
            ).extract()
            print(product_typ)

            item['product_typ'] = product_typ
            yield item
Example #10
    def parse(self, response):
        project_list = response.xpath('//div[@class="p-name"]')
        item = JdspiderItem()
        for project in project_list:
            title_test = project.xpath(
                'normalize-space(./a/em/text())').extract_first()
            title_test_url = 'https:' + project.xpath(
                'normalize-space(./a/@href)').extract_first()
            item['title'] = title_test
            item['title_url'] = title_test_url
            yield scrapy.Request(item['title_url'],
                                 callback=self.product_list_parse,
                                 meta={'item': deepcopy(item)},
                                 dont_filter=True)

        callback_url = response.xpath(
            '//a[@class="pn-next"]/@href').extract_first()
        if callback_url:
            # resolve the (possibly protocol-relative) href against the page URL
            yield scrapy.Request(response.urljoin(callback_url), callback=self.parse)
Example #11
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        # Books = selector.xpath('/html/body/div[8]/div[2]/div[3]/div/ul/li')
        PhonesLink = selector.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/@href'
        )  # each li holds one link to the phone's detail page
        for each in PhonesLink:
            # num = each.xpath('div[@class="p-num"]/text()').extract()
            # bookName = each.xpath('div[@class="p-detail"]/a/text()').extract()
            # author = each.xpath('div[@class="p-detail"]/dl[1]/dd/a[1]/text()').extract()
            # press = each.xpath('div[@class="p-detail"]/dl[2]/dd/a/text()').extract()

            # "each" already selects the @href attribute, so extract it directly
            temphref = each.extract()
            # BookID = str(re.search('com/(.*?)\.html',temphref).group(1))
            phoneID = re.search(r'com/(.*?)\.html', temphref).group(1)

            json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + phoneID
            r = requests.get(json_url).text
            data = json.loads(r)[0]
            price = data['m']
            # PreferentialPrice = data['p']

            # NOTE: name, phoneRAM, phoneColor, phoneBattery, frontcamera and
            # backcamera are never defined in this snippet; the original
            # example is incomplete here.
            item['phoneName'] = name
            item['phoneID'] = phoneID
            item['phoneRAM'] = phoneRAM
            item['phoneColor'] = phoneColor
            item['phoneBattery'] = phoneBattery
            item['price'] = price
            item['frontcamera'] = frontcamera
            item['backcamera'] = backcamera

            yield item

        nextLink = selector.xpath(
            '/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract(
            )
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(nextLink, callback=self.parse)
Example #12
    def parse(self, response):
        # Brand = re.match(r'(^.*C_)(.*)', response.url).group(2)
        # result = unquote(Brand,'utf-8')
        project_list = response.xpath('//div[@class="p-name"]')
        for project in project_list:
            item = JdspiderItem()
            item['Brand'] = '华为(HUAWEI)'
            title_test = project.xpath(
                'normalize-space(./a/em/text())').extract_first()
            title_test_url = 'https:' + project.xpath(
                'normalize-space(./a/@href)').extract_first()
            item['title'] = title_test
            item['title_url'] = title_test_url
            yield scrapy.Request(item['title_url'],
                                 callback=self.product_list_parse,
                                 meta={'item': deepcopy(item)},
                                 dont_filter=True)

        callback_url = response.xpath(
            '//a[@class="pn-next"]/@href').extract_first()
        if callback_url:
            print('next page')
            # resolve the (possibly protocol-relative) href against the page URL
            yield scrapy.Request(response.urljoin(callback_url), callback=self.parse)
Example #13
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        pName = selector.xpath(
            '/html/body/div[@id="bodyContainer"]/div[@id="bindingRoot"]/div[@class="grid_12 l-clearfix l-row"]/div[@id="job-listing-wrapper"]/div[@class="jobmail-signed-in jobs-exist first-page premium-jobs-exist no-tier1-jobs"]/section[@id="jobsListing"]/div[@class="jobs-list jobs-list-primary"]/article[@class="experimental-fade experimental-fade-completed"][1]/dl/dd[1]/p[@class="job-description"]/text()'
        ).extract()
        item['Name'] = pName
        yield item
        #Cates = selector.xpath('/html/body/div[@id="bodyContainer"]/div[@id="bindingRoot"]/div[@class="grid_12 l-clearfix l-row"]/div[@id="job-listing-wrapper"]/div[@class="jobmail-signed-in jobs-exist first-page premium-jobs-exist no-tier1-jobs"]/section[@id="jobsListing"]/div[@class="jobs-list jobs-list-premium"][1]/article')
        #for each in Cates:
        #	bPrice = each.xpath('dl/dd[1]/ul[@class="bullet-points"]/li[1]/text()').extract()
        #	pName = each.xpath('dl/dd[1]/h2/a[@class="job-title"]/text()').extract()
        #temphref = each.xpath('div[@class="p-detail"]/a/@href').extract()
        #temphref = str(temphref)
        #BookID = str(re.search('com/(.*?)\.html',temphref).group(1))
        #json_url = 'http://p.3.cn/prices/mgets?skuIds=J_'+ BookID
        #r = requests.get(json_url).text
        #data = json.loads(r)[0]
        #price = data['m']
        #PreferentialPrice = data['p']
        #	item['Price'] = bPrice
        #	item['Name'] = pName
        #item['author'] = author
        #item['press'] = press
        #item['BookID'] = BookID
        #item['price'] = price
        #item['PreferentialPrice'] = PreferentialPrice
        #yield item

        nextLink = selector.xpath(
            '/html/body/div[7]/div[5]/form[1]/div/a[2]/@href').extract()
        #/html/body/div[@id="J_searchWrap"]/div[@id="J_container"]/div[@id="J_main"]/div[@class="m-list"]/div[@class="ml-wrap"]/div[@class="page clearfix"]/div[@id="J_bottomPage"]/span[@class="p-num"]/a[@class="pn-next"]/@href').extract()
        #/html/body/div[8]/div[1]/div[2]/div[1]/div/div[3]/div/span/a[2]/@href'
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(nextLink, callback=self.parse)
Example #14
    def parse_detail(self, response):
        item_loader = ArticleItemLoader(item=JdspiderItem(), response=response)
        phone_title = response.css(".sku-name::text").extract()[0].strip()
        match_re = re.match(u".*[\u4e00-\u9fa5]+", phone_title)
        if match_re:
            item_loader.add_css("title", ".sku-name::text")
        else:
            title = response.xpath("/html/body/div[8]/div/div[2]/div[1]/text()"
                                   ).extract()[1].strip()
            item_loader.add_value("title", title)
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("front_image_url", "#spec-n1 img::attr(src)")
        shop_name = response.css(".name a::text")
        if shop_name:
            item_loader.add_css("shop_name", ".name a::text")
        else:
            shop_name = "null"
            item_loader.add_value("shop_name", shop_name)
        price_item = response.xpath(
            "/html/body/div[8]/div/div[2]/div[3]/div/div[1]/div[2]/span[1]/span[2]/text()"
        )
        if price_item:
            price_item = price_item.extract()[0]
            item_loader.add_value("price", price_item)
        else:
            item_price = response.css('.dd .p-price .price::text').extract()[0]
            item_loader.add_value("price", item_price)
        item_loader.add_css("brand", ".p-parameter a::text")
        item_loader.add_xpath(
            "good_name", "//*[@id='crumb-wrap']/div/div[1]/div[9]/text()")
        item_loader.add_xpath("comment_nums",
                              "//*[@id='comment-count']/a/text()")
        item_loader.add_value("crawl_time", datetime.datetime.now())

        phone_item = item_loader.load_item()
        yield phone_item
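Example #14 fingerprints the URL with get_md5 for url_object_id; a minimal sketch of such a helper, assuming the usual md5-hexdigest convention (the real project may define it differently):

    import hashlib

    def get_md5(url):
        # Hex digest of the UTF-8 encoded URL, stable across runs.
        if isinstance(url, str):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()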
Example #15
    def parse(self, response):

        pre_item = copy.copy(response.meta)

        #product_id_lst  = response.xpath("//ul[@class='gl-warp clearfix']/li/@data-sku").extract()
        division_id = pre_item['division_id']
        #product_name_lst = response.xpath("//div[@class='p-name p-name-type-2']/a/@title").extract()
        shop_id_lst = response.xpath(
            "//div[@class='p-shop']//a[@class='curr-shop hd-shopname']/@href"
        ).extract()
        shop_name_lst = response.xpath(
            "//div[@class='p-shop']//a[@class='curr-shop hd-shopname']/text()"
        ).extract()
        #goods_set = list(zip(product_id_lst,product_name_lst,shop_id_lst,shop_name_lst))
        goods_set = list(zip(shop_id_lst, shop_name_lst))
        for good in goods_set:
            item = JdspiderItem()
            #item['product_id'] = good[0]
            item['division_id'] = int(division_id)
            #item['product_name'] = good[1]
            item['shop_id'] = self._shops_id_process(good[0])
            item['shop_name'] = good[1]

            yield item
Example #16
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        # gl_items = selector.xpath('//li/div[@class="gl-i-wrap j-sku-item"]')
        gl_items = selector.xpath('//li[@class="gl-item"]')
        # print('fetched the page content')
        for each in gl_items:
            print('parsing an extracted item')
            # get the product name
            name = each.xpath(
                'div/div[@class="p-name"]/a/em/text()').extract()[0].strip()
            # print(name)
            # get the product page link
            name_link = 'http:' + str(
                each.xpath('div/div[@class="p-name"]/a/@href').extract()[0])
            # print(name_link)

            temphref = each.xpath('div/div[@class="p-name"]/a/@href').extract()
            temphref = str(temphref)
            skuId = re.search(r'com/(.*?)\.html', temphref).group(1)
            # print(skuId)

            # fetch the price info
            price_url = 'https://p.3.cn/prices/mgets?&skuIds=J_' + skuId
            print(price_url)
            price_text = requests.get(price_url).text
            data = json.loads(price_text)[0]
            o_price = data['m']
            c_price = data['p']
            print(o_price, c_price)

            # fetch the comment summary
            commit_url = 'https://club.jd.com/comment/productCommentSummaries.action?&referenceIds=' + skuId
            print(commit_url)
            comment_count = None  # default in case the request below fails
            try:
                commit_text = requests.get(commit_url).text
                comment_count = json.loads(
                    commit_text)['CommentsCount'][0]['CommentCountStr']
                print(comment_count)
            except Exception as ex:
                print('request commit_url failed')
                print(ex)

            # fetch the shop name
            shopId = each.xpath('div/@venderid').extract()[0]
            shop_url = 'https://rms.shop.jd.com/json/pop/shopInfo.action?ids=' + str(
                shopId)
            print(shop_url)
            shop_name = None  # default in case the request below fails
            try:
                shop_text = requests.get(shop_url).text
                data = json.loads(shop_text)
                shop_name = data[0]['name']
                print(shop_name)
            except Exception as ex:
                print('get shop id failed')
                print(ex)

            item['name'] = name
            item['ori_price'] = o_price
            item['cur_price'] = c_price
            item['commit'] = comment_count
            item['shop'] = shop_name
            item['ItemID'] = skuId
            item['shop_href'] = name_link

            yield item
            time.sleep(0.2)

        # nextLink = selector.xpath('/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract()
        print('looking for the next-page link')
        nextLink = selector.xpath(
            '//div[@class="page clearfix"]/div/span/a[@class="pn-next"]/@href'
        ).extract()
        print(nextLink)
        if nextLink:
            nextLink = 'https://list.jd.com' + nextLink[0]
            print(nextLink)
            yield Request(nextLink, callback=self.parse)
Example #17
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        Products = selector.xpath('//*[@id="plist"]/ul/li')
        for each in Products:
            p_Name = each.xpath(
                'div/div[@class="p-name"]/a/em/text()').extract()[0]
            temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
            temphref = str(temphref)
            ProductID = re.search(r'com/(.*?)\.html', temphref).group(1)
            # ProductID='1959718783'
            ## fetch the price
            json_url_p = 'http://p.3.cn/prices/mgets?skuIds=J_' + ProductID
            try:
                data = requests.get(json_url_p, timeout=1000).json()[0]
                price = data['m']
                PreferentialPrice = data['p']
            except requests.exceptions.ConnectionError:  #requests.exceptions.ReadTimeout
                print('Timeout ConnectionError1:json_url_p')
                time.sleep(600)
                try:
                    data = requests.get(json_url_p, timeout=1000).json()[0]
                    price = data['m']
                    PreferentialPrice = data['p']
                except requests.exceptions.ConnectionError:
                    print('Timeout ConnectionError2:json_url_p')
                    time.sleep(3600)
                    data = requests.get(json_url_p, timeout=1000).json()[0]
                    price = data['m']
                    PreferentialPrice = data['p']
                except requests.exceptions.ReadTimeout:
                    print('Timeout,ReadTimeout:', json_url_p)
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_p)

            ## fetch the total comment counts
            json_url_connent = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=' + ProductID
            try:
                data = requests.get(json_url_connent, timeout=1000).json()
                data = data['CommentsCount'][0]
                CommentCount = data['CommentCount']
                GoodRateShow = data['GoodRateShow']
                GoodCount = data['GoodCount']
                GeneralCount = data['GeneralCount']
                PoorCount = data['PoorCount']
            except requests.exceptions.ConnectionError:
                print('Timeout ConnectionError1:json_url_connent')
                time.sleep(600)
                try:
                    data = requests.get(json_url_connent, timeout=1000).json()
                    data = data['CommentsCount'][0]
                    CommentCount = data['CommentCount']
                    GoodRateShow = data['GoodRateShow']
                    GoodCount = data['GoodCount']
                    GeneralCount = data['GeneralCount']
                    PoorCount = data['PoorCount']
                except requests.exceptions.ConnectionError:
                    print('Timeout ConnectionError2:json_url_connent')
                    time.sleep(3600)
                    data = requests.get(json_url_connent, timeout=1000).json()
                    data = data['CommentsCount'][0]
                    CommentCount = data['CommentCount']
                    GoodRateShow = data['GoodRateShow']
                    GoodCount = data['GoodCount']
                    GeneralCount = data['GeneralCount']
                    PoorCount = data['PoorCount']
                except requests.exceptions.ReadTimeout:
                    print('Timeout,ReadTimeout:', json_url_connent)
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_connent)

            ## fetch the product review keywords
            json_url_keyword = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv79456&score=0&sortType=5&pageSize=10&isShadowSku=0&page=0&productId=' + ProductID
            # r = requests.get(json_url_keyword,timeout = 100)
            # time.sleep(2)
            # html = r.content.decode('gb2312', 'ignore')
            # keywords = re.findall(r',"name":"(.*?)",', html)
            # keyword = ' '.join(keywords)

            try:
                r = requests.get(json_url_keyword, timeout=1000)
                html = r.content.decode('gb2312', 'ignore')
                keywords = re.findall(r',"name":"(.*?)",', html)
                keyword = ' '.join(keywords)
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout ConnectionError1:json_url_keyword')
                time.sleep(600)
                try:
                    r = requests.get(json_url_keyword, timeout=1000)
                    html = r.content.decode('gb2312', 'ignore')
                    keywords = re.findall(r',"name":"(.*?)",', html)
                    keyword = ' '.join(keywords)
                except requests.exceptions.ConnectionError:  # this is important
                    print('Timeout ConnectionError2:json_url_keyword')
                    time.sleep(3600)
                    r = requests.get(json_url_keyword, timeout=1000)
                    html = r.content.decode('gb2312', 'ignore')
                    keywords = re.findall(r',"name":"(.*?)",', html)
                    keyword = ' '.join(keywords)
                except requests.exceptions.ReadTimeout:
                    print('Timeout,ReadTimeout:', json_url_keyword)
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_keyword)

            # ## fetch product parameters (air-cooler fan variant)
            # product_typ_url="https://item.jd.com/"+ ProductID+".html"
            # r = requests.get(product_typ_url,timeout = 100)
            # time.sleep(2)
            # soup = BeautifulSoup(r.text, 'lxml')
            # ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
            # ips2 = soup.find_all('div', class_="detail-elevator-floor")
            # ips = [ips1, ips2]
            # try:
            #     for i in ips:
            #         type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
            #         break
            # except IndexError:
            #     type = "没有对应数据"
            #     print(type)

            ## fetch product parameters (juicer variant)
            product_typ_url = "https://item.jd.com/" + ProductID + ".html"
            try:
                r = requests.get(product_typ_url, timeout=1000)
                soup = BeautifulSoup(r.text, 'lxml')
                try:
                    shop_name = re.findall(
                        r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">',
                        str(soup))[0]
                except IndexError:
                    shop_name = "none"
                try:
                    brand = soup.find_all('ul', id="parameter-brand")
                    brand = re.findall(r'<li title="(.*?)"', str(brand))[0]
                except IndexError:
                    brand = "None"
                ips1 = soup.find_all('ul',
                                     class_="parameter2 p-parameter-list")
                ips2 = soup.find_all('div', class_="detail-elevator-floor")
                ips = [ips1, ips2]
                for i in ips:
                    type = re.findall(r'<li title=".*?">.*?:(.*?)<', str(ips))
                    try:
                        X_type = re.findall(r'<li title=".*?">.*?吸头:(.*?)<',
                                            str(ips))[0]
                    except IndexError:
                        X_type = "none"
                    # try:
                    #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     F_type= "none"
                    # try:
                    #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     Y_type= "none"
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout ConnectionError1:product_typ_url')
                time.sleep(600)
                try:
                    r = requests.get(product_typ_url, timeout=1000)
                    soup = BeautifulSoup(r.text, 'lxml')
                    try:
                        shop_name = re.findall(
                            r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">',
                            str(soup))[0]
                    except IndexError:
                        shop_name = "none"
                    try:
                        brand = soup.find_all('ul', id="parameter-brand")
                        brand = re.findall(r'<li title="(.*?)"', str(brand))[0]

                    except IndexError:
                        brand = "None"
                    ips1 = soup.find_all('ul',
                                         class_="parameter2 p-parameter-list")
                    ips2 = soup.find_all('div', class_="detail-elevator-floor")
                    ips = [ips1, ips2]
                    for i in ips:
                        type = re.findall(r'<li title=".*?">.*?:(.*?)<',
                                          str(ips))
                        try:
                            X_type = re.findall(
                                r'<li title=".*?">.*?吸头:(.*?)<', str(ips))[0]
                        except IndexError:
                            X_type = "none"
                        # try:
                        #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                        # except IndexError:
                        #     F_type = "none"
                        # try:
                        #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                        # except IndexError:
                        #     Y_type = "none"
                except requests.exceptions.ConnectionError:  # this is important
                    print('Timeout ConnectionError2:product_typ_url')
                    time.sleep(3600)
                    r = requests.get(product_typ_url, timeout=1000)
                    soup = BeautifulSoup(r.text, 'lxml')
                    try:
                        shop_name = re.findall(
                            r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">',
                            str(soup))[0]
                    except IndexError:
                        shop_name = "none"
                    try:
                        brand = soup.find_all('ul', id="parameter-brand")
                        brand = re.findall(r'<li title="(.*?)"', str(brand))[0]
                    except IndexError:
                        brand = "None"
                    ips1 = soup.find_all('ul',
                                         class_="parameter2 p-parameter-list")
                    ips2 = soup.find_all('div', class_="detail-elevator-floor")
                    ips = [ips1, ips2]
                    for i in ips:
                        type = re.findall(r'<li title=".*?">.*?:(.*?)<',
                                          str(ips))
                        try:
                            X_type = re.findall(
                                r'<li title=".*?">.*?吸头:(.*?)<', str(ips))[0]
                        except IndexError:
                            X_type = "none"
                        # try:
                        #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                        # except IndexError:
                        #     F_type = "none"
                        # try:
                        #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                        # except IndexError:
                        #     Y_type = "none"
                except requests.exceptions.ReadTimeout:
                    print('Timeout,ReadTimeout:', product_typ_url)
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', product_typ_url)

            # ##item
            item['p_Name'] = p_Name
            item['shop_name'] = shop_name
            item['ProductID'] = ProductID
            item['price'] = price
            item['PreferentialPrice'] = PreferentialPrice

            item['CommentCount'] = CommentCount
            item['GoodRateShow'] = GoodRateShow
            item['GoodCount'] = GoodCount
            item['GeneralCount'] = GeneralCount
            item['PoorCount'] = PoorCount
            item['keyword'] = keyword

            item['brand'] = brand
            item['type'] = type
            item['X_type'] = X_type
            # item['F_type'] = F_type
            # item['Y_type'] = Y_type
            yield item

        # donetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # print("Sleep time start......")
        # time.sleep(5)
        # print("donetime is:", donetime)

        nextLink = selector.xpath(
            '//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            yield Request('https://list.jd.com/' + nextLink,
                          callback=self.parse)
Example #18
    def parse(self, response):
        item = JdspiderItem()
        selector = Selector(response)
        Products = selector.xpath('//*[@id="plist"]/ul/li')
        for each in Products:
            # p_Name = each.xpath('div/div[@class="p-name"]/a/em/text()').extract()
            p_Name = each.xpath(
                'div/div[@class="p-name p-name-type-2"]/a/em/text()').extract(
                )
            shop_name = each.xpath(
                'div/div[@class="p-shop"]/@data-shop_name').extract()
            temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
            temphref = str(temphref)
            ProductID = re.search(r'com/(.*?)\.html', temphref).group(1)
            # ProductID='1069555'
            ## fetch the price
            json_url_p = 'http://p.3.cn/prices/mgets?skuIds=J_' + ProductID
            try:
                r = requests.get(json_url_p).text
                time.sleep(1)
                data = json.loads(r)[0]
                price = data['m']
                PreferentialPrice = data['p']
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout')
                time.sleep(600)
                r = requests.get(json_url_p).text
                time.sleep(1)
                data = json.loads(r)[0]
                price = data['m']
                PreferentialPrice = data['p']

            ## fetch the total comment counts
            json_url_connent = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=' + ProductID
            try:
                r = requests.get(json_url_connent).text
                time.sleep(1)
                data = json.loads(r)
                data = data['CommentsCount'][0]
                CommentCount = data['CommentCount']
                GoodRateShow = data['GoodRateShow']
                GoodCount = data['GoodCount']
                GeneralCount = data['GeneralCount']
                PoorCount = data['PoorCount']
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout')
                time.sleep(600)
                r = requests.get(json_url_connent).text
                time.sleep(1)
                data = json.loads(r)
                data = data['CommentsCount'][0]
                CommentCount = data['CommentCount']
                GoodRateShow = data['GoodRateShow']
                GoodCount = data['GoodCount']
                GeneralCount = data['GeneralCount']
                PoorCount = data['PoorCount']

            json_url_keyword = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv79456&score=0&sortType=5&pageSize=10&isShadowSku=0&page=0&productId=' + ProductID

            try:
                r = requests.get(json_url_keyword)
                html = r.content.decode('gb2312', 'ignore')
                keywords = re.findall(r',"name":"(.*?)",', html)
                keyword = ' '.join(keywords)

            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout')
                time.sleep(600)
                r = requests.get(json_url_keyword)
                html = r.content.decode('gb2312', 'ignore')
                keywords = re.findall(r',"name":"(.*?)",', html)
                keyword = ' '.join(keywords)

            ## fetch product parameters
            product_typ_url = "https://item.jd.com/" + ProductID + ".html"
            try:
                r = requests.get(product_typ_url)
                time.sleep(1)
                soup = BeautifulSoup(r.text, 'lxml')
                ips1 = soup.find_all('ul',
                                     class_="parameter2 p-parameter-list")
                ips2 = soup.find_all('div', class_="detail-elevator-floor")
                ips = [ips1, ips2]
                try:
                    for i in ips:
                        type = re.findall(r'<li title=".*?">类别:(.*?)<',
                                          str(ips))[0]
                        # control_mode = re.findall(r'<li title=".*?">控制方式:(.*?)<', str(ips))[0]
                        FBnumber = re.findall(r'<li title=".*?">扇叶片数:(.*?)<',
                                              str(ips))[0]
                        break
                except IndexError:
                    type = "没有对应数据"
                    print(type)
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout')
                time.sleep(600)
                r = requests.get(product_typ_url)
                time.sleep(1)
                soup = BeautifulSoup(r.text, 'lxml')
                ips1 = soup.find_all('ul',
                                     class_="parameter2 p-parameter-list")
                ips2 = soup.find_all('div', class_="detail-elevator-floor")
                ips = [ips1, ips2]
                try:
                    for i in ips:
                        type = re.findall(r'<li title=".*?">类别:(.*?)<',
                                          str(ips))[0]
                        # control_mode = re.findall(r'<li title=".*?">控制方式:(.*?)<', str(ips))[0]
                        FBnumber = re.findall(r'<li title=".*?">扇叶片数:(.*?)<',
                                              str(ips))[0]
                        break
                except IndexError:
                    type = "没有对应数据"
                    print(type)

            # ##item
            item['p_Name'] = p_Name
            item['shop_name'] = shop_name
            item['ProductID'] = ProductID
            item['price'] = price
            item['PreferentialPrice'] = PreferentialPrice

            item['CommentCount'] = CommentCount
            item['GoodRateShow'] = GoodRateShow
            item['GoodCount'] = GoodCount
            item['GeneralCount'] = GeneralCount
            item['PoorCount'] = PoorCount
            item['keyword'] = keyword

            item['type'] = type
            # item['control_mode'] = control_mode
            item['FBnumber'] = FBnumber
            yield item

        donetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        print("Sleep time start......")
        time.sleep(300)
        print("donetime is:", donetime)

        nextLink = selector.xpath(
            '//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            yield Request('https://list.jd.com/' + nextLink,
                          callback=self.parse)
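Examples #17 and #18 copy the same request-and-parse block into every except branch, tripling each fetch. A hedged sketch of a retry helper that collapses the pattern (the 600 s and 3600 s back-offs are taken from the examples; the helper name is hypothetical, and it returns None instead of raising on final failure):

    import time
    import requests

    def get_with_retry(url, waits=(600, 3600), timeout=1000):
        # One initial attempt, then one more attempt after each wait,
        # mirroring the sleep(600)/sleep(3600) ladder above.
        for wait in (0,) + tuple(waits):
            if wait:
                print('retrying in %ss: %s' % (wait, url))
                time.sleep(wait)
            try:
                return requests.get(url, timeout=timeout)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout):
                continue
        return None

    # usage, replacing one nested block from Example #17:
    # r = get_with_retry(json_url_p)
    # if r is not None:
    #     data = r.json()[0]
    #     price = data['m']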