コード例 #1
0
 def parseLicense(self, response):
     """Parse the licence JSON endpoint and persist the store document.

     Expects ``response.meta["document"]`` to hold the partially-built
     store document; fills in the qualification fields, attaches licence
     screenshots, then saves it to ``self.storesColl``.
     """
     document = response.meta["document"]
     try:
         appData = json.loads(response.text)["data"]
     except (ValueError, KeyError, TypeError):
         # Endpoint changed or returned non-JSON -- log and give up.
         # (Narrowed from a bare ``except:``.)
         return self.log("证书接口改版,返回非json", logging.ERROR)
     document["企业资质"]["管理等级"] = appData.get("level", "")
     document["企业资质"]["单位名称"] = appData.get("name", "")
     document["企业资质"]["许可证号"] = appData.get("licenseNo", "")
     document["企业资质"]["法定代表人"] = appData.get("legalRepresentative", "")
     document["企业资质"]["经营地址"] = appData.get("address", "")
     document["企业资质"]["有效期"] = appData.get("validDate", "")
     # Licence screenshots: business licence and catering-service licence
     # (the two sections were identical copy-paste apart from these pairs).
     for urlField, licenseType in (("businessLicenceImgUrl", "营业执照"),
                                   ("restaurantLicenceImgUrl", "餐饮服务许可证")):
         picUri = appData.get(urlField)
         if not picUri:
             continue
         base64img = img2base64(picUri)
         # An empty data-URI prefix means the download produced no bytes.
         if base64img != "data:image/png;base64,":
             document["证件截图"].append(
                 Image({
                     "url": picUri,
                     # Reuse the encoded image; the original called
                     # img2base64() a second time per picture.
                     "base64": base64img,
                     "licenseType": licenseType
                 }))
     Store(document).save(self.storesColl)
コード例 #2
0
ファイル: 苏宁爬虫.py プロジェクト: CzaOrz/MiniTools
 def parseLicense(self, response):
     """Parse the Suning licence page.

     Best-effort: enrich ``document`` with dynamic store scores from the
     star side-API, then extract the licence image URL. On success the
     document is saved; otherwise the captcha request is retried.
     """
     storeId = response.meta["storeId"]
     tryCount = response.meta["tryCount"]
     document = response.meta["document"]
     try:
         # Score/company enrichment is best effort: a failure here must
         # not prevent the licence screenshot below from being saved.
         starInfo = requests.get(self.star_uri.format(storeId)).text
         starInfo = re.search('.*?({.*?})', starInfo).group(1)
         starInfo = json.loads(starInfo)
         document["店铺动态评分"]["用户评价"] = starInfo.get("Qstar", "")
         document["店铺动态评分"]["物流时效"] = starInfo.get("Astar", "")
         document["店铺动态评分"]["售后服务"] = starInfo.get("Dstar", "")
         document["店铺动态评分"]["用户评价-高于同行"] = starInfo.get("Qpercent", "")
         document["店铺动态评分"]["物流时效-高于同行"] = starInfo.get("Apercent", "")
         document["店铺动态评分"]["售后服务-高于同行"] = starInfo.get("Dpercent", "")
         document["企业资质"]["客服电话"] = starInfo.get("telPhone", "")
         document["企业资质"]["公司名称"] = starInfo.get("companyName", "")
         document["企业资质"]["国家"] = starInfo.get("countryName", "")
         document["企业资质"]["省"] = starInfo.get("companyProvince", "")
         document["企业资质"]["城市"] = starInfo.get("companyCity", "")
         document["企业资质"]["地址"] = starInfo.get("companyAddress", "")
     except Exception:
         # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
         # (and GeneratorExit in this generator) are no longer swallowed.
         pass
     # NOTE(review): ``rea`` is a project helper; ``.group(1)`` is called
     # unconditionally here (and throughout the file), so presumably it
     # never returns None on a failed match -- confirm.
     picUri = rea.search('^"(.*)"$', response.text).group(1)
     if picUri:
         picUri = response.urljoin(picUri)
         document["证件截图"].append(
             Image({
                 "url": picUri,
                 "base64": img2base64(picUri)
             }))
         Store(document).save(self.storesColl)
     else:
         self.log("验证码失败或改版...重试: {} storeId: {}".format(tryCount, storeId),
                  logging.WARNING)
         yield self.captchaRequest(storeId, tryCount + 1, document=document)
コード例 #3
0
 def parseMeiShi(self, response):
     """Parse a Meituan food list page.

     Queues a licence request for every previously-unseen store and
     paginates to the next page while new stores keep appearing.
     """
     try:
         # Raw string so \s stays a regex escape rather than a
         # (deprecated) invalid string escape.
         appData = rea.search(r'window._appState\s*=\s*({.*?});\s*</script>',
                              response.text, re.S).group(1)
         appData = json.loads(appData)
         stores = appData["poiLists"]["poiInfos"]
     except Exception:
         # Narrowed from a bare ``except:``.
         return self.log("页面改版,没有获取到主体列表", logging.ERROR)
     storesNew = 0  # (dropped the unused ``goodsNew`` counter)
     for store in stores:
         storeName = store.get('title', '')
         storeId = str(store.get('poiId', ''))
         if not storeId:
             continue
         if storeId not in self.tempStoreSet:
             self.tempStoreSet.add(storeId)
             if self.storesColl.count({"店铺ID": storeId}):
                 continue  # already persisted by an earlier run
             storesNew += 1
             storeUrl = self.search_all_store_uri.format(storeId)
             document = {
                 "店铺名称": storeName,
                 "店铺网址": storeUrl,
                 "店铺首页截图": [],
                 "店铺类别": "",
                 "所在地": "",
                 "证件截图": [],
                 "企业资质": dict(),
                 "店铺标签": "",
                 "店铺动态评分": dict(),
                 "下载时间": datetime.datetime.now(),
                 "店铺ID": storeId,
                 "开店信息": dict()
             }
             # Front image; skip empty base64 payloads (failed download).
             picUri = store.get("frontImg", "")
             if picUri:
                 base64img = img2base64(picUri)
                 if base64img != "data:image/png;base64,":
                     document["店铺首页截图"].append(
                         Image({
                             "url": picUri,
                             "base64": base64img
                         }))
             # Dynamic store score
             document["店铺动态评分"]["平均评分"] = store.get("avgScore", "")
             # Qualification
             document["企业资质"]["店铺地址"] = store.get("address", "")
             yield FormRequest(self.license_uri,
                               self.parseLicense,
                               formdata={"poiId": storeId},
                               meta={"document": document})
     self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
              logging.INFO)
     if storesNew:
         # New stores on this page -> fetch the next result page.
         self.page += 1
         yield Request(self.searchKey + 'pn{}/'.format(self.page),
                       self.parseMeiShi)
コード例 #4
0
ファイル: 卷皮网爬虫.py プロジェクト: CzaOrz/MiniTools
 def getFoodsLicense(self, response):
     """Collect food-licence screenshots from the page, then save the store."""
     document = response.meta['document']
     sources = response.xpath('//*[@class="main"]//img/@src').extract()
     for src in sources:
         absolute = response.urljoin(src)
         encoded = img2base64(absolute)
         # An empty data-URI prefix means the image download failed.
         if encoded == "data:image/png;base64,":
             continue
         document["证件截图"].append(
             Image({
                 "url": absolute,
                 "base64": encoded,
                 "licenseType": "食品许可证"
             }))
     Store(document).save(self.storesColl)
コード例 #5
0
ファイル: 卷皮网爬虫.py プロジェクト: CzaOrz/MiniTools
 def getLicense(self, response):
     """Collect business-licence screenshots into ``document`` and save it.

     ``licenseId`` is read from meta for the (currently disabled)
     food-licence follow-up request below.
     """
     document = response.meta['document']
     licenseId = response.meta['licenseId']
     for picUri in response.xpath('//*[@class="main"]//img/@src').extract():
         picUri = response.urljoin(picUri)
         base64img = img2base64(picUri)
         # Skip images whose download produced an empty base64 payload.
         if base64img != "data:image/png;base64,":
             document["证件截图"].append(
                 Image({
                     "url": picUri,
                     "base64": base64img,
                     "licenseType": "营业执照"
                 }))
     # Disabled follow-up for the food licence:
     # yield FormRequest(self.food_license_uri, formdata={"seller": licenseId}, callback=self.getFoodsLicense, meta={
     #     "licenseId": licenseId,
     #     "document": document
     # })
     # Removed leftover debug code that pprint'ed the whole document.
     Store(document).save(self.storesColl)
コード例 #6
0
ファイル: 苏宁爬虫.py プロジェクト: CzaOrz/MiniTools
    def goodsPageParse(self, response):
        """
        商品页面解析: extract goods details, parameter table and promo
        images into the document from meta, then persist it.
        :param response:
        :return:
        """
        document = response.meta["document"]
        details = document["详情信息"]["商品详情"]

        # Detail container: each <li> holds "key: value" (CN or ASCII colon).
        for node in response.xpath(
                '//*[@class="prod-detail-container"]//li'):
            text = jxpath(node, './/text()')
            key, value = rea.search('(.*?)[::](.*)', text, re.S).groups()
            if key and value:
                details[key.strip()] = value.strip()

        # Parameter table: first two cells of each row are key / value.
        for row in response.xpath('//*[@id="J-procon-param"]//tr'):
            cells = row.xpath('./td')
            if len(cells) < 2:
                continue
            key = jxpath(cells[0], './/text()')
            value = jxpath(cells[1], './/text()')
            if key and value:
                details[key] = value

        # Promo images are lazy-loaded; pull the URLs out of the raw HTML.
        for raw in re.findall('<img onload=".*?src.?="(.*?)"',
                              response.text):
            if not raw:
                continue
            uri = response.urljoin(raw)
            document["宣传图片"].append(
                Image({
                    "url": uri,
                    "base64": img2base64(uri)
                }))
        Good(document).save(self.goodsColl)
コード例 #7
0
    def parse(self, response):
        """Parse a Meituan search-result page.

        Records empty searches, then queues a licence request for each
        previously-unseen store found on the page.
        """
        if "页面暂时无法访问" in response.text:
            return self.log("出现反爬,返回404页面", logging.ERROR)
        if "没有符合条件的商家" in response.text:
            self.emptySearchColl.insert_one({
                "type": self.collName,
                "key": self.searchKey
            })
            return self.log("未查询商家,记录到空查询", logging.ERROR)

        storesInfo = {}
        try:
            # Raw string keeps \s a regex escape; the dot in
            # "window.AppData" is escaped so only the literal name matches.
            appData = rea.search(r'window\.AppData\s*=\s*({.*?});\s*</script>',
                                 response.text, re.S).group(1)
            appData = json.loads(appData)
            data = appData["data"]["searchResult"]
            for store in data:
                storesInfo[str(store['id'])] = deepcopy(store)
        except Exception:
            # Narrowed from a bare ``except:``.
            return self.log("页面改版,没有获取到主体列表", logging.ERROR)
        storesNew = 0  # (dropped the unused ``goodsNew`` counter)
        stores = response.xpath('//*[@class="common-list-main"]/div')
        for store in stores:
            storeName = xpath(
                store,
                './div[@class="default-card"]//div[@class="list-item-desc"]/div/a/text()'
            )
            storeUrl = xpath(store, './div[@class="default-card"]/div/a/@href')
            if not storeUrl:
                continue
            storeUrl = response.urljoin(storeUrl)
            storeId = rea.search(r'.*?/(\d+?)/', storeUrl).group(1)
            if not storeId:
                continue
            if storeId not in self.tempStoreSet:
                self.tempStoreSet.add(storeId)
                if self.storesColl.count({"店铺ID": storeId}):
                    continue  # already persisted by an earlier run
                if storesNew > 3:
                    continue  # cap the new stores processed per page
                storesNew += 1
                detailInfo = storesInfo.get(storeId)
                if not detailInfo:
                    continue
                document = {
                    "店铺名称": storeName,
                    "店铺网址": storeUrl,
                    "店铺首页截图": [],
                    "店铺类别": "",
                    "所在地": "",
                    "证件截图": [],
                    "企业资质": dict(),
                    "店铺标签": "",
                    "店铺动态评分": dict(),
                    "下载时间": datetime.datetime.now(),
                    "店铺ID": storeId,
                    "开店信息": dict()
                }
                # Front image: drop the resize path segment from the CDN URL.
                picUri = detailInfo.get("imageUrl", "")
                picUri = re.sub(r'(https?://.*?)/.*?/(.*)', r'\1/\2', picUri)
                if picUri:
                    base64img = img2base64(picUri)
                    # Skip empty base64 payloads (failed download).
                    if base64img != "data:image/png;base64,":
                        document["店铺首页截图"].append(
                            Image({
                                "url": picUri,
                                "base64": base64img
                            }))
                # Dynamic store score
                document["店铺动态评分"]["平均评分"] = detailInfo.get("avgscore", "")
                # Store tag
                document["店铺标签"] = detailInfo.get("backCateName", "")
                # Qualification
                document["企业资质"]["店铺地址"] = detailInfo.get("address", "")
                yield FormRequest(self.license_uri,
                                  self.parseLicense,
                                  formdata={"poiId": storeId},
                                  priority=1,
                                  meta={"document": document})
        self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
                 logging.INFO)
コード例 #8
0
ファイル: 京东爬虫.py プロジェクト: CzaOrz/MiniTools
    def parseLicense(self, response):
        """
        证书页面解析: handle the JD qualification page -- retry on captcha,
        extract licence images, qualification fields and store scores on
        success, and record redirects to the homepage as errors.
        :param response:
        :return:
        """
        storeId = response.meta["storeId"]
        tryCount = response.meta["tryCount"]
        document = response.meta["document"]
        if "验证码" in response.text:
            self.log("验证码失败了...重试: {} storeId: {}".format(tryCount, storeId),
                     logging.WARNING)
            if "京东商城网店经营者资质信息" in response.text:
                yield self.captchaRequest(storeId,
                                          tryCount + 1,
                                          document=document)
            elif "京东国际网店经营者资质信息" in response.text:  # cross-border shop
                self.log("转跨境电商验证码: {}".format(storeId))
                yield self.captchaRequest(storeId,
                                          tryCount + 1,
                                          document=document,
                                          uri=self.cross_captcha_uri())
            else:
                self.log("证书页面改版了, 当前url: {}".format(response.url),
                         logging.ERROR)
        elif "京东商城网店经营者营业执照信息" in response.text \
                or "京东国际网店经营者资质信息" in response.text:
            # NOTE(review): this selects @src directly on the element with
            # class "qualification-img" -- presumably those are <img> tags;
            # confirm against the live page.
            for picUri in response.xpath(
                    '//*[@class="qualification-img"]/@src').extract():
                picUri = response.urljoin(picUri)
                document["证件截图"].append(
                    Image({
                        "url": picUri,
                        "base64": img2base64(picUri)
                    }))
            # Each <li> holds "key: value" (CN or ASCII colon).
            for li in response.xpath('//*[@class="jScore"]//li'):
                text = jxpath(li, './/text()')
                key, value = rea.search('(.*?)[::](.*)', text, re.S).groups()
                if all((key, value)):
                    document["企业资质"][strip_s(key)] = strip_s(value)

            # Store score: raw string so \d stays a regex escape.
            appId = xpath(response, '//*[@id="pageInstance_appId"]/@value')
            if appId:
                text = requests.get(self.score_uri.format(appId)).text
                for key, value in re.findall(
                        r'(用户评价|物流履约|售后服务|服务态度)[::].*?(\d+\.\d+)', text, re.S):
                    document["店铺动态评分"][key] = value
            Store(document).save(self.storesColl)
        elif re.match(r"^https?://www\.jd\.com/?$", response.url):
            # Dots escaped so only the literal JD homepage matches.
            self.log("跳到首页了, 当前url: {}".format(response.url), logging.ERROR)
            self.errorStoreColl.insert_one({
                "type": self.collName,
                "key": self.searchKey,
                "msg": "跳转到了首页",
                "url": self.license_uri.format(storeId)
            })
            # Save what we have unless the store was already persisted.
            if not self.storesColl.count({"店铺ID": document["店铺ID"]}):
                Store(document).save(self.storesColl)
        else:
            self.log("可能跳到其他页面或者改版了, 当前url: {}".format(response.url),
                     logging.ERROR)
コード例 #9
0
ファイル: 京东爬虫.py プロジェクト: CzaOrz/MiniTools
    def goodsPageParse(self, response):
        """
        商品页面解析: enrich the goods document with after-sale notes,
        price, review count, detail table and promo images, then save it.
        All regex literals are raw strings so \s/\d/\[ stay regex escapes
        instead of (deprecated) invalid string escapes.
        :param response:
        :return:
        """
        document = response.meta["document"]

        # Goods has been pulled off the shelf.
        if "该商品已下柜" in response.text:
            document["售后信息"]["说明"].append("该商品已下柜")

        # After-sale notes: ids embedded in the page JS feed a side API.
        skuid = re.search(r"skuid:\s*(\d+?),", response.text).group(1)
        venderId = re.search(r"venderId:\s*(\d+?),", response.text).group(1)
        cat = re.search(r"cat:\s*\[(.+?)\]", response.text).group(1)
        instruction = requests.get(self.instru_uri.format(
            skuid, venderId, cat)).text
        document["售后信息"]["说明"].extend(
            re.findall(r'"showName":\s*"(.*?)"', instruction))
        document["价格信息"]["京东价"] = rea.search(r'"jdPrice":.*?"op":\s*"(.*?)"',
                                             instruction).group(1) or ""

        # Review count from the comment API.
        comment_text = requests.get(self.comment_uri.format(skuid)).text
        document["累计评价"] = rea.search(r'"CommentCount":\s*(\d+),',
                                      comment_text).group(1) or ""

        # After-sale: additional notes.
        for instru in response.xpath('//*[@class="more-con"]//li'):
            document["售后信息"]["更多说明"].append(jxpath(instru, './/text()'))

        # Detail table: each <li> holds "key: value" (CN or ASCII colon).
        for goodsInfo in response.xpath('//*[@class="p-parameter"]//li'):
            key, value = rea.search('(.*?)[::](.*)',
                                    jxpath(goodsInfo, './/text()'),
                                    re.S).groups()
            if all((key, value)):
                document["详情信息"]["商品详情"][key.strip()] = value.strip()

        # Promo images below the detail section (separate JSON endpoint).
        desc = rea.search(r"desc:\s*'(//.+?)'", response.text).group(1)
        if desc:
            try:
                description = requests.get(response.urljoin(desc)).json()
                for picUri in re.findall('(?:src|data-lazyload)="(.*?)"',
                                         description["content"]):
                    picUri = response.urljoin(picUri)
                    document["宣传图片"].append(
                        Image({
                            "url": picUri,
                            "base64": img2base64(picUri)
                        }))
            except Exception:
                # Narrowed from a bare ``except:``; endpoint is best effort.
                self.log("商品页的宣传图片接口可能改版了", logging.WARNING)

        # After-sale guarantee block.
        guarantee = wenben(response, '//*[@id="guarantee"]')
        if guarantee:
            document["售后信息"]["更多说明"].append(guarantee)

        Good(document).save(self.goodsColl)
コード例 #10
0
ファイル: 京东爬虫.py プロジェクト: CzaOrz/MiniTools
    def parse(self, response):
        """Parse a JD search-result page.

        In goods mode queues goods-detail requests; otherwise queues
        captcha requests for previously-unseen stores.
        """
        if "抱歉,没有找到与" in response.text:
            return self.raiseAndCatchError()

        # (dropped ``storesNotMatch`` and the commented-out name-mismatch
        # check that was its only consumer)
        goodsNew = storesNew = 0
        stores = response.xpath('//*[@id="J_goodsList"]//li[@class="gl-item"]')
        for store in stores:
            storeUrl = xpath(store, './/*[@class="p-shop"]//a/@href')
            if not storeUrl:
                continue
            storeUrl = response.urljoin(storeUrl)
            # Raw string so \d and \. stay regex escapes.
            storeId = rea.search(r'index-(\d+)\.html', storeUrl).group(1)
            if not storeId:
                continue
            goodsUrl = xpath(store, './/*[@class="p-img"]/a/@href')
            if not goodsUrl:
                continue
            goodsUrl = response.urljoin(goodsUrl)
            title = xpath(store, './/*[@class="p-img"]/a/@title')
            content = wenben(store, './/*[@class="p-name p-name-type-2"]')
            storeName = xpath(store, './/*[@class="p-shop"]//a/text()')

            if self.allowGoods and goodsUrl not in self.tempGoodsSet:
                if goodsNew > 9:
                    continue  # cap new goods per page
                self.tempGoodsSet.add(goodsUrl)
                if not self.goodsColl.count({"商品网址": goodsUrl, "商品名称": title}):
                    document = {
                        "商品名称": title,
                        "商品网址": goodsUrl,
                        "商品描述": content,
                        "商品首页截图": [],
                        "宣传图片": [],
                        "商品类别": "",
                        "店铺名称": storeName,
                        "店铺网址": storeUrl,
                        "下载时间": datetime.datetime.now(),
                        "店铺ID": storeId,
                        "售后信息": {
                            "说明": [],
                            "更多说明": []
                        },
                        "详情信息": {
                            "商品详情": dict(),
                        },
                        "累计评价": "",
                        "商品销量": "",
                        "价格信息": dict()
                    }
                    for img in store.xpath(
                            './/*[@class="ps-main" or @class="p-img"]//img'
                    ):  # three different page layouts
                        picUri = xpath(
                            img,
                            './@src|.//@data-lazy-img|.//@source-data-lazy-img'
                        )
                        # Only relative URIs are taken (absolute ones are
                        # placeholders in the lazy-load markup).
                        if picUri and picUri.startswith("/"):
                            picUri = response.urljoin(picUri)
                            document["宣传图片"].append(
                                Image({
                                    "url": picUri,
                                    "base64": img2base64(picUri)
                                }))
                    yield Request(goodsUrl,
                                  callback=self.goodsPageParse,
                                  priority=1,
                                  meta={"document": document})
                    goodsNew += 1
            if storeId not in self.tempStoreSet:
                if self.allowGoods:
                    continue  # store crawling disabled in goods mode
                if storesNew > 2:
                    continue  # cap new stores per page
                self.tempStoreSet.add(storeId)
                if not self.storesColl.count({"店铺ID": storeId}):
                    document = {
                        "店铺名称": storeName,
                        "店铺网址": storeUrl,
                        "店铺首页截图": [],
                        "店铺类别": "",
                        "所在地": "",
                        "证件截图": [],
                        "企业资质": dict(),
                        "店铺标签": "",
                        "店铺动态评分": dict(),
                        "下载时间": datetime.datetime.now(),
                        "店铺ID": storeId,
                        "开店信息": dict()
                    }
                    yield self.captchaRequest(storeId, document=document)
                    storesNew += 1
        self.log("商品信息: 共{}条其中{}条未爬".format(len(stores), goodsNew),
                 logging.INFO)
        self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
                 logging.INFO)
コード例 #11
0
ファイル: 苏宁爬虫.py プロジェクト: CzaOrz/MiniTools
    def parse(self, response):
        """Parse a Suning search-result page.

        Records empty searches; in goods mode queues goods-detail
        requests, otherwise queues captcha requests for new stores.
        """
        if "没有找到相关商品" in response.text:
            self.log("此次查询已记录为异常:`{}` 查询结果为空,直接返回".format(self.searchKey),
                     logging.ERROR)
            return self.emptySearchColl.insert_one({
                "type": self.collName,
                "key": self.searchKey
            })

        goodsNew = storesNew = 0
        stores = response.xpath('//li')
        for store in stores:
            # Store link
            storeUrl = xpath(store, './/a[@class="store-name"]/@href')
            if not storeUrl:
                continue
            storeUrl = response.urljoin(storeUrl)
            try:
                # SECURITY(review): eval() runs page-supplied attribute text
                # as Python -- if sa-data is JSON-compatible, json.loads or
                # ast.literal_eval would be safer; confirm the attribute
                # format before replacing.
                storeData = eval(
                    xpath(store, './/a[@class="store-name"]/@sa-data'))
                storeId = storeData["shopid"]
            except:
                self.log("苏宁商品列表页可能改版了", logging.ERROR)
                continue
            if not storeId:
                continue
            goodsUrl = xpath(store, './/*[@class="img-block"]/a/@href')
            if not goodsUrl:
                continue
            goodsUrl = response.urljoin(goodsUrl)

            title = xpath(store, './/*[@class="title-selling-point"]/a/@title')
            content = wenben(store, './/*[@class="title-selling-point"]')
            storeName = xpath(store, './/a[@class="store-name"]/text()')

            if self.allowGoods and goodsUrl not in self.tempGoodsSet:
                if goodsNew > 9: continue  # cap new goods per page
                self.tempGoodsSet.add(goodsUrl)
                if not self.goodsColl.count({"商品网址": goodsUrl}):
                    document = {
                        "商品名称": title,
                        "商品网址": goodsUrl,
                        "商品描述": content,
                        "商品首页截图": [],
                        "宣传图片": [],
                        "商品类别": "",
                        "店铺名称": storeName,
                        "店铺网址": storeUrl,
                        "下载时间": datetime.datetime.now(),
                        "店铺ID": storeId,
                        "售后信息": {
                            "说明": [],
                            "更多说明": []
                        },
                        "详情信息": {
                            "商品详情": dict(),
                        }
                    }
                    # Only relative URIs are taken -- absolute ones are
                    # presumably lazy-load placeholders; confirm.
                    for picUri in store.xpath(
                            './/*[@class="res-img"]/div[@class="img-block"]/a/img/@src'
                    ).extract():
                        if picUri and picUri.startswith("/"):
                            picUri = response.urljoin(picUri)
                            document["宣传图片"].append(
                                Image({
                                    "url": picUri,
                                    "base64": img2base64(picUri)
                                }))
                    yield Request(goodsUrl,
                                  callback=self.goodsPageParse,
                                  priority=1,
                                  meta={"document": document})
                    goodsNew += 1
            if storeId not in self.tempStoreSet:
                if self.allowGoods: continue  # store crawl off in goods mode
                self.tempStoreSet.add(storeId)
                if not self.storesColl.count({"店铺ID": storeId}):
                    document = {
                        "店铺名称": storeName,
                        "店铺网址": storeUrl,
                        "店铺首页截图": [],
                        "店铺类别": "",
                        "所在地": "",
                        "证件截图": [],
                        "企业资质": dict(),
                        "店铺标签": "",
                        "店铺动态评分": dict(),
                        "下载时间": datetime.datetime.now(),
                        "店铺ID": storeId,
                        "开店信息": dict()
                    }
                    yield self.captchaRequest(storeId, document=document)
                    storesNew += 1
        scanned_printer(self, stores, storesNew)