def parseLicense(self, response):
    """Parse the licence JSON endpoint and persist the store document.

    Reads the partially-filled store document from ``response.meta["document"]``,
    copies the qualification fields found under the ``"data"`` key of the JSON
    payload into ``document["企业资质"]``, attaches the business-licence and
    restaurant-licence images (when present and downloadable) to
    ``document["证件截图"]`` and finally saves the document to
    ``self.storesColl``.

    :param response: response whose ``text`` is expected to be JSON and whose
        ``meta`` carries the ``document`` dict built by the listing callback.
    :return: ``None``; when the payload cannot be decoded the error is logged
        and nothing is saved.
    """
    document = response.meta["document"]
    try:
        appData = json.loads(response.text)["data"]
    except (AttributeError, KeyError, TypeError, ValueError):
        # Not JSON, or JSON without a "data" member: the endpoint changed.
        return self.log("证书接口改版,返回非json", logging.ERROR)

    qualification = document["企业资质"]
    for field, source_key in (
        ("管理等级", "level"),
        ("单位名称", "name"),
        ("许可证号", "licenseNo"),
        ("法定代表人", "legalRepresentative"),
        ("经营地址", "address"),
        ("有效期", "validDate"),
    ):
        qualification[field] = appData.get(source_key, "")

    # Business licence first, then the restaurant service licence.
    for source_key, licenseType in (
        ("businessLicenceImgUrl", "营业执照"),
        ("restaurantLicenceImgUrl", "餐饮服务许可证"),
    ):
        picUri = appData.get(source_key)
        if not picUri:
            continue
        # Download/encode once and reuse the result; a bare data-URI prefix
        # is treated as "no image" and skipped.
        base64img = img2base64(picUri)
        if base64img != "data:image/png;base64,":
            document["证件截图"].append(
                Image({
                    "url": picUri,
                    "base64": base64img,
                    "licenseType": licenseType,
                }))

    Store(document).save(self.storesColl)
def parseLicense(self, response):
    """Handle the captcha-protected licence response for one store.

    Reads ``storeId``, ``tryCount`` and ``document`` from ``response.meta``.
    First tries (best effort) to enrich the document with the dynamic scores
    and company details served by ``self.star_uri``. Then extracts the
    licence image path from the quoted response body: when present, the image
    is attached and the document is saved to ``self.storesColl``; otherwise
    the captcha is retried with an incremented try counter.

    :param response: licence response carrying the store context in ``meta``.
    :return: generator yielding a retry captcha request when no licence image
        path could be extracted.
    """
    storeId = response.meta["storeId"]
    tryCount = response.meta["tryCount"]
    document = response.meta["document"]

    # Best-effort enrichment: a failure here must not block the licence
    # handling below, but it is logged instead of being silently dropped.
    try:
        # The timeout keeps a stalled endpoint from blocking the crawler.
        starText = requests.get(self.star_uri.format(storeId), timeout=10).text
        starInfo = json.loads(re.search('.*?({.*?})', starText).group(1))
        for field, source_key in (
            ("用户评价", "Qstar"),
            ("物流时效", "Astar"),
            ("售后服务", "Dstar"),
            ("用户评价-高于同行", "Qpercent"),
            ("物流时效-高于同行", "Apercent"),
            ("售后服务-高于同行", "Dpercent"),
        ):
            document["店铺动态评分"][field] = starInfo.get(source_key, "")
        for field, source_key in (
            ("客服电话", "telPhone"),
            ("公司名称", "companyName"),
            ("国家", "countryName"),
            ("省", "companyProvince"),
            ("城市", "companyCity"),
            ("地址", "companyAddress"),
        ):
            document["企业资质"][field] = starInfo.get(source_key, "")
    except Exception as exc:
        self.log(
            "店铺评分接口请求或解析失败, storeId: {} 原因: {!r}".format(storeId, exc),
            logging.WARNING)

    # The licence endpoint answers with the image path wrapped in quotes.
    picUri = rea.search('^"(.*)"$', response.text).group(1)
    if picUri:
        picUri = response.urljoin(picUri)
        document["证件截图"].append(
            Image({
                "url": picUri,
                "base64": img2base64(picUri)
            }))
        Store(document).save(self.storesColl)
    else:
        self.log("验证码失败或改版...重试: {} storeId: {}".format(tryCount, storeId),
                 logging.WARNING)
        yield self.captchaRequest(storeId, tryCount + 1, document=document)
def parseMeiShi(self, response):
    """Parse a food-category listing page and schedule licence requests.

    Extracts the ``window._appState`` JSON embedded in the page, walks
    ``poiLists.poiInfos`` and, for every store not yet seen in this run
    (``self.tempStoreSet``) and not yet stored in ``self.storesColl``, builds
    a store document and yields a ``FormRequest`` to ``self.license_uri``
    handled by ``self.parseLicense``. When at least one new store was found,
    the next listing page is requested as well.

    :param response: listing page response.
    :return: generator of follow-up requests.
    """
    try:
        appData = rea.search(r'window._appState\s*=\s*({.*?});\s*</script>',
                             response.text, re.S).group(1)
        appData = json.loads(appData)
        stores = appData["poiLists"]["poiInfos"]
    except Exception:
        # Any failure here (no match, invalid JSON, missing keys) means the
        # page layout is no longer the one this parser understands.
        self.log("页面改版,没有获取到主体列表", logging.ERROR)
        return

    storesNew = 0
    for store in stores:
        storeName = store.get('title', '')
        storeId = str(store.get('poiId', ''))
        if not storeId:
            continue
        # Skip stores already handled in this run or already persisted.
        if storeId in self.tempStoreSet:
            continue
        self.tempStoreSet.add(storeId)
        if self.storesColl.count({"店铺ID": storeId}):
            continue
        storesNew += 1

        storeUrl = self.search_all_store_uri.format(storeId)
        document = {
            "店铺名称": storeName,
            "店铺网址": storeUrl,
            "店铺首页截图": [],
            "店铺类别": "",
            "所在地": "",
            "证件截图": [],
            "企业资质": dict(),
            "店铺标签": "",
            "店铺动态评分": dict(),
            "下载时间": datetime.datetime.now(),
            "店铺ID": storeId,
            "开店信息": dict()
        }

        # Store front image; a bare data-URI prefix means the download failed.
        picUri = store.get("frontImg", "")
        if picUri:
            base64img = img2base64(picUri)
            if base64img != "data:image/png;base64,":
                document["店铺首页截图"].append(
                    Image({
                        "url": picUri,
                        "base64": base64img
                    }))

        # Dynamic score and address come straight from the listing payload.
        document["店铺动态评分"]["平均评分"] = store.get("avgScore", "")
        document["企业资质"]["店铺地址"] = store.get("address", "")

        yield FormRequest(self.license_uri,
                          self.parseLicense,
                          formdata={"poiId": storeId},
                          meta={"document": document})

    self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
             logging.INFO)
    # Keep paging only while the current page still produced new stores.
    if storesNew:
        self.page += 1
        yield Request(self.searchKey + 'pn{}/'.format(self.page),
                      self.parseMeiShi)
def getFoodsLicense(self, response):
    """Attach the food-licence images found on the page and save the store.

    Every ``<img>`` under the element with class ``main`` is resolved to an
    absolute URL and encoded; images whose encoding is only the bare data-URI
    prefix are ignored. The document taken from ``response.meta`` is then
    saved to ``self.storesColl``.

    :param response: licence page response carrying ``document`` in ``meta``.
    :return: ``None``
    """
    store_doc = response.meta['document']
    image_sources = response.xpath('//*[@class="main"]//img/@src').extract()
    for relative_src in image_sources:
        absolute_src = response.urljoin(relative_src)
        encoded = img2base64(absolute_src)
        if encoded == "data:image/png;base64,":
            # Nothing usable was downloaded for this image.
            continue
        screenshot = Image({
            "url": absolute_src,
            "base64": encoded,
            "licenseType": "食品许可证"
        })
        store_doc["证件截图"].append(screenshot)
    Store(store_doc).save(self.storesColl)
def getLicense(self, response):
    """Attach the business-licence images found on the page and save the store.

    Every ``<img>`` under the element with class ``main`` is resolved to an
    absolute URL and encoded; images whose encoding is only the bare data-URI
    prefix are ignored. The document taken from ``response.meta`` is then
    saved to ``self.storesColl``.

    The leftover debug ``pprint`` of the whole document (which includes the
    base64 image payloads) and the unused ``licenseId`` local were removed.

    :param response: licence page response carrying ``document`` in ``meta``.
    :return: ``None``
    """
    document = response.meta['document']
    for picUri in response.xpath('//*[@class="main"]//img/@src').extract():
        picUri = response.urljoin(picUri)
        base64img = img2base64(picUri)
        if base64img == "data:image/png;base64,":
            # Nothing usable was downloaded for this image.
            continue
        document["证件截图"].append(
            Image({
                "url": picUri,
                "base64": base64img,
                "licenseType": "营业执照"
            }))
    Store(document).save(self.storesColl)
def goodsPageParse(self, response):
    """Parse a product detail page and save the goods document.

    Fills ``document["详情信息"]["商品详情"]`` from the "key:value" list items
    and from the two-column parameter table, collects the lazily loaded
    promotional images into ``document["宣传图片"]`` and saves the document to
    ``self.goodsColl``.

    :param response: product page response carrying ``document`` in ``meta``.
    :return: ``None``
    """
    goods_doc = response.meta["document"]

    # Detail info: list items of the form "label:content".
    for item in response.xpath(
            '//*[@class="prod-detail-container"]//li'):
        item_text = jxpath(item, './/text()')
        label, content = rea.search('(.*?)[::](.*)', item_text,
                                    re.S).groups()
        if not (label and content):
            continue
        goods_doc["详情信息"]["商品详情"][label.strip()] = content.strip()

    # Detail info: parameter table, first cell is the label, second the value.
    for row in response.xpath('//*[@id="J-procon-param"]//tr'):
        cells = row.xpath('./td')
        if len(cells) < 2:
            continue
        label = jxpath(cells[0], './/text()')
        content = jxpath(cells[1], './/text()')
        if not (label and content):
            continue
        goods_doc["详情信息"]["商品详情"][label] = content

    # Promotional images referenced by the lazy-loading <img> tags.
    lazy_sources = re.findall('<img onload=".*?src.?="(.*?)"', response.text)
    for source in lazy_sources:
        if not source:
            continue
        absolute_src = response.urljoin(source)
        promo_image = Image({
            "url": absolute_src,
            "base64": img2base64(absolute_src)
        })
        goods_doc["宣传图片"].append(promo_image)

    Good(goods_doc).save(self.goodsColl)
def parse(self, response):
    """Parse a store search-result page and schedule licence requests.

    Bails out early on the anti-crawling page and on an empty search (the
    latter is recorded in ``self.emptySearchColl``). Otherwise the
    ``window.AppData`` JSON embedded in the page is indexed by store id, the
    rendered result cards are walked, and for every store that is new to this
    run and to ``self.storesColl`` a store document is built and a
    ``FormRequest`` to ``self.license_uri`` is yielded.

    :param response: search-result page response.
    :return: generator of licence requests.
    """
    if "页面暂时无法访问" in response.text:
        return self.log("出现反爬,返回404页面", logging.ERROR)
    if "没有符合条件的商家" in response.text:
        self.emptySearchColl.insert_one({
            "type": self.collName,
            "key": self.searchKey
        })
        return self.log("未查询商家,记录到空查询", logging.ERROR)

    # Index the embedded JSON by store id so the HTML cards can be enriched.
    storesInfo = {}
    try:
        appData = rea.search(r'window.AppData\s*=\s*({.*?});\s*</script>',
                             response.text, re.S).group(1)
        appData = json.loads(appData)
        for store in appData["data"]["searchResult"]:
            storesInfo[str(store['id'])] = deepcopy(store)
    except Exception:
        # Any failure here (no match, invalid JSON, missing keys) means the
        # page layout is no longer the one this parser understands.
        return self.log("页面改版,没有获取到主体列表", logging.ERROR)

    storesNew = 0
    stores = response.xpath('//*[@class="common-list-main"]/div')
    for store in stores:
        storeName = xpath(
            store,
            './div[@class="default-card"]//div[@class="list-item-desc"]/div/a/text()'
        )
        storeUrl = xpath(store, './div[@class="default-card"]/div/a/@href')
        if not storeUrl:
            continue
        storeUrl = response.urljoin(storeUrl)
        storeId = rea.search(r'.*?/(\d+?)/', storeUrl).group(1)
        if not storeId:
            continue
        # Skip stores already handled in this run or already persisted.
        if storeId in self.tempStoreSet:
            continue
        self.tempStoreSet.add(storeId)
        if self.storesColl.count({"店铺ID": storeId}):
            continue
        # Cap on the number of new stores scheduled from one page.
        if storesNew > 3:
            continue
        storesNew += 1
        detailInfo = storesInfo.get(storeId)
        if not detailInfo:
            continue

        document = {
            "店铺名称": storeName,
            "店铺网址": storeUrl,
            "店铺首页截图": [],
            "店铺类别": "",
            "所在地": "",
            "证件截图": [],
            "企业资质": dict(),
            "店铺标签": "",
            "店铺动态评分": dict(),
            "下载时间": datetime.datetime.now(),
            "店铺ID": storeId,
            "开店信息": dict()
        }

        # Store image: drop the first path segment of the URL before fetching.
        picUri = detailInfo.get("imageUrl", "")
        picUri = re.sub(r'(https?://.*?)/.*?/(.*)', r'\1/\2', picUri)
        if picUri:
            base64img = img2base64(picUri)
            if base64img != "data:image/png;base64,":
                document["店铺首页截图"].append(
                    Image({
                        "url": picUri,
                        "base64": base64img
                    }))

        # Dynamic score, tag and address come from the embedded JSON entry.
        document["店铺动态评分"]["平均评分"] = detailInfo.get("avgscore", "")
        document["店铺标签"] = detailInfo.get("backCateName", "")
        document["企业资质"]["店铺地址"] = detailInfo.get("address", "")

        yield FormRequest(self.license_uri,
                          self.parseLicense,
                          formdata={"poiId": storeId},
                          priority=1,
                          meta={"document": document})

    self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
             logging.INFO)
def parseLicense(self, response):
    """Parse the store qualification (licence) page.

    Four outcomes are distinguished by the page content / final URL:

    * the captcha was rejected: retry with an incremented try counter, using
      the cross-border captcha endpoint for international stores;
    * the qualification page was served: collect the licence images, the
      "key:value" qualification entries and (best effort) the store scores,
      then save the document to ``self.storesColl``;
    * the request was redirected to the site home page: record the failure in
      ``self.errorStoreColl`` and save the document if it is not stored yet;
    * anything else: log an error.

    :param response: response carrying ``storeId``, ``tryCount`` and
        ``document`` in ``meta``.
    :return: generator yielding a retry captcha request when needed.
    """
    storeId = response.meta["storeId"]
    tryCount = response.meta["tryCount"]
    document = response.meta["document"]

    if "验证码" in response.text:
        self.log("验证码失败了...重试: {} storeId: {}".format(tryCount, storeId),
                 logging.WARNING)
        if "京东商城网店经营者资质信息" in response.text:
            yield self.captchaRequest(storeId, tryCount + 1, document=document)
        elif "京东国际网店经营者资质信息" in response.text:
            # Cross-border store: retry against the cross-border captcha URI.
            self.log("转跨境电商验证码: {}".format(storeId))
            yield self.captchaRequest(storeId,
                                      tryCount + 1,
                                      document=document,
                                      uri=self.cross_captcha_uri())
        else:
            self.log("证书页面改版了, 当前url: {}".format(response.url),
                     logging.ERROR)
    elif "京东商城网店经营者营业执照信息" in response.text \
            or "京东国际网店经营者资质信息" in response.text:
        for picUri in response.xpath(
                '//*[@class="qualification-img"]/@src').extract():
            picUri = response.urljoin(picUri)
            document["证件截图"].append(
                Image({
                    "url": picUri,
                    "base64": img2base64(picUri)
                }))
        for li in response.xpath('//*[@class="jScore"]//li'):
            text = jxpath(li, './/text()')
            key, value = rea.search('(.*?)[::](.*)', text, re.S).groups()
            if all((key, value)):
                document["企业资质"][strip_s(key)] = strip_s(value)
        # Store scores are auxiliary: a slow or failing score endpoint must
        # not prevent the already collected document from being saved.
        appId = xpath(response, '//*[@id="pageInstance_appId"]/@value')
        if appId:
            try:
                text = requests.get(self.score_uri.format(appId),
                                    timeout=10).text
            except requests.RequestException as exc:
                self.log("店铺评分接口请求失败: {!r}".format(exc),
                         logging.WARNING)
            else:
                for key, value in re.findall(
                        r'(用户评价|物流履约|售后服务|服务态度)[::].*?(\d+\.\d+)',
                        text, re.S):
                    document["店铺动态评分"][key] = value
        Store(document).save(self.storesColl)
    elif re.match(r"^https?://www\.jd\.com/?$", response.url):
        # Dots are escaped so only the real home page host matches.
        self.log("跳到首页了, 当前url: {}".format(response.url), logging.ERROR)
        self.errorStoreColl.insert_one({
            "type": self.collName,
            "key": self.searchKey,
            "msg": "跳转到了首页",
            "url": self.license_uri.format(storeId)
        })
        if not self.storesColl.count({"店铺ID": document["店铺ID"]}):
            Store(document).save(self.storesColl)
    else:
        self.log("可能跳到其他页面或者改版了, 当前url: {}".format(response.url),
                 logging.ERROR)
def goodsPageParse(self, response):
    """Parse a product page and save the completed goods document.

    Enriches the document taken from ``response.meta`` with: the delisted
    marker, the after-sales notes and price returned by ``self.instru_uri``,
    the comment count returned by ``self.comment_uri``, the "key:value"
    product parameters, the promotional images referenced by the description
    endpoint (best effort) and the guarantee text. The result is saved to
    ``self.goodsColl``.

    :param response: product page response carrying ``document`` in ``meta``.
    :return: ``None``
    :raises AttributeError: when ``skuid``, ``venderId`` or ``cat`` cannot be
        found in the page source.
    """
    document = response.meta["document"]

    # Product has been taken off the shelves.
    if "该商品已下柜" in response.text:
        document["售后信息"]["说明"].append("该商品已下柜")

    # After-sales notes and price are served by a separate endpoint keyed by
    # identifiers embedded in the page's inline script.
    skuid = re.search(r"skuid:\s*(\d+?),", response.text).group(1)
    venderId = re.search(r"venderId:\s*(\d+?),", response.text).group(1)
    cat = re.search(r"cat:\s*\[(.+?)\]", response.text).group(1)
    instruction = requests.get(self.instru_uri.format(
        skuid, venderId, cat)).text
    document["售后信息"]["说明"].extend(
        re.findall(r'"showName":\s*"(.*?)"', instruction))
    document["价格信息"]["京东价"] = rea.search(
        r'"jdPrice":.*?"op":\s*"(.*?)"', instruction).group(1) or ""

    # Comment count.
    comment_text = requests.get(self.comment_uri.format(skuid)).text
    document["累计评价"] = rea.search(r'"CommentCount":\s*(\d+),',
                                  comment_text).group(1) or ""

    # After-sales info: additional notes.
    for instru in response.xpath('//*[@class="more-con"]//li'):
        document["售后信息"]["更多说明"].append(jxpath(instru, './/text()'))

    # Detail info: "key:value" product parameters.
    for goodsInfo in response.xpath('//*[@class="p-parameter"]//li'):
        key, value = rea.search('(.*?)[::](.*)',
                                jxpath(goodsInfo, './/text()'), re.S).groups()
        if all((key, value)):
            document["详情信息"]["商品详情"][key.strip()] = value.strip()

    # Detail info: promotional images below the parameters, loaded from the
    # description endpoint. Best effort: failures are logged, not raised.
    desc = rea.search(r"desc:\s*'(//.+?)'", response.text).group(1)
    if desc:
        try:
            # The timeout keeps a stalled endpoint from blocking the crawler.
            description = requests.get(response.urljoin(desc),
                                       timeout=10).json()
            for picUri in re.findall('(?:src|data-lazyload)="(.*?)"',
                                     description["content"]):
                picUri = response.urljoin(picUri)
                document["宣传图片"].append(
                    Image({
                        "url": picUri,
                        "base64": img2base64(picUri)
                    }))
        except Exception:
            self.log("商品页的宣传图片接口可能改版了", logging.WARNING)

    # After-sales info: guarantee text.
    guarantee = wenben(response, '//*[@id="guarantee"]')
    if guarantee:
        document["售后信息"]["更多说明"].append(guarantee)

    Good(document).save(self.goodsColl)
def parse(self, response):
    """Parse a product search-result page.

    For every result item the store link/id and the product link are
    extracted. Depending on ``self.allowGoods`` either product documents
    (followed up by ``self.goodsPageParse``) or store documents (followed up
    through ``self.captchaRequest``) are scheduled, skipping anything already
    seen in this run or already present in the matching collection.

    :param response: search-result page response.
    :return: generator of follow-up requests.
    """
    if "抱歉,没有找到与" in response.text:
        return self.raiseAndCatchError()

    goodsNew = storesNew = 0
    stores = response.xpath('//*[@id="J_goodsList"]//li[@class="gl-item"]')
    for store in stores:
        storeUrl = xpath(store, './/*[@class="p-shop"]//a/@href')
        if not storeUrl:
            continue
        storeUrl = response.urljoin(storeUrl)
        storeId = rea.search(r'index-(\d+)\.html', storeUrl).group(1)
        if not storeId:
            continue
        goodsUrl = xpath(store, './/*[@class="p-img"]/a/@href')
        if not goodsUrl:
            continue
        goodsUrl = response.urljoin(goodsUrl)
        title = xpath(store, './/*[@class="p-img"]/a/@title')
        content = wenben(store, './/*[@class="p-name p-name-type-2"]')
        storeName = xpath(store, './/*[@class="p-shop"]//a/text()')

        if self.allowGoods and goodsUrl not in self.tempGoodsSet:
            # Cap on the number of new products scheduled from one page.
            if goodsNew > 9:
                continue
            self.tempGoodsSet.add(goodsUrl)
            if not self.goodsColl.count({"商品网址": goodsUrl, "商品名称": title}):
                document = {
                    "商品名称": title,
                    "商品网址": goodsUrl,
                    "商品描述": content,
                    "商品首页截图": [],
                    "宣传图片": [],
                    "商品类别": "",
                    "店铺名称": storeName,
                    "店铺网址": storeUrl,
                    "下载时间": datetime.datetime.now(),
                    "店铺ID": storeId,
                    "售后信息": {
                        "说明": [],
                        "更多说明": []
                    },
                    "详情信息": {
                        "商品详情": dict(),
                    },
                    "累计评价": "",
                    "商品销量": "",
                    "价格信息": dict()
                }
                # Three page layouts expose the image under different
                # attributes; take whichever one is present.
                for img in store.xpath(
                        './/*[@class="ps-main" or @class="p-img"]//img'):
                    picUri = xpath(
                        img,
                        './@src|.//@data-lazy-img|.//@source-data-lazy-img')
                    if not picUri:
                        # No usable attribute: nothing to download.
                        continue
                    if picUri.startswith("/"):
                        picUri = response.urljoin(picUri)
                    document["宣传图片"].append(
                        Image({
                            "url": picUri,
                            "base64": img2base64(picUri)
                        }))
                yield Request(goodsUrl,
                              callback=self.goodsPageParse,
                              priority=1,
                              meta={"document": document})
                goodsNew += 1

        if storeId not in self.tempStoreSet:
            # Stores are only crawled when product crawling is disabled.
            if self.allowGoods:
                continue
            # Cap on the number of new stores scheduled from one page.
            if storesNew > 2:
                continue
            self.tempStoreSet.add(storeId)
            if not self.storesColl.count({"店铺ID": storeId}):
                document = {
                    "店铺名称": storeName,
                    "店铺网址": storeUrl,
                    "店铺首页截图": [],
                    "店铺类别": "",
                    "所在地": "",
                    "证件截图": [],
                    "企业资质": dict(),
                    "店铺标签": "",
                    "店铺动态评分": dict(),
                    "下载时间": datetime.datetime.now(),
                    "店铺ID": storeId,
                    "开店信息": dict()
                }
                yield self.captchaRequest(storeId, document=document)
                storesNew += 1

    self.log("商品信息: 共{}条其中{}条未爬".format(len(stores), goodsNew),
             logging.INFO)
    self.log("店铺信息: 共{}条其中{}条未爬".format(len(stores), storesNew),
             logging.INFO)
def parse(self, response):
    """Parse a product search-result page.

    An empty search is logged and recorded in ``self.emptySearchColl``.
    Otherwise every ``<li>`` with a store link is inspected: the store id is
    read from the link's ``sa-data`` attribute and, depending on
    ``self.allowGoods``, either product documents (followed up by
    ``self.goodsPageParse``) or store documents (followed up through
    ``self.captchaRequest``) are scheduled, skipping anything already seen in
    this run or already present in the matching collection.

    :param response: search-result page response.
    :return: generator of follow-up requests.
    """
    import ast

    if "没有找到相关商品" in response.text:
        self.log("此次查询已记录为异常:`{}` 查询结果为空,直接返回".format(self.searchKey),
                 logging.ERROR)
        return self.emptySearchColl.insert_one({
            "type": self.collName,
            "key": self.searchKey
        })

    goodsNew = storesNew = 0
    stores = response.xpath('//li')
    for store in stores:
        # Store link.
        storeUrl = xpath(store, './/a[@class="store-name"]/@href')
        if not storeUrl:
            continue
        storeUrl = response.urljoin(storeUrl)
        try:
            # SECURITY: this attribute comes from a remote page. It used to
            # be passed to eval(), which executes arbitrary expressions;
            # literal_eval only accepts Python literals. NOTE(review):
            # assumes sa-data is a plain dict literal - confirm against a
            # live page.
            storeData = ast.literal_eval(
                xpath(store, './/a[@class="store-name"]/@sa-data'))
            storeId = storeData["shopid"]
        except (KeyError, MemoryError, RecursionError, SyntaxError,
                TypeError, ValueError):
            self.log("苏宁商品列表页可能改版了", logging.ERROR)
            continue
        if not storeId:
            continue
        goodsUrl = xpath(store, './/*[@class="img-block"]/a/@href')
        if not goodsUrl:
            continue
        goodsUrl = response.urljoin(goodsUrl)
        title = xpath(store, './/*[@class="title-selling-point"]/a/@title')
        content = wenben(store, './/*[@class="title-selling-point"]')
        storeName = xpath(store, './/a[@class="store-name"]/text()')

        if self.allowGoods and goodsUrl not in self.tempGoodsSet:
            # Cap on the number of new products scheduled from one page.
            if goodsNew > 9:
                continue
            self.tempGoodsSet.add(goodsUrl)
            if not self.goodsColl.count({"商品网址": goodsUrl}):
                document = {
                    "商品名称": title,
                    "商品网址": goodsUrl,
                    "商品描述": content,
                    "商品首页截图": [],
                    "宣传图片": [],
                    "商品类别": "",
                    "店铺名称": storeName,
                    "店铺网址": storeUrl,
                    "下载时间": datetime.datetime.now(),
                    "店铺ID": storeId,
                    "售后信息": {
                        "说明": [],
                        "更多说明": []
                    },
                    "详情信息": {
                        "商品详情": dict(),
                    }
                }
                for picUri in store.xpath(
                        './/*[@class="res-img"]/div[@class="img-block"]/a/img/@src'
                ).extract():
                    if not picUri:
                        # Empty src attribute: nothing to download.
                        continue
                    if picUri.startswith("/"):
                        picUri = response.urljoin(picUri)
                    document["宣传图片"].append(
                        Image({
                            "url": picUri,
                            "base64": img2base64(picUri)
                        }))
                yield Request(goodsUrl,
                              callback=self.goodsPageParse,
                              priority=1,
                              meta={"document": document})
                goodsNew += 1

        if storeId not in self.tempStoreSet:
            # Stores are only crawled when product crawling is disabled.
            if self.allowGoods:
                continue
            self.tempStoreSet.add(storeId)
            if not self.storesColl.count({"店铺ID": storeId}):
                document = {
                    "店铺名称": storeName,
                    "店铺网址": storeUrl,
                    "店铺首页截图": [],
                    "店铺类别": "",
                    "所在地": "",
                    "证件截图": [],
                    "企业资质": dict(),
                    "店铺标签": "",
                    "店铺动态评分": dict(),
                    "下载时间": datetime.datetime.now(),
                    "店铺ID": storeId,
                    "开店信息": dict()
                }
                yield self.captchaRequest(storeId, document=document)
                storesNew += 1

    scanned_printer(self, stores, storesNew)