def parse(self, response):
    """Store the product's sold count and yield a PriceItem for each priced size.

    Reads ``title``, ``productId``, ``brand`` and ``serie`` from
    ``response.meta`` and the ``data`` object from the JSON body. The sold
    count is written to the matching ``Product`` and committed before any
    item is yielded. Entries of ``sizeList`` whose ``item`` is falsy are
    skipped.
    """
    meta = response.meta
    title = meta.get('title')
    pid = meta.get('productId')
    brand = meta.get('brand')
    serie = meta.get('serie')

    data = json.loads(response.body_as_unicode())['data']
    soldNum = data['detail']['soldNum']

    # Persist the sold count first; the commit happens before the rest of
    # the payload is read.
    Product[pid].soldNum = soldNum
    commit()

    sizeList = data['sizeList']
    # Raw prices are divided by 100 — presumably stored in minor currency
    # units; confirm against the API.
    price = data['item']['price'] / 100
    log.success(f'商品:{title} 编号:{pid} 价格: {price} 交易数量: {soldNum}')

    for entry in sizeList:
        offer = entry['item']
        if offer:
            yield PriceItem(
                id=pid,
                brand=brand,
                serie=serie,
                title=title,
                size=entry['size'],
                formatSize=entry['formatSize'],
                price=offer['price'] / 100,
                soldNum=soldNum,
            )
def process_item(self, item, spider):
    """Copy the scraped detail fields of ``item`` onto its ``Product`` record.

    The product is looked up by ``item['id']`` and created when it does not
    exist yet. Every mapped field is overwritten with the item's value
    (``None`` when the item lacks the key). The item is returned unchanged.
    """
    # (item key, Product attribute) pairs, listed in assignment order.
    # Only ``detailJson`` is stored under a different attribute name.
    field_map = (
        ('url', 'url'),
        ('title', 'title'),
        ('soldNum', 'soldNum'),
        ('logo', 'logo'),
        ('categoryId', 'categoryId'),
        ('images', 'images'),
        ('sellDate', 'sellDate'),
        ('articleNumber', 'articleNumber'),
        ('authPrice', 'authPrice'),
        ('goodsId', 'goodsId'),
        ('sizeList', 'sizeList'),
        ('imageAndText', 'imageAndText'),
        ('detailJson', 'json'),
    )

    pid = item.get('id')
    values = [(attr, item.get(key)) for key, attr in field_map]

    record = Product[pid] if Product.exists(id=pid) else Product(id=pid)
    for attr, value in values:
        setattr(record, attr, value)

    title = item.get('title')
    sellDate = item.get('sellDate')
    soldNum = item.get('soldNum')
    log.success(f'商品:{title} 编号:{pid} 发售日期:{sellDate} 售出量: {soldNum} ')
    return item
def check_db():
    """Check that InfluxDB answers a ping; exit the process if it does not.

    Calls ``ping()`` on the ``influxdb`` object from ``DuTracker.tsdb``. If
    the call raises, the error (including the exception detail) is logged
    and the process exits with status 1. Otherwise a success message is
    logged.
    """
    # Imported at call time rather than at module import time.
    from DuTracker.tsdb import influxdb

    try:
        influxdb.ping()
    except Exception as e:
        # Include the cause so a failed start-up can be diagnosed from the log.
        log.error(f'InfluxDB 连接错误: {e!r}')
        sys.exit(1)
    else:
        log.success('InfluxDB 连接成功')
def parse_brandInfo(self, response):
    """Log a brand's product total and request each of its listing pages.

    The page count is ``ceil(total / 20)``, where ``total`` comes from the
    JSON body's ``data`` object. One request per page (numbered from 1) is
    yielded with ``parse_productId`` as the callback; each request gets its
    own meta dict carrying the brand id and the name stored in
    ``self.brandIds``.
    """
    payload = json.loads(response.body_as_unicode())['data']
    unionId = response.meta.get('unionId')
    name = response.meta.get('name')
    num = payload['total']
    page = math.ceil(num / 20)
    log.success(f'品牌:{name} 编号:{unionId} 商品总数:{num} 页面数:{page}')

    yield from (
        Request(
            page_url(unionId, page_no),
            callback=self.parse_productId,
            meta={'unionId': unionId, 'name': self.brandIds[unionId]},
            headers=headers(),
        )
        for page_no in range(1, page + 1)
    )
def process_item(self, item, spider):
    """Upsert the item's ``Product`` and tag it with the brand or serie name.

    The product is looked up by ``item['id']`` and created if missing; its
    title is always overwritten. When the spider is named ``'brand'`` or
    ``'serie'``, the item's ``name`` is stored on the product attribute of
    that same name. Other spiders leave both attributes untouched. The item
    is returned unchanged.
    """
    pid = item.get('id')
    title = item.get('title')
    name = item.get('name')

    record = Product[pid] if Product.exists(id=pid) else Product(id=pid)
    record.title = title

    # The attribute to set carries the same name as the spider.
    spider_name = spider.name
    if spider_name in ('brand', 'serie'):
        setattr(record, spider_name, name)

    log.success(f'商品:{title} 编号:{pid}')
    return item
def parse_brandList(self, response):
    """Record every listed brand, then request the brands selected for crawling.

    Each brand in the JSON body's ``data.list`` is stored in
    ``self.brandIds`` (id -> name) and logged. The ids to crawl come from an
    interactive prompt when ``self.auto`` is falsy, otherwise from
    ``self.Ids``. With no ids selected the generator finishes without
    yielding. Otherwise one request per id is yielded with
    ``parse_brandInfo`` as the callback.

    Raises:
        KeyError: if a selected id is not a key of ``self.brandIds``.
    """
    brandList = json.loads(response.body_as_unicode())['data']['list']
    for brand in brandList:
        info = brand['brand']
        unionId = info['goodsBrandId']
        name = info['brandName']
        self.brandIds[unionId] = name
        log.success(f'品牌:{name} 编号:{unionId}')

    if not self.auto:
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so extra spaces never produce an empty id.
        ids = prompt('输入需要爬取的品牌编号', default='').split()
    else:
        ids = self.Ids

    if not ids:
        # This function is a generator: a bare return is how it stops.
        # Returning an IgnoreRequest instance would only discard the object.
        return

    log.info(f'获取 {ids} 品牌包含商品')
    # NOTE(review): prompted ids are str, while brandIds keys come from the
    # JSON payload — confirm the key types match, or this lookup raises KeyError.
    for unionId in ids:
        yield Request(
            page_url(unionId),
            callback=self.parse_brandInfo,
            meta={'unionId': unionId, 'name': self.brandIds[unionId]},
        )
def parse_serieList(self, response):
    """Record every listed serie, then request the series selected for crawling.

    Each serie found under ``seriesList`` of every entry in the JSON body's
    ``data.list`` is stored in ``self.serieIds`` (id -> name) and logged.
    The ids to crawl come from an interactive prompt when ``self.auto`` is
    falsy, otherwise from ``self.Ids``. With no ids selected the generator
    finishes without yielding. Otherwise one request per id is yielded with
    ``parse_serieInfo`` as the callback.

    Raises:
        KeyError: if a selected id is not a key of ``self.serieIds``.
    """
    serieList = json.loads(response.body_as_unicode())['data']['list']
    for group in serieList:
        for serie in group['seriesList']:
            unionId = serie['productSeriesId']
            name = serie['name']
            self.serieIds[unionId] = name
            log.success(f'系列:{name} 编号:{unionId}')

    if not self.auto:
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so extra spaces never produce an empty id.
        ids = prompt('输入需要爬取的系列编号', default='').split()
    else:
        ids = self.Ids

    if not ids:
        # This function is a generator: a bare return is how it stops.
        # Returning an IgnoreRequest instance would only discard the object.
        return

    log.info(f'获取 {ids} 系列包含商品')
    # NOTE(review): prompted ids are str, while serieIds keys come from the
    # JSON payload — confirm the key types match, or this lookup raises KeyError.
    for unionId in ids:
        yield Request(
            page_url(unionId),
            callback=self.parse_serieInfo,
            meta={'unionId': unionId, 'name': self.serieIds[unionId]},
        )