def crawl(verbose, debug, proxy):
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        # Enable the proxy middleware and point it at the proxy pool URL
        settings['DOWNLOADER_MIDDLEWARES'].update({'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True

    log.info('Initializing database product.sqlite')
    runner = CrawlerRunner(settings)

    @defer.inlineCallbacks
    def run_spiders():
        # Run the spiders sequentially: brands -> series -> products
        yield runner.crawl(BrandSpider)
        yield runner.crawl(SerieSpider)
        yield runner.crawl(ProductSpider, fromDB=True)
        reactor.stop()

    run_spiders()
    reactor.run()
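# Hedged sketch, not part of the original source: the crawl(verbose, debug, proxy)
# signature above looks like a Click command. The group/option wiring and flag names
# below are assumptions for illustration only.
import click

@click.group()
def cli():
    """DuTracker command-line entry point (hypothetical wiring)."""

@cli.command()
@click.option('--verbose', is_flag=True, help='Log at DEBUG level.')
@click.option('--debug', is_flag=True, help='Keep Scrapy log output enabled.')
@click.option('--proxy', default=None, help='Proxy pool URL for the RandomProxy middleware.')
def crawl(verbose, debug, proxy):
    ...  # body as defined above

if __name__ == '__main__':
    cli()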
def start_requests(self):
    log.info('Fetching series list')
    for url in self.start_urls:
        yield Request(url, dont_filter=True, headers={
            'AppId': 'wxapp',
            'appVersion': '3.5.0',
        }, callback=self.parse_serieList)
def start_requests(self):
    log.info('Fetching brand list')
    for url in self.start_urls:
        yield Request(url, dont_filter=True, headers={
            'AppId': 'wxapp',
            'appVersion': '3.5.0',
        }, callback=self.parse_brandList, meta={'dont_retry': True})
def show():
    settings = get_project_settings()
    log.info('Showing remote brand & series info')
    runner = CrawlerRunner(settings)

    @defer.inlineCallbacks
    def run_spiders():
        # auto=True skips the interactive id prompt, so the spiders only list what is available
        yield runner.crawl(BrandSpider, auto=True)
        yield runner.crawl(SerieSpider, auto=True)
        reactor.stop()

    run_spiders()
    reactor.run()
def start_requests(self):
    log.info(f'Tracking products with sales above {self.soldNum_min}')
    pools = self.get_items()
    for p in pools:
        yield scrapy.Request(p.url, meta={
            'productId': p.id,
            'title': p.title,
            'brand': p.brand,
            'serie': p.serie,
            'articleNumber': p.articleNumber,
        })
def parse_serieInfo(self, response):
    log.info('Series list response')
    data = json.loads(response.body_as_unicode())['data']
    unionId = response.meta.get('unionId')
    name = response.meta.get('name')
    num = data['total']
    pages = math.ceil(num / 20)  # the listing endpoint returns 20 products per page
    log.success(f'Series: {name} ID: {unionId} Products: {num} Pages: {pages}')
    for page in range(1, pages + 1):
        yield Request(page_url(unionId, page), callback=self.parse_productId, meta={
            'unionId': unionId,
            'name': self.serieIds[unionId],
        }, headers=headers())
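# Hedged sketch, not in the original source: headers() is a project helper used by
# several requests above and below but not shown here. Judging from the literal
# headers sent in the other start_requests (AppId / appVersion), it plausibly builds
# the wxapp client headers; the exact fields it returns are an assumption.
def headers():
    return {
        'AppId': 'wxapp',
        'appVersion': '3.5.0',
    }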
def start_requests(self):
    log.info('Fetching product details')
    if self.fromDB:
        # Load the ids of every product already stored in the database
        self.productIds.extend(p.id for p in Product.select())
    for pid in self.productIds:
        log.info(f'Product detail request {pid}')
        url = get_product_info_url(pid)
        log.info(f'Product detail request url: {url}')
        log.info(f'headers ---> {headers()}')
        yield Request(url, headers=headers())
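# Hedged sketch, not in the original source: Product is the project's ORM model
# (peewee-style, backed by the product.sqlite file logged in crawl()). The fields
# below are inferred from the attributes the spiders access (id, url, title, brand,
# serie, articleNumber, plus a soldNum implied by soldNum_min); treat the exact
# schema as an assumption.
from peewee import CharField, IntegerField, Model, SqliteDatabase

db = SqliteDatabase('product.sqlite')

class Product(Model):
    id = IntegerField(primary_key=True)
    url = CharField()
    title = CharField()
    brand = CharField(null=True)
    serie = CharField(null=True)
    articleNumber = CharField(null=True)
    soldNum = IntegerField(default=0)

    class Meta:
        database = db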
def parse_brandList(self, response):
    brandList = json.loads(response.body_as_unicode())['data']['list']
    for brand in brandList:
        unionId = brand['brand']['goodsBrandId']
        name = brand['brand']['brandName']
        self.brandIds[unionId] = name
        log.success(f'Brand: {name} ID: {unionId}')
    if not self.auto:
        ids = prompt('Enter the brand IDs to crawl', default='').strip().split(' ')
        if ids == ['']:
            return
    else:
        ids = self.Ids
        if not ids:
            return
    log.info(f'Fetching products under brands {ids}')
    for unionId in ids:
        yield Request(page_url(unionId), callback=self.parse_brandInfo, meta={
            'unionId': unionId,
            'name': self.brandIds[unionId],
        })
def parse_serieList(self, response):
    serieList = json.loads(response.body_as_unicode())['data']['list']
    for data in serieList:
        for serie in data['seriesList']:
            unionId = serie['productSeriesId']
            name = serie['name']
            self.serieIds[unionId] = name
            log.success(f'Series: {name} ID: {unionId}')
    if not self.auto:
        ids = prompt('Enter the series IDs to crawl', default='').strip().split(' ')
        if ids == ['']:
            return
    else:
        ids = self.Ids
        if not ids:
            return
    log.info(f'Fetching products under series {ids}')
    for unionId in ids:
        yield Request(page_url(unionId), callback=self.parse_serieInfo, meta={
            'unionId': unionId,
            'name': self.serieIds[unionId],
        })
def start_requests(self):
    log.info('Fetching brand list')
    for url in self.start_urls:
        yield Request(url, dont_filter=True, callback=self.parse_brandList, meta={
            'dont_retry': True,
        }, headers=headers())
def start(verbose, debug, proxy, min, product, brand, serie, check, delay, news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception:
            log.error('InfluxDB connection failed')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check:
        check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update({'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True
    if delay:
        settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        # Refresh the selected brands daily, and once immediately
        sched.add_job(process.crawl, 'interval', args=[BrandSpider],
                      kwargs={'auto': True, 'Ids': brand}, days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        # Refresh the selected series daily, and once immediately
        sched.add_job(process.crawl, 'interval', args=[SerieSpider],
                      kwargs={'auto': True, 'Ids': serie}, days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        # Refresh product details from the database daily
        sched.add_job(process.crawl, 'interval', args=[ProductSpider],
                      kwargs={'fromDB': True}, days=1)
        process.crawl(ProductSpider, fromDB=True)

    # Track prices every 6 hours, starting with one run now
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)
    sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                  kwargs={'soldNum_min': min, 'Ids': product}, hours=6)
    if news:
        # Check newly released items every hour
        sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                      kwargs={'newItem': True, 'days': days}, hours=1)
    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    process.start(False)  # stop_after_crawl=False keeps the reactor alive for scheduled jobs
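# Hedged sketch of the scheduling pattern referenced in the StackOverflow link above,
# not part of the original source: APScheduler's TwistedScheduler shares Scrapy's
# Twisted reactor, so recurring process.crawl jobs run inside the crawler's own event
# loop. This is a minimal, self-contained version; the interval and spider class are
# placeholders, not the project's actual configuration.
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_on_schedule(spider_cls, hours=6):
    process = CrawlerProcess(get_project_settings())
    sched = TwistedScheduler()
    sched.add_job(process.crawl, 'interval', args=[spider_cls], hours=hours)
    process.crawl(spider_cls)   # run once immediately
    sched.start()
    process.start(False)        # stop_after_crawl=False keeps the reactor running for the jobs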
def start_requests(self):
    log.info('Fetching series list')
    for url in self.start_urls:
        yield Request(url, dont_filter=True, callback=self.parse_serieList, headers=headers())