Example #1
import logging

from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import defer, reactor

# Project-internal names (log, BrandSpider, SerieSpider, ProductSpider)
# are imported elsewhere in DuTracker and omitted from this excerpt.


def crawl(verbose, debug, proxy):
    settings = get_project_settings()

    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True

    log.info('Initializing database product.sqlite')
    runner = CrawlerRunner(settings)

    @defer.inlineCallbacks
    def _crawl():  # renamed from crawl() to avoid shadowing the enclosing function
        # Chain the spiders so they run sequentially, then stop the reactor.
        yield runner.crawl(BrandSpider)
        yield runner.crawl(SerieSpider)
        yield runner.crawl(ProductSpider, fromDB=True)
        reactor.stop()

    _crawl()
    reactor.run()
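
Example #1 reads like the body of a command-line entry point. Below is a minimal sketch of how it might be wired up, assuming the project uses click (the prompt() calls in Examples #8 and #9 suggest it does); the decorator setup is an illustration, not the original source.

# Hypothetical click wiring for the crawl() command above.
import click

@click.command()
@click.option('--verbose', is_flag=True, help='Enable debug-level logging.')
@click.option('--debug', is_flag=True, help='Enable Scrapy log output.')
@click.option('--proxy', default=None, help='Proxy pool URL for RandomProxy.')
def crawl(verbose, debug, proxy):
    ...  # body as in Example #1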
Example #2
File: serie.py Project: yftx/DuTracker
    def start_requests(self):
        log.info('Fetching series list')
        for url in self.start_urls:
            yield Request(url,
                          dont_filter=True,
                          headers={
                              'AppId': 'wxapp',
                              'appVersion': '3.5.0',
                          },
                          callback=self.parse_serieList)
Example #3
File: brand.py Project: yftx/DuTracker
    def start_requests(self):
        log.info('Fetching brand list')
        for url in self.start_urls:
            yield Request(url,
                          dont_filter=True,
                          headers={
                              'AppId': 'wxapp',
                              'appVersion': '3.5.0',
                          },
                          callback=self.parse_brandList,
                          meta={'dont_retry': True})
Example #4
def show():
    settings = get_project_settings()
    log.info('Showing remote brand & series info')
    runner = CrawlerRunner(settings)

    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(BrandSpider, auto=True)
        yield runner.crawl(SerieSpider, auto=True)
        reactor.stop()

    crawl()
    reactor.run()
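
Both Example #1 and Example #4 use the documented Scrapy pattern for running several spiders sequentially in one process: CrawlerRunner.crawl() returns a Deferred, so yielding each call inside an @defer.inlineCallbacks function runs the spiders one after another, and reactor.stop() fires only after the last crawl completes.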
Example #5
File: tracker.py Project: yftx/DuTracker
    def start_requests(self):
        log.info(f'Tracking products with sales above {self.soldNum_min}')
        pools = self.get_items()

        for p in pools:
            yield scrapy.Request(p.url,
                                 meta={
                                     'productId': p.id,
                                     'title': p.title,
                                     'brand': p.brand,
                                     'serie': p.serie,
                                     'articleNumber': p.articleNumber
                                 })
Example #6
	def parse_serieInfo(self, response):
		log.info('Series list response')
		data = json.loads(response.body_as_unicode())['data']
		unionId = response.meta.get('unionId')
		name = response.meta.get('name')

		num = data['total']
		pages = math.ceil(num / 20)  # page size of 20 is implied by the ceil division
		log.success(f'Series: {name} ID: {unionId} Total products: {num} Pages: {pages}')

		for page in range(1, pages + 1):
			yield Request(page_url(unionId, page), callback=self.parse_productId, meta={
				'unionId': unionId,
				'name': name
			}, headers=headers())
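
Several examples call page_url() and headers() without showing them. The stand-ins below are hypothetical and only match the call shapes used above; the real implementations live elsewhere in DuTracker.

# Hypothetical helpers, inferred from how the examples call them.
def page_url(unionId, page=1):
    # Assumed: builds the product-list URL for a brand/series ID and page.
    return f'https://example.com/list?unionId={unionId}&page={page}'

def headers():
    # Assumed: the wxapp headers seen in Examples #2 and #3.
    return {'AppId': 'wxapp', 'appVersion': '3.5.0'}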
Example #7
    def start_requests(self):
        log.info('Fetching product details')
        if self.fromDB:
            # Load every known product ID from the database.
            self.productIds.extend(p.id for p in Product.select())
        for pid in self.productIds:
            log.info(f'Product detail request {pid}')
            url = get_product_info_url(pid)
            log.info(f'Product detail request URL: {url}')
            log.info(f'headers ---> {headers()}')
            yield Request(url, headers=headers())
Example #8
File: brand.py Project: yftx/DuTracker
    def parse_brandList(self, response):
        brandList = json.loads(response.body_as_unicode())['data']['list']
        for brand in brandList:
            unionId = brand['brand']['goodsBrandId']
            name = brand['brand']['brandName']
            self.brandIds[unionId] = name
            log.success(f'Brand: {name} ID: {unionId}')

        if not self.auto:
            ids = prompt('Enter the brand IDs to crawl', default='').strip().split(' ')
            if ids == ['']:
                return  # no input given: stop without scheduling requests
        else:
            ids = self.Ids
            if not ids:
                return

        log.info(f'Fetching products for brands {ids}')
        for unionId in ids:
            yield Request(page_url(unionId),
                          callback=self.parse_brandInfo,
                          meta={
                              'unionId': unionId,
                              'name': self.brandIds[unionId]
                          })
Example #9
File: serie.py Project: yftx/DuTracker
    def parse_serieList(self, response):
        serieList = json.loads(response.body_as_unicode())['data']['list']
        for data in serieList:
            for serie in data['seriesList']:
                unionId = serie['productSeriesId']
                name = serie['name']
                self.serieIds[unionId] = name
                log.success(f'Series: {name} ID: {unionId}')
        if not self.auto:
            ids = prompt('Enter the series IDs to crawl', default='').strip().split(' ')
            if ids == ['']:
                return  # no input given: stop without scheduling requests
        else:
            ids = self.Ids
            if not ids:
                return

        log.info(f'Fetching products for series {ids}')
        for unionId in ids:
            yield Request(page_url(unionId),
                          callback=self.parse_serieInfo,
                          meta={
                              'unionId': unionId,
                              'name': self.serieIds[unionId]
                          })
Example #10
	def start_requests(self):
		log.info('Fetching brand list')
		for url in self.start_urls:
			yield Request(url, dont_filter=True, callback=self.parse_brandList, meta={
				'dont_retry': True
			}, headers=headers())
Example #11
import logging
import sys

from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Project-internal names (log and the spider classes) are imported elsewhere.


def start(verbose, debug, proxy, min, product, brand, serie, check, delay,
          news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error(f'InfluxDB connection error: {e}')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check: check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()

    if verbose: log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug: settings['LOG_ENABLED'] = True
    if delay: settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl,
                      'interval',
                      args=[BrandSpider],
                      kwargs={
                          'auto': True,
                          'Ids': brand
                      },
                      days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[SerieSpider],
                      kwargs={
                          'auto': True,
                          'Ids': serie
                      },
                      days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[ProductSpider],
                      kwargs={'fromDB': True},
                      days=1)
        process.crawl(ProductSpider, fromDB=True)
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)

    sched.add_job(process.crawl,
                  'interval',
                  args=[TrackerSpider],
                  kwargs={
                      'soldNum_min': min,
                      'Ids': product
                  },
                  hours=6)
    if news:
        sched.add_job(process.crawl,
                      'interval',
                      args=[TrackerSpider],
                      kwargs={
                          'newItem': True,
                          'days': days
                      },
                      hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    process.start(stop_after_crawl=False)  # keep the reactor alive for scheduled jobs
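
Example #11 follows the pattern from the linked Stack Overflow answer: APScheduler's TwistedScheduler shares Scrapy's Twisted reactor, and process.start(stop_after_crawl=False) keeps that reactor alive so the interval jobs keep firing. A stripped-down sketch of the same pattern:

# Minimal schedule-and-run loop, same technique as Example #11.
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from DuTracker.spiders.tracker import TrackerSpider  # assumed module path

process = CrawlerProcess(get_project_settings())
sched = TwistedScheduler()
sched.add_job(process.crawl, 'interval', args=[TrackerSpider], hours=6)
process.crawl(TrackerSpider)             # run once immediately
sched.start()
process.start(stop_after_crawl=False)    # reactor keeps running for the scheduler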
Example #12
	def start_requests(self):
		log.info('Fetching series list')
		for url in self.start_urls:
			yield Request(url, dont_filter=True, callback=self.parse_serieList, headers=headers())