def run(self): r = get_redis() if settings.CRAWLER_DEBUG: r.delete(settings.CRAWLER_CONFIG["processor"]) while True: try: rsp = r.brpop(settings.CRAWLER_CONFIG["processor"]) except Exception as e: print e continue data = json.loads(rsp[1]) #logger.info(json.dumps(data, encoding="UTF-8", ensure_ascii=False)) self.process(data)
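# get_redis() is used by every worker above and below but is not shown in
# this section. A minimal sketch, assuming redis-py and connection settings
# in Django settings; the names REDIS_HOST/REDIS_PORT/REDIS_DB are
# hypothetical and the real helper may differ.
import redis
from django.conf import settings

_pool = None

def get_redis():
    # Share one connection pool per process instead of reconnecting
    # on every call.
    global _pool
    if _pool is None:
        _pool = redis.ConnectionPool(host=settings.REDIS_HOST,
                                     port=settings.REDIS_PORT,
                                     db=settings.REDIS_DB)
    return redis.StrictRedis(connection_pool=_pool)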
def monitor_service(self):
    conf = settings.CRAWLER_CONFIG
    r = get_redis()
    now = datetime.now().replace(second=0, microsecond=0)
    # One round trip for all three queue lengths.
    pipe = r.pipeline()
    result = (pipe.llen(conf['downloader'])
                  .llen(conf['extractor'])
                  .llen(conf['processor'])
                  .execute())
    # Index rules that are due for crawling right now.
    scheduler = IndexRule.objects.filter(
        seed__status=Seed.STATUS_ENABLE,
        status=IndexRule.STATUS_ENABLE,
        next_crawl_time__lte=now).count()
    print result
    Service.objects.create(
        scheduler=scheduler,
        downloader=result[0],
        extractor=result[1],
        processor=result[2],
        create_time=now)
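# Service rows are the pipeline's health samples, apparently one per minute
# given the truncation of `now` above. A minimal sketch of the model,
# inferred only from the create() call; field types are assumptions.
from django.db import models

class Service(models.Model):
    scheduler = models.IntegerField()     # index rules currently due
    downloader = models.IntegerField()    # downloader queue length
    extractor = models.IntegerField()     # extractor queue length
    processor = models.IntegerField()     # processor queue length
    create_time = models.DateTimeField()  # sample time, truncated to minute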
def run(self):
    r = get_redis()
    while True:
        now = datetime.now()
        # Highest-weight seeds are scheduled first.
        for item in Seed.objects.filter(
                status=Seed.STATUS_ENABLE).order_by('-weight'):
            rules = IndexRule.objects.filter(
                seed=item,
                status=IndexRule.STATUS_ENABLE,
                next_crawl_time__lte=now)
            for rule in rules:
                try:
                    detail_rule = DetailRule.objects.get(index_rule=rule)
                except DetailRule.DoesNotExist as e:
                    print e
                    continue
                # Everything the downstream workers need travels with the job.
                base = {
                    'url': '',
                    'kind': KIND_LIST_URL,
                    'seed_id': item.pk,
                    'rule_id': rule.pk,
                    'fresh_pages': rule.fresh_pages,
                    'site_config': rule.site.get_config(),
                    'list_rules': rule.list_rules,
                    'next_url_rules': rule.next_url_rules,
                    'detail_rules': detail_rule.data,
                    'detail_exclude': detail_rule.exclude,
                    'detail_multi': detail_rule.multi,
                    'detail_multi_unique': detail_rule.multi_unique,
                    'detail_fresh_time': detail_rule.fresh_time,
                    'unique_key': item.data[0]["unique_key"],
                }
                for url in rule.url:
                    data = base.copy()
                    data['url'] = url
                    r.lpush(settings.CRAWLER_CONFIG["downloader"],
                            json.dumps(data))
                # Update the index rule's next crawl time.
                rule.next_crawl_time = now + timedelta(seconds=rule.frequency)
                rule.save()
                logging.debug(data)
        #print r.rpop('unicrawler:urls')
        time.sleep(1)
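# The queries above imply roughly the following models. This is a sketch
# reconstructed purely from how the fields are used (old-Django style, to
# match the Python 2 code); field types and defaults are assumptions.
from django.db import models

class Seed(models.Model):
    STATUS_ENABLE = 1
    status = models.IntegerField(default=STATUS_ENABLE)
    weight = models.IntegerField(default=0)  # higher weight schedules first
    # item.data[0]["unique_key"] implies a serialized list-of-dicts field.

class IndexRule(models.Model):
    STATUS_ENABLE = 1
    seed = models.ForeignKey(Seed)
    status = models.IntegerField(default=STATUS_ENABLE)
    frequency = models.IntegerField()    # seconds between crawl rounds
    fresh_pages = models.IntegerField()  # list pages to walk per round
    next_crawl_time = models.DateTimeField()
    # url, list_rules, next_url_rules and the site FK are omitted here;
    # their storage format is not visible in this section.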
def run(self): r = get_redis() if settings.CRAWLER_DEBUG: r.delete(settings.CRAWLER_CONFIG["extractor"]) while True: try: data = r.brpop(settings.CRAWLER_CONFIG["extractor"]) except Exception as e: print e continue #print data data = json.loads(data[1]) body = data['body'] # 1 如果当前接卸的页面是列表页 if data["kind"] == KIND_LIST_URL: # 1.1先找详情页 # 检查详情的内容是否都包含在列表页中 multi_rules = data['detail_multi'] if multi_rules: # 1.1.1 详情都包含在列表页中 multi_parts = self.extract(body, multi_rules, {'data': data}) for part in multi_parts: self.get_detail(part, data) else: # 1.1.2 详情不在列表中,通过列表url去访问详情 detail_urls = self.extract(body, data['list_rules'], {'data': data}) #logger.debug('detail_urls: %s' % detail_urls) for item in detail_urls: item_data = { "url": item, 'kind': KIND_DETAIL_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], #'fresh_pages': '', #'list_rules': '', #'next_url_rules': '', 'site_config': data['site_config'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 1.2后找下一页 next_urls = self.extract(body, data["next_url_rules"], {'data': data}) print 'next_urls: %s' % next_urls for item in next_urls: item_data = { "url": item, 'kind': KIND_LIST_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], 'fresh_pages': data['fresh_pages'] - 1, 'site_config': data['site_config'], 'list_rules': data['list_rules'], 'next_url_rules': data['next_url_rules'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } if item_data['fresh_pages'] > 0: logger.debug('list:%s' % data['url']) r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 2 如果当前解析的页面是详情页 elif data["kind"] == KIND_DETAIL_URL: logger.debug('detail:%s' % data['url']) # 如果没有多项详情,则只是单项 self.get_detail(body, data)
def __init__(self):
    self.redis = get_redis()
def run(self): r = get_redis() if settings.CRAWLER_DEBUG: r.delete(settings.CRAWLER_CONFIG["extractor"]) while True: try: data = r.brpop(settings.CRAWLER_CONFIG["extractor"]) except Exception as e: print e continue #print data data = json.loads(data[1]) body = data['body'] # 1 如果当前接卸的页面是列表页 if data["kind"] == KIND_LIST_URL: # 1.1先找详情页 # 检查详情的内容是否都包含在列表页中 multi_rules = data['detail_multi'] if multi_rules: # 1.1.1 详情都包含在列表页中 multi_parts = self.extract(body, multi_rules, {'data': data}) for part in multi_parts: self.get_detail(part, data) else: # 1.1.2 详情不在列表中,通过列表url去访问详情 detail_urls = self.extract(body, data['list_rules'], {'data': data}) #logger.debug('detail_urls: %s' % detail_urls) for item in detail_urls: item_data = { "url": item, 'kind': KIND_DETAIL_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], #'fresh_pages': '', #'list_rules': '', #'next_url_rules': '', 'site_config': data['site_config'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 1.2后找下一页 next_urls = self.extract(body, data["next_url_rules"], {'data': data}) site_config = data['site_config'] print 'next_urls: %s' % next_urls for item in next_urls: item = checkUrlValidate(item, site_config) item_data = { "url": item, 'kind': KIND_LIST_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], 'fresh_pages': data['fresh_pages'] - 1, 'site_config': data['site_config'], 'list_rules': data['list_rules'], 'next_url_rules': data['next_url_rules'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } if item_data['fresh_pages'] > 0: logger.debug('list:%s' % data['url']) r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 2 如果当前解析的页面是详情页 elif data["kind"] == KIND_DETAIL_URL: logger.debug('detail:%s' % data['url']) # 如果没有多项详情,则只是单项 self.get_detail(body, data)