def run_job(self):
    Logger().info('product_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductAddJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    # Product pages have strict anti-scraping measures; use a rotating-IP proxy.
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                crawler = ProductAddCrawler(job_entity, self.http)
                if crawler.productItem:
                    # Hand the crawled item off to the product crawl job list.
                    job_dict['product_item_id'] = crawler.productItem.id
                    new_job = ProductJobEntity.instance(job_dict)
                    self.set_job_by_key(RedisListKeyEnum.product_crawl_job, new_job)
            except CrawlErrorException:
                # Crawl failed: increment the consecutive HTTP failure count.
                self.set_error_job(job_entity)
            except NotFoundException:
                # Page does not exist; nothing to do.
                pass
        common.sleep_random()
def new_consumer(group_id):
    # 'earliest' is kafka-python's name for the legacy 'smallest' offset-reset policy.
    consumer = KafkaConsumer('orders', group_id=group_id, auto_offset_reset='earliest')
    for msg in consumer:
        common.sleep_random()
        json_data = json.loads(msg.value)
        common.prGreen(
            "\nService %s got a message: -- Msg Key: %s, partition: [%s] offset [%s]:"
            % (group_id, msg.key, msg.partition, msg.offset))
        pprint.pprint(json_data, indent=1, width=40)
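For a quick local test, the consumer above can be fed by a small producer that publishes JSON payloads to the same 'orders' topic. This is a minimal sketch, not part of the original code: the topic name comes from new_consumer, while the broker address, key, and payload fields are assumptions.

# Hypothetical test producer for the 'orders' topic (sketch; not project code).
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',  # assumed broker address
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    key_serializer=lambda k: k.encode('utf-8'))

# Each consumer group (store / clothes / mail) receives its own copy of this message.
producer.send('orders', key='order-1', value={'order_id': 1, 'item': 'demo'})
producer.flush()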
def run_job(self):
    Logger().info('product_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    # Product pages have strict anti-scraping measures; use a rotating-IP proxy.
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                ProductCrawler(job_entity, self.http)
            except CrawlErrorException:
                # Crawl failed: increment the consecutive HTTP failure count.
                self.set_error_job(job_entity)
            except NotFoundException:
                # Page does not exist; nothing to do.
                pass
        common.sleep_random()
def run_job(self):
    Logger().info('product_review_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductReviewJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                crawl = ProductReviewCrawler(job_entity, self.http)
                if crawl.crawl_next_page:
                    # More review pages remain: re-queue the job for the next page.
                    job_entity.page += 1
                    self.set_job(job_entity)
            except CrawlErrorException:
                # Crawl failed: increment the consecutive HTTP failure count.
                self.set_error_job(job_entity)
            except NotFoundException:
                # Page does not exist; nothing to do.
                pass
        common.sleep_random()
def callbackstore(ch, method, properties, body):
    common.sleep_random()
    print_method("store", method.routing_key)
    common.print_json(body)
    # Acknowledge only after the message has been processed.
    ch.basic_ack(delivery_tag=method.delivery_tag)
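The callback above matches pika's on_message_callback signature (channel, method, properties, body). The wiring below is a hedged sketch of how it could be attached to a queue: the connection parameters and the queue name 'store' are assumptions, and the original code may instead bind the queue to a topic exchange, since the callback prints method.routing_key.

# Hypothetical wiring for callbackstore (sketch; names are assumptions).
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='store')
# Manual acks: callbackstore calls basic_ack itself after processing.
channel.basic_consume(queue='store', on_message_callback=callbackstore, auto_ack=False)
channel.start_consuming()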
def start_store_consumer():
    common.sleep_random()
    print("Service for \"store\" started")
    common.divide()
    new_consumer("store")
def start_clothes_consumer():
    common.sleep_random()
    print("Service for \"clothes\" started")
    common.divide()
    new_consumer("clothes")
def start_mail_consumer():
    common.sleep_random()
    print("Service for \"mail\" started")
    common.divide()
    new_consumer("mail")
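Since store, clothes, and mail are separate Kafka consumer groups, each of them receives every message published to 'orders'. A minimal way to run all three side by side is sketched below; launching them with multiprocessing is an assumption, and the original project may start them as separate services instead.

# Hypothetical launcher (sketch): run each consumer group in its own process.
from multiprocessing import Process

if __name__ == '__main__':
    targets = [start_store_consumer, start_clothes_consumer, start_mail_consumer]
    processes = [Process(target=t) for t in targets]
    for p in processes:
        p.start()
    for p in processes:
        p.join()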