def spider_closed(self, spider, reason):
    stats = spider.crawler.stats.get_stats()

    # Get the run data from the crawler stats
    start_time = stats.get("start_time")
    finish_time = stats.get("finish_time")
    duration = (finish_time - start_time).seconds

    # Save the collected information
    result = ScrapydUtil.parse_log_file(self.log_file)

    item = {
        "job_id": result.get('job_id', ''),
        "project": result.get('project', ''),
        "spider": spider.name,
        "item_scraped_count": stats.get("item_scraped_count", 0),
        "item_dropped_count": stats.get("item_dropped_count", 0),
        "start_time": start_time.strftime(self.DATETIME_FORMAT),
        "finish_time": finish_time.strftime(self.DATETIME_FORMAT),
        "duration": duration,
        "finish_reason": stats.get("finish_reason"),
        "log_error_count": stats.get("log_count/ERROR", 0),
    }

    logger.info(item)
    self.collection_item(item)
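# A minimal sketch of how this spider_closed handler could be wired up as a
# Scrapy extension. The class name (SpiderStatsExtension), the LOG_FILE /
# STATS_COLLECTION_URL setting names and the DATETIME_FORMAT value are
# assumptions for illustration, not part of the original code.
from scrapy import signals


class SpiderStatsExtension:
    DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, log_file, stats_collection_url):
        self.log_file = log_file
        self.stats_collection_url = stats_collection_url

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(
            log_file=crawler.settings.get("LOG_FILE"),
            stats_collection_url=crawler.settings.get("STATS_COLLECTION_URL"),
        )
        # Call spider_closed() when the spider_closed signal fires
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext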
def insert_data(self, table, data):
    """Insert data into the given collection."""
    try:
        result = table.insert(data)  # legacy pymongo call; insert_one() in pymongo >= 3.x
        logger.info("success: {}".format(result))
    except (pymongo.errors.DuplicateKeyError,
            pymongo.errors.ServerSelectionTimeoutError) as e:
        logger.error("error: {}".format(e))
    finally:
        # Always drop the item so it does not travel further down the pipeline
        raise DropItem
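# A minimal sketch of how insert_data() could be driven from a pipeline's
# process_item(), assuming insert_data() above is a method of the same class.
# The MONGO_URI / MONGO_DATABASE settings and the one-collection-per-spider
# naming are assumptions made for illustration.
import pymongo


class MongoStatsPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.client = pymongo.MongoClient(mongo_uri)
        self.db = self.client[mongo_db]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE"),
        )

    def process_item(self, item, spider):
        # insert_data() raises DropItem after the write, so the item stops
        # here and is not passed on to later pipelines.
        self.insert_data(self.db[spider.name], dict(item))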
def spider_closed(self, spider, reason):
    stats = spider.crawler.stats.get_stats()

    # Get the run data from the crawler stats
    item_scraped_count = stats.get("item_scraped_count", 0)
    item_dropped_count = stats.get("item_dropped_count", 0)
    start_time = stats.get("start_time")
    finish_time = stats.get("finish_time")

    # Log a short summary of the collected stats
    item_count = item_scraped_count + item_dropped_count
    duration = (finish_time - start_time).seconds

    logger.info("*" * 30)
    logger.info("* {}".format(spider.name))
    logger.info("* item_count : {}".format(item_count))
    logger.info("* duration : {}".format(duration))
    logger.info("*" * 30)
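# A minimal sketch of registering both stats extensions in settings.py.
# The module path and class names are assumptions made for illustration.
# settings.py
EXTENSIONS = {
    "myproject.extensions.SpiderStatsExtension": 500,
    "myproject.extensions.SpiderStatsLogger": 501,
}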
def process_request(self, request, spider):
    logger.info('*' * 32)
    logger.info(self.__class__.__name__)
    logger.info('*' * 32)
    # Returning a Response here short-circuits the download: Scrapy skips the
    # real network request and hands this response straight to the spider.
    return HtmlResponse(url=request.url, request=request)
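# A minimal sketch of enabling this downloader middleware. The module path and
# class name (FakeResponseMiddleware) are assumptions; because process_request()
# returns a Response, this setup can be used to trace middleware order without
# hitting the network.
# settings.py
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.FakeResponseMiddleware": 543,
}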
def collection_item(self, item):
    """Submit the collected stats to the collection service as JSON."""
    res = requests.post(self.stats_collection_url, json=item)
    logger.info(res.text)
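# A minimal sketch of what the receiving end of stats_collection_url could
# look like, assuming a plain Flask service; the route, in-memory storage and
# response shape are illustrative assumptions, not part of the original project.
from flask import Flask, jsonify, request

app = Flask(__name__)
collected_stats = []  # in-memory store for illustration only


@app.route("/api/stats/collect", methods=["POST"])
def collect_stats():
    item = request.get_json()  # the dict posted by collection_item()
    collected_stats.append(item)
    return jsonify({"status": "ok", "count": len(collected_stats)})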