def flush(self): """Delete all run-time data generated by this crawler.""" Queue.flush(self) Tag.delete(self) Event.delete(self) CrawlerState.flush(self) CrawlerRun.flush(self)
def flush(self): """Delete all run-time data generated by this crawler.""" Tag.delete(self.name) Event.delete(self.name) Result.delete(self.name) session.commit() signals.crawler_flush.send(self)
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        'crawler': self.name,
        'run_id': run_id or Job.random_id(),
        'incremental': settings.INCREMENTAL
    }
    if incremental is not None:
        state['incremental'] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    Queue.queue(self.init_stage, state, {})
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        "crawler": self.name,
        "run_id": run_id or Job.random_id(),
        "incremental": settings.INCREMENTAL,
    }
    if incremental is not None:
        state["incremental"] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    init_stage = self.get(self.init_stage)
    Queue.queue(init_stage, state, {})
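# A minimal, self-contained sketch of the state-building logic used by the
# run() variants above. Names below (SETTINGS_INCREMENTAL, build_state) are
# illustrative stand-ins, not part of the original code: the point is that
# the incremental flag defaults to the global setting and is only overridden
# when the caller passes an explicit True/False.
SETTINGS_INCREMENTAL = True  # stand-in for settings.INCREMENTAL


def build_state(name, incremental=None, run_id=None):
    state = {
        "crawler": name,
        "run_id": run_id or "generated-id",  # stand-in for Job.random_id()
        "incremental": SETTINGS_INCREMENTAL,
    }
    if incremental is not None:
        state["incremental"] = incremental
    return state


assert build_state("example")["incremental"] is True
assert build_state("example", incremental=False)["incremental"] is False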
def crawler(name):
    crawler = get_crawler(name)
    stages = []
    for stage in crawler:
        data = Event.get_stage_counts(crawler, stage)
        data["total_ops"] = stage.op_count
        data["stage"] = stage
        stages.append(data)
    runs = list(crawler.runs)
    for run in runs:
        run.update(Event.get_run_counts(crawler, run["run_id"]))
    runs = sorted(runs, key=lambda r: r.get("start"), reverse=True)
    return render_template("crawler.html", crawler=crawler,
                           stages=stages, runs=runs)
def emit_exception(self, exc):
    self.log.exception(exc)
    return Event.save(self.crawler, self.stage,
                      Event.LEVEL_ERROR, self.run_id,
                      error=exc.__class__.__name__,
                      message=str(exc))
def emit_exception(self, exc):
    self.log.exception(exc)
    return Event.save(self.crawler.name, self.operation_id,
                      Event.LEVEL_ERROR,
                      error_type=exc.__class__.__name__,
                      error_message=unicode(exc),
                      error_details=traceback.format_exc())
def emit_exception(self, exc):
    self.log.exception(exc)
    return Event.save(self.crawler.name, self.stage.name,
                      Event.LEVEL_ERROR, self.run_id,
                      error_type=exc.__class__.__name__,
                      error_message=six.text_type(exc),
                      error_details=traceback.format_exc())
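# Hedged sketch of the error payload that the emit_exception variants above
# assemble: exception class name, message text and a formatted traceback.
# The capture() helper is hypothetical; it only mirrors the keyword arguments
# passed to Event.save() and does not touch any Event or crawler API.
import traceback


def capture(exc):
    return {
        "error_type": exc.__class__.__name__,
        "error_message": str(exc),
        "error_details": traceback.format_exc(),
    }


try:
    raise ValueError("malformed row")
except ValueError as exc:
    event = capture(exc)
    assert event["error_type"] == "ValueError"
    assert "malformed row" in event["error_details"]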
def crawler_stages(crawler):
    """See the number of executions of each stage."""
    stages = []
    for stage in crawler:
        data = Event.get_stage_counts(crawler, stage)
        data['total_ops'] = stage.op_count
        data['stage'] = stage
        stages.append(data)
    return stages
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        'crawler': self.name,
        'run_id': run_id,
        'incremental': settings.INCREMENTAL
    }
    if incremental is not None:
        state['incremental'] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events:
    Event.delete(self)
    Queue.queue(self.init_stage, state, {})
    if not settings.REDIS_HOST:
        TaskRunner.run()
def emit_warning(self, message, type=None, details=None, *args):
    if len(args):
        message = message % args
    self.log.warning(message)
    return Event.save(self.crawler.name, self.operation_id,
                      Event.LEVEL_WARNING,
                      error_type=type,
                      error_message=message,
                      error_details=details)
def emit_warning(self, message, type=None, *args):
    if len(args):
        message = message % args
    self.log.warning(message)
    return Event.save(self.crawler, self.stage,
                      Event.LEVEL_WARNING, self.run_id,
                      error=type,
                      message=message)
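# Illustration of the %-style interpolation in the emit_warning variants
# above: extra positional args are formatted into the message before it is
# logged and saved. The call shape here is an assumption about typical usage.
message = "Failed to fetch %s (status %d)"
args = ("https://example.com/doc.pdf", 503)
if len(args):
    message = message % args
assert message == "Failed to fetch https://example.com/doc.pdf (status 503)"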
def index():
    """Generate a list of all crawlers, alphabetically, with op counts."""
    crawlers = []
    for crawler in manager:
        data = Event.get_counts(crawler)
        data["last_active"] = crawler.last_run
        data["total_ops"] = crawler.op_count
        data["running"] = crawler.is_running
        data["crawler"] = crawler
        crawlers.append(data)
    return render_template("index.html", crawlers=crawlers)
def crawlers_index():
    """Generate a list of all crawlers, sorted alphabetically, with op counts."""
    crawlers = []
    for crawler in manager:
        data = Event.get_counts(crawler)
        data['last_active'] = crawler.last_run
        data['total_ops'] = crawler.op_count
        data['running'] = crawler.is_running
        data['crawler'] = crawler
        crawlers.append(data)
    return crawlers
def events(name):
    crawler = get_crawler(name)
    page = int(request.args.get("page", 1))
    start = (max(1, page) - 1) * PAGE_SIZE
    end = start + PAGE_SIZE
    run_id = request.args.get("run_id")
    level = request.args.get("level")
    stage_name = request.args.get("stage_name")
    if stage_name:
        events = Event.get_stage_events(crawler, stage_name, start, end, level)
    elif run_id:
        events = Event.get_run_events(crawler, run_id, start, end, level)
    else:
        events = Event.get_crawler_events(crawler, start, end, level)
    total = len(events)
    pages = int(math.ceil((float(total) / PAGE_SIZE)))
    return render_template("events.html", crawler=crawler, results=events,
                           page=page, pages=pages)
def crawler_events(crawler, level=None, stage_name=None, run_id=None,
                   page=1, per_page=15):
    start = (max(1, page) - 1) * per_page
    end = start + per_page
    if stage_name:
        events = Event.get_stage_events(crawler, stage_name, start, end, level)
    elif run_id:
        events = Event.get_run_events(crawler, run_id, start, end, level)
    else:
        events = Event.get_crawler_events(crawler, start, end, level)
    total = len(events)
    return {
        'page': page,
        'per_page': per_page,
        'pages': int(math.ceil((float(total) / per_page))),
        'total': total,
        'results': events
    }
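# Worked example of the pagination arithmetic shared by events() and
# crawler_events() above; the numbers are illustrative. With per_page=15 and
# 47 matching events, page 2 spans offsets 15..29 and there are 4 pages.
import math

per_page, total, page = 15, 47, 2
start = (max(1, page) - 1) * per_page            # 15
end = start + per_page                           # 30
pages = int(math.ceil(float(total) / per_page))  # 4
assert (start, end, pages) == (15, 30, 4)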
def flush_events(self):
    Event.delete(self)
def flush(self): """Delete all run-time data generated by this crawler.""" self.queue.cancel() Event.delete(self) Crawl.flush(self)
def crawler_runs(crawler):
    runs = list(crawler.runs)
    for run in runs:
        run.update(Event.get_run_counts(crawler, run['run_id']))
    return runs