Beispiel #1
0
 def flush(self):
     """Delete all run-time data generated by this crawler."""
     # Call order is preserved: queue first, then tags/events, then state.
     for wipe in (Queue.flush, Tag.delete, Event.delete,
                  CrawlerState.flush, CrawlerRun.flush):
         wipe(self)
Beispiel #2
0
 def flush(self):
     """Delete all run-time data generated by this crawler."""
     # These stores are keyed by the crawler's name, not the object.
     for purge in (Tag.delete, Event.delete, Result.delete):
         purge(self.name)
     session.commit()
     # Notify subscribers that this crawler's data has been flushed.
     signals.crawler_flush.send(self)
Beispiel #3
0
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler.

        ``incremental`` overrides the global setting when given;
        ``run_id`` falls back to a freshly generated job id.
        """
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL if incremental is None
                           else incremental,
        }
        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})
Beispiel #4
0
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler.

        ``incremental`` overrides the global setting when given;
        ``run_id`` falls back to a freshly generated job id.
        """
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL if incremental is None
                           else incremental,
        }
        # Stop any run already in flight before starting a new one.
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        # Resolve the initial stage object and enqueue it with empty data.
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
Beispiel #5
0
def crawler(name):
    """Render the detail page for one crawler: per-stage counts and runs."""
    crawler = get_crawler(name)
    stages = []
    for stage in crawler:
        counts = Event.get_stage_counts(crawler, stage)
        counts["total_ops"] = stage.op_count
        counts["stage"] = stage
        stages.append(counts)
    # Enrich each run record with its event counts, newest run first.
    runs = list(crawler.runs)
    for run in runs:
        run.update(Event.get_run_counts(crawler, run["run_id"]))
    runs.sort(key=lambda r: r.get("start"), reverse=True)
    return render_template("crawler.html", crawler=crawler,
                           stages=stages, runs=runs)
Beispiel #6
0
 def emit_exception(self, exc):
     """Log *exc* with traceback and store it as an error-level event."""
     self.log.exception(exc)
     return Event.save(self.crawler, self.stage, Event.LEVEL_ERROR,
                       self.run_id, error=exc.__class__.__name__,
                       message=str(exc))
Beispiel #7
0
 def emit_exception(self, exc):
     """Log *exc* and persist it as an error-level event.

     Returns whatever ``Event.save`` returns.
     """
     self.log.exception(exc)
     # NOTE(review): `unicode` exists only on Python 2 — this raises
     # NameError on Python 3; confirm the target interpreter version.
     return Event.save(self.crawler.name,
                       self.operation_id,
                       Event.LEVEL_ERROR,
                       error_type=exc.__class__.__name__,
                       error_message=unicode(exc),
                       error_details=traceback.format_exc())
Beispiel #8
0
 def emit_exception(self, exc):
     """Log *exc* with traceback and store it as an error-level event."""
     self.log.exception(exc)
     return Event.save(self.crawler.name, self.stage.name,
                       Event.LEVEL_ERROR, self.run_id,
                       error_type=exc.__class__.__name__,
                       error_message=six.text_type(exc),
                       error_details=traceback.format_exc())
Beispiel #9
0
def crawler_stages(crawler):
    """See the number of executions of each stage.

    Each entry is the stage's event-count dict, augmented with the stage
    object itself and its total operation count.
    """
    stages = []
    for stage in crawler:
        entry = Event.get_stage_counts(crawler, stage)
        entry.update({'total_ops': stage.op_count, 'stage': stage})
        stages.append(entry)
    return stages
Beispiel #10
0
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler.

        ``incremental`` overrides the global setting when given;
        ``run_id`` is passed through unchanged (may be None).
        """
        state = {
            'crawler': self.name,
            'run_id': run_id,
            'incremental': settings.INCREMENTAL if incremental is None
                           else incremental,
        }
        # Cancel previous runs:
        self.cancel()
        # Flush out previous events:
        Event.delete(self)
        Queue.queue(self.init_stage, state, {})
        # Without a redis backend, execute queued tasks synchronously.
        if not settings.REDIS_HOST:
            TaskRunner.run()
Beispiel #11
0
 def emit_warning(self, message, type=None, details=None, *args):
     """Log a warning and store it as a warning-level event.

     Extra positional ``args`` are interpolated into ``message`` with
     %-formatting before it is logged and saved.
     """
     if args:
         message = message % args
     self.log.warning(message)
     return Event.save(self.crawler.name, self.operation_id,
                       Event.LEVEL_WARNING, error_type=type,
                       error_message=message, error_details=details)
Beispiel #12
0
 def emit_warning(self, message, type=None, *args):
     """Log a warning and store it as a warning-level event for this run.

     Extra positional ``args`` are interpolated into ``message`` with
     %-formatting before it is logged and saved.
     """
     if args:
         message = message % args
     self.log.warning(message)
     return Event.save(self.crawler, self.stage, Event.LEVEL_WARNING,
                       self.run_id, error=type, message=message)
Beispiel #13
0
def index():
    """Generate a list of all crawlers, alphabetically, with op counts."""
    crawlers = []
    for item in manager:
        info = Event.get_counts(item)
        info.update({
            "last_active": item.last_run,
            "total_ops": item.op_count,
            "running": item.is_running,
            "crawler": item,
        })
        crawlers.append(info)
    return render_template("index.html", crawlers=crawlers)
Beispiel #14
0
def crawlers_index():
    """Generate a list of all crawlers, sorted alphabetically, with op
    counts."""
    crawlers = []
    for item in manager:
        info = Event.get_counts(item)
        info.update({
            'last_active': item.last_run,
            'total_ops': item.op_count,
            'running': item.is_running,
            'crawler': item,
        })
        crawlers.append(info)
    return crawlers
Beispiel #15
0
def events(name):
    """Render a paginated event listing for the crawler called *name*.

    Query args ``stage_name``, ``run_id`` and ``level`` narrow the
    selection; ``page`` picks the page (1-based).
    """
    crawler = get_crawler(name)
    args = request.args
    page = int(args.get("page", 1))
    start = (max(1, page) - 1) * PAGE_SIZE
    end = start + PAGE_SIZE
    run_id = args.get("run_id")
    level = args.get("level")
    stage_name = args.get("stage_name")
    # Most specific filter wins: stage, then run, then the whole crawler.
    if stage_name:
        events = Event.get_stage_events(crawler, stage_name, start, end, level)
    elif run_id:
        events = Event.get_run_events(crawler, run_id, start, end, level)
    else:
        events = Event.get_crawler_events(crawler, start, end, level)
    total = len(events)
    pages = int(math.ceil(float(total) / PAGE_SIZE))
    return render_template("events.html", crawler=crawler, results=events,
                           page=page, pages=pages)
Beispiel #16
0
def crawler_events(crawler,
                   level=None,
                   stage_name=None,
                   run_id=None,
                   page=1,
                   per_page=15):
    """Return one page of events for *crawler* as a pagination dict.

    ``stage_name``, ``run_id`` and ``level`` narrow the selection; the
    most specific filter (stage, then run) wins.
    """
    start = (max(1, page) - 1) * per_page
    end = start + per_page
    if stage_name:
        events = Event.get_stage_events(crawler, stage_name, start, end, level)
    elif run_id:
        events = Event.get_run_events(crawler, run_id, start, end, level)
    else:
        events = Event.get_crawler_events(crawler, start, end, level)
    total = len(events)
    page_count = int(math.ceil(float(total) / per_page))
    return {
        'page': page,
        'per_page': per_page,
        'pages': page_count,
        'total': total,
        'results': events,
    }
Beispiel #17
0
 def flush_events(self):
     """Delete every event recorded for this crawler."""
     Event.delete(self)
Beispiel #18
0
 def flush(self):
     """Delete all run-time data generated by this crawler."""
     # Cancel queued work before deleting stored data.
     self.queue.cancel()
     # Drop recorded events, then any remaining crawl state.
     Event.delete(self)
     Crawl.flush(self)
Beispiel #19
0
def crawler_runs(crawler):
    """Return the crawler's runs, each augmented with its event counts."""
    runs = []
    for run in crawler.runs:
        run.update(Event.get_run_counts(crawler, run['run_id']))
        runs.append(run)
    return runs