Ejemplo n.º 1
0
def run():
    clawers = Clawer.objects.filter(status=Clawer.STATUS_ON).all()
    monitor = RealTimeMonitor()
    download_queue = DownloadQueue()

    for clawer in clawers:
        clawer_settting = clawer.cached_settings()
        queue_name = clawer_settting.prior_to_queue_name()

        # clawer_tasks = ClawerTask.objects.filter(clawer_id=clawer.id, status=ClawerTask.STATUS_LIVE).order_by("id")[:clawer_settting.dispatch]
        # 不按照id排序
        clawer_tasks = ClawerTask.objects.filter(
            clawer_id=clawer.id,
            status=ClawerTask.STATUS_LIVE)[:clawer_settting.dispatch]

        for item in clawer_tasks:
            if not download_queue.enqueue(queue_name,
                                          download_clawer_task,
                                          args=[item, clawer_settting]):
                break
            item.status = ClawerTask.STATUS_PROCESS
            item.save()
            #trace it
            monitor.trace_task_status(item)

        print "clawer is %d, job count %d, queue name %s" % (
            clawer.id, len(download_queue.jobs), queue_name)

    return download_queue
Ejemplo n.º 2
0
    def __init__(self, clawer_task, clawer_setting):
        """Prepare a download context for *clawer_task*.

        Sets up the download log, real-time monitor, background queue,
        Sentry client and default HTTP headers, then builds the
        downloader and attaches proxies and cookies from the task and
        its setting.
        """
        from clawer.models import ClawerDownloadLog, RealTimeMonitor

        self.clawer_task = clawer_task
        self.download_log = ClawerDownloadLog(clawer=clawer_task.clawer,
                                              task=clawer_task,
                                              hostname=socket.gethostname())
        self.monitor = RealTimeMonitor()
        self.background_queue = BackgroundQueue()
        self.headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0"
        }
        self.sentry = SentryClient()

        self.clawer_setting = clawer_setting

        self.downloader = Download(self.clawer_task.uri,
                                   engine=self.clawer_setting.download_engine,
                                   js=self.clawer_setting.download_js)

        if self.clawer_setting.proxy:
            # Proxy list is stored as one proxy per line.
            self.downloader.add_proxies(
                self.clawer_setting.proxy.strip().split("\n"))

        # Task-level cookie first; a setting-level cookie overrides it.
        if self.clawer_task.cookie:
            self.headers["cookie"] = self.clawer_task.cookie
            self.downloader.add_cookie(self.clawer_task.cookie)
        if self.clawer_setting.cookie:
            # BUG FIX: this branch previously reused clawer_task.cookie,
            # so a cookie configured on the setting was silently ignored.
            self.headers["cookie"] = self.clawer_setting.cookie
            self.downloader.add_cookie(self.clawer_setting.cookie)

        self.downloader.add_headers(self.headers)
Ejemplo n.º 3
0
    def test_task_stat(self):
        """The task-stat API should report is_ok after several traces."""
        clawer = Clawer.objects.create(name="hi", info="good")
        clawer_generator = ClawerTaskGenerator.objects.create(
            clawer=clawer,
            code="print hello",
            cron="*",
            status=ClawerTaskGenerator.STATUS_PRODUCT)
        clawer_task = ClawerTask.objects.create(
            clawer=clawer,
            task_generator=clawer_generator,
            uri="http://github.com",
            status=ClawerTask.STATUS_FAIL)

        # Trace the same task a few times so the monitor has data points.
        monitor = RealTimeMonitor()
        for _ in range(3):
            monitor.trace_task_status(clawer_task)

        resp = self.logined_client.get(
            reverse("clawer.apis.monitor.task_stat"))
        payload = json.loads(resp.content)
        self.assertTrue(payload["is_ok"])

        # Remove the fixtures created above.
        clawer.delete()
        clawer_generator.delete()
        clawer_task.delete()
Ejemplo n.º 4
0
 def __init__(self, clawer, clawer_task):
     """Bind the analysis context for one clawer task.

     Captures the monitor, background queue, truncated hostname and the
     clawer's currently running analysis, then prepares the analysis log
     entry (not yet saved).
     """
     from clawer.models import RealTimeMonitor, ClawerAnalysisLog

     self.clawer = clawer
     self.clawer_task = clawer_task
     self.monitor = RealTimeMonitor()
     self.background_queue = BackgroundQueue()
     # Hostname is truncated to 16 chars — presumably the log field's
     # max length; TODO confirm against the model definition.
     self.hostname = socket.gethostname()[:16]
     self.runing_analysis = self.clawer.runing_analysis()
     self.analysis_log = ClawerAnalysisLog(
         clawer=self.clawer,
         task=self.clawer_task,
         hostname=self.hostname,
         analysis=self.runing_analysis)
Ejemplo n.º 5
0
 def __init__(self, task_generator):
     """Set up the context for one task-generation run.

     Records the generator, its clawer, an output path under /tmp, the
     monitor, a truncated hostname, a fresh (unsaved) generate-log entry,
     timing bookkeeping and a URL cache.
     """
     from clawer.models import ClawerGenerateLog, RealTimeMonitor

     self.task_generator = task_generator
     self.clawer = task_generator.clawer
     self.out_path = "/tmp/task_generator_%d" % task_generator.id
     self.monitor = RealTimeMonitor()
     # Hostname truncated to 16 chars — presumably the log field's
     # max length; TODO confirm against the model definition.
     self.hostname = socket.gethostname()[:16]
     self.generate_log = ClawerGenerateLog(
         clawer=self.clawer,
         task_generator=self.task_generator,
         hostname=self.hostname)
     self.start_time = time.time()
     self.end_time = None
     self.content_bytes = 0
     self.url_cache = UrlCache()
Ejemplo n.º 6
0
def task_stat(request):
    """Build chart data from the real-time monitor.

    Returns a dict with one entry per ClawerTask status: the status
    names, a count series per status, and a shared time axis.
    """
    monitor = RealTimeMonitor()
    result = {"is_ok": True, "status": [], "series": [], "xAxis": []}

    for status, name in ClawerTask.STATUS_CHOICES:
        result["status"].append(name)

        stat = monitor.load_task_stat(status)
        points = sorted(stat["data"].keys())
        # The time axis is the same for every status; fill it only once.
        if not result["xAxis"]:
            result["xAxis"] = [p.strftime("%d %H:%M") for p in points]
        result["series"].append(
            [stat["data"][p]["count"] for p in points])

    return result
Ejemplo n.º 7
0
    def test_trace(self):
        """Repeated traces of one task keep exactly POINT_COUNT points."""
        clawer = Clawer.objects.create(name="hi", info="good")
        clawer_generator = ClawerTaskGenerator.objects.create(
            clawer=clawer,
            code="print hello",
            cron="*",
            status=ClawerTaskGenerator.STATUS_PRODUCT)
        clawer_task = ClawerTask.objects.create(
            clawer=clawer,
            task_generator=clawer_generator,
            uri="http://github.com",
            status=ClawerTask.STATUS_FAIL)

        monitor = RealTimeMonitor()
        monitor.trace_task_status(clawer_task)
        monitor.trace_task_status(clawer_task)
        trace_result = monitor.trace_task_status(clawer_task)
        # The monitor keeps a fixed-size window of data points.
        self.assertEqual(len(trace_result["data"]), monitor.POINT_COUNT)

        # Remove the fixtures created above.
        clawer.delete()
        clawer_generator.delete()
        clawer_task.delete()