Example #1
    def __init__(self, clawer_task, clawer_setting):
        from clawer.models import ClawerDownloadLog, RealTimeMonitor

        self.clawer_task = clawer_task
        self.download_log = ClawerDownloadLog(clawer=clawer_task.clawer,
                                              task=clawer_task,
                                              hostname=socket.gethostname())
        self.monitor = RealTimeMonitor()
        self.background_queue = BackgroundQueue()
        self.headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0"
        }
        self.sentry = SentryClient()

        self.clawer_setting = clawer_setting

        self.downloader = Download(self.clawer_task.uri,
                                   engine=self.clawer_setting.download_engine,
                                   js=self.clawer_setting.download_js)

        if self.clawer_setting.proxy:
            self.downloader.add_proxies(
                self.clawer_setting.proxy.strip().split("\n"))

        if self.clawer_task.cookie:
            self.headers["cookie"] = self.clawer_task.cookie
            self.downloader.add_cookie(self.clawer_task.cookie)
        if self.clawer_setting.cookie:
            self.headers["cookie"] = self.clawer_setting.cookie
            self.downloader.add_cookie(self.clawer_setting.cookie)

        self.downloader.add_headers(self.headers)
Example #2
def run():
    clawers = Clawer.objects.filter(status=Clawer.STATUS_ON).all()
    monitor = RealTimeMonitor()
    download_queue = DownloadQueue()

    for clawer in clawers:
        clawer_setting = clawer.cached_settings()
        queue_name = clawer_setting.prior_to_queue_name()

        # clawer_tasks = ClawerTask.objects.filter(clawer_id=clawer.id, status=ClawerTask.STATUS_LIVE).order_by("id")[:clawer_setting.dispatch]
        # do not order by id
        clawer_tasks = ClawerTask.objects.filter(
            clawer_id=clawer.id,
            status=ClawerTask.STATUS_LIVE)[:clawer_setting.dispatch]

        for item in clawer_tasks:
            if not download_queue.enqueue(queue_name,
                                          download_clawer_task,
                                          args=[item, clawer_setting]):
                break
            item.status = ClawerTask.STATUS_PROCESS
            item.save()
            # trace the status change for the real-time monitor
            monitor.trace_task_status(item)

        print "clawer is %d, job count %d, queue name %s" % (
            clawer.id, len(download_queue.jobs), queue_name)

    return download_queue
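
The download_clawer_task job that run() enqueues is not shown on this page; judging from the DownloadClawerTask class in Example #8, it plausibly just wraps that class. A minimal sketch under that assumption (the name and signature are taken from the enqueue call above, the body is guessed):

def download_clawer_task(clawer_task, clawer_setting):
    # Assumed wrapper: build the downloader object from Example #8 and run it,
    # returning the task id on success or 0 on failure, as download() does.
    return DownloadClawerTask(clawer_task, clawer_setting).download()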
Example #3
def task_stat(request):
    result = {"is_ok":True, "status":[], "series":[], "xAxis":[]}
    monitor = RealTimeMonitor()
    
    for (status, name) in ClawerTask.STATUS_CHOICES:
        result["status"].append(name)
        
        remote_data = monitor.load_task_stat(status)
        dts = sorted(remote_data["data"].keys())
        if result["xAxis"] == []:
            result["xAxis"] = [x.strftime("%d %H:%M") for x in dts]
        serie = [remote_data["data"][x]["count"] for x in dts]
        result["series"].append(serie)
    
    return result
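
For reference, task_stat() above returns a chart-ready payload with one status name and one count series per ClawerTask status. An illustrative example of the shape (field names come from the view above; the values are invented):

example_payload = {
    "is_ok": True,
    "status": ["live", "process", "success", "fail"],         # names from STATUS_CHOICES
    "xAxis": ["01 10:00", "01 10:05", "01 10:10"],             # shared time labels
    "series": [[3, 5, 2], [1, 0, 4], [7, 9, 12], [0, 1, 1]],   # one count series per status
}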
Example #4
    def __init__(self, clawer, clawer_task):
        from clawer.models import RealTimeMonitor, ClawerAnalysisLog

        self.clawer = clawer
        self.clawer_task = clawer_task
        self.monitor = RealTimeMonitor()
        self.background_queue = BackgroundQueue()
        self.hostname = socket.gethostname()[:16]
        self.runing_analysis = self.clawer.runing_analysis()
        self.analysis_log = ClawerAnalysisLog(
            clawer=self.clawer,
            task=self.clawer_task,
            hostname=self.hostname,
            analysis=self.runing_analysis)
Example #5
    def test_task_stat(self):
        clawer = Clawer.objects.create(name="hi", info="good")
        clawer_generator = ClawerTaskGenerator.objects.create(
            clawer=clawer,
            code="print hello",
            cron="*",
            status=ClawerTaskGenerator.STATUS_PRODUCT)
        clawer_task = ClawerTask.objects.create(
            clawer=clawer,
            task_generator=clawer_generator,
            uri="http://github.com",
            status=ClawerTask.STATUS_FAIL)
        monitor = RealTimeMonitor()
        monitor.trace_task_status(clawer_task)
        monitor.trace_task_status(clawer_task)
        monitor.trace_task_status(clawer_task)
        url = reverse("clawer.apis.monitor.task_stat")

        resp = self.logined_client.get(url)
        result = json.loads(resp.content)
        self.assertTrue(result["is_ok"])

        clawer.delete()
        clawer_generator.delete()
        clawer_task.delete()
Example #6
    def test_trace(self):
        clawer = Clawer.objects.create(name="hi", info="good")
        clawer_generator = ClawerTaskGenerator.objects.create(
            clawer=clawer,
            code="print hello",
            cron="*",
            status=ClawerTaskGenerator.STATUS_PRODUCT)
        clawer_task = ClawerTask.objects.create(
            clawer=clawer,
            task_generator=clawer_generator,
            uri="http://github.com",
            status=ClawerTask.STATUS_FAIL)
        monitor = RealTimeMonitor()
        monitor.trace_task_status(clawer_task)
        monitor.trace_task_status(clawer_task)
        result = monitor.trace_task_status(clawer_task)
        #print result
        self.assertEqual(len(result["data"]), monitor.POINT_COUNT)

        clawer.delete()
        clawer_generator.delete()
        clawer_task.delete()
Example #7
    def __init__(self, task_generator):
        from clawer.models import ClawerGenerateLog, RealTimeMonitor

        self.task_generator = task_generator
        self.clawer = self.task_generator.clawer
        self.out_path = "/tmp/task_generator_%d" % self.task_generator.id
        self.monitor = RealTimeMonitor()
        self.hostname = socket.gethostname()[:16]
        self.generate_log = ClawerGenerateLog(
            clawer=self.clawer,
            task_generator=self.task_generator,
            hostname=self.hostname)
        self.start_time = time.time()
        self.end_time = None
        self.content_bytes = 0
        self.url_cache = UrlCache()
Example #8
import os
import socket
import traceback
import types

# Project-internal names used below (BackgroundQueue, Download, SentryClient,
# clawer_download_log_delay_save, clawer_task_delay_save) are assumed to be
# imported at module level.
class DownloadClawerTask(object):
    def __init__(self, clawer_task, clawer_setting):
        from clawer.models import ClawerDownloadLog, RealTimeMonitor

        self.clawer_task = clawer_task
        self.download_log = ClawerDownloadLog(clawer=clawer_task.clawer,
                                              task=clawer_task,
                                              hostname=socket.gethostname())
        self.monitor = RealTimeMonitor()
        self.background_queue = BackgroundQueue()
        self.headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0"
        }
        self.sentry = SentryClient()

        self.clawer_setting = clawer_setting

        self.downloader = Download(self.clawer_task.uri,
                                   engine=self.clawer_setting.download_engine,
                                   js=self.clawer_setting.download_js)

        if self.clawer_setting.proxy:
            self.downloader.add_proxies(
                self.clawer_setting.proxy.strip().split("\n"))

        if self.clawer_task.cookie:
            self.headers["cookie"] = self.clawer_task.cookie
            self.downloader.add_cookie(self.clawer_task.cookie)
        if self.clawer_setting.cookie:
            self.headers["cookie"] = self.clawer_setting.cookie
            self.downloader.add_cookie(self.clawer_setting.cookie)

        self.downloader.add_headers(self.headers)

    def download(self):
        from clawer.models import ClawerTask, ClawerDownloadLog

        if self.clawer_task.status not in [
                ClawerTask.STATUS_LIVE, ClawerTask.STATUS_PROCESS
        ]:
            return 0

        failed = False

        self.downloader.download()

        if self.downloader.failed:
            self.download_failed()
            return 0

        # save the downloaded content to the task's store path
        try:
            path = self.clawer_task.store_path()
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path), 0775)

            with open(path, "w") as f:
                content = self.downloader.content
                if isinstance(content, types.UnicodeType):
                    content = content.encode("utf-8")
                f.write(content)

            self.clawer_task.store = path
        except Exception:
            failed = True
            self.download_log.failed_reason = traceback.format_exc(10)
            self.sentry.capture()

        if failed:
            self.download_failed()
            return 0

        # success: mark the task done and record the download log
        self.clawer_task.status = ClawerTask.STATUS_SUCCESS
        self.clawer_task.save()

        content_length = self.downloader.response_headers.get("content-length")
        if content_length:
            self.download_log.content_bytes = content_length
        else:
            self.download_log.content_bytes = len(self.downloader.content)
        self.download_log.status = ClawerDownloadLog.STATUS_SUCCESS
        self.download_log.content_encoding = self.downloader.content_encoding
        self.download_log.spend_time = int(self.downloader.spend_time * 1000)
        self.download_log.save()

        self.monitor.trace_task_status(self.clawer_task)
        return self.clawer_task.id

    def download_failed(self):
        from clawer.models import ClawerTask, ClawerDownloadLog

        self.download_log.status = ClawerDownloadLog.STATUS_FAIL
        if self.downloader.failed_exception:
            self.download_log.failed_reason = self.downloader.failed_exception
        self.download_log.spend_time = int(self.downloader.spend_time * 1000)
        self.background_queue.enqueue(clawer_download_log_delay_save,
                                      [self.download_log])

        self.clawer_task.status = ClawerTask.STATUS_FAIL
        self.background_queue.enqueue(clawer_task_delay_save,
                                      [self.clawer_task])

        self.monitor.trace_task_status(self.clawer_task)
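
download_failed() hands the actual database writes to the background queue through two helpers, clawer_download_log_delay_save and clawer_task_delay_save, which are not shown on this page. A minimal sketch of what they plausibly do, assuming they simply persist the objects passed to them from the background worker:

def clawer_download_log_delay_save(download_log):
    # Assumed behaviour: save the ClawerDownloadLog from the background worker.
    download_log.save()


def clawer_task_delay_save(clawer_task):
    # Assumed behaviour: save the ClawerTask status change from the background worker.
    clawer_task.save()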