def run():
    clawers = Clawer.objects.filter(status=Clawer.STATUS_ON).all()
    monitor = RealTimeMonitor()
    download_queue = DownloadQueue()

    for clawer in clawers:
        clawer_setting = clawer.cached_settings()
        queue_name = clawer_setting.prior_to_queue_name()
        # Deliberately not ordered by id:
        # clawer_tasks = ClawerTask.objects.filter(clawer_id=clawer.id, status=ClawerTask.STATUS_LIVE).order_by("id")[:clawer_setting.dispatch]
        clawer_tasks = ClawerTask.objects.filter(
            clawer_id=clawer.id,
            status=ClawerTask.STATUS_LIVE)[:clawer_setting.dispatch]

        for item in clawer_tasks:
            # Stop dispatching for this clawer once the queue refuses new jobs
            if not download_queue.enqueue(queue_name, download_clawer_task,
                                          args=[item, clawer_setting]):
                break
            item.status = ClawerTask.STATUS_PROCESS
            item.save()
            # Trace the status change for real-time monitoring
            monitor.trace_task_status(item)

        print "clawer is %d, job count %d, queue name %s" % (
            clawer.id, len(download_queue.jobs), queue_name)

    return download_queue
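# A minimal usage sketch (an assumption, not taken from this file): run() is
# meant to be invoked from a periodic scheduler such as a cron-driven entry
# point, and returns the DownloadQueue it filled. dispatch_once is a
# hypothetical wrapper for illustration.
def dispatch_once():
    # One dispatch round across all live clawers
    queue = run()
    print "dispatched %d download jobs" % len(queue.jobs)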
def task_stat(request):
    result = {"is_ok": True, "status": [], "series": [], "xAxis": []}
    monitor = RealTimeMonitor()

    for (status, name) in ClawerTask.STATUS_CHOICES:
        result["status"].append(name)
        remote_data = monitor.load_task_stat(status)
        dts = sorted(remote_data["data"].keys())
        # All series share one x-axis; build it from the first status only
        if result["xAxis"] == []:
            result["xAxis"] = [x.strftime("%d %H:%M") for x in dts]
        counts = [remote_data["data"][x]["count"] for x in dts]
        result["series"].append(counts)

    return result
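# A sketch of the payload shape task_stat() produces, suitable for a chart
# widget: one label per task status, one count series per status, and a
# shared time axis formatted as "%d %H:%M". The status names and counts
# below are illustrative assumptions, not values from this repo.
EXAMPLE_TASK_STAT = {
    "is_ok": True,
    "status": ["live", "process", "fail", "success"],
    "xAxis": ["07 10:00", "07 10:05", "07 10:10"],
    "series": [[3, 1, 0], [2, 4, 1], [0, 0, 2], [5, 6, 9]],
}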
def __init__(self, clawer, clawer_task):
    from clawer.models import RealTimeMonitor, ClawerAnalysisLog

    self.clawer = clawer
    self.clawer_task = clawer_task
    self.monitor = RealTimeMonitor()
    self.background_queue = BackgroundQueue()
    # Hostname is truncated to fit the log field
    self.hostname = socket.gethostname()[:16]
    self.runing_analysis = self.clawer.runing_analysis()
    self.analysis_log = ClawerAnalysisLog(clawer=self.clawer,
                                          task=self.clawer_task,
                                          hostname=self.hostname,
                                          analysis=self.runing_analysis)
def test_task_stat(self):
    clawer = Clawer.objects.create(name="hi", info="good")
    clawer_generator = ClawerTaskGenerator.objects.create(
        clawer=clawer,
        code="print hello",
        cron="*",
        status=ClawerTaskGenerator.STATUS_PRODUCT)
    clawer_task = ClawerTask.objects.create(
        clawer=clawer,
        task_generator=clawer_generator,
        uri="http://github.com",
        status=ClawerTask.STATUS_FAIL)
    monitor = RealTimeMonitor()
    # Record the same failed status three times to populate the stat window
    monitor.trace_task_status(clawer_task)
    monitor.trace_task_status(clawer_task)
    monitor.trace_task_status(clawer_task)

    url = reverse("clawer.apis.monitor.task_stat")
    resp = self.logined_client.get(url)
    result = json.loads(resp.content)
    self.assertTrue(result["is_ok"])

    clawer.delete()
    clawer_generator.delete()
    clawer_task.delete()
def test_trace(self):
    clawer = Clawer.objects.create(name="hi", info="good")
    clawer_generator = ClawerTaskGenerator.objects.create(
        clawer=clawer,
        code="print hello",
        cron="*",
        status=ClawerTaskGenerator.STATUS_PRODUCT)
    clawer_task = ClawerTask.objects.create(
        clawer=clawer,
        task_generator=clawer_generator,
        uri="http://github.com",
        status=ClawerTask.STATUS_FAIL)
    monitor = RealTimeMonitor()
    monitor.trace_task_status(clawer_task)
    monitor.trace_task_status(clawer_task)
    result = monitor.trace_task_status(clawer_task)
    # The trace always returns a fixed-size window of data points
    self.assertEqual(len(result["data"]), monitor.POINT_COUNT)

    clawer.delete()
    clawer_generator.delete()
    clawer_task.delete()
def __init__(self, task_generator):
    from clawer.models import ClawerGenerateLog, RealTimeMonitor

    self.task_generator = task_generator
    self.clawer = self.task_generator.clawer
    # Per-generator scratch file under /tmp
    self.out_path = "/tmp/task_generator_%d" % self.task_generator.id
    self.monitor = RealTimeMonitor()
    self.hostname = socket.gethostname()[:16]
    self.generate_log = ClawerGenerateLog(clawer=self.clawer,
                                          task_generator=self.task_generator,
                                          hostname=self.hostname)
    self.start_time = time.time()
    self.end_time = None
    self.content_bytes = 0
    self.url_cache = UrlCache()
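# A minimal sketch of the timing bookkeeping this initializer sets up; the
# finish() helper below is hypothetical, not part of this class. It mirrors
# the repo's convention of storing spend_time in milliseconds.
def finish(self):
    # Close the timing window opened in __init__ and persist the log
    self.end_time = time.time()
    self.generate_log.spend_time = int((self.end_time - self.start_time) * 1000)
    self.generate_log.save()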
import os
import socket
import traceback
import types


class DownloadClawerTask(object):

    def __init__(self, clawer_task, clawer_setting):
        from clawer.models import ClawerDownloadLog, RealTimeMonitor

        self.clawer_task = clawer_task
        self.download_log = ClawerDownloadLog(clawer=clawer_task.clawer,
                                              task=clawer_task,
                                              hostname=socket.gethostname())
        self.monitor = RealTimeMonitor()
        self.background_queue = BackgroundQueue()
        self.headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0"
        }
        self.sentry = SentryClient()
        self.clawer_setting = clawer_setting
        self.downloader = Download(self.clawer_task.uri,
                                   engine=self.clawer_setting.download_engine,
                                   js=self.clawer_setting.download_js)

        if self.clawer_setting.proxy:
            self.downloader.add_proxies(
                self.clawer_setting.proxy.strip().split("\n"))
        # Cookies can come from the task or the crawler settings; when both
        # are set, the setting-level cookie overwrites the header
        if self.clawer_task.cookie:
            self.headers["cookie"] = self.clawer_task.cookie
            self.downloader.add_cookie(self.clawer_task.cookie)
        if self.clawer_setting.cookie:
            self.headers["cookie"] = self.clawer_setting.cookie
            self.downloader.add_cookie(self.clawer_setting.cookie)

        self.downloader.add_headers(self.headers)

    def download(self):
        from clawer.models import ClawerTask, ClawerDownloadLog

        # Only live or already-dispatched tasks may be downloaded
        if self.clawer_task.status not in [ClawerTask.STATUS_LIVE,
                                           ClawerTask.STATUS_PROCESS]:
            return 0

        failed = False
        self.downloader.download()
        if self.downloader.failed:
            self.download_failed()
            return 0

        # Persist the downloaded content to the task's store path
        try:
            path = self.clawer_task.store_path()
            if os.path.exists(os.path.dirname(path)) is False:
                os.makedirs(os.path.dirname(path), 0775)
            with open(path, "w") as f:
                content = self.downloader.content
                if isinstance(content, types.UnicodeType):
                    content = content.encode("utf-8")
                f.write(content)
            self.clawer_task.store = path
        except Exception:
            failed = True
            self.download_log.failed_reason = traceback.format_exc(10)
            self.sentry.capture()

        if failed:
            self.download_failed()
            return 0

        # Success: update the task and write the download log
        self.clawer_task.status = ClawerTask.STATUS_SUCCESS
        self.clawer_task.save()
        content_length = self.downloader.response_headers.get("content-length")
        if content_length:
            self.download_log.content_bytes = content_length
        else:
            self.download_log.content_bytes = len(self.downloader.content)
        self.download_log.status = ClawerDownloadLog.STATUS_SUCCESS
        self.download_log.content_encoding = self.downloader.content_encoding
        self.download_log.spend_time = int(self.downloader.spend_time * 1000)
        self.download_log.save()
        self.monitor.trace_task_status(self.clawer_task)

        return self.clawer_task.id

    def download_failed(self):
        from clawer.models import ClawerTask, ClawerDownloadLog

        self.download_log.status = ClawerDownloadLog.STATUS_FAIL
        if self.downloader.failed_exception:
            self.download_log.failed_reason = self.downloader.failed_exception
        self.download_log.spend_time = int(self.downloader.spend_time * 1000)
        # Defer the writes to the background queue so the worker can move on
        self.background_queue.enqueue(clawer_download_log_delay_save,
                                      [self.download_log])
        self.clawer_task.status = ClawerTask.STATUS_FAIL
        self.background_queue.enqueue(clawer_task_delay_save,
                                      [self.clawer_task])
        self.monitor.trace_task_status(self.clawer_task)
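# A minimal worker sketch showing the assumed wiring between run() and this
# class: run() enqueues download_clawer_task with a task and its settings, so
# the job function is presumed to look roughly like this. The body below is
# an illustration, not taken from this file.
def download_clawer_task(clawer_task, clawer_setting):
    # Build the per-task downloader and return the task id, or 0 on
    # failure or skip, matching download()'s return convention
    return DownloadClawerTask(clawer_task, clawer_setting).download()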