def robot_task_tacker(self):
    self.logger.info("start task tracker...")
    from captcha.http_client import HTTPClient
    from sailing.conf import settings as CONFIG
    import time

    client = HTTPClient()
    while True:
        try:
            # Poll the remote queue for new robot actions.
            data = client.get("http://emopad.sinaapp.com/robot/app/task_list")
            self.logger.info("tracker new task:%s" % data)
            if data.strip():
                t = new_task("webrobot")
                for line in data.split("\n"):
                    t.add_action(line.strip())
                t.close()
                t.status = "waiting"
            time.sleep(30)
            # Clean up finished tasks so the queue does not grow unbounded.
            task_list = FileTask.search(CONFIG.APP_NAME, "done", len=15)
            for t in task_list:
                self.logger.info("remove done task:%s" % t._id)
                # t.remove()
        except Exception:
            # Keep the polling loop alive even if a single iteration fails.
            self.logger.exception("task tracker iteration failed")
def _crawling(self, site, crawlers, path, url, status):
    self.logger.info("Crawling path:%s" % site.real_path(path))
    task = new_task('spider')
    task.header('Site', site.hostname)
    if url.startswith("http:"):
        url = urlparse(url).path
    for c in crawlers:
        c = self.crawlers[c]
        c.crawl(status, path, url, next_task=task, site=site)
    task.status = 'waiting'
    task.remove_empty()
def start(self, t):
    site = WebSite(t.header('Site'), "", "worker")
    next_task = new_task('worker')
    next_task.header('Site', t.header('Site'))
    for l in t.list_actions():
        action, url, save_as, args = self._parse_action(l, site)
        try:
            # Look up the handler registered for this action name.
            handler = self.actions.get(action, None)
            self.logger.debug("[%s] %s --> %s, args:%s" % (action, url, save_as, str(args)))
            if handler is not None:
                handler(site, self.http, next_task, url, save_as, *args)
            else:
                self.logger.error("spider action not found: '%s'", action)
        except Exception as e:
            self.logger.exception(trackable(u"Exception on task '%s'" % e))
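# Illustrative sketch (not part of the original source): start() dispatches each
# parsed action through self.actions, which appears to be a dict mapping an action
# name to a bound handler method. The standalone class below only demonstrates that
# dispatch-table pattern; the action name "fetch" and the handler signature are
# assumptions, not the repo's real handlers.
class DispatchSketch(object):
    def __init__(self):
        # Register handlers by action name, mirroring how start() looks them up.
        self.actions = {"fetch": self.do_fetch}

    def do_fetch(self, url, save_as):
        print("fetch %s -> %s" % (url, save_as))

    def run(self, action, url, save_as):
        handler = self.actions.get(action, None)
        if handler is not None:
            handler(url, save_as)
        else:
            print("spider action not found: '%s'" % action)

# Usage: DispatchSketch().run("fetch", "http://example.com/a.html", "a.html")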
def idle(self):
    task = new_task('spider')
    url = urlparse(settings.START_INDEX)
    host_name = url.hostname
    if url.port:
        host_name += ":%s" % url.port
    task.header('Site', host_name)
    url_path = url.path
    if url_path.endswith("/") or not url_path.strip():
        url_path = "%sindex.html" % url_path
    task.add_action("%s ==> %s" % (settings.START_INDEX, url_path.strip("/")))
    task.status = 'waiting'
    self.logger.info("created new task '%s'." % task.path)
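# Illustrative sketch (not part of the original source): idle() seeds the spider
# with an action line in the "<source-url> ==> <save-as-path>" format, which start()
# later unpacks via self._parse_action(). The helper below only demonstrates
# splitting that line format; it is NOT the repo's _parse_action(), which also
# yields an action name and extra args.
def split_action_line(line):
    """Split an '<url> ==> <save_as>' action line into its two parts."""
    url, _, save_as = line.partition("==>")
    return url.strip(), save_as.strip()

# Usage: split_action_line("http://example.com/ ==> index.html")
# returns ("http://example.com/", "index.html").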