class Headquarters(object):
    '''now just a collection of CrawlJobs

    Owns the shared resources used by every job: a single CrawlInfo
    database, the MongoDB connection/config database, DomainInfo,
    job configurations and the ZooKeeper coordinator.
    '''

    # tunable runtime parameters: (name, type) pairs
    PARAMS = [('loglevel', int)]

    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        self.crawlinfo = CrawlInfo('wide')
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = MongoJobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down every job, then release shared databases."""
        for job in self.jobs.values():
            job.shutdown()
        self.domaininfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob named *jobname*, creating it on first use.

        :param jobname: name of the crawl job.
        :param nocreate: when True, do not create a job that has no
            stored configuration; raise ValueError instead.
        :raises ValueError: unknown job and ``nocreate`` is set.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise ValueError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs, jobname, self.crawlinfo,
                    self.domaininfo)
                self.coordinator.publish_job(job)
            # NOTE(review): the original had unreachable statements after
            # this return (self.schedulers = {}; self.incomingqueues = {});
            # removed as dead code.
            return job

    def get_workset_status(self, job):
        """Return the workset status dict for *job*, tagged with this HQ's id."""
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()
    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Re-read domain info from its backing store."""
        self.domaininfo.load()
def __init__(self):
    """Set up the job registry and the shared crawl/config databases."""
    self.jobs = {}
    self.jobslock = threading.RLock()
    # single shared CrawlInfo database; named 'wide' for historical reasons.
    self.crawlinfo = CrawlInfo('wide')
    connection = pymongo.Connection(hqconfig.get('mongo'))
    self.mongo = connection
    self.configdb = connection.crawl
    self.domaininfo = DomainInfo(self.configdb)
    self.jobconfigs = MongoJobConfigs(self.configdb)
    self.coordinator = Coordinator(hqconfig.get('zkhosts'))
def get_domaininfo(self):
    """Return the shared DomainInfo instance, creating it lazily on first call."""
    info = self.domaininfo
    if info is None:
        # defer DB-backed construction until someone actually needs it
        info = self.domaininfo = DomainInfo(self.configdb)
    return info
class Headquarters(object):
    '''now just a collection of CrawlJobs

    Shared resources (MongoDB connection, config database, lazily-built
    DomainInfo, job configurations, ZooKeeper coordinator) live here and
    are handed to each CrawlJob.
    '''

    # tunable runtime parameters: (name, type) pairs
    PARAMS = [('loglevel', int)]

    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        # logging.warn is a deprecated alias; use warning()
        logging.warning('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be better abstraction)
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down every job, then any shared databases that were created."""
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        """Return the shared DomainInfo, creating it lazily on first call."""
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob named *jobname*, creating it on first use.

        :param jobname: name of the crawl job.
        :param nocreate: when True, do not create a job that has no
            stored configuration; raise UnknownJobError instead.
        :raises UnknownJobError: unknown job and ``nocreate`` is set.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
                self.coordinator.publish_job(job)
            # NOTE(review): the original had unreachable statements after
            # this return (self.schedulers = {}; self.incomingqueues = {});
            # removed as dead code.
            return job

    def get_workset_status(self, job):
        """Return the workset status dict for *job*, tagged with this HQ's id."""
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()
    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Re-read domain info from its backing store, if it exists yet."""
        if self.domaininfo:
            self.domaininfo.load()