class Headquarters(object):
    """Mini Headquarters managing a single incoming queue per crawl job."""

    def __init__(self):
        # CrawlJob instances keyed by job name, created lazily by get_job().
        self.jobs = {}
        self.jobslock = threading.RLock()
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
        # incoming-queue size cap from config (('inq','maxqueuesize')),
        # expressed in MB; defaults to 4.
        self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)

    def shutdown(self):
        """Shut down every active job and release the mongo connection."""
        for job in self.jobs.values():
            job.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        """Return the CrawlJob named *jobname*, creating it on first use."""
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is not None:
                return job
            # too small maxqueuesize would be a problem, so config
            # parameter is in MB.
            queue_cap = int(max(1, self.maxinqueuesize) * 1000 * 1000)
            job = CrawlJob(self.jobconfigs.get_job(jobname),
                           maxqueuesize=queue_cap)
            self.jobs[jobname] = job
            #self.coordinator.publish_job(job)
            return job
class QuarterMaster(object):
    """Tracks per-job Distributors and (optionally) HQ worker servers
    through a ZooKeeper-backed Coordinator.

    ZooKeeper coordination is optional: when no ``zkhosts`` is configured,
    ``self.coord`` is None and server-discovery features return None.
    """

    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        # logging.warn is a deprecated alias; use logging.warning.
        logging.warning('zkhosts=%s', zkhosts)
        self.coord = (Coordinator(zkhosts, alivenode='master')
                      if zkhosts else None)
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)
        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')
        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # distributor for each job
        self.distributors = {}

    def shutdown(self):
        # BUG FIX: self.coord is None when no zkhosts were configured
        # (see __init__); calling shutdown() unconditionally raised
        # AttributeError. Guard the call the same way the ``servers``
        # properties already do.
        if self.coord:
            self.coord.shutdown()
        #self.crawlinfo.shutdown()

    @property
    def servers(self):
        # None when ZooKeeper coordination is disabled.
        return self.coord and self.coord.get_servers()

    @property
    def servers_status(self):
        # None when ZooKeeper coordination is disabled.
        return self.coord and self.coord.get_servers_status()

    def get_distributor(self, job):
        """Return the Distributor for *job*, creating and caching it on
        first request."""
        if job not in self.distributors:
            self.distributors[job] = Distributor(
                self.jobconfigs.get_job(job), self.worksetmapper)
        return self.distributors[job]
class Headquarters(object):
    """A minimal Headquarters holding one incoming queue."""

    def __init__(self):
        # jobname -> CrawlJob, populated lazily by get_job().
        self.jobs = {}
        self.jobslock = threading.RLock()
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down all active jobs, then drop the mongo connection."""
        for job in self.jobs.values():
            job.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        """Return the CrawlJob for *jobname*, creating it if absent."""
        with self.jobslock:
            existing = self.jobs.get(jobname)
            if existing is not None:
                return existing
            created = CrawlJob(self.jobconfigs.get_job(jobname))
            self.jobs[jobname] = created
            #self.coordinator.publish_job(created)
            return created