import threading
import logging

import pymongo

import hqconfig
# Project-internal collaborators referenced below (JobConfigs, CrawlJob,
# Coordinator, CrawlInfo, WorksetMapper, Distributor, DomainInfo,
# UnknownJobError) are imported from elsewhere in this codebase; their
# module paths are not shown in this extract.


class Headquarters(object):
    """mini Headquarters object with just one incoming queue"""
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
        self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)

    def shutdown(self):
        for job in self.jobs.values():
            job.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                # too small a maxqueuesize would be a problem, so the config
                # parameter is in MB.
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs.get_job(jobname),
                    maxqueuesize=int(max(1, self.maxinqueuesize)*1000*1000))
                #self.coordinator.publish_job(job)
            return job
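# The lock-guarded lazy cache in get_job() above is the pattern every
# Headquarters variant in this file relies on. Below is a minimal
# self-contained sketch of that pattern, not the real implementation:
# JobCache and StubJob are hypothetical stand-ins for Headquarters and
# CrawlJob, with no MongoDB or config involved.
import threading

class JobCache(object):
    def __init__(self, factory):
        self.jobs = {}
        self.jobslock = threading.RLock()
        self.factory = factory

    def get_job(self, jobname):
        # the RLock serializes creation, so concurrent callers always
        # end up sharing a single instance per job name
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                job = self.jobs[jobname] = self.factory(jobname)
            return job

class StubJob(object):
    def __init__(self, name):
        self.name = name

cache = JobCache(StubJob)
assert cache.get_job('wide') is cache.get_job('wide')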
class QuarterMaster(object):
    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)
        # crawlinfo is historically named 'wide' but is not really wide-crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')
        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # one distributor for each job
        self.distributors = {}

    def shutdown(self):
        # coord is None when no 'zkhosts' is configured; guard against that.
        if self.coord:
            self.coord.shutdown()
        #self.crawlinfo.shutdown()

    @property
    def servers(self):
        return self.coord and self.coord.get_servers()

    @property
    def servers_status(self):
        return self.coord and self.coord.get_servers_status()

    def get_distributor(self, job):
        if job not in self.distributors:
            self.distributors[job] = Distributor(self.jobconfigs.get_job(job),
                                                 self.worksetmapper)
        return self.distributors[job]
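# QuarterMaster degrades gracefully when no 'zkhosts' is configured:
# self.coord stays None, and the `self.coord and ...` expressions in the
# properties above yield None instead of raising AttributeError. A
# self-contained sketch of that optional-dependency idiom; StubCoordinator
# and Master are hypothetical stand-ins, not the real classes.
class StubCoordinator(object):
    def get_servers(self):
        return ['server1', 'server2']

class Master(object):
    def __init__(self, zkhosts=None):
        self.coord = StubCoordinator() if zkhosts else None

    @property
    def servers(self):
        # evaluates to None when coord is None, to get_servers() otherwise
        return self.coord and self.coord.get_servers()

assert Master().servers is None
assert Master('zk1:2181').servers == ['server1', 'server2']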
class Headquarters(object):
    """mini Headquarters object with just one incoming queue"""
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        for job in self.jobs.values():
            job.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs.get_job(jobname))
                #self.coordinator.publish_job(job)
            return job
class Headquarters(object):
    '''now just a collection of CrawlJobs'''
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        logging.warn('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database, named 'wide' for historical
        # reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be a better abstraction)
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))
        self.schedulers = {}
        self.incomingqueues = {}

    def shutdown(self):
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
                self.coordinator.publish_job(job)
            return job

    def get_workset_status(self, job):
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        if self.domaininfo:
            self.domaininfo.load()
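# The loglevel property above exposes the root logger's level as a plain
# read/write attribute, presumably so a management interface can tune it
# at runtime via the PARAMS table. A quick self-contained check of that
# mechanism; Tunable is a hypothetical stand-in, with no HQ machinery.
import logging

class Tunable(object):
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        # setting the attribute changes the process-wide root logger
        logging.getLogger().setLevel(level)

t = Tunable()
t.loglevel = logging.DEBUG
assert t.loglevel == logging.DEBUG
t.loglevel = logging.WARNING
assert t.loglevel == logging.WARNING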