Beispiel #1
0
Datei: hq.py Projekt: travisfw/hq
class Headquarters(object):
    """Collection of CrawlJobs together with the services they share.

    Jobs are created lazily by get_job() and cached by name in
    ``self.jobs``; access to that cache is guarded by ``self.jobslock``.
    """

    def __init__(self):
        """Set up the job cache and the shared backing services."""
        # job name -> CrawlJob; populated on demand by get_job()
        self.jobs = {}
        self.jobslock = threading.RLock()
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        self.crawlinfo = CrawlInfo('wide')
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        # 'crawl' is looked up on the connection; it is handed to both
        # DomainInfo and MongoJobConfigs below.
        self.configdb = self.mongo.crawl
        self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = MongoJobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down every cached job and domaininfo, then disconnect.

        ``self.configdb`` is reset to None before the connection held in
        ``self.mongo`` is disconnected.
        """
        for job in self.jobs.values():
            job.shutdown()
        self.domaininfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob cached under jobname, creating it if needed.

        A newly created job is stored in ``self.jobs`` and passed to
        ``self.coordinator.publish_job``. Lookup and creation both happen
        while holding ``self.jobslock``.

        jobname  -- key identifying the job.
        nocreate -- when true, a job that is not cached is only created
                    if ``self.jobconfigs.job_exists(jobname)`` is true.
                    When false, existence is not checked at all.

        Raises ValueError when nocreate is true, the job is not cached
        and jobconfigs reports that it does not exist.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise ValueError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs, jobname, self.crawlinfo, self.domaininfo)
                self.coordinator.publish_job(job)
            return job
        # NOTE: two assignments (self.schedulers / self.incomingqueues)
        # used to follow the with-block. The block always returns or
        # raises, so they could never execute and have been removed.

    def get_workset_status(self, job):
        """Return the workset status of the named job.

        The result of the job's own get_workset_status() is extended
        with an 'hq' entry holding ``id(self)``. The job is obtained via
        get_job(job), so it is created if it is not cached yet.
        """
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    # (name, type) pairs; presumably the parameters an outside caller
    # may read/set on this object -- TODO confirm against the caller.
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Call load() on the shared DomainInfo."""
        self.domaininfo.load()
Beispiel #2
0
class Headquarters(object):
    """Collection of CrawlJobs together with the services they share.

    Jobs are created lazily by get_job() and cached by name in
    ``self.jobs``; access to that cache is guarded by ``self.jobslock``.
    DomainInfo is created lazily by get_domaininfo().
    """

    def __init__(self):
        """Set up the job cache, the MongoDB connection and coordinator."""
        # job name -> CrawlJob; populated on demand by get_job()
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        # logging.warn is a deprecated alias of logging.warning
        logging.warning('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be better abstraction)
        # -- see get_domaininfo()
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down cached jobs and any live services, then disconnect.

        domaininfo and crawlinfo are only shut down when they are set
        (truthy). ``self.configdb`` is reset to None before the
        connection held in ``self.mongo`` is disconnected.
        """
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        """Return the shared DomainInfo, constructing it on first call."""
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob cached under jobname, creating it if needed.

        A newly created job is stored in ``self.jobs``. The job is passed
        to ``self.coordinator.publish_job`` on every call, whether it was
        just created or already cached. Everything runs while holding
        ``self.jobslock``.

        jobname  -- key identifying the job.
        nocreate -- when true, a job that is not cached is only created
                    if ``self.jobconfigs.job_exists(jobname)`` is true.
                    When false, existence is not checked at all.

        Raises UnknownJobError when nocreate is true, the job is not
        cached and jobconfigs reports that it does not exist.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
            # NOTE(review): runs for cached jobs too -- presumably
            # publish_job is safe to repeat; confirm with Coordinator.
            self.coordinator.publish_job(job)
            return job
        # NOTE: two assignments (self.schedulers / self.incomingqueues)
        # used to follow the with-block. The block always returns or
        # raises, so they could never execute and have been removed.

    def get_workset_status(self, job):
        """Return the workset status of the named job.

        The result of the job's own get_workset_status() is extended
        with an 'hq' entry holding ``id(self)``. The job is obtained via
        get_job(job), so it is created if it is not cached yet.
        """
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    # (name, type) pairs; presumably the parameters an outside caller
    # may read/set on this object -- TODO confirm against the caller.
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Call load() on the DomainInfo if one has been created."""
        if self.domaininfo:
            self.domaininfo.load()