Exemple #1
0
class Headquarters(object):
    """Minimal Headquarters: a registry of CrawlJobs, each with a single
    incoming queue, created lazily and guarded by one lock."""

    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongohost = hqconfig.get('mongo')
        self.mongo = pymongo.Connection(mongohost)
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
        # configured in MB; converted to bytes in get_job().
        self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)

    def shutdown(self):
        """Shut down every active job and drop the MongoDB connection."""
        for active in self.jobs.values():
            active.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        """Return the CrawlJob named *jobname*, creating it on first use."""
        with self.jobslock:
            existing = self.jobs.get(jobname)
            if existing is not None:
                return existing
            # the config parameter is in MB because a too-small
            # maxqueuesize would be a problem; clamp to at least 1 MB.
            created = CrawlJob(
                self.jobconfigs.get_job(jobname),
                maxqueuesize=int(max(1, self.maxinqueuesize)*1000*1000))
            self.jobs[jobname] = created
            #self.coordinator.publish_job(job)
            return created
Exemple #2
0
class QuarterMaster(object):
    """Hands out per-job Distributors backed by the shared job-config DB.

    ZooKeeper coordination is optional: when the 'zkhosts' config value
    is unset, ``self.coord`` is None and every coordination feature must
    tolerate that (see the ``servers`` properties).
    """
    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        # coordination is optional; self.coord may be None.
        self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)

        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')

        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # distributor for each job
        self.distributors = {}

    def shutdown(self):
        """Shut down the coordinator, if one was configured.

        BUG FIX: ``self.coord`` is None when 'zkhosts' is unset (see
        __init__), so the previous unconditional ``self.coord.shutdown()``
        raised AttributeError; guard it the same way the ``servers``
        properties already do.
        """
        if self.coord:
            self.coord.shutdown()
        #self.crawlinfo.shutdown()

    @property
    def servers(self):
        # None when coordination is disabled.
        return self.coord and self.coord.get_servers()
    @property
    def servers_status(self):
        # None when coordination is disabled.
        return self.coord and self.coord.get_servers_status()

    def get_distributor(self, job):
        """Return the Distributor for *job*, creating and caching it on
        first request."""
        if job not in self.distributors:
            self.distributors[job] = Distributor(self.jobconfigs.get_job(job),
                                                 self.worksetmapper)
        return self.distributors[job]
Exemple #3
0
 def __init__(self):
     """Connect to MongoDB and set up per-job bookkeeping."""
     self.jobs = {}
     self.jobslock = threading.RLock()
     mongohost = hqconfig.get('mongo')
     self.mongo = pymongo.Connection(mongohost)
     self.configdb = self.mongo.crawl
     self.jobconfigs = JobConfigs(self.configdb)
     #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
     # 'maxqueuesize' config value is expressed in MB (default 4).
     self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)
Exemple #4
0
    def __init__(self):
        """Open the job-config database and, when 'zkhosts' is set,
        start ZooKeeper coordination."""
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        # coordination is optional; self.coord stays None without zkhosts.
        if zkhosts:
            self.coord = Coordinator(zkhosts, alivenode='master')
        else:
            self.coord = None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)

        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')

        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # one Distributor per job, created lazily.
        self.distributors = {}
Exemple #5
0
 def __init__(self):
     """Open MongoDB and ZooKeeper connections and prepare job state."""
     self.jobs = {}
     self.jobslock = threading.RLock()
     mongohost = hqconfig.get('mongo')
     logging.warn('using MongoDB: %s', mongohost)
     self.mongo = pymongo.Connection(mongohost)
     self.configdb = self.mongo.crawl
     # the shared CrawlInfo database is named 'wide' for historical
     # reasons, but is currently disabled for performance reasons.
     #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
     self.crawlinfo = None # disabled for performance reasons
     # DomainInfo is created lazily (FIXME: there must be better
     # abstraction).
     self.domaininfo = None
     #self.domaininfo = DomainInfo(self.configdb)
     self.jobconfigs = JobConfigs(self.configdb)
     self.coordinator = Coordinator(hqconfig.get('zkhosts'))
Exemple #6
0
class Headquarters(object):
    """Minimal Headquarters: one incoming queue, CrawlJobs created
    on demand under a shared lock."""

    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongohost = hqconfig.get('mongo')
        self.mongo = pymongo.Connection(mongohost)
        self.configdb = self.mongo.crawl
        self.jobconfigs = JobConfigs(self.configdb)
        #self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Stop every job, then drop the MongoDB connection."""
        for active in self.jobs.values():
            active.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname):
        """Return the CrawlJob for *jobname*, creating it if needed."""
        with self.jobslock:
            existing = self.jobs.get(jobname)
            if existing is not None:
                return existing
            created = CrawlJob(self.jobconfigs.get_job(jobname))
            self.jobs[jobname] = created
            #self.coordinator.publish_job(job)
            return created
Exemple #7
0
class Headquarters(object):
    '''Now just a collection of CrawlJobs.

    Owns the shared MongoDB connection, the per-job registry, and the
    lazily-created CrawlInfo/DomainInfo helpers.

    BUG FIX: two unreachable statements (``self.schedulers = {}`` and
    ``self.incomingqueues = {}``) that sat after the ``return`` inside
    get_job() were removed; they appear to have been stray __init__
    lines and could never execute.
    '''
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        logging.warn('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be better abstraction)
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down jobs and helper databases, then disconnect Mongo."""
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        """Return the DomainInfo helper, creating it on first use."""
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob named *jobname*.

        With nocreate=True, raise UnknownJobError rather than create a
        job that has no stored configuration.  Note the job is
        (re)published to the coordinator on every call, not only at
        creation time.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
            self.coordinator.publish_job(job)
            return job

    def get_workset_status(self, job):
        """Return workset status for *job*, tagged with this HQ's id."""
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    # runtime-tunable parameters: (name, coercion type).
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()
    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Re-read domain info from the database, if it is loaded."""
        if self.domaininfo:
            self.domaininfo.load()
Exemple #8
0
 def __init__(self):
     self.jobs = {}
     self.jobslock = threading.RLock()
     self.mongo = pymongo.Connection(hqconfig.get('mongo'))
     self.configdb = self.mongo.crawl
     self.jobconfigs = JobConfigs(self.configdb)