Example #1
0
class QuarterMaster(object):
    """Holds job configuration and per-job URL distributors, with optional
    ZooKeeper coordination."""
    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        # coordination is optional: with no zkhosts configured we run
        # standalone and self.coord stays None.
        self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)

        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')

        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # distributor for each job, created lazily by get_distributor()
        self.distributors = {}

    def shutdown(self):
        # self.coord is None when no zkhosts are configured; the previous
        # unconditional call crashed with AttributeError in that case.
        if self.coord:
            self.coord.shutdown()
        #self.crawlinfo.shutdown()

    @property
    def servers(self):
        """Servers known to the coordinator, or None when standalone."""
        return self.coord and self.coord.get_servers()
    @property
    def servers_status(self):
        """Server status from the coordinator, or None when standalone."""
        return self.coord and self.coord.get_servers_status()

    def get_distributor(self, job):
        """Return the Distributor for *job*, creating and caching it on
        first use."""
        if job not in self.distributors:
            self.distributors[job] = Distributor(self.jobconfigs.get_job(job),
                                                 self.worksetmapper)
        return self.distributors[job]
Example #2
0
File: hq.py Project: travisfw/hq
class Headquarters(object):
    '''now just a collection of CrawlJobs'''
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        self.crawlinfo = CrawlInfo('wide')
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = MongoJobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down all jobs and close the shared databases."""
        for job in self.jobs.values():
            job.shutdown()
        self.domaininfo.shutdown()
        # also close the shared CrawlInfo opened in __init__; previously it
        # was never shut down.
        self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob *jobname*, creating and publishing it on
        first use.

        Raises ValueError when *nocreate* is true and no such job is
        configured.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise ValueError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs, jobname, self.crawlinfo, self.domaininfo)
                self.coordinator.publish_job(job)
            return job
        # (removed unreachable statements that followed the return)

    def get_workset_status(self, job):
        """Return *job*'s workset status, tagged with this HQ's id."""
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()
    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Re-read domain info from storage."""
        self.domaininfo.load()
Example #3
0
 def __init__(self, jobname):
     """Prepare to shuffle diverted URLs for *jobname*."""
     self.jobname = jobname
     # per-job diversion directory under the configured data directory
     datadir = hqconfig.get('datadir')
     self.divbase = os.path.join(datadir, jobname, 'div')
     # coordinator is opened read-only
     #self.coord = hqconfig.factory.coordinator
     self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
     self._get_servers()
     self.nodename = os.uname()[1]
Example #4
0
class Shuffle(object):
    """Forwards diverted URLs for a job's worksets to the servers
    responsible for them."""
    def __init__(self, jobname):
        self.jobname = jobname
        self.divbase = os.path.join(hqconfig.get('datadir'), jobname, 'div')
        #self.coord = hqconfig.factory.coordinator
        self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
        self._get_servers()
        self.nodename = os.uname()[1]

    def _get_servers(self):
        # TODO: read server info from coordinator
        # TODO: servers list can change in the middle of operation.
        self.id2host = self.coord.get_job_servers(self.jobname)
        self.servers = len(self.id2host)
        # currently fixed - TODO
        self.clients = 25

    def ws2id(self, wsid):
        """Map workset id *wsid* to the id of the server that owns it."""
        # explicit floor division: keeps the historical Python 2 integer
        # semantics of the old `/` and is correct under Python 3 too.
        return (wsid % self.clients) // self.servers

    def shuffle_divert(self, wsid):
        """Drain the diversion queue for workset *wsid*, submitting each
        URL to the owning server.

        Raises ValueError if the owner is unknown or is this node itself,
        IOError if the owner is not alive.
        """
        divdir = os.path.join(self.divbase, str(wsid))
        deque = FileDequeue(divdir)

        serverid = self.ws2id(wsid)
        # raise-as-call syntax replaces the Python-2-only
        # `raise Exc, msg` statement form (a SyntaxError on Python 3).
        if serverid not in self.id2host:
            raise ValueError('server for ws %d is unknown' % wsid)
        server = self.id2host[serverid]
        if server == self.nodename:
            raise ValueError('refusing to shuffle to myself')
        if not self.coord.is_server_alive(server):
            raise IOError('server %s is not alive' % server)

        client = DiscoveredClient(server, self.jobname)

        def dequewrapper(q):
            # generator: pull from the file-backed queue until it runs dry,
            # reporting progress on stderr as we go.
            count = 0
            while True:
                curi = q.get(timeout=0.1)
                if curi is None: break
                count += 1
                sys.stderr.write('\r%s/%s: submitting %d to %s' % (
                        self.jobname, wsid, count, server))
                yield curi
            sys.stderr.write('\n')

        client.batch_submit_discovered(dequewrapper(deque))
Example #5
0
File: hq.py Project: travisfw/hq
 def __init__(self):
     """Set up the job registry and its shared backing stores."""
     self.jobs = {}
     self.jobslock = threading.RLock()
     # one MongoDB connection backs all configuration collections
     connection = pymongo.Connection(hqconfig.get('mongo'))
     self.mongo = connection
     self.configdb = connection.crawl
     # single shared CrawlInfo database; named 'wide' for historical
     # reasons.
     self.crawlinfo = CrawlInfo('wide')
     self.domaininfo = DomainInfo(self.configdb)
     self.jobconfigs = MongoJobConfigs(self.configdb)
     self.coordinator = Coordinator(hqconfig.get('zkhosts'))
Example #6
0
    def __init__(self):
        """Connect to MongoDB and, when configured, to ZooKeeper."""
        zknodes = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zknodes)
        # without ZooKeeper hosts we run standalone, with no Coordinator
        if zknodes:
            self.coord = Coordinator(zknodes, alivenode='master')
        else:
            self.coord = None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)

        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')

        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # one distributor per job, created on demand
        self.distributors = {}
Example #7
0
 def __init__(self):
     """Initialize the job registry and its backing services."""
     self.jobs = {}
     self.jobslock = threading.RLock()
     mongohost = hqconfig.get('mongo')
     logging.warn('using MongoDB: %s', mongohost)
     self.mongo = pymongo.Connection(mongohost)
     self.configdb = self.mongo.crawl
     # single shared CrawlInfo database, named 'wide' for historical
     # reasons; disabled for performance reasons.
     #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
     self.crawlinfo = None
     # lazy initialization (FIXME: there must be better abstraction)
     #self.domaininfo = DomainInfo(self.configdb)
     self.domaininfo = None
     self.jobconfigs = JobConfigs(self.configdb)
     self.coordinator = Coordinator(hqconfig.get('zkhosts'))
Example #8
0
class Headquarters(object):
    '''now just a collection of CrawlJobs'''
    def __init__(self):
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        logging.warn('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be better abstraction)
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down all jobs and close databases; safe when the optional
        crawlinfo/domaininfo components were never created."""
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        """Lazily create and return the shared DomainInfo."""
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob *jobname*, creating it on first use.

        Raises UnknownJobError when *nocreate* is true and no such job is
        configured.
        """
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
            # NOTE(review): the job is published on every lookup, not only
            # on creation — confirm this re-publication is intentional.
            self.coordinator.publish_job(job)
            return job
        # (removed unreachable statements that followed the return)

    def get_workset_status(self, job):
        """Return *job*'s workset status, tagged with this HQ's id."""
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        """Effective level of the root logger."""
        return logging.getLogger().getEffectiveLevel()
    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        """Re-read domain info from storage, when domaininfo is active."""
        if self.domaininfo:
            self.domaininfo.load()