Example No. 1
 def __init__(self, jobname):
     self.jobname = jobname
     self.divbase = os.path.join(hqconfig.get('datadir'), jobname, 'div')
     #self.coord = hqconfig.factory.coordinator
     self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
     self._get_servers()
     self.nodename = os.uname()[1]
Example No. 2
 def __init__(self):
     self.jobs = {}
     self.jobslock = threading.RLock()
     self.mongo = pymongo.Connection(hqconfig.get('mongo'))
     self.configdb = self.mongo.crawl
     self.jobconfigs = JobConfigs(self.configdb)
     #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
     self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)
Example No. 3
File: hq.py Project: travisfw/hq
 def __init__(self):
     self.jobs = {}
     self.jobslock = threading.RLock()
     # single shared CrawlInfo database
     # named 'wide' for historical reasons.
     self.crawlinfo = CrawlInfo('wide')
     self.mongo = pymongo.Connection(hqconfig.get('mongo'))
     self.configdb = self.mongo.crawl
     self.domaininfo = DomainInfo(self.configdb)
     self.jobconfigs = MongoJobConfigs(self.configdb)
     self.coordinator = Coordinator(hqconfig.get('zkhosts'))
Example No. 4
    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)

        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')
        
        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # distributor for each job
        self.distributors = {}
Example No. 5
 def __init__(self, jobname, bufsize=20):
     self.qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
     if not os.path.isdir(self.qdir):
         os.makedirs(self.qdir)
     FileEnqueue.recover(self.qdir)
     self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='ex')
     self.queuedcount = 0
Example No. 6
 def __init__(self):
     self.jobs = {}
     self.jobslock = threading.RLock()
     mongoserver = hqconfig.get('mongo')
     logging.warn('using MongoDB: %s', mongoserver)
     self.mongo = pymongo.Connection(mongoserver)
     self.configdb = self.mongo.crawl
     # single shared CrawlInfo database
     # named 'wide' for historical reasons.
     #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
     self.crawlinfo = None # disabled for performance reasons
     # lazy initialization (FIXME: there must be better abstraction)
     self.domaininfo = None
     #self.domaininfo = DomainInfo(self.configdb)
     self.jobconfigs = JobConfigs(self.configdb)
     self.coordinator = Coordinator(hqconfig.get('zkhosts'))
Example No. 7
def configdb():
    #return mongo().crawl
    import mongowrapper
    mongoserver = hqconfig.get('mongo')
    connection_params = dict(host=mongoserver)
    logging.info('using MongoDB: %s', mongoserver)
    return mongowrapper.MongoDatabaseWrapper(connection_params, 'crawl')
Example No. 8
def setuplogging(level=logging.INFO, filename='hq.log'):
    logsdir = os.path.join(hqconfig.get('datadir'), 'logs')
    if not os.path.isdir(logsdir): os.makedirs(logsdir)
    logging.basicConfig(
        filename=os.path.join(logsdir, filename),
        level=level,
        format='%(asctime)s %(levelname)s %(name)s %(message)s',
        datefmt='%F %T')
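
A minimal usage sketch for the helper above (the DEBUG level and the
'inq.log' filename are illustrative assumptions, not project defaults):

# route this process's logs to <datadir>/logs/inq.log
setuplogging(level=logging.DEBUG, filename='inq.log')
logging.info('logging initialized')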
Example No. 9
 def init_seen(self):
     if not self.seen:
         try:
             cachesize = hqconfig.get('seencache')
             if cachesize: cachesize = int(cachesize)*(1024**2)
         except:
             cachesize = None
         self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                          block_cache_size=cachesize)
Example No. 10
def dispatcher_leveldb(domaininfo, job, *args, **kwargs):
    # refuse to run if MergeDispatcher files exist
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    if os.path.isdir(mseendir):
        raise Exception('found directory %r, which suggests "merge"'
                        ' dispatcher is in use. remove it if that is'
                        ' no longer the case' % mseendir)
    from dispatcher import Dispatcher
    return Dispatcher(domaininfo, job, *args, **kwargs)
Example No. 11
def dispatcher_merge(domaininfo, job, *args, **kwargs):
    # refuse to run if MergeDispatcher directory does not exist
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    if not os.path.isdir(mseendir):
        raise Exception('directory %r does not exist. create it and'
                        ' put SEEN file with initial seen list.'
                        % mseendir)
    from mergedispatcher import MergeDispatcher
    return MergeDispatcher(domaininfo, job, *args, **kwargs)
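
The two factories above are complementary guards over the same
<datadir>/<job>/mseen directory. A minimal sketch of how a caller could pick
the matching one (pick_dispatcher_factory is a hypothetical helper, not
project code; os and hqconfig are assumed imported as in the snippets):

def pick_dispatcher_factory(job):
    # hypothetical helper: the presence of the 'mseen' state directory
    # decides which dispatcher factory will agree to run
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    return dispatcher_merge if os.path.isdir(mseendir) else dispatcher_leveldb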
Example No. 12
 def __call__(self, job):
     # seen cache parameter is in MB
     seen = self.job_seen.get(job)
     if not seen:
         cachesize = hqconfig.get('seencache')
         if cachesize: cachesize = int(cachesize)*(1024**2)
         seen = Seen(dbdir=hqconfig.seendir(job),
                     block_cache_size=cachesize)
         self.job_seen[job] = seen
     return seen
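
As the comment above notes, the 'seencache' setting is given in MB and is
converted to bytes before being passed as block_cache_size (cf. Example
No. 9). A worked example with an assumed setting of 256:

cachesize = '256'                        # e.g. value of hqconfig.get('seencache')
cachesize = int(cachesize) * (1024**2)   # 256 MB -> 268435456 bytes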
Example No. 13
    def __init__(self, job, mapper):
        self.jobname = job
        self.mapper = mapper
        self.basedir = os.path.join(hqconfig.get('datadir'), self.jobname, 'div')
        if not os.path.isdir(self.basedir):
            os.makedirs(self.basedir)

        self.queues = {}
        for fn in os.listdir(self.basedir):
            self.queues[fn] = DivertQueue(self.basedir, fn)
Example No. 14
File: hq.py Project: travisfw/hq
    def processinq(self, maxn):
        '''Process the incoming queue. The maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        The actual number of URIs processed may exceed it if the incoming
        queue stores URIs in chunks.'''

        # lazy initialization of seen db
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result
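
A hedged driver sketch for the method above (the job object and the batch
size of 500 are assumptions): since processinq() returns per-call counts,
a caller can loop until a call processes nothing.

while True:
    result = job.processinq(500)   # maxn advises, not enforces, the cap
    if result['processed'] == 0:
        break                      # incoming queue is drained
    logging.info('processed=%s scheduled=%s saved=%s',
                 result['processed'], result['scheduled'], result['saved'])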
Example No. 15
def setuplogging(level=logging.INFO, filename='hq.log'):
    logconfig = dict(
        level=level,
        format='%(asctime)s %(levelname)s %(name)s %(message)s',
        datefmt='%F %T'
    )
    logsdir = os.path.join(hqconfig.get('datadir'), 'logs')
    try:
        if not os.path.isdir(logsdir):
            os.makedirs(logsdir)
        logconfig['filename'] = os.path.join(logsdir, filename)
    except OSError as ex:
        # fall back to console logging if the logs directory is unavailable
        print >>sys.stderr, "failed to create logging directory {} ({})".format(
            logsdir, ex)
    logging.basicConfig(**logconfig)
Example No. 16
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0
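
Taken together, the examples show three access forms for the hqconfig
configuration tree (the values below are illustrative):

mongoserver = hqconfig.get('mongo')                      # plain key
maxqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)  # tuple path with default
readsorted = hqconfig.getint('inq.sort', 1)              # dotted path, int default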
Example No. 17
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')
                                            
        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0
Example No. 18
    def __init__(self, domaininfo, job, mapper, scheduler, inq,
                 maxsize=int(2e9)):
        """Dispatcher that performs seen check by merging sorted
        cURL records against fixed-size records of URL-IDs.

        This version can resume processing previously terminated by
        system crash etc. without double scheduling.

        :param domaininfo:
        :param job: crawl job name
        :type job: str
        :param mapper: workset mapper
        :param scheduler: workset scheduler
        :param inq: incoming queue
        :param maxsize: max size of input for a batch
        """
        # TODO: currently Dispatcher.__init__() initializes seenfactory,
        # which is not necessary for MergeDispatcher.
        #super(MergeDispatcher, self).__init__(domaininfo, job, mapper,
        #                                      scheduler, inq)
        # begin dup with Dispatcher.__init__()
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler

        self.inq = inq
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        self.processedcount = 0
        # end

        self.seendir = os.path.join(hqconfig.get('datadir'),
                                    self.jobname, 'mseen')
        self.maxsize = maxsize
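
A setup sketch matching the guard in dispatcher_merge() above (the 'wide'
job name is an assumption): the mseen directory and an initial SEEN file
must exist before the merge dispatcher will run.

job = 'wide'   # assumed job name
mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
if not os.path.isdir(mseendir):
    os.makedirs(mseendir)
# seed an empty seen list; real deployments put fixed-size URL-ID
# records here, as described in the docstring above
open(os.path.join(mseendir, 'SEEN'), 'ab').close()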
Example No. 19
def mongo():
    import pymongo
    mongoserver = hqconfig.get('mongo')
    logging.warn('using MongoDB: %s', mongoserver)
    return pymongo.Connection(mongoserver)
Example No. 20
File: inq.py Project: travisfw/hq
 def __init__(self):
     self.jobs = {}
     self.jobslock = threading.RLock()
     self.mongo = pymongo.Connection(hqconfig.get('mongo'))
     self.configdb = self.mongo.crawl
     self.jobconfigs = JobConfigs(self.configdb)
Example No. 21
    def GET(self):
        if setup_problems:
            web.header("content-type", "text/html")
            return self.render("error_setup", setup_problems)
        if db is None:
            web.header("content-type", "text/html")
            return "MongoDB connection is not available." " Make sure mongodb is running at %s" % hqconfig.get("mongo")

        errors = None
        try:
            jobs = jobconfigs.get_alljobs()
        except Exception, ex:
            errors = [str(ex)]
            jobs = []
Example No. 22
setup_problems = []   # collects ImportErrors reported on the status page
try:
    import web
    import pymongo
except ImportError, ex:
    setup_problems.append(ex)
import hqconfig

try:
    import urihash
    from mongocrawlinfo import CrawlInfo
except ImportError, ex:
    setup_problems.append(ex)

from mongojobconfigs import JobConfigs
from weblib import BaseApp
from zkcoord import Coordinator

try:
    mongo = pymongo.Connection(host=hqconfig.get("mongo"))
    db = mongo.crawl
except:
    mongo = None
    db = None

coord = Coordinator(hqconfig.get("zkhosts"))
jobconfigs = JobConfigs(db)

urls = ("/?", "Status", "/q/(.*)", "Query")
app = web.application(urls, globals())


class Status(BaseApp):
    """implements control web user interface for crawl headquarters"""
Example No. 23
def coordinator():
    from zkcoord import Coordinator
    return Coordinator(hqconfig.get('zkhosts'))
Example No. 24
import web
import pymongo
import json
import time
import re
from urlparse import urlsplit, urlunsplit
import atexit
import logging

import urihash
from weblib import BaseApp
from mongocrawlinfo import CrawlInfo
from zkcoord import Coordinator
import hqconfig

try:
    mongo = pymongo.Connection(hqconfig.get('mongo'))
    db = mongo.crawl
except:
    mongo = None
    db = None

urls = (
    '/?', 'Status',
    '/q/(.*)', 'Query'
    )
app = web.application(urls, globals())

class Status(BaseApp):
    '''implements control web user interface for crawl headquarters'''
    def GET(self):
        if db is None:
Example No. 25
        
        try:
            jobconfigs.add_job_server(job, server)
            r = dict(p, success=1)
        except Exception, ex:
            r = dict(p, success=0, error=str(ex))
        return json.dumps(r)
        
class Static:
    """fallback static files handler. so as to make status page work
    even without static files serving configuration at container level.
    """
    STATICDIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             '../static'))
    def GET(self, path):
        apath = os.path.join(self.STATICDIR, path)
        if not os.path.isfile(apath):
            raise web.notfound(path)
        return open(apath)
        
if __name__ == '__main__':
    logging.basicConfig(filename='/tmp/status.log', level=logging.INFO)
    try:
        app.run()
    except Exception as ex:
        logging.critical('app.run() terminated with error', exc_info=1)
else:
    # for debugging
    web.config.debug = hqconfig.get('web')['debug']
    application = app.wsgifunc()