Example #1
    def __init__(self, domaininfo, job, mapper,
                 scheduler, inq=None):
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler

        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        self.inq = inq
        if self.inq is None:
            self.inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        # seen database is initialized lazily
        #self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        self.workset_state = [0 for i in range(self.mapper.nworksets)]

        # TODO: this could be combined with FileDequeue above in a single class
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = Dispatcher.inqwatcher.addwatch(hqconfig.inqdir(self.jobname))
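
This constructor (evidently Dispatcher.__init__, given the class attribute it touches) receives its collaborators - mapper, scheduler, and optionally the incoming queue - as arguments, but still builds the FileDequeue, Diverter, and ExcludedList itself; the TODO comments mark the dependencies yet to be injected. Note that the InqueueWatcher is stored on the class rather than the instance, so every Dispatcher in the process shares a single directory-watching thread.
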
Example #2
File: hq.py Project: travisfw/hq
    def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
        self.workset_state = [0 for i in range(self.mapper.nworksets)]

        # seen-db initialization is delayed until it's actually needed
        self.seen = None
        #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
        self.crawlinfodb = crawlinfo
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)
        # self.inq = HashSplitIncomingQueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffsize=500)
        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self.diverter = Diverter(self.jobname, self.mapper)

        #self.discovered_executor = ThreadPoolExecutor(poolsize=1)

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False
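
As in Example #1, the seen database is created lazily: self.seen starts as None and the eager construction is left commented out. The commented-out HashSplitIncomingQueue block shows the incoming-queue implementation that PooledIncomingQueue replaced, with the buffer size raised from 500 to 1000. Crawl-info use is switched off by default because, per the comment, it is too slow.
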
Example #3
File: inq.py Project: travisfw/hq
    def __init__(self, jobconfig):
        self.jobconfig = jobconfig
        self.enq = FileEnqueue(qdir=hqconfig.inqdir(self.jobconfig.name),
                               suffix=os.getpid(),
                               buffer=1000,
                               executor=None,
                               gzip=9)
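
This is the write-only half of an incoming queue: suffix=os.getpid() keeps queue files from concurrent writer processes from colliding, buffer=1000 batches entries before they hit disk, and gzip=9 compresses finished queue files at the highest level. executor=None presumably makes flushes run synchronously in the calling thread.
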
Example #4
    def __init__(self, job, dispatcher_type, maxn):
        self.job = job
        self.maxn = maxn

        self.domaininfo = hqconfig.factory.domaininfo()
        self.jobconfigs = hqconfig.factory.jobconfigs()
        self.coordinator = hqconfig.factory.coordinator()

        # per-job objects
        # TODO: process multiple jobs in one process
        self.mapper = CrawlMapper(CrawlJob(self.job, self.jobconfigs),
                                  hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.job),
                                   self.mapper, reading=False)
        self.inqueue = IncomingQueue(hqconfig.inqdir(self.job),
                                     deq=PriorityDequeue)
        self.dispatcher = build_dispatcher(dispatcher_type,
                                           self.domaininfo, self.job,
                                           mapper=self.mapper,
                                           scheduler=self.scheduler,
                                           inqueue=self.inqueue)

        if os.uname()[0] == 'Linux':
            if InqueueProcessor.inqwatcher is None:
                iqw = InqueueProcessor.inqwatcher = InqueueWatcher()
                iqw.start()
            self.watch = InqueueProcessor.inqwatcher.addwatch(self.inqueue.qdir)
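
The watcher is only started when os.uname() reports Linux, presumably because InqueueWatcher depends on a Linux-only file-notification facility such as inotify; on other platforms self.watch is never assigned, so downstream code must tolerate its absence. The class-level singleton dance is the same one seen in Example #1.
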
Example #5
    def __init__(self, jobconfigs, domaininfo, job):
        self.domaininfo = domaininfo
        self.jobconfigs = jobconfigs
        self.jobname = job
        if not self.jobconfigs.job_exists(self.jobname):
            raise ValueError('unknown job %s' % self.jobname)
        self.job = CrawlJob(self.jobconfigs, self.jobname, self.domaininfo)
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        # read the class attribute here: iqw would be unbound when the
        # watcher already exists from an earlier instance
        self.watch = Dispatcher.inqwatcher.addwatch(hqconfig.inqdir(self.jobname))
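
The lazy class-attribute initialization in Examples #1, #4, and #5 is the same small pattern each time: the first instance creates and starts a process-wide watcher, later instances reuse it, and each instance registers its own directory. A minimal, self-contained sketch of the pattern (Watcher is a stand-in here, not hq's InqueueWatcher):

import threading

class Watcher(threading.Thread):
    # Stand-in for hq's InqueueWatcher: one daemon thread serving many
    # directories. A real watcher would block on inotify/poll in run().
    def __init__(self):
        threading.Thread.__init__(self)
        self.daemon = True
        self.watched = []
    def addwatch(self, qdir):
        self.watched.append(qdir)
        return qdir  # hq's version presumably returns a watch handle
    def run(self):
        pass

class Dispatcher:
    inqwatcher = None  # class attribute: at most one watcher per process

    def __init__(self, qdir):
        if Dispatcher.inqwatcher is None:
            # the first instance creates and starts the shared watcher
            Dispatcher.inqwatcher = Watcher()
            Dispatcher.inqwatcher.start()
        # every instance registers its own directory with the shared thread
        self.watch = Dispatcher.inqwatcher.addwatch(qdir)

Note that the check-then-create is unsynchronized, so this only works if dispatchers are constructed from a single thread, which hq evidently assumes.
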
Example #6
    def __init__(self, jobconfigs, jobname, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)

        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo

        self.scheduler = Scheduler(self.jobname, self.mapper)
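
Unlike Examples #1 and #2, this variant builds the Seen database eagerly in the constructor, uses the plain WorksetMapper rather than CrawlMapper, and passes the job name directly to Scheduler instead of hqconfig.worksetdir(jobname); it appears to be an older revision of the same class.
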
Example #7
    def __init__(self, jobconfig, maxqueuesize=4*1000*1000):
        self.jobconfig = jobconfig
        qdir = hqconfig.inqdir(self.jobconfig.name)
        self.enq = PriorityEnqueue(qdir=qdir,
                                   suffix=os.getpid(),
                                   maxsize=maxqueuesize,
                                   buffer=1000,
                                   executor=None,
                                   gzip=9)
        self.deq = PriorityDequeue(qdir=qdir, enq=self.enq,
                                   deqfactory=DummyFileDequeue)

        self.lastflush = None
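
Here both halves of a priority queue are wired over the same directory. The dequeue side is handed the enq instance, presumably so it can flush buffered entries when readers catch up with the writers, and self.lastflush is initialized to track when that last happened.
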
Example #8
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0
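
The enqfactory/deqfactory closures let IncomingQueue construct its own queue halves while this caller picks the classes and options; readsorted (the inq.sort config flag, on by default) swaps FPSortingQueueFileReader into the dequeue side, and the commented-out block preserves the direct construction this replaced. A rough sketch of how IncomingQueue might consume such factories (an assumption inferred from this call site, not hq's actual implementation):

class IncomingQueue:
    # Assumed shape of IncomingQueue, inferred from the call above.
    def __init__(self, qdir, enq, deq, buffsize=0):
        self.qdir = qdir
        # each factory receives the queue directory plus whatever keyword
        # options IncomingQueue chooses, and returns one queue half
        self.enq = enq(qdir, buffer=buffsize)  # assumed kwargs
        self.deq = deq(qdir, enq=self.enq)     # assumed kwargs
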
Example #9
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0
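
Example #9 reads like an earlier revision of Example #8: the constructor is the same except that it instantiates PooledIncomingQueue directly instead of passing queue factories to IncomingQueue, and it lacks the addedcount/processedcount counters.
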
Example #10
    def inqdir(self, job):
        return hqconfig.inqdir(job)
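
A one-line delegation: the class exposes inqdir on its own interface but defers entirely to hqconfig.inqdir, sparing callers a direct dependency on hqconfig.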