def __init__(self, domaininfo, job, mapper, scheduler, inq=None):
    """Wire up the per-job dispatch objects.

    :param domaininfo: shared domain metadata lookup
    :param job: job name (string)
    :param mapper: workset mapper providing ``nworksets``
    :param scheduler: scheduler that receives dequeued work
    :param inq: incoming queue; when ``None`` a :class:`FileDequeue`
        over the job's inqdir (fingerprint-sorting reader) is created
    """
    self.domaininfo = domaininfo
    self.jobname = job
    self.mapper = mapper
    self.scheduler = scheduler

    # TODO: inject these objects from outside
    qdir = hqconfig.inqdir(self.jobname)
    if inq is None:
        inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
    self.inq = inq

    # seen database is initialized lazily (would be
    # Seen(dbdir=hqconfig.seendir(self.jobname)))
    self.seen = None

    self.diverter = Diverter(self.jobname, self.mapper)
    self.excludedlist = ExcludedList(self.jobname)

    # one state slot per workset
    self.workset_state = [0] * self.mapper.nworksets

    # TODO: this could be combined with FileDequeue above in a single class
    # class-level watcher is shared by all Dispatcher instances
    if Dispatcher.inqwatcher is None:
        iqw = Dispatcher.inqwatcher = InqueueWatcher()
        iqw.start()
    self.watch = Dispatcher.inqwatcher.addwatch(qdir)
def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
    """Set up per-job crawl state: mapper, scheduler and incoming queue.

    :param jobconfigs: job configuration registry
    :param jobname: name of the crawl job
    :param crawlinfo: crawl-info database handle
    :param domaininfo: shared domain metadata lookup
    """
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)

    # one state slot per workset
    self.workset_state = [0] * self.mapper.nworksets

    # seen-db initialization is delayed until it's actually needed
    self.seen = None

    self.crawlinfodb = crawlinfo
    self.domaininfo = domaininfo
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                               self.mapper)
    self.inq = PooledIncomingQueue(
        qdir=hqconfig.inqdir(self.jobname),
        buffsize=1000)
    self.diverter = Diverter(self.jobname, self.mapper)

    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
def __init__(self, jobconfig):
    """Create a gzip-compressed enqueue into the job's incoming-queue
    directory; queue file names carry this process's pid as suffix so
    concurrent writer processes don't collide.
    """
    self.jobconfig = jobconfig
    self.enq = FileEnqueue(
        qdir=hqconfig.inqdir(self.jobconfig.name),
        suffix=os.getpid(),
        buffer=1000,
        executor=None,
        gzip=9)
def __init__(self, job, dispatcher_type, maxn):
    """Build the full processing pipeline for one job.

    :param job: job name (string)
    :param dispatcher_type: key passed to :func:`build_dispatcher`
    :param maxn: stored as ``self.maxn`` (batch-size limit, presumably
        consumed by the processing loop — not used in this ctor)
    """
    self.job = job
    self.maxn = maxn

    # shared, factory-provided services
    self.domaininfo = hqconfig.factory.domaininfo()
    self.jobconfigs = hqconfig.factory.jobconfigs()
    self.coordinator = hqconfig.factory.coordinator()

    # per-job objects
    # TODO: process multiple jobs in one process
    self.mapper = CrawlMapper(CrawlJob(self.job, self.jobconfigs),
                              hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.job),
                               self.mapper, reading=False)
    self.inqueue = IncomingQueue(hqconfig.inqdir(self.job),
                                 deq=PriorityDequeue)
    self.dispatcher = build_dispatcher(
        dispatcher_type, self.domaininfo, self.job,
        mapper=self.mapper, scheduler=self.scheduler,
        inqueue=self.inqueue)

    # inqueue watching is set up on Linux only (inotify-style watcher,
    # presumably unavailable on other platforms)
    if os.uname()[0] == 'Linux':
        if InqueueProcessor.inqwatcher is None:
            iqw = InqueueProcessor.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = InqueueProcessor.inqwatcher.addwatch(
            self.inqueue.qdir)
def __init__(self, jobconfigs, domaininfo, job):
    """Validate the job and attach a shared inqueue watcher.

    :param jobconfigs: job configuration registry
    :param domaininfo: shared domain metadata lookup
    :param job: job name (string)
    :raises ValueError: if *job* is not a configured job
    """
    self.domaininfo = domaininfo
    self.jobconfigs = jobconfigs
    self.jobname = job
    if not self.jobconfigs.job_exists(self.jobname):
        raise ValueError('unknown job %s' % self.jobname)
    self.job = CrawlJob(self.jobconfigs, self.jobname, self.domaininfo)
    # BUGFIX: iqw was previously bound only inside the "is None" branch,
    # so any construction after the class-level watcher already existed
    # raised NameError at the addwatch() call below.
    iqw = Dispatcher.inqwatcher
    if iqw is None:
        iqw = Dispatcher.inqwatcher = InqueueWatcher()
        iqw.start()
    self.watch = iqw.addwatch(hqconfig.inqdir(self.jobname))
def __init__(self, jobconfigs, jobname, domaininfo):
    """Set up dequeue-side processing for one job: file dequeue,
    workset mapper, seen database and scheduler.
    """
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    self.qdir = hqconfig.inqdir(self.jobname)
    self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
    self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    # seen-db is opened eagerly here (other ctors in this file defer it)
    self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
    self.domaininfo = domaininfo
    # NOTE(review): Scheduler is given the bare job name here, while
    # sibling ctors pass hqconfig.worksetdir(jobname) — confirm this
    # Scheduler variant expects a name rather than a directory.
    self.scheduler = Scheduler(self.jobname, self.mapper)
def __init__(self, jobconfig, maxqueuesize=4*1000*1000):
    """Create a size-capped priority enqueue/dequeue pair over the
    job's incoming-queue directory.

    :param jobconfig: job configuration (``name`` attribute is used)
    :param maxqueuesize: maximum enqueue size (default 4,000,000)
    """
    self.jobconfig = jobconfig
    qdir = hqconfig.inqdir(self.jobconfig.name)
    # pid suffix keeps concurrent writer processes from colliding
    self.enq = PriorityEnqueue(
        qdir=qdir,
        suffix=os.getpid(),
        maxsize=maxqueuesize,
        buffer=1000,
        executor=None,
        gzip=9)
    self.deq = PriorityDequeue(
        qdir=qdir,
        enq=self.enq,
        deqfactory=DummyFileDequeue)
    # timestamp of the last flush; None until the first one happens
    self.lastflush = None
def __init__(self, hq, jobname):
    """Set up full per-job state: mapper, scheduler, priority incoming
    queue, dispatcher mode and counters.

    :param hq: owning headquarters object (supplies ``jobconfigs``)
    :param jobname: name of the crawl job
    """
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                               self.mapper)

    # 'inq.sort' (default on) selects a reader returning queue entries
    # sorted by fingerprint
    sort_enabled = hqconfig.getint('inq.sort', 1)
    queue_dir = hqconfig.inqdir(self.jobname)

    def make_enq(qdir, **kwargs):
        # plain priority enqueue
        return PriorityEnqueue(qdir, **kwargs)

    def make_deq(qdir, **kwargs):
        # dequeue side optionally gets the fingerprint-sorting reader
        if sort_enabled:
            kwargs.update(reader=FPSortingQueueFileReader)
        return PriorityDequeue(qdir, **kwargs)

    self.inq = IncomingQueue(queue_dir, enq=make_enq, deq=make_deq,
                             buffsize=1000)

    # per-job override of the dispatcher mode, default 'internal'
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    # dispatcher is created lazily (init_dispatcher is not called here)
    self.dispatcher = None

    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False

    # bookkeeping counters
    self.last_inq_count = 0
    self.addedcount = 0
    self.processedcount = 0
def __init__(self, hq, jobname):
    """Set up per-job state with a pooled incoming queue.

    :param hq: owning headquarters object (supplies ``jobconfigs``)
    :param jobname: name of the crawl job
    """
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                               self.mapper)
    self.inq = PooledIncomingQueue(
        qdir=hqconfig.inqdir(self.jobname),
        buffsize=1000)

    # per-job override of the dispatcher mode, default 'internal'
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    # dispatcher is created lazily (init_dispatcher is not called here)
    self.dispatcher = None

    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False

    self.last_inq_count = 0
def inqdir(self, job):
    """Return the incoming-queue directory for *job* (delegates to
    :func:`hqconfig.inqdir`)."""
    return hqconfig.inqdir(job)