class WorkSet(object):
    def __init__(self, wsdir, wsid, writing=False):
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(self.wsid))

        if writing:
            FileEnqueue.recover(self.qdir)
            self.enq = FileEnqueue(self.qdir, buffer=200)
        else:
            self.enq = DummyFileEnqueue(self.qdir)
        self.deq = FileDequeue(self.qdir)

        self.running = True

        self.scheduledcount = 0
        self.checkedoutcount = 0
        self.finishedcount = 0
        self.activecount = 0

    def flush(self):
        # _flush() should be part of close(), but not now
        self.enq._flush()
        self.enq.close()

    def shutdown(self):
        self.flush()
        self.deq.close()

    def get_status(self):
        r = dict(id=self.wsid, running=self.running,
                 scheduled=self.scheduledcount,
                 checkedout=self.checkedoutcount,
                 finished=self.finishedcount)
        return r

    def schedule(self, curi):
        self.enq.queue(curi)
        self.scheduledcount += 1

    def checkout(self, n):
        if not self.running:
            return []
        r = []
        while len(r) < n:
            curi = self.deq.get(timeout=0.001)
            if curi is None:
                self.enq.close()
                break
            r.append(curi)
        self.checkedoutcount += len(r)
        return r

    def deschedule(self, furi):
        self.finishedcount += 1
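
# Illustrative only: a minimal sketch of how a caller might drive a WorkSet,
# assuming the project's FileEnqueue/FileDequeue/DummyFileEnqueue classes are
# available in this module and CURIs are plain dicts with a 'u' (URL) key.
# The directory path and URLs are placeholders, not part of the original code.
def _example_workset_roundtrip(wsdir='/tmp/ws-example'):
    ws = WorkSet(wsdir, wsid=0, writing=True)
    # schedule a couple of CURIs (writes go through FileEnqueue)
    ws.schedule(dict(u='http://example.com/'))
    ws.schedule(dict(u='http://example.com/about'))
    # flush buffered writes so the dequeue side can see them
    ws.flush()
    # hand out up to 10 CURIs to a crawler client
    batch = ws.checkout(10)
    # report each CURI finished
    for curi in batch:
        ws.deschedule(curi)
    print(ws.get_status())
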
class SplitIncomingQueue(object):
    '''IncomingQueue variant that stores incoming URLs into multiple
    queue files, grouping them by id range. This scheme has the same
    effect as a merge sort and makes the seen check much faster.'''
    def __init__(self, job, qdirbase, splitter):
        self.job = job
        self.splitter = splitter
        # ensure job directory exists
        self.qdir = os.path.join(qdirbase, job)
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)

        self.addedcount = 0
        self.processedcount = 0

        self.maxsize = 1000*1000*1000  # 1GB
        self.queue_writer = AsyncFlusher()
        self.enqs = [FileEnqueue(self.qdir, suffix=str(win))
                     for win in range(self.splitter.nqueues)]
        # dequeue side
        #self.lastqfile = None
        self.rqfile = FileDequeue(self.qdir)
        self.qfile_read = 0
        self.qfile_written = 0

    def __del__(self):
        self.close()

    def close(self):
        # shut down the asynchronous writer created in __init__ before
        # closing the queue files
        self.queue_writer.shutdown()
        for enq in self.enqs:
            enq.close()

    def shutdown(self):
        self.close()

    def hash(self, curi):
        if 'id' in curi:
            return curi['id']
        else:
            h = Seen.urikey(curi['u'])
            curi['id'] = h
            return h

    def add(self, curis):
        result = dict(processed=0)
        for curi in curis:
            h = self.hash(curi)
            win = (h >> self.window_bits) & self.win_mask
            enq = self.enqs[win]
            enq.queue(curi)
            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        o = self.rqfile.get(timeout)
        # TODO: if queue exhausted, try closing largest enq
        if o:
            self.processedcount += 1
        return o
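
# Illustrative only: a self-contained sketch of the id-windowing used in
# add() above, mirroring the expression win = (h >> window_bits) & win_mask.
# window_bits and nqueues here are hypothetical stand-in values; in the class
# they are derived from the splitter, which is not shown in this snippet.
def _example_window_index(urlid, window_bits=60, nqueues=4):
    win_mask = nqueues - 1
    # drop the low window_bits bits, then keep enough bits to index a queue
    return (urlid >> window_bits) & win_mask
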
class IncomingQueue(object):
    # default maxsize 1GB - this would be too big for multi-queue
    # settings
    def __init__(self, qdir, noupdate=False, norecover=False, **kw):
        # ensure qdir directory exists
        self.qdir = qdir
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)

        self.addedcount = 0
        self.processedcount = 0

        self.rqfile = None
        self.qfiles = None

        if not norecover:
            FileEnqueue.recover(self.qdir)
        self.init_queues(**kw)

    def init_queues(self, buffsize=0, maxsize=1000*1000*1000):
        # dequeue side
        self.rqfile = FileDequeue(self.qdir)
        # single queue file, no asynchronous writes
        self.qfiles = [FileEnqueue(self.qdir, buffer=buffsize,
                                   maxsize=maxsize)]

    @property
    def buffsize(self):
        return self.qfiles[0].buffer_size

    @buffsize.setter
    def buffsize(self, v):
        for enq in self.qfiles:
            enq.buffer_size = v

    def __del__(self):
        self.shutdown()

    def close(self, blocking=True):
        if self.qfiles:
            for q in self.qfiles:
                q.close(blocking=blocking)

    def flush(self):
        if self.qfiles:
            for q in self.qfiles:
                q._flush()

    def shutdown(self):
        if self.rqfile:
            self.rqfile.close()
        # _flush should be part of close, but not now.
        self.flush()
        self.close()

    def get_status(self):
        buffered = sum([enq.buffered_count for enq in self.qfiles])
        r = dict(addedcount=self.addedcount,
                 processedcount=self.processedcount,
                 bufferedcount=buffered)
        if self.rqfile:
            r['queuefilecount'] = self.rqfile.qfile_count()
            r['dequeue'] = self.rqfile.get_status()
        return r

    def add(self, curis):
        result = dict(processed=0)
        for curi in curis:
            enq = self.qfiles[0]
            enq.queue(curi)
            self.addedcount += 1
            result['processed'] += 1
        return result

    def get(self, timeout=0.0):
        o = self.rqfile.get(timeout)
        # if queue exhausted, try closing current enq
        # leave busy queues
        if not o:
            self.close(blocking=False)
        if o:
            self.processedcount += 1
        return o
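
# Illustrative only: a minimal round-trip sketch of IncomingQueue, assuming
# CURIs are plain dicts with a 'u' (URL) key and that FileEnqueue and
# FileDequeue share the same on-disk queue file format. The directory path
# and URLs are placeholders, not part of the original code.
def _example_incoming_queue(qdir='/tmp/hq-inq-example'):
    inq = IncomingQueue(qdir, buffsize=0)
    inq.add([dict(u='http://example.com/'),
             dict(u='http://example.com/a')])
    # flush and close the write side so queue files become readable
    inq.flush()
    inq.close()
    curi = inq.get(timeout=0.01)
    print(curi)
    print(inq.get_status())
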
class CrawlJob(object):
    def __init__(self, jobconfigs, jobname, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname

        self.qdir = hqconfig.inqdir(self.jobname)

        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo

        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    # def get_workset_status(self):
    #     r = dict(job=self.jobname, crawljob=id(self))
    #     if self.scheduler:
    #         r['sch'] = id(self.scheduler)
    #         r['worksets'] = self.scheduler.get_workset_status()
    #     return r

    def get_domaininfo(self, url):
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0:
            host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedules curis bypassing the seen check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            #self.scheduler.schedule(curi)
            ws = self.mapper.workset(curi)
            self.worksets[ws].schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''processes the incoming queue. the maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        the actual number of URIs processed may exceed it if the
        incoming queue stores URIs in chunks.'''
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None:
                break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    w = furi['w']
                else:
                    w = dict()
                    for k in ('p', 'v', 'x'):
                        m = furi.get(k)
                        if m is not None:
                            w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        #self.mongo.end_request()
        return result

    def makecuri(self, o):
        return o

    def flush(self):
        self.seen.flush()
        return self.scheduler.flush()
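
# Illustrative only: a sketch of a CrawlJob lifecycle as suggested by the
# methods above -- construct, drain the incoming queue in batches, flush and
# shut down. jobconfigs and domaininfo are whatever collaborator objects the
# surrounding application provides; the names and batch size are placeholders.
def _example_crawljob_loop(jobconfigs, domaininfo, jobname='testcrawl'):
    job = CrawlJob(jobconfigs, jobname, domaininfo)
    try:
        while True:
            r = job.processinq(500)
            if r['processed'] == 0:
                break  # incoming queue exhausted for now
            logging.info('processed=%(processed)d scheduled=%(scheduled)d '
                         'excluded=%(excluded)d', r)
        job.flush()
    finally:
        job.shutdown()
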
class Dispatcher(object):
    inqwatcher = None

    # TODO: take JobConfig, instead of job
    def __init__(self, domaininfo, job, mapper, scheduler, inq=None):
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler

        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        self.inq = inq
        if self.inq is None:
            self.inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        # seen database is initialized lazily
        #self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        self.workset_state = [0 for i in range(self.mapper.nworksets)]

        # TODO: this could be combined with FileDequeue above in a single class
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = Dispatcher.inqwatcher.addwatch(hqconfig.inqdir(self.jobname))

    def shutdown(self):
        #if self.job: self.job.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
            self.seen = None
        # logging.info("shutting down scheduler")
        # self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        logging.info("shutting down excludedlist")
        self.excludedlist.shutdown()
        logging.info("done.")

    def flush(self):
        """flushes URIs buffered in workset objects"""
        #return self.job.flush()

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; starts sending CURIs to the
        Scheduler and enqueues diverted CURIs back into the incoming
        queue so that processinq will process them (again).
        called by Scheduler, through CrawlMapper, when a client starts
        feeding.
        note that, unlike workset_deactivating, this method shall not be
        called from inside the processinq method below, because
        processinq executes it only when at least one CURI is available
        for processing; if inq is empty, CURIs in divert queues would
        never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; starts sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when another HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def init_seen(self):
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize:
                    cachesize = int(cachesize)*(1024**2)
            except Exception:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

    def processinq(self, maxn):
        '''processes the incoming queue. the maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        the actual number of URIs processed may exceed it if the
        incoming queue stores URIs in chunks.'''
        # lazy initialization of seen db
        self.init_seen()

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None:
                break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's
                # already done by Scheduler
                di = self.domaininfo.get_byurl(furi['u'])
                if di and di['exclude']:
                    self.excludedlist.add(furi)
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    curi = dict(u=furi['u'], id=suri['_id'])
                    a = furi.get('w')
                    if not isinstance(a, dict):
                        a = furi
                    for k in 'pvx':
                        m = a.get(k)
                        if m is not None:
                            curi[k] = m
                    self.scheduler.schedule(curi)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def wait_available(self, timeout=None):
        return self.watch.wait(timeout)
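
# Illustrative only: a sketch of a polling loop driving Dispatcher, modeled on
# processinq/wait_available above. domaininfo, mapper and scheduler are the
# collaborator objects injected by the surrounding application; the names, the
# job name and the 1000-URI batch size are placeholders.
def _example_dispatcher_loop(domaininfo, mapper, scheduler, jobname='testcrawl'):
    disp = Dispatcher(domaininfo, jobname, mapper, scheduler)
    try:
        while True:
            r = disp.processinq(1000)
            if r['processed'] == 0:
                # nothing to do; block until new queue files show up
                disp.wait_available(timeout=10.0)
    finally:
        disp.shutdown()
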