class BatchSeenWriter(object):
    def __init__(self, dbdir):
        self.seen = Seen(dbdir, options.cachesize*(1024*1024))
        self.buffer = []

    def put(self, curi):
        # compute the fixed URI key once, then buffer until batchsize
        curi['id'] = urihash.urikey(curi['u'])
        self.buffer.append(curi)
        if len(self.buffer) >= options.batchsize:
            self.flush()

    def flush(self):
        # writing in key order makes seen-db updates mostly sequential
        self.buffer.sort(lambda x, y: cmp(x['id'], y['id']))
        for u in self.buffer:
            self.seen.already_seen(u)
        self.buffer = []

    def processfile(self, fn):
        print >>sys.stderr, fn
        count = 0
        f = GzipFile(fn) if fn.endswith('.gz') else open(fn)
        for l in f:
            try:
                o = parse_crawllog(l)
            except Exception as ex:
                print >>sys.stderr, "skipped %s" % str(ex)
                continue
            if o is None: continue
            self.put(o)
            count += 1
            # trailing comma keeps the progress counter on one line
            print >>sys.stderr, "\r%d" % count,
        f.close()
        self.flush()
        sys.stderr.write("\n")
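# A minimal driver sketch for BatchSeenWriter; `options`/`args` coming
# from an optparse parse and `options.dbdir` naming the seen-db
# directory are assumptions, not part of the source.
if __name__ == '__main__':
    writer = BatchSeenWriter(options.dbdir)
    for fn in args:
        # each argument is a crawl log, plain or gzipped
        writer.processfile(fn)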
def init_seen(self):
    if not self.seen:
        try:
            # 'seencache' config value is in MB; convert to bytes
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
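# For example (value hypothetical): seencache=512 in the HQ config
# yields block_cache_size = 512 * 1024**2 = 536870912 bytes (512 MiB)
# for the seen database's block cache.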
def processinq(self, maxn):
    '''process incoming queue.
    the maxn parameter advises an upper limit on the number of URIs
    processed in this single call. the actual number of URIs processed
    may exceed it if the incoming queue stores URIs in chunks.'''
    # lazy initialization of seen db
    if not self.seen:
        try:
            # 'seencache' config value is in MB; convert to bytes
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
    result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                  td=0.0, ts=0.0)
    for count in xrange(maxn):
        t0 = time.time()
        furi = self.inq.get(0.01)
        result['td'] += (time.time() - t0)
        if furi is None: break
        result['processed'] += 1
        ws = self.mapper.workset(furi)
        if self.is_workset_active(ws):
            # no need to call self.workset_activating(). it's already
            # done by Scheduler.
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                # not seen yet, or its hold has expired; schedule it
                if 'w' in furi:
                    a = furi['w']
                else:
                    a = dict()
                    for k in ('p', 'v', 'x'):
                        m = furi.get(k)
                        if m is not None: a[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=a)
                self.scheduler.schedule(curi, ws)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        else:
            if self.workset_state[ws]:
                self.workset_deactivating(ws)
            # client is not active
            self.diverter.divert(str(ws), furi)
            result['saved'] += 1
    return result
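# A possible polling loop around processinq; `job` (an instance of the
# enclosing class) and the batch size 500 are assumptions. In the
# result dict, processed/scheduled/excluded/saved count URIs, while
# td/ts accumulate dequeue and seen-check/schedule time in seconds.
while True:
    r = job.processinq(500)
    if r['processed'] == 0:
        # incoming queue is drained; back off briefly before retrying
        time.sleep(1.0)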
def __init__(self, jobconfigs, jobname, domaininfo):
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    self.qdir = hqconfig.inqdir(self.jobname)
    self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
    self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
    self.domaininfo = domaininfo
    self.scheduler = Scheduler(self.jobname, self.mapper)
class CrawlJob(object):
    NWORKSETS_BITS = 8

    def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
        self.workset_state = [0 for i in range(self.mapper.nworksets)]
        # seen-db initialization is delayed until it's actually needed
        self.seen = None
        #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
        self.crawlinfodb = crawlinfo
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)
        # self.inq = HashSplitIncomingQueue(
        #     qdir=hqconfig.inqdir(self.jobname), buffsize=500)
        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)
        self.diverter = Diverter(self.jobname, self.mapper)
        #self.discovered_executor = ThreadPoolExecutor(poolsize=1)
        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

    PARAMS = [('use_crawlinfo', bool), ('save_crawlinfo', bool)]

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        logging.info("shutting down crawlinfo")
        self.crawlinfodb.shutdown()
        logging.info("done.")
        #self.discovered_executor.shutdown()

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['seen'] = self.seen and self.seen.get_status()
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    #def discovered_async(self, curis):
    #    return self.inq.add(curis)

    def get_domaininfo(self, url):
        # strip an explicit port from the host before lookup
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; starts sending CURIs to Scheduler
        and enqueues diverted CURIs back into the incoming queue so that
        processinq will process them (again).
        called by Scheduler, through CrawlMapper, when a client starts
        feeding.
        note that, unlike workset_deactivating, this method shall not be
        called from inside the processinq method below, because processinq
        executes it only when at least one CURI is available for
        processing. if inq is empty, CURIs in divert queues would never
        be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.rqfile.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; starts sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def processinq(self, maxn):
        '''process incoming queue.
        the maxn parameter advises an upper limit on the number of URIs
        processed in this single call. the actual number of URIs
        processed may exceed it if the incoming queue stores URIs in
        chunks.'''
        # lazy initialization of seen db
        if not self.seen:
            try:
                # 'seencache' config value is in MB; convert to bytes
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except Exception:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)
        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        a = dict()
                        for k in ('p', 'v', 'x'):
                            m = furi.get(k)
                            if m is not None: a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def makecuri(self, o):
        # normalize curi so crawl info lives under the 'a' key
        if 'a' not in o:
            if 'w' in o:
                o['a'] = o['w']
                del o['w']
            else:
                a = dict()
                for k in 'pxv':
                    if k in o:
                        a[k] = o[k]
                        del o[k]
                if a: o['a'] = a
        return o

    def feed(self, client, n):
        logging.debug('feed %s begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.crawlinfodb:
            t0 = time.time()
            self.crawlinfodb.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.crawlinfodb.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.crawlinfodb:
            for curi in curis:
                self.crawlinfodb.save_result(curi)
            # XXX - until I come up with better design
            self.crawlinfodb.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
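# Hypothetical client-side cycle against the CrawlJob above; the
# constructor arguments, client id 0, and batch sizes are illustrative,
# not taken from the source.
job = CrawlJob(jobconfigs, 'wide', crawlinfo, domaininfo)
job.discovered(curis_found)       # enqueue newly discovered URIs
job.processinq(500)               # seen-check and schedule them
curis = job.feed(0, 100)          # a crawler asks for up to 100 CURIs
# ... crawler fetches each curi['u'] ...
job.finished(curis)               # report completed CURIs back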
def __init__(self, dbdir):
    self.seen = Seen(dbdir, options.cachesize*(1024*1024))
    self.buffer = []
class CrawlJob(object):
    def __init__(self, jobconfigs, jobname, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)
        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    # def get_workset_status(self):
    #     r = dict(job=self.jobname, crawljob=id(self))
    #     if self.scheduler:
    #         r['sch'] = id(self.scheduler)
    #         r['worksets'] = self.scheduler.get_workset_status()
    #     return r

    def get_domaininfo(self, url):
        # strip an explicit port from the host before lookup
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            #self.scheduler.schedule(curi)
            # NOTE: self.worksets is not initialized anywhere in this
            # class; it is presumably expected to be set up elsewhere.
            ws = self.mapper.workset(curi)
            self.worksets[ws].schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''process incoming queue.
        the maxn parameter advises an upper limit on the number of URIs
        processed in this single call. the actual number of URIs
        processed may exceed it if the incoming queue stores URIs in
        chunks.'''
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    w = furi['w']
                else:
                    w = dict()
                    for k in ('p', 'v', 'x'):
                        m = furi.get(k)
                        if m is not None: w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        #self.mongo.end_request()
        return result

    def makecuri(self, o):
        return o

    def flush(self):
        self.seen.flush()
        return self.scheduler.flush()
class Dispatcher(object):
    inqwatcher = None

    # TODO: take JobConfig, instead of job
    def __init__(self, domaininfo, job, mapper, scheduler, inq=None):
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler
        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        self.inq = inq
        if self.inq is None:
            self.inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        # seen database is initialized lazily
        #self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)
        self.workset_state = [0 for i in range(self.mapper.nworksets)]
        # TODO: this could be combined with FileDequeue above in a
        # single class
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = Dispatcher.inqwatcher.addwatch(
            hqconfig.inqdir(self.jobname))

    def shutdown(self):
        #if self.job: self.job.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
            self.seen = None
        # logging.info("shutting down scheduler")
        # self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        logging.info("shutting down excludedlist")
        self.excludedlist.shutdown()
        logging.info("done.")

    def flush(self):
        """flushes URIs buffered in workset objects"""
        #return self.job.flush()

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; starts sending CURIs to Scheduler
        and enqueues diverted CURIs back into the incoming queue so that
        processinq will process them (again).
        called by Scheduler, through CrawlMapper, when a client starts
        feeding.
        note that, unlike workset_deactivating, this method shall not be
        called from inside the processinq method below, because processinq
        executes it only when at least one CURI is available for
        processing. if inq is empty, CURIs in divert queues would never
        be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; starts sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def init_seen(self):
        if not self.seen:
            try:
                # 'seencache' config value is in MB; convert to bytes
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except Exception:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

    def processinq(self, maxn):
        '''process incoming queue.
        the maxn parameter advises an upper limit on the number of URIs
        processed in this single call.
        the actual number of URIs processed may exceed it if the
        incoming queue stores URIs in chunks.'''
        # lazy initialization of seen db
        self.init_seen()
        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler
                di = self.domaininfo.get_byurl(furi['u'])
                if di and di['exclude']:
                    self.excludedlist.add(furi)
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    curi = dict(u=furi['u'], id=suri['_id'])
                    a = furi.get('w')
                    if not isinstance(a, dict): a = furi
                    for k in 'pvx':
                        m = a.get(k)
                        if m is not None: curi[k] = m
                    self.scheduler.schedule(curi)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def wait_available(self, timeout=None):
        return self.watch.wait(timeout)
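# Sketch of a dispatcher main loop; `dispatcher` (an instance of the
# Dispatcher above), the timeout, and the batch size are assumptions.
# wait_available() blocks on the InqueueWatcher event until new queue
# files show up, so the loop does not spin while the inq is empty.
while True:
    dispatcher.wait_available(timeout=10.0)
    r = dispatcher.processinq(1000)
    logging.info('processed=%d scheduled=%d excluded=%d saved=%d',
                 r['processed'], r['scheduled'], r['excluded'], r['saved'])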