def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper,
                              testscheduler):
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    urls = generate_random_urls(100)
    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    dupseenurls = [dict(url) for url in novelurls[:25]]
    input = urls + dupseenurls

    inq.add(input)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == len(input), result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == len(novelurls), result

    check_seenfile(seenfile)
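# The helpers used above (generate_random_urls, create_seen, check_seenfile)
# are not shown in this fragment. A minimal sketch of generate_random_urls,
# assuming each test URL is a dict with a 'u' key as the assertions rely on
# (hypothetical; the real helper may differ):
import random
import string

def generate_random_urls(n):
    urls = []
    for _ in range(n):
        host = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
        path = ''.join(random.choice(string.ascii_lowercase) for _ in range(12))
        urls.append(dict(u='http://%s.example.com/%s' % (host, path)))
    return urls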
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2)."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    # TODO: there's another case of getting terminated during
    # phase 1 - actually it's more likely to happen as it takes
    # longer than phase 2. fortunately phase 1 recovery is simpler
    # than phase 2 recovery - just starting over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()

    seenfile = create_seen(dispatcher, [])

    # let TestScheduler fail on the 20th URL (i.e. after scheduling 19).
    testscheduler.failat = 20
    try:
        dispatcher.processinq(0)
        assert False, 'should raise RuntimeError'
    except Exception as ex:
        # expected
        pass
    assert len(testscheduler.curis) == 19
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    testscheduler.failat = None

    # enqueue another 50 URLs to verify they are not consumed by
    # the next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)

    dispatcher.processinq(0)

    # TODO: want to check all intermediate files are cleaned up?
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)

    # check: all of urls1 are scheduled, with no duplicates
    assert len(testscheduler.curis) == len(urls1)
    scheduled_urls = [u['u'] for u in testscheduler.curis]
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        print >>sys.stderr, "{} {}".format(u['u'], found)
        if not found:
            missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
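# A minimal sketch of a TestScheduler fixture compatible with the `failat`
# behavior the recovery test relies on (hypothetical; the real fixture
# likely implements more of the Scheduler interface). It records scheduled
# curis and raises once the failat-th schedule() call is made, so failat=20
# leaves exactly 19 curis scheduled.
class TestScheduler(object):
    def __init__(self):
        self.curis = []
        self.failat = None
    def schedule(self, curi):
        if self.failat is not None and len(self.curis) + 1 >= self.failat:
            raise RuntimeError('simulated failure at schedule() #%d' %
                               self.failat)
        self.curis.append(curi)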
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler):
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    urls = generate_random_urls(100)
    for url in urls:
        print url['u']
    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    print "processinq #1"
    inq.add(urls)
    inq.close()
    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 50, result

    scheduled = set(url['u'] for url in testscheduler.curis)
    assert all(url['u'] not in scheduled for url in seenurls)
    assert all(url['u'] in scheduled for url in novelurls)

    print "processinq #2"
    inq.add(urls)
    inq.close()
    testscheduler.curis = []
    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 0, result
    assert len(testscheduler.curis) == 0

    check_seenfile(seenfile)
        for bucket in self.buckets:
            self.inq.add(bucket['d'])
            self.count += 1

inq = IncomingQueue('wide', QUEUE_DIRECTORY)
#bucketreader = BucketReader()
bucketreader = SequenceBucketReader(100, 100)
emitters = [Emitter(bucketreader, inq) for i in range(3)]
for e in emitters:
    e.start()
for e in emitters:
    while 1:
        e.join(1.0)
        if not e.is_alive():
            break
        sys.stderr.write('\r%d %s' % (bucketreader.count, inq.get_status()))
sys.stderr.write('\n')
inq.close()

# reading queue out
nread = 0
while 1:
    o = inq.get(timeout=0.01)
    if o is None:
        break
    nread += 1
print >>sys.stderr, "read out %d items" % nread
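# One plausible shape for the Emitter thread started above (hypothetical;
# the original class is not included in this fragment, and next_bucket()
# is an assumed reader method). Each Emitter drains buckets from the shared
# reader and enqueues their documents, so three of them exercise concurrent
# writes to the IncomingQueue.
import threading

class Emitter(threading.Thread):
    def __init__(self, bucketreader, inq):
        threading.Thread.__init__(self)
        self.bucketreader = bucketreader
        self.inq = inq
    def run(self):
        while 1:
            bucket = self.bucketreader.next_bucket()
            if bucket is None:
                break
            self.inq.add(bucket['d'])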
class CrawlJob(object):
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool),
              ('dispatcher_mode', str)]

    @property
    def dispatcher_mode(self):
        return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()

    def init_dispatcher(self):
        if self.dispatcher: return self.dispatcher
        if self.dispatcher_mode == 'external':
            raise RuntimeError, 'dispatcher mode is %s' % self.dispatcher_mode
        self.dispatcher = LevelDispatcher(self.hq.get_domaininfo(),
                                          self.jobname,
                                          mapper=self.mapper,
                                          scheduler=self.scheduler,
                                          inq=self.inq.deq)
        return self.dispatcher

    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher().workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        return self.init_dispatcher().processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']: del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 1.0:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.addedcount > self.last_inq_count + 1000:
                self.inq.close()
                self.last_inq_count = self.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.close()
        return self.scheduler.flush_clients()

    def count_seen(self):
        """return number of items in seen db. can take pretty long to return.
        """
        return self.init_dispatcher().count_seen()

    def clear_seen(self):
        self.init_dispatcher().clear_seen()
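# A minimal usage sketch of CrawlJob (hypothetical driver code; `hq` stands
# for whatever HeadQuarters object supplies jobconfigs/domaininfo/crawlinfo,
# and the client name and curi fields are illustrative). It shows the
# intended call order rather than a supported API.
job = CrawlJob(hq, 'wide')

# crawlers report discovered URLs; they go into the incoming queue
job.discovered([dict(u='http://example.com/')])

# dispatcher drains the incoming queue, seen-checks, and schedules
# novel URLs into worksets
job.processinq(500)

# a crawler node asks for work, then reports results back
curis = job.feed('crawler-01', 100)
job.finished(curis)

job.shutdown()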