def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    """A URL whose domain is marked excluded is counted as excluded and
    written to the dispatcher's excludedlist queue, not scheduled."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/3')
    domaininfo.excluded = 1
    enq.queue([curi])
    enq.close()

    result = dispatcher.processinq(10)

    assert result['processed'] == 1, result
    assert result['scheduled'] == 0, result
    assert result['excluded'] == 1, result
    assert result['saved'] == 0, result
    dispatcher.shutdown()

    # dump excluded qfile content for debugging
    gzfiles = py.path.local(dispatcher.excludedlist.qdir).listdir(
        fil=lambda p: p.ext == '.gz')
    for qfile in gzfiles:
        with gzip.open(str(qfile)) as f:
            print(f.read())

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
class ExcludedList(object):
    """URL list for storing excluded URLs.

    As URLs are checked for exclusion before seen check, there are
    (a lot of) duplicates. read-out is not supported because current
    HQ makes no use of these URLs.
    """
    # TODO: duplicated code with DivertQueue

    def __init__(self, jobname, bufsize=20):
        # queue files live under <datadir>/<jobname>/ex
        self.qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
        if not os.path.isdir(self.qdir):
            os.makedirs(self.qdir)
        # salvage any queue files left open by a previous run
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='ex')
        self.queuedcount = 0

    def flush(self):
        """Write out buffered URLs and close the current queue file."""
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        self.flush()

    def get_status(self):
        return dict(queued=self.queuedcount)

    def add(self, furi):
        """Append furi to the excluded-URL queue."""
        self.enq.queue(furi)
        self.queuedcount += 1
class CrawlJob(object):
    """Accepts discovered URLs for a crawl job and appends them to the
    job's incoming queue directory."""

    def __init__(self, jobconfig):
        self.jobconfig = jobconfig
        # suffix=os.getpid() keeps queue files from concurrent
        # processes from colliding
        self.enq = FileEnqueue(qdir=hqconfig.inqdir(self.jobconfig.name),
                               suffix=os.getpid(),
                               buffer=1000,
                               executor=None,
                               gzip=9)

    def discovered(self, curis):
        """Queue curis into the incoming queue.

        Returns a report dict with the number of URLs processed.
        """
        self.enq.queue(curis)
        return dict(processed=len(curis))

    def shutdown(self):
        self.flush()

    def flush(self):
        """Write out buffered URLs and close the current queue file."""
        self.enq._flush()
        self.enq.close()
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    """A brand-new in-scope URL is handed to the scheduler."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/1')
    enq.queue([curi])
    enq.close()

    result = dispatcher.processinq(10)

    assert result['processed'] == 1, result
    assert result['scheduled'] == 1, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert len(scheduler.curis) == 1
    assert scheduler.curis[0]['u'] == curi['u']
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    """A URL already recorded as seen is dropped: neither scheduled,
    excluded, nor saved."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi1 = dict(u='http://test.example.com/2')
    # pre-mark the URL as seen before feeding it through the inq
    dispatcher.init_seen()
    dispatcher.seen.already_seen(curi1)
    enq.queue([curi1])
    enq.close()

    result = dispatcher.processinq(10)

    assert result['processed'] == 1, result
    assert result['scheduled'] == 0, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert len(scheduler.curis) == 0, scheduler.curis
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    """When the client is inactive, the URL is diverted (saved) to the
    diverter queue instead of being scheduled."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/')
    scheduler._client_active = False
    enq.queue([curi])
    enq.close()

    result = dispatcher.processinq(10)

    assert result['processed'] == 1, result
    assert result['scheduled'] == 0, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 1, result
    dispatcher.shutdown()

    items = readqueue(dispatcher.diverter.getqueue('0').qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
class DivertQueue(object):
    """Workset compatible class for storing URIs for delivery to
    external services (including other HQ)."""

    def __init__(self, basedir, name, bufsize=500):
        self.name = name
        self.qdir = os.path.join(basedir, str(name))
        # salvage any queue files left open by a previous run
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='d')
        self.queuedcount = 0

    def flush(self):
        """Write out buffered URIs and close the current queue file."""
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        self.flush()

    def get_status(self):
        r = dict(name=self.name, queued=self.queuedcount)
        return r

    def schedule(self, curi):
        """Append curi to the divert queue."""
        self.enq.queue(curi)
        self.queuedcount += 1

    def listqfiles(self):
        """Return absolute paths of completed queue files in qdir.

        Completed files are those whose name starts with a digit and
        does not carry the in-progress '.open' suffix. Returns [] when
        qdir does not exist or cannot be read.
        """
        try:
            fns = os.listdir(self.qdir)
        except OSError:
            # qdir missing/unreadable: report no queue files.
            # (was a bare except, which also hid unrelated bugs)
            return []
        qfiles = []
        for fn in fns:
            if '0' <= fn[0] <= '9' and not fn.endswith('.open'):
                qfiles.append(os.path.abspath(os.path.join(self.qdir, fn)))
        return qfiles
class WorksetWriter(object):
    """writing side of Workset."""

    def __init__(self, wsdir, wsid):
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(self.wsid))
        # salvage any queue files left open by a previous run
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=500)
        self.scheduledcount = 0

    def flush(self):
        """Write out buffered URIs and close the current queue file."""
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        self.flush()

    def get_status(self):
        # writer side always reports running=True
        return dict(id=self.wsid, running=True,
                    scheduled=self.scheduledcount)

    def schedule(self, curi):
        """Append curi to this workset's queue."""
        self.enq.queue(curi)
        self.scheduledcount += 1
class DispatcherTestCase(unittest.TestCase): class TestDomainInfo(object): def __init__(self): self.excluded=0 def get_byurl(self, url): return dict(exclude=self.excluded) class TestMapper(object): nworksets = 10 worksetclient = [0] _workset = 0 def workset(self, furi): return self._workset class TestScheduler(object): def __init__(self): self._client_active = True self.curis = [] def is_active(self, clid): return self._client_active def flush_workset(self, wsid): pass def schedule(self, curi): self.curis.append(curi) def setUp(self): self.testdatadir = TestDatadir() self.domaininfo = self.TestDomainInfo() self.mapper = self.TestMapper() self.scheduler = self.TestScheduler() self.dispatcher = Dispatcher(self.domaininfo, 'wide', self.mapper, self.scheduler) # plain FileEnqueue for passing CURL to Dispatcher self.enq = FileEnqueue(self.testdatadir.inqdir('wide')) def readqueue(self, qdir): deq = FileDequeue(qdir) items = [] while 1: d = deq.get(0.01) if d is None: break items.append(d) return items def testRegular(self): curi = dict(u='http://test.example.com/') self.enq.queue([curi]) self.enq.close() r = self.dispatcher.processinq(10) assert r['processed'] == 1, r assert r['scheduled'] == 1, r assert r['excluded'] == 0, r assert r['saved'] == 0, r assert len(self.scheduler.curis) == 1 assert self.scheduler.curis[0]['u'] == curi['u'] def testSeen(self): curi1 = dict(u='http://test.example.com/') self.dispatcher.init_seen() self.dispatcher.seen.already_seen(curi1) self.enq.queue([curi1]) self.enq.close() #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1) r = self.dispatcher.processinq(10) assert r['processed'] == 1, r assert r['scheduled'] == 0, r assert r['excluded'] == 0, r assert r['saved'] == 0, r assert len(self.scheduler.curis) == 0, self.scheduler.curis def testExcluded(self): curi = dict(u='http://test.example.com/') self.domaininfo.excluded = 1 self.enq.queue([curi]) self.enq.close() r = self.dispatcher.processinq(10) assert r['processed'] == 1, r assert 
r['scheduled'] == 0, r assert r['excluded'] == 1, r assert r['saved'] == 0, r self.dispatcher.shutdown() subprocess.check_call( 'zcat %s/*.gz' % self.dispatcher.excludedlist.qdir, shell=1) items = self.readqueue(self.dispatcher.excludedlist.qdir) assert len(items) == 1, items assert isinstance(items[0], dict), items[0] assert items[0]['u'] == curi['u'] def testOutOfScope(self): curi = dict(u='http://test.example.com/') self.scheduler._client_active = False self.enq.queue([curi]) self.enq.close() r = self.dispatcher.processinq(10) assert r['processed'] == 1, r assert r['scheduled'] == 0, r assert r['excluded'] == 0, r assert r['saved'] == 1, r self.dispatcher.shutdown() items = self.readqueue(self.dispatcher.diverter.getqueue('0').qdir) assert len(items) == 1, items assert isinstance(items[0], dict), items[0] assert items[0]['u'] == curi['u']