def testExcluded(testdatadir, domaininfo, mapper, scheduler): dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler) enq = FileEnqueue(testdatadir.inqdir('wide')) curi = dict(u='http://test.example.com/3') domaininfo.excluded = 1 enq.queue([curi]) enq.close() r = dispatcher.processinq(10) assert r['processed'] == 1, r assert r['scheduled'] == 0, r assert r['excluded'] == 1, r assert r['saved'] == 0, r dispatcher.shutdown() # print exclude qfile content for q in py.path.local(dispatcher.excludedlist.qdir).listdir( fil=lambda p: p.ext == '.gz'): with gzip.open(str(q)) as f: print f.read() items = readqueue(dispatcher.excludedlist.qdir) assert len(items) == 1, items assert isinstance(items[0], dict), items[0] assert items[0]['u'] == curi['u']
def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper,
                              testscheduler):
    """duplicate unseen URLs within one input batch are scheduled only once."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    urls = generate_random_urls(100)
    seenurls, novelurls = urls[:50], urls[50:]
    seenfile = create_seen(dispatcher, seenurls)
    # copies of the first 25 novel URLs, fed again in the same batch
    duplicates = [dict(u) for u in novelurls[:25]]
    batch = urls + duplicates
    inq.add(batch)
    inq.close()
    result = dispatcher.processinq(0)
    assert result['processed'] == len(batch), result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    # in-batch duplicates must not inflate the scheduled count
    assert result['scheduled'] == len(novelurls), result
    check_seenfile(seenfile)
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2)."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper,
                                 testscheduler, inq)
    # TODO: there's another case of getting terminated during
    # phase 1 - actually it's more likely to happen as it takes
    # longer than phase 2. fortunately phase 1 recovery is simpler
    # than phase 2 recovery - just starting over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()
    seenfile = create_seen(dispatcher, [])
    # let TestScheduler exit on 20th (i.e. after scheduling 19) cURLs.
    testscheduler.failat = 20
    # BUG FIX: the old code ran `assert False` *inside* the try block;
    # its AssertionError was swallowed by `except Exception: pass`, so
    # the test could never fail even if processinq did not raise.
    # track the expected exception with a flag and assert afterwards.
    raised = False
    try:
        dispatcher.processinq(0)
    except Exception:
        # expected: simulated termination during phase 2
        raised = True
    assert raised, 'processinq should have raised an exception'
    assert len(testscheduler.curis) == 19
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])
    testscheduler.failat = None
    # enqueue another 50 URLs to verify they are not consumed by
    # next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)
    dispatcher.processinq(0)
    # TODO: want to check all intermediate files are cleaned up?
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])
    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)
    # check: all of urls1 are scheduled, no duplicates
    assert len(testscheduler.curis) == len(urls1)
    # set for O(1) membership tests in the verification loop below
    scheduled_urls = set(u['u'] for u in testscheduler.curis)
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        print >>sys.stderr, "{} {}".format(u['u'], found)
        if not found:
            missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
def testDiscoveredAndProcessinq(testdatadir):
    """end-to-end: POST URLs to /mdiscovered, flush, then run /processinq.

    Idiom fixes: `type(x) == float` checks replaced with `isinstance`
    (same outcome for JSON-decoded numbers, the recommended form), and
    the duplicated flush/sleep pair folded into a loop.
    """
    logger.info('testDiscoveredAndProcessinq')
    # put several CURLs into inq
    data = [dict(u='http://archive.org/%s' % n, p='L',
                 v='http://www.archive.org/', x='a/@href')
            for n in range(10)]
    r = hq.app.request('/wide/mdiscovered', method='POST',
                       data=json.dumps(data))
    assert r.status == '200 OK', r
    result = json.loads(r.data)
    assert result.get('processed') == len(data), r
    # 't' is a timing figure; only its type is checked
    assert isinstance(result.get('t'), float)
    # flush twice, pausing to let in-memory queues reach the disk
    for _ in range(2):
        r = hq.app.request('/wide/flush')
        time.sleep(2.0)
    os.system('/bin/ls -R {}'.format(testdatadir.inqdir('wide')))
    r = hq.app.request('/wide/processinq?' + urlencode(dict(max=100)))
    assert r.status == '200 OK', r
    assert r.headers['content-type'] == 'text/json', r
    result = json.loads(r.data)
    assert result.get('job') == 'wide', result
    assert result.get('max') == 100, result
    assert result.get('processed') == len(data), result
    # there's no active client. test with active client is done
    # in test-dispatcher.py
    assert result.get('scheduled') == 0, result
    assert result.get('saved') == len(data), result
    assert result.get('excluded') == 0, result
    assert isinstance(result.get('td'), float), result
    assert isinstance(result.get('ts'), float), result

    logger.info('testDiscovered')
    data = [dict(u='http://archive.org/d', p='L',
                 v='http://www.archive.org/', x='a/@href')]
    r = hq.app.request('/wide/mdiscovered', method='POST',
                       data=json.dumps(data))
    assert r.status == '200 OK', r
    assert r.headers['content-type'] == 'text/json', r
    data = json.loads(r.data)
    assert data.get('processed') == 1, r
    assert isinstance(data.get('t'), float)
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler): inq = IncomingQueue(testdatadir.inqdir('wide')) dispatcher = MergeDispatcher(testdomaininfo, 'wide', testmapper, testscheduler, inq) urls = generate_random_urls(100) for url in urls: print url['u'] seenurls = urls[:50] novelurls = urls[50:] seenfile = create_seen(dispatcher, seenurls) print "processinq #1" inq.add(urls) inq.close() result = dispatcher.processinq(0) assert result['processed'] == 100, result assert result['excluded'] == 0, result assert result['saved'] == 0, result assert result['scheduled'] == 50, result scheduled = set(url['u'] for url in testscheduler.curis) assert all(url['u'] not in scheduled for url in seenurls) assert all(url['u'] in scheduled for url in novelurls) print "processinq #2" inq.add(urls) inq.close() testscheduler.curis = [] result = dispatcher.processinq(0) assert result['processed'] == 100, result assert result['excluded'] == 0, result assert result['saved'] == 0, result assert result['scheduled'] == 0, result assert len(testscheduler.curis) == 0 check_seenfile(seenfile)
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    """a plain in-scope, unseen URL goes straight to the scheduler."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/1')
    enq.queue([curi])
    enq.close()
    result = dispatcher.processinq(10)
    for key, expected in [('processed', 1), ('scheduled', 1),
                          ('excluded', 0), ('saved', 0)]:
        assert result[key] == expected, result
    # the scheduler received exactly our URL
    assert len(scheduler.curis) == 1
    assert scheduler.curis[0]['u'] == curi['u']
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    """a URL already recorded as seen is dropped: not scheduled, not saved."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    seen_curi = dict(u='http://test.example.com/2')
    # pre-mark the URL as seen before feeding it through the queue
    dispatcher.init_seen()
    dispatcher.seen.already_seen(seen_curi)
    enq.queue([seen_curi])
    enq.close()
    #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)
    result = dispatcher.processinq(10)
    for key, expected in [('processed', 1), ('scheduled', 0),
                          ('excluded', 0), ('saved', 0)]:
        assert result[key] == expected, result
    assert len(scheduler.curis) == 0, scheduler.curis
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    """with no active client, a processed URL is saved to the diverter, not scheduled."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))
    curi = dict(u='http://test.example.com/')
    # simulate "no crawler connected" so the URL cannot be scheduled
    scheduler._client_active = False
    enq.queue([curi])
    enq.close()
    result = dispatcher.processinq(10)
    for key, expected in [('processed', 1), ('scheduled', 0),
                          ('excluded', 0), ('saved', 1)]:
        assert result[key] == expected, result
    dispatcher.shutdown()
    # the URL must have landed in diverter queue '0'
    items = readqueue(dispatcher.diverter.getqueue('0').qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']