Example #1
0
def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URL is reported as excluded and lands in the excluded list.

    The test sets ``domaininfo.excluded = 1``, enqueues a single URL and
    runs ``processinq``.  It expects the URL to be counted as processed
    and excluded (neither scheduled nor saved), and, after the dispatcher
    is shut down, to be the only item readable from
    ``dispatcher.excludedlist.qdir``.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/3')
    domaininfo.excluded = 1

    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 1, r
    assert r['saved'] == 0, r

    # shut down before reading the excluded queue back from disk
    dispatcher.shutdown()

    # debugging aid: dump the content of the excluded queue files
    qfiles = py.path.local(dispatcher.excludedlist.qdir).listdir(
        fil=lambda p: p.ext == '.gz')
    for q in qfiles:
        with gzip.open(str(q)) as f:
            # single-argument call form: same output as the Python 2
            # print statement, and also valid Python 3 syntax.
            print(f.read())

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
Example #2
0
def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper,
                              testscheduler):
    """Feed input in which some not-yet-seen URLs occur twice.

    Of 100 generated URLs, the first 50 are handed to ``create_seen``;
    copies of 25 of the remaining URLs are appended to the input.  The
    test expects every input item to be counted as processed, while the
    scheduled count equals the number of distinct unseen URLs.
    """
    incoming = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, incoming)

    all_urls = generate_random_urls(100)
    already_seen, unseen = all_urls[:50], all_urls[50:]
    seenfile = create_seen(dispatcher, already_seen)

    # independent dict copies of URLs that also appear in all_urls
    repeats = [dict(item) for item in unseen[:25]]

    submitted = all_urls + repeats
    incoming.add(submitted)
    incoming.close()

    outcome = dispatcher.processinq(0)

    expectations = (
        ('processed', len(submitted)),
        ('excluded', 0),
        ('saved', 0),
        ('scheduled', len(unseen)),
    )
    for counter, expected in expectations:
        assert outcome[counter] == expected, outcome

    check_seenfile(seenfile)
Example #3
0
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2).

    The first ``processinq`` run is made to fail by setting
    ``testscheduler.failat``; the second run is expected to finish the
    interrupted batch (urls1) without duplicates and without consuming
    the URLs added afterwards (urls2).
    """
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)
    # TODO: there's another case of getting terminated during
    # phase 1 - actually it's more likely to happen as it takes
    # longer than phase 2. fortunately phase 1 recovery is simpler
    # than phase 2 recovery - just starting over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()

    seenfile = create_seen(dispatcher, [])

    # let TestScheduler exit on 20th (i.e. after scheduling 19) cURLs.
    testscheduler.failat = 20
    try:
        dispatcher.processinq(0)
    except Exception:
        # expected: the run is aborted by the scheduler
        pass
    else:
        # Raised outside the try body on purpose: AssertionError is an
        # Exception subclass, so raising it inside the try would be
        # swallowed by the handler above and the check would never fail.
        raise AssertionError('should raise RuntimeException')

    assert len(testscheduler.curis) == 19

    testscheduler.failat = None
    # enqueue another 50 URLs to verify they are not consumed by
    # next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)

    dispatcher.processinq(0)

    # TODO: want to check all intermediate files are cleaned up?

    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)
    # check: all of urls1 are scheduled, no duplicates
    assert len(testscheduler.curis) == len(urls1)
    # a set gives constant-time membership tests in the loop below
    scheduled_urls = set(u['u'] for u in testscheduler.curis)
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        # per-URL trace on stderr; same output as `print >>sys.stderr`
        sys.stderr.write("{} {}\n".format(u['u'], found))
        if not found:
            missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
Example #4
0
def testDiscoveredAndProcessinq(testdatadir):
    """Exercise the 'wide' job's mdiscovered, flush and processinq endpoints.

    Posts ten URLs to ``/wide/mdiscovered``, flushes twice, then calls
    ``/wide/processinq`` and checks the reported counters.  Finally posts
    one more URL to ``/wide/mdiscovered`` and checks that response too.
    """
    logger.info('testDiscoveredAndProcessinq')
    # put several CURLs into inq
    data = [dict(u='http://archive.org/%s' % n,
                 p='L',
                 v='http://www.archive.org/',
                 x='a/@href') for n in range(10)]
    r = hq.app.request('/wide/mdiscovered', method='POST',
                       data=json.dumps(data))
    assert r.status == '200 OK', r
    result = json.loads(r.data)
    assert result.get('processed') == len(data), r
    assert isinstance(result.get('t'), float)

    # flush twice with a pause after each
    # NOTE(review): presumably the pauses let the flushed data reach the
    # inq directory before processinq runs - confirm.
    r = hq.app.request('/wide/flush')
    time.sleep(2.0)
    r = hq.app.request('/wide/flush')
    time.sleep(2.0)
    # debugging aid: list the incoming queue directory
    os.system('/bin/ls -R {}'.format(testdatadir.inqdir('wide')))

    r = hq.app.request('/wide/processinq?' + urlencode(dict(max=100)))
    assert r.status == '200 OK', r
    assert r.headers['content-type'] == 'text/json', r
    result = json.loads(r.data)
    assert result.get('job') == 'wide', result
    assert result.get('max') == 100, result
    assert result.get('processed') == len(data), result
    # there's no active client. test with active client is done
    # in test-dispatcher.py
    assert result.get('scheduled') == 0, result
    assert result.get('saved') == len(data), result
    assert result.get('excluded') == 0, result
    assert isinstance(result.get('td'), float), result
    assert isinstance(result.get('ts'), float), result

    logger.info('testDiscovered')
    data = [dict(u='http://archive.org/d',
                 p='L',
                 v='http://www.archive.org/',
                 x='a/@href')]
    r = hq.app.request('/wide/mdiscovered', method='POST',
                       data=json.dumps(data))
    assert r.status == '200 OK', r
    assert r.headers['content-type'] == 'text/json', r
    # keep the request payload and the decoded response under separate names
    result = json.loads(r.data)
    assert result.get('processed') == 1, r
    assert isinstance(result.get('t'), float)
Example #5
0
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler):
    """Run processinq twice over the same 100 URLs and check the counters.

    The first 50 URLs are handed to ``create_seen`` beforehand.  The
    first run is expected to schedule only the other 50; the second run,
    over the same input, is expected to schedule nothing.
    """
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)

    urls = generate_random_urls(100)
    # Single-argument print calls produce the same output as the
    # Python 2 print statement and are also valid Python 3 syntax.
    for url in urls:
        print(url['u'])

    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    print("processinq #1")

    inq.add(urls)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 50, result

    # only the URLs not passed to create_seen should have been scheduled
    scheduled = set(url['u'] for url in testscheduler.curis)
    assert all(url['u'] not in scheduled for url in seenurls)
    assert all(url['u'] in scheduled for url in novelurls)

    print("processinq #2")

    inq.add(urls)
    inq.close()

    # reset the scheduler's record so the second run is checked on its own
    testscheduler.curis = []
    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 0, result

    assert len(testscheduler.curis) == 0

    check_seenfile(seenfile)
Example #6
0
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    """Enqueue one URL and expect processinq to schedule it.

    The URL must be counted as processed and scheduled (not excluded,
    not saved) and must be the only entry in ``scheduler.curis``.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    writer = FileEnqueue(testdatadir.inqdir('wide'))

    target = {'u': 'http://test.example.com/1'}
    writer.queue([target])
    writer.close()

    outcome = dispatcher.processinq(10)

    expectations = (
        ('processed', 1),
        ('scheduled', 1),
        ('excluded', 0),
        ('saved', 0),
    )
    for counter, expected in expectations:
        assert outcome[counter] == expected, outcome

    handed_over = scheduler.curis
    assert len(handed_over) == 1
    assert handed_over[0]['u'] == target['u']
Example #7
0
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    """Enqueue a URL already registered with ``dispatcher.seen``.

    The URL is passed to ``dispatcher.seen.already_seen`` before being
    queued; processinq is then expected to count it as processed only,
    leaving ``scheduler.curis`` empty.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    writer = FileEnqueue(testdatadir.inqdir('wide'))

    known = {'u': 'http://test.example.com/2'}
    dispatcher.init_seen()
    dispatcher.seen.already_seen(known)

    writer.queue([known])
    writer.close()

    # debugging aid (disabled):
    #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

    outcome = dispatcher.processinq(10)

    expectations = (
        ('processed', 1),
        ('scheduled', 0),
        ('excluded', 0),
        ('saved', 0),
    )
    for counter, expected in expectations:
        assert outcome[counter] == expected, outcome

    assert len(scheduler.curis) == 0, scheduler.curis
Example #8
0
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    """Process a URL while ``scheduler._client_active`` is False.

    The URL is expected to be counted as processed and saved (neither
    scheduled nor excluded) and, after shutdown, to be the single item
    readable from the diverter's queue ``'0'``.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    writer = FileEnqueue(testdatadir.inqdir('wide'))

    target = {'u': 'http://test.example.com/'}
    scheduler._client_active = False

    writer.queue([target])
    writer.close()

    outcome = dispatcher.processinq(10)

    expectations = (
        ('processed', 1),
        ('scheduled', 0),
        ('excluded', 0),
        ('saved', 1),
    )
    for counter, expected in expectations:
        assert outcome[counter] == expected, outcome

    # shut down before reading the diverted queue back from disk
    dispatcher.shutdown()

    diverted = readqueue(dispatcher.diverter.getqueue('0').qdir)
    assert len(diverted) == 1, diverted
    first = diverted[0]
    assert isinstance(first, dict), first
    assert first['u'] == target['u']