Ejemplo n.º 1
0
def testExcluded(testdatadir, domaininfo, mapper, scheduler):
    """Check that an excluded URL is counted as excluded and stored.

    One URL is fed through the 'wide' incoming queue with the domaininfo
    fixture's ``excluded`` attribute set to 1.  processinq() must report it
    as processed and excluded (neither scheduled nor saved), and after
    shutdown the URL must be readable from the dispatcher's excluded list
    queue directory.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    enq = FileEnqueue(testdatadir.inqdir('wide'))

    curi = dict(u='http://test.example.com/3')
    # presumably the fixture reports this flag for every URL lookup --
    # verify against the domaininfo fixture
    domaininfo.excluded = 1

    enq.queue([curi])
    enq.close()

    r = dispatcher.processinq(10)

    assert r['processed'] == 1, r
    assert r['scheduled'] == 0, r
    assert r['excluded'] == 1, r
    assert r['saved'] == 0, r

    dispatcher.shutdown()

    # print exclude qfile content (debugging aid only)
    for q in py.path.local(dispatcher.excludedlist.qdir).listdir(
        fil=lambda p: p.ext == '.gz'):
        with gzip.open(str(q)) as f:
            # single-argument print(...) is valid on both Python 2 and 3;
            # the bare print statement is a SyntaxError on Python 3
            print(f.read())

    items = readqueue(dispatcher.excludedlist.qdir)
    assert len(items) == 1, items
    assert isinstance(items[0], dict), items[0]
    assert items[0]['u'] == curi['u']
Ejemplo n.º 2
0
    def __init__(self, basedir, name, bufsize=500):
        """Prepare the queue directory path and the enqueue writer.

        basedir -- parent directory; the queue lives in basedir/str(name).
        name    -- queue name, stored as self.name.
        bufsize -- handed to FileEnqueue as its ``buffer`` argument.
        """
        self.name = name
        qdir = os.path.join(basedir, str(name))
        self.qdir = qdir
        # recover() is run on the directory before a writer is opened on it
        FileEnqueue.recover(qdir)
        self.enq = FileEnqueue(qdir, buffer=bufsize, suffix='d')

        # number of items queued through this object
        self.queuedcount = 0
Ejemplo n.º 3
0
    def __init__(self, wsdir, wsid):
        """Prepare the workset directory path and its enqueue writer.

        wsdir -- parent directory; the queue lives in wsdir/str(wsid).
        wsid  -- workset id, stored as self.wsid.
        """
        self.wsid = wsid
        qdir = os.path.join(wsdir, str(wsid))
        self.qdir = qdir
        # recover() is run on the directory before a writer is opened on it
        FileEnqueue.recover(qdir)
        self.enq = FileEnqueue(qdir, buffer=500)

        # number of items scheduled through this object
        self.scheduledcount = 0
Ejemplo n.º 4
0
 def __init__(self, jobname, bufsize=20):
     """Create the job's 'ex' queue directory if needed and open a writer.

     jobname -- job name; the queue lives in <datadir>/<jobname>/ex, where
                <datadir> is hqconfig.get('datadir').
     bufsize -- handed to FileEnqueue as its ``buffer`` argument.
     """
     qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
     self.qdir = qdir
     if not os.path.isdir(qdir):
         os.makedirs(qdir)
     # recover() is run on the directory before a writer is opened on it
     FileEnqueue.recover(qdir)
     self.enq = FileEnqueue(qdir, buffer=bufsize, suffix='ex')
     # number of items added through this object
     self.queuedcount = 0
Ejemplo n.º 5
0
class ExcludedList(object):
    """Write-only list of excluded URLs.

    URLs are checked for exclusion before the seen check, so the list
    contains (a lot of) duplicates.  Reading the list back is not supported
    because current HQ makes no use of these URLs.
    """
    # TODO: duplicated code with DivertQueue
    def __init__(self, jobname, bufsize=20):
        """Create the job's 'ex' queue directory if needed and open a writer.

        jobname -- job name; the queue lives in <datadir>/<jobname>/ex, where
                   <datadir> is hqconfig.get('datadir').
        bufsize -- handed to FileEnqueue as its ``buffer`` argument.
        """
        qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
        self.qdir = qdir
        if not os.path.isdir(qdir):
            os.makedirs(qdir)
        # recover() is run on the directory before a writer is opened on it
        FileEnqueue.recover(qdir)
        self.enq = FileEnqueue(qdir, buffer=bufsize, suffix='ex')
        # number of URLs added through this object
        self.queuedcount = 0

    def flush(self):
        """Flush the writer, close it and return whatever close() returns."""
        writer = self.enq
        writer._flush()
        return writer.close()

    def shutdown(self):
        """Flush and close the writer; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict with the number of URLs added so far ('queued')."""
        return {'queued': self.queuedcount}

    def add(self, furi):
        """Queue `furi` on the writer and count it."""
        self.enq.queue(furi)
        self.queuedcount += 1
Ejemplo n.º 6
0
    def __init__(self, qdir, noupdate=False, norecover=False, **kw):
        """Set up the queue directory, counters and underlying queues.

        qdir      -- queue directory; created when it does not exist.
        noupdate  -- accepted but not used inside this constructor.
        norecover -- when true, FileEnqueue.recover() is skipped.
        kw        -- forwarded unchanged to self.init_queues().
        """
        self.qdir = qdir
        # ensure qdir directory exists
        if not os.path.isdir(qdir):
            os.makedirs(qdir)

        self.addedcount = self.processedcount = 0
        self.rqfile = self.qfiles = None

        if not norecover:
            FileEnqueue.recover(qdir)
        self.init_queues(**kw)
Ejemplo n.º 7
0
Archivo: inq.py Proyecto: travisfw/hq
 def __init__(self, jobconfig):
     """Keep the job configuration and open the job's incoming-queue writer.

     The writer targets hqconfig.inqdir(<job name>) and uses this process'
     pid as the file suffix.
     """
     self.jobconfig = jobconfig
     enq_options = dict(qdir=hqconfig.inqdir(jobconfig.name),
                        suffix=os.getpid(),
                        buffer=1000,
                        executor=None,
                        gzip=9)
     self.enq = FileEnqueue(**enq_options)
Ejemplo n.º 8
0
    def __init__(self, wsdir, wsid, writing=False):
        """Set up the workset's queue directory path, queues and counters.

        wsdir   -- parent directory; the queue lives in wsdir/str(wsid).
        wsid    -- workset id, stored as self.wsid.
        writing -- when true, the directory is recovered and a real
                   FileEnqueue is opened; otherwise a DummyFileEnqueue
                   stands in for the writer.
        """
        self.wsid = wsid
        qdir = os.path.join(wsdir, str(wsid))
        self.qdir = qdir

        if not writing:
            self.enq = DummyFileEnqueue(qdir)
        else:
            FileEnqueue.recover(qdir)
            self.enq = FileEnqueue(qdir, buffer=200)
        # a reader is opened in both modes
        self.deq = FileDequeue(qdir)

        self.running = True

        # all counters start at zero
        self.scheduledcount = self.checkedoutcount = 0
        self.finishedcount = self.activecount = 0
Ejemplo n.º 9
0
    def setUp(self):
        """Build fresh collaborators, a Dispatcher and an input queue writer."""
        self.testdatadir = TestDatadir()
        self.domaininfo = self.TestDomainInfo()
        self.mapper = self.TestMapper()
        self.scheduler = self.TestScheduler()
        self.dispatcher = Dispatcher(
            self.domaininfo, 'wide', self.mapper, self.scheduler)

        # plain FileEnqueue for passing CURL to Dispatcher
        inqdir = self.testdatadir.inqdir('wide')
        self.enq = FileEnqueue(inqdir)
Ejemplo n.º 10
0
def testRegular(testdatadir, domaininfo, mapper, scheduler):
    """Check that an ordinary URL is processed and handed to the scheduler."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)

    inq_writer = FileEnqueue(testdatadir.inqdir('wide'))

    curi = {'u': 'http://test.example.com/1'}
    inq_writer.queue([curi])
    inq_writer.close()

    result = dispatcher.processinq(10)

    # counters are checked in the same order as reported failures matter
    expected_counts = (('processed', 1), ('scheduled', 1),
                       ('excluded', 0), ('saved', 0))
    for key, count in expected_counts:
        assert result[key] == count, result

    scheduled = scheduler.curis
    assert len(scheduled) == 1
    assert scheduled[0]['u'] == curi['u']
Ejemplo n.º 11
0
def testSeen(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URL already passed to the seen check is not scheduled."""
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    inq_writer = FileEnqueue(testdatadir.inqdir('wide'))

    curi1 = {'u': 'http://test.example.com/2'}
    # pass the URL to the seen check once before it arrives via the queue
    dispatcher.init_seen()
    dispatcher.seen.already_seen(curi1)

    inq_writer.queue([curi1])
    inq_writer.close()

    #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

    result = dispatcher.processinq(10)

    expected_counts = (('processed', 1), ('scheduled', 0),
                       ('excluded', 0), ('saved', 0))
    for key, count in expected_counts:
        assert result[key] == count, result

    assert len(scheduler.curis) == 0, scheduler.curis
Ejemplo n.º 12
0
def testOutOfScope(testdatadir, domaininfo, mapper, scheduler):
    """Check that a URL is saved to divert queue '0' instead of scheduled.

    The scheduler fixture's ``_client_active`` attribute is switched off
    before processing; presumably this makes the client look inactive --
    verify against the scheduler fixture.
    """
    dispatcher = LevelDispatcher(domaininfo, 'wide', mapper, scheduler)
    inq_writer = FileEnqueue(testdatadir.inqdir('wide'))

    curi = {'u': 'http://test.example.com/'}
    scheduler._client_active = False

    inq_writer.queue([curi])
    inq_writer.close()

    result = dispatcher.processinq(10)

    expected_counts = (('processed', 1), ('scheduled', 0),
                       ('excluded', 0), ('saved', 1))
    for key, count in expected_counts:
        assert result[key] == count, result

    dispatcher.shutdown()

    # the saved URL must be readable from divert queue '0' after shutdown
    divert_dir = dispatcher.diverter.getqueue('0').qdir
    items = readqueue(divert_dir)
    assert len(items) == 1, items
    first = items[0]
    assert isinstance(first, dict), first
    assert first['u'] == curi['u']
Ejemplo n.º 13
0
class DivertQueue(object):
    """Workset compatible class for storing URIs for delivery to external
    services (including other HQ)."""
    def __init__(self, basedir, name, bufsize=500):
        """Prepare the queue directory path and the enqueue writer.

        basedir -- parent directory; the queue lives in basedir/str(name).
        name    -- queue name, stored as self.name.
        bufsize -- handed to FileEnqueue as its ``buffer`` argument.
        """
        self.name = name
        self.qdir = os.path.join(basedir, str(name))
        # recover() is run on the directory before a writer is opened on it
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='d')

        # number of URIs queued through this object
        self.queuedcount = 0

    def flush(self):
        """Flush the writer, close it and return whatever close() returns."""
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        """Flush and close the writer; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict with the queue name and the number of queued URIs."""
        r = dict(name=self.name, queued=self.queuedcount)
        return r

    def schedule(self, curi):
        """Queue `curi` on the writer and count it."""
        self.enq.queue(curi)
        self.queuedcount += 1

    def listqfiles(self):
        """Return absolute paths of the queue files found in self.qdir.

        A file is included when its name starts with an ASCII digit and does
        not end with '.open' (presumably files still being written -- confirm
        against FileEnqueue).  An empty list is returned when the directory
        cannot be listed (missing, unreadable, ...).
        """
        try:
            fns = os.listdir(self.qdir)
        except OSError:
            # Only filesystem failures are treated as "no files"; anything
            # else (e.g. a bad qdir type) is a bug and must surface.
            return []
        return [os.path.abspath(os.path.join(self.qdir, fn))
                for fn in fns
                if '0' <= fn[0] <= '9' and not fn.endswith('.open')]
Ejemplo n.º 14
0
class WorksetWriter(object):
    """Writing side of Workset."""
    def __init__(self, wsdir, wsid):
        """Prepare the workset directory path and its enqueue writer.

        wsdir -- parent directory; the queue lives in wsdir/str(wsid).
        wsid  -- workset id, stored as self.wsid.
        """
        self.wsid = wsid
        qdir = os.path.join(wsdir, str(wsid))
        self.qdir = qdir
        # recover() is run on the directory before a writer is opened on it
        FileEnqueue.recover(qdir)
        self.enq = FileEnqueue(qdir, buffer=500)

        # number of URIs scheduled through this object
        self.scheduledcount = 0

    def flush(self):
        """Flush the writer, close it and return whatever close() returns."""
        writer = self.enq
        writer._flush()
        return writer.close()

    def shutdown(self):
        """Flush and close the writer; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict with the workset id, running=True and the count of
        scheduled URIs."""
        return {'id': self.wsid,
                'running': True,
                'scheduled': self.scheduledcount}

    def schedule(self, curi):
        """Queue `curi` on the writer and count it."""
        self.enq.queue(curi)
        self.scheduledcount += 1
Ejemplo n.º 15
0
Archivo: inq.py Proyecto: travisfw/hq
class CrawlJob(object):
    """Receives discovered URIs for one job and writes them to its
    incoming queue."""
    def __init__(self, jobconfig):
        """Keep the job configuration and open the incoming-queue writer.

        The writer targets hqconfig.inqdir(<job name>) and uses this
        process' pid as the file suffix.
        """
        self.jobconfig = jobconfig
        enq_options = dict(qdir=hqconfig.inqdir(jobconfig.name),
                           suffix=os.getpid(),
                           buffer=1000,
                           executor=None,
                           gzip=9)
        self.enq = FileEnqueue(**enq_options)

    def discovered(self, curis):
        """Queue `curis` and return a dict with their count ('processed')."""
        self.enq.queue(curis)
        return {'processed': len(curis)}

    def shutdown(self):
        """Flush and close the writer."""
        self.flush()

    def flush(self):
        """Flush the writer and close it; nothing is returned."""
        writer = self.enq
        writer._flush()
        writer.close()
Ejemplo n.º 16
0
class DispatcherTestCase(unittest.TestCase):
    """Tests for Dispatcher.processinq() using small stand-in collaborators."""

    class TestDomainInfo(object):
        """Domain info stand-in; tests set ``excluded`` directly."""
        def __init__(self):
            self.excluded = 0

        def get_byurl(self, url):
            # the same flag is reported for every URL
            return {'exclude': self.excluded}

    class TestMapper(object):
        """Mapper stand-in that maps every URI to the same workset."""
        nworksets = 10
        worksetclient = [0]
        _workset = 0

        def workset(self, furi):
            return self._workset

    class TestScheduler(object):
        """Scheduler stand-in that records every scheduled CURI."""
        def __init__(self):
            self._client_active = True
            self.curis = []

        def is_active(self, clid):
            return self._client_active

        def flush_workset(self, wsid):
            pass

        def schedule(self, curi):
            self.curis.append(curi)

    def setUp(self):
        """Build fresh collaborators, a Dispatcher and an input queue writer."""
        self.testdatadir = TestDatadir()
        self.domaininfo = self.TestDomainInfo()
        self.mapper = self.TestMapper()
        self.scheduler = self.TestScheduler()
        self.dispatcher = Dispatcher(
            self.domaininfo, 'wide', self.mapper, self.scheduler)

        # plain FileEnqueue for passing CURL to Dispatcher
        inqdir = self.testdatadir.inqdir('wide')
        self.enq = FileEnqueue(inqdir)

    def readqueue(self, qdir):
        """Return every item readable from the queue directory `qdir`.

        Items are fetched with get(0.01) until it returns None.
        """
        reader = FileDequeue(qdir)
        collected = []
        item = reader.get(0.01)
        while item is not None:
            collected.append(item)
            item = reader.get(0.01)
        return collected

    def _feed(self, curi):
        """Queue one CURI, close the writer and return processinq(10)."""
        self.enq.queue([curi])
        self.enq.close()
        return self.dispatcher.processinq(10)

    def _check_counts(self, r, processed, scheduled, excluded, saved):
        """Assert the four counters of a processinq() result, in order."""
        expected_counts = (('processed', processed), ('scheduled', scheduled),
                           ('excluded', excluded), ('saved', saved))
        for key, count in expected_counts:
            assert r[key] == count, r

    def _check_single_item(self, items, curi):
        """Assert that `items` holds exactly one dict carrying curi's URL."""
        assert len(items) == 1, items
        first = items[0]
        assert isinstance(first, dict), first
        assert first['u'] == curi['u']

    def testRegular(self):
        """An ordinary URL is processed and handed to the scheduler."""
        curi = {'u': 'http://test.example.com/'}
        r = self._feed(curi)

        self._check_counts(r, 1, 1, 0, 0)

        scheduled = self.scheduler.curis
        assert len(scheduled) == 1
        assert scheduled[0]['u'] == curi['u']

    def testSeen(self):
        """A URL already passed to the seen check is not scheduled."""
        curi1 = {'u': 'http://test.example.com/'}
        self.dispatcher.init_seen()
        self.dispatcher.seen.already_seen(curi1)

        #subprocess.call('zcat /tmp/hq/wide/inq/*.gz', shell=1)

        r = self._feed(curi1)

        self._check_counts(r, 1, 0, 0, 0)

        assert len(self.scheduler.curis) == 0, self.scheduler.curis

    def testExcluded(self):
        """An excluded URL is counted as excluded and stored in the
        excluded list."""
        curi = {'u': 'http://test.example.com/'}
        self.domaininfo.excluded = 1

        r = self._feed(curi)

        self._check_counts(r, 1, 0, 1, 0)

        self.dispatcher.shutdown()

        # dump the excluded-list files; check_call also fails the test when
        # the command exits with a non-zero status
        excluded_dir = self.dispatcher.excludedlist.qdir
        subprocess.check_call('zcat %s/*.gz' % excluded_dir, shell=1)

        items = self.readqueue(excluded_dir)
        self._check_single_item(items, curi)

    def testOutOfScope(self):
        """With the client inactive the URL is saved to divert queue '0'."""
        curi = {'u': 'http://test.example.com/'}
        self.scheduler._client_active = False

        r = self._feed(curi)

        self._check_counts(r, 1, 0, 0, 1)

        self.dispatcher.shutdown()

        divert_dir = self.dispatcher.diverter.getqueue('0').qdir
        items = self.readqueue(divert_dir)
        self._check_single_item(items, curi)