Code Example #1
def testSameUnseenURLsInInput(testdatadir, testdomaininfo, testmapper,
                              testscheduler):
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)

    urls = generate_random_urls(100)
    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    # duplicates of as-yet-unseen URLs added to the same input batch; they
    # count toward 'processed' but must be scheduled only once
    dupseenurls = [dict(url) for url in novelurls[:25]]

    input = urls + dupseenurls
    inq.add(input)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == len(input), result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == len(novelurls), result

    check_seenfile(seenfile)
Code Example #2
def testBasic(testdatadir, testdomaininfo, testmapper, testscheduler):
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)

    urls = generate_random_urls(100)
    for url in urls:
        print url['u']

    seenurls = urls[:50]
    novelurls = urls[50:]
    seenfile = create_seen(dispatcher, seenurls)

    print "processinq #1"

    inq.add(urls)
    inq.close()

    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 50, result

    scheduled = set(url['u'] for url in testscheduler.curis)
    assert all(url['u'] not in scheduled for url in seenurls)
    assert all(url['u'] in scheduled for url in novelurls)

    print "processinq #2"

    inq.add(urls)
    inq.close()

    testscheduler.curis = []
    result = dispatcher.processinq(0)

    assert result['processed'] == 100, result
    assert result['excluded'] == 0, result
    assert result['saved'] == 0, result
    assert result['scheduled'] == 0, result

    assert len(testscheduler.curis) == 0

    check_seenfile(seenfile)
Code Example #3
def testRecovery(testdatadir, testdomaininfo, testmapper, testscheduler):
    """tests recovery run after processinq is terminated during
    scheduling (phase 2)."""
    inq = IncomingQueue(testdatadir.inqdir('wide'))
    dispatcher = MergeDispatcher(testdomaininfo, 'wide',
                                 testmapper, testscheduler, inq)
    # TODO: there's another case: getting terminated during phase 1.
    # That is actually more likely, as phase 1 takes longer than
    # phase 2. Fortunately phase 1 recovery is simpler than phase 2
    # recovery - just start over.
    urls1 = generate_random_urls(50)
    inq.add(urls1)
    inq.close()

    seenfile = create_seen(dispatcher, [])

    # make TestScheduler fail on the 20th CURI (i.e. after scheduling 19).
    testscheduler.failat = 20
    try:
        dispatcher.processinq(0)
        assert False, 'processinq should have raised an exception'
    except Exception:
        # expected - TestScheduler fails at the 20th CURI
        pass

    assert len(testscheduler.curis) == 19

    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    testscheduler.failat = None
    # enqueue another 50 URLs to verify they are not consumed by the
    # next processinq run.
    urls2 = generate_random_urls(50)
    inq.add(urls2)

    dispatcher.processinq(0)

    # TODO: want to check all intermediate files are cleaned up?
    #subprocess.call(['ls', '-l', os.path.dirname(seenfile)])

    n = check_seenfile(seenfile)
    # check: all of urls1 are now seen, none from urls2
    assert n == len(urls1)
    # check: all of urls1 are scheduled, no duplicates
    assert len(testscheduler.curis) == len(urls1)
    scheduled_urls = [u['u'] for u in testscheduler.curis]
    missing = []
    for u in urls1:
        found = (u['u'] in scheduled_urls)
        print >>sys.stderr, "{} {}".format(u['u'], found)
        if not found: missing.append(u)
    assert len(missing) == 0, "missing {} URLs {}".format(
        len(missing), missing)
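
The three tests above rely on helpers (generate_random_urls, create_seen, check_seenfile) that this page does not show; their real implementations live in the project's test suite. As a purely illustrative assumption, generate_random_urls appears to return a list of dicts whose 'u' key holds the URL string, roughly like this hypothetical stand-in:

import random
import string

def generate_random_urls(n):
    # hypothetical stand-in, NOT the project's actual helper: the tests above
    # only rely on each element being a dict with the URL under the 'u' key.
    urls = []
    for _ in range(n):
        host = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
        path = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
        urls.append({'u': 'http://%s.example.com/%s' % (host, path)})
    return urls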
Code Example #4
File: hq.py  Project: kngenie/crawlhq
    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0
Code Example #5
File: test-fileinq.py  Project: kngenie/crawlhq
                self.seq += 1
            self.count += 1
            return dict(d=d)

class Emitter(Thread):
    def __init__(self, buckets, inq):
        Thread.__init__(self)
        self.buckets = buckets
        self.inq = inq
        self.count = 0
    def run(self):
        for bucket in self.buckets:
            self.inq.add(bucket['d'])
            self.count += 1

inq = IncomingQueue('wide', QUEUE_DIRECTORY)
#bucketreader = BucketReader()
bucketreader = SequenceBucketReader(100, 100)
emitters = [Emitter(bucketreader, inq) for i in range(3)]
for e in emitters:
    e.start()
    
# wait for each emitter to finish, reporting progress about once a second
for e in emitters:
    while 1:
        e.join(1.0)
        if not e.is_alive(): break
        sys.stderr.write('\r%d %s' % (bucketreader.count, inq.get_status()))
sys.stderr.write('\n')
inq.close()

# reading queue out
Code Example #6
File: hq.py  Project: kngenie/crawlhq
class CrawlJob(object):

    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        readsorted = hqconfig.getint('inq.sort', 1)

        inqdir = hqconfig.inqdir(self.jobname)

        def enqfactory(qdir, **kwargs):
            return PriorityEnqueue(qdir, **kwargs)
        def deqfactory(qdir, **kwargs):
            if readsorted:
                kwargs.update(reader=FPSortingQueueFileReader)
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                                 buffsize=1000)

        # self.eninq = PriorityEnqueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffer=1000)

        # deinqargs = {}
        # if readsorted:
        #     deinqargs['reader'] = FPSortingQueueFileReader
        # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
        #                              **deinqargs)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool),
              ('dispatcher_mode', str)]

    @property
    def dispatcher_mode(self):
        return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()

    def init_dispatcher(self):
        if self.dispatcher: return self.dispatcher
        if self.dispatcher_mode == 'external':
            raise RuntimeError('dispatcher mode is %s' % self.dispatcher_mode)
        self.dispatcher = LevelDispatcher(self.hq.get_domaininfo(),
                                          self.jobname,
                                          mapper=self.mapper,
                                          scheduler=self.scheduler,
                                          inq=self.inq.deq)
        return self.dispatcher

    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher().workset_activating(*args)

    def schedule(self, curis):
        '''Schedule curis, bypassing the seen-check. Typically used for
           starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        return self.init_dispatcher().processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure; remove once everything has been fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 1.0:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.addedcount > self.last_inq_count + 1000:
                self.inq.close()
                self.last_inq_count = self.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.close()
        return self.scheduler.flush_clients()

    def count_seen(self):
        """return number of items in seen db.
        can take pretty long to return.
        """
        return self.init_dispatcher().count_seen()

    def clear_seen(self):
        self.init_dispatcher().clear_seen()
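
Code Example #6 shows the complete CrawlJob API. The sketch below is only an assumed driver loop, not code from the project: it presumes `job` is an already-constructed CrawlJob and `new_curis` is a list of CURI dicts with at least a 'u' key; the client name 'crawler-01' and the batch size 50 are made-up values for illustration.

job.discovered(new_curis)           # enqueue newly discovered URLs
job.processinq(0)                   # seen-check and schedule the incoming queue
curis = job.feed('crawler-01', 50)  # hand up to 50 scheduled CURIs to a client
# ... the client fetches the CURIs ...
job.finished(curis)                 # report fetched CURIs back
job.flush()                         # flush client queues
job.shutdown()                      # close queues and shut the job down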