Esempio n. 1
0
class BatchSeenWriter(object):
    """Feeds crawl-log entries into the seen database in sorted batches.

    Entries handed to put() are buffered and passed to
    Seen.already_seen() in ascending order of their 'id' key, either when
    options.batchsize entries have accumulated or when flush() is called.
    """

    def __init__(self, dbdir):
        """Open the seen database under dbdir.

        options.cachesize is multiplied by 1024*1024 before being passed
        to Seen (presumably a cache size given in MiB -- TODO confirm).
        """
        self.seen = Seen(dbdir, options.cachesize*(1024*1024))
        self.buffer = []

    def put(self, curi):
        """Buffer curi and flush when the batch is full.

        curi is modified in place: its 'id' key is set to
        urihash.urikey(curi['u']).
        """
        curi['id'] = urihash.urikey(curi['u'])
        self.buffer.append(curi)
        if len(self.buffer) >= options.batchsize:
            self.flush()

    def flush(self):
        """Pass every buffered entry to the seen db in 'id' order, then
        start a new, empty buffer."""
        # sort on the id itself; a cmp-style comparison function is slower
        # and is not accepted by list.sort() on Python 3.
        self.buffer.sort(key=lambda u: u['id'])
        for u in self.buffer:
            self.seen.already_seen(u)
        self.buffer = []

    def processfile(self, fn):
        """Read crawl log fn and put() every entry parsed from it.

        fn is opened through GzipFile when its name ends with '.gz'.
        Lines for which parse_crawllog() raises are reported on stderr and
        skipped; lines for which it returns None are skipped silently.
        The file name and a running entry count are written to stderr.
        Remaining buffered entries are flushed once the file is exhausted.
        """
        print >>sys.stderr, fn
        count = 0
        f = GzipFile(fn) if fn.endswith('.gz') else open(fn)
        try:
            for l in f:
                try:
                    o = parse_crawllog(l)
                except Exception as ex:
                    print >>sys.stderr, "skipped %s" % str(ex)
                    continue
                if o is None:
                    continue
                self.put(o)
                count += 1
                print >>sys.stderr, "\r%d" % count,
        finally:
            # close the log even if put()/flush() fails part-way through
            f.close()
        self.flush()
        sys.stderr.write("\n")
Esempio n. 2
0
 def init_seen(self):
     """Open the seen database on first use.

     Does nothing when self.seen is already set (truthy). Otherwise the
     'seencache' setting is read from hqconfig; a truthy value is
     converted to int and multiplied by 1024**2 before being passed to
     Seen as block_cache_size. If reading or converting the setting
     fails, block_cache_size=None is passed instead.
     """
     if self.seen:
         return
     try:
         cachesize = hqconfig.get('seencache')
         if cachesize: cachesize = int(cachesize)*(1024**2)
     except Exception:
         # best effort: a missing or malformed setting leaves the choice
         # of cache size to Seen. Unlike a bare except, this lets
         # KeyboardInterrupt and SystemExit propagate.
         cachesize = None
     self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                      block_cache_size=cachesize)
Esempio n. 3
0
File: hq.py Progetto: travisfw/hq
    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        returns a dict of counters: processed, scheduled, excluded,
        saved (URIs diverted because their workset is not active),
        td (seconds spent waiting on the incoming queue) and
        ts (seconds spent on seen check and scheduling).'''

        # lazy initialization of seen db
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except Exception:
                # best effort: fall back to no explicit cache size, but do
                # not swallow KeyboardInterrupt/SystemExit like a bare
                # except would.
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for _ in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            # None from the queue ends this round
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                # schedule only if the 'e' value of the seen record lies
                # in the past (presumably an expiry time -- TODO confirm)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        # collect the optional p/v/x attributes
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result
Esempio n. 4
0
    def __init__(self, jobconfigs, jobname, domaininfo):
        """Set up the incoming queue, workset mapper, seen database and
        scheduler for job jobname. jobconfigs and domaininfo are stored
        as given."""
        self.jobconfigs = jobconfigs
        self.jobname = jobname

        # incoming queue lives in the job's inq directory
        inq_dir = hqconfig.inqdir(jobname)
        self.qdir = inq_dir
        self.inq = FileDequeue(inq_dir, reader=FPSortingQueueFileReader)

        workset_mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.mapper = workset_mapper
        self.seen = Seen(dbdir=hqconfig.seendir(jobname))
        self.domaininfo = domaininfo

        # the scheduler shares the mapper created above
        self.scheduler = Scheduler(jobname, workset_mapper)
Esempio n. 5
0
File: hq.py Progetto: travisfw/hq
class CrawlJob(object):
    """A crawl job: ties together the incoming queue, the seen database,
    the scheduler and the diverter for one job name.

    URIs reported through discovered() are added to the incoming queue;
    processinq() takes them from the queue, checks them against the seen
    database and either schedules them or diverts them when their workset
    is not active.
    """
    NWORKSETS_BITS = 8

    def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
        """Create the job's mapper, scheduler, incoming queue and
        diverter. The seen database is not opened here (see init_seen).
        """
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
        # one flag per workset: 1 while active, 0 otherwise
        self.workset_state = [0] * self.mapper.nworksets

        # seen-db initialization is delayed until it's actually needed
        self.seen = None
        #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
        self.crawlinfodb = crawlinfo
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)
        # self.inq = HashSplitIncomingQueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffsize=500)
        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self.diverter = Diverter(self.jobname, self.mapper)

        #self.discovered_executor = ThreadPoolExecutor(poolsize=1)

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

    # (attribute name, type) pairs
    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool)]

    def shutdown(self):
        """Shut down scheduler and diverter, close the seen db if it was
        opened, flush and close the incoming queue and shut down the
        crawlinfo db."""
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        logging.info("shutting down crawlinfo")
        self.crawlinfodb.shutdown()
        logging.info("done.")
        #self.discovered_executor.shutdown()

    def get_status(self):
        """Return a dict with the job name, id(self) and the status of
        the seen db, scheduler and incoming queue. A component that is
        falsy (e.g. the seen db before it is opened) is reported as is."""
        r = dict(job=self.jobname, oid=id(self))
        r['seen'] = self.seen and self.seen.get_status()
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        """Return a dict with the job name, id(self) and, when a
        scheduler is present, its id and workset status."""
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    #def discovered_async(self, curis):
    #    return self.inq.add(curis)

    def get_domaininfo(self, url):
        """Look up domaininfo for the host part of url.

        The host is the netloc of url with everything from the first ':'
        on removed. Returns whatever self.domaininfo.get() returns.
        """
        # NOTE(review): a netloc with userinfo ("user:pw@host") or an IPv6
        # literal is cut at its first ':' as well -- confirm such URLs
        # never reach this method.
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        """Add curis to the incoming queue; returns the queue's result."""
        return self.inq.add(curis)

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; start sending CURIs to Scheduler
        and enqueue diverted CURIs back into incoming queue so that
        processinq will process them (again). called by Scheduler,
        through CrawlMapper, when client starts feeding.
        note, unlike workset_deactivating, this method shall not be
        called from inside processinq method below, because processinq
        executes it only when at least one CURI is available for processing.
        if inq is empty, CURIs in divert queues would never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.rqfile.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; start sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def init_seen(self):
        """Open the seen database on first use.

        Does nothing when self.seen is already set (truthy). A truthy
        'seencache' setting is converted to int and multiplied by 1024**2
        before being passed to Seen as block_cache_size; if reading or
        converting it fails, None is passed instead.
        """
        if self.seen:
            return
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            # best effort, but do not swallow KeyboardInterrupt/SystemExit
            # the way a bare except would.
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        returns a dict of counters: processed, scheduled, excluded,
        saved (URIs diverted because their workset is not active),
        td (seconds spent waiting on the incoming queue) and
        ts (seconds spent on seen check and scheduling).'''

        # lazy initialization of seen db
        self.init_seen()

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for _ in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            # None from the queue ends this round
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                # schedule only if the 'e' value of the seen record lies
                # in the past (presumably an expiry time -- TODO confirm)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        # collect the optional p/v/x attributes
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def makecuri(self, o):
        """Normalize o in place so that its attributes sit under 'a', and
        return it.

        If o has no 'a' key: a 'w' key is renamed to 'a'; otherwise any
        of the keys 'p', 'x', 'v' present in o are moved into a new dict
        stored under 'a' (no 'a' is added when none of them is present).
        """
        if 'a' not in o:
            if 'w' in o:
                o['a'] = o['w']
                del o['w']
            else:
                a = dict()
                for k in 'pxv':
                    if k in o:
                        a[k] = o[k]
                        del o[k]
                if a: o['a'] = a
        return o

    def feed(self, client, n):
        """Ask the scheduler for up to n CURIs for client and return them
        as a list, each passed through makecuri().

        When use_crawlinfo is enabled, the crawlinfo db is asked to
        update the CURIs first; a warning is logged if that takes more
        than 0.5s per CURI on average.
        """
        logging.debug('feed %s begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.crawlinfodb:
            t0 = time.time()
            self.crawlinfodb.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning("SLOW update_crawlinfo: %s %.3fs/%d",
                                client, t, len(curis))
            self.crawlinfodb.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        return r

    def finished(self, curis):
        """Report curis as finished to the scheduler and, when
        save_crawlinfo is enabled, save each one to the crawlinfo db.
        Returns dict(processed=<number of curis>)."""
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.crawlinfodb:
            for curi in curis:
                self.crawlinfodb.save_result(curi)
            # XXX - until I come up with better design
            self.crawlinfodb.mongo.end_request()
        return result

    def reset(self, client):
        """Delegate to the scheduler's reset() for client."""
        return self.scheduler.reset(client)

    def flush(self):
        """Flush and close the incoming queue, then flush the scheduler's
        clients and return that result."""
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
Esempio n. 6
0
 def __init__(self, dbdir):
     """Open the seen database under dbdir and start with an empty
     buffer. options.cachesize is scaled by 1024*1024 before being
     passed to Seen."""
     cache_arg = options.cachesize * (1024 ** 2)
     self.seen = Seen(dbdir, cache_arg)
     self.buffer = list()
Esempio n. 7
0
class CrawlJob(object):
    """A crawl job: takes URIs from a file-based incoming queue, checks
    them against the seen database and passes them to the scheduler."""

    def __init__(self, jobconfigs, jobname, domaininfo):
        """Set up the incoming queue, workset mapper, seen database and
        scheduler for job jobname. jobconfigs and domaininfo are stored
        as given."""
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.qdir = hqconfig.inqdir(self.jobname)

        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.domaininfo = domaininfo

        self.scheduler = Scheduler(self.jobname, self.mapper)

    def shutdown(self):
        """Close the seen db, then shut down the scheduler."""
        logging.info("closing seen db")
        self.seen.close()
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("done.")

    def get_status(self):
        """Return a dict with the job name, id(self) and the status of
        the scheduler and the incoming queue."""
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    # def get_workset_status(self):
    #     r = dict(job=self.jobname, crawljob=id(self))
    #     if self.scheduler:
    #         r['sch'] = id(self.scheduler)
    #         r['worksets'] = self.scheduler.get_workset_status()
    #     return r

    def get_domaininfo(self, url):
        """Look up domaininfo for the host part of url.

        The host is the netloc of url with everything from the first ':'
        on removed. Returns whatever self.domaininfo.get() returns.
        """
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.

           returns dict(processed=N, scheduled=N) where N is the number
           of curis.'''
        scheduled = 0
        for curi in curis:
            # go through the scheduler, as processinq does; this class
            # never assigns self.worksets, so indexing it raised
            # AttributeError on every call.
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        returns a dict of counters: processed, scheduled, excluded,
        td (seconds spent waiting on the incoming queue) and
        ts (seconds spent on seen check and scheduling).'''
        result = dict(processed=0, scheduled=0, excluded=0, td=0.0, ts=0.0)
        for _ in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)

            result['td'] += (time.time() - t0)
            # None from the queue ends this round
            if furi is None: break
            result['processed'] += 1
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            # schedule only if the 'e' value of the seen record lies in
            # the past (presumably an expiry time -- TODO confirm)
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    w = furi['w']
                else:
                    # collect the optional p/v/x attributes
                    w = dict()
                    for k in ('p','v','x'):
                        m = furi.get(k)
                        if m is not None:
                            w[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=w)
                self.scheduler.schedule(curi)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        # currently no access to MongoDB
        #self.mongo.end_request()
        return result

    def makecuri(self, o):
        """Return o unchanged."""
        return o

    def flush(self):
        """Flush the seen db, then flush the scheduler and return its
        result."""
        self.seen.flush()
        return self.scheduler.flush()
Esempio n. 8
0
class Dispatcher(object):
    """Moves URIs from a job's incoming queue to the scheduler.

    processinq() takes URIs from the incoming queue; URIs whose workset
    is active are checked against domaininfo and the seen database and
    scheduled, the others are handed to the diverter.
    """
    # InqueueWatcher shared by all Dispatcher instances; created and
    # started by the first instance constructed
    inqwatcher = None

    # TODO: take JobConfig, instead of job
    def __init__(self, domaininfo, job, mapper,
                 scheduler, inq=None):
        """Store the collaborators and set up queue, diverter, excluded
        list and directory watch for job (a job name).

        inq defaults to a FileDequeue over the job's inq directory.
        The seen database is not opened here (see init_seen).
        """
        self.domaininfo = domaininfo
        self.jobname = job
        self.mapper = mapper
        self.scheduler = scheduler

        # TODO: inject these objects from outside
        qdir = hqconfig.inqdir(self.jobname)
        self.inq = inq
        if self.inq is None:
            self.inq = FileDequeue(qdir, reader=FPSortingQueueFileReader)
        # seen database is initialized lazily
        #self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
        self.seen = None
        self.diverter = Diverter(self.jobname, self.mapper)
        self.excludedlist = ExcludedList(self.jobname)

        # one flag per workset: 1 while active, 0 otherwise
        self.workset_state = [0] * self.mapper.nworksets

        # TODO: this could be combined with FileDequeue above in a single class
        if Dispatcher.inqwatcher is None:
            iqw = Dispatcher.inqwatcher = InqueueWatcher()
            iqw.start()
        # watch the same inq directory computed above
        self.watch = Dispatcher.inqwatcher.addwatch(qdir)

    def shutdown(self):
        """Close the seen db if it was opened, then shut down the
        diverter and the excluded list. The scheduler is left alone."""
        #if self.job: self.job.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
            self.seen = None
        # logging.info("shutting down scheduler")
        # self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        logging.info("shutting down excludedlist")
        self.excludedlist.shutdown()
        logging.info("done.")

    def flush(self):
        """flushes URIs buffered in workset objects"""
        # NOTE(review): currently a no-op that returns None; the
        # delegation below is commented out.
        #return self.job.flush()

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; start sending CURIs to Scheduler
        and enqueue diverted CURIs back into incoming queue so that
        processinq will process them (again). called by Scheduler,
        through CrawlMapper, when client starts feeding.
        note, unlike workset_deactivating, this method shall not be
        called from inside processinq method below, because processinq
        executes it only when at least one CURI is available for processing.
        if inq is empty, CURIs in divert queues would never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; start sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def init_seen(self):
        """Open the seen database on first use.

        Does nothing when self.seen is already set (truthy). A truthy
        'seencache' setting is converted to int and multiplied by 1024**2
        before being passed to Seen as block_cache_size; if reading or
        converting it fails, None is passed instead.
        """
        if self.seen:
            return
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            # best effort, but do not swallow KeyboardInterrupt/SystemExit
            # the way a bare except would.
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises
        upper limit on number of URIs processed in this single call.
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.

        returns a dict of counters: processed, scheduled, excluded,
        saved (URIs diverted because their workset is not active),
        td (seconds spent waiting on the incoming queue) and
        ts (seconds spent on seen check and scheduling).'''

        # lazy initialization of seen db
        self.init_seen()

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for _ in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)

            result['td'] += (time.time() - t0)
            # None from the queue ends this round
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler
                di = self.domaininfo.get_byurl(furi['u'])
                if di and di['exclude']:
                    self.excludedlist.add(furi)
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                # schedule only if the 'e' value of the seen record lies
                # in the past (presumably an expiry time -- TODO confirm)
                if suri['e'] < int(time.time()):
                    curi = dict(u=furi['u'], id=suri['_id'])
                    # p/v/x attributes come from furi['w'] when that is a
                    # dict, otherwise from furi itself
                    a = furi.get('w')
                    if not isinstance(a, dict): a = furi
                    for k in 'pvx':
                        m = a.get(k)
                        if m is not None: curi[k] = m
                    self.scheduler.schedule(curi)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def wait_available(self, timeout=None):
        """Wait on the incoming-queue directory watch for up to timeout
        and return the result of that wait."""
        return self.watch.wait(timeout)