コード例 #1
0
ファイル: seen.py プロジェクト: kngenie/crawlhq
 def __call__(self, job):
     """Return the Seen database for *job*, opening it on first use.

     One Seen instance per job is cached in self.job_seen so the
     database is only opened once.
     """
     seen = self.job_seen.get(job)
     if not seen:
         # 'seencache' config value is in MiB; convert to bytes.
         cachesize = hqconfig.get('seencache')
         if cachesize:
             cachesize = int(cachesize)*(1024**2)
         # BUG FIX: cachesize was computed but never used.  Pass it to
         # Seen as block_cache_size, consistent with
         # dispatcher.init_seen() in this project.
         seen = Seen(dbdir=hqconfig.seendir(job),
                     block_cache_size=cachesize)
         self.job_seen[job] = seen
     return seen
コード例 #2
0
ファイル: dispatcher.py プロジェクト: travisfw/crawlhq
 def init_seen(self):
     """Lazily open the Seen database for this job.

     The 'seencache' config value is in MiB and is converted to bytes
     for the block cache.  If the value is missing or malformed, fall
     back to block_cache_size=None (library default).
     """
     if not self.seen:
         try:
             cachesize = hqconfig.get('seencache')
             if cachesize:
                 cachesize = int(cachesize)*(1024**2)
         except Exception:
             # BUG FIX: was a bare "except:", which also swallowed
             # SystemExit/KeyboardInterrupt; narrowed to Exception.
             cachesize = None
         self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                          block_cache_size=cachesize)
コード例 #3
0
ファイル: hq.py プロジェクト: travisfw/hq
    def processinq(self, maxn):
        '''Process the incoming queue.

        :param maxn: advisory upper limit on the number of URIs
            processed in this single call.  The actual number processed
            may exceed it if the incoming queue stores URIs in chunks.
        :returns: dict of counters: processed, scheduled, excluded,
            saved, plus cumulative timings td (dequeue) and ts
            (seen-check/schedule).
        '''
        # lazy initialization of seen db
        if not self.seen:
            try:
                # 'seencache' config value is in MiB; convert to bytes.
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except Exception:
                # BUG FIX: narrowed from a bare "except:"; fall back to
                # the library default cache size on any config error.
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                # 'e' looks like an expiration timestamp; schedule only
                # when it has passed -- TODO confirm against Seen impl.
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        # collect optional per-URI attributes if present
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                # client is not active; divert the URI aside until the
                # workset becomes active again.
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result
コード例 #4
0
ファイル: dispatcher.py プロジェクト: travisfw/hq
    def __init__(self, jobconfigs, jobname, domaininfo):
        """Set up per-job incoming queue, seen db, mapper and scheduler."""
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.domaininfo = domaininfo

        # incoming queue of discovered URIs, read back in
        # fingerprint-sorted order.
        self.qdir = hqconfig.inqdir(self.jobname)
        self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)

        # maps each URI to one of a fixed number of worksets.
        self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)

        # seen-URI database under the job's seen directory.
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))

        self.scheduler = Scheduler(self.jobname, self.mapper)
コード例 #5
0
ファイル: seenrebuild.py プロジェクト: kngenie/crawlhq
            try:
                o = parse_crawllog(l)
            except Exception as ex:
                print >>sys.stderr, "skipped %s" % str(ex)
                continue
            if o is None:
                continue
            self.put(o)
            count += 1
            print >>sys.stderr, "\r%d" % count,
        f.close()
        self.flush()
        sys.stderr.write("\n")

# Command line: JOB followed by one or more crawl.log files.
opt = OptionParser(usage='%prog [OPTIONS] JOB crawl.log...')
opt.add_option(
    '-C', action='store', dest='cachesize', type='int', default=1024,
    help='LevelDB block cache size in MiB (default 1024)')
opt.add_option(
    '-b', action='store', dest='batchsize', type='int', default=1000000,
    help='number of URIs to process in a batch (default 1M)')

options, args = opt.parse_args()
if not args:
    opt.error('specify JOB')
job = args.pop(0)

# NOTE(review): options.cachesize / options.batchsize are parsed but not
# visibly passed to BatchSeenWriter -- confirm whether it consumes them.
batchseen = BatchSeenWriter(hqconfig.seendir(job))
for logfile in args:
    batchseen.processfile(logfile)
コード例 #6
0
ファイル: seenrepair.py プロジェクト: kngenie/crawlhq
#!/usr/bin/python
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '../lib'))
from optparse import OptionParser
import hqconfig
import leveldb

opt = OptionParser('%prog JOB')
opt.add_option('-f', action='store_true', dest='force', default=False,
               # BUG FIX: missing trailing space made the help text read
               # "not avalid level db directory".
               help='force repair even if this script guessed it is not a '
               'valid level db directory')
options, args = opt.parse_args()
if len(args) == 0:
    opt.error('specify job name')
job = args[0]
seendir = hqconfig.seendir(job)
# Sanity checks before running the (potentially destructive) repair.
if not os.path.isdir(seendir):
    # BUG FIX: the directory name was never interpolated into '%s'.
    opt.error('%s is not a directory' % seendir)
if not os.path.isfile(os.path.join(seendir, 'CURRENT')):
    print >>sys.stderr, "%s does not seem to be a LevelDB directory" % seendir
    if not options.force:
        exit(1)
# Repairing as the wrong user would leave files the crawler cannot
# write; refuse unless -f is given.
seendir_owner = os.stat(seendir).st_uid
if seendir_owner != os.geteuid():
    print >>sys.stderr, "%s owner (%d) does not match effective user (%d)" % (
        seendir, seendir_owner, os.geteuid())
    if not options.force:
        exit(1)
leveldb.IntHash.repair_db(hqconfig.seendir(job))