def __call__(self, job):
    # 'seencache' parameter is in MB
    seen = self.job_seen.get(job)
    if not seen:
        cachesize = hqconfig.get('seencache')
        if cachesize: cachesize = int(cachesize)*(1024**2)
        # pass the computed cache size on to Seen (the original code
        # computed cachesize but never used it)
        seen = Seen(dbdir=hqconfig.seendir(job),
                    block_cache_size=cachesize)
        self.job_seen[job] = seen
    return seen
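# Worked example of the 'seencache' conversion above (the value '256' is
# illustrative, not from any real HQ config): the parameter is given in
# MB, while LevelDB's block_cache_size expects bytes.
#
#   >>> int('256') * (1024 ** 2)
#   268435456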
def init_seen(self):
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
def processinq(self, maxn):
    '''process incoming queue. maxn parameter advises upper limit on
    number of URIs processed in this single call. actual number of
    URIs processed may exceed it if incoming queue stores URIs in
    chunks.'''
    # lazy initialization of seen db
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
    result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                  td=0.0, ts=0.0)
    for count in xrange(maxn):
        t0 = time.time()
        furi = self.inq.get(0.01)
        # 'td' accumulates time spent waiting on the dequeue
        result['td'] += (time.time() - t0)
        # furi is None when the incoming queue is exhausted
        if furi is None: break
        result['processed'] += 1
        ws = self.mapper.workset(furi)
        if self.is_workset_active(ws):
            # no need to call self.workset_activating(). it's already
            # done by Scheduler.
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            # 'e' holds the expiration time; schedule only if it has passed
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    a = furi['w']
                else:
                    a = dict()
                    for k in ('p','v','x'):
                        m = furi.get(k)
                        if m is not None: a[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=a)
                self.scheduler.schedule(curi, ws)
                result['scheduled'] += 1
            # 'ts' accumulates seen-check and scheduling time
            result['ts'] += (time.time() - t0)
        else:
            if self.workset_state[ws]:
                self.workset_deactivating(ws)
            # client is not active
            self.diverter.divert(str(ws), furi)
            result['saved'] += 1
    return result
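# Usage sketch (assumed driver loop, not in the original source;
# 'processor' is a stand-in for the object owning processinq): drain the
# incoming queue in fixed-size batches and log the counters the method
# returns.
#
#   while True:
#       r = processor.processinq(500)
#       logging.info('processed=%d scheduled=%d saved=%d td=%.3f ts=%.3f',
#                    r['processed'], r['scheduled'], r['saved'],
#                    r['td'], r['ts'])
#       if r['processed'] == 0:
#           break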
def __init__(self, jobconfigs, jobname, domaininfo):
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    self.qdir = hqconfig.inqdir(self.jobname)
    self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
    self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
    self.domaininfo = domaininfo
    self.scheduler = Scheduler(self.jobname, self.mapper)
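# Instantiation sketch (the owning class name is not shown in this
# excerpt; 'IncomingProcessor' and the job name are stand-ins):
#
#   processor = IncomingProcessor(jobconfigs, 'wide', domaininfo)
#   stats = processor.processinq(500)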
        try:
            o = parse_crawllog(l)
        except Exception as ex:
            print >>sys.stderr, "skipped %s" % str(ex)
            continue
        if o is None: continue
        self.put(o)
        count += 1
        # trailing comma keeps the progress counter on one line
        print >>sys.stderr, "\r%d" % count,
    f.close()
    self.flush()
    sys.stderr.write("\n")

opt = OptionParser(usage='%prog [OPTIONS] JOB crawl.log...')
opt.add_option('-C', action='store', dest='cachesize', type='int',
               default=1024,
               help='LevelDB block cache size in MiB (default 1024)')
opt.add_option('-b', action='store', dest='batchsize', type='int',
               default=1000000,
               help='number of URIs to process in a batch (default 1M)')
options, args = opt.parse_args()
if len(args) < 1:
    opt.error('specify JOB')
job = args.pop(0)
batchseen = BatchSeenWriter(hqconfig.seendir(job))
for fn in args:
    batchseen.processfile(fn)
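# Example invocation (the script filename is not shown in this excerpt;
# 'loadseen.py' and the job/log names are stand-ins). Loads crawl.log
# entries for job 'wide' into its seen database with a 2 GiB block cache
# and 500k-URI batches:
#
#   $ python loadseen.py -C 2048 -b 500000 wide crawl.log crawl.log.1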
#!/usr/bin/python
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '../lib'))
from optparse import OptionParser

import hqconfig
import leveldb

opt = OptionParser('%prog JOB')
opt.add_option('-f', action='store_true', dest='force', default=False,
               help='force repair even if this script guessed it is not a'
               ' valid LevelDB directory')
options, args = opt.parse_args()
if len(args) == 0:
    opt.error('specify job name')
job = args[0]
seendir = hqconfig.seendir(job)
if not os.path.isdir(seendir):
    opt.error('%s is not a directory' % seendir)
if not os.path.isfile(os.path.join(seendir, 'CURRENT')):
    print >>sys.stderr, "%s does not seem to be a LevelDB directory" % seendir
    if not options.force:
        exit(1)
seendir_owner = os.stat(seendir).st_uid
if seendir_owner != os.geteuid():
    print >>sys.stderr, "%s owner (%d) does not match effective user (%d)" % (
        seendir, seendir_owner, os.geteuid())
    if not options.force:
        exit(1)
leveldb.IntHash.repair_db(seendir)
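# Example invocation (the script filename is not shown in this excerpt;
# 'repairseen.py' and the job name are stand-ins). Repairs the seen
# LevelDB for job 'wide'; -f proceeds despite failed sanity checks:
#
#   $ ./repairseen.py wide
#   $ ./repairseen.py -f wide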