def __call__(self, job):
    # 'seencache' parameter is in MB
    seen = self.job_seen.get(job)
    if not seen:
        cachesize = hqconfig.get('seencache')
        if cachesize: cachesize = int(cachesize)*(1024**2)
        # pass the computed cache size on to Seen (the original code
        # computed cachesize but never used it)
        seen = Seen(dbdir=hqconfig.seendir(job),
                    block_cache_size=cachesize)
        self.job_seen[job] = seen
    return seen
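# Worked example of the 'seencache' conversion above (the value '256' is
# illustrative, not from any real HQ config): the parameter is given in
# MB, while LevelDB's block_cache_size expects bytes.
#
#   >>> int('256') * (1024 ** 2)
#   268435456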
def init_seen(self):
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
def processinq(self, maxn):
    '''process incoming queue. maxn parameter advises upper limit on
    number of URIs processed in this single call. actual number of
    URIs processed may exceed it if incoming queue stores URIs in
    chunks.'''
    # lazy initialization of seen db
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except Exception:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
    result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                  td=0.0, ts=0.0)
    for count in xrange(maxn):
        t0 = time.time()
        furi = self.inq.get(0.01)
        # 'td' accumulates time spent waiting on the dequeue
        result['td'] += (time.time() - t0)
        # furi is None when the incoming queue is exhausted
        if furi is None: break
        result['processed'] += 1
        ws = self.mapper.workset(furi)
        if self.is_workset_active(ws):
            # no need to call self.workset_activating(). it's already
            # done by Scheduler.
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            # 'e' holds the expiration time; schedule only if it has passed
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    a = furi['w']
                else:
                    a = dict()
                    for k in ('p','v','x'):
                        m = furi.get(k)
                        if m is not None: a[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=a)
                self.scheduler.schedule(curi, ws)
                result['scheduled'] += 1
            # 'ts' accumulates seen-check and scheduling time
            result['ts'] += (time.time() - t0)
        else:
            if self.workset_state[ws]:
                self.workset_deactivating(ws)
            # client is not active
            self.diverter.divert(str(ws), furi)
            result['saved'] += 1
    return result
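# Usage sketch (assumed driver loop, not in the original source;
# 'processor' is a stand-in for the object owning processinq): drain the
# incoming queue in fixed-size batches and log the counters the method
# returns.
#
#   while True:
#       r = processor.processinq(500)
#       logging.info('processed=%d scheduled=%d saved=%d td=%.3f ts=%.3f',
#                    r['processed'], r['scheduled'], r['saved'],
#                    r['td'], r['ts'])
#       if r['processed'] == 0:
#           break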
def __init__(self, jobconfigs, jobname, domaininfo):
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    self.qdir = hqconfig.inqdir(self.jobname)
    self.inq = FileDequeue(self.qdir, reader=FPSortingQueueFileReader)
    self.mapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    self.seen = Seen(dbdir=hqconfig.seendir(self.jobname))
    self.domaininfo = domaininfo
    self.scheduler = Scheduler(self.jobname, self.mapper)
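# Instantiation sketch (the owning class name is not shown in this
# excerpt; 'IncomingProcessor' and the job name are stand-ins):
#
#   processor = IncomingProcessor(jobconfigs, 'wide', domaininfo)
#   stats = processor.processinq(500)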
        try:
            o = parse_crawllog(l)
        except Exception as ex:
            print >>sys.stderr, "skipped %s" % str(ex)
            continue
        if o is None: continue
        self.put(o)
        count += 1
        # trailing comma keeps the progress counter on one line
        print >>sys.stderr, "\r%d" % count,
    f.close()
    self.flush()
    sys.stderr.write("\n")

opt = OptionParser(usage='%prog [OPTIONS] JOB crawl.log...')
opt.add_option('-C', action='store', dest='cachesize', type='int',
               default=1024,
               help='LevelDB block cache size in MiB (default 1024)')
opt.add_option('-b', action='store', dest='batchsize', type='int',
               default=1000000,
               help='number of URIs to process in a batch (default 1M)')
options, args = opt.parse_args()
if len(args) < 1:
    opt.error('specify JOB')
job = args.pop(0)
batchseen = BatchSeenWriter(hqconfig.seendir(job))
for fn in args:
    batchseen.processfile(fn)
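# Example invocation (the script filename is not shown in this excerpt;
# 'loadseen.py' and the job/log names are stand-ins). Loads crawl.log
# entries for job 'wide' into its seen database with a 2 GiB block cache
# and 500k-URI batches:
#
#   $ python loadseen.py -C 2048 -b 500000 wide crawl.log crawl.log.1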
#!/usr/bin/python
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '../lib'))
from optparse import OptionParser

import hqconfig
import leveldb

opt = OptionParser('%prog JOB')
opt.add_option('-f', action='store_true', dest='force', default=False,
               help='force repair even if this script guessed it is not a'
               ' valid LevelDB directory')
options, args = opt.parse_args()
if len(args) == 0:
    opt.error('specify job name')
job = args[0]
seendir = hqconfig.seendir(job)
if not os.path.isdir(seendir):
    opt.error('%s is not a directory' % seendir)
if not os.path.isfile(os.path.join(seendir, 'CURRENT')):
    print >>sys.stderr, "%s does not seem to be a LevelDB directory" % seendir
    if not options.force:
        exit(1)
seendir_owner = os.stat(seendir).st_uid
if seendir_owner != os.geteuid():
    print >>sys.stderr, "%s owner (%d) does not match effective user (%d)" % (
        seendir, seendir_owner, os.geteuid())
    if not options.force:
        exit(1)
leveldb.IntHash.repair_db(seendir)
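# Example invocation (the script filename is not shown in this excerpt;
# 'repairseen.py' and the job name are stand-ins). Repairs the seen
# LevelDB for job 'wide'; -f proceeds despite failed sanity checks:
#
#   $ ./repairseen.py wide
#   $ ./repairseen.py -f wide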