コード例 #1
0
ファイル: dispatcher.py プロジェクト: travisfw/crawlhq
class ExcludedList(object):
    """Write-only list that records excluded URLs.

    Exclusion is checked before the seen check, so the same URL can be
    recorded many times. No read-out API is offered because current HQ
    makes no use of these URLs.
    """
    # TODO: duplicated code with DivertQueue
    def __init__(self, jobname, bufsize=20):
        """Prepare the ``ex`` directory of job `jobname` and open an enqueue on it.

        `bufsize` is handed to FileEnqueue as its ``buffer`` argument.
        """
        datadir = hqconfig.get('datadir')
        self.qdir = os.path.join(datadir, jobname, 'ex')
        directory_exists = os.path.isdir(self.qdir)
        if not directory_exists:
            os.makedirs(self.qdir)
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='ex')
        # number of URLs passed to add() so far
        self.queuedcount = 0

    def flush(self):
        """Flush the enqueue, then close it and return what close() returned."""
        writer = self.enq
        writer._flush()
        return writer.close()

    def shutdown(self):
        """Flush and close the enqueue; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict holding the number of URLs added, under ``queued``."""
        return {'queued': self.queuedcount}

    def add(self, furi):
        """Queue `furi` on the enqueue and count it."""
        self.enq.queue(furi)
        self.queuedcount = self.queuedcount + 1
コード例 #2
0
ファイル: inq.py プロジェクト: travisfw/hq
class CrawlJob(object):
    """Receives discovered URIs for one job and queues them via a FileEnqueue."""

    def __init__(self, jobconfig):
        """Keep `jobconfig` and open an enqueue for the job.

        The enqueue directory comes from ``hqconfig.inqdir`` for the job's
        name; the current process id is passed as the ``suffix`` argument.
        """
        self.jobconfig = jobconfig
        self.enq = FileEnqueue(
            qdir=hqconfig.inqdir(jobconfig.name),
            suffix=os.getpid(),
            buffer=1000,
            executor=None,
            gzip=9,
        )

    def discovered(self, curis):
        """Queue `curis` and report how many items were handed over.

        Returns a dict with the length of `curis` under ``processed``.
        """
        self.enq.queue(curis)
        return {'processed': len(curis)}

    def shutdown(self):
        """Flush and close the enqueue."""
        self.flush()

    def flush(self):
        """Flush the enqueue and close it; nothing is returned."""
        writer = self.enq
        writer._flush()
        writer.close()
コード例 #3
0
ファイル: diverter.py プロジェクト: kngenie/crawlhq
class DivertQueue(object):
    """Workset compatible class for storing URIs for delivery to external
    services (including other HQ)."""
    def __init__(self, basedir, name, bufsize=500):
        """Open an enqueue in the directory ``basedir/str(name)``.

        :param basedir: parent directory of the queue directory.
        :param name: queue name; its ``str()`` form is the directory name.
        :param bufsize: passed to FileEnqueue as its ``buffer`` argument.
        """
        self.name = name
        self.qdir = os.path.join(basedir, str(name))
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='d')

        # number of URIs passed to schedule() so far
        self.queuedcount = 0

    def flush(self):
        """Flush the enqueue, then close it and return what close() returned."""
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        """Flush and close the enqueue; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict with the queue ``name`` and the ``queued`` count."""
        r = dict(name=self.name, queued=self.queuedcount)
        return r

    def schedule(self, curi):
        """Queue `curi` on the enqueue and count it."""
        self.enq.queue(curi)
        self.queuedcount += 1

    def listqfiles(self):
        """Return absolute paths of the queue files found in ``self.qdir``.

        An entry is included when its name starts with an ASCII digit and
        does not end with ``.open`` (presumably files still being written
        -- TODO confirm against FileEnqueue).

        :returns: list of absolute paths; an empty list when the directory
            cannot be listed (e.g. it does not exist). Order is whatever
            ``os.listdir`` yields.
        """
        try:
            # Only OS-level failures are treated as "no files". A bare
            # ``except:`` here used to hide KeyboardInterrupt, SystemExit
            # and genuine programming errors as well.
            return [
                os.path.abspath(os.path.join(self.qdir, fn))
                for fn in os.listdir(self.qdir)
                if '0' <= fn[0] <= '9' and not fn.endswith('.open')
            ]
        except OSError:
            return []
コード例 #4
0
ファイル: dispatcher.py プロジェクト: travisfw/hq
class WorksetWriter(object):
    """Write side of a Workset: queues URIs into the workset's directory."""

    def __init__(self, wsdir, wsid):
        """Open an enqueue in the directory ``wsdir/str(wsid)``."""
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(wsid))
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=500)

        # number of URIs passed to schedule() so far
        self.scheduledcount = 0

    def flush(self):
        """Flush the enqueue, then close it and return what close() returned."""
        writer = self.enq
        writer._flush()
        return writer.close()

    def shutdown(self):
        """Flush and close the enqueue; the result of flush() is discarded."""
        self.flush()

    def get_status(self):
        """Return a dict with the workset ``id``, ``running`` (always True)
        and the ``scheduled`` count."""
        return {
            'id': self.wsid,
            'running': True,
            'scheduled': self.scheduledcount,
        }

    def schedule(self, curi):
        """Queue `curi` on the enqueue and count it."""
        self.enq.queue(curi)
        self.scheduledcount = self.scheduledcount + 1