class ExcludedList(object):
    """URL list for storing excluded URLs.

    As URLs are checked for exclusion before seen check, there are
    (a lot of) duplicates. Read-out is not supported because current HQ
    makes no use of these URLs.
    """
    # TODO: duplicated code with DivertQueue

    def __init__(self, jobname, bufsize=20):
        """Prepare the job's ``ex`` directory and open an enqueue on it.

        :param jobname: job name; files go under
            ``<datadir>/<jobname>/ex`` where ``datadir`` comes from
            ``hqconfig.get('datadir')``.
        :param bufsize: passed to ``FileEnqueue`` as ``buffer``.
        :raises OSError: if the directory cannot be created.
        """
        self.qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
        self._ensure_qdir(self.qdir)
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='ex')
        # number of URLs passed to add() by this instance.
        self.queuedcount = 0

    @staticmethod
    def _ensure_qdir(path):
        """Create directory *path* (with parents) unless it already exists."""
        # Try-then-check instead of check-then-create: another process
        # may create the directory between an isdir() test and
        # makedirs(), which would make makedirs() fail spuriously.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    def flush(self):
        """Call the enqueue's ``_flush()`` then ``close()``.

        :returns: whatever ``self.enq.close()`` returns.
        """
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        """Flush and close the underlying enqueue; returns nothing."""
        self.flush()

    def get_status(self):
        """Return a dict with ``queued``, the number of URLs added so far."""
        r = dict(queued=self.queuedcount)
        return r

    def add(self, furi):
        """Queue *furi* on the enqueue and bump the queued counter."""
        self.enq.queue(furi)
        self.queuedcount += 1
class CrawlJob(object):
    """Writes discovered URIs to a ``FileEnqueue`` in the job's
    incoming-queue directory (``hqconfig.inqdir(<job name>)``).
    """

    def __init__(self, jobconfig):
        """Open the enqueue for the job described by *jobconfig*.

        :param jobconfig: object whose ``name`` attribute selects the
            incoming-queue directory.
        """
        self.jobconfig = jobconfig
        inqdir = hqconfig.inqdir(self.jobconfig.name)
        # the process id is used as the file suffix.
        self.enq = FileEnqueue(
            qdir=inqdir,
            suffix=os.getpid(),
            buffer=1000,
            executor=None,
            gzip=9,
        )

    def discovered(self, curis):
        """Queue *curis* in one call and report how many were given.

        :param curis: sized collection of URIs.
        :returns: ``{'processed': len(curis)}``.
        """
        self.enq.queue(curis)
        return {'processed': len(curis)}

    def flush(self):
        """Call the enqueue's ``_flush()`` then ``close()``; returns nothing."""
        enq = self.enq
        enq._flush()
        enq.close()

    def shutdown(self):
        """Flush and close the enqueue."""
        self.flush()
class DivertQueue(object):
    """Workset compatible class for storing URIs for delivery to
    external services (including other HQ).
    """

    def __init__(self, basedir, name, bufsize=500):
        """Open an enqueue on ``<basedir>/<name>``.

        :param basedir: parent directory of the queue directory.
        :param name: queue name; ``str(name)`` is the directory name.
        :param bufsize: passed to ``FileEnqueue`` as ``buffer``.
        """
        self.name = name
        self.qdir = os.path.join(basedir, str(name))
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='d')
        # number of URIs passed to schedule() by this instance.
        self.queuedcount = 0

    def flush(self):
        """Call the enqueue's ``_flush()`` then ``close()``.

        :returns: whatever ``self.enq.close()`` returns.
        """
        self.enq._flush()
        return self.enq.close()

    def shutdown(self):
        """Flush and close the underlying enqueue; returns nothing."""
        self.flush()

    def get_status(self):
        """Return a dict with this queue's ``name`` and ``queued`` count."""
        r = dict(name=self.name, queued=self.queuedcount)
        return r

    def schedule(self, curi):
        """Queue *curi* on the enqueue and bump the queued counter."""
        self.enq.queue(curi)
        self.queuedcount += 1

    def listqfiles(self):
        """Return absolute paths of queue files found in ``self.qdir``.

        A directory entry is included when its name starts with an ASCII
        digit and does not end with ``.open`` (presumably the marker for
        a file still being written -- confirm against FileEnqueue).

        :returns: list of absolute paths, in ``os.listdir`` order; an
            empty list if the directory cannot be listed.
        """
        # Only the directory listing is best-effort. Catching OSError
        # rather than everything lets KeyboardInterrupt/SystemExit and
        # programming errors propagate.
        try:
            fns = os.listdir(self.qdir)
        except OSError:
            return []
        return [
            os.path.abspath(os.path.join(self.qdir, fn))
            for fn in fns
            if '0' <= fn[0] <= '9' and not fn.endswith('.open')
        ]
class WorksetWriter(object):
    """Writing side of Workset."""

    def __init__(self, wsdir, wsid):
        """Open an enqueue on ``<wsdir>/<wsid>``.

        :param wsdir: parent directory of the workset directories.
        :param wsid: workset id; ``str(wsid)`` is the directory name.
        """
        self.wsid = wsid
        self.qdir = os.path.join(wsdir, str(wsid))
        FileEnqueue.recover(self.qdir)
        self.enq = FileEnqueue(self.qdir, buffer=500)
        # number of URIs passed to schedule() by this instance.
        self.scheduledcount = 0

    def schedule(self, curi):
        """Queue *curi* on the enqueue and bump the scheduled counter."""
        self.enq.queue(curi)
        self.scheduledcount += 1

    def get_status(self):
        """Return a dict with ``id``, ``running`` (always True) and
        ``scheduled`` (count of URIs scheduled so far).
        """
        return {
            'id': self.wsid,
            'running': True,
            'scheduled': self.scheduledcount,
        }

    def flush(self):
        """Call the enqueue's ``_flush()`` then ``close()``.

        :returns: whatever ``self.enq.close()`` returns.
        """
        enq = self.enq
        enq._flush()
        closed = enq.close()
        return closed

    def shutdown(self):
        """Flush and close the underlying enqueue; returns nothing."""
        self.flush()