def add_iter(it, pool, lck, timeout=None):
    """Add every element of *it* to the set *pool* under lock *lck*.

    Args:
        it: iterable of hashable items to insert.
        pool: target set shared across threads.
        lck: lock guarding *pool*.
        timeout: seconds to wait for the lock; defaults to the module-level
            ``lock_Timeout`` (backward compatible with the old signature).
    """
    if timeout is None:
        timeout = lock_Timeout
    if lck.acquire(timeout=timeout):
        try:
            # set.add is idempotent, so the original membership pre-check
            # was redundant.
            for item in it:
                pool.add(item)
        finally:
            # Release even if iterating *it* raises — the original leaked
            # the lock on any exception inside the loop.
            lck.release()
    else:
        logger.info("Fail to update the Pool.")
def del_iter(it, pool, lck, timeout=None):
    """Remove every element of *it* from the set *pool* under lock *lck*.

    Args:
        it: iterable of items to remove; items not in *pool* are ignored.
        pool: target set shared across threads.
        lck: lock guarding *pool*.
        timeout: seconds to wait for the lock; defaults to the module-level
            ``lock_Timeout`` (backward compatible with the old signature).
    """
    if timeout is None:
        timeout = lock_Timeout
    if lck.acquire(timeout=timeout):
        try:
            # discard == "remove if present", matching the original's
            # check-then-remove without the double lookup.
            for item in it:
                pool.discard(item)
        finally:
            # Release even if iterating *it* raises — the original leaked
            # the lock on any exception inside the loop.
            lck.release()
    else:
        logger.info("Fail to clean the Pool.")
def write_iter(fname, it, lck, wrtm="w", timeout=None):
    """Write each item of *it* to *fname*, one per line, under lock *lck*.

    No-op when *it* is empty (preserving the original guard, which also
    avoids truncating an existing file for nothing).

    Args:
        fname: path of the output file.
        it: iterable of strings to dump.
        lck: lock guarding *it* (a shared pool).
        wrtm: open mode, "w" to overwrite or "a" to append.
        timeout: seconds to wait for the lock; defaults to the module-level
            ``lock_Timeout``.

    Fixes vs. the original:
    - It wrote ``tmp.encode(...)`` (bytes) into a text-mode file, which
      raises TypeError; we open the file with an explicit UTF-8 codec and
      write str instead.
    - It opened (and, in "w" mode, truncated) the file even when the lock
      was never acquired; the lock is now taken first.
    - The lock is released in ``finally`` so a write error cannot leak it.
    """
    if not it:
        return
    if timeout is None:
        timeout = lock_Timeout
    if lck.acquire(timeout=timeout):
        try:
            with open(fname, wrtm, encoding="utf-8", errors="ignore") as f:
                for line in it:
                    f.write(line)
                    f.write("\n")
        finally:
            lck.release()
    else:
        logger.info("Nest: Fail to dump the Pool.")
def launch_server(self):
    """Serve requests on ``self.addr`` until ``self.run`` is cleared.

    Receives a JSON request, dispatches it to ``self.handle``, and sends
    the JSON reply.

    Fixes vs. the original:
    - The socket was created as ``zmq.PUB``, which cannot receive;
      ``recv_json`` on it raises. A recv/handle/send loop is the REP side
      of a REQ/REP pair, so we use ``zmq.REP``.
    - If ``recv_json`` raised, ``req`` was unbound and the subsequent
      ``send_json(self.handle(req))`` crashed with NameError; we now skip
      the reply and keep serving.
    """
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(self.addr)
    while self.run:
        try:
            req = socket.recv_json()
        except Exception as e:
            logger.info(e)
            continue  # no request was received; nothing to reply to
        socket.send_json(self.handle(req))
    socket.close()
def mergePool(dValue, dLock, cmd, timeout=None):
    """Merge one pool into another per a command like ``"src to tgt"``.

    The first and last whitespace-separated tokens of *cmd* name the
    source and target pools (``"Pool"`` suffix is appended).

    Args:
        dValue: mapping of pool name -> set of entries.
        dLock: mapping of pool name -> lock guarding the matching set.
        cmd: command string; first token is the source, last the target.
        timeout: seconds to wait for each lock; defaults to the
            module-level ``lock_Timeout``.

    Fixes vs. the original: neither lock was EVER released (deadlocking
    all later pool operations), and the short-circuit ``and`` leaked the
    source lock whenever the target acquire timed out. Both locks are now
    released in ``finally`` blocks.
    """
    if timeout is None:
        timeout = lock_Timeout
    tokens = cmd.strip().split()
    srcPool = tokens[0] + "Pool"
    tgtPool = tokens[-1] + "Pool"
    if (srcPool == tgtPool) or (srcPool not in dValue) or (tgtPool not in dValue):
        logger.info("Illegal command.")
        return
    if not dLock[srcPool].acquire(timeout=timeout):
        logger.info("Fail to merge the Pool.")
        return
    try:
        if not dLock[tgtPool].acquire(timeout=timeout):
            logger.info("Fail to merge the Pool.")
            return
        try:
            dValue[tgtPool] |= dValue[srcPool]
        finally:
            dLock[tgtPool].release()
    finally:
        dLock[srcPool].release()
def delPool(dValue, dLock, cmd, timeout=None):
    """Remove URL-quoted entries from a named pool.

    *cmd* is ``"<pool> <item> <item> ..."``: the first token (plus the
    ``"Pool"`` suffix) selects the pool; remaining tokens are unquoted
    and removed from it.

    Args:
        dValue: mapping of pool name -> set of entries.
        dLock: mapping of pool name -> lock guarding the matching set.
        cmd: command string as described above.
        timeout: seconds to wait for the lock; defaults to the
            module-level ``lock_Timeout``.
    """
    if timeout is None:
        timeout = lock_Timeout

    def _del_iter(it, pool, lck):
        # Remove each item under the lock; release in finally so an
        # exception mid-loop cannot leak the lock (original defect).
        if lck.acquire(timeout=timeout):
            try:
                for item in it:
                    pool.discard(item)
            finally:
                lck.release()
        else:
            logger.info("Fail to clean the Pool.")

    tokens = cmd.strip().split()
    items = [unquote(tok) for tok in tokens[1:]]
    key = tokens[0] + "Pool"
    if key in dValue:
        _del_iter(items, dValue[key], dLock[key])
    else:
        logger.info("Illegal pool: " + tokens[0])
def addPool(dValue, dLock, cmd, timeout=None):
    """Add URL-quoted entries to a named pool.

    *cmd* is ``"<pool> <item> <item> ..."``: the first token (plus the
    ``"Pool"`` suffix) selects the pool; remaining tokens are unquoted
    and inserted into it.

    Args:
        dValue: mapping of pool name -> set of entries.
        dLock: mapping of pool name -> lock guarding the matching set.
        cmd: command string as described above.
        timeout: seconds to wait for the lock; defaults to the
            module-level ``lock_Timeout``.

    Fixes vs. the original: ``tmp[0] + Pool`` referenced an undefined
    name ``Pool`` (NameError on every call); the string literal
    ``"Pool"`` was clearly intended, matching delPool/mergePool.
    """
    if timeout is None:
        timeout = lock_Timeout

    def _add_iter(it, pool, lck):
        # Insert each item under the lock; release in finally so an
        # exception mid-loop cannot leak the lock (original defect).
        if lck.acquire(timeout=timeout):
            try:
                for item in it:
                    pool.add(item)  # set.add is idempotent
            finally:
                lck.release()
        else:
            logger.info("Fail to update the Pool.")

    tokens = cmd.strip().split()
    items = [unquote(tok) for tok in tokens[1:]]
    key = tokens[0] + "Pool"
    if key in dValue:
        _add_iter(items, dValue[key], dLock[key])
    else:
        logger.info("Illegal pool: " + tokens[0])
def t_core(self, dValue, dLock, lock_Timeout = 3.0):
    """Worker loop: drain URLs from the to-do pool until ``self.working``
    is cleared.

    Each URL popped from the to-do pool is first recorded in both the
    cache pool and the fail pool (fail-safe bookkeeping: processURL later
    removes it from both on success) before being processed.
    """
    url_ext = URLExtractor()
    while self.working:
        # Guard clause: without the to-do lock we can only log and retry.
        if not dLock["todoPool"].acquire(timeout = lock_Timeout):
            logger.info(self.prompt + "Fail to get the lock of to-do Pool.")
            continue
        url = None
        if dValue["todoPool"]:
            url = dValue["todoPool"].pop()
        dLock["todoPool"].release()
        if url is None:
            continue  # pool was empty; poll again
        # Record in the cache pool (best effort).
        if dLock["cachePool"].acquire(timeout = lock_Timeout):
            dValue["cachePool"].add(url)
            dLock["cachePool"].release()
        else:
            logger.info(self.prompt + "Fail to add %s to cache Pool." % (url))
        # Record in the fail pool (best effort).
        if dLock["failPool"].acquire(timeout = lock_Timeout):
            dValue["failPool"].add(url)
            dLock["failPool"].release()
        else:
            logger.info(self.prompt + "Fail to add %s to fail Pool." % (url))
        self.processURL(url, dValue, dLock, url_ext, lock_Timeout)
def dumpPool(dValue, dLock, lock_Timeout=3.0):
    """Dump the to-do, done, cache, and fail pools to their module-level
    target files (``todoPoolf`` etc.); the fail pool is appended, the
    others overwritten.

    Args:
        dValue: mapping of pool name -> set of entries.
        dLock: mapping of pool name -> lock guarding the matching set.
        lock_Timeout: seconds to wait for each pool's lock.

    Fixes vs. the original nested helper: it wrote ``tmp.encode(...)``
    (bytes) into a text-mode file, raising TypeError; it opened (and, in
    "w" mode, truncated) the file even when the lock was never acquired;
    and it leaked the lock if a write raised.
    """
    def _write_iter(fname, it, lck, wrtm="w"):
        # Dump one pool, one entry per line; no-op for an empty pool.
        if not it:
            return
        if lck.acquire(timeout=lock_Timeout):
            try:
                with open(fname, wrtm, encoding="utf-8", errors="ignore") as f:
                    for line in it:
                        f.write(line)
                        f.write("\n")
            finally:
                lck.release()
        else:
            logger.info("Nest: Fail to dump the Pool.")

    logger.info("Nest: dump to-do pool")
    _write_iter(todoPoolf, dValue["todoPool"], dLock["todoPool"])
    logger.info("Nest: dump done pool")
    _write_iter(donePoolf, dValue["donePool"], dLock["donePool"])
    logger.info("Nest: dump cache pool")
    _write_iter(cachePoolf, dValue["cachePool"], dLock["cachePool"])
    logger.info("Nest: dump fail pool")
    _write_iter(failPoolf, dValue["failPool"], dLock["failPool"], "a")
def processURL(self, url, dValue, dLock, exter, lck_timeout = 3.0):
    """Fetch *url*, queue newly extracted links, and update bookkeeping
    pools.

    On a successful fetch: extracts links from the page (when
    extract_Page approves), filters out already-done URLs, adds the rest
    to the to-do pool, saves the page, then moves the URL out of the
    cache and fail pools and records ``real_url`` in the done pool.

    Args:
        url: the URL popped from the to-do pool.
        dValue: mapping of pool name -> set of entries.
        dLock: mapping of pool name -> lock guarding the matching set.
        exter: URL extractor with an ``extract(data, proto, host)`` method.
        lck_timeout: seconds to wait for each pool's lock.

    Fixes vs. the original:
    - The extraction loops reused ``url`` as the loop variable,
      clobbering the parameter so the later cache/fail-pool cleanup
      removed the wrong URL.
    - The second loop iterated raw ``urls`` instead of ``todo``,
      re-queueing URLs already filtered out as done.
    - Dropped the no-op ``real_url = real_url``.
    """
    data, proto_prefix, host, real_url, http_code, infos, is_Decoded, encode_method, confidence = getURL(url)
    if data is None:
        return
    if real_url != url:
        logger.info("%s%s was redirected to %s" % (self.prompt, url, real_url,))
    success_ext = True
    if extract_Page(url, host, real_url, proto_prefix, data, is_Decoded):
        urls = exter.extract(data, proto_prefix, host)
        if urls:
            todo = []
            if dLock["donePool"].acquire(timeout = lck_timeout):
                # Distinct loop variable: must not clobber the `url`
                # parameter used below for pool cleanup.
                for u in urls:
                    if u not in dValue["donePool"]:
                        todo.append(u)
                dLock["donePool"].release()
                if todo:
                    if dLock["todoPool"].acquire(timeout = lck_timeout):
                        # Iterate the done-filtered `todo`, not raw `urls`.
                        for u in todo:
                            if u not in dValue["todoPool"]:
                                dValue["todoPool"].add(u)
                        dLock["todoPool"].release()
                    else:
                        logger.info(self.prompt + "Fail to get the lock of to-do Pool.")
                        success_ext = False
            else:
                logger.info(self.prompt + "Fail to get the lock of done Pool.")
                success_ext = False
    saveURL(real_url, data, "utf-8")
    if success_ext:
        if dLock["donePool"].acquire(timeout = lck_timeout):
            dValue["donePool"].add(real_url)
            dLock["donePool"].release()
            if dLock["cachePool"].acquire(timeout = lck_timeout):
                if url in dValue["cachePool"]:
                    dValue["cachePool"].remove(url)
                dLock["cachePool"].release()
                if dLock["failPool"].acquire(timeout = lck_timeout):
                    if url in dValue["failPool"]:
                        dValue["failPool"].remove(url)
                    dLock["failPool"].release()
                else:
                    logger.info(self.prompt + "Fail to remove %s from fail Pool." % (url))
            else:
                logger.info(self.prompt + "Fail to remove %s from cache Pool." % (url))
        else:
            logger.info(self.prompt + "Fail to get the lock of done Pool.")