コード例 #1
0
 def add_iter(it, pool, lck):
     """Add every item of ``it`` to ``pool`` while holding ``lck``.

     The lock is requested with the module-level ``lock_Timeout``. If it
     cannot be obtained in time, nothing is added and a message is logged.

     Args:
         it: Iterable of items to insert.
         pool: Container supporting ``in`` and ``add``.
         lck: Lock whose ``acquire`` accepts a ``timeout`` keyword.
     """
     if not lck.acquire(timeout=lock_Timeout):
         logger.info("Fail to update the Pool.")
         return
     try:
         for item in it:
             if item not in pool:
                 pool.add(item)
     finally:
         # Release even when iteration or insertion raises, so the lock
         # is never left held by a failed update.
         lck.release()
コード例 #2
0
 def del_iter(it, pool, lck):
     """Remove every item of ``it`` that is present in ``pool``.

     The lock is requested with the module-level ``lock_Timeout``. If it
     cannot be obtained in time, nothing is removed and a message is logged.

     Args:
         it: Iterable of items to delete.
         pool: Container supporting ``in`` and ``remove``.
         lck: Lock whose ``acquire`` accepts a ``timeout`` keyword.
     """
     if not lck.acquire(timeout=lock_Timeout):
         logger.info("Fail to clean the Pool.")
         return
     try:
         for item in it:
             if item in pool:
                 pool.remove(item)
     finally:
         # Release even when iteration or removal raises, so the lock
         # is never left held by a failed clean-up.
         lck.release()
コード例 #3
0
 def write_iter(fname, it, lck, wrtm="w"):
     """Write each item of ``it`` to ``fname`` as one UTF-8 encoded line.

     Nothing is done when ``it`` is falsy. The lock is requested with the
     module-level ``lock_Timeout``; on timeout a message is logged and the
     file is left untouched.

     Args:
         fname: Path of the file to write.
         it: Iterable of text items; each is encoded with
             ``encode("utf-8", "ignore")``.
         lck: Lock guarding ``it``; its ``acquire`` accepts ``timeout``.
         wrtm: File mode such as ``"w"`` or ``"a"``. A ``"b"`` is appended
             when missing because encoded bytes are written.
     """
     if not it:
         return
     # Take the lock before opening the file: opening with "w" truncates,
     # so a lock timeout must not be allowed to wipe an existing dump.
     if not lck.acquire(timeout=lock_Timeout):
         logger.info("Nest: Fail to dump the Pool.")
         return
     try:
         # The items are encoded to bytes, which a text-mode file rejects
         # on Python 3, so the file is always opened in binary mode.
         mode = wrtm if "b" in wrtm else wrtm + "b"
         with open(fname, mode) as f:
             for item in it:
                 f.write(item.encode("utf-8", "ignore"))
                 f.write(b"\n")
     finally:
         lck.release()
コード例 #4
0
ファイル: zmqnest.py プロジェクト: hfxunlp/crawler
 def launch_server(self):
     """Serve JSON requests on ``self.addr`` until ``self.run`` is false.

     Each received JSON request is passed to ``self.handle`` and the
     returned value is sent back as the JSON reply. The socket is closed
     when the loop ends, including when an exception propagates.
     """
     context = zmq.Context()
     # The loop receives a request and then sends an answer, which is the
     # REP side of request/reply; a PUB socket cannot receive at all.
     socket = context.socket(zmq.REP)
     socket.bind(self.addr)
     try:
         while self.run:
             try:
                 req = socket.recv_json()
             except ValueError as e:
                 # A message arrived but was not valid JSON. A REP socket
                 # must answer before it may receive again, so reply with
                 # null instead of handling a missing or stale request.
                 logger.info(e)
                 socket.send_json(None)
                 continue
             except Exception as e:
                 # Nothing was received, so there is no request to handle.
                 logger.info(e)
                 continue
             socket.send_json(self.handle(req))
     finally:
         socket.close()
コード例 #5
0
def mergePool(dValue, dLock, cmd):
    """Merge one pool into another, as named by ``cmd``.

    The first whitespace-separated token of ``cmd`` names the source pool
    and the last token names the target pool; ``"Pool"`` is appended to
    both to form the keys. The target is updated in place with ``|=``.

    Args:
        dValue: Mapping from pool key to pool.
        dLock: Mapping from pool key to the lock guarding that pool.
        cmd: Command string, e.g. ``"<source> <target>"``.
    """
    tokens = cmd.strip().split()
    if not tokens:
        # An empty command has no pool names to look up.
        logger.info("Illegal command.")
        return

    srcPool = tokens[0] + "Pool"
    tgtPool = tokens[-1] + "Pool"
    if srcPool == tgtPool or srcPool not in dValue or tgtPool not in dValue:
        logger.info("Illegal command.")
        return

    if not dLock[srcPool].acquire(timeout=lock_Timeout):
        logger.info("Fail to merge the Pool.")
        return
    try:
        if not dLock[tgtPool].acquire(timeout=lock_Timeout):
            logger.info("Fail to merge the Pool.")
            return
        try:
            dValue[tgtPool] |= dValue[srcPool]
        finally:
            dLock[tgtPool].release()
    finally:
        # Always give the source lock back, including when the target
        # lock could not be obtained.
        dLock[srcPool].release()
コード例 #6
0
def delPool(dValue, dLock, cmd):
    """Remove items from the pool named by ``cmd``.

    The first whitespace-separated token of ``cmd`` plus ``"Pool"`` is the
    pool key; every following token is passed through ``unquote`` and
    removed from that pool if present.

    Args:
        dValue: Mapping from pool key to pool.
        dLock: Mapping from pool key to the lock guarding that pool.
        cmd: Command string, e.g. ``"<pool> <item> <item> ..."``.
    """
    def del_iter(it, pool, lck):
        """Remove each item of ``it`` found in ``pool`` while holding ``lck``."""
        if not lck.acquire(timeout=lock_Timeout):
            logger.info("Fail to clean the Pool.")
            return
        try:
            for item in it:
                if item in pool:
                    pool.remove(item)
        finally:
            # Release even when a removal raises.
            lck.release()

    tokens = cmd.strip().split()
    if not tokens:
        # An empty command has no pool name to look up.
        logger.info("Illegal command.")
        return

    items = [unquote(token) for token in tokens[1:]]
    key = tokens[0] + "Pool"
    if key in dValue:
        del_iter(items, dValue[key], dLock[key])
    else:
        logger.info("Illegal pool: " + tokens[0])
コード例 #7
0
def addPool(dValue, dLock, cmd):
    """Add items to the pool named by ``cmd``.

    The first whitespace-separated token of ``cmd`` plus ``"Pool"`` is the
    pool key; every following token is passed through ``unquote`` and
    added to that pool if not already present.

    Args:
        dValue: Mapping from pool key to pool.
        dLock: Mapping from pool key to the lock guarding that pool.
        cmd: Command string, e.g. ``"<pool> <item> <item> ..."``.
    """
    def add_iter(it, pool, lck):
        """Add each item of ``it`` missing from ``pool`` while holding ``lck``."""
        if not lck.acquire(timeout=lock_Timeout):
            logger.info("Fail to update the Pool.")
            return
        try:
            for item in it:
                if item not in pool:
                    pool.add(item)
        finally:
            # Release even when an insertion raises.
            lck.release()

    tokens = cmd.strip().split()
    if not tokens:
        # An empty command has no pool name to look up.
        logger.info("Illegal command.")
        return

    items = [unquote(token) for token in tokens[1:]]
    # The suffix is the literal string "Pool", matching the keys used by
    # the other pool commands.
    key = tokens[0] + "Pool"
    if key in dValue:
        add_iter(items, dValue[key], dLock[key])
    else:
        logger.info("Illegal pool: " + tokens[0])
コード例 #8
0
ファイル: spider.py プロジェクト: hfxunlp/crawler
	def t_core(self, dValue, dLock, lock_Timeout = 3.0):
		"""Worker loop: take URLs from the to-do pool and process them.

		While ``self.working`` is true, one URL is popped from
		``dValue["todoPool"]``, added to the cache pool and the fail pool,
		and then handed to ``self.processURL``.

		Args:
			dValue: Mapping from pool key to pool.
			dLock: Mapping from pool key to the lock guarding that pool.
			lock_Timeout: Seconds to wait for each lock.
		"""

		url_ext = URLExtractor()
		while self.working:
			# NOTE(review): when the to-do pool is empty this loop retries
			# immediately without sleeping; confirm that is intended.
			if not dLock["todoPool"].acquire(timeout = lock_Timeout):
				logger.info(self.prompt + "Fail to get the lock of to-do Pool.")
				continue
			try:
				url = None
				if len(dValue["todoPool"]) > 0:
					url = dValue["todoPool"].pop()
			finally:
				# Release even if reading the pool raises.
				dLock["todoPool"].release()
			if url is None:
				continue

			if dLock["cachePool"].acquire(timeout = lock_Timeout):
				try:
					dValue["cachePool"].add(url)
				finally:
					dLock["cachePool"].release()
			else:
				logger.info(self.prompt + "Fail to add %s to cache Pool." % (url))

			if dLock["failPool"].acquire(timeout = lock_Timeout):
				try:
					dValue["failPool"].add(url)
				finally:
					dLock["failPool"].release()
			else:
				logger.info(self.prompt + "Fail to add %s to fail Pool." % (url))

			self.processURL(url, dValue, dLock, url_ext, lock_Timeout)
コード例 #9
0
def dumpPool(dValue, dLock, lock_Timeout=3.0):
    """Dump the to-do, done, cache and fail pools to their files.

    The to-do, done and cache pools are written with mode ``"w"``; the
    fail pool is written with mode ``"a"``.

    Args:
        dValue: Mapping from pool key to pool.
        dLock: Mapping from pool key to the lock guarding that pool.
        lock_Timeout: Seconds to wait for each pool lock.
    """
    def write_iter(fname, it, lck, wrtm="w"):
        """Write each item of ``it`` to ``fname`` as one UTF-8 line."""
        if not it:
            return
        # Take the lock before opening the file: opening with "w"
        # truncates, so a lock timeout must not wipe an existing dump.
        if not lck.acquire(timeout=lock_Timeout):
            logger.info("Nest: Fail to dump the Pool.")
            return
        try:
            # The items are encoded to bytes, which a text-mode file
            # rejects on Python 3, so the file is opened in binary mode.
            mode = wrtm if "b" in wrtm else wrtm + "b"
            with open(fname, mode) as f:
                for item in it:
                    f.write(item.encode("utf-8", "ignore"))
                    f.write(b"\n")
        finally:
            lck.release()

    logger.info("Nest: dump to-do pool")
    write_iter(todoPoolf, dValue["todoPool"], dLock["todoPool"])
    logger.info("Nest: dump done pool")
    write_iter(donePoolf, dValue["donePool"], dLock["donePool"])
    logger.info("Nest: dump cache pool")
    write_iter(cachePoolf, dValue["cachePool"], dLock["cachePool"])
    logger.info("Nest: dump fail pool")
    write_iter(failPoolf, dValue["failPool"], dLock["failPool"], "a")
コード例 #10
0
ファイル: spider.py プロジェクト: hfxunlp/crawler
	def processURL(self, url, dValue, dLock, exter, lck_timeout = 3.0):
		"""Fetch ``url``, queue the links found in it and record the result.

		The page is fetched with ``getURL``; nothing else happens when no
		data comes back. Otherwise, if ``extract_Page`` accepts the page,
		links returned by ``exter.extract`` that are in neither the done
		pool nor the to-do pool are added to the to-do pool. The page is
		then saved with ``saveURL``. Unless a pool lock timed out during
		link extraction, the final URL is added to the done pool and
		``url`` is removed from the cache and fail pools.

		Args:
			url: URL to fetch.
			dValue: Mapping from pool key to pool.
			dLock: Mapping from pool key to the lock guarding that pool.
			exter: Object whose ``extract(data, proto_prefix, host)``
				returns the links found in a page.
			lck_timeout: Seconds to wait for each lock.
		"""

		data, proto_prefix, host, real_url, http_code, infos, is_Decoded, encode_method, confidence = getURL(url)
		if data is None:
			return

		if real_url != url:
			logger.info("%s%s was redirected to %s" % (self.prompt, url, real_url,))

		success_ext = True
		if extract_Page(url, host, real_url, proto_prefix, data, is_Decoded):
			urls = exter.extract(data, proto_prefix, host)
			if urls:
				if dLock["donePool"].acquire(timeout = lck_timeout):
					try:
						# A separate loop name keeps the ``url`` argument
						# intact for the pool clean-up further down.
						todo = [link for link in urls if link not in dValue["donePool"]]
					finally:
						dLock["donePool"].release()
					if todo:
						if dLock["todoPool"].acquire(timeout = lck_timeout):
							try:
								# Queue only the filtered links so that
								# already-done URLs are not queued again.
								for link in todo:
									if link not in dValue["todoPool"]:
										dValue["todoPool"].add(link)
							finally:
								dLock["todoPool"].release()
						else:
							logger.info(self.prompt + "Fail to get the lock of to-do Pool.")
							success_ext = False
				else:
					logger.info(self.prompt + "Fail to get the lock of done Pool.")
					success_ext = False

		saveURL(real_url, data, "utf-8")
		if not success_ext:
			return

		if not dLock["donePool"].acquire(timeout = lck_timeout):
			logger.info(self.prompt + "Fail to get the lock of done Pool.")
			return
		try:
			dValue["donePool"].add(real_url)
		finally:
			dLock["donePool"].release()

		if not dLock["cachePool"].acquire(timeout = lck_timeout):
			logger.info(self.prompt + "Fail to remove %s from cache Pool." % (url))
			return
		try:
			if url in dValue["cachePool"]:
				dValue["cachePool"].remove(url)
		finally:
			dLock["cachePool"].release()

		if not dLock["failPool"].acquire(timeout = lck_timeout):
			logger.info(self.prompt + "Fail to remove %s from fail Pool." % (url))
			return
		try:
			if url in dValue["failPool"]:
				dValue["failPool"].remove(url)
		finally:
			dLock["failPool"].release()