# Third-party imports used below; project helpers (conf, kb, logger,
# CUSTOM_LOGGING, HTTP_HEADER, hashUrl, randomUserAgents, getDB) are assumed
# to be imported elsewhere in this module.
import requests
from redis import Redis


def crawl(url, currentDepth, countUrls):
    # Fetch a single URL with a random User-Agent, then record the visit in
    # Redis (dedup set + ordered list) and in the results database.
    redisCon = Redis(host=conf.REDIS_HOST, port=conf.REDIS_PORT, password=conf.REDIS_PASSWD)
    try:
        headers = dict()
        headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
        response = requests.get(url, timeout=10, headers=headers)
        # crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countUrls)
        # logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
        content = response.text
        kb.pageEncoding = response.encoding
        conf.cookie = str(response.cookies.get_dict())
        hashData = hashUrl(url)
        redisCon.sadd('visited', hashData)
        redisCon.lpush('visitedList', url)
        getDB().insert({'url': url, 'depth': currentDepth, 'count': countUrls})
    except Exception as ex:
        logger.log(CUSTOM_LOGGING.ERROR, ex)
        return
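# ---------------------------------------------------------------------------
# hashUrl() and randomUserAgents() are project helpers that are not defined in
# this section. The sketches below are illustrative assumptions only (an MD5
# fingerprint of the URL and a random pick from a small User-Agent pool); the
# project's real implementations may differ.
# ---------------------------------------------------------------------------
import hashlib
import random

_USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]


def hashUrl(url):
    # Stable fingerprint of a URL, suitable for the Redis 'visited' set.
    return hashlib.md5(url.encode('utf-8')).hexdigest()


def randomUserAgents():
    # Pick a User-Agent string at random so requests look less uniform.
    return random.choice(_USER_AGENT_POOL)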
def crawlerThread():
    # Worker loop: pull URLs off the shared queue, fetch each one, and record
    # it in the shared 'visited' set under the thread lock.
    global countVisitedUrls
    while visitQueue.qsize() > 0:
        url = visitQueue.get()
        try:
            hashData = hashUrl(url)
            if hashData not in visited:
                headers = dict()
                headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
                response = requests.get(url, timeout=10, headers=headers)
                # currentDepth is assumed to be tracked at module level.
                crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
                logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
                content = response.text
                kb.pageEncoding = response.encoding
                conf.cookie = str(response.cookies.get_dict())
                try:
                    lock.acquire()
                    visited.add(hashData)
                    countVisitedUrls += 1
                    fp.write(url + '\n')
                    lock.release()
                except Exception as ex:
                    logger.log(CUSTOM_LOGGING.ERROR, ex)
                    if lock.locked():
                        lock.release()
                    continue
            else:
                continue
        except Exception as ex:
            logger.log(CUSTOM_LOGGING.ERROR, ex)
            continue
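# ---------------------------------------------------------------------------
# Minimal sketch of how crawlerThread() workers could be started. The shared
# state below mirrors the names used in the function body (visitQueue, visited,
# lock, fp, countVisitedUrls, currentDepth), but the seed URL, thread count,
# and output path are illustrative assumptions, not the project's actual entry
# point.
# ---------------------------------------------------------------------------
import threading
try:
    import queue              # Python 3
except ImportError:
    import Queue as queue     # Python 2 fallback

visitQueue = queue.Queue()
visited = set()
lock = threading.Lock()
countVisitedUrls = 0
currentDepth = 0
fp = open('visited_urls.txt', 'a')  # assumed output file for visited URLs


def startCrawlerThreads(seedUrl, threadCount=5):
    # Seed the queue, then run crawlerThread() in a small pool of threads.
    visitQueue.put(seedUrl)
    workers = [threading.Thread(target=crawlerThread) for _ in range(threadCount)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()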