Beispiel #1
0
def crawl(url, currentDepth, countUrls):

    redisCon = Redis(host=conf.REDIS_HOST,
                      port=conf.REDIS_PORT,
                      password=conf.REDIS_PASSWD)

    try:
        headers = dict()
        headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()

        response = requests.get(url, timeout=10, headers=headers)
        # crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
        # logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
        content = response.text

        kb.pageEncoding = response.encoding
        conf.cookie = str(response.cookies.get_dict())
        hashData = hashUrl(url)
        redisCon.sadd('visited', hashData)
        redisCon.lpush('visitedList', url)
        getDB().insert({'url':url, 'depth': currentDepth, 'count':countUrls})

    except Exception, ex:
        logger.log(CUSTOM_LOGGING.ERROR, ex)
        # print traceback.print_exc()
        return
Beispiel #2
0
    def crawlerThread():
        global countVisitedUrls

        while visitQueue.qsize() > 0:
            url = visitQueue.get()
            try:
                hashData = hashUrl(url)
                if hashData not in visited:
                    headers[HTTP_HEADER.USER_AGENT] = randomUserAgents()
                    response = requests.get(url, timeout=10, headers=headers)
                    crawlMsg = 'crawled %s depth: %d count: %d' % (url, currentDepth, countVisitedUrls)
                    logger.log(CUSTOM_LOGGING.SYSINFO, crawlMsg)
                    content = response.text

                    kb.pageEncoding = response.encoding
                    conf.cookie = str(response.cookies.get_dict())

                    try:
                        lock.acquire()
                        visited.add(hashData)
                        countVisitedUrls += 1
                        fp.write(url + '\n')
                        lock.release()
                    except Exception, ex:
                        logger.log(CUSTOM_LOGGING.ERROR, ex)
                        if lock.locked():
                            lock.release()
                        continue
                else:
                    continue