Example 1
class Crawler(object):
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """
    def getTargetUrls(self):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    with kb.locks.limits:
                        if threadData.shared.unprocessed:
                            current = threadData.shared.unprocessed.pop()
                        else:
                            break

                    content = None
                    try:
                        if current:
                            content = Request.getPage(url=current,
                                                      crawling=True,
                                                      raise404=False)[0]
                    except SqlmapConnectionException as e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL as e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)
                            for tag in soup('a'):
                                if tag.get("href"):
                                    url = urlparse.urljoin(conf.url, tag.get("href"))

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))

                                    if conf.scope:
                                        if not re.search(conf.scope, url, re.I):
                                            continue
                                    elif not _:
                                        continue

                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                        with kb.locks.outputs:
                                            threadData.shared.deeper.add(url)
                                            if re.search(r"(.*?)\?(.+)", url):
                                                threadData.shared.outputs.add(url)
                        except UnicodeEncodeError:  # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
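
A note on the _ flag above: it is computed with a reduce()/map() pipeline over the candidate link and the configured target. Below is a minimal standalone sketch of that same-host check, assuming nothing from sqlmap (Python 2, since the example uses the old urlparse module; the name same_host is made up here for illustration):

import urlparse

def same_host(url, target_url):
    # Compare the host part (netloc without the port) of the two URLs
    return reduce(lambda x, y: x == y,
                  map(lambda u: urlparse.urlparse(u).netloc.split(':')[0],
                      (url, target_url)))

# same_host("http://example.com/page?id=1", "http://example.com/")  -> True
# same_host("http://other.org/", "http://example.com/")             -> False

Since reduce() here only ever sees two items, it amounts to a plain equality test; later versions replace it with a dedicated checkSameHost() helper (see Example 3).
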
Example 2
                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    finally:
                        if conf.forms:
                            findPageForms(content, current, False, True)

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        logger.info("starting crawler")

        for i in xrange(conf.crawlDepth):
            if i > 0 and conf.threads == 1:
                singleTimeWarnMessage("running in a single-thread mode. This could take a while.")
            threadData.shared.count = 0
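
Example 2 shows the driver around crawlThread: the shared deeper and unprocessed sets, plus a loop over conf.crawlDepth that promotes the links found at one level to the work queue of the next. A minimal single-threaded sketch of that depth-limited loop, with a hypothetical fetch_links() standing in for Request.getPage() plus link extraction (the names here are made up, not sqlmap's):

def crawl(target, depth, fetch_links):
    unprocessed = set([target])   # URLs queued for the current depth level
    found = set()                 # everything discovered so far

    for level in range(depth):
        deeper = set()            # links discovered at this level
        while unprocessed:
            current = unprocessed.pop()
            deeper.update(fetch_links(current))
        found.update(deeper)
        unprocessed = deeper      # the next level starts from what was just found

    return found

sqlmap runs the inner while loop concurrently in crawlThread workers, which is why the fragment warns when only a single thread is configured.
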
Example 3
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
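
This newer version also filters candidate URLs more aggressively: the extension is taken from the path part and checked against CRAWL_EXCLUDE_EXTENSIONS, and only URLs with a real query string (no cache-busters, no .js/.css) reach threadData.shared.value. A standalone sketch of that filtering, using plain re in place of sqlmap's extractRegexResult() and an illustrative stand-in for CRAWL_EXCLUDE_EXTENSIONS:

import re

EXCLUDE_EXTENSIONS = ("gif", "jpg", "jpeg", "png", "css", "js")  # illustrative subset

def is_candidate(url):
    # Extension of the path part (everything before '?'), lower-cased
    match = re.search(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url)
    extension = (match.group("result") if match else "").lower()
    if extension in EXCLUDE_EXTENSIONS:
        return False
    # Keep only URLs with a query string, skipping cache-busters and static assets
    return (re.search(r"(.*?)\?(.+)", url) is not None
            and not re.search(r"\?(v=)?\d+\Z", url)
            and not re.search(r"(?i)\.(js|css)(\?|\Z)", url))

# is_candidate("http://example.com/index.php?id=1")  -> True
# is_candidate("http://example.com/logo.png")        -> False
# is_candidate("http://example.com/app.js?v=3")      -> False
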
Example 4
                                        continue
                                elif not _:
                                    continue

                                if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            findPageForms(content, current, False, True)

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        if not conf.sitemapUrl:
            message = "do you want to check for the existence of site's sitemap(.xml) [y/N] "

            if readInput(message, default='N', boolean=True):
                found = True
                items = None
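
Example 4 is a translated variant of the same crawler and ends with the prompt asking whether to look for a sitemap(.xml) file as an extra source of crawl targets. A minimal, hypothetical sketch (not the project's own sitemap parser) of pulling URLs out of such a file, relying only on the fact that each sitemap entry sits in a <loc> element:

import re

def sitemap_urls(content):
    # Collect the text of every <loc>...</loc> entry
    return set(match.group(1).strip()
               for match in re.finditer(r"<loc>\s*([^<]+)", content or ""))

# sitemap_urls("<urlset><loc>http://example.com/?id=1</loc></urlset>")
# -> set(["http://example.com/?id=1"])
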
Example 5
        # print traceback.print_exc()
        return

    if isinstance(content, unicode):
        try:
            match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
            if match:
                content = "<html>%s</html>" % match.group(1)

            soup = BeautifulSoup(content, 'lxml')
            tags = soup('a')
            if not tags:
                tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)

            for tag in tags:
                href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                if href and 'javascript:' not in href:
                    href = urlparse.urljoin(conf.CRAWL_SITE, href)
                    if conf.CRAWL_SITE in href:
                        redisCon.lpush('tmpVisit', href)
                        # logger.log(CUSTOM_LOGGING.ERROR, href)

        except Exception as ex:  # for non-HTML files
            logger.log(CUSTOM_LOGGING.ERROR, ex)
        finally:
            forms = findPageForms(content, url, False)
            for form in forms:
                formMsg = '%s has form, url: %s method: %s data: %s' % (url, form[0], form[1], form[2])
                logger.log(CUSTOM_LOGGING.WARNING, formMsg)
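
This last example keeps the BeautifulSoup-based extraction but falls back to a raw href regex when no <a> tags are parsed, skips javascript: links, restricts results to conf.CRAWL_SITE before handing them to Redis, and finally logs any forms reported by findPageForms(). A standalone sketch of just the link-extraction part (extract_hrefs is a name made up here, not part of the project above):

import re
from bs4 import BeautifulSoup

def extract_hrefs(content):
    soup = BeautifulSoup(content, "lxml")
    tags = soup("a")
    if not tags:
        # Fallback for markup BeautifulSoup could not turn into <a> tags
        tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)
    for tag in tags:
        # Tag objects expose .get(); regex matches expose .group()
        href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
        if href and 'javascript:' not in href:
            yield href

# list(extract_hrefs('<a href="/page?id=1">x</a>')) -> ['/page?id=1']
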