Esempio n. 1
0
def _configBool(config, section, option, default=False):
    """Return the boolean value of an optional config entry.

    Args:
        config: Parsed configuration object.
        section: Name of the config section.
        option: Name of the option inside ``section``.
        default: Value returned when ``config.has_option`` reports the
            option as missing.

    Returns:
        The result of ``config.getboolean`` if the option exists, otherwise
        ``default``.
    """
    if config.has_option(section, option):
        return config.getboolean(section, option)
    return default


def _loadSkiplist(skiplistPath, skipIndex):
    """Fill the module-level ``skipdict`` from a comma-separated skiplist.

    The first line of the file is discarded (presumably a header row). For
    every following line whose column ``skipIndex`` equals ``"1"``, the
    unhexlified first column is stored as a key in ``skipdict``.

    Args:
        skiplistPath: Path of the skiplist file.
        skipIndex: Zero-based column index holding the skip flag.
    """
    with open(skiplistPath) as skipFd:
        skipFd.readline()
        for line in skipFd:
            # Strip the line terminator first; otherwise a flag stored in the
            # last column reads as "1\n" and never matches.
            line = line.rstrip("\r\n")
            if not line:
                continue
            fields = line.split(",")
            if fields[skipIndex] == "1":
                skipdict[binascii.unhexlify(fields[0])] = 1


def _reportProblems(problems, collected):
    """Log each problem as an error and record it in ``collected``.

    Args:
        problems: Iterable of problem descriptions.
        collected: List that every problem is appended to.

    Returns:
        True if ``problems`` yielded at least one item, False otherwise.
    """
    found = False
    for problem in problems:
        found = True
        logging.error(problem)
        collected.append(problem)
    return found


def cli():
    """Check rulesets as described by a checker config file.

    Command-line arguments: the checker config path, optional explicit XML
    rule files (otherwise every ``*.xml`` in the configured rules directory
    is used) and an optional ``--json_file`` output path.

    Returns:
        1 if any enabled ruleset check (coverage, target validity, non-match
        groups, test formatting) reported a problem, 0 otherwise.

    Raises:
        RuntimeError: If the certificate base directory has no ``default``
            sub-directory.

    May terminate through ``sys.exit(0)`` after the graphviz dump when the
    config requests it.
    """
    parser = argparse.ArgumentParser(
        description='Check HTTPs rules for validity')
    parser.add_argument('checker_config',
                        help='path to the checker configuration file')
    parser.add_argument('rule_files',
                        nargs="*",
                        default=[],
                        help="Specific XML rule files")
    parser.add_argument('--json_file',
                        default=None,
                        help='write results in json file')
    args = parser.parse_args()

    # NOTE(review): SafeConfigParser is a deprecated alias of ConfigParser on
    # Python 3 (removed in 3.12); the import lives outside this function.
    config = SafeConfigParser()
    config.read(args.checker_config)

    logfile = config.get("log", "logfile")
    loglevel = convertLoglevel(config.get("log", "loglevel"))
    if logfile == "-":
        logging.basicConfig(stream=sys.stderr,
                            level=loglevel,
                            format="%(levelname)s %(message)s")
    else:
        logging.basicConfig(
            filename=logfile,
            level=loglevel,
            format=
            "%(asctime)s %(levelname)s %(message)s [%(pathname)s:%(lineno)d]")

    autoDisable = _configBool(config, "rulesets", "auto_disable")
    # Test rules even if they have default_off=...
    includeDefaultOff = _configBool(config, "rulesets", "include_default_off")
    ruledir = config.get("rulesets", "rulesdir")
    checkCoverage = _configBool(config, "rulesets", "check_coverage")
    checkTargetValidity = _configBool(config, "rulesets",
                                      "check_target_validity")
    checkNonmatchGroups = _configBool(config, "rulesets",
                                      "check_nonmatch_groups")
    checkTestFormatting = _configBool(config, "rulesets",
                                      "check_test_formatting")
    certdir = config.get("certificates", "basedir")
    if config.has_option("rulesets", "skiplist") and config.has_option(
            "rulesets", "skipfield"):
        _loadSkiplist(config.get("rulesets", "skiplist"),
                      config.getint("rulesets", "skipfield"))

    threadCount = config.getint("http", "threads")
    httpEnabled = _configBool(config, "http", "enabled", default=True)

    metricName = config.get("thresholds", "metric")
    thresholdDistance = config.getfloat("thresholds", "max_distance")
    metricClass = getMetricClass(metricName)
    metric = metricClass()

    # Debugging options, graphviz dump
    dumpGraphvizTrie = _configBool(config, "debug", "dump_graphviz_trie")
    if dumpGraphvizTrie:
        graphvizFile = config.get("debug", "graphviz_file")
        exitAfterDump = config.getboolean("debug", "exit_after_dump")

    if args.rule_files:
        xmlFnames = args.rule_files
    else:
        xmlFnames = glob.glob(os.path.join(ruledir, "*.xml"))
    trie = RuleTrie()

    rulesets = []
    # Every problem reported by any enabled check, across all rulesets. This
    # is what gets handed to json_output; previously only the result of the
    # last executed check was passed (and the name was unbound if no check
    # ran at all).
    allProblems = []
    coverageProblemsExist = False
    targetValidityProblemExist = False
    nonmatchGroupProblemsExist = False
    testFormattingProblemsExist = False
    for xmlFname in xmlFnames:
        logging.debug("Parsing %s", xmlFname)
        if skipFile(xmlFname):
            logging.debug("Skipping rule file '%s', matches skiplist.",
                          xmlFname)
            continue

        # Close the rule file as soon as it has been parsed.
        with open(xmlFname, "rb") as xmlFd:
            ruleset = Ruleset(etree.parse(xmlFd).getroot(), xmlFname)
        if ruleset.defaultOff and not includeDefaultOff:
            logging.debug("Skipping rule '%s', reason: %s", ruleset.name,
                          ruleset.defaultOff)
            continue
        # Check whether ruleset coverage by tests was sufficient.
        if checkCoverage:
            logging.debug("Checking coverage for '%s'.", ruleset.name)
            if _reportProblems(ruleset.getCoverageProblems(), allProblems):
                coverageProblemsExist = True
        if checkTargetValidity:
            logging.debug("Checking target validity for '%s'.", ruleset.name)
            if _reportProblems(ruleset.getTargetValidityProblems(),
                               allProblems):
                targetValidityProblemExist = True
        if checkNonmatchGroups:
            logging.debug("Checking non-match groups for '%s'.", ruleset.name)
            if _reportProblems(ruleset.getNonmatchGroupProblems(),
                               allProblems):
                nonmatchGroupProblemsExist = True
        if checkTestFormatting:
            logging.debug("Checking test formatting for '%s'.", ruleset.name)
            if _reportProblems(ruleset.getTestFormattingProblems(),
                               allProblems):
                testFormattingProblemsExist = True
        trie.addRuleset(ruleset)
        rulesets.append(ruleset)

    # Trie is built now, dump it if it's set in config
    if dumpGraphvizTrie:
        logging.debug("Dumping graphviz ruleset trie")
        graph = trie.generateGraphizGraph()
        if graphvizFile == "-":
            graph.dot()
        else:
            with open(graphvizFile, "w") as gvFd:
                graph.dot(gvFd)
        if exitAfterDump:
            sys.exit(0)
    fetchOptions = http_client.FetchOptions(config)
    fetchers = []

    # Ensure "default" is in the platform dirs
    defaultCertdir = os.path.join(certdir, "default")
    if not os.path.isdir(defaultCertdir):
        raise RuntimeError(
            "Platform 'default' is missing from certificate directories")

    platforms = http_client.CertificatePlatforms(defaultCertdir)
    fetchers.append(
        http_client.HTTPFetcher("default", platforms, fetchOptions, trie))
    # fetches pages with unrewritten URLs
    fetcherPlain = http_client.HTTPFetcher("default", platforms, fetchOptions)

    urlList = []
    if config.has_option("http", "url_list"):
        with open(config.get("http", "url_list")) as urlFile:
            urlList = [line.rstrip() for line in urlFile]

    if httpEnabled:
        taskQueue = queue.Queue(1000)
        resQueue = queue.Queue()
        startTime = time.time()
        testedUrlPairCount = 0

        for _ in range(threadCount):
            t = UrlComparisonThread(taskQueue, metric, thresholdDistance,
                                    autoDisable, resQueue)
            t.daemon = True
            t.start()

        # If list of URLs to test/scan was not defined, use the test URL extraction
        # methods built into the Ruleset implementation.
        # NOTE(review): when a url_list IS configured nothing is enqueued
        # here, so no URL comparison takes place - confirm this is intended.
        if not urlList:
            for ruleset in rulesets:
                platformCertdir = os.path.join(certdir, ruleset.platform)
                if ruleset.platform != "default" and os.path.isdir(
                        platformCertdir):
                    theseFetchers = copy.deepcopy(fetchers)
                    platforms.addPlatform(ruleset.platform, platformCertdir)
                    theseFetchers.append(
                        http_client.HTTPFetcher(ruleset.platform, platforms,
                                                fetchOptions, trie))
                else:
                    theseFetchers = fetchers
                testUrls = []
                for test in ruleset.tests:
                    if not ruleset.excludes(test.url):
                        testedUrlPairCount += 1
                        testUrls.append(test.url)
                    else:
                        # TODO: We should fetch the non-rewritten exclusion URLs to make
                        # sure they still exist.
                        logging.debug("Skipping excluded URL %s", test.url)
                task = ComparisonTask(testUrls, fetcherPlain, theseFetchers,
                                      ruleset)
                taskQueue.put(task)

        taskQueue.join()
        logging.info(
            "Finished in %.2f seconds. Loaded rulesets: %d, URL pairs: %d.",
            time.time() - startTime, len(xmlFnames), testedUrlPairCount)
        if args.json_file:
            json_output(resQueue, args.json_file, allProblems)

    # A flag can only be True when its check was enabled, so the flags alone
    # decide the exit code.
    problemsExist = (coverageProblemsExist or targetValidityProblemExist
                     or nonmatchGroupProblemsExist
                     or testFormattingProblemsExist)
    return 1 if problemsExist else 0
	
	# Trie is built now, dump it if it's set in config
	if dumpGraphvizTrie:
		logging.debug("Dumping graphviz ruleset trie")
		graph = trie.generateGraphizGraph()
		if graphvizFile == "-":
			graph.dot()
		else:
			with file(graphvizFile, "w") as gvFd:
				graph.dot(gvFd)
		if exitAfterDump:
			sys.exit(0)
	fetchOptions = http_client.FetchOptions(config)
	fetcherMap = dict() #maps platform to fetcher
	
	platforms = http_client.CertificatePlatforms(os.path.join(certdir, "default"))
	for platform in havePlatforms:
		#adding "default" again won't break things
		platforms.addPlatform(platform, os.path.join(certdir, platform))
		fetcher = http_client.HTTPFetcher(platform, platforms, fetchOptions, trie)
		fetcherMap[platform] = fetcher
	
	#fetches pages with unrewritten URLs
	fetcherPlain = http_client.HTTPFetcher("default", platforms, fetchOptions)
	
	urlList = []
	if config.has_option("http", "url_list"):
		with file(config.get("http", "url_list")) as urlFile:
			urlList = [line.rstrip() for line in urlFile.readlines()]
			
	if httpEnabled:
def cli():
    """Compare plain and rewritten URLs for all enabled rulesets.

    Reads the checker config named by ``sys.argv[1]``, loads every ``*.xml``
    ruleset from the configured rules directory into a trie, and queues one
    comparison task per main page whose URL the trie rewrites. Main pages
    come from the optional ``http.url_list`` file or, when that is absent or
    empty, from ``http://<target>/`` for each unique target FQDN of the
    loaded rulesets.

    Exits with status 1 when no config path is given, and with status 0
    after the graphviz dump when the config requests it.

    Raises:
        RuntimeError: If the certificate base directory has no ``default``
            sub-directory.
    """
    if len(sys.argv) < 2:
        # Same output as the Python 2 ``print >> sys.stderr`` statement, but
        # valid on Python 3 as well.
        sys.stderr.write("check_rules.py checker.config\n")
        sys.exit(1)

    config = SafeConfigParser()
    config.read(sys.argv[1])

    logfile = config.get("log", "logfile")
    loglevel = convertLoglevel(config.get("log", "loglevel"))
    # Both log destinations share one record format.
    logFormat = (
        "%(asctime)s %(levelname)s %(message)s [%(pathname)s:%(lineno)d]")
    if logfile == "-":
        logging.basicConfig(stream=sys.stderr,
                            level=loglevel,
                            format=logFormat)
    else:
        logging.basicConfig(filename=logfile,
                            level=loglevel,
                            format=logFormat)

    ruledir = config.get("rulesets", "rulesdir")
    certdir = config.get("certificates", "basedir")

    threadCount = config.getint("http", "threads")

    # get all platform dirs, make sure "default" is among them
    certdirFiles = glob.glob(os.path.join(certdir, "*"))
    havePlatforms = set(
        os.path.basename(fname) for fname in certdirFiles
        if os.path.isdir(fname))
    logging.debug("Loaded certificate platforms: %s",
                  ",".join(sorted(havePlatforms)))
    if "default" not in havePlatforms:
        raise RuntimeError(
            "Platform 'default' is missing from certificate directories")

    metricName = config.get("thresholds", "metric")
    thresholdDistance = config.getfloat("thresholds", "max_distance")
    metricClass = getMetricClass(metricName)
    metric = metricClass()

    urlList = []
    if config.has_option("http", "url_list"):
        with open(config.get("http", "url_list")) as urlFile:
            urlList = [line.rstrip() for line in urlFile]

    # Debugging options, graphviz dump
    dumpGraphvizTrie = False
    if config.has_option("debug", "dump_graphviz_trie"):
        dumpGraphvizTrie = config.getboolean("debug", "dump_graphviz_trie")
    if dumpGraphvizTrie:
        graphvizFile = config.get("debug", "graphviz_file")
        exitAfterDump = config.getboolean("debug", "exit_after_dump")

    xmlFnames = glob.glob(os.path.join(ruledir, "*.xml"))
    trie = RuleTrie()

    # set of main pages to test
    mainPages = set(urlList)

    for xmlFname in xmlFnames:
        # Open in binary mode so the XML parser handles the encoding, and
        # close the file as soon as it has been parsed.
        with open(xmlFname, "rb") as xmlFd:
            ruleset = Ruleset(etree.parse(xmlFd).getroot(), xmlFname)
        if ruleset.defaultOff:
            logging.debug("Skipping rule '%s', reason: %s", ruleset.name,
                          ruleset.defaultOff)
            continue
        # if list of URLs to test/scan was not defined, guess URLs from target elements
        if not urlList:
            for target in ruleset.uniqueTargetFQDNs():
                landingPage = "http://%s/" % target
                if not ruleset.excludes(landingPage):
                    mainPages.add(landingPage)
                else:
                    logging.debug("Skipping landing page %s", landingPage)
        trie.addRuleset(ruleset)

    # Trie is built now, dump it if it's set in config
    if dumpGraphvizTrie:
        logging.debug("Dumping graphviz ruleset trie")
        graph = trie.generateGraphizGraph()
        if graphvizFile == "-":
            graph.dot()
        else:
            with open(graphvizFile, "w") as gvFd:
                graph.dot(gvFd)
        if exitAfterDump:
            sys.exit(0)

    fetchOptions = http_client.FetchOptions(config)
    fetcherMap = {}  # maps platform to fetcher

    platforms = http_client.CertificatePlatforms(
        os.path.join(certdir, "default"))
    for platform in havePlatforms:
        # adding "default" again won't break things
        platforms.addPlatform(platform, os.path.join(certdir, platform))
        fetcherMap[platform] = http_client.HTTPFetcher(
            platform, platforms, fetchOptions, trie)

    # fetches pages with unrewritten URLs
    fetcherPlain = http_client.HTTPFetcher("default", platforms, fetchOptions)

    taskQueue = Queue.Queue(1000)
    startTime = time.time()
    testedUrlPairCount = 0

    for _ in range(threadCount):
        t = UrlComparisonThread(taskQueue, metric, thresholdDistance)
        t.daemon = True
        t.start()

    for plainUrl in mainPages:
        try:
            ruleFname = None
            ruleMatch = trie.transformUrl(plainUrl)
            transformedUrl = ruleMatch.url

            if plainUrl == transformedUrl:
                logging.info("Identical URL: %s", plainUrl)
                continue

            # URL was transformed, thus ruleset must exist that did it
            ruleFname = os.path.basename(ruleMatch.ruleset.filename)
            fetcher = fetcherMap.get(ruleMatch.ruleset.platform)
            if not fetcher:
                logging.warning(
                    "Unknown platform '%s', using 'default' instead. Rulefile: %s.",
                    ruleMatch.ruleset.platform, ruleFname)
                fetcher = fetcherMap["default"]

        except Exception:
            # Best effort per URL: log and move on. Catching Exception (not a
            # bare except) lets KeyboardInterrupt and SystemExit propagate.
            logging.exception(
                "Failed to transform plain URL %s. Rulefile: %s.", plainUrl,
                ruleFname)
            continue

        testedUrlPairCount += 1
        task = ComparisonTask(plainUrl, transformedUrl, fetcherPlain, fetcher,
                              ruleFname)
        taskQueue.put(task)

    taskQueue.join()
    logging.info(
        "Finished in %.2f seconds. Loaded rulesets: %d, URL pairs: %d.",
        time.time() - startTime, len(xmlFnames), testedUrlPairCount)