コード例 #1
0
def main():
    """Load an Adblock filter list and time ``should_block`` on one URL.

    The filter list path is taken from ``sys.argv[1]`` and read as UTF-8.
    A fixed sample URL is then matched 1000 times, and the total elapsed
    time and the per-call average are printed.
    """
    import time

    def set_stdio_encoding(enc=NATIVE):
        """Wrap standard streams that report no encoding with a codec.

        Args:
            enc: name of the encoding to apply to such streams.
        """
        import codecs
        # stdin is read from, so it needs a decoding reader; stdout and
        # stderr are written to, so they need an encoding writer.
        wrappers = (
            ("stdin", codecs.getreader),
            ("stdout", codecs.getwriter),
            ("stderr", codecs.getwriter),
        )
        for name, get_wrapper in wrappers:
            stream = getattr(sys, name)
            if not stream.encoding:
                setattr(sys, name, get_wrapper(enc)(stream))

    set_stdio_encoding()

    log_level = log.INFO
    log.basicConfig(format="%(levelname)s>> %(message)s", level=log_level)

    with io.open(sys.argv[1], encoding="UTF-8") as fd:
        adfilter = adblockparser.AdblockRules(
            fd,
            supported_options=["third-party"],
            skip_unsupported_rules=False)

    # Time repeated matching of the same URL to get a per-call average.
    n = 1000
    a = time.time()
    for i in range(n):
        adfilter.should_block("http://www.events.kaloooga.com/stom",
                              {"third-party": False})
    b = time.time()
    total = b - a
    print(total, total / n)

    # NOTE(review): the result below is not used in the visible code.
    ret = adfilter.should_block("http://www.events.kaloooga.com/stom",
                                {"third-party": False})
コード例 #2
0
def test_easylist_filter():
    """Check that the sample EasyList rules block exactly the expected URLs.

    Each sample is a ``(url, expected)`` pair; ``should_block`` must return
    a value equal to ``expected`` for every one of them.
    """
    samples = _create_sample_urls()
    matcher = adblockparser.AdblockRules(_create_sample_easylist())

    for sample_url, expected in samples:
        # e.g. "http://ads.example.com"
        assert matcher.should_block(sample_url) == expected
コード例 #3
0
 def setup(self) -> None:
     """Child function."""
     asyncio.run(self._setup_downloads())
     if self.tag_list:
         self._extract_date_from_list()
         self._prepare_tag_list()
         if self.extraction_method == ExtractionMethod.USE_ADBLOCK_PARSER:
             self.match_rules = adblockparser.AdblockRules(
                 self.tag_list, skip_unsupported_rules=False, use_re2=False)
コード例 #4
0
ファイル: sfp_adblock.py プロジェクト: webshell520/spiderfoot
    def handleEvent(self, event):
        """Check an event's data against the AdBlock rules and report matches.

        The blocklist named by ``self.opts['blocklist']`` is fetched and
        parsed the first time it is needed. Each distinct ``event.data``
        value is checked at most once; when the rules say it should be
        blocked, a ``URL_ADBLOCKED_EXTERNAL`` or ``URL_ADBLOCKED_INTERNAL``
        event is sent to the listeners.

        Args:
            event: incoming event; its ``eventType``, ``module`` and ``data``
                attributes are read.

        Returns:
            None, always.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.sf.debug("Received event, " + eventName + ", from " +
                      srcModuleName)

        if self.errorState:
            return None

        if self.rules is None:
            raw = self.sf.fetchUrl(self.opts['blocklist'], timeout=30)
            if raw['content'] is not None:
                lines = raw['content'].split('\n')
                self.sf.debug("RULE LINES: " + str(len(lines)))
                try:
                    self.rules = adblockparser.AdblockRules(lines)
                # Exception, not BaseException: KeyboardInterrupt and
                # SystemExit must not be reported as parsing errors.
                except Exception as e:
                    self.errorState = True
                    self.sf.error(
                        "Parsing error handling AdBlock list: " + str(e),
                        False)
            else:
                self.errorState = True
                self.sf.error(
                    "Unable to download AdBlockPlus list: " +
                    self.opts['blocklist'], False)

        if "_EXTERNAL" in eventName:
            pagetype = "_EXTERNAL"
        else:
            pagetype = "_INTERNAL"

        # Remember every value seen so that it is only matched once.
        if eventData not in self.results:
            self.results.append(eventData)
        else:
            self.sf.debug(
                "Already checked this page for AdBlock matching, skipping.")
            return None

        try:
            if self.rules and self.rules.should_block(eventData):
                evt = SpiderFootEvent("URL_ADBLOCKED" + pagetype, eventData,
                                      self.__name__, event)
                self.notifyListeners(evt)
        # Same reasoning as above: let interpreter-exit signals propagate.
        except Exception as e:
            self.sf.error("Parsing error handling AdBlock list: " + str(e),
                          False)
            self.errorState = True

        return None
コード例 #5
0
    def setBlocklistRules(self, blocklist):
        """Build the AdBlock rule matcher from a plaintext blocklist.

        An empty or missing blocklist is ignored. On a parsing error the
        module's error state is set and the error is reported.

        Args:
            blocklist (str): plaintext AdBlock Plus blocklist
        """
        if blocklist:
            rule_lines = blocklist.split('\n')
            self.debug(f"Retrieved {len(rule_lines)} AdBlock blocklist rules")
            try:
                self.rules = adblockparser.AdblockRules(rule_lines)
            except adblockparser.AdblockParsingError as err:
                self.errorState = True
                self.error(f"Parsing error handling AdBlock list: {err}")
コード例 #6
0
    def _load(self, path):
        """Load every ``*.txt`` filter list found in *path*.

        Each regular file whose name ends in ``.txt`` is parsed into an
        ``AdblockRules`` object and stored in ``self.filters`` under the file
        name without its suffix. If ``adblockparser`` cannot be imported, a
        warning is logged and nothing is loaded.

        Args:
            path: directory to scan for filter files.
        """
        try:
            import adblockparser
        except ImportError:
            log.msg('WARNING: https://github.com/scrapinghub/adblockparser '
                    'library is not available, filters are not loaded.')
            return

        suffix = '.txt'
        for fname in os.listdir(path):
            fpath = os.path.join(path, fname)
            # Only regular files with the filter suffix are considered.
            if not (fname.endswith(suffix) and os.path.isfile(fpath)):
                continue
            name = fname[:-len(suffix)]

            if self.verbosity >= 1:
                log.msg("Loading filter %s" % name)

            with open(fpath, 'rb') as filter_file:
                lines = [raw.decode('utf8').strip() for raw in filter_file]

            rules = adblockparser.AdblockRules(
                lines,
                supported_options=self.supported_options,
                skip_unsupported_rules=False,
                max_mem=512 * 1024 * 1024,  # this doesn't actually use 512M
            )
            filters_num = len(rules.rules)

            if self.verbosity >= 2:
                log.msg("%d rule(s) loaded for filter %s" %
                        (filters_num, name))

            # Large lists without re2 are flagged because matching may be slow.
            too_big_without_re2 = (
                not rules.uses_re2 and filters_num > self.RE2_WARN_THRESHOLD)
            if too_big_without_re2:
                log.msg('WARNING: a filter %s with %d rules loaded, but '
                        'pyre2 library is not installed. Matching may become '
                        'slow; installing https://github.com/axiak/pyre2 is '
                        'highly recommended.' % (name, filters_num))

            self.filters[name] = rules
コード例 #7
0
    def __init__(self, rules, no_whitelist):
        """Set up the matcher from the filtered rule list.

        The rules are first passed through ``_FilterRules``. If any remain,
        an ``adblockparser.AdblockRules`` matcher is built from them;
        otherwise the matcher is left as ``None``.

        Args:
          rules: ([str]) list of rules.
          no_whitelist: (bool) Whether the whitelisting rules should be ignored.

        Raises:
          ImportError: re-raised after logging installation hints.
        """
        self._rules = self._FilterRules(rules, no_whitelist)
        if not self._rules:
            self._matcher = None
            return

        try:
            import adblockparser
            self._matcher = adblockparser.AdblockRules(self._rules)
        except ImportError:
            install_hint = (
                'Likely you need to install adblockparser. Try:\n'
                ' pip install --user adblockparser\n'
                'For 10-100x better performance, also try:\n'
                " pip install --user 're2 >= 0.2.21'")
            logging.critical(install_hint)
            raise
コード例 #8
0
def find_third_party_using_given_csv(csv_file, script_dir):
    """Return the domains from a CSV file that the EasyList rules block.

    The rules are loaded from ``dst_characterize/easylist_adblock.txt``
    under *script_dir*. Each distinct domain is tested once as
    ``http://<domain>/`` and, if that does not match, as
    ``https://<domain>/``.

    Args:
        csv_file: path of the CSV file to read; it must have a header row.
        script_dir: directory that contains the ``dst_characterize`` folder.

    Returns:
        set: the domains for which either URL form should be blocked.
    """
    with open(script_dir + '/dst_characterize/easylist_adblock.txt', 'r') as f:
        ad_rules = adblockparser.AdblockRules(f.readlines())

    ad_list = set()
    # A set keeps the "already checked" test O(1) per row, so no domain is
    # run through the ad rules more than once without a linear list scan.
    visited_domains = set()

    # newline="" is what the csv module asks for when handed a file object.
    with open(csv_file, mode="r", newline="") as csv_file1:
        for row in csv.DictReader(csv_file1):
            # NOTE(review): `options` is defined outside this function;
            # options[3] is used as the name of the domain column.
            current_domain: str = row[options[3]]
            if current_domain in visited_domains:
                continue
            visited_domains.add(current_domain)

            # http is tried first; https is only tried when http is not blocked.
            if any(ad_rules.should_block(f"{scheme}://{current_domain}/")
                   for scheme in ("http", "https")):
                ad_list.add(current_domain)
    return ad_list
コード例 #9
0
ファイル: commandCenter.py プロジェクト: Afrionos/FADWeb
# Copy the configuration values onto attributes of Data, converting the
# numeric ones to int.
Data.timeout = int(config["timeout"])
Data.rules_file = config["rules_file"]
Data.url_file = config["url_file"]
Data.portRequest = int(config["portRequest"])
Data.portMessage = int(config["portMessage"])
Data.ip = config["ip"]

print("Reading AdblockRule Textfile")
# The with-block closes the file even if reading raises.
with open(Data.rules_file, "r") as f:
    raw = f.read().split("\n")
# Kept as a separate list object from `raw`, as before.
raw_rules = list(raw)
print("Generating AdblockRules")
Data.rules = adblockparser.AdblockRules(raw_rules)

print("Generating Database")
analyze.createDatabase()

print("Reading Urllist Textfile")
with open(Data.url_file, "r") as f:
    li = f.read()
li = li.split("\n")

#counter=int(random.random()*100000)
counter = 0
for l in li:

    if counter > 0: