Example #1
0
    def compile_hs(self, hs_db_file=None):
        """Populate ``self.hs_db``, reusing ``hs_db_file`` when it exists.

        When ``hs_db_file`` names an existing file, the serialized database
        is read from it and nothing is compiled.  Otherwise a new database is
        compiled from ``self.regexes``, the elapsed wall-clock time is stored
        in ``self.stats.hs_compilation_time`` and, if ``hs_db_file`` was
        given, the compiled database is written to that path.
        """
        if hs_db_file and os.path.isfile(hs_db_file):
            log.info(f'Loading Hyperscan DB from disk: {hs_db_file}')
            with open(hs_db_file, 'rb') as cached:
                serialized = bytearray(cached.read())
            self.hs_db = hyperscan.loads(serialized)
            return

        self.hs_db = hyperscan.Database()

        num_patterns = len(self.regexes)

        # Entries of self.regexes are indexed by the position of the field
        # in the Regex record rather than by attribute name.
        pattern_col = self.Regex._fields.index('pattern')
        expressions = [
            str.encode(entry[pattern_col], encoding='utf-8')
            for entry in self.regexes
        ]
        ids = [*range(num_patterns)]
        flags_col = self.Regex._fields.index('hs_flags')
        flags = [entry[flags_col] for entry in self.regexes]

        started_at = time.time()
        self.hs_db.compile(
            expressions=expressions,
            ids=ids,
            elements=num_patterns,
            flags=flags,
        )
        self.stats.hs_compilation_time = time.time() - started_at

        if not hs_db_file:
            return

        log.info(f'Saving Hyperscan DB to disk: {hs_db_file}')
        with open(hs_db_file, 'wb') as out:
            out.write(hyperscan.dumps(self.hs_db))
Example #2
0
def database_stream():
    """Compile the module-level ``patterns`` into a streaming database.

    The database is created in stream mode with the large start-of-match
    horizon flag set, then compiled from the ``(expression, id, flags)``
    rows of ``patterns``.
    """
    stream_mode = (hyperscan.HS_MODE_STREAM
                   | hyperscan.HS_MODE_SOM_HORIZON_LARGE)
    stream_db = hyperscan.Database(mode=stream_mode)
    pattern_exprs, pattern_ids, pattern_flags = zip(*patterns)
    stream_db.compile(
        expressions=pattern_exprs,
        ids=pattern_ids,
        elements=len(patterns),
        flags=pattern_flags,
    )
    return stream_db
def get_starting_db_exprs() -> tuple[hyperscan.Database | None, set[str]]:
    """Restore the start-up database and expression set from disk.

    Resolution order:

    1. If both ``SERIALIZED_PATH`` and ``EXPRESSIONS_PATH`` exist, load the
       serialized database together with the stored expressions.
    2. If that load fails, or only ``EXPRESSIONS_PATH`` exists, compile a
       new database from the stored expressions.
    3. Otherwise return ``(None, set())``.

    Returns:
        A ``(db, expressions)`` pair.  ``db`` is ``None`` when no expressions
        are stored or when compiling them failed; in the compile-failure case
        the returned set is empty.
    """

    def read_expressions() -> set[str]:
        # Every line yielded by the file still carries its newline, so the
        # emptiness check has to be made on the stripped text; otherwise a
        # blank line would put "" into the set and be compiled as a pattern.
        with EXPRESSIONS_PATH.open(mode="r") as fp:
            return {stripped for line in fp if (stripped := line.strip())}

    if SERIALIZED_PATH.exists() and EXPRESSIONS_PATH.exists():
        try:
            with SERIALIZED_PATH.open(mode="rb") as fp_r:
                db = hyperscan.loadb(fp_r.read())
            expressions = read_expressions()
        except Exception as exc:
            # Best effort: an unreadable serialized database is not fatal,
            # fall through and rebuild it from the expression list instead.
            log.warning(
                "Could not load serialized database, recompiling from expressions",
                exc_info=exc,
            )
        else:
            return db, expressions

    if EXPRESSIONS_PATH.exists():
        expressions = read_expressions()
        if not expressions:
            return None, expressions

        try:
            db = hyperscan.Database()
            db.compile(expressions=tuple(expr.encode() for expr in expressions))
        except Exception as exc:
            log.exception("Error loading in expressions from file", exc_info=exc)
        else:
            return db, expressions

    return None, set()
def benchmark_hyperscan(LINE):
    """Benchmark scanning ``LINE`` against a database built from ``KEYS``.

    Compiles one expression per entry of ``KEYS`` (ids ``0..len(KEYS)-1``,
    flags ``0``), runs a single scan with a callback that collects the
    matching ids and prints them, then hands the statement string
    ``"db.scan(LINE)"`` and this function's ``locals()`` to ``benchmark``.

    NOTE(review): the benchmarked statement refers to the local names ``db``
    and ``LINE`` and is presumably evaluated against the passed namespace,
    so those locals must keep their names -- confirm against ``benchmark``.
    """
    import hyperscan

    db = hyperscan.Database()
    db.compile(
        expressions=[k.encode("utf-8") for k in KEYS],
        ids=list(range(len(KEYS))),
        elements=len(KEYS),
        flags=0,
    )

    # Ids reported by the sanity-check scan below.
    l = []

    def on_match(id: int,
                 from_: int,
                 to: int,
                 flags: int,
                 context: Optional[Any] = None) -> Optional[bool]:
        """Match callback: record the id of the expression that matched."""
        l.append(id)

    # From here on LINE is the UTF-8 encoded bytes form of the argument;
    # that is also the value the benchmarked statement sees via locals().
    LINE = LINE.encode("utf-8")
    db.scan(LINE, match_event_handler=on_match)
    print(l)

    # TODO the Python hyperscan API I found initially is extremely inefficient,
    # doing a Python callback on every match. So not doing any match handling
    # here, just to get a sense of raw performance.
    benchmark("db.scan(LINE)", locals())
Example #5
0
def database_block():
    """Compile the module-level ``patterns`` into a default-mode database.

    ``patterns`` is read as rows of ``(expression, id, flags)``; the rows are
    transposed into the three parallel sequences the compiler expects.
    """
    block_db = hyperscan.Database()
    pattern_exprs, pattern_ids, pattern_flags = zip(*patterns)
    block_db.compile(
        expressions=pattern_exprs,
        ids=pattern_ids,
        elements=len(patterns),
        flags=pattern_flags,
    )
    return block_db
Example #6
0
    def compile_db_in_memory(expressions: List[bytes], ids: List[int],
                             flags: List[int]) -> hs.Database:
        """Compile ``expressions`` into a new database held in memory.

        ``expressions`` and ``ids`` must have the same length; this is
        checked with an assertion before anything is compiled.  The database
        is created with ``HYPERSCAN_DB_MODE`` and returned after compilation.
        """
        expression_count = len(expressions)
        id_count = len(ids)
        assert expression_count == id_count, (
            "There must be an id for every expression.")

        compiled = hs.Database(mode=HYPERSCAN_DB_MODE)
        compiled.compile(expressions=expressions, ids=ids, flags=flags)
        return compiled
Example #7
0
def compile_database(pattern_set):
    """Compile ``pattern_set`` into a Hyperscan database and print its info.

    Args:
        pattern_set: Iterable of ``(expression, id, flags)`` rows.  The rows
            are transposed and handed to ``Database.compile``; expressions
            are presumably ``bytes``, as the binding expects -- confirm
            against the callers.

    Returns:
        The compiled ``hyperscan.Database``, so the caller can scan with it.

    Raises:
        ValueError: If ``pattern_set`` is empty or its rows are not triples
            (raised by the unpacking of the transposed rows).
    """
    db = hyperscan.Database()
    # Compile the patterns that were actually passed in, not whatever a
    # module-level ``patterns`` happens to contain.
    expressions, ids, flags = zip(*pattern_set)
    db.compile(expressions=expressions,
               ids=ids,
               # Count the transposed rows so one-shot iterables work too.
               elements=len(expressions),
               flags=flags)
    print(db.info().decode())
    return db
Example #8
0
def test_literal_expressions(mocker):
    """Patterns compiled with ``literal=True`` match their own text.

    Every expression from ``patterns`` gets a trailing NUL byte before being
    compiled as a literal.  Each one is then scanned without that byte, and
    the callback is expected to be called once per scan with
    ``(id, 0, len(text), 0, text)``.
    """
    db = hyperscan.Database()
    raw_expressions, ids, _ = zip(*patterns)
    terminated = [raw + b'\0' for raw in raw_expressions]
    db.compile(expressions=terminated, ids=ids, literal=True)

    callback = mocker.Mock(return_value=None)
    expected = []
    for pattern_id, with_nul in zip(ids, terminated):
        # Scan the text without the NUL byte appended above.
        literal = with_nul[:-1]
        db.scan(literal, match_event_handler=callback, context=literal)
        expected.append(
            mocker.call(pattern_id, 0, len(literal), 0, literal))
    assert callback.mock_calls == expected
Example #9
0
def build_database(expr_path, mode=hyperscan.HS_MODE_STREAM):
    """Compile the expressions listed in ``expr_path`` into a database.

    Every line of the file is passed to ``process_expression``, which must
    return an ``(id, expression, flags)`` triple.  Returns a pair of the
    number of expressions and the compiled database created with ``mode``.
    """
    with io.open(expr_path, 'r') as expr_file:
        # Unpacking in the loop target keeps the "exactly three values per
        # line" requirement on process_expression's result.
        parsed = [
            (id_, expression, flags_)
            for id_, expression, flags_ in map(process_expression, expr_file)
        ]

    ids = [row[0] for row in parsed]
    expressions = [row[1] for row in parsed]
    flags = [row[2] for row in parsed]

    database = hyperscan.Database(mode=mode)
    database.compile(expressions=expressions, ids=ids, flags=flags)
    return len(expressions), database
def update_db_from_expressions(db: hyperscan.Database | None, expressions: set[str]) -> hyperscan.Database | None:
    """Recompile ``db`` for ``expressions`` and persist both to disk.

    Args:
        db: Database to recompile in place; a new one is created when this
            is falsy and there is something to compile.
        expressions: Full set of expressions that should be active.

    Returns:
        The compiled database, or ``None`` when ``expressions`` is empty.

    Side effects:
        Writes the newline-joined expressions to ``EXPRESSIONS_PATH``.  With
        a non-empty set the serialized database is written to
        ``SERIALIZED_PATH``; with an empty set that file is removed.
    """
    log.info("Updating expressions to %s", expressions)
    if expressions:
        if not db:
            db = hyperscan.Database()

        db.compile(expressions=tuple(expr.encode() for expr in expressions))
        atomic_save(SERIALIZED_PATH, hyperscan.dumpb(db))
    else:
        db = None
        # Remove the previously serialized database. The start-up loader
        # restores it whenever both files exist, which would bring the
        # removed expressions back after a restart.
        SERIALIZED_PATH.unlink(missing_ok=True)

    atomic_save(EXPRESSIONS_PATH, "\n".join(expressions).encode())

    return db
Example #11
0
def compile_test():
    """Compile three sample patterns and print the database info string."""
    db = hyperscan.Database()
    caseless = hyperscan.HS_FLAG_CASELESS
    # Rows are (expression, id, flags).
    patterns = (
        (br'fo+', 0, 0),
        (br'^foobar$', 1, caseless),
        (br'BAR', 2, caseless | hyperscan.HS_FLAG_SOM_LEFTMOST),
    )
    expressions = tuple(row[0] for row in patterns)
    ids = tuple(row[1] for row in patterns)
    flags = tuple(row[2] for row in patterns)
    db.compile(
        expressions=expressions,
        ids=ids,
        elements=len(patterns),
        flags=flags,
    )
    info = db.info()
    print(info.decode())
Example #12
0
    def stream(self, rules):
        """Compile ``rules`` into the database stored on ``self._stream``.

        Each rule must expose ``values()`` yielding exactly four items, of
        which the first is used as the pattern id and the second as the
        pattern text.  Every pattern is compiled caseless with UTF-8 and
        Unicode-property support; the database is created in block mode.
        """
        self._stream = hyperscan.Database(mode=hyperscan.HS_MODE_BLOCK)

        patterns = [
            (
                rule.encode('utf-8'),
                rule_id,
                hyperscan.HS_FLAG_CASELESS
                | hyperscan.HS_FLAG_UTF8
                | hyperscan.HS_FLAG_UCP,
            )
            for rule_id, rule, _, _ in (r.values() for r in rules)
        ]

        expressions, ids, flags = zip(*patterns)
        self._stream.compile(
            expressions=expressions,
            ids=ids,
            elements=len(patterns),
            flags=flags,
        )
Example #13
0
def hyperscan_match(regexes, text):
    """Debug helper: scan ``text`` with ``regexes`` through hyperscan.

    Returns the list of ``(index, start, end, flags, context)`` tuples
    reported to the match callback, in the order they were reported.
    """
    # Imported lazily so that hyperscan stays an optional dependency.
    import hyperscan  # pylint: disable=import-outside-toplevel

    # SOM_LEFTMOST on every pattern so the start offset is reported.
    som_flags = [hyperscan.HS_FLAG_SOM_LEFTMOST] * len(regexes)
    encoded = [regex.encode("utf8") for regex in regexes]

    database = hyperscan.Database()
    database.compile(expressions=encoded, flags=som_flags)

    matches = []

    def record(index, start, end, flags, context):
        matches.append((index, start, end, flags, context))

    haystack = text.encode("utf8")
    database.scan(haystack, record)

    return matches
Example #14
0
    def hyperscan_db(self):
        """Return the hyperscan database for ``self.extractors``.

        The database is built on first use and memoized on ``self._db``.
        When ``self.cache_dir`` is not ``None`` the compiled database is also
        kept on disk, in a file named after an MD5 fingerprint of all
        expressions and flags, so any change to either triggers a recompile.
        The cache directory is created if needed, including missing parents.
        """
        if hasattr(self, "_db"):
            return self._db

        # import here so the dependency is optional
        import hyperscan  # pylint: disable=import-outside-toplevel

        flag_conversion = {re.I: hyperscan.HS_FLAG_CASELESS}

        def convert_flags(re_flags):
            """Translate ``re`` module flags into hyperscan flags."""
            hyperscan_flags = 0
            for re_flag, hyperscan_flag in flag_conversion.items():
                if re_flags & re_flag:
                    hyperscan_flags |= hyperscan_flag
            return hyperscan_flags

        def convert_regex(regex):
            """Rewrite ``regex`` for hyperscan and return it as UTF-8 bytes."""
            # hyperscan doesn't understand repetition flags like {,3},
            # so replace with {0,3}:
            regex = re.sub(r"\{,(\d+)\}", r"{0,\1}", regex)
            # Characters like "§" convert to more than one byte in utf8,
            # so "§?" won't work as expected. Convert "§?" to "(?:§)?":
            long_chars = [c for c in regex if len(c.encode("utf8")) > 1]
            if long_chars:
                regex = re.sub(
                    rf'([{"".join(set(long_chars))}])\?', r"(?:\1)?", regex
                )
            # encode as bytes:
            return regex.encode("utf8")

        expressions = [convert_regex(e.regex) for e in self.extractors]
        # HS_FLAG_SOM_LEFTMOST so hyperscan includes the start offset
        flags = [
            convert_flags(e.flags) | hyperscan.HS_FLAG_SOM_LEFTMOST
            for e in self.extractors
        ]

        hyperscan_db = None
        cache = None

        if self.cache_dir is not None:
            # Attempt to use cache.
            # Cache key is a hash of all regexes and flags, so we
            # automatically recompile if anything changes.
            fingerprint = hashlib.md5(
                str(expressions).encode("utf8") + str(flags).encode("utf8")
            ).hexdigest()
            cache_dir = Path(self.cache_dir)
            # parents=True: a nested cache directory whose parents do not
            # exist yet would otherwise raise FileNotFoundError.
            cache_dir.mkdir(parents=True, exist_ok=True)
            cache = cache_dir / fingerprint
            if cache.exists():
                hyperscan_db = hyperscan.loadb(cache.read_bytes())

        if not hyperscan_db:
            # No cache, so compile database.
            hyperscan_db = hyperscan.Database()
            hyperscan_db.compile(expressions=expressions, flags=flags)
            if cache is not None:
                cache.write_bytes(hyperscan.dumpb(hyperscan_db))

        self._db = hyperscan_db
        return self._db
Example #15
0
import hyperscan

# Minimal end-to-end example: create a database with the constructor's
# default arguments, compile three patterns into it and print its info.
db = hyperscan.Database()
patterns = (
    # expression,  id, flags
    (br'fo+', 0, 0),
    (br'^foobar$', 1, hyperscan.HS_FLAG_CASELESS),
    (br'BAR', 2, hyperscan.HS_FLAG_CASELESS
     | hyperscan.HS_FLAG_SOM_LEFTMOST),
)
# Transpose the (expression, id, flags) rows into three parallel tuples,
# which is the shape compile() is called with below.
expressions, ids, flags = zip(*patterns)
db.compile(expressions=expressions,
           ids=ids,
           elements=len(patterns),
           flags=flags)
# info() returns bytes, hence the decode before printing.
print(db.info().decode())
# Output recorded with this snippet (presumably varies with the library
# version and CPU features):
# Version: 5.1.1 Features: AVX2 Mode: BLOCK