Beispiel #1
0
    def __init__(self, needles=None, **kwargs):
        """Init.

        Args:
          needles: A list of strings we search for.
          **kwargs: passthrough.
        Raises:
          RuntimeError: No needles provided, or a needle is too long to
            search for safely.
        """
        super(MultiStringFinderCheck, self).__init__(**kwargs)

        # It is an error to not provide something to search for and Acora will
        # raise later.
        if not needles:
            raise RuntimeError("No needles provided to search.")

        # Passing large patterns to the acora module will cause huge memory
        # consumption. A generator expression avoids building a throwaway
        # list just to take the max.
        if max(len(x) for x in needles) > 50:
            raise RuntimeError("Pattern too large to search with ahocorasic.")

        # Our scanner must operate on raw bytes so we need to make
        # sure all the needles are bytes too.
        byte_needles = [utils.SmartStr(x) for x in needles]
        tree = acora.AcoraBuilder(*byte_needles)
        self.engine = tree.build()

        # Scan state, populated once scanning starts.
        self.base_offset = None
        self.hits = None
Beispiel #2
0
    def __init__(self, spotfile='anchors-sorted.txt'):
        """Build an Aho-Corasick search tree from keywords in *spotfile*.

        Args:
          spotfile: path to a text file with one keyword per line.
        """
        builder = acora.AcoraBuilder()

        with open(spotfile, 'r') as inputfile:
            # The enumerate() counter was never used; iterate lines directly.
            for line in inputfile:
                builder.add(line.rstrip("\n"))

        # print-as-function is valid in both Python 2 and 3 for a single
        # argument; the bare print statement was a Python 3 syntax error.
        print("Building the tree")
        self.tree = builder.build()
Beispiel #3
0
    def test_deepcopy_machine(self):
        """A deep copy of a built machine finds the same matches as the original."""
        from copy import deepcopy
        wrap = self._swrap

        keywords = [wrap(kw) for kw in ('a', 'b', 'c')]
        machine = acora.AcoraBuilder(*keywords).build(acora=self.acora)
        clone = deepcopy(machine)

        haystack = wrap('abcd')
        expected = self._result([('a', 0), ('b', 1), ('c', 2)])

        self.assertEqual(sorted(machine.finditer(haystack)), expected)
        self.assertEqual(sorted(clone.finditer(haystack)), expected)
Beispiel #4
0
 def __init__(self, spec_set, min_count=1, min_len=1):
     """Build an Aho-Corasick matcher from *spec_set*.

     Args:
       spec_set: either a dict/Counter mapping keyword -> occurrence count,
         or a plain list of keywords.
       min_count: minimum occurrence count for dict/Counter entries.
       min_len: minimum keyword length for dict/Counter entries.
     """
     key_lst = []
     # isinstance is the idiomatic type test; Counter is a dict subclass,
     # so a single dict check covers both original branches.
     if isinstance(spec_set, dict):
         for spec, cnt in spec_set.items():
             if cnt >= min_count and len(spec) >= min_len:
                 key_lst.append(spec)
     elif isinstance(spec_set, list):
         key_lst = spec_set
     else:
         # print-as-function fixes the Python 2 print statement, which is a
         # syntax error under Python 3.
         print('ERROR: wrong value type:', type(spec_set))
         exit(-1)
     # AcoraBuilder accepts a single list of keywords.
     self.builder = acora.AcoraBuilder(key_lst)
     self.ac = self.builder.build()
Beispiel #5
0
 def _build_ignore_case(self, *keywords):
     """Build a case-insensitive machine, dumping dot graphs when DOTDEBUG is set."""
     wrapped = [self._swrap(kw) for kw in keywords]
     builder = acora.AcoraBuilder(*wrapped, ignore_case=True)
     if DOTDEBUG:
         print('Initial tree:')
         tree_to_dot(builder.tree)
     machine = builder.build(acora=self.acora)
     if DOTDEBUG:
         print('\nProcessed tree:')
         tree_to_dot(builder.tree)
         # The pure-Python machine has no dot representation.
         if not isinstance(machine, acora.PyAcora):
             print('\nMachine:')
             machine_to_dot(machine)
     return machine
Beispiel #6
0
    def test_deepcopy_builder(self):
        """Keywords added to a deep-copied builder must not affect the original."""
        from copy import deepcopy
        s = self._swrap

        builder1 = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
        builder2 = deepcopy(builder1)
        builder2.add(s('ab'), s('bc'))

        finditer1 = builder1.build(acora=self.acora).finditer
        finditer2 = builder2.build(acora=self.acora).finditer

        # assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual.
        self.assertEqual(sorted(finditer1(s('abcd'))),
                         self._result([('a', 0), ('b', 1), ('c', 2)]))

        self.assertEqual(
            sorted(finditer2(s('abcd'))),
            self._result([('a', 0), ('ab', 0), ('b', 1), ('bc', 1), ('c', 2)]))
Beispiel #7
0
    def test_pickle2_machine(self):
        """A machine round-tripped through pickle matches like the original."""
        import pickle
        wrap = self._swrap

        keywords = [wrap(kw) for kw in ('a', 'b', 'c')]
        machine = acora.AcoraBuilder(*keywords).build(acora=self.acora)
        restored = pickle.loads(
            pickle.dumps(machine, protocol=pickle.HIGHEST_PROTOCOL))

        haystack = wrap('abcd')
        expected = self._result([('a', 0), ('b', 1), ('c', 2)])

        self.assertEqual(sorted(machine.finditer(haystack)), expected)
        self.assertEqual(sorted(restored.finditer(haystack)), expected)
Beispiel #8
0
    def __init__(self, needles=None, **kwargs):
        """
        Args:
          needles: A list of strings we search for.
        Raises:
          RuntimeError: No needles provided.
        """
        super(MultiStringFinderCheck, self).__init__(**kwargs)

        # Silently defaulting to an empty list only defers the failure:
        # Acora raises later when asked to build a machine with no keywords.
        # Fail fast with a clear message instead.
        if not needles:
            raise RuntimeError("No needles provided to search.")

        tree = acora.AcoraBuilder(*needles)

        self.engine = tree.build()

        # Scan state, populated once scanning starts.
        self.base_offset = None
        self.hits = None
        self.next_hit_index = 0
        self.current_hit = None
Beispiel #9
0
    def __init__(self, needles=None, **kwargs):
        """
        Args:
          needles: A list of strings we search for.

        Raises:
          RuntimeError: if no needles were provided.
        """
        super(MultiStringFinderCheck, self).__init__(**kwargs)

        # Acora would fail later anyway when building an empty machine, so
        # reject a missing or empty needle list up front.
        if not needles:
            raise RuntimeError("No needles provided to search.")

        self.engine = acora.AcoraBuilder(*needles).build()

        # Scan state, populated once scanning starts.
        self.base_offset = None
        self.hits = None
        self.next_hit_index = 0
        self.current_hit = None
Beispiel #10
0
    def test_pickle_machine_new(self):
        """A pickled machine survives the destruction of builder and source machine."""
        import gc
        import pickle

        wrap = self._swrap
        builder = acora.AcoraBuilder(*[wrap(kw) for kw in ('a', 'bc', 'c')])
        machine = builder.build(acora=self.acora)

        payload = pickle.dumps(machine)

        # Drop every live reference before unpickling so the restored
        # machine cannot share state with the original objects.
        del builder, machine
        gc.collect()

        restored = pickle.loads(payload)
        self.assertEqual(sorted(restored.finditer(wrap('abcd'))),
                         self._result([('a', 0), ('bc', 1), ('c', 2)]))
Beispiel #11
0
 def _build_ignore_case(self, *keywords):
     """Build a case-insensitive matcher for *keywords*."""
     wrapped = [self._swrap(kw) for kw in keywords]
     builder = acora.AcoraBuilder(*wrapped)
     return builder.build(ignore_case=True, acora=self.acora)
Beispiel #12
0
 def _build(self, *keywords):
     """Build a case-sensitive matcher for *keywords*."""
     wrapped = [self._swrap(kw) for kw in keywords]
     return acora.AcoraBuilder(*wrapped).build(acora=self.acora)
Beispiel #13
0
def logfilter(fh,
              blacklist,
              field,
              parser=None,
              reverse=False,
              delimiter=None,
              ignorecase=False,
              with_acora=False,
              word_boundaries=False,
              **kwargs):
    """Filter rows from a log stream using a blacklist.

    Args:
      fh: iterable of log lines (e.g. an open file handle).
      blacklist: iterable of blacklist terms; blank entries and entries
        starting with '#' are ignored.
      field: field selector; with a parser this is a comma-separated list of
        field indices or names.
      parser: optional parser class name, looked up in logtools.parsers.
      reverse: if True, invert the filter (keep matching lines instead).
      delimiter: field delimiter passed through to the matching helpers.
      ignorecase: if True, match case-insensitively.
      with_acora: if True, use Aho-Corasick (acora) matching instead of
        regular expressions.
      word_boundaries: if True, require word-boundary matches.
      **kwargs: unused; accepted for interface compatibility.

    Yields:
      Lines from fh that pass the filter.
    """

    # Normalize the blacklist: strip whitespace, skip blanks and '#'
    # comments. The dict is used purely as a set of keys.
    blacklist = dict.fromkeys([l.strip() for l \
                               in blacklist \
                               if l and not l.startswith('#')])
    re_flags = 0

    if ignorecase:
        re_flags = re.IGNORECASE

    # Select the low-level matcher: four combinations of
    # (regex | acora) x (word boundaries | substring).
    _is_blacklisted = None
    if with_acora is False:
        # Regular expression based matching
        if word_boundaries:
            _is_blacklisted = partial(_is_blacklisted_re_wb,
                                      delimiter=delimiter,
                                      field=field,
                                      blacklist=blacklist,
                                      re_flags=re_flags)
        else:
            _is_blacklisted = partial(_is_blacklisted_re,
                                      delimiter=delimiter,
                                      field=field,
                                      blacklist=blacklist,
                                      re_flags=re_flags)
    else:
        # Aho-Corasick multiple string pattern matching
        # using the acora Cython library
        builder = acora.AcoraBuilder(*blacklist)
        ac = builder.build()
        # acora has no ignore-case mode here; lowercase the input instead.
        _transform_func = lambda x: x
        if ignorecase:
            _transform_func = lambda x: x.lower()

        if word_boundaries:
            _is_blacklisted = partial(_is_blacklisted_ac_wb,
                                      delimiter=delimiter,
                                      field=field,
                                      transform_func=_transform_func,
                                      ac=ac)
        else:
            _is_blacklisted = partial(_is_blacklisted_ac,
                                      delimiter=delimiter,
                                      field=field,
                                      transform_func=_transform_func,
                                      ac=ac)

    _is_blacklisted_func = _is_blacklisted
    if parser:
        # Custom parser specified, use field-based matching.
        # NOTE(review): eval on the parser name executes arbitrary code if
        # the value is attacker-controlled — confirm it only ever comes from
        # trusted configuration.
        parser = eval(parser, vars(logtools.parsers), {})()
        fields = field.split(',')
        # All-digit selectors are treated as positional indices.
        is_indices = reduce(and_, (k.isdigit() for k in fields), True)
        if is_indices:
            # Field index based matching
            def _is_blacklisted_func(line):
                parsed_line = parser(line)
                for field in fields:
                    if _is_blacklisted(parsed_line.by_index(field)):
                        return True
                return False
        else:
            # Named field based matching
            # NOTE(review): this branch is byte-identical to the index-based
            # one and still calls by_index() — presumably it should use a
            # named-field lookup; verify against the parser API.
            def _is_blacklisted_func(line):
                parsed_line = parser(line)
                for field in fields:
                    if _is_blacklisted(parsed_line.by_index(field)):
                        return True
                return False

    num_lines = 0
    num_filtered = 0
    num_nomatch = 0
    # imap suggests Python 2 (itertools.imap); lines are stripped lazily.
    for line in imap(lambda x: x.strip(), fh):
        try:
            is_blacklisted = _is_blacklisted_func(line)
        except (KeyError, ValueError):
            # Parsing error
            logging.warn("No match for line: %s", line)
            num_nomatch += 1
            continue
        else:
            # XOR with reverse: filter blacklisted lines normally, or keep
            # only blacklisted lines when reverse is set.
            if is_blacklisted ^ reverse:
                logging.debug("Filtering line: %s", line)
                num_filtered += 1
                continue

            num_lines += 1
            yield line

    logging.info("Number of lines after filtering: %s", num_lines)
    logging.info("Number of lines filtered: %s", num_filtered)
    if num_nomatch:
        logging.info("Number of lines could not match on: %s", num_nomatch)

    return