def __init__(self, needles=None, **kwargs): """Init. Args: needles: A list of strings we search for. **kwargs: passthrough. Raises: RuntimeError: No needles provided. """ super(MultiStringFinderCheck, self).__init__(**kwargs) # It is an error to not provide something to search for and Acora will # raise later. if not needles: raise RuntimeError("No needles provided to search.") # Passing large patterns to the acora module will cause huge memory # consumption. if max([len(x) for x in needles]) > 50: raise RuntimeError("Pattern too large to search with ahocorasic.") # Our scanner must operate on raw bytes so we need to make # sure all the needles are bytes too. byte_needles = [utils.SmartStr(x) for x in needles] tree = acora.AcoraBuilder(*byte_needles) self.engine = tree.build() self.base_offset = None self.hits = None
def __init__(self, spotfile='anchors-sorted.txt'):
    builder = acora.AcoraBuilder()
    with open(spotfile, 'r') as inputfile:
        for line in inputfile:
            builder.add(line.rstrip("\n"))
    print("Building the tree")
    self.tree = builder.build()
def test_deepcopy_machine(self):
    from copy import deepcopy
    s = self._swrap
    builder = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
    ac1 = builder.build(acora=self.acora)
    ac2 = deepcopy(ac1)

    self.assertEqual(sorted(ac1.finditer(s('abcd'))),
                     self._result([('a', 0), ('b', 1), ('c', 2)]))
    self.assertEqual(sorted(ac2.finditer(s('abcd'))),
                     self._result([('a', 0), ('b', 1), ('c', 2)]))
def __init__(self, spec_set, min_count=1, min_len=1):
    key_lst = []
    if isinstance(spec_set, (dict, collections.Counter)):
        for spec, cnt in spec_set.items():
            if cnt >= min_count and len(spec) >= min_len:
                key_lst.append(spec)
    elif isinstance(spec_set, list):
        key_lst = spec_set
    else:
        print('ERROR: wrong value type:', type(spec_set))
        exit(-1)
    self.builder = acora.AcoraBuilder(key_lst)
    self.ac = self.builder.build()
def _build_ignore_case(self, *keywords):
    keywords = list(map(self._swrap, keywords))
    builder = acora.AcoraBuilder(*keywords, ignore_case=True)
    if DOTDEBUG:
        print('Initial tree:')
        tree_to_dot(builder.tree)
    machine = builder.build(acora=self.acora)
    if DOTDEBUG:
        print('\nProcessed tree:')
        tree_to_dot(builder.tree)
        if not isinstance(machine, acora.PyAcora):
            print('\nMachine:')
            machine_to_dot(machine)
    return machine
def test_deepcopy_builder(self):
    from copy import deepcopy
    s = self._swrap
    builder1 = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
    builder2 = deepcopy(builder1)
    builder2.add(s('ab'), s('bc'))

    finditer1 = builder1.build(acora=self.acora).finditer
    finditer2 = builder2.build(acora=self.acora).finditer

    self.assertEqual(sorted(finditer1(s('abcd'))),
                     self._result([('a', 0), ('b', 1), ('c', 2)]))
    self.assertEqual(
        sorted(finditer2(s('abcd'))),
        self._result([('a', 0), ('ab', 0), ('b', 1), ('bc', 1), ('c', 2)]))
def test_pickle2_machine(self):
    import pickle
    s = self._swrap
    builder = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
    ac1 = builder.build(acora=self.acora)
    #if not isinstance(ac1, acora.PyAcora):
    #    machine_to_dot(ac1)
    ac2 = pickle.loads(pickle.dumps(ac1, protocol=pickle.HIGHEST_PROTOCOL))
    #if not isinstance(ac2, acora.PyAcora):
    #    machine_to_dot(ac2)

    self.assertEqual(sorted(ac1.finditer(s('abcd'))),
                     self._result([('a', 0), ('b', 1), ('c', 2)]))
    self.assertEqual(sorted(ac2.finditer(s('abcd'))),
                     self._result([('a', 0), ('b', 1), ('c', 2)]))
def __init__(self, needles=None, **kwargs): """ Args: needles: A list of strings we search for. """ super(MultiStringFinderCheck, self).__init__(**kwargs) if not needles: needles = [] tree = acora.AcoraBuilder(*needles) self.engine = tree.build() self.base_offset = None self.hits = None self.next_hit_index = 0 self.current_hit = None
def __init__(self, needles=None, **kwargs): """ Args: needles: A list of strings we search for. """ super(MultiStringFinderCheck, self).__init__(**kwargs) # It is an error to not provide something to search for and Acora will # raise later. if not needles: raise RuntimeError("No needles provided to search.") tree = acora.AcoraBuilder(*needles) self.engine = tree.build() self.base_offset = None self.hits = None self.next_hit_index = 0 self.current_hit = None
def test_pickle_machine_new(self):
    s = self._swrap
    builder = acora.AcoraBuilder(*list(map(s, ['a', 'bc', 'c'])))
    ac = builder.build(acora=self.acora)
    #if not isinstance(ac, acora.PyAcora):
    #    machine_to_dot(ac)

    import pickle
    p = pickle.dumps(ac)
    del builder, ac
    import gc
    gc.collect()

    ac = pickle.loads(p)
    #if not isinstance(ac, acora.PyAcora):
    #    machine_to_dot(ac)

    self.assertEqual(sorted(ac.finditer(s('abcd'))),
                     self._result([('a', 0), ('bc', 1), ('c', 2)]))
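# A small sketch of why the pickle round-trips tested above matter in practice:
# a built machine can be persisted and reloaded so the automaton does not have
# to be rebuilt on every run. The file name below is illustrative.
import pickle
import acora

machine = acora.AcoraBuilder('a', 'bc', 'c').build()
with open('machine.pickle', 'wb') as f:
    pickle.dump(machine, f, protocol=pickle.HIGHEST_PROTOCOL)

with open('machine.pickle', 'rb') as f:
    machine = pickle.load(f)
print(sorted(machine.finditer('abcd')))   # [('a', 0), ('bc', 1), ('c', 2)]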
def _build_ignore_case(self, *keywords):
    keywords = list(map(self._swrap, keywords))
    return acora.AcoraBuilder(*keywords).build(
        ignore_case=True, acora=self.acora)
def _build(self, *keywords):
    keywords = list(map(self._swrap, keywords))
    return acora.AcoraBuilder(*keywords).build(acora=self.acora)
def logfilter(fh, blacklist, field, parser=None, reverse=False,
              delimiter=None, ignorecase=False, with_acora=False,
              word_boundaries=False, **kwargs):
    """Filter rows from a log stream using a blacklist"""

    blacklist = dict.fromkeys([l.strip() for l in blacklist
                               if l and not l.startswith('#')])
    re_flags = 0
    if ignorecase:
        re_flags = re.IGNORECASE

    _is_blacklisted = None
    if with_acora is False:
        # Regular expression based matching
        if word_boundaries:
            _is_blacklisted = partial(_is_blacklisted_re_wb,
                                      delimiter=delimiter, field=field,
                                      blacklist=blacklist, re_flags=re_flags)
        else:
            _is_blacklisted = partial(_is_blacklisted_re,
                                      delimiter=delimiter, field=field,
                                      blacklist=blacklist, re_flags=re_flags)
    else:
        # Aho-Corasick multiple string pattern matching
        # using the acora Cython library
        builder = acora.AcoraBuilder(*blacklist)
        ac = builder.build()
        _transform_func = lambda x: x
        if ignorecase:
            _transform_func = lambda x: x.lower()
        if word_boundaries:
            _is_blacklisted = partial(_is_blacklisted_ac_wb,
                                      delimiter=delimiter, field=field,
                                      transform_func=_transform_func, ac=ac)
        else:
            _is_blacklisted = partial(_is_blacklisted_ac,
                                      delimiter=delimiter, field=field,
                                      transform_func=_transform_func, ac=ac)

    _is_blacklisted_func = _is_blacklisted
    if parser:
        # Custom parser specified, use field-based matching
        parser = eval(parser, vars(logtools.parsers), {})()
        fields = field.split(',')
        is_indices = all(k.isdigit() for k in fields)
        if is_indices:
            # Field index based matching
            def _is_blacklisted_func(line):
                parsed_line = parser(line)
                for field in fields:
                    if _is_blacklisted(parsed_line.by_index(field)):
                        return True
                return False
        else:
            # Named field based matching
            def _is_blacklisted_func(line):
                parsed_line = parser(line)
                for field in fields:
                    if _is_blacklisted(parsed_line[field]):
                        return True
                return False

    num_lines = 0
    num_filtered = 0
    num_nomatch = 0
    for line in (x.strip() for x in fh):
        try:
            is_blacklisted = _is_blacklisted_func(line)
        except (KeyError, ValueError):
            # Parsing error
            logging.warning("No match for line: %s", line)
            num_nomatch += 1
            continue
        else:
            if is_blacklisted ^ reverse:
                logging.debug("Filtering line: %s", line)
                num_filtered += 1
                continue
        num_lines += 1
        yield line

    logging.info("Number of lines after filtering: %s", num_lines)
    logging.info("Number of lines filtered: %s", num_filtered)
    if num_nomatch:
        logging.info("Number of lines that could not be matched: %s", num_nomatch)
    return
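# The _is_blacklisted_ac / _is_blacklisted_ac_wb helpers used above live
# elsewhere in logtools and are not shown here. The sketch below only
# illustrates how an acora hit can be restricted to word boundaries; the
# 1-based field index and whitespace splitting are assumptions for the
# example, not the logtools behaviour.
import acora

def is_blacklisted_ac_wb_sketch(line, field, transform_func, ac):
    # Assumed: fields are whitespace-separated and field is a 1-based index.
    value = transform_func(line.split()[field - 1])
    for keyword, pos in ac.finditer(value):
        end = pos + len(keyword)
        # Accept the hit only if it is not embedded inside a larger word.
        before_ok = pos == 0 or not value[pos - 1].isalnum()
        after_ok = end == len(value) or not value[end].isalnum()
        if before_ok and after_ok:
            return True
    return False

ac = acora.AcoraBuilder('bot', 'crawler').build()
print(is_blacklisted_ac_wb_sketch('10.0.0.1 GET / Googlebot', 4, str.lower, ac))  # False: 'bot' is inside a word
print(is_blacklisted_ac_wb_sketch('10.0.0.1 GET / bot', 4, str.lower, ac))        # True: whole-word match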