def log_cluster(sc, log_lines, support):
    """Run the LogCluster algorithm over a distributed set of log lines.

    Args:
        sc: SparkContext, used to broadcast the frequent-word set to workers.
        log_lines (RDD of LogLine): input log messages as LogLine objects.
        support (int): threshold number of occurrences before a word or
            pattern can be included.

    Returns:
        list[DistributedTemplateLine]: template definitions built by
        read_templates() from the collapsed clusters.
    """
    # Count every word across all lines and keep only those that occur at
    # least `support` times.
    word_counts = (log_lines.flatMap(parse_words)
                            .reduceByKey(lambda a, b: a + b)
                            .filter(lambda kv: kv[1] >= support)
                            .collectAsMap())
    frequent_words = sc.broadcast(set(word_counts))

    # Key each line by its frequent-word pattern, drop patterns that appear
    # fewer than `support` times, then collapse each surviving group into a
    # single cluster of tokens.
    clusters = (log_lines.map(lambda line: extract_patterns(line, frequent_words))
                         .groupByKey()
                         .filter(lambda kv: len(kv[1]) >= support)
                         .map(collapse_patterns)
                         .collect())

    templates = [' '.join(cluster) for cluster in clusters]
    return read_templates(templates)
def log_cluster(sc, log_lines, support):
    """Run the LogCluster algorithm over a distributed set of log lines.

    NOTE(review): this file defines `log_cluster` twice with identical
    behavior; this later definition shadows the earlier one at import time.
    Consider removing one of the two.

    Args:
        sc: SparkContext, used to broadcast the frequent-word set.
        log_lines (RDD of LogLine): input log messages as LogLine objects.
        support (int): threshold number of occurrences before a word or
            pattern can be included.

    Returns:
        list[DistributedTemplateLine]: template definitions built by
        read_templates() from the collapsed clusters.
    """
    # Words occurring at least `support` times across all log lines.
    frequent_word_dict = (log_lines.flatMap(parse_words)
                                   .reduceByKey(lambda left, right: left + right)
                                   .filter(lambda pair: pair[1] >= support)
                                   .collectAsMap())
    frequent_words = sc.broadcast(set(frequent_word_dict.keys()))

    # Group lines by frequent-word pattern, keep well-supported patterns,
    # and collapse each group into one cluster.
    keyed = log_lines.map(lambda line: extract_patterns(line, frequent_words))
    grouped = keyed.groupByKey()
    supported = grouped.filter(lambda pair: len(pair[1]) >= support)
    clusters = supported.map(collapse_patterns).collect()

    templates = [' '.join(cluster) for cluster in clusters]
    transformed_templates = read_templates(templates)
    return transformed_templates
def parse_output(output):
    """Parse the stdout output of running logcluster.pl into templates.

    logcluster.pl emits its discovered patterns in 3-line groups; only the
    first line of each group (the template text) is used here.

    Args:
        output (str): stdout text produced by running logcluster.pl.

    Returns:
        list[DistributedTemplateLine]: templates built by read_templates()
        from the extracted pattern lines.
    """
    # Local import — presumably to avoid a circular import at module load
    # time; TODO confirm before hoisting to the top of the file.
    from magichour.api.dist.templates.templateEval import read_templates

    lines = output.splitlines()
    # Every 3rd line is a template; the two lines after it are metadata
    # (support counts etc.) that we ignore.
    matches = [lines[i].strip() for i in range(0, len(lines), 3)]
    return read_templates(matches)
def log_cluster_local(log_lines, support):
    """Run the LogCluster algorithm locally (no Spark).

    Args:
        log_lines (iterable of LogLine): input log messages as LogLine
            objects.
        support (int or str): threshold number of occurrences before a
            pattern can be included; strings are coerced to int.

    Returns:
        list[DistributedTemplateLine]: template definitions built by
        read_templates() from the collapsed clusters.
    """
    if isinstance(support, str):
        support = int(support)

    # Words occurring at least `support` times across all lines.
    frequent_words = local_word_count(log_lines, support)

    # Group the original lines by their frequent-word pattern key.
    pattern_dict = defaultdict(list)
    for log_line in log_lines:
        pattern_key, original_line = extract_patterns(
            log_line, DummyTuple(value=frequent_words))
        pattern_dict[pattern_key].append(original_line)

    # Keep only patterns that meet the support threshold (single pass,
    # instead of collecting keys and deleting them afterwards).
    supported = {key: lines for key, lines in pattern_dict.items()
                 if len(lines) >= support}

    clusters = map(collapse_patterns, supported.items())
    templates = [' '.join(cluster) for cluster in clusters]
    return read_templates(templates)