Example #1
def log_cluster(sc, log_lines, support):
    """
    Run log cluster over an RDD of log lines.

    Args:
        sc (SparkContext): Spark context, used to broadcast the frequent-word set
        log_lines (RDD of LogLine): Input log messages as LogLine objects
        support (int): Threshold number of occurrences before a pattern can be included

    Returns:
        list[DistributedTemplateLine]: DistributedTemplateLine objects defining the templates
    """
    # Count word occurrences across all lines and keep words seen at least `support` times
    frequent_word_dict = log_lines.flatMap(parse_words)\
        .reduceByKey(lambda x, y: x + y)\
        .filter(lambda key_count: key_count[1] >= support)\
        .collectAsMap()

    # Broadcast the frequent-word set so every executor can filter locally
    frequent_words = sc.broadcast(set(frequent_word_dict.keys()))

    # Group lines by their frequent-word pattern, keep patterns meeting the
    # support threshold, then collapse each group into a single template
    clusters = log_lines.map(lambda x: extract_patterns(x, frequent_words))\
        .groupByKey()\
        .filter(lambda pattern_group: len(pattern_group[1]) >= support)\
        .map(collapse_patterns)\
        .collect()

    templates = [' '.join(cluster) for cluster in clusters]

    transformed_templates = read_templates(templates)
    return transformed_templates
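A hedged usage sketch for the distributed entry point; parse_log_line and the input path are illustrative placeholders, not part of magichour:

from pyspark import SparkContext

sc = SparkContext(appName="log-cluster")

# Hypothetical preprocessing: turn raw text into LogLine objects
# (parse_log_line is a placeholder, not a magichour function)
log_lines = sc.textFile("hdfs:///logs/app.log").map(parse_log_line)

# Keep only templates whose pattern occurs at least 50 times
templates = log_cluster(sc, log_lines, support=50)
for t in templates:
    print(t.id, t.template)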
Example #2
def parse_output(output):
    """
    Parses and interprets the output (from stdout) of running logcluster.pl.

    Args:
        output (string): stdout produced by running logcluster.pl.

    Returns:
        list[DistributedTemplateLine]: templates built from the pattern lines in the logcluster output.
    """
    output = output.splitlines()

    matches = list()
    for o in range(0, len(output), 3):  # every 3rd line is a template
        matches.append(output[o].strip())

    # Disabled legacy implementation, kept below for reference:
    """
    for o in range(0, len(output), 3): # every 3rd line is a template
        m = output[o].strip()
        #fixedLine = re.escape(m)
        #replacement = _findReplacement(fixedLine).strip()

        stripped = r'' + m.strip()
        escaped = re.escape(stripped)
        replaced = unescape_skips(escaped)
        template = DistributedTemplateLine(
            id=str(uuid.uuid4()),
            template=replaced,
            skip_words=get_word_skip_names(re.compile(replaced)),
            raw_str=m,
        )
        #template = Template(template_id, replacement, m)
        matches.append(template)
        template_id += 1

    # Make sure that small get done before large
    # TODO do the correct thing someday
    '''
    correct way:

    For each pair of regexes r and s for languages L(r) and L(s)
      Find the corresponding Deterministic Finite Automata M(r) and M(s)   [1]
        Compute the cross-product machine M(r x s) and assign accepting states
           so that it computes L(r) - L(s)
        Use a DFS or BFS of the M(r x s) transition table to see if any
           accepting state can be reached from the start state
        If no, you can eliminate s because L(s) is a subset of L(r).
        Reassign accepting states so that M(r x s) computes L(s) - L(r)
        Repeat the steps above to see if it's possible to eliminate r

    '''
    simple_cmp = lambda x, y: len(y.template) - len(x.template)
    matches = sorted(matches, cmp=simple_cmp)
    matches = [DistributedTemplateLine(m.id, re.compile(m.template+'$'), m.skip_words, m.raw_str) for m in matches]
    return matches
    """

    # Deferred import; read_templates converts the raw pattern strings into
    # DistributedTemplateLine objects
    from magichour.api.dist.templates.templateEval import read_templates
    return read_templates(matches)
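The extraction loop assumes logcluster.pl prints one record per three stdout lines, with the pattern on the first line; the exact record layout shown here is an assumption for illustration:

sample = (
    "Daemon started on node *{1,1}\n"
    "Support: 97\n"
    "\n"
    "Connection from *{1,1} closed\n"
    "Support: 64\n"
)

# Equivalent to the range(0, len(output), 3) loop in parse_output
patterns = [line.strip() for line in sample.splitlines()[::3]]
print(patterns)
# -> ['Daemon started on node *{1,1}', 'Connection from *{1,1} closed']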
Example #3
def log_cluster_local(log_lines, support):
    """
    Run log cluster locally (without Spark).

    Args:
        log_lines (iterable of LogLine): Input log messages as LogLine objects
        support (int): Threshold number of occurrences before a pattern can be included

    Returns:
        list[DistributedTemplateLine]: DistributedTemplateLine objects defining the templates
    """
    # support may arrive as a string (e.g. from a config or CLI argument)
    if isinstance(support, str):
        support = int(support)

    frequent_words = local_word_count(log_lines, support)

    pattern_dict = defaultdict(list)
    for log_line in log_lines:
        # DummyTuple stands in for the Spark broadcast variable used in the
        # distributed version, so extract_patterns can be shared between both paths
        pattern_key, original_line = extract_patterns(
            log_line, DummyTuple(value=frequent_words))
        pattern_dict[pattern_key].append(original_line)

    # Collect infrequent pattern keys first; the dict cannot be mutated while iterating it
    items_to_delete = set()
    for pattern_key in pattern_dict:
        if len(pattern_dict[pattern_key]) < support:
            items_to_delete.add(pattern_key)

    # Delete infrequent patterns
    for pattern_key in items_to_delete:
        del pattern_dict[pattern_key]

    clusters = map(collapse_patterns, pattern_dict.items())

    templates = [' '.join(cluster) for cluster in clusters]

    transformed_templates = read_templates(templates)
    return transformed_templates
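To make the clustering idea concrete, here is a self-contained sketch of the same frequent-word/pattern-key approach on plain strings; it mirrors the three phases above but is illustrative only and omits magichour's wildcard handling:

from collections import Counter, defaultdict

def sketch_log_cluster(lines, support):
    # Phase 1: count every word and keep those seen at least `support` times
    counts = Counter(word for line in lines for word in line.split())
    frequent = {w for w, c in counts.items() if c >= support}

    # Phase 2: key each line by its tuple of frequent words (the pattern key)
    groups = defaultdict(list)
    for line in lines:
        key = tuple(w for w in line.split() if w in frequent)
        groups[key].append(line)

    # Phase 3: keep pattern keys that themselves meet the support threshold
    return [" ".join(key) for key, members in groups.items()
            if key and len(members) >= support]

lines = [
    "error: disk sda1 full",
    "error: disk sdb2 full",
    "error: disk sdc3 full",
    "user alice logged in",
]
print(sketch_log_cluster(lines, support=3))
# -> ['error: disk full']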