Example 1
0
def event_gen_fp_growth(sc, log_lines,
                        minSupport=0.2,
                        numPartitions=10,
                        windowLen=120,
                        remove_junk_drawer=True):
    """Generate events by mining frequent template itemsets with FP-Growth.

    Args:
        sc: Spark context used for the distributed computation.
        log_lines: RDD of log lines to window and mine.
        minSupport (float): minimum support passed to FP-Growth.
        numPartitions (int): number of partitions for the FP-Growth run.
        windowLen (int): window length used to group log lines.
        remove_junk_drawer (bool): if True, drop the -1 "junk drawer"
            template id from every event's template list.

    Returns:
        list(Event): one Event per maximal frequent itemset, with
        sequential integer ids.
    """
    windowed = window_rdd(sc, log_lines, windowLen, False)
    freq_itemsets = FPGrowthRDD(windowed, minSupport, numPartitions).collect()

    items = [frozenset(fi.items) for fi in freq_itemsets]
    # Keep only maximal itemsets (drop itemsets subsumed by a larger one).
    pruned_items = list(get_longest_sets_possible(items))

    retval = []
    for item_id, item in enumerate(pruned_items):
        # The original code duplicated this whole loop for the junk-drawer
        # case; apply the filter conditionally to one sorted list instead.
        template_ids = sorted(item, key=int)
        if remove_junk_drawer:
            template_ids = [i for i in template_ids if i != -1]
        retval.append(Event(id=item_id, template_ids=template_ids))

    return retval
Example 2
0
def fp_growth(windows, min_support, iterations=0):
    """Mine frequent template itemsets from windows using FP-Growth.

    Args:
        windows: iterable of windows, each an iterable of template ids.
        min_support: absolute support count, or a fraction in (0, 1)
            interpreted as a share of len(windows).
        iterations (int): if > 1, take at most that many itemsets from
            the generator; otherwise exhaust it.

    Returns:
        list(Event): one Event (random UUID id) per non-subset itemset.
    """
    import itertools
    from fp_growth import find_frequent_itemsets

    # A fractional min_support is interpreted as a share of the windows.
    if 0 < min_support < 1:
        new_support = math.ceil(min_support * len(windows))
        logger.info("Min support %s%% of %s: %s", min_support * 100,
                    len(windows), new_support)
        min_support = new_support

    itemset_gen = find_frequent_itemsets(windows, min_support)
    if iterations > 1:
        # islice stops cleanly when the generator yields fewer than
        # `iterations` itemsets; the old next() loop raised StopIteration
        # (a RuntimeError under PEP 479) in that case.
        itemsets = [frozenset(itemset)
                    for itemset in itertools.islice(itemset_gen, iterations)]
    else:
        itemsets = [frozenset(itemset) for itemset in itemset_gen]

    logger.info("Removing subsets from fp_growth output...")
    if itemsets:
        itemsets = get_nonsubsets(itemsets)

    return [
        Event(id=str(uuid.uuid4()), template_ids=template_ids)
        for template_ids in itemsets
    ]
Example 3
0
def paris(windows, r_slack, num_iterations, tau=1.0):
    """Discover events by running the PARIS algorithm over template-id windows."""
    window_sets = [set(window) for window in windows]
    A, R = paris_lib.PARIS(window_sets, r_slack,
                           num_iterations=num_iterations, tau=tau)

    events = []
    for atom in A:
        events.append(Event(id=str(uuid.uuid4()),
                            template_ids=frozenset(atom)))
    return events
Example 4
0
def event_gen(templates, manual_list=simple_ssh_events):
    """Generate events manually against the automatically discovered templates."""
    events = []
    for manual_event in create_manual_events(manual_list):
        # Collect every template whose raw string matches this manual event.
        matching_ids = []
        for template in templates:
            if manual_event.template_regex.search(template.raw_str):
                matching_ids.append(template.id)
        if matching_ids:
            events.append(Event(id=manual_event.id,
                                template_ids=matching_ids))
    return events
Example 5
0
def tfidf_filter_event_defs(events, threshold):
    """Return events with template ids flagged by the TF-IDF filter removed."""
    all_template_ids = [list(event.template_ids) for event in events]
    to_filter = tf_idf_filter(all_template_ids, threshold)
    # Rebuild each event, keeping only the template ids that survived.
    return [
        Event(id=event.id,
              template_ids=[tid for tid in event.template_ids
                            if tid not in to_filter])
        for event in events
    ]
Example 6
0
def tfidf_filter_events(events, threshold, deduplicate=True):
    """TF-IDF-filter events; optionally collapse events that are subsets of others."""
    # tfidf_filter_namedtuple(events, threshold, Event)
    filtered_events = tfidf_filter_event_defs(events, threshold)
    if not deduplicate or not filtered_events:
        return filtered_events

    # Note that calling this will reassign random event IDs.
    logger.info("Removing subsets from tfidf_filter result...")
    id_sets = get_nonsubsets(
        [frozenset(event.template_ids) for event in filtered_events])
    return [Event(id=str(uuid.uuid4()), template_ids=id_set)
            for id_set in id_sets]
Example 7
0
def glove(windows,
          num_components=16,
          glove_window=10,
          epochs=20,
          verbose=False):
    """Cluster co-occurring template ids into events using GloVe + HDBSCAN.

    Args:
        windows: iterable of windows, each an iterable of template ids.
        num_components (int): dimensionality of the GloVe embeddings.
        glove_window (int): co-occurrence window passed to the GloVe corpus.
        epochs (int): number of GloVe training epochs.
        verbose (bool): forwarded to the GloVe fit call.

    Returns:
        list(Event): one Event (random UUID id) per non-empty HDBSCAN
        cluster of template-id embeddings.
    """
    import glove
    import hdbscan
    import multiprocessing

    ws = [[template_id for template_id in w] for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=epochs,
                    no_threads=multiprocessing.cpu_count(),
                    verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)

    labels = []
    vectors = []
    # TODO: Explore how to pull data more nicely from glove
    # Iterate items() once instead of re-looking each key up in the dict.
    for key, word_vector_index in glove_model.__dict__['dictionary'].items():
        labels.append(key)
        vectors.append(
            list(glove_model.__dict__['word_vectors'][word_vector_index]))

    # Clustering: each HDBSCAN cluster label becomes one candidate event.
    output_events = defaultdict(list)
    for i, cluster_label in enumerate(
            hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)):
        output_events[cluster_label].append(labels[i])

    # Create event objects.  map() is materialized into a list so the
    # length check works on Python 3, where map() is a lazy iterator
    # without len() (the original passed the map object straight through).
    events = []
    for cluster_label in output_events:
        template_ids = list(map(int, output_events[cluster_label]))
        if len(template_ids) > 0:
            events.append(Event(id=str(uuid.uuid4()),
                                template_ids=template_ids))
    return events
Example 8
0
def apply_single_distributed_tuple(input_tuple,
                                   event2template_broadcast,
                                   window_time=None):
    """
    Helper function that takes a tuple as input and breaks out inputs to pass
    to apply_queue
    Args:
        input_tuple(tuple(tuple(event_id, window_num), iterable(DistributedLogLines))):
                    A tuple of inputs to be split and passed
        event2template_broadcast (broadcast variable): Dictionary mapping event_ids to templates contained

    Returns:
        timed_events (list(TimedEvent)): A list of found events with their component log lines
    """
    # Destructure the nested key tuple in a single unpacking step.
    (event_id, window_num), log_msgs = input_tuple

    template_ids = list(event2template_broadcast.value[event_id])
    event = Event(id=event_id, template_ids=template_ids)

    # Comes in as iterable, follow on functions expect list
    log_msgs = list(log_msgs)

    return apply_queue(event, log_msgs, window_time=window_time)
Example 9
0
def event_gen_word2vec(sc, log_lines, window_size=60):
    """Cluster template ids into events using Word2Vec embeddings + HDBSCAN.

    Args:
        sc: Spark context (kept for interface symmetry with the other
            event generators; not referenced directly here).
        log_lines: RDD of log lines exposing .ts and .templateId.
        window_size (int): window length used to bucket log lines by time.

    Returns:
        list(Event): one Event per non-empty HDBSCAN cluster; the event id
        is the cluster label.
    """
    import hdbscan

    # Bucket template ids into time windows, ordered by timestamp.
    D = log_lines.map(
        lambda logline: (int(logline.ts / window_size),
                         (logline.ts, logline.templateId))
    ).groupByKey().map(
        lambda window_loglines: [
            str(templateId)
            for (ts, templateId) in sorted(window_loglines[1])
        ])

    # Run Word2Vec
    model = Word2Vec().setVectorSize(16).setSeed(42).fit(D)
    model_vectors = model.getVectors()

    # mapping dict_distrib
    labels = []
    vectors = []
    for label, vector in model_vectors.items():
        labels.append(label)
        vectors.append(list(vector))

    # Clustering
    output_events = defaultdict(list)
    for i, cluster_label in enumerate(hdbscan.HDBSCAN(
            min_cluster_size=2).fit_predict(vectors)):
        output_events[cluster_label].append(labels[i])

    # Create event objects.  map() is materialized into a list so the
    # length check works on Python 3, where map() is a lazy iterator
    # without len() (the original passed the map object straight through).
    events = []
    for cluster_label in output_events:
        template_ids = list(map(int, output_events[cluster_label]))
        if len(template_ids) > 0:
            events.append(Event(id=cluster_label, template_ids=template_ids))
    return events