def event_gen_fp_growth(sc, log_lines, minSupport=0.2, numPartitions=10,
                        windowLen=120, remove_junk_drawer=True):
    """Generate events by running FP-growth over windowed log lines."""
    retval = []
    windowed = window_rdd(sc, log_lines, windowLen, False)
    temp = FPGrowthRDD(windowed, minSupport, numPartitions).collect()
    items = [frozenset(fi.items) for fi in temp]
    # Keep only maximal itemsets; their subsets carry no extra information.
    pruned_items = list(get_longest_sets_possible(items))
    for item_id, item in enumerate(pruned_items):
        template_ids = sorted(item, key=int)
        if remove_junk_drawer:
            # Drop the junk-drawer template (id -1), which collects log
            # lines that matched no real template.
            template_ids = [i for i in template_ids if i != -1]
        retval.append(Event(id=item_id, template_ids=template_ids))
    return retval
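# Usage sketch (assumes a live SparkContext and an RDD of log lines in the
# shape window_rdd expects; parameter values are illustrative):
#
#   events = event_gen_fp_growth(sc, log_lines, minSupport=0.2, windowLen=120)
#   for event in events:
#       print(event.id, event.template_ids)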
def fp_growth(windows, min_support, iterations=0):
    from fp_growth import find_frequent_itemsets

    itemsets = []
    # A fractional min_support is interpreted as a fraction of the number
    # of windows and converted to an absolute count.
    if 0 < min_support < 1:
        new_support = math.ceil(min_support * len(windows))
        logger.info("Min support %s%% of %s: %s",
                    min_support * 100, len(windows), new_support)
        min_support = new_support
    itemset_gen = find_frequent_itemsets(windows, min_support)
    if iterations > 1:
        # Take at most `iterations` itemsets; islice stops cleanly if the
        # generator is exhausted early (bare next() would raise StopIteration).
        from itertools import islice
        for itemset in islice(itemset_gen, iterations):
            itemsets.append(frozenset(itemset))
    else:
        for itemset in itemset_gen:
            itemsets.append(frozenset(itemset))
    logger.info("Removing subsets from fp_growth output...")
    if itemsets:
        itemsets = get_nonsubsets(itemsets)
    return [
        Event(id=str(uuid.uuid4()), template_ids=template_ids)
        for template_ids in itemsets
    ]
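# Illustrative sketch only (a hypothetical helper, not the project's
# get_nonsubsets): keep itemsets that are not proper subsets of another.
def _nonsubsets_sketch(itemsets):
    itemsets = list(itemsets)
    # frozenset's `<` operator tests for proper subset.
    return [s for s in itemsets if not any(s < other for other in itemsets)]
# e.g. _nonsubsets_sketch([frozenset({1, 2}), frozenset({1, 2, 3})])
# returns [frozenset({1, 2, 3})].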
def paris(windows, r_slack, num_iterations, tau=1.0):
    """Generate events with the PARIS itemset-mining algorithm."""
    ws = [set(w) for w in windows]
    # paris_lib.PARIS returns a collection of itemsets A plus a second
    # result R, which is unused here.
    A, R = paris_lib.PARIS(ws, r_slack, num_iterations=num_iterations, tau=tau)
    itemsets = [frozenset(a) for a in A]
    return [
        Event(id=str(uuid.uuid4()), template_ids=template_ids)
        for template_ids in itemsets
    ]
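# Usage sketch (hypothetical values; assumes paris_lib is importable and
# that windows are iterables of template ids):
#
#   windows = [["4", "7"], ["4", "7", "9"], ["9", "12"]]
#   events = paris(windows, r_slack=0.5, num_iterations=100)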
def event_gen(templates, manual_list=simple_ssh_events):
    """Generate events manually against the automatically discovered templates."""
    result = []
    manual_events = create_manual_events(manual_list)
    for mevent in manual_events:
        # Collect every discovered template whose raw string matches this
        # manual event's regex.
        template_ids = [
            t.id for t in templates
            if mevent.template_regex.search(t.raw_str)
        ]
        if template_ids:
            result.append(Event(id=mevent.id, template_ids=template_ids))
    return result
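# Matching sketch (hypothetical names): a manual event definition might pair
# an id with a regex, e.g.
#
#   mevent = ManualEvent(id="ssh_fail",
#                        template_regex=re.compile(r"Failed password for"))
#
# and every template whose raw_str matches contributes its id to the
# generated Event.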
def tfidf_filter_event_defs(events, threshold):
    """Remove template ids flagged by tf_idf_filter from each event."""
    template_ids = [list(event.template_ids) for event in events]
    to_filter = tf_idf_filter(template_ids, threshold)
    filtered_events = []
    for event in events:
        kept = [tid for tid in event.template_ids if tid not in to_filter]
        filtered_events.append(Event(id=event.id, template_ids=kept))
    return filtered_events
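# Illustrative sketch only (a hypothetical helper, not the project's
# tf_idf_filter): the usual tf-idf intuition is that a template occurring in
# nearly every event carries little information, so ids whose inverse
# document frequency falls below `threshold` get flagged for removal.
def _idf_filter_sketch(events_template_ids, threshold):
    import math
    from collections import Counter
    n_events = len(events_template_ids)
    # Document frequency: in how many events does each template appear?
    doc_freq = Counter(tid for ids in events_template_ids for tid in set(ids))
    # idf = log(N / df); a small idf means the template is nearly ubiquitous.
    return {tid for tid, df in doc_freq.items()
            if math.log(float(n_events) / df) < threshold}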
def tfidf_filter_events(events, threshold, deduplicate=True):
    filtered_events = tfidf_filter_event_defs(events, threshold)
    if not deduplicate or not filtered_events:
        return filtered_events
    # Note that the deduplication below reassigns random event IDs.
    logger.info("Removing subsets from tfidf_filter result...")
    template_id_sets = [frozenset(event.template_ids)
                        for event in filtered_events]
    template_id_sets = get_nonsubsets(template_id_sets)
    return [
        Event(id=str(uuid.uuid4()), template_ids=template_id_set)
        for template_id_set in template_id_sets
    ]
def glove(windows, num_components=16, glove_window=10, epochs=20, verbose=False):
    import glove
    import hdbscan
    import multiprocessing

    ws = [list(w) for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=epochs,
                    no_threads=multiprocessing.cpu_count(), verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)

    # Pull one embedding vector per template id out of the model.
    labels = []
    vectors = []
    for key, word_vector_index in glove_model.dictionary.items():
        labels.append(key)
        vectors.append(list(glove_model.word_vectors[word_vector_index]))

    # Clustering: group template ids whose embeddings sit close together.
    # hdbscan labels points that fit no cluster as -1, so those noise
    # templates all land in one shared bucket.
    output_events = defaultdict(list)
    for i, val in enumerate(
            hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])

    # Create event objects (list(map(...)) keeps this working on Python 3,
    # where map returns an iterator).
    events = []
    for item in output_events:
        event = Event(id=str(uuid.uuid4()),
                      template_ids=list(map(int, output_events[item])))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
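# Usage sketch (assumes the glove-python and hdbscan packages are installed;
# windows are iterables of string template ids):
#
#   windows = [["4", "7", "9"], ["4", "7"], ["12", "9"]]
#   events = glove(windows, num_components=16, epochs=20)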
def apply_single_distributed_tuple(input_tuple, event2template_broadcast,
                                   window_time=None):
    """
    Helper function that takes a tuple as input and breaks out the inputs
    to pass to apply_queue.

    Args:
        input_tuple (tuple(tuple(event_id, window_num), iterable(DistributedLogLines))):
            A tuple of inputs to be split and passed on
        event2template_broadcast (broadcast variable): Dictionary mapping
            event_ids to the templates they contain

    Returns:
        timed_events (list(TimedEvent)): A list of found events with their
            component log lines
    """
    event_description, log_msgs = input_tuple
    event_id, window_num = event_description
    event = Event(id=event_id,
                  template_ids=list(event2template_broadcast.value[event_id]))
    # Comes in as an iterable; follow-on functions expect a list.
    log_msgs = list(log_msgs)
    return apply_queue(event, log_msgs, window_time=window_time)
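# Shape sketch of the expected input (hypothetical values):
#
#   input_tuple = (("event-42", 7), iter([logline_a, logline_b]))
#
# i.e. ((event_id, window_num), iterable_of_log_lines), e.g. as produced by
# a groupByKey over (event_id, window_num) keys.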
def event_gen_word2vec(sc, log_lines, window_size=60):
    import hdbscan

    # Bucket log lines into fixed-width time windows, then emit each
    # window's template ids in timestamp order.
    D = (log_lines
         .map(lambda logline: (int(logline.ts / window_size),
                               (logline.ts, logline.templateId)))
         .groupByKey()
         .map(lambda window_loglines: [
             str(templateId)
             for (ts, templateId) in sorted(window_loglines[1])]))

    # Run Word2Vec
    model = Word2Vec().setVectorSize(16).setSeed(42).fit(D)
    model_vectors = model.getVectors()

    # Split the model's word -> vector mapping into parallel lists.
    labels = []
    vectors = []
    for label, vector in model_vectors.items():
        labels.append(label)
        vectors.append(list(vector))

    # Clustering
    output_events = defaultdict(list)
    for i, val in enumerate(hdbscan.HDBSCAN(
            min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])

    # Create event objects (list(map(...)) keeps this working on Python 3,
    # where map returns an iterator).
    events = []
    for item in output_events:
        event = Event(id=item,
                      template_ids=list(map(int, output_events[item])))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
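# Usage sketch (assumes Word2Vec is pyspark.mllib.feature.Word2Vec, imported
# at module level, and that log_lines is an RDD of objects with numeric `ts`
# and `templateId` fields):
#
#   events = event_gen_word2vec(sc, log_lines, window_size=60)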